#Toggle code
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
%%writefile wt_vs_all.py
#!/usr/bin/python
#First script:
# Input is a filtered pileup-like format
# there are lines in Orsi's format, and i dont use them
# writefile magic writes these files
# they will be later executed by slurm
#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os
#input ouput files
input_dir='/nagyvinyok/adat83/sotejedlik/orsi/SNV/SNV_list_withB_allsamples/'
output_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/WT_vs_all'
subprocess.call(['mkdir',output_dir])
output_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/WT_vs_all/heatmap/'
subprocess.call(['mkdir',output_dir])
#which file to run on come in cmdline arg
input_fname=sys.argv[1]
#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/orsi/bam_all_links'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
if (fnmatch.fnmatch(fname, '*.bam') and
not fnmatch.fnmatch(fname,"*.bai")): #strange .bai convention!!!
fnames.append(fname)
fnames=sorted(fnames)
#select the group samples set
group=['Sample1_RMdup_picard_realign.bam','Sample4_RMdup_picard_realign.bam'] #R1
for i in range(1,8)+[9]: #R2
group.append('DS00'+str(i)+'_RMdup_picard_realign.bam')
for i in range(10,12): #R2
group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in range(41,51): #R3
group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in [73,74]: #R4test
group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in range(81,98): #R4
group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
#create array to index into numpy arrays
group_bool,else_bool,group=[],[],set(group)
for sample in fnames:
group_bool.append(sample in group)
else_bool.append(not (sample in group))
group_bool,else_bool=np.array(group_bool),np.array(else_bool)
#filter for pileup lines
# there is a # in the beggining of Orsis format lines
cmd_filt_pup_lines= ' grep -v \'#\' '+ input_dir+input_fname
#matrix for heatmap
resolution=200 #resolution hard coded !!!!!!!!
heat_mat=np.zeros((resolution+1,resolution+1),dtype=np.int32)
#run the pipeline
from subprocess import Popen, PIPE
p = Popen(cmd_filt_pup_lines, stdout=PIPE, bufsize=1,shell=True)
with p.stdout:
for line in iter(p.stdout.readline, b''):
#parse line
linelist=line.strip().upper().split(' ')
covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
bases=linelist[4::2]
ref_count=[]
for i in xrange(len(bases)):
ref_count.append(len(re.findall('[\.\,]',bases[i])))
ref_freq=np.array(ref_count,dtype=np.double)/covs
#calculate group freqs and save in matrix
group_freq=np.mean(ref_freq[group_bool])
else_freq=np.mean(ref_freq[else_bool])
heat_mat[int(resolution*group_freq),int(resolution*else_freq)]+=1
p.wait() # wait for the subprocess to exit
#save it
np.savetxt(output_dir + input_fname.split('.')[0]+'.mat',heat_mat,fmt='%d')
#Run them in slurm
import os
import subprocess
input_dir='/nagyvinyok/adat83/sotejedlik/orsi/SNV/SNV_list_withB_allsamples/'
for filename in os.listdir(input_dir):
try:
print subprocess.check_output([ 'sbatch',
'--mem',str(1000),'./wt_vs_all.py' ,
filename],stderr=subprocess.STDOUT),
except subprocess.CalledProcessError, e:
print e.output,
import os
import numpy as np
from matplotlib.colors import LogNorm
import matplotlib as mpl
import pandas as pd #insted of numpy for much faster csv loading
#import matplotlib as mpl
#mpl.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline
inputdir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/WT_vs_all/heatmap/'
#load result matrices
m=pd.read_csv(inputdir+os.listdir(inputdir)[0],sep=' ',header=None)
for filename in os.listdir(inputdir):
try:
m+=pd.read_csv(inputdir+filename,sep=' ',header=None)
except:
pass
#plot
fig,ax=plt.subplots()
fig.set_size_inches(16,16)
# define the colormap
cmap = plt.cm.Greens
# extract all colors from the map
cmaplist = [cmap(i) for i in range(cmap.N)]
# force the first color entry to be white
cmaplist[0] = (1.0,1.0,1.0,1.0)
# create the new map
cmap = cmap.from_list('Custom cmap', cmaplist, cmap.N)
# define the bins and normalize
bounds = [0,1,5,10,20,100]
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
#show the image
cax = ax.imshow(m,interpolation='none',norm=norm,cmap=cmap,
vmin=0.2,vmax=30,alpha=0.8,origin='lower')
cbar=fig.colorbar(cax,shrink=0.8)
cbar.outline.set_edgecolor('lightgrey')
#set grid
ax.grid(True,c='lightgrey',lw=1,linestyle='dotted')
ax.set_frame_on(False)
tics=ax.xaxis.set_ticks(np.linspace(0,200,6))
labs=ax.set_xticklabels(['0%','20%','40%','60%','80%','100%'], rotation='horizontal')
tics=ax.yaxis.set_ticks(np.linspace(0,200,6))
labs=ax.set_yticklabels(['0%','20%','40%','60%','80%','100%'], rotation='horizontal')
ax.set_xlim(-1,201)
ax.set_ylim(-1,201)
#enlarge font
mpl.rcParams['font.size']=14.0
# remove tick marks
ax.xaxis.set_tick_params(size=0)
ax.yaxis.set_tick_params(size=0)
#legend
ax.plot([],[],c='g',lw=4,label='WT vs others avg freq')
ax.legend(fancybox=True,loc='upper left')
#annotate
ax.set_title('')
ax.set_xlabel('other samples avg freq')
cax=ax.set_ylabel('WT samples avg freq')
# define the colormap
cmap = plt.cm.Greens
# extract all colors from the map
cmaplist = [cmap(i) for i in range(cmap.N)]
# force the first color entry to be white
cmaplist[0] = (1.0,1.0,1.0,1.0)
# create the new map
cmap = cmap.from_list('Custom cmap', cmaplist, cmap.N)
# define the bins and normalize
bounds = [0,1,2,3,4]
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
#plot
fig,ax=plt.subplots()
cax = ax.imshow(np.array(m)[:,180:],interpolation='none',extent=[180,200,0,200],
aspect=0.1,vmin=0.2,vmax=4,alpha=0.8,origin='lower',
cmap=cmap,norm=norm)
cbar=fig.colorbar(cax,shrink=0.8)
cbar.outline.set_edgecolor('lightgrey')
#legend
ax.plot([],[],c='g',lw=4,label='WT vs others avg freq')
ax.legend(fancybox=True,loc='lower left')
#annotate
ax.set_title('')
ax.set_xlabel('other samples avg freq')
ax.set_ylabel('WT samples avg freq')
fig.set_size_inches(16,16)
#set w grd
ax.grid(True,c='lightgrey',lw=1,linestyle='dotted')
ax.set_frame_on(False)
tics=ax.xaxis.set_ticks(np.linspace(180,200,6))
tics=ax.yaxis.set_ticks(np.linspace(0,200,6))
ax.set_xlim(180,201)
ax.set_ylim(0,205)
labs=ax.set_yticklabels(['0%','20%','40%','60%','80%','100%'], rotation='horizontal')
labs=ax.set_xticklabels(['90%','92%','94%','96%','98%','100%'], rotation='horizontal')
# remove tick marks
ax.xaxis.set_tick_params(size=0)
ax.yaxis.set_tick_params(size=0)
#50% ones
rect=plt.Rectangle((196,90),4,30, fc='none',ec='r',lw=3, linestyle='dashed')
cax=ax.add_patch(rect)
#lower freq ones
rect=plt.Rectangle((198,124),2,18, fc='none',ec='b',lw=3,linestyle='dashed')
cax=ax.add_patch(rect)
There is a nice 1,0.5 cluster, altough there is an other cluster clearly separated for this, at 1,0,7. This spot might be mostly due to more than diploid regions.
Clusters are at considerably higher frequencies, than at BRCA1 homo
%%writefile wt_vs_all_collect_mut.py
#!/usr/bin/python
#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os
#input ouput files
#input_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/indel_list/'
input_dir='/nagyvinyok/adat83/sotejedlik/orsi/SNV/SNV_list_withB_allsamples/'
output_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/WT_vs_all'
subprocess.call(['mkdir',output_dir+'/het_indel'])
subprocess.call(['mkdir',output_dir+'/lowfreq_indel'])
#which file to run on come in cmdline arg
input_fname=sys.argv[1]
#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/orsi/bam_all_links'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
if (fnmatch.fnmatch(fname, '*.bam') and
not fnmatch.fnmatch(fname,"*.bai")): #strange .bai convention!!!
fnames.append(fname)
fnames=sorted(fnames)
#select the group samples set
group=['Sample1_RMdup_picard_realign.bam','Sample4_RMdup_picard_realign.bam'] #R1
for i in range(1,8)+[9]: #R2
group.append('DS00'+str(i)+'_RMdup_picard_realign.bam')
for i in range(10,12): #R2
group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in range(41,51): #R3
group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in [73,74]: #R4test
group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in range(81,98): #R4
group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in range(141,145): #R5
group.append('DS'+str(i)+'_RMdup_picard_realign.bam')
#create array to index into numpy arrays
group_bool,else_bool,group=[],[],set(group)
for sample in fnames:
group_bool.append(sample in group)
else_bool.append(not (sample in group))
group_bool,else_bool=np.array(group_bool),np.array(else_bool)
#filter for pileup lines
# there is a # in the beggining of Orsis format lines
cmd_filt_pup_lines= ' grep -v \'#\' '+ input_dir+input_fname
#output files
f_het=open(output_dir+'/het_snp/'+input_fname,'w')
f_lowfreq=open(output_dir+'/lowfreq_snp/'+input_fname,'w')
#run the pipeline
from subprocess import Popen, PIPE
p = Popen(cmd_filt_pup_lines, stdout=PIPE, bufsize=1,shell=True)
with p.stdout:
for line in iter(p.stdout.readline, b''):
#parse line
linelist=line.strip().upper().split(' ')
covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
bases=linelist[4::2]
ref_count=[]
for i in xrange(len(bases)):
ref_count.append(len(re.findall('[\.\,]',bases[i])))
ref_freq=np.array(ref_count,dtype=np.double)/covs
#calculate group freqs
group_freq=np.mean(ref_freq[group_bool])
else_freq=np.mean(ref_freq[else_bool])
#save the line specific mutations
if(group_freq >= 0.45 and group_freq <= 0.60 and
else_freq >=0.98):
f_het.write(line)
if(group_freq >= 0.61 and group_freq <= 0.71 and
else_freq >= 0.99):
f_lowfreq.write(line)
p.wait() # wait for the subprocess to exit
#close files
f_het.close()
f_lowfreq.close()
#Run them in slurm
import os
import subprocess
input_dir='/nagyvinyok/adat83/sotejedlik/orsi/indel/indel_list_withB_allsamples/'
for filename in os.listdir(input_dir):
try:
print subprocess.check_output([ 'sbatch',
'--mem',str(1000),'./wt_vs_all_collect_mut.py' ,
filename],stderr=subprocess.STDOUT),
except subprocess.CalledProcessError, e:
print e.output,
%%bash
rm /nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/WT_vs_all/het_snp/all.pup
cat /nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/WT_vs_all/het_snp/* > \
/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/WT_vs_all/het_snp/all.pup
echo The number of het SNPs:
cat /nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/WT_vs_all/het_snp/all.pup | wc -l
echo
rm /nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/WT_vs_all/lowfreq_snp/all.pup
cat /nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/WT_vs_all/lowfreq_snp/* > \
/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/WT_vs_all/lowfreq_snp/all.pup
echo The number of low frequency SNPs:
cat /nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/WT_vs_all/lowfreq_snp/all.pup | wc -l