#Toggle code
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
%%writefile wt_vs_all.py
#!/usr/bin/python
#First script:
# Input is a filtered pileup-like format
# writefile magic writes these files
# they will be later executed by slurm
#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os
#input ouput files
input_dir='/nagyvinyok/adat83/sotejedlik/orsi/indel/indel_list_withB_allsamples/'
output_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/WT_vs_all'
subprocess.call(['mkdir',output_dir])
output_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/WT_vs_all/heatmap/'
subprocess.call(['mkdir',output_dir])
#which file to run on come in cmdline arg
input_fname=sys.argv[1]
#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/orsi/bam_all_links'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
if (fnmatch.fnmatch(fname, '*.bam') and
not fnmatch.fnmatch(fname,"*.bai")):
fnames.append(fname)
fnames=sorted(fnames)
#select the group samples set
group=['Sample1_RMdup_picard_realign.bam','Sample4_RMdup_picard_realign.bam'] #R1
for i in range(1,8)+[9]: #R2
group.append('DS00'+str(i)+'_RMdup_picard_realign.bam')
for i in range(10,12): #R2
group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in range(41,51): #R3
group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in [73,74]: #R4test
group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in range(81,98): #R4
group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
#create array to index into numpy arrays
group_bool,else_bool,group=[],[],set(group)
for sample in fnames:
group_bool.append(sample in group)
else_bool.append(not (sample in group))
group_bool,else_bool=np.array(group_bool),np.array(else_bool)
#matrix for heatmap
resolution=200 #resolution hard coded !
heat_mat=np.zeros((resolution+1,resolution+1),dtype=np.int32)
#input file
f_in=open(input_dir+input_fname,'r')
#loop over positions
for line in f_in:
#parse line
linelist=line.strip().upper().split(' ')
covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
bases=linelist[4::2]
indel_count=[]
for i in xrange(len(bases)):
indel_count.append(len(re.findall('[\-\+]',bases[i])))
indel_freq=np.array(indel_count,dtype=np.double)/covs
#calculate group freqs and save in matrix
group_freq=np.mean(indel_freq[group_bool])
else_freq=np.mean(indel_freq[else_bool])
heat_mat[int(resolution*group_freq),int(resolution*else_freq)]+=1
#save it
np.savetxt(output_dir + input_fname.split('.')[0]+'.mat',heat_mat,fmt='%d')
#Run them in slurm
import os
import subprocess
input_dir='/nagyvinyok/adat83/sotejedlik/orsi/indel/indel_list_withB_allsamples/'
for filename in os.listdir(input_dir):
try:
print subprocess.check_output([ 'sbatch',
'--mem',str(1000),'./wt_vs_all.py' ,
filename],stderr=subprocess.STDOUT),
except subprocess.CalledProcessError, e:
print e.output,
import os
import numpy as np
from matplotlib.colors import LogNorm
import matplotlib as mpl
import pandas as pd #instead of numpy for much faster csv loading
#import matplotlib as mpl
#mpl.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline
inputdir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/WT_vs_all/heatmap/'
#load result matrices
m=pd.read_csv(inputdir+os.listdir(inputdir)[0],sep=' ',header=None)
for filename in os.listdir(inputdir):
try:
m+=pd.read_csv(inputdir+filename,sep=' ',header=None)
except:
pass
#plot
fig,ax=plt.subplots()
fig.set_size_inches(16,16)
# define the colormap
cmap = plt.cm.Purples
# extract all colors from the map
cmaplist = [cmap(i) for i in range(cmap.N)]
# force the first color entry to be white
cmaplist[0] = (1.0,1.0,1.0,1.0)
# create the new map
cmap = cmap.from_list('Custom cmap', cmaplist, cmap.N)
# define the bins and normalize
bounds = [0,1,5,10,20,100]
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
#show the image
cax = ax.imshow(m,interpolation='none',norm=norm,cmap=cmap,alpha=0.8,origin='lower')
cbar=fig.colorbar(cax,shrink=0.8)
cbar.outline.set_edgecolor('lightgrey')
#set grid
ax.grid(True,c='lightgrey',lw=1,linestyle='dotted')
ax.set_frame_on(False)
tics=ax.xaxis.set_ticks(np.linspace(0,200,6))
labs=ax.set_xticklabels(['0%','20%','40%','60%','80%','100%'], rotation='horizontal')
tics=ax.yaxis.set_ticks(np.linspace(0,200,6))
labs=ax.set_yticklabels(['0%','20%','40%','60%','80%','100%'], rotation='horizontal')
ax.set_xlim(-1,201)
ax.set_ylim(-1,201)
#enlarge font
mpl.rcParams['font.size']=14.0
# remove tick marks
ax.xaxis.set_tick_params(size=0)
ax.yaxis.set_tick_params(size=0)
#legend
ax.plot([],[],c='purple',lw=4,label='WT homo vs others avg freq')
ax.legend(fancybox=True,loc='upper left')
#annotate
ax.set_title('')
ax.set_xlabel('OTHER samples average indel freq')
cax=ax.set_ylabel('BRCA1 homo samples average indel freq')
# define the colormap
cmap = plt.cm.Purples
# extract all colors from the .jet map
cmaplist = [cmap(i) for i in range(cmap.N)]
# force the first color entry to be white
cmaplist[0] = (1.0,1.0,1.0,1.0)
# create the new map
cmap = cmap.from_list('Custom cmap', cmaplist, cmap.N)
# define the bins and normalize
bounds = [0,1,2,3,4]
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
#plot
fig,ax=plt.subplots()
cax = ax.imshow(np.array(m)[:,:20],interpolation='none',extent=[0,20,0,200],
aspect=0.1,alpha=0.8,origin='lower',cmap=cmap,norm=norm)
cbar=fig.colorbar(cax,shrink=0.8)
cbar.outline.set_edgecolor('white')
#legend
ax.plot([],[],c='purple',lw=4,label='WT homo vs OTHERS avg freq')
ax.legend(fancybox=True,loc='upper left')
#annotate
ax.set_title('')
ax.set_xlabel('OTHER SAMPLES AVG FREQ')
ax.set_ylabel('WT SAMPLES AVG FREQ')
fig.set_size_inches(16,16)
#set w grd
ax.grid(True,c='lightgrey',lw=1,linestyle='dotted')
ax.set_frame_on(False)
tics=ax.xaxis.set_ticks(np.linspace(0,20,6))
tics=ax.yaxis.set_ticks(np.linspace(0,200,6))
ax.set_xlim(-1,21)
ax.set_ylim(-5,205)
labs=ax.set_yticklabels(['0%','20%','40%','60%','80%','100%'], rotation='horizontal')
labs=ax.set_xticklabels(['0%','2%','4%','6%','8%','10%'], rotation='horizontal')
# remove tick marks
ax.xaxis.set_tick_params(size=0)
ax.yaxis.set_tick_params(size=0)
#50% ones
rect=plt.Rectangle((0,74),1,26, fc='none',ec='r',lw=4, linestyle='dashed')
cax=ax.add_patch(rect)
#low freq ones
rect=plt.Rectangle((0,26),1,40, fc='none',ec='g',lw=4, linestyle='dashed')
cax=ax.add_patch(rect)
There is a nice 0,0.4 cluster, altough there is an other cluster not very clearly separated for this, at 0,0.2. This spot might be mostly due to more than diploid regions.
%%writefile wt_vs_all_collect_mut.py
#!/usr/bin/python
#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os
#input ouput files
input_dir='/nagyvinyok/adat83/sotejedlik/orsi/indel/indel_list_withB_allsamples/'
output_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/WT_vs_all'
subprocess.call(['mkdir',output_dir+'/het_indel'])
subprocess.call(['mkdir',output_dir+'/lowfreq_indel'])
#which file to run on come in cmdline arg
input_fname=sys.argv[1]
#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/orsi/bam_all_links'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
if (fnmatch.fnmatch(fname, '*.bam') and
not fnmatch.fnmatch(fname,"*.bai")):
fnames.append(fname)
fnames=sorted(fnames)
#select the group samples set
group=['Sample1_RMdup_picard_realign.bam','Sample4_RMdup_picard_realign.bam'] #R1
for i in range(1,8)+[9]: #R2
group.append('DS00'+str(i)+'_RMdup_picard_realign.bam')
for i in range(10,12): #R2
group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in range(41,51): #R3
group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in [73,74]: #R4test
group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in range(81,98): #R4
group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
#create array to index into numpy arrays
group_bool,else_bool,group=[],[],set(group)
for sample in fnames:
group_bool.append(sample in group)
else_bool.append(not (sample in group))
group_bool,else_bool=np.array(group_bool),np.array(else_bool)
#input file
f_in=open(input_dir+input_fname,'r')
#output files
f_het=open(output_dir+'/het_indel/'+input_fname,'w')
f_lowfreq=open(output_dir+'/lowfreq_indel/'+input_fname,'w')
#loop over positions in file
for line in f_in:
#parse line
linelist=line.strip().upper().split(' ')
covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
bases=linelist[4::2]
indel_count=[]
for i in xrange(len(bases)):
indel_count.append(len(re.findall('[\-\+]',bases[i])))
indel_freq=np.array(indel_count,dtype=np.double)/covs
#calculate group freqs
group_freq=np.mean(indel_freq[group_bool])
else_freq=np.mean(indel_freq[else_bool])
#save the line specific mutations
if(group_freq >= 0.36 and group_freq <= 0.5 and
else_freq <= 0.005):
f_het.write(line)
if(group_freq >= 0.13 and group_freq <= 0.35 and
else_freq <= 0.005):
f_lowfreq.write(line)
#close files
f_het.close()
f_lowfreq.close()
#Run them in slurm
import os
import subprocess
input_dir='/nagyvinyok/adat83/sotejedlik/orsi/indel/indel_list_withB_allsamples/'
for filename in os.listdir(input_dir):
try:
print subprocess.check_output([ 'sbatch',
'--mem',str(1000),'./wt_vs_all_collect_mut.py' ,
filename],stderr=subprocess.STDOUT),
except subprocess.CalledProcessError, e:
print e.output,
How many indels are there?
%%bash
rm /nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/WT_vs_all/het_indel/all.pup
cat /nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/WT_vs_all/het_indel/* > \
/nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/WT_vs_all/het_indel/all.pup
echo The number of het indels:
cat /nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/WT_vs_all/het_indel/all.pup | wc -l
echo
rm /nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/WT_vs_all/lowfreq_indel/all.pup
cat /nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/WT_vs_all/lowfreq_indel/* > \
/nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/WT_vs_all/lowfreq_indel/all.pup
echo The number of low frequency indels:
cat /nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/WT_vs_all/lowfreq_indel/all.pup | wc -l