BRCA1 homozygote cell line specific indels¶

First calculate BRCA1 homozygote, and all other mean indel frequencies, to see the BRCA1 homozygote cell line specific indels¶

BRCA1 specific heterozygote indels have around 50% indel count in BRCA1 samples, and around 0% in non BRCA1 samples. So the average indel frequency among BRCA1 samples is around 50%, and around 0% among non BRCA1 samples. To see these indels I will plot almost all positions on an (average other samples indel freq, average BRCA1 indel freq ) plane. BRCA1 specific indels are expected to be in the middle left (0,0.5), and are expected to be clearly separated from other positions.

Actually indels have generally lower frequencies due to misalignement

Code is hidden in the notebook but can be seen by clicking the button below¶

#Toggle code

from IPython.display  import HTML
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

%%writefile brca1_vs_all.py
#!/usr/bin/python

#First script:
    # Input is a filtered pileup-like format
    # writefile magic writes these files
    # they will be later executed by slurm

#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os

#input ouput files
input_dir='/nagyvinyok/adat83/sotejedlik/orsi/indel/indel_list_withB_allsamples/'
output_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/BRCA1_vs_all'
subprocess.call(['mkdir',output_dir])
output_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/BRCA1_vs_all/heatmap/'
subprocess.call(['mkdir',output_dir])

#which file to run on come in cmdline arg
input_fname=sys.argv[1]

#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/orsi/bam_all_links'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
    if (fnmatch.fnmatch(fname, '*.bam') and 
        not fnmatch.fnmatch(fname,"*.bai")):
        fnames.append(fname)
fnames=sorted(fnames)

#select the group samples set
group=['Sample2_RMdup_picard_realign.bam','Sample5_RMdup_picard_realign.bam'] #R1
for i in [14,15,16,18]: #R2
    group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in range(51,59): #R3
    group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in [101,102,103]: #R4
    group.append('DS'+str(i)+'_RMdup_picard_realign.bam')

#create array to index into numpy arrays
group_bool,else_bool,group=[],[],set(group)
for sample in fnames:
    group_bool.append(sample in group)
    else_bool.append(not (sample in group))
group_bool,else_bool=np.array(group_bool),np.array(else_bool)

#matrix for heatmap
resolution=200 #resolution hard coded !
heat_mat=np.zeros((resolution+1,resolution+1),dtype=np.int32)

#input file
f_in=open(input_dir+input_fname,'r')

#loop over positions
for line in f_in:
    #parse line
    linelist=line.strip().upper().split(' ')
    covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
    bases=linelist[4::2]
    indel_count=[]
    for i in xrange(len(bases)):
        indel_count.append(len(re.findall('[\-\+]',bases[i]))) 
    indel_freq=np.array(indel_count,dtype=np.double)/covs
    #calculate group freqs and save in matrix
    group_freq=np.mean(indel_freq[group_bool])
    else_freq=np.mean(indel_freq[else_bool])
    heat_mat[int(resolution*group_freq),int(resolution*else_freq)]+=1

#save it
np.savetxt(output_dir + input_fname.split('.')[0]+'.mat',heat_mat,fmt='%d')

#Run them in slurm

import os
import subprocess
input_dir='/nagyvinyok/adat83/sotejedlik/orsi/indel/indel_list_withB_allsamples/'
for filename in os.listdir(input_dir):
    try:
        print subprocess.check_output([ 'sbatch',
                                       '--mem',str(1000),'./brca1_vs_all.py' ,
                                       filename],stderr=subprocess.STDOUT),
    except subprocess.CalledProcessError, e:
        print e.output,

Plot the 'heatmap'¶

its half heatmap, half scatter because of the high resolution

import os
import numpy as np
from matplotlib.colors import LogNorm
import matplotlib as mpl
import pandas as pd #instead of numpy for much faster csv loading
#import matplotlib as mpl
#mpl.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline

inputdir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/BRCA1_vs_all/heatmap/'

#load result matrices
m=pd.read_csv(inputdir+os.listdir(inputdir)[0],sep=' ',header=None)
for filename in os.listdir(inputdir):
    try:
        m+=pd.read_csv(inputdir+filename,sep=' ',header=None)
    except:
        pass


#plot
fig,ax=plt.subplots()
fig.set_size_inches(16,16)

# define the colormap
cmap = plt.cm.Oranges
# extract all colors from the map
cmaplist = [cmap(i) for i in range(cmap.N)]
# force the first color entry to be white
cmaplist[0] = (1.0,1.0,1.0,1.0)
# create the new map
cmap = cmap.from_list('Custom cmap', cmaplist, cmap.N)

# define the bins and normalize
bounds = [0,1,5,10,20,100]
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

#show the image
cax = ax.imshow(m,interpolation='none',norm=norm,cmap=cmap,alpha=0.8,origin='lower')

cbar=fig.colorbar(cax,shrink=0.8)
cbar.outline.set_edgecolor('lightgrey')

#set grid
ax.grid(True,c='lightgrey',lw=1,linestyle='dotted')
ax.set_frame_on(False)
tics=ax.xaxis.set_ticks(np.linspace(0,200,6))
labs=ax.set_xticklabels(['0%','20%','40%','60%','80%','100%'], rotation='horizontal')
tics=ax.yaxis.set_ticks(np.linspace(0,200,6))
labs=ax.set_yticklabels(['0%','20%','40%','60%','80%','100%'], rotation='horizontal')

ax.set_xlim(-1,201)
ax.set_ylim(-1,201)

#enlarge font
mpl.rcParams['font.size']=14.0

# remove tick marks
ax.xaxis.set_tick_params(size=0)
ax.yaxis.set_tick_params(size=0)

#legend
ax.plot([],[],c='orange',lw=4,label='BRCA1 homo vs others avg freq')
ax.legend(fancybox=True,loc='upper left')

#annotate
ax.set_title('')
ax.set_xlabel('OTHER samples average indel freq')
cax=ax.set_ylabel('BRCA1 homo samples average indel freq')

Zoom on the interesting region¶

# define the colormap
cmap = plt.cm.Oranges
# extract all colors from the .jet map
cmaplist = [cmap(i) for i in range(cmap.N)]
# force the first color entry to be white
cmaplist[0] = (1.0,1.0,1.0,1.0)
# create the new map
cmap = cmap.from_list('Custom cmap', cmaplist, cmap.N)

# define the bins and normalize
bounds = [0,1,2,3,4,5,6,7]
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

#plot
fig,ax=plt.subplots()
cax = ax.imshow(np.array(m)[:,:20],interpolation='none',extent=[0,20,0,200],
                aspect=0.1,alpha=0.8,origin='lower',cmap=cmap,norm=norm)

cbar=fig.colorbar(cax,shrink=0.8)
cbar.outline.set_edgecolor('white')

#legend
ax.plot([],[],c='orange',lw=4,label='BRCA1 homo vs OTHERS avg freq')
ax.legend(fancybox=True,loc='upper left')

#annotate
ax.set_title('')
ax.set_xlabel('OTHER SAMPLES AVG FREQ')
ax.set_ylabel('WT SAMPLES AVG FREQ')
fig.set_size_inches(16,16)

#set w grd
ax.grid(True,c='lightgrey',lw=1,linestyle='dotted')
ax.set_frame_on(False)
tics=ax.xaxis.set_ticks(np.linspace(0,20,6))
tics=ax.yaxis.set_ticks(np.linspace(0,200,6))

ax.set_xlim(-1,21)
ax.set_ylim(-5,205)

labs=ax.set_yticklabels(['0%','20%','40%','60%','80%','100%'], rotation='horizontal')
labs=ax.set_xticklabels(['0%','2%','4%','6%','8%','10%'], rotation='horizontal')

# remove tick marks
ax.xaxis.set_tick_params(size=0)
ax.yaxis.set_tick_params(size=0)

#50% ones
rect=plt.Rectangle((0,66),1,46, fc='none',ec='b',lw=6, linestyle='dashed')
cax=ax.add_patch(rect)

#low freq ones
rect=plt.Rectangle((0,28),1,36, fc='none',ec='g',lw=6, linestyle='dashed')
cax=ax.add_patch(rect)

Conclusion:¶

There is a nice 0,0.4 cluster, altough there is an other cluster not very clearly separated for this, at 0,0.2. This spot might be mostly due to more than diploid regions.

Collect the line specific mutations¶

Write them to files

%%writefile brca1_vs_all_collect_mut.py
#!/usr/bin/python

#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os

#input ouput files
input_dir='/nagyvinyok/adat83/sotejedlik/orsi/indel/indel_list_withB_allsamples/'
output_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/BRCA1_vs_all'
subprocess.call(['mkdir',output_dir+'/het_indel'])
subprocess.call(['mkdir',output_dir+'/lowfreq_indel'])

#which file to run on come in cmdline arg
input_fname=sys.argv[1]

#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/orsi/bam_all_links'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
    if (fnmatch.fnmatch(fname, '*.bam') and 
        not fnmatch.fnmatch(fname,"*.bai")):
        fnames.append(fname)
fnames=sorted(fnames)

#select the group samples set
group=['Sample2_RMdup_picard_realign.bam','Sample5_RMdup_picard_realign.bam'] #R1
for i in [14,15,16,18]: #R2
    group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in range(51,59): #R3
    group.append('DS0'+str(i)+'_RMdup_picard_realign.bam')
for i in [101,102,103]: #R3
    group.append('DS'+str(i)+'_RMdup_picard_realign.bam')

#create array to index into numpy arrays
group_bool,else_bool,group=[],[],set(group)
for sample in fnames:
    group_bool.append(sample in group)
    else_bool.append(not (sample in group))
group_bool,else_bool=np.array(group_bool),np.array(else_bool)

#input file
f_in=open(input_dir+input_fname,'r')

#output files
f_het=open(output_dir+'/het_indel/'+input_fname,'w')
f_lowfreq=open(output_dir+'/lowfreq_indel/'+input_fname,'w')

#loop over positions in file
for line in f_in:
    #parse line
    linelist=line.strip().upper().split(' ')
    covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
    bases=linelist[4::2]
    indel_count=[]
    for i in xrange(len(bases)):
        indel_count.append(len(re.findall('[\-\+]',bases[i]))) 
    indel_freq=np.array(indel_count,dtype=np.double)/covs

    #calculate group freqs
    group_freq=np.mean(indel_freq[group_bool])
    else_freq=np.mean(indel_freq[else_bool])
    #save the line specific mutations
    if(group_freq >= 0.33 and group_freq <= 0.6  and 
      else_freq <= 0.01):
        f_het.write(line)
    if(group_freq >= 0.14 and group_freq <= 0.32  and 
      else_freq <= 0.005):
        f_lowfreq.write(line)

#close files
f_het.close()
f_lowfreq.close()

#Run them in slurm

import os
import subprocess
input_dir='/nagyvinyok/adat83/sotejedlik/orsi/indel/indel_list_withB_allsamples/'
for filename in os.listdir(input_dir):
    try:
        print subprocess.check_output([ 'sbatch',
                                       '--mem',str(1000),'./brca1_vs_all_collect_mut.py' ,
                                       filename],stderr=subprocess.STDOUT),
    except subprocess.CalledProcessError, e:
        print e.output,

How many indels are there?

%%bash

rm /nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/BRCA1_vs_all/het_indel/all.pup 
cat /nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/BRCA1_vs_all/het_indel/* > \
/nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/BRCA1_vs_all/het_indel/all.pup 

echo The number of het indels:
cat  /nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/BRCA1_vs_all/het_indel/all.pup | wc -l
echo

rm /nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/BRCA1_vs_all/lowfreq_indel/all.pup 
cat /nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/BRCA1_vs_all/lowfreq_indel/* > \
/nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/BRCA1_vs_all/lowfreq_indel/all.pup 

echo The number of low frequency indels:
cat /nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/BRCA1_vs_all/lowfreq_indel/all.pup | wc -l

The number of het indels:
101

The number of low frequency indels:
24