Finding WT specific mutations


First calculate WT, and all other mean reference frequencies, to see the WT specific mutations

First script:

  • Input is a filtered pileup-like format
  • there are lines in Orsi's format, and i dont use them
  • writefile magic writes these files
  • they will be later executed by slurm
In [1]:
%%writefile wt_vs_all.py
#!/usr/bin/python

#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os

#input ouput files
input_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
output_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all'
subprocess.call(['mkdir',output_dir])
output_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/heatmap/'
subprocess.call(['mkdir',output_dir])

#which file to run on come in cmdline arg
input_fname=sys.argv[1]
#inpurt_fnames='7-0-10030365.pup'

#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
        if (fnmatch.fnmatch(fname, '*.bam')
            and not fnmatch.fnmatch(fname,"*.bam.bai")):
                fnames.append(fname)
fnames=sorted(fnames)

#select the group samples set
group=['Sample1_RMdup.bam','Sample4_RMdup.bam'] #R1
for i in range(1,8)+range(9,12): #R2
    group.append('DS'+str(i)+'_RMdup.bam')
for i in range(41,47)+['47R']+range(48,51): #R3
    group.append('DS'+str(i)+'_RMdup.bam')
for i in [73,74]: #R4test
    group.append('DS'+str(i)+'_RMdup.bam')

#create array to index into numpy arrays
group_bool,else_bool,group=[],[],set(group)
for sample in fnames:
    group_bool.append(sample in group)
    else_bool.append(not (sample in group))
group_bool,else_bool=np.array(group_bool),np.array(else_bool)

#filter for pileup lines
# there is a # in the beggining of Orsis format lines
cmd_filt_pup_lines= ' grep -v \'#\' '+ input_dir+input_fname

#matrix for heatmap
resolution=1000 #resolution hard coded !!!!!!!!
heat_mat=np.zeros((resolution+1,resolution+1),dtype=np.int32)

#run the pipeline
from subprocess import Popen, PIPE
p = Popen(cmd_filt_pup_lines, stdout=PIPE, bufsize=1,shell=True)
with p.stdout:
    for line in iter(p.stdout.readline, b''):
        #parse line
        linelist=line.strip().upper().split(' ')
        covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
        bases=linelist[4::2]
        ref_count=[]
        for i in xrange(len(bases)):
            ref_count.append(len(re.findall('[\.\,]',bases[i]))) 
        ref_freq=np.array(ref_count,dtype=np.double)/covs
        #calculate group freqs and save in matrix
        group_freq=np.mean(ref_freq[group_bool])
        else_freq=np.mean(ref_freq[else_bool])
        heat_mat[int(resolution*group_freq),int(resolution*else_freq)]+=1
p.wait() # wait for the subprocess to exit

#save it
np.savetxt(output_dir + input_fname.split('.')[0]+'.mat',heat_mat,fmt='%d')
Writing wt_vs_all.py
In [2]:
#select the group samples set
group=['Sample1_RMdup.bam','Sample4_RMdup.bam'] #R1
for i in range(1,8)+range(9,12): #R2
    group.append('DS'+str(i)+'_RMdup.bam')
for i in range(41,47)+['47R']+range(48,51): #R3
    group.append('DS'+str(i)+'_RMdup.bam')
for i in [73,74]: #R4test
    group.append('DS'+str(i)+'_RMdup.bam')
print '\n'.join(group)
Sample1_RMdup.bam
Sample4_RMdup.bam
DS1_RMdup.bam
DS2_RMdup.bam
DS3_RMdup.bam
DS4_RMdup.bam
DS5_RMdup.bam
DS6_RMdup.bam
DS7_RMdup.bam
DS9_RMdup.bam
DS10_RMdup.bam
DS11_RMdup.bam
DS41_RMdup.bam
DS42_RMdup.bam
DS43_RMdup.bam
DS44_RMdup.bam
DS45_RMdup.bam
DS46_RMdup.bam
DS47R_RMdup.bam
DS48_RMdup.bam
DS49_RMdup.bam
DS50_RMdup.bam
DS73_RMdup.bam
DS74_RMdup.bam

Run counting in sbatch

In [4]:
import os
import subprocess
inputDir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
for filename in os.listdir(inputDir):
    print subprocess.call([ 'sbatch', '--mem',str(1000),'./wt_vs_all.py' ,filename]),
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

check state

  • It takes around 10mins to complete the run
In [5]:
print subprocess.check_output('squeue')
  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
  20284 eszakigri SIM_11_2  ragraat   R    4:51:38      1 jimgray85
  20975 eszakigri SIM_11_2  ragraat   R    3:02:13      1 jimgray84
  21552 eszakigri create_l    ribli   R      38:57      1 jimgray86
  21670 eszakigri create_l    ribli   R      16:16      1 jimgray86

Clean up

In [6]:
print subprocess.call('rm slurm*',shell=True),
print subprocess.call(['rm','wt_vs_all.py']),
0 0

Load and plot result matrices

In [1]:
import os
from matplotlib.colors import LogNorm
import pandas as pd #insted of numpy for much faster csv loading
#import matplotlib as mpl
#mpl.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline

inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/heatmap/'

#load result matrices
m=pd.read_csv(inputdir+os.listdir(inputdir)[0],sep=' ',header=None)
for filename in os.listdir(inputdir):
    m+=pd.read_csv(inputdir+filename,sep=' ',header=None)

#plot
fig,ax=plt.subplots()
cax = ax.imshow(m,interpolation='none',norm=LogNorm(),cmap=plt.cm.jet,origin='lower')
cbar=fig.colorbar(cax,shrink=0.8)

#annotate
ax.set_title('WT vs OTHERS avg freq, on all positions')
ax.set_xlabel('OTHER SAMPLES AVG FREQ x1000')
ax.set_ylabel('WT SAMPLES AVG FREQ x1000')
fig.set_size_inches(12,12)
/usr/lib/pymodules/python2.7/matplotlib/font_manager.py:1218: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to Bitstream Vera Sans
  (prop.get_family(), self.defaultFamily[fontext]))
/usr/lib/pymodules/python2.7/matplotlib/font_manager.py:1228: UserWarning: findfont: Could not match :family=Bitstream Vera Sans:style=normal:variant=normal:weight=normal:stretch=normal:size=medium. Returning /usr/share/matplotlib/mpl-data/fonts/ttf/STIXGeneral.ttf
  UserWarning)
/usr/lib/pymodules/python2.7/matplotlib/font_manager.py:1228: UserWarning: findfont: Could not match :family=Bitstream Vera Sans:style=normal:variant=normal:weight=normal:stretch=normal:size=large. Returning /usr/share/matplotlib/mpl-data/fonts/ttf/STIXGeneral.ttf
  UserWarning)

Zoom on WT specific mutations

In [5]:
import numpy as np

#plot
fig,ax=plt.subplots()
cax = ax.imshow(np.array(m)[:,900:],interpolation='none',extent=[900,1000,0,1000],
                aspect=0.1,norm=LogNorm(),cmap=plt.cm.jet,origin='lower')
cbar=fig.colorbar(cax,shrink=0.8)

#annotate
ax.set_title('WT vs OTHERS avg freq, on all positions')
ax.set_xlabel('OTHER SAMPLES AVG FREQ x1000')
ax.set_ylabel('WT SAMPLES AVG FREQ x1000')
fig.set_size_inches(12,12)

rect=plt.Rectangle((990,470),10,150, fc='none',ec='r',lw=5)
ax.add_patch(rect)
Out[5]:
<matplotlib.patches.Rectangle at 0x3b838d0>

Find the WT specific mutation positions

In [9]:
%%writefile find_rect.py
#!/usr/bin/python

#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os

#input ouput files
input_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
output_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts/'
subprocess.call(['mkdir',output_dir])

#which file to run on come in cmdline arg
input_fname=sys.argv[1]

#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
        if (fnmatch.fnmatch(fname, '*.bam')
            and not fnmatch.fnmatch(fname,"*.bam.bai")):
                fnames.append(fname)
fnames=sorted(fnames)

#select the group samples set
group=['Sample1_RMdup.bam','Sample4_RMdup.bam'] #R1
for i in range(1,8)+range(9,12): #R2
    group.append('DS'+str(i)+'_RMdup.bam')
for i in range(41,47)+['47R']+range(48,51): #R3
    group.append('DS'+str(i)+'_RMdup.bam')
for i in [73,74]: #R4test
    group.append('DS'+str(i)+'_RMdup.bam')

#create array to index into numpy arrays
group_bool,else_bool,group=[],[],set(group)
for sample in fnames:
    group_bool.append(sample in group)
    else_bool.append(not (sample in group))
group_bool,else_bool=np.array(group_bool),np.array(else_bool)


#filter for pileup lines
# there is a # in the beggining of Orsis format lines
cmd_filt_pup_lines= ' grep -v \'#\' '+ input_dir+input_fname

#run the pipeline
output_f=open(output_dir + input_fname,'w')
from subprocess import Popen, PIPE
p = Popen(cmd_filt_pup_lines, stdout=PIPE, bufsize=1,shell=True)
with p.stdout:
    for line in iter(p.stdout.readline, b''):
        #parse line
        linelist=line.strip().upper().split(' ')
        covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
        bases=linelist[4::2]
        ref_count=[]
        for i in xrange(len(bases)):
            ref_count.append(len(re.findall('[\.\,]',bases[i]))) 
        ref_freq=np.array(ref_count,dtype=np.double)/covs
        #calculate group freqs and save in matrix
        group_freq=np.mean(ref_freq[group_bool])
        else_freq=np.mean(ref_freq[else_bool])        
        #select the red rectangle
        if ( group_freq >= 0.47 and group_freq <= 0.63 and else_freq >= 0.99 and else_freq <= 1.0):
            #print the the chr, pos and freqs
            output_f.write(linelist[0]+' '+linelist[1]+' '+' '.join(map(str,ref_freq))+ '\n' )
p.wait() # wait for the subprocess to exit
output_f.close()
Overwriting find_rect.py
In [10]:
inputDir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
for filename in os.listdir(inputDir):
    print subprocess.call([ 'sbatch', '--mem',str(1000),'./find_rect.py' ,filename]),
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
In [12]:
print subprocess.check_output('squeue')
  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
  20284 eszakigri SIM_11_2  ragraat   R    5:12:17      1 jimgray85
  20975 eszakigri SIM_11_2  ragraat   R    3:22:52      1 jimgray84
  21552 eszakigri create_l    ribli   R      59:36      1 jimgray86
  21670 eszakigri create_l    ribli   R      36:55      1 jimgray86

In [13]:
print subprocess.call('rm slurm*',shell=True),
print subprocess.call(['rm','find_rect.py']),
0 0

Results

Sample frequencies

In [7]:
import fnmatch
import os

#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts/'
samples=[]
freqs=[]
for filename in os.listdir(inputdir):
    f=open(inputdir+filename)
    for line in f:
        samples.append(range(len(line.split(' ')[2:])))
        freqs.append(map(float,line.split(' ')[2:]))
    f.close()
    
freqs=np.mean(freqs,axis=0)
samples=np.mean(samples,axis=0)

#plot
fig,ax=plt.subplots()
cax = ax.plot(samples,freqs,'o',mec='none')

#plot 0.5
ax.axhline(0.5,c='r',lw=3)

#set ticks 
#####################################################
#filenames for samplenames
RMdupDir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
inFilenames,relFilenames=[],[]
for filename in os.listdir(RMdupDir):
        if (fnmatch.fnmatch(filename, '*.bam') and (not fnmatch.fnmatch(filename,"*.bam.bai"))):
                relFilenames.append(filename)
relFilenames=sorted(relFilenames)
######################################################

relFilenames=[x[:-10] for x in relFilenames]
plt.xticks(np.arange(len(freqs)),relFilenames,size='small',rotation='vertical')        
        
#annotate
ax.set_xlim(-1,len(freqs)+1)
ax.set_ylim(0,1.1)
ax.set_xlabel('sample names')
ax.set_ylabel('avg frequency in WT specific mutations')
fig.set_size_inches(16,6)

#mark interest
ax.add_patch(plt.Rectangle((54,0.95),7,0.1, fc='none',ec='r',lw=5))
Out[7]:
<matplotlib.patches.Rectangle at 0x5c1dd50>

Exactly none of the mutations are nicely shared whih sample1, sample 4

In [30]:
#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts/'
samples=[]
freqs=[]
for filename in os.listdir(inputdir):
    f=open(inputdir+filename)
    for line in f:
        samples.append(range(len(line.split(' ')[2:])))
        freqs.append(map(float,line.split(' ')[2:]))
    f.close()
    

#plot
fig,ax=plt.subplots()
cax = ax.plot(samples,freqs,'o',mec='none')

#set ticks 
#####################################################
#filenames for samplenames
RMdupDir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
inFilenames,relFilenames=[],[]
for filename in os.listdir(RMdupDir):
        if (fnmatch.fnmatch(filename, '*.bam') and (not fnmatch.fnmatch(filename,"*.bam.bai"))):
                relFilenames.append(filename)
relFilenames=sorted(relFilenames)
######################################################

relFilenames=[x[:-10] for x in relFilenames]
plt.xticks(np.arange(len(relFilenames)),relFilenames,size='small',rotation='vertical')        
        
#annotate
ax.set_xlim(-1,len(relFilenames)+1)
ax.set_ylim(0.,1.02)
ax.set_xlabel('sample names')
ax.set_ylabel('avg frequency in WT specific mutations')
fig.set_size_inches(16,6)

#mark interest
ax.add_patch(plt.Rectangle((56,0.4),5,0.2, fc='none',ec='r',lw=5))
Out[30]:
<matplotlib.patches.Rectangle at 0x939b9d0>

Some of the mutations have lower than clean freq in only sapmple 4 (not sample1)

  • I dont understand this
In [8]:
#plot
fig,ax=plt.subplots()
cax = ax.plot(samples,freqs,'o',mec='none')

#plot 0.5
ax.axhline(0.5,c='r',lw=3)

#set ticks 
#####################################################
#filenames for samplenames
RMdupDir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
inFilenames,relFilenames=[],[]
for filename in os.listdir(RMdupDir):
        if (fnmatch.fnmatch(filename, '*.bam') and (not fnmatch.fnmatch(filename,"*.bam.bai"))):
                relFilenames.append(filename)
relFilenames=sorted(relFilenames)
######################################################

relFilenames=[x[:-10] for x in relFilenames]
plt.xticks(np.arange(len(freqs)),relFilenames,size='small',rotation='vertical')        
        
#annotate
ax.set_xlim(-1,len(freqs)+1)
ax.set_ylim(0.98,1.02)
ax.set_xlabel('sample names')
ax.set_ylabel('avg frequency in WT specific mutations')
fig.set_size_inches(16,6)

#mark interest
ax.add_patch(plt.Rectangle((57,0.988),2,0.005, fc='none',ec='r',lw=5))
Out[8]:
<matplotlib.patches.Rectangle at 0x42ffed0>
In [37]:
#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts/'
samples=[]
freqs=[]
for filename in os.listdir(inputdir):
    f=open(inputdir+filename)
    for line in f:
        samples.append(range(len(line.split(' ')[2:])))
        freqs.append(map(float,line.split(' ')[2:]))
    f.close()
    

#plot
fig,ax=plt.subplots()
cax = ax.plot(samples,freqs,'o',mec='none')

#set ticks 
#####################################################
#filenames for samplenames
RMdupDir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
inFilenames,relFilenames=[],[]
for filename in os.listdir(RMdupDir):
        if (fnmatch.fnmatch(filename, '*.bam') and (not fnmatch.fnmatch(filename,"*.bam.bai"))):
                relFilenames.append(filename)
relFilenames=sorted(relFilenames)
######################################################

relFilenames=[x[:-10] for x in relFilenames]
plt.xticks(np.arange(len(relFilenames)),relFilenames,size='small',rotation='vertical')        
        
#annotate
ax.set_xlim(-1,len(relFilenames)+1)
ax.set_ylim(0.9,1.02)
ax.set_xlabel('sample names')
ax.set_ylabel('avg frequency in WT specific mutations')
fig.set_size_inches(16,6)

#mark interest
ax.add_patch(plt.Rectangle((57,0.92),2,0.08, fc='none',ec='r',lw=5))
Out[37]:
<matplotlib.patches.Rectangle at 0x7faf42c79410>

Conclusion

  • The WT specific mutations are not nicely present in round1
  • This means, that there are no wt specific mutations which were present is sample1 (this can make sense if it waqs really frozen after BRCA or PCNA lines were created, until round1 )

Some confusion still:

  • but some of them has a low freq??

Plot and save these positions

In [6]:
from pandas import DataFrame as df
import os
import numpy as np

#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts/'
chrom_pos=[]
for filename in os.listdir(inputdir):
    f=open(inputdir+filename)
    for line in f:
        chrom_pos.append(line.split(' ')[0:2])
    f.close()
    

chrom_pos=df(chrom_pos)
chrom_pos.columns=['chr','pos']

#plot it on chromosomes
for chrom in np.unique(np.array(chrom_pos['chr'])):
    fig,ax=plt.subplots()
    ax.plot(chrom_pos[(chrom_pos['chr']==chrom)]['pos'],np.ones(len(chrom_pos[chrom_pos['chr']==chrom])),'o')
    ax.set_title('chr'+chrom+', number of points in rectangle='+str(len(chrom_pos[chrom_pos['chr']==chrom])))
    ax.set_xlabel('position')
    fig.set_size_inches(12,1)
    
#save it
chrom_pos.to_csv('wt_spec_mut.csv',sep='\t',index=False)
print 'There are ',len(chrom_pos['pos']),' WT specific mutations'
There are  111  WT specific mutations

But what are the points around WT_freq ~ 0.7-8??

In [7]:
import numpy as np

#plot
fig,ax=plt.subplots()
cax = ax.imshow(np.array(m)[:,900:],interpolation='none',extent=[900,1000,0,1000],
                aspect=0.1,norm=LogNorm(),cmap=plt.cm.jet,origin='lower')
cbar=fig.colorbar(cax,shrink=0.8)

#annotate
ax.set_title('WT vs OTHERS avg freq, on all positions')
ax.set_xlabel('OTHER SAMPLES AVG FREQ x1000')
ax.set_ylabel('WT SAMPLES AVG FREQ x1000')
fig.set_size_inches(12,12)

rect=plt.Rectangle((990,630),10,200, fc='none',ec='r',lw=5)
ax.add_patch(rect)
Out[7]:
<matplotlib.patches.Rectangle at 0x7f35c51f6d90>
In [18]:
%%writefile find_rect.py
#!/usr/bin/python

#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os

#input ouput files
input_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
output_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts_high_freq/'
subprocess.call(['mkdir',output_dir])

#which file to run on come in cmdline arg
input_fname=sys.argv[1]

#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
        if (fnmatch.fnmatch(fname, '*.bam')
            and not fnmatch.fnmatch(fname,"*.bam.bai")):
                fnames.append(fname)
fnames=sorted(fnames)

#select the group samples set
group=['Sample1_RMdup.bam','Sample4_RMdup.bam'] #R1
for i in range(1,8)+range(9,12): #R2
    group.append('DS'+str(i)+'_RMdup.bam')
for i in range(41,47)+['47R']+range(48,51): #R3
    group.append('DS'+str(i)+'_RMdup.bam')
for i in [73,74]: #R4test
    group.append('DS'+str(i)+'_RMdup.bam')

#create array to index into numpy arrays
group_bool,else_bool,group=[],[],set(group)
for sample in fnames:
    group_bool.append(sample in group)
    else_bool.append(not (sample in group))
group_bool,else_bool=np.array(group_bool),np.array(else_bool)


#filter for pileup lines
# there is a # in the beggining of Orsis format lines
cmd_filt_pup_lines= ' grep -v \'#\' '+ input_dir+input_fname

#run the pipeline
output_f=open(output_dir + input_fname,'w')
from subprocess import Popen, PIPE
p = Popen(cmd_filt_pup_lines, stdout=PIPE, bufsize=1,shell=True)
with p.stdout:
    for line in iter(p.stdout.readline, b''):
        #parse line
        linelist=line.strip().upper().split(' ')
        covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
        bases=linelist[4::2]
        ref_count=[]
        for i in xrange(len(bases)):
            ref_count.append(len(re.findall('[\.\,]',bases[i]))) 
        ref_freq=np.array(ref_count,dtype=np.double)/covs
        #calculate group freqs and save in matrix
        group_freq=np.mean(ref_freq[group_bool])
        else_freq=np.mean(ref_freq[else_bool])        
        #select the red rectangle
        #select the red rectangle
        if ( group_freq >= 0.63 and group_freq <= 0.83 and else_freq >= 0.99 and else_freq <= 1.0):
            #print the the chr, pos and freqs
            output_f.write(linelist[0]+' '+linelist[1]+' '+' '.join(map(str,ref_freq))+'\n')
p.wait() # wait for the subprocess to exit
output_f.close()
Writing find_rect.py
In [19]:
inputDir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
for filename in os.listdir(inputDir):
    print subprocess.call([ 'sbatch', '--mem',str(1000),'./find_rect.py' ,filename]),
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
In [20]:
print subprocess.check_output('squeue')
  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
  20284 eszakigri SIM_11_2  ragraat   R    5:34:07      1 jimgray85
  20975 eszakigri SIM_11_2  ragraat   R    3:44:42      1 jimgray84
  21552 eszakigri create_l    ribli   R    1:21:26      1 jimgray86
  21670 eszakigri create_l    ribli   R      58:45      1 jimgray86

In [21]:
print subprocess.call('rm slurm*',shell=True),
print subprocess.call(['rm','find_rect.py']),
0 0

Sample frequencies

mean

In [8]:
import numpy as np
import os
import fnmatch

#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts_high_freq/'
samples=[]
freqs=[]
for filename in os.listdir(inputdir):
    f=open(inputdir+filename)
    for line in f:
        samples.append(range(len(line.split(' ')[2:])))
        freqs.append(map(float,line.split(' ')[2:]))
    f.close()
    
freqs=np.mean(freqs,axis=0)
samples=np.mean(samples,axis=0)

#plot
fig,ax=plt.subplots()
cax = ax.plot(samples,freqs,'o',mec='none')

#set ticks 
#####################################################
#filenames for samplenames
RMdupDir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
inFilenames,relFilenames=[],[]
for filename in os.listdir(RMdupDir):
        if (fnmatch.fnmatch(filename, '*.bam') and (not fnmatch.fnmatch(filename,"*.bam.bai"))):
                relFilenames.append(filename)
relFilenames=sorted(relFilenames)
######################################################

relFilenames=[x[:-10] for x in relFilenames]
plt.xticks(np.arange(len(freqs)),relFilenames,size='small',rotation='vertical')        
        
#annotate
ax.set_xlim(-1,len(freqs)+1)
ax.set_ylim(0,1.1)
ax.set_xlabel('sample names')
ax.set_ylabel('avg frequency in WT specific, high frequency mutations')
fig.set_size_inches(16,6)

all points

In [9]:
import numpy as np
import os
import fnmatch

#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts_high_freq/'
samples=[]
freqs=[]
for filename in os.listdir(inputdir):
    f=open(inputdir+filename)
    for line in f:
        samples.append(range(len(line.split(' ')[2:])))
        freqs.append(map(float,line.split(' ')[2:]))
    f.close()
    
#freqs=np.mean(freqs,axis=0)
#samples=np.mean(samples,axis=0)

#plot
fig,ax=plt.subplots()
cax = ax.plot(samples,freqs,'o',mec='none')

#set ticks 
#####################################################
#filenames for samplenames
RMdupDir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
inFilenames,relFilenames=[],[]
for filename in os.listdir(RMdupDir):
        if (fnmatch.fnmatch(filename, '*.bam') and (not fnmatch.fnmatch(filename,"*.bam.bai"))):
                relFilenames.append(filename)
relFilenames=sorted(relFilenames)
######################################################


relFilenames=[x[:-10] for x in relFilenames]
plt.xticks(np.arange(len(freqs[0])),relFilenames,size='small',rotation='vertical')        
        
#annotate
ax.set_xlim(-1,len(freqs[0])+1)
ax.set_ylim(-0.1,1.1)
ax.set_xlabel('sample names')
ax.set_ylabel('avg in red rectangles')
fig.set_size_inches(16,6)

Positions

In [10]:
from pandas import DataFrame as df
import os
import numpy as np

#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts_high_freq/'
chrom_pos=[]
for filename in os.listdir(inputdir):
    f=open(inputdir+filename)
    for line in f:
        chrom_pos.append(line.split(' ')[0:2])
    f.close()
    

chrom_pos=df(chrom_pos)
chrom_pos.columns=['chr','pos']

#plot it on chromosomes
for chrom in np.unique(np.array(chrom_pos['chr'])):
    fig,ax=plt.subplots()
    ax.plot(chrom_pos[(chrom_pos['chr']==chrom)]['pos'],np.ones(len(chrom_pos[chrom_pos['chr']==chrom])),'o')
    ax.set_title('chr'+chrom+', number of points in rectangle='+str(len(chrom_pos[chrom_pos['chr']==chrom])))
    ax.set_xlabel('position')
    fig.set_size_inches(12,1)
    
print 'There are ',len(chrom_pos['pos']),' high frequency mutations'
There are  142  high frequency mutations

Conclusion

  • Lot of them are on chr2 which is because of triploidy
  • Sample scene is also not homogene

What is this?

In [12]:
#plot
fig,ax=plt.subplots()
cax = ax.imshow(m,interpolation='none',norm=LogNorm(),cmap=plt.cm.jet,origin='lower')
cbar=fig.colorbar(cax,shrink=0.8)

#annotate
ax.set_title('WT vs OTHERS avg freq, on all positions')
ax.set_xlabel('OTHER SAMPLES AVG FREQ x1000')
ax.set_ylabel('WT SAMPLES AVG FREQ x1000')
fig.set_size_inches(12,12)

#mark interesting
ax.add_patch(plt.Rectangle((720,450),120,120, fc='none',ec='r',lw=5))
Out[12]:
<matplotlib.patches.Rectangle at 0x6a93890>
In [26]:
%%writefile find_rect.py
#!/usr/bin/python

#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os

#input ouput files
input_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
output_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_mut_center/'
subprocess.call(['mkdir',output_dir])

#which file to run on come in cmdline arg
input_fname=sys.argv[1]

#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
        if (fnmatch.fnmatch(fname, '*.bam')
            and not fnmatch.fnmatch(fname,"*.bam.bai")):
                fnames.append(fname)
fnames=sorted(fnames)

#select the group samples set
group=['Sample1_RMdup.bam','Sample4_RMdup.bam'] #R1
for i in range(1,8)+range(9,12): #R2
    group.append('DS'+str(i)+'_RMdup.bam')
for i in range(41,47)+['47R']+range(48,51): #R3
    group.append('DS'+str(i)+'_RMdup.bam')
for i in [73,74]: #R4test
    group.append('DS'+str(i)+'_RMdup.bam')

#create array to index into numpy arrays
group_bool,else_bool,group=[],[],set(group)
for sample in fnames:
    group_bool.append(sample in group)
    else_bool.append(not (sample in group))
group_bool,else_bool=np.array(group_bool),np.array(else_bool)


#filter for pileup lines
# there is a # in the beggining of Orsis format lines
cmd_filt_pup_lines= ' grep -v \'#\' '+ input_dir+input_fname

#run the pipeline
output_f=open(output_dir + input_fname,'w')
from subprocess import Popen, PIPE
p = Popen(cmd_filt_pup_lines, stdout=PIPE, bufsize=1,shell=True)
with p.stdout:
    for line in iter(p.stdout.readline, b''):
        #parse line
        linelist=line.strip().upper().split(' ')
        covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
        bases=linelist[4::2]
        ref_count=[]
        for i in xrange(len(bases)):
            ref_count.append(len(re.findall('[\.\,]',bases[i]))) 
        ref_freq=np.array(ref_count,dtype=np.double)/covs
        #calculate group freqs and save in matrix
        group_freq=np.mean(ref_freq[group_bool])
        else_freq=np.mean(ref_freq[else_bool])        
        #select the red rectangle
        #select the red rectangle
        if ( group_freq >= 0.45 and group_freq <= 0.57 and else_freq >= 0.72 and else_freq <= 0.84):
            #print the the chr, pos and freqs
            output_f.write(linelist[0]+' '+linelist[1]+' '+' '.join(map(str,ref_freq))+'\n')
p.wait() # wait for the subprocess to exit
output_f.close()
Writing find_rect.py
In [27]:
inputDir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
for filename in os.listdir(inputDir):
    print subprocess.call([ 'sbatch', '--mem',str(1000),'./find_rect.py' ,filename]),
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
In [30]:
print subprocess.check_output('squeue')
  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
  20284 eszakigri SIM_11_2  ragraat   R   21:20:34      1 jimgray85
  20975 eszakigri SIM_11_2  ragraat   R   19:31:09      1 jimgray84

In [31]:
print subprocess.call('rm slurm*',shell=True),
print subprocess.call(['rm','find_rect.py']),
0 0
In [13]:
#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_mut_center/'
samples=[]
freqs=[]
for filename in os.listdir(inputdir):
    f=open(inputdir+filename)
    for line in f:
        samples.append(range(len(line.split(' ')[2:])))
        freqs.append(map(float,line.split(' ')[2:]))
    f.close()
    
freqs=np.mean(freqs,axis=0)
samples=np.mean(samples,axis=0)

#plot
fig,ax=plt.subplots()
cax = ax.plot(samples,freqs,'o',mec='none')

#set ticks 
#####################################################
#filenames for samplenames
RMdupDir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
inFilenames,relFilenames=[],[]
for filename in os.listdir(RMdupDir):
        if (fnmatch.fnmatch(filename, '*.bam') and (not fnmatch.fnmatch(filename,"*.bam.bai"))):
                relFilenames.append(filename)
relFilenames=sorted(relFilenames)
######################################################

relFilenames=[x[:-10] for x in relFilenames]
plt.xticks(np.arange(len(freqs)),relFilenames,size='small',rotation='vertical')        
        
#annotate
ax.set_xlim(-1,len(freqs)+1)
ax.set_ylim(0,1.1)
ax.set_xlabel('sample names')
ax.set_ylabel('avg frequency in WT specific, high frequency mutations')
fig.set_size_inches(16,6)
In [14]:
#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_mut_center/'
chrom_pos=[]
for filename in os.listdir(inputdir):
    f=open(inputdir+filename)
    for line in f:
        chrom_pos.append(line.split(' ')[0:2])
    f.close()
    

chrom_pos=df(chrom_pos)
chrom_pos.columns=['chr','pos']

#plot it on chromosomes
for chrom in np.unique(np.array(chrom_pos['chr'])):
    fig,ax=plt.subplots()
    ax.plot(chrom_pos[(chrom_pos['chr']==chrom)]['pos'],np.ones(len(chrom_pos[chrom_pos['chr']==chrom])),'o')
    ax.set_title('chr'+chrom+', number of points in rectangle='+str(len(chrom_pos[chrom_pos['chr']==chrom])))
    ax.set_xlabel('position')
    fig.set_size_inches(12,1)
    
#save it
chrom_pos.to_csv('wt_spec_mut.csv',sep='\t',index=False)
print 'There are ',len(chrom_pos['pos']),' close tot he center mutations'
There are  2072  close tot he center mutations

Conclusion

  • These positions are acutally a BRCA LOH chr26, chr23

There is also something strange

  • WT LOH-s dont go to 0% or 100, and the whole picture is skewed
In [15]:
#plot
fig,ax=plt.subplots()
cax = ax.imshow(m,interpolation='none',norm=LogNorm(),cmap=plt.cm.jet,origin='lower')
cbar=fig.colorbar(cax,shrink=0.8)

#annotate
ax.set_title('WT vs OTHERS avg freq, on all positions')
ax.set_xlabel('OTHER SAMPLES AVG FREQ x1000')
ax.set_ylabel('WT SAMPLES AVG FREQ x1000')
fig.set_size_inches(12,12)

rect=plt.Rectangle((300, 0), 150,70, fc='none',ec='r',lw=5)
rect2=plt.Rectangle((550, 930), 150,70, fc='none',ec='r',lw=5)
ax.add_patch(rect)
ax.add_patch(rect2)
Out[15]:
<matplotlib.patches.Rectangle at 0x7f35c4f61d10>

Find positions in the lower red rectangle and save the freqs

In [17]:
%%writefile find_rect.py
#!/usr/bin/python

#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os

#input ouput files
input_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
output_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/loh_anomaly/'
subprocess.call(['mkdir',output_dir])

#which file to run on come in cmdline arg
input_fname=sys.argv[1]

#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
        if (fnmatch.fnmatch(fname, '*.bam')
            and not fnmatch.fnmatch(fname,"*.bam.bai")):
                fnames.append(fname)
fnames=sorted(fnames)

#select the group samples set
group=['Sample1_RMdup.bam','Sample4_RMdup.bam'] #R1
for i in range(1,8)+range(9,12): #R2
    group.append('DS'+str(i)+'_RMdup.bam')
for i in range(41,47)+['47R']+range(48,51): #R3
    group.append('DS'+str(i)+'_RMdup.bam')
for i in [73,74]: #R4test
    group.append('DS'+str(i)+'_RMdup.bam')

#create array to index into numpy arrays
group_bool,else_bool,group=[],[],set(group)
for sample in fnames:
    group_bool.append(sample in group)
    else_bool.append(not (sample in group))
group_bool,else_bool=np.array(group_bool),np.array(else_bool)


#filter for pileup lines
# there is a # in the beggining of Orsis format lines
cmd_filt_pup_lines= ' grep -v \'#\' '+ input_dir+input_fname

#run the pipeline
output_f=open(output_dir + input_fname,'w')
from subprocess import Popen, PIPE
p = Popen(cmd_filt_pup_lines, stdout=PIPE, bufsize=1,shell=True)
with p.stdout:
    for line in iter(p.stdout.readline, b''):
        #parse line
        linelist=line.strip().upper().split(' ')
        covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
        bases=linelist[4::2]
        ref_count=[]
        for i in xrange(len(bases)):
            ref_count.append(len(re.findall('[\.\,]',bases[i]))) 
        ref_freq=np.array(ref_count,dtype=np.double)/covs
        #calculate group freqs and save in matrix
        group_freq=np.mean(ref_freq[group_bool])
        else_freq=np.mean(ref_freq[else_bool])        
        #select the red rectangle
        #select the red rectangle
        #select the red rectangle
        if ( group_freq >= 0.0 and group_freq <= 0.07 and else_freq >= 0.3 and else_freq <= 0.45):
            #print the the chr, pos and freqs
            output_f.write(linelist[0]+' '+linelist[1]+' '+' '.join(map(str,ref_freq))+'\n')
p.wait() # wait for the subprocess to exit
output_f.close()
Writing find_rect.py
In [19]:
inputDir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
for filename in os.listdir(inputDir):
    print subprocess.call([ 'sbatch', '--mem',str(1000),'./find_rect.py' ,filename]),
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
In [23]:
print subprocess.check_output('squeue')
  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
  20284 eszakigri SIM_11_2  ragraat   R   23:55:36      1 jimgray85
  20975 eszakigri SIM_11_2  ragraat   R   22:06:11      1 jimgray84

In [24]:
print subprocess.call('rm slurm*',shell=True),
print subprocess.call(['rm','find_rect.py']),
0 0

Load the results

sample frequencies

In [25]:
import numpy as np
import os
import fnmatch
from matplotlib.patches import Ellipse

#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/loh_anomaly/'
samples=[]
freqs=[]
for filename in os.listdir(inputdir):
    f=open(inputdir+filename)
    for line in f:
        samples.append(range(len(line.split(' ')[2:])))
        freqs.append(map(float,line.split(' ')[2:]))
    f.close()
    
freqs=np.mean(freqs,axis=0)
samples=np.mean(samples,axis=0)

#plot
fig,ax=plt.subplots()
cax = ax.plot(samples,freqs,'o',mec='none')

#set ticks 
#####################################################
#filenames for samplenames
RMdupDir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
inFilenames,relFilenames=[],[]
for filename in os.listdir(RMdupDir):
        if (fnmatch.fnmatch(filename, '*.bam') and (not fnmatch.fnmatch(filename,"*.bam.bai"))):
                relFilenames.append(filename)
relFilenames=sorted(relFilenames)
######################################################

relFilenames=[x[:-10] for x in relFilenames]
plt.xticks(np.arange(len(freqs)),relFilenames,size='small',rotation='vertical')        
        
#annotate
ax.set_xlim(-1,len(freqs)+1)
ax.set_ylim(-0.1,0.6)
ax.set_xlabel('sample names')
ax.set_ylabel('avg in red rectangles')
fig.set_size_inches(16,6)

#mark interesting
ax.add_patch(Ellipse(xy=(55,0.5),width=1.5,height=0.05,angle=0,fc='none',ec='r',lw=5))
ax.add_patch(Ellipse(xy=(58,0.41),width=1.5,height=0.05,angle=0,fc='none',ec='r',lw=5))
Out[25]:
<matplotlib.patches.Ellipse at 0x50e1b90>

Positions

In [26]:
from pandas import DataFrame as df 

#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/loh_anomaly/'
chrom_pos=[]
for filename in os.listdir(inputdir):
    f=open(inputdir+filename)
    for line in f:
        chrom_pos.append(line.split(' ')[0:2])
    f.close()
    
chrom_pos=df(np.array(chrom_pos))
chrom_pos.columns=['chr','pos']
chroms=np.unique(np.array(chrom_pos['chr']))

for chrom in chroms:
    fig,ax=plt.subplots()
    ax.plot(chrom_pos[(chrom_pos['chr']==chrom)]['pos'],np.ones(len(chrom_pos[chrom_pos['chr']==chrom])),'x')
    ax.set_title('chr'+chrom+', number of points in rectangle='+str(len(chrom_pos[chrom_pos['chr']==chrom])))
    ax.set_xlabel('position')
    fig.set_size_inches(16,2)

Conclusion:

  • LOH happened on chr21 (after first round??) in WT, It is also present is PCNA

But PCNA has them too:

  • either it happened independently in PCNA, and in WT after first round
  • Or it happened earlier but because of heterogeneity of cells, first round was chosen from the not LOH population, but the predecessor of later round was chosen from the LOH population

We have already seen this LOH when looking at BRCA data


OVERALL CONCLUSION

WT specific mutations

  • there are no shared mutation between all the WT samples
  • R2,3,4 has shared mutations (~100) (these happened after recloning WT after R1 )

Anomalies

  • more or less explained