%%writefile wt_vs_all.py
#!/usr/bin/python
#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os
#input ouput files
input_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
output_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all'
subprocess.call(['mkdir',output_dir])
output_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/heatmap/'
subprocess.call(['mkdir',output_dir])
#which file to run on come in cmdline arg
input_fname=sys.argv[1]
#inpurt_fnames='7-0-10030365.pup'
#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
if (fnmatch.fnmatch(fname, '*.bam')
and not fnmatch.fnmatch(fname,"*.bam.bai")):
fnames.append(fname)
fnames=sorted(fnames)
#select the group samples set
group=['Sample1_RMdup.bam','Sample4_RMdup.bam'] #R1
for i in range(1,8)+range(9,12): #R2
group.append('DS'+str(i)+'_RMdup.bam')
for i in range(41,47)+['47R']+range(48,51): #R3
group.append('DS'+str(i)+'_RMdup.bam')
for i in [73,74]: #R4test
group.append('DS'+str(i)+'_RMdup.bam')
#create array to index into numpy arrays
group_bool,else_bool,group=[],[],set(group)
for sample in fnames:
group_bool.append(sample in group)
else_bool.append(not (sample in group))
group_bool,else_bool=np.array(group_bool),np.array(else_bool)
#filter for pileup lines
# there is a # in the beggining of Orsis format lines
cmd_filt_pup_lines= ' grep -v \'#\' '+ input_dir+input_fname
#matrix for heatmap
resolution=1000 #resolution hard coded !!!!!!!!
heat_mat=np.zeros((resolution+1,resolution+1),dtype=np.int32)
#run the pipeline
from subprocess import Popen, PIPE
p = Popen(cmd_filt_pup_lines, stdout=PIPE, bufsize=1,shell=True)
with p.stdout:
for line in iter(p.stdout.readline, b''):
#parse line
linelist=line.strip().upper().split(' ')
covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
bases=linelist[4::2]
ref_count=[]
for i in xrange(len(bases)):
ref_count.append(len(re.findall('[\.\,]',bases[i])))
ref_freq=np.array(ref_count,dtype=np.double)/covs
#calculate group freqs and save in matrix
group_freq=np.mean(ref_freq[group_bool])
else_freq=np.mean(ref_freq[else_bool])
heat_mat[int(resolution*group_freq),int(resolution*else_freq)]+=1
p.wait() # wait for the subprocess to exit
#save it
np.savetxt(output_dir + input_fname.split('.')[0]+'.mat',heat_mat,fmt='%d')
#select the group samples set
group=['Sample1_RMdup.bam','Sample4_RMdup.bam'] #R1
for i in range(1,8)+range(9,12): #R2
group.append('DS'+str(i)+'_RMdup.bam')
for i in range(41,47)+['47R']+range(48,51): #R3
group.append('DS'+str(i)+'_RMdup.bam')
for i in [73,74]: #R4test
group.append('DS'+str(i)+'_RMdup.bam')
print '\n'.join(group)
import os
import subprocess
inputDir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
for filename in os.listdir(inputDir):
print subprocess.call([ 'sbatch', '--mem',str(1000),'./wt_vs_all.py' ,filename]),
print subprocess.check_output('squeue')
print subprocess.call('rm slurm*',shell=True),
print subprocess.call(['rm','wt_vs_all.py']),
import os
from matplotlib.colors import LogNorm
import pandas as pd #insted of numpy for much faster csv loading
#import matplotlib as mpl
#mpl.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/heatmap/'
#load result matrices
m=pd.read_csv(inputdir+os.listdir(inputdir)[0],sep=' ',header=None)
for filename in os.listdir(inputdir):
m+=pd.read_csv(inputdir+filename,sep=' ',header=None)
#plot
fig,ax=plt.subplots()
cax = ax.imshow(m,interpolation='none',norm=LogNorm(),cmap=plt.cm.jet,origin='lower')
cbar=fig.colorbar(cax,shrink=0.8)
#annotate
ax.set_title('WT vs OTHERS avg freq, on all positions')
ax.set_xlabel('OTHER SAMPLES AVG FREQ x1000')
ax.set_ylabel('WT SAMPLES AVG FREQ x1000')
fig.set_size_inches(12,12)
import numpy as np
#plot
fig,ax=plt.subplots()
cax = ax.imshow(np.array(m)[:,900:],interpolation='none',extent=[900,1000,0,1000],
aspect=0.1,norm=LogNorm(),cmap=plt.cm.jet,origin='lower')
cbar=fig.colorbar(cax,shrink=0.8)
#annotate
ax.set_title('WT vs OTHERS avg freq, on all positions')
ax.set_xlabel('OTHER SAMPLES AVG FREQ x1000')
ax.set_ylabel('WT SAMPLES AVG FREQ x1000')
fig.set_size_inches(12,12)
rect=plt.Rectangle((990,470),10,150, fc='none',ec='r',lw=5)
ax.add_patch(rect)
%%writefile find_rect.py
#!/usr/bin/python
#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os
#input ouput files
input_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
output_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts/'
subprocess.call(['mkdir',output_dir])
#which file to run on come in cmdline arg
input_fname=sys.argv[1]
#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
if (fnmatch.fnmatch(fname, '*.bam')
and not fnmatch.fnmatch(fname,"*.bam.bai")):
fnames.append(fname)
fnames=sorted(fnames)
#select the group samples set
group=['Sample1_RMdup.bam','Sample4_RMdup.bam'] #R1
for i in range(1,8)+range(9,12): #R2
group.append('DS'+str(i)+'_RMdup.bam')
for i in range(41,47)+['47R']+range(48,51): #R3
group.append('DS'+str(i)+'_RMdup.bam')
for i in [73,74]: #R4test
group.append('DS'+str(i)+'_RMdup.bam')
#create array to index into numpy arrays
group_bool,else_bool,group=[],[],set(group)
for sample in fnames:
group_bool.append(sample in group)
else_bool.append(not (sample in group))
group_bool,else_bool=np.array(group_bool),np.array(else_bool)
#filter for pileup lines
# there is a # in the beggining of Orsis format lines
cmd_filt_pup_lines= ' grep -v \'#\' '+ input_dir+input_fname
#run the pipeline
output_f=open(output_dir + input_fname,'w')
from subprocess import Popen, PIPE
p = Popen(cmd_filt_pup_lines, stdout=PIPE, bufsize=1,shell=True)
with p.stdout:
for line in iter(p.stdout.readline, b''):
#parse line
linelist=line.strip().upper().split(' ')
covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
bases=linelist[4::2]
ref_count=[]
for i in xrange(len(bases)):
ref_count.append(len(re.findall('[\.\,]',bases[i])))
ref_freq=np.array(ref_count,dtype=np.double)/covs
#calculate group freqs and save in matrix
group_freq=np.mean(ref_freq[group_bool])
else_freq=np.mean(ref_freq[else_bool])
#select the red rectangle
if ( group_freq >= 0.47 and group_freq <= 0.63 and else_freq >= 0.99 and else_freq <= 1.0):
#print the the chr, pos and freqs
output_f.write(linelist[0]+' '+linelist[1]+' '+' '.join(map(str,ref_freq))+ '\n' )
p.wait() # wait for the subprocess to exit
output_f.close()
inputDir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
for filename in os.listdir(inputDir):
print subprocess.call([ 'sbatch', '--mem',str(1000),'./find_rect.py' ,filename]),
print subprocess.check_output('squeue')
print subprocess.call('rm slurm*',shell=True),
print subprocess.call(['rm','find_rect.py']),
import fnmatch
import os
#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts/'
samples=[]
freqs=[]
for filename in os.listdir(inputdir):
f=open(inputdir+filename)
for line in f:
samples.append(range(len(line.split(' ')[2:])))
freqs.append(map(float,line.split(' ')[2:]))
f.close()
freqs=np.mean(freqs,axis=0)
samples=np.mean(samples,axis=0)
#plot
fig,ax=plt.subplots()
cax = ax.plot(samples,freqs,'o',mec='none')
#plot 0.5
ax.axhline(0.5,c='r',lw=3)
#set ticks
#####################################################
#filenames for samplenames
RMdupDir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
inFilenames,relFilenames=[],[]
for filename in os.listdir(RMdupDir):
if (fnmatch.fnmatch(filename, '*.bam') and (not fnmatch.fnmatch(filename,"*.bam.bai"))):
relFilenames.append(filename)
relFilenames=sorted(relFilenames)
######################################################
relFilenames=[x[:-10] for x in relFilenames]
plt.xticks(np.arange(len(freqs)),relFilenames,size='small',rotation='vertical')
#annotate
ax.set_xlim(-1,len(freqs)+1)
ax.set_ylim(0,1.1)
ax.set_xlabel('sample names')
ax.set_ylabel('avg frequency in WT specific mutations')
fig.set_size_inches(16,6)
#mark interest
ax.add_patch(plt.Rectangle((54,0.95),7,0.1, fc='none',ec='r',lw=5))
#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts/'
samples=[]
freqs=[]
for filename in os.listdir(inputdir):
f=open(inputdir+filename)
for line in f:
samples.append(range(len(line.split(' ')[2:])))
freqs.append(map(float,line.split(' ')[2:]))
f.close()
#plot
fig,ax=plt.subplots()
cax = ax.plot(samples,freqs,'o',mec='none')
#set ticks
#####################################################
#filenames for samplenames
RMdupDir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
inFilenames,relFilenames=[],[]
for filename in os.listdir(RMdupDir):
if (fnmatch.fnmatch(filename, '*.bam') and (not fnmatch.fnmatch(filename,"*.bam.bai"))):
relFilenames.append(filename)
relFilenames=sorted(relFilenames)
######################################################
relFilenames=[x[:-10] for x in relFilenames]
plt.xticks(np.arange(len(relFilenames)),relFilenames,size='small',rotation='vertical')
#annotate
ax.set_xlim(-1,len(relFilenames)+1)
ax.set_ylim(0.,1.02)
ax.set_xlabel('sample names')
ax.set_ylabel('avg frequency in WT specific mutations')
fig.set_size_inches(16,6)
#mark interest
ax.add_patch(plt.Rectangle((56,0.4),5,0.2, fc='none',ec='r',lw=5))
#plot
fig,ax=plt.subplots()
cax = ax.plot(samples,freqs,'o',mec='none')
#plot 0.5
ax.axhline(0.5,c='r',lw=3)
#set ticks
#####################################################
#filenames for samplenames
RMdupDir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
inFilenames,relFilenames=[],[]
for filename in os.listdir(RMdupDir):
if (fnmatch.fnmatch(filename, '*.bam') and (not fnmatch.fnmatch(filename,"*.bam.bai"))):
relFilenames.append(filename)
relFilenames=sorted(relFilenames)
######################################################
relFilenames=[x[:-10] for x in relFilenames]
plt.xticks(np.arange(len(freqs)),relFilenames,size='small',rotation='vertical')
#annotate
ax.set_xlim(-1,len(freqs)+1)
ax.set_ylim(0.98,1.02)
ax.set_xlabel('sample names')
ax.set_ylabel('avg frequency in WT specific mutations')
fig.set_size_inches(16,6)
#mark interest
ax.add_patch(plt.Rectangle((57,0.988),2,0.005, fc='none',ec='r',lw=5))
#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts/'
samples=[]
freqs=[]
for filename in os.listdir(inputdir):
f=open(inputdir+filename)
for line in f:
samples.append(range(len(line.split(' ')[2:])))
freqs.append(map(float,line.split(' ')[2:]))
f.close()
#plot
fig,ax=plt.subplots()
cax = ax.plot(samples,freqs,'o',mec='none')
#set ticks
#####################################################
#filenames for samplenames
RMdupDir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
inFilenames,relFilenames=[],[]
for filename in os.listdir(RMdupDir):
if (fnmatch.fnmatch(filename, '*.bam') and (not fnmatch.fnmatch(filename,"*.bam.bai"))):
relFilenames.append(filename)
relFilenames=sorted(relFilenames)
######################################################
relFilenames=[x[:-10] for x in relFilenames]
plt.xticks(np.arange(len(relFilenames)),relFilenames,size='small',rotation='vertical')
#annotate
ax.set_xlim(-1,len(relFilenames)+1)
ax.set_ylim(0.9,1.02)
ax.set_xlabel('sample names')
ax.set_ylabel('avg frequency in WT specific mutations')
fig.set_size_inches(16,6)
#mark interest
ax.add_patch(plt.Rectangle((57,0.92),2,0.08, fc='none',ec='r',lw=5))
from pandas import DataFrame as df
import os
import numpy as np
#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts/'
chrom_pos=[]
for filename in os.listdir(inputdir):
f=open(inputdir+filename)
for line in f:
chrom_pos.append(line.split(' ')[0:2])
f.close()
chrom_pos=df(chrom_pos)
chrom_pos.columns=['chr','pos']
#plot it on chromosomes
for chrom in np.unique(np.array(chrom_pos['chr'])):
fig,ax=plt.subplots()
ax.plot(chrom_pos[(chrom_pos['chr']==chrom)]['pos'],np.ones(len(chrom_pos[chrom_pos['chr']==chrom])),'o')
ax.set_title('chr'+chrom+', number of points in rectangle='+str(len(chrom_pos[chrom_pos['chr']==chrom])))
ax.set_xlabel('position')
fig.set_size_inches(12,1)
#save it
chrom_pos.to_csv('wt_spec_mut.csv',sep='\t',index=False)
print 'There are ',len(chrom_pos['pos']),' WT specific mutations'
import numpy as np
#plot
fig,ax=plt.subplots()
cax = ax.imshow(np.array(m)[:,900:],interpolation='none',extent=[900,1000,0,1000],
aspect=0.1,norm=LogNorm(),cmap=plt.cm.jet,origin='lower')
cbar=fig.colorbar(cax,shrink=0.8)
#annotate
ax.set_title('WT vs OTHERS avg freq, on all positions')
ax.set_xlabel('OTHER SAMPLES AVG FREQ x1000')
ax.set_ylabel('WT SAMPLES AVG FREQ x1000')
fig.set_size_inches(12,12)
rect=plt.Rectangle((990,630),10,200, fc='none',ec='r',lw=5)
ax.add_patch(rect)
%%writefile find_rect.py
#!/usr/bin/python
#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os
#input ouput files
input_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
output_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts_high_freq/'
subprocess.call(['mkdir',output_dir])
#which file to run on come in cmdline arg
input_fname=sys.argv[1]
#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
if (fnmatch.fnmatch(fname, '*.bam')
and not fnmatch.fnmatch(fname,"*.bam.bai")):
fnames.append(fname)
fnames=sorted(fnames)
#select the group samples set
group=['Sample1_RMdup.bam','Sample4_RMdup.bam'] #R1
for i in range(1,8)+range(9,12): #R2
group.append('DS'+str(i)+'_RMdup.bam')
for i in range(41,47)+['47R']+range(48,51): #R3
group.append('DS'+str(i)+'_RMdup.bam')
for i in [73,74]: #R4test
group.append('DS'+str(i)+'_RMdup.bam')
#create array to index into numpy arrays
group_bool,else_bool,group=[],[],set(group)
for sample in fnames:
group_bool.append(sample in group)
else_bool.append(not (sample in group))
group_bool,else_bool=np.array(group_bool),np.array(else_bool)
#filter for pileup lines
# there is a # in the beggining of Orsis format lines
cmd_filt_pup_lines= ' grep -v \'#\' '+ input_dir+input_fname
#run the pipeline
output_f=open(output_dir + input_fname,'w')
from subprocess import Popen, PIPE
p = Popen(cmd_filt_pup_lines, stdout=PIPE, bufsize=1,shell=True)
with p.stdout:
for line in iter(p.stdout.readline, b''):
#parse line
linelist=line.strip().upper().split(' ')
covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
bases=linelist[4::2]
ref_count=[]
for i in xrange(len(bases)):
ref_count.append(len(re.findall('[\.\,]',bases[i])))
ref_freq=np.array(ref_count,dtype=np.double)/covs
#calculate group freqs and save in matrix
group_freq=np.mean(ref_freq[group_bool])
else_freq=np.mean(ref_freq[else_bool])
#select the red rectangle
#select the red rectangle
if ( group_freq >= 0.63 and group_freq <= 0.83 and else_freq >= 0.99 and else_freq <= 1.0):
#print the the chr, pos and freqs
output_f.write(linelist[0]+' '+linelist[1]+' '+' '.join(map(str,ref_freq))+'\n')
p.wait() # wait for the subprocess to exit
output_f.close()
inputDir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
for filename in os.listdir(inputDir):
print subprocess.call([ 'sbatch', '--mem',str(1000),'./find_rect.py' ,filename]),
print subprocess.check_output('squeue')
print subprocess.call('rm slurm*',shell=True),
print subprocess.call(['rm','find_rect.py']),
import numpy as np
import os
import fnmatch
#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts_high_freq/'
samples=[]
freqs=[]
for filename in os.listdir(inputdir):
f=open(inputdir+filename)
for line in f:
samples.append(range(len(line.split(' ')[2:])))
freqs.append(map(float,line.split(' ')[2:]))
f.close()
freqs=np.mean(freqs,axis=0)
samples=np.mean(samples,axis=0)
#plot
fig,ax=plt.subplots()
cax = ax.plot(samples,freqs,'o',mec='none')
#set ticks
#####################################################
#filenames for samplenames
RMdupDir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
inFilenames,relFilenames=[],[]
for filename in os.listdir(RMdupDir):
if (fnmatch.fnmatch(filename, '*.bam') and (not fnmatch.fnmatch(filename,"*.bam.bai"))):
relFilenames.append(filename)
relFilenames=sorted(relFilenames)
######################################################
relFilenames=[x[:-10] for x in relFilenames]
plt.xticks(np.arange(len(freqs)),relFilenames,size='small',rotation='vertical')
#annotate
ax.set_xlim(-1,len(freqs)+1)
ax.set_ylim(0,1.1)
ax.set_xlabel('sample names')
ax.set_ylabel('avg frequency in WT specific, high frequency mutations')
fig.set_size_inches(16,6)
import numpy as np
import os
import fnmatch
#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts_high_freq/'
samples=[]
freqs=[]
for filename in os.listdir(inputdir):
f=open(inputdir+filename)
for line in f:
samples.append(range(len(line.split(' ')[2:])))
freqs.append(map(float,line.split(' ')[2:]))
f.close()
#freqs=np.mean(freqs,axis=0)
#samples=np.mean(samples,axis=0)
#plot
fig,ax=plt.subplots()
cax = ax.plot(samples,freqs,'o',mec='none')
#set ticks
#####################################################
#filenames for samplenames
RMdupDir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
inFilenames,relFilenames=[],[]
for filename in os.listdir(RMdupDir):
if (fnmatch.fnmatch(filename, '*.bam') and (not fnmatch.fnmatch(filename,"*.bam.bai"))):
relFilenames.append(filename)
relFilenames=sorted(relFilenames)
######################################################
relFilenames=[x[:-10] for x in relFilenames]
plt.xticks(np.arange(len(freqs[0])),relFilenames,size='small',rotation='vertical')
#annotate
ax.set_xlim(-1,len(freqs[0])+1)
ax.set_ylim(-0.1,1.1)
ax.set_xlabel('sample names')
ax.set_ylabel('avg in red rectangles')
fig.set_size_inches(16,6)
from pandas import DataFrame as df
import os
import numpy as np
#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_muts_high_freq/'
chrom_pos=[]
for filename in os.listdir(inputdir):
f=open(inputdir+filename)
for line in f:
chrom_pos.append(line.split(' ')[0:2])
f.close()
chrom_pos=df(chrom_pos)
chrom_pos.columns=['chr','pos']
#plot it on chromosomes
for chrom in np.unique(np.array(chrom_pos['chr'])):
fig,ax=plt.subplots()
ax.plot(chrom_pos[(chrom_pos['chr']==chrom)]['pos'],np.ones(len(chrom_pos[chrom_pos['chr']==chrom])),'o')
ax.set_title('chr'+chrom+', number of points in rectangle='+str(len(chrom_pos[chrom_pos['chr']==chrom])))
ax.set_xlabel('position')
fig.set_size_inches(12,1)
print 'There are ',len(chrom_pos['pos']),' high frequency mutations'
#plot
fig,ax=plt.subplots()
cax = ax.imshow(m,interpolation='none',norm=LogNorm(),cmap=plt.cm.jet,origin='lower')
cbar=fig.colorbar(cax,shrink=0.8)
#annotate
ax.set_title('WT vs OTHERS avg freq, on all positions')
ax.set_xlabel('OTHER SAMPLES AVG FREQ x1000')
ax.set_ylabel('WT SAMPLES AVG FREQ x1000')
fig.set_size_inches(12,12)
#mark interesting
ax.add_patch(plt.Rectangle((720,450),120,120, fc='none',ec='r',lw=5))
%%writefile find_rect.py
#!/usr/bin/python
#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os
#input ouput files
input_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
output_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_mut_center/'
subprocess.call(['mkdir',output_dir])
#which file to run on come in cmdline arg
input_fname=sys.argv[1]
#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
if (fnmatch.fnmatch(fname, '*.bam')
and not fnmatch.fnmatch(fname,"*.bam.bai")):
fnames.append(fname)
fnames=sorted(fnames)
#select the group samples set
group=['Sample1_RMdup.bam','Sample4_RMdup.bam'] #R1
for i in range(1,8)+range(9,12): #R2
group.append('DS'+str(i)+'_RMdup.bam')
for i in range(41,47)+['47R']+range(48,51): #R3
group.append('DS'+str(i)+'_RMdup.bam')
for i in [73,74]: #R4test
group.append('DS'+str(i)+'_RMdup.bam')
#create array to index into numpy arrays
group_bool,else_bool,group=[],[],set(group)
for sample in fnames:
group_bool.append(sample in group)
else_bool.append(not (sample in group))
group_bool,else_bool=np.array(group_bool),np.array(else_bool)
#filter for pileup lines
# there is a # in the beggining of Orsis format lines
cmd_filt_pup_lines= ' grep -v \'#\' '+ input_dir+input_fname
#run the pipeline
output_f=open(output_dir + input_fname,'w')
from subprocess import Popen, PIPE
p = Popen(cmd_filt_pup_lines, stdout=PIPE, bufsize=1,shell=True)
with p.stdout:
for line in iter(p.stdout.readline, b''):
#parse line
linelist=line.strip().upper().split(' ')
covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
bases=linelist[4::2]
ref_count=[]
for i in xrange(len(bases)):
ref_count.append(len(re.findall('[\.\,]',bases[i])))
ref_freq=np.array(ref_count,dtype=np.double)/covs
#calculate group freqs and save in matrix
group_freq=np.mean(ref_freq[group_bool])
else_freq=np.mean(ref_freq[else_bool])
#select the red rectangle
#select the red rectangle
if ( group_freq >= 0.45 and group_freq <= 0.57 and else_freq >= 0.72 and else_freq <= 0.84):
#print the the chr, pos and freqs
output_f.write(linelist[0]+' '+linelist[1]+' '+' '.join(map(str,ref_freq))+'\n')
p.wait() # wait for the subprocess to exit
output_f.close()
inputDir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
for filename in os.listdir(inputDir):
print subprocess.call([ 'sbatch', '--mem',str(1000),'./find_rect.py' ,filename]),
print subprocess.check_output('squeue')
print subprocess.call('rm slurm*',shell=True),
print subprocess.call(['rm','find_rect.py']),
#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_mut_center/'
samples=[]
freqs=[]
for filename in os.listdir(inputdir):
f=open(inputdir+filename)
for line in f:
samples.append(range(len(line.split(' ')[2:])))
freqs.append(map(float,line.split(' ')[2:]))
f.close()
freqs=np.mean(freqs,axis=0)
samples=np.mean(samples,axis=0)
#plot
fig,ax=plt.subplots()
cax = ax.plot(samples,freqs,'o',mec='none')
#set ticks
#####################################################
#filenames for samplenames
RMdupDir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
inFilenames,relFilenames=[],[]
for filename in os.listdir(RMdupDir):
if (fnmatch.fnmatch(filename, '*.bam') and (not fnmatch.fnmatch(filename,"*.bam.bai"))):
relFilenames.append(filename)
relFilenames=sorted(relFilenames)
######################################################
relFilenames=[x[:-10] for x in relFilenames]
plt.xticks(np.arange(len(freqs)),relFilenames,size='small',rotation='vertical')
#annotate
ax.set_xlim(-1,len(freqs)+1)
ax.set_ylim(0,1.1)
ax.set_xlabel('sample names')
ax.set_ylabel('avg frequency in WT specific, high frequency mutations')
fig.set_size_inches(16,6)
#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/wt_spec_mut_center/'
chrom_pos=[]
for filename in os.listdir(inputdir):
f=open(inputdir+filename)
for line in f:
chrom_pos.append(line.split(' ')[0:2])
f.close()
chrom_pos=df(chrom_pos)
chrom_pos.columns=['chr','pos']
#plot it on chromosomes
for chrom in np.unique(np.array(chrom_pos['chr'])):
fig,ax=plt.subplots()
ax.plot(chrom_pos[(chrom_pos['chr']==chrom)]['pos'],np.ones(len(chrom_pos[chrom_pos['chr']==chrom])),'o')
ax.set_title('chr'+chrom+', number of points in rectangle='+str(len(chrom_pos[chrom_pos['chr']==chrom])))
ax.set_xlabel('position')
fig.set_size_inches(12,1)
#save it
chrom_pos.to_csv('wt_spec_mut.csv',sep='\t',index=False)
print 'There are ',len(chrom_pos['pos']),' close tot he center mutations'
#plot
fig,ax=plt.subplots()
cax = ax.imshow(m,interpolation='none',norm=LogNorm(),cmap=plt.cm.jet,origin='lower')
cbar=fig.colorbar(cax,shrink=0.8)
#annotate
ax.set_title('WT vs OTHERS avg freq, on all positions')
ax.set_xlabel('OTHER SAMPLES AVG FREQ x1000')
ax.set_ylabel('WT SAMPLES AVG FREQ x1000')
fig.set_size_inches(12,12)
rect=plt.Rectangle((300, 0), 150,70, fc='none',ec='r',lw=5)
rect2=plt.Rectangle((550, 930), 150,70, fc='none',ec='r',lw=5)
ax.add_patch(rect)
ax.add_patch(rect2)
%%writefile find_rect.py
#!/usr/bin/python
#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os
#input ouput files
input_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
output_dir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/loh_anomaly/'
subprocess.call(['mkdir',output_dir])
#which file to run on come in cmdline arg
input_fname=sys.argv[1]
#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
if (fnmatch.fnmatch(fname, '*.bam')
and not fnmatch.fnmatch(fname,"*.bam.bai")):
fnames.append(fname)
fnames=sorted(fnames)
#select the group samples set
group=['Sample1_RMdup.bam','Sample4_RMdup.bam'] #R1
for i in range(1,8)+range(9,12): #R2
group.append('DS'+str(i)+'_RMdup.bam')
for i in range(41,47)+['47R']+range(48,51): #R3
group.append('DS'+str(i)+'_RMdup.bam')
for i in [73,74]: #R4test
group.append('DS'+str(i)+'_RMdup.bam')
#create array to index into numpy arrays
group_bool,else_bool,group=[],[],set(group)
for sample in fnames:
group_bool.append(sample in group)
else_bool.append(not (sample in group))
group_bool,else_bool=np.array(group_bool),np.array(else_bool)
#filter for pileup lines
# there is a # in the beggining of Orsis format lines
cmd_filt_pup_lines= ' grep -v \'#\' '+ input_dir+input_fname
#run the pipeline
output_f=open(output_dir + input_fname,'w')
from subprocess import Popen, PIPE
p = Popen(cmd_filt_pup_lines, stdout=PIPE, bufsize=1,shell=True)
with p.stdout:
for line in iter(p.stdout.readline, b''):
#parse line
linelist=line.strip().upper().split(' ')
covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
bases=linelist[4::2]
ref_count=[]
for i in xrange(len(bases)):
ref_count.append(len(re.findall('[\.\,]',bases[i])))
ref_freq=np.array(ref_count,dtype=np.double)/covs
#calculate group freqs and save in matrix
group_freq=np.mean(ref_freq[group_bool])
else_freq=np.mean(ref_freq[else_bool])
#select the red rectangle
#select the red rectangle
#select the red rectangle
if ( group_freq >= 0.0 and group_freq <= 0.07 and else_freq >= 0.3 and else_freq <= 0.45):
#print the the chr, pos and freqs
output_f.write(linelist[0]+' '+linelist[1]+' '+' '.join(map(str,ref_freq))+'\n')
p.wait() # wait for the subprocess to exit
output_f.close()
inputDir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/list_orsi_RMdup_2/'
for filename in os.listdir(inputDir):
print subprocess.call([ 'sbatch', '--mem',str(1000),'./find_rect.py' ,filename]),
print subprocess.check_output('squeue')
print subprocess.call('rm slurm*',shell=True),
print subprocess.call(['rm','find_rect.py']),
import numpy as np
import os
import fnmatch
from matplotlib.patches import Ellipse
#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/loh_anomaly/'
samples=[]
freqs=[]
for filename in os.listdir(inputdir):
f=open(inputdir+filename)
for line in f:
samples.append(range(len(line.split(' ')[2:])))
freqs.append(map(float,line.split(' ')[2:]))
f.close()
freqs=np.mean(freqs,axis=0)
samples=np.mean(samples,axis=0)
#plot
fig,ax=plt.subplots()
cax = ax.plot(samples,freqs,'o',mec='none')
#set ticks
#####################################################
#filenames for samplenames
RMdupDir='/nagyvinyok/adat83/sotejedlik/ngs_data/enzimologia/RMdup/'
#collect filenames
inFilenames,relFilenames=[],[]
for filename in os.listdir(RMdupDir):
if (fnmatch.fnmatch(filename, '*.bam') and (not fnmatch.fnmatch(filename,"*.bam.bai"))):
relFilenames.append(filename)
relFilenames=sorted(relFilenames)
######################################################
relFilenames=[x[:-10] for x in relFilenames]
plt.xticks(np.arange(len(freqs)),relFilenames,size='small',rotation='vertical')
#annotate
ax.set_xlim(-1,len(freqs)+1)
ax.set_ylim(-0.1,0.6)
ax.set_xlabel('sample names')
ax.set_ylabel('avg in red rectangles')
fig.set_size_inches(16,6)
#mark interesting
ax.add_patch(Ellipse(xy=(55,0.5),width=1.5,height=0.05,angle=0,fc='none',ec='r',lw=5))
ax.add_patch(Ellipse(xy=(58,0.41),width=1.5,height=0.05,angle=0,fc='none',ec='r',lw=5))
from pandas import DataFrame as df
#load files
inputdir='/nagyvinyok/adat83/sotejedlik/dezso/dt40/snp/WT_vs_all/loh_anomaly/'
chrom_pos=[]
for filename in os.listdir(inputdir):
f=open(inputdir+filename)
for line in f:
chrom_pos.append(line.split(' ')[0:2])
f.close()
chrom_pos=df(np.array(chrom_pos))
chrom_pos.columns=['chr','pos']
chroms=np.unique(np.array(chrom_pos['chr']))
for chrom in chroms:
fig,ax=plt.subplots()
ax.plot(chrom_pos[(chrom_pos['chr']==chrom)]['pos'],np.ones(len(chrom_pos[chrom_pos['chr']==chrom])),'x')
ax.set_title('chr'+chrom+', number of points in rectangle='+str(len(chrom_pos[chrom_pos['chr']==chrom])))
ax.set_xlabel('position')
fig.set_size_inches(16,2)