The test set was constructed as an independently validated set of SNPs to be used for testing the SNP calling method. It gives us the opportunity to analyze a set of high-confidence SNPs, which were collected using minimal filtering criteria (they were taken from a clearly and naturally separated cluster).
The power of the mutation calling process can be evaluated only using these positions.
The specificity needs to be addressed using the whole genome. Although it is not yet entirely clear how to do this properly.
The strictest thresholds that do not cripple the sensitivity of the pipeline can be proposed as iteration 0 of the optimized thresholds. Later this needs to be refined, taking into account both the sensitivity and the specificity of the method.
# Toggle code: render an HTML button in the notebook that shows/hides all
# input (code) cells via jQuery, so the report can be read as prose.
# Cells start visible; each click flips the state. Classic-Notebook only.
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
import matplotlib
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
# Load the pileup table of validated heterozygous SNP positions.
# Layout (space separated): chrom, pos, ref base, then a (coverage, bases)
# column pair for every sample.
full_testset = pd.read_csv(
    '/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/BRCA1_vs_all/het_snp/all.pup',
    header=None, sep=' ')

# Coverage matrix: one row per position, one column per sample.
# Coverage columns are every 2nd column starting at index 3; slicing them out
# in one step replaces the original per-column copy loop (which also relied on
# the Python-2-only xrange). astype matches the original float matrix dtype.
covs = full_testset.iloc[:, 3::2].values.astype(np.float64)
# Histogram of per-sample coverage over all test-set positions.
flat_covs = covs.flatten()
counts, edges = np.histogram(flat_covs, bins=range(0, int(np.max(flat_covs))))
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_xlabel('coverage')
ax.set_ylabel('count')
ax.set_xlim(0, 60)
cax = ax.bar(edges[:-1], counts, color='magenta', edgecolor='white')
# Histogram of the minimum coverage across all samples at each position.
min_covs = np.min(covs, axis=1)
counts, edges = np.histogram(min_covs, bins=range(0, int(max(min_covs))))
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_xlabel('minimum coverage among samples')
ax.set_ylabel('count')
ax.set_xlim(0, 20)
cax = ax.bar(edges[:-1] - 0.5, counts, color='lightblue', edgecolor='white')
#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os
# Directory whose de-duplicated BAM file names double as sample names.
rm_dup_dir = '/nagyvinyok/adat83/sotejedlik/orsi/bam_all_links'

# Collect the .bam file names (excluding .bai index files), sorted so the
# column order is stable and reproducible.
fnames = sorted(
    f for f in os.listdir(rm_dup_dir)
    if fnmatch.fnmatch(f, '*.bam') and not fnmatch.fnmatch(f, '*.bai'))

# Assemble the BRCA1 sample group from its four replicate batches.
group = ['Sample2_RMdup_picard_realign.bam',
         'Sample5_RMdup_picard_realign.bam']  # R1
group += ['DS0' + str(i) + '_RMdup_picard_realign.bam'
          for i in [14, 15, 16, 18]]          # R2
group += ['DS0' + str(i) + '_RMdup_picard_realign.bam'
          for i in range(51, 59)]             # R3
group += ['DS' + str(i) + '_RMdup_picard_realign.bam'
          for i in [101, 102, 103]]           # R4

# Boolean masks over the sorted sample list: group members vs. everyone else.
group = set(group)
group_bool = np.array([s in group for s in fnames])
else_bool = np.array([s not in group for s in fnames])
# Per position: total number of non-reference bases aggregated over the
# non-BRCA1 ("else") samples.
non_brca_non_ref_count = []
# 'with' closes the pileup file even if parsing raises (the original's
# explicit close() was skipped on any exception).
with open('/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/BRCA1_vs_all/het_snp/all.pup') as in_f:
    for line in in_f:
        # Fields: chrom pos ref, then (coverage, bases) per sample.
        linelist = line.strip().upper().split(' ')
        # Per-sample coverages are every 2nd field from index 3; the list
        # comprehension works on both Python 2 and 3 (map() alone does not).
        covs = np.array([int(c) for c in linelist[3::2]], dtype=np.int32)
        bases = linelist[4::2]
        # In pileup base strings '.' and ',' mark reference-matching reads.
        ref_count = np.array([len(re.findall(r'[\.\,]', b)) for b in bases],
                             dtype=np.int32)
        # Sum of mismatching bases over the non-group samples.
        non_brca_non_ref_count.append(
            np.sum(covs[else_bool] - ref_count[else_bool]))
# Histogram of the aggregated non-reference base counts in the other samples.
counts, edges = np.histogram(
    non_brca_non_ref_count,
    bins=range(0, int(max(non_brca_non_ref_count))))
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_xlabel('number of non reference bases aggregated in other samples')
ax.set_ylabel('count')
ax.set_xlim(-1, 7)
cax = ax.bar(edges[:-1] - 0.5, counts, color='lightgreen', edgecolor='white')
#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os
# Directory whose de-duplicated BAM file names double as sample names.
rm_dup_dir = '/nagyvinyok/adat83/sotejedlik/orsi/bam_all_links'

# Collect the .bam file names (excluding .bai index files), sorted so the
# column order is stable and reproducible.
fnames = sorted(
    f for f in os.listdir(rm_dup_dir)
    if fnmatch.fnmatch(f, '*.bam') and not fnmatch.fnmatch(f, '*.bai'))

# Assemble the BRCA1 sample group from its four replicate batches.
group = ['Sample2_RMdup_picard_realign.bam',
         'Sample5_RMdup_picard_realign.bam']  # R1
group += ['DS0' + str(i) + '_RMdup_picard_realign.bam'
          for i in [14, 15, 16, 18]]          # R2
group += ['DS0' + str(i) + '_RMdup_picard_realign.bam'
          for i in range(51, 59)]             # R3
group += ['DS' + str(i) + '_RMdup_picard_realign.bam'
          for i in [101, 102, 103]]           # R4

# Boolean masks over the sorted sample list: group members vs. everyone else.
group = set(group)
group_bool = np.array([s in group for s in fnames])
else_bool = np.array([s not in group for s in fnames])
# Per position: mean non-reference base frequency across the non-BRCA1
# ("else") samples.
non_brca_non_ref_freq = []
# 'with' guarantees the pileup file is closed (the original never closed it).
with open('/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/BRCA1_vs_all/het_snp/all.pup') as in_f:
    for line in in_f:
        # Fields: chrom pos ref, then (coverage, bases) per sample.
        linelist = line.strip().upper().split(' ')
        covs = np.array([int(c) for c in linelist[3::2]], dtype=np.int32)
        bases = linelist[4::2]
        # In pileup base strings '.' and ',' mark reference-matching reads.
        ref_count = np.array([len(re.findall(r'[\.\,]', b)) for b in bases],
                             dtype=np.int32)
        # Mean fraction of non-reference bases over the non-group samples.
        # NOTE(review): a zero coverage in any 'else' sample divides by zero
        # here (same as the original) — presumably filtered upstream; verify.
        non_brca_non_ref_freq.append(np.mean(
            (covs[else_bool] - ref_count[else_bool])
            / np.float32(covs[else_bool])))
# Histogram of the averaged non-reference base frequencies (bin width 1/5000).
counts, edges = np.histogram(non_brca_non_ref_freq,
                             bins=np.arange(100) / float(5000))
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_xlabel('frequency of non reference bases averaged in other samples')
ax.set_ylabel('count')
ax.set_xlim(0, 0.0025)
cax = ax.bar(edges[:-1], counts, width=0.8 * 1 / float(5000),
             color='purple', edgecolor='white')
#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os
# Directory whose de-duplicated BAM file names double as sample names.
rm_dup_dir = '/nagyvinyok/adat83/sotejedlik/orsi/bam_all_links'

# Collect the .bam file names (excluding .bai index files), sorted so the
# column order is stable and reproducible.
fnames = sorted(
    f for f in os.listdir(rm_dup_dir)
    if fnmatch.fnmatch(f, '*.bam') and not fnmatch.fnmatch(f, '*.bai'))

# Assemble the BRCA1 sample group from its four replicate batches.
group = ['Sample2_RMdup_picard_realign.bam',
         'Sample5_RMdup_picard_realign.bam']  # R1
group += ['DS0' + str(i) + '_RMdup_picard_realign.bam'
          for i in [14, 15, 16, 18]]          # R2
group += ['DS0' + str(i) + '_RMdup_picard_realign.bam'
          for i in range(51, 59)]             # R3
group += ['DS' + str(i) + '_RMdup_picard_realign.bam'
          for i in [101, 102, 103]]           # R4

# Boolean masks over the sorted sample list: group members vs. everyone else.
group = set(group)
group_bool = np.array([s in group for s in fnames])
else_bool = np.array([s not in group for s in fnames])
# Per position: the maximum non-reference base count observed in any single
# non-BRCA1 ("else") sample.
non_brca_non_ref_maxcount = []
# 'with' guarantees the pileup file is closed (the original never closed it).
with open('/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/BRCA1_vs_all/het_snp/all.pup') as in_f:
    for line in in_f:
        # Fields: chrom pos ref, then (coverage, bases) per sample.
        linelist = line.strip().upper().split(' ')
        covs = np.array([int(c) for c in linelist[3::2]], dtype=np.int32)
        bases = linelist[4::2]
        # In pileup base strings '.' and ',' mark reference-matching reads.
        ref_count = np.array([len(re.findall(r'[\.\,]', b)) for b in bases],
                             dtype=np.int32)
        # Worst single 'else' sample at this position.
        non_brca_non_ref_maxcount.append(
            np.max(covs[else_bool] - ref_count[else_bool]))
# Histogram of the per-position maximum non-reference base count among the
# other samples.
counts, edges = np.histogram(non_brca_non_ref_maxcount, bins=np.arange(10))
fig, ax = plt.subplots(figsize=(12, 6))
# Fixed label: the plotted quantity is a count, not a frequency (the original
# label said "maximum frequency", copied from the next cell).
ax.set_xlabel('maximum number of non reference bases among other samples')
ax.set_ylabel('count')
ax.set_xlim(-1, 4)
cax = ax.bar(edges[:-1] - 0.4, counts, width=0.8,
             color='violet', edgecolor='white')
#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os
# Directory whose de-duplicated BAM file names double as sample names.
rm_dup_dir = '/nagyvinyok/adat83/sotejedlik/orsi/bam_all_links'

# Collect the .bam file names (excluding .bai index files), sorted so the
# column order is stable and reproducible.
fnames = sorted(
    f for f in os.listdir(rm_dup_dir)
    if fnmatch.fnmatch(f, '*.bam') and not fnmatch.fnmatch(f, '*.bai'))

# Assemble the BRCA1 sample group from its four replicate batches.
group = ['Sample2_RMdup_picard_realign.bam',
         'Sample5_RMdup_picard_realign.bam']  # R1
group += ['DS0' + str(i) + '_RMdup_picard_realign.bam'
          for i in [14, 15, 16, 18]]          # R2
group += ['DS0' + str(i) + '_RMdup_picard_realign.bam'
          for i in range(51, 59)]             # R3
group += ['DS' + str(i) + '_RMdup_picard_realign.bam'
          for i in [101, 102, 103]]           # R4

# Boolean masks over the sorted sample list: group members vs. everyone else.
group = set(group)
group_bool = np.array([s in group for s in fnames])
else_bool = np.array([s not in group for s in fnames])
# Per position: the maximum non-reference base frequency observed in any
# single non-BRCA1 ("else") sample.
non_brca_non_ref_maxfreq = []
# 'with' guarantees the pileup file is closed (the original never closed it).
with open('/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/BRCA1_vs_all/het_snp/all.pup') as in_f:
    for line in in_f:
        # Fields: chrom pos ref, then (coverage, bases) per sample.
        linelist = line.strip().upper().split(' ')
        covs = np.array([int(c) for c in linelist[3::2]], dtype=np.int32)
        bases = linelist[4::2]
        # In pileup base strings '.' and ',' mark reference-matching reads.
        ref_count = np.array([len(re.findall(r'[\.\,]', b)) for b in bases],
                             dtype=np.int32)
        # Noisiest single 'else' sample at this position.
        # NOTE(review): zero coverage in any 'else' sample divides by zero
        # here (same as the original) — presumably filtered upstream; verify.
        non_brca_non_ref_maxfreq.append(np.max(
            (covs[else_bool] - ref_count[else_bool])
            / np.float32(covs[else_bool])))
# Histogram of the per-position maximum non-reference base frequency among
# the other samples (bin width 1/40).
counts, edges = np.histogram(non_brca_non_ref_maxfreq,
                             bins=np.arange(100) / float(40))
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_xlabel('maximum frequency of non reference bases among other samples')
ax.set_ylabel('count')
ax.set_xlim(0, 0.2)
cax = ax.bar(edges[:-1], counts, width=0.8 * 1 / float(40),
             color='coral', edgecolor='white')
#import modules
import subprocess
import sys
import re
import numpy as np
import fnmatch
import os
# Directory whose de-duplicated BAM file names double as sample names.
rm_dup_dir = '/nagyvinyok/adat83/sotejedlik/orsi/bam_all_links'

# Collect the .bam file names (excluding .bai index files), sorted so the
# column order is stable and reproducible.
fnames = sorted(
    f for f in os.listdir(rm_dup_dir)
    if fnmatch.fnmatch(f, '*.bam') and not fnmatch.fnmatch(f, '*.bai'))

# Assemble the BRCA1 sample group from its four replicate batches.
group = ['Sample2_RMdup_picard_realign.bam',
         'Sample5_RMdup_picard_realign.bam']  # R1
group += ['DS0' + str(i) + '_RMdup_picard_realign.bam'
          for i in [14, 15, 16, 18]]          # R2
group += ['DS0' + str(i) + '_RMdup_picard_realign.bam'
          for i in range(51, 59)]             # R3
group += ['DS' + str(i) + '_RMdup_picard_realign.bam'
          for i in [101, 102, 103]]           # R4

# Boolean masks over the sorted sample list: group members vs. everyone else.
group = set(group)
group_bool = np.array([s in group for s in fnames])
else_bool = np.array([s not in group for s in fnames])
# Non-reference base frequency of every BRCA1-group sample at every position
# (flattened: one value per position/sample pair).
brca_non_ref_freq = []
# 'with' guarantees the pileup file is closed (the original never closed it).
with open('/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/BRCA1_vs_all/het_snp/all.pup') as in_f:
    for line in in_f:
        # Fields: chrom pos ref, then (coverage, bases) per sample.
        linelist = line.strip().upper().split(' ')
        covs = np.array([int(c) for c in linelist[3::2]], dtype=np.int32)
        bases = linelist[4::2]
        # In pileup base strings '.' and ',' mark reference-matching reads.
        ref_count = np.array([len(re.findall(r'[\.\,]', b)) for b in bases],
                             dtype=np.int32)
        # Fraction of non-reference bases in each group sample.
        brca_non_ref_freq.append(
            (covs[group_bool] - ref_count[group_bool])
            / np.float32(covs[group_bool]))
# Collapse the per-position arrays into one flat vector.
brca_non_ref_freq = np.array(brca_non_ref_freq).flatten()
# Histogram of the BRCA1-group non-reference base frequencies (bin width 1/31).
counts, edges = np.histogram(brca_non_ref_freq, bins=np.arange(100) / float(31))
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_xlabel('non reference base frequency of brca1 samples')
ax.set_ylabel('count')
ax.set_xlim(-0.05, 1)
cax = ax.bar(edges[:-1] - 0.5 / float(31), counts,
             width=0.8 * 1 / float(31),
             color='dodgerblue', edgecolor='white')
Minimum coverage of each sample (unfortunately, because the samples have different coverages, applying this filter can bias the number of mutations detected in each sample.)
Minimum coverage of all samples
Average noise in other samples
Maximum noise in each sample: