I will plot the number false mutations found, depending on the p-value threshold. Actually this is very similar to a cumulative distribtuion of somatic p-values.
The curves look very similar, using 1-2 samples should be enough. (especialy if we decide not to use all the samples). I just used more samples, to show the generality of the pattern.
#load modules
import os
import subprocess
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
#some settings
matplotlib.rc('font', size=12)
matplotlib.rcParams['lines.linewidth'] = 3
matplotlib.rcParams['figure.figsize'] =(16,9)
#go to working directory, where the data is
os.chdir("/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/varscan")
#create sql util function
def run_sqlilte3(command,db,remove_db=False,output=None):
start=time.time()
with open('tempf.sql','w') as tempf:
tempf.write(command)
if (remove_db):
subprocess.check_output('rm '+db, shell=True)
if (output == None):
try:
print subprocess.check_output('/usr/bin/sqlite3 '+ db + ' < tempf.sql ', shell=True,
stderr=subprocess.STDOUT)
except subprocess.CalledProcessError, e:
print e.output
else:
try:
print subprocess.check_output('/usr/bin/sqlite3 '+ db + ' < tempf.sql > '+ output, shell=True,
stderr=subprocess.STDOUT)
except subprocess.CalledProcessError, e:
print e.output
subprocess.check_output(['rm','tempf.sql'])
print 'It took',time.time()-start,'s'
#funtction to load sorted pvals into pd dataframe
def get_p(fname,pvals):
#load table into db
run_sqlilte3('''
.separator "\t"
CREATE TABLE '''+fname+'''(
chrom TEXT,
position INTEGER, --!!!
ref TEXT,
var TEXT,
normal_reads1 TEXT,
normal_reads2 TEXT,
normal_var_freq TEXT,
normal_gt TEXT,
tumor_reads1 TEXT,
tumor_reads2 TEXT,
tumor_var_freq TEXT,
tumor_gt TEXT,
somatic_status TEXT,
variant_p_value TEXT,
--!!! its not reading normal forms automatically well (1e-10 etc)
somatic_p_value NUMERIC,
tumor_reads1_plus TEXT,
tumor_reads1_minus TEXT,
tumor_reads2_plus TEXT,
tumor_reads2_minus TEXT,
normal_reads1_plus TEXT,
normal_reads1_minus TEXT,
normal_reads2_plus TEXT,
normal_reads2_minus TEXT
);
.import '''+fname+'''.vsc.snp '''+fname+'''
--delete headers
DELETE FROM '''+fname+''' WHERE chrom='chrom';
''',db='ident_pair_db')
#get the sorted p-values
run_sqlilte3('''
.separator "\t"
SELECT somatic_p_value
FROM '''+fname+'''
WHERE somatic_status='Somatic'
ORDER BY somatic_p_value;
''',db='ident_pair_db',output=fname+'_p.csv')
#load into pd dataframe
pvals[fname]=pd.read_csv(fname+'_p.csv',sep='\t',names=['pval'],header=None)
return
# get pvals for the untreated and identical samples
pvals=dict()
get_p('Sample4_Sample1',pvals)
get_p('Sample5_Sample2',pvals)
get_p('Sample6_Sample3',pvals)
get_p('DS1_DS2',pvals)
get_p('DS14_DS15',pvals)
get_p('DS26_DS27',pvals)
get_p('DS33_DS34',pvals)
get_p('DS41_DS50',pvals)
get_p('DS50_DS41',pvals)
fig,ax=plt.subplots()
fig.set_size_inches(12,9)
for key,value in pvals.iteritems():
ax.plot(value['pval'],np.arange(len(value)),lw=2,label=key)
ax.axvline(0.05,c='dodgerblue',linestyle='dotted',lw=5,label='varscan default = 0.05')
ax.axvline(0.009,c='dodgerblue',linestyle='dashed',lw=5,label='advised by article = 0.009')
ax.set_xlabel('Somatic p-value threshold')
ax.set_ylabel('False mutations found')
ax.set_xlim(0.1,5e-4)
ax.set_ylim(0,2.5e4)
ax.set_xscale('log')
ax.grid()
dump=ax.legend(loc='upper right',fancybox='true')
fig,ax=plt.subplots()
fig.set_size_inches(12,9)
for key,value in pvals.iteritems():
ax.plot(value['pval'],np.arange(len(value)),lw=2,label=key)
ax.axvline(0.05,c='dodgerblue',linestyle='dotted',lw=5,label='varscan default = 0.05')
ax.axvline(0.009,c='dodgerblue',linestyle='dashed',lw=5,label='advised by article = 0.009')
ax.set_xlabel('Somatic p-value threshold')
ax.set_ylabel('False mutations found')
ax.set_xlim(0.1,5e-7)
ax.set_ylim(1,1e5)
ax.set_xscale('log')
ax.set_yscale('log')
ax.grid()
dump=ax.legend(loc='lower right',fancybox='true')