False mutations found by Varscan


This is a suggestion for the section: "Trying standard methods on the samples".

  • I think we should show the bad specificity (high false postive rate) of the standard methods.
    • In the untreated or identical samples all mutations found by Varscan are false mutations, since in the untreated, and identical samples there should be no mutations which are not present in th germline pair used for the analysis.
    • It would a good point to show that Varscan, (GATK) finds tons of mutations in sample where it should find 0
  • I think we don't need to estimate the sensitivity (true positive rate) of these methods, since that is not the problem. The problem is only the specificity (false positive rate).

The suggestion for a plot:

I will plot the number false mutations found, depending on the p-value threshold. Actually this is very similar to a cumulative distribtuion of somatic p-values.


Notes:

The curves look very similar, using 1-2 samples should be enough. (especialy if we decide not to use all the samples). I just used more samples, to show the generality of the pattern.


In [23]:
#load modules

import os
import subprocess
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

#some settings
matplotlib.rc('font', size=12)
matplotlib.rcParams['lines.linewidth'] = 3
matplotlib.rcParams['figure.figsize'] =(16,9)
In [2]:
#go to working directory, where the data is

os.chdir("/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/varscan")
In [3]:
#create sql util function

def run_sqlilte3(command,db,remove_db=False,output=None):
    
    start=time.time()
    with open('tempf.sql','w') as tempf:
        tempf.write(command)
    
    if (remove_db):
        subprocess.check_output('rm '+db, shell=True)
        
    if (output == None):
        try:
            print subprocess.check_output('/usr/bin/sqlite3 '+ db + ' < tempf.sql ', shell=True,
                                          stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError, e:
            print e.output
    else:
        try:
            print subprocess.check_output('/usr/bin/sqlite3 '+ db + ' < tempf.sql > '+ output, shell=True,
                                         stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError, e:
            print e.output
    
    subprocess.check_output(['rm','tempf.sql'])
    
    print 'It took',time.time()-start,'s'
In [4]:
#funtction to load sorted pvals into pd dataframe

def get_p(fname,pvals):

    #load table into db
    run_sqlilte3('''
.separator "\t"

CREATE TABLE '''+fname+'''(
  chrom TEXT,
  position INTEGER, --!!!
  ref TEXT,
  var TEXT,
  normal_reads1 TEXT,
  normal_reads2 TEXT,
  normal_var_freq TEXT,
  normal_gt TEXT,
  tumor_reads1 TEXT,
  tumor_reads2 TEXT,
  tumor_var_freq TEXT,
  tumor_gt TEXT,
  somatic_status TEXT,
  variant_p_value TEXT,

  --!!! its not reading normal forms automatically well (1e-10 etc)
  somatic_p_value NUMERIC, 

  tumor_reads1_plus TEXT,
  tumor_reads1_minus TEXT,
  tumor_reads2_plus TEXT,
  tumor_reads2_minus TEXT,
  normal_reads1_plus TEXT,
  normal_reads1_minus TEXT,
  normal_reads2_plus TEXT,
  normal_reads2_minus TEXT
);

.import '''+fname+'''.vsc.snp '''+fname+'''

--delete headers
DELETE FROM '''+fname+''' WHERE chrom='chrom';

    ''',db='ident_pair_db')

    #get the sorted p-values
    run_sqlilte3('''
.separator "\t"

SELECT somatic_p_value
FROM '''+fname+'''
WHERE somatic_status='Somatic'
ORDER BY somatic_p_value;

    ''',db='ident_pair_db',output=fname+'_p.csv')
    
    #load into pd dataframe
    pvals[fname]=pd.read_csv(fname+'_p.csv',sep='\t',names=['pval'],header=None)
    return
In [ ]:
# get pvals for the untreated and identical samples

pvals=dict()
get_p('Sample4_Sample1',pvals)
get_p('Sample5_Sample2',pvals)
get_p('Sample6_Sample3',pvals)

get_p('DS1_DS2',pvals)
get_p('DS14_DS15',pvals)
get_p('DS26_DS27',pvals)
get_p('DS33_DS34',pvals)

get_p('DS41_DS50',pvals)
get_p('DS50_DS41',pvals)

Log-lin scale

In [29]:
fig,ax=plt.subplots()
fig.set_size_inches(12,9)

for key,value in pvals.iteritems():
    ax.plot(value['pval'],np.arange(len(value)),lw=2,label=key)


ax.axvline(0.05,c='dodgerblue',linestyle='dotted',lw=5,label='varscan default = 0.05')
ax.axvline(0.009,c='dodgerblue',linestyle='dashed',lw=5,label='advised by article = 0.009')

ax.set_xlabel('Somatic p-value threshold')
ax.set_ylabel('False mutations found')


ax.set_xlim(0.1,5e-4)
ax.set_ylim(0,2.5e4)
ax.set_xscale('log')
ax.grid()
dump=ax.legend(loc='upper right',fancybox='true')

Log-log scale

In [28]:
fig,ax=plt.subplots()
fig.set_size_inches(12,9)

for key,value in pvals.iteritems():
    ax.plot(value['pval'],np.arange(len(value)),lw=2,label=key)

ax.axvline(0.05,c='dodgerblue',linestyle='dotted',lw=5,label='varscan default = 0.05')
ax.axvline(0.009,c='dodgerblue',linestyle='dashed',lw=5,label='advised by article = 0.009')

ax.set_xlabel('Somatic p-value threshold')
ax.set_ylabel('False mutations found')

ax.set_xlim(0.1,5e-7)
ax.set_ylim(1,1e5)
ax.set_xscale('log')
ax.set_yscale('log')
ax.grid()
dump=ax.legend(loc='lower right',fancybox='true')