In this notebook I explore the VarScan 'SNP' positions in an R1 untreated sample: Sample 4

  • I look at the alignment context with IGV, and at the bases seen in all samples with mpileup

In [1]:
#load modules

import os
import subprocess
import fnmatch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import gc


from IPython.core.display import Image 
In [2]:
#go to working directory, where the data is
#(later cells open Sample4_Sample1.vsc.snp by relative path, so they depend on this call)

os.chdir("/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/varscan")
In [3]:
%%bash
# Show the column layout of the vsc file: one "index name" pair per header field

head -n 1 Sample4_Sample1.vsc.snp | awk '{ for (col = 1; col <= NF; col++) print col, $col }'
1 chrom
2 position
3 ref
4 var
5 normal_reads1
6 normal_reads2
7 normal_var_freq
8 normal_gt
9 tumor_reads1
10 tumor_reads2
11 tumor_var_freq
12 tumor_gt
13 somatic_status
14 variant_p_value
15 somatic_p_value
16 tumor_reads1_plus
17 tumor_reads1_minus
18 tumor_reads2_plus
19 tumor_reads2_minus
20 normal_reads1_plus
21 normal_reads1_minus
22 normal_reads2_plus
23 normal_reads2_minus

Look at positions with very low p-value

  • p < 0.0001

  • It looks like there is some clustering: e.g.: 4.9M, 10.9M

    • LOH in germline pair?
In [4]:
#select some very unlikely fals positives ( p< 0.0001)

best_pos_cmd='''
head Sample4_Sample1.vsc.snp -n 100000 | awk ' $13 == "Somatic" &&  $15 < 0.0001  { print }'
'''
best_pos=subprocess.check_output(best_pos_cmd,executable='/bin/bash',shell=True).strip()
best_pos_list=best_pos.split('\n')
print '\n'.join(best_pos_list)
1	4938485	A	G	38	1	2.56%	A	18	12	40%	R	Somatic	1.0	9.03732915951061E-5	9	9	9	3	20	18	1	0
1	4940483	C	T	50	1	1.96%	C	40	34	45.95%	Y	Somatic	1.0	5.8388617657188346E-9	15	25	22	12	29	21	1	0
1	4940489	T	C	63	0	0%	T	51	20	28.17%	Y	Somatic	1.0	6.99786330233474E-7	24	27	11	9	43	20	0	0
1	4940497	C	T	62	0	0%	C	63	18	22.22%	Y	Somatic	1.0	1.4282301363030596E-5	27	36	11	7	37	25	0	0
1	5501200	G	A	21	0	0%	G	15	16	51.61%	R	Somatic	1.0	2.9000728968146077E-5	2	13	6	10	3	18	0	0
1	5696972	A	G	47	1	2.08%	A	19	15	44.12%	R	Somatic	1.0	2.1872192065986977E-6	8	11	11	4	27	20	1	0
1	5966182	C	A	42	0	0%	C	27	23	46%	M	Somatic	1.0	3.842678650588233E-8	13	14	11	12	21	21	0	0
1	6008344	T	A	45	1	2.17%	T	25	17	40.48%	W	Somatic	1.0	4.98980219077517E-6	14	11	8	9	28	17	0	1
1	7860898	T	A	35	0	0%	T	12	13	52%	W	Somatic	1.0	1.0064713482112522E-6	6	6	6	7	20	15	0	0
1	8672283	T	G	19	0	0%	T	18	17	48.57%	K	Somatic	1.0	9.622999863760226E-5	8	10	9	8	10	9	0	0
1	10936409	A	T	55	0	0%	A	20	14	41.18%	W	Somatic	1.0	1.8237211328043387E-7	10	10	7	7	31	24	0	0
1	10936417	T	A	54	0	0%	T	23	10	30.3%	W	Somatic	1.0	2.3135915970191888E-5	11	12	5	5	33	21	0	0
In [5]:
#collect the BAM file names; their sorted order is the sample order used for the pileup

rm_dup_dir='/nagyvinyok/adat83/sotejedlik/dt40/rmdup/'
#keep alignment files only, index files (*.bam.bai) are left out
fnames=sorted([
    entry for entry in os.listdir(rm_dup_dir)
    if fnmatch.fnmatch(entry, '*.bam') and not fnmatch.fnmatch(entry, "*.bam.bai")
])
In [6]:
#define function to run pileup command

def run_pileup(chrom,pos):
    
    samtools="/nagyvinyok/adat87/home/ribli/tools/samtools-0.1.19/samtools"
    ref_fa="/nagyvinyok/adat87/home/ribli/input/index/gallus/Gallus_gallus.Galgal4.74.dna.toplevel.fa"

    cmd_mpileup = samtools + ' mpileup -Q 30 -B  -f ' + ref_fa
    cmd_mpileup += ' -r ' + str(chrom) +':'+ str(pos) +'-' +  str(pos) + ' ' 
    cmd_mpileup += ' '.join([rm_dup_dir+x for x in fnames]) 
    
    pup_line=subprocess.check_output(cmd_mpileup,executable='/bin/bash',shell=True)
    pup_list=pup_line.split('\t')
    
    print pup_list[0],pup_list[1],pup_list[2]
    for i in xrange((len(pup_list)-3)/3):
        print fnames[i].split('.')[0][:-21],pup_list[3+3*i],pup_list[4+3*i]
        

Check a position:

  • 1:4938485
  • Indel error
In [2]:
# show the saved IGV snapshot of the alignments around 1:4938485
Image('/home/ribli/DT40/SNP/varscan/igv_snapshots/sample4_hits/1-4938485.png') 
Out[2]:
In [7]:
i=0
print best_pos_list[i]
print
run_pileup(1,best_pos_list[i].split('\t')[1])
1	4938485	A	G	38	1	2.56%	A	18	12	40%	R	Somatic	1.0	9.03732915951061E-5	9	9	9	3	20	18	1	0

1 4938485 A
DS10 13 GgG..GG,,.G.G
DS11 17 .gG,GG.G.,G,.g,,G
DS14 14 g.GG,G,.,G,,,,
DS15 17 G,gg.g..gG,,,...,
DS16 14 GGg.gg.g,,.G.^9,
DS18 8 .g.G,..g
DS1 5 ,.G,,
DS26 18 .GG.G.GG..G.gGG...
DS27 9 ,,,.G.g.,
DS2 10 ,$..GG,,g,g
DS33 16 g,.G.G..,GGg,g.G
DS34 12 gg.,..g,GG,.
DS35 15 G$...g,,G,,g.GG.
DS36 13 g,Gg..GgGG.,.
DS37 13 .GgGgG,..,,GG
DS3 11 Gg,GgG.gG.,
DS41 4 GGGG
DS42 11 ,.GGgG.,G..
DS43 15 GG.GGGG.G..GG,.
DS44 13 ,$g,.g..GG.G.G
DS45 10 ......G.G.
DS46 4 ..GG
DS47 24 g.g.GGG.GG.G.GGGg.G,..G.
DS48 10 .G,.g.G,G.
DS49 8 ,$.GG..GG
DS4 15 ...g..GGgGG...,
DS50 14 G..,..GG.,.G.,
DS51 8 gGGGGGGG
DS52 17 G.g..g.GG...GG..G
DS53 15 g$g..g.,,.GGG.G.
DS54 8 GG..G...
DS55 12 gG..G,GGGGG.
DS56 8 G.G.,.G.
DS57 27 g...,.,GGG,G,.,G.G.G..G.G..
DS58 26 .,g..g.G,,,G..GG.,,.G..G.,
DS59 6 .G.GG,
DS5 13 g.G,G..GG.GG.
DS60 10 G..,..GGG.
DS61 25 .,,g,.,G,,GG....g.G.,G,..
DS62 4 gGGg
DS63 8 .G.G,..G
DS64 8 GgG.G,,G
DS65 45 ,,gG,.G,G,,gGG..,G,,g,,GG,GGGgG.GGG..,G.gGG,^I.
DS66 12 .g,,.,.GG.GG
DS67 11 ...g.,.G...
DS68 6 ....,,
DS69 14 g$Gg.,Gg...G.,G
DS6 13 .G.G.,G...,,.
DS70 14 G.,.......GGG.
DS71 12 .$g.GGG.,G...
DS72 18 G...Gg,.,,,GG....G
DS73 0 
DS74 6 ,G.,G.
DS7 9 .,...Gg,.
DS9 13 g,GGg,G.,.,.,
Sample1 39 .,.,,..,.,..,,.,,.,..,,.,....,,..G.,,,.
Sample2 36 ..,g..,.,..gGgg,Gg,G,GgG.G..,gG.,gGG
Sample3 40 G.ggG,GGG..,g.,..G.,...g,.,gG,......,,,,
Sample4 30 G.,G,G..,g,,,gGGg.G.,.GGG...,,
Sample5 23 ,G,G,.G,G,GG.G.G...,...
Sample6 24 G..,gggg,,.GG,g..G..,,,,

Check a position:

  • 1:4940483
  • Indel error
In [3]:
# show the saved IGV snapshot of the alignments around 1:4940483
Image('/home/ribli/DT40/SNP/varscan/igv_snapshots/sample4_hits/1-4940483.png') 
Out[3]:
In [8]:
i=1
print best_pos_list[i]
print
run_pileup(1,best_pos_list[i].split('\t')[1])
1	4940483	C	T	50	1	1.96%	C	40	34	45.95%	Y	Somatic	1.0	5.8388617657188346E-9	15	25	22	12	29	21	1	0

1 4940483 C
DS10 13 .,....t,.....
DS11 29 ttttT.Tt,.,,,.,,,,,,,,,..,t.,
DS14 24 TtTtt,t.,,,,.,,,,,.,,.,,
DS15 26 tttTT...,....,..,,..,t,,..
DS16 24 TTtt,,.,..,.,..,,,....,,
DS18 21 tttTtT,t.,...,..,..,.
DS1 14 ,Tt.,,..,..,t.
DS26 22 tTttTt.tT,.....,..,..,
DS27 27 TTttT,TTT,t,.,.,,..,,..,...
DS2 19 tTt,,.,....,,..,.,.
DS33 11 tttt,.,,,,,
DS34 19 tTT,TT....,,,..,,..
DS35 18 TT.,,.,..,...,.tt.
DS36 19 tT,..,..,...T.,,T.,
DS37 23 t,.tt.,,,,,,,,,..,..,T.
DS3 24 tt,tt.,..,,,,.,.,.,,,,tt
DS41 20 ,,,.,,,,,,,,..,,,,,.
DS42 29 tTTt,..tTT.,.,,,T,T,,,,,,...^-,
DS43 9 TtTT,,,,.
DS44 20 t,..,,,,.,,,....,..,
DS45 15 ,$,,,.,,,,..,,,.
DS46 19 tTTTt,,..,,*,.....,
DS47 47 TTTttttt,T,,,..,.,.T.,,.,,.,,..t,,.,,,,,.,,,.t,
DS48 27 TttTTT.,,TT,,.,,,,,,...,.,.
DS49 12 TTT,,,,,.,..
DS4 21 T,tt...,,....,,....,,
DS50 24 t$tTTT,tTt,.,,T,,,..,,.,.
DS51 21 ,.,,.,,,,.,,,...,..,,
DS52 20 T$tTT,tT.T,,.,,,.,..^K,
DS53 15 ttt,tT,,..,,,,.
DS54 18 ,,,,,,,,...,,,,,t,
DS55 22 ttttTtTTt,,,,,,,.,...^I.
DS56 18 tTTT,tT,.,T,.,....
DS57 51 TtTtTTTt,tt,,t,,.,..,,.,,,,,,,.,..,,...,,,.,,,,t,,,
DS58 53 t$TTtTtTT,,tTtTTt,.,,,,,.,,,....,.,,,.,,,.,......,,,t,
DS59 14 ,,,,.,.,,,,.,.
DS5 33 tTttt..T..,,,,...,,,.,,,......tt^I.
DS60 16 TttT,T,..,,,,,..
DS61 30 Tt...t,,,.,.,,,..T,,..,...T.t.
DS62 7 tt.,.,t
DS63 23 tTT,,,,T.,T,.,..,,.t...
DS64 22 ,,,,,,..,,,.,.,,.,.,,t
DS65 46 TT,,T.,,T,,,...,,...,.,,.,.,.,.,.,,.,...,.,t..
DS66 25 ttTT,T,T,.,,,..,...,..,,,
DS67 7 t,,.,,.
DS68 7 TT,,,.,
DS69 21 tttttTTtt,,.,,,,,,..t
DS6 17 t$tT,T,.,.,,.,.,,,
DS70 15 ,,..,,.,.,,,,,.
DS71 11 ,,,..,.,..^I.
DS72 15 tTtt,,,T.,,,.,.
DS73 1 T
DS74 12 ,,,,,,,,,,,t
DS7 24 tTtTt,T,,....,,.,,,,,,..
DS9 16 t.,,,,,..,,....T
Sample1 51 ,...,,,.,..,,.,.,.,..,,.,...,......T.,...,,..,.,.,,
Sample2 59 ,,.,.,,,,,.,....,.,,.......,,.,,,,.,....,,,.,,,.T.....t,,,^!.
Sample3 64 TTTttTtTttTtTttT..,Tttt,.,..,.,,,....,..,,..,...,,,,,.,..,,,.T.,
Sample4 74 tttTTTTTTTTttTTtTTTttt..tTTTTT,t.tTT,,T,,,,,.,...,,,.,.,.,,,,,,,...,,^M.^I.^"T^I,^K,
Sample5 60 T$tTTtttTTtTttTTT,T,..t.,..,.,,.,,..,.,..,,.,,,.,,,,,,.,.,.T^5.
Sample6 36 t$TTtTttTttT..,T.T...,.,.,.,,....,,,,

And another one

  • 1:5501200
  • Repeat
In [4]:
# show the saved IGV snapshot of the alignments around 1:5501200
Image('/home/ribli/DT40/SNP/varscan/igv_snapshots/sample4_hits/1-5501200.png') 
Out[4]:
In [9]:
i=4
print best_pos_list[i]
print
run_pileup(1,best_pos_list[i].split('\t')[1])
1	5501200	G	A	21	0	0%	G	15	16	51.61%	R	Somatic	1.0	2.9000728968146077E-5	2	13	6	10	3	18	0	0

1 5501200 G
DS10 3 *,*
DS11 17 ***,,,AaaaaaAAa.A
DS14 11 ,*,,aaaa,a.
DS15 4 *,$,,
DS16 9 ,***,$,**.
DS18 10 *,*******.
DS1 9 ,,,*,,aAa
DS26 5 ,,aa.
DS27 12 ,$,,,,,aaaA,.
DS2 5 *,a,,
DS33 13 ,***,,,,aaa,a
DS34 17 ,,,,,aAaaaAaa,AA.
DS35 4 *,a,
DS36 13 ,,,$,,,aaaa,a.
DS37 7 ,,AaaA.
DS3 10 ,,,A,aA,.A
DS41 10 ,$,$,,,,aaaA
DS42 10 ,,,,,aa,.^]a
DS43 3 aAA
DS44 8 ,aaAaAAa
DS45 10 ,aaaaaAA.^].
DS46 6 ,,,aA.
DS47 27 ,,,***,********,,,*,A*,*,.,
DS48 8 ,,,AaaAA
DS49 6 ***,,*
DS4 8 ,,**,*,.
DS50 10 ,,,,,aaaa.
DS51 6 *,,*,*
DS52 6 ,,,aaa
DS53 5 ,,,a.
DS54 10 *a,,,aa.,,
DS55 6 ,,a,A.
DS56 6 ,*,,*^].
DS57 10 ,*,****,*,
DS58 19 ***,,$*,*******,*,.,
DS59 8 ,,,,aaa^],
DS5 8 ,,aAa..^].
DS60 11 ,,,,aaaaaa.
DS61 17 ***,$*****.*,,,,,.
DS62 10 ,,,,aaAaA^].
DS63 5 ,$aAA.
DS64 12 ,,,,aaaAaAAA
DS65 35 ,**,$*************,,,**..,A....,,.,.
DS66 12 ,,,,,,,aa,a.
DS67 7 A$,aa.A.
DS68 6 ,,AaaA
DS69 6 *,.,..
DS6 8 ,,,,,AAa
DS70 11 *,,,AaaAaAA
DS71 15 ,,,,aAAaaaA.,..
DS72 13 ,$,,,,,.AaA,A.
DS73 7 ,,aa..^]A
DS74 7 ,aaa...
DS7 15 ,,,,aaa,aa,aaA^]A
DS9 13 ,*,,,,,aA,..,
Sample1 37 ***,,$,,,,**,,**,*****,,**,,,,,,*,*...
Sample2 43 ***,,$,,,,,,**,***,***,,***,,,,,,,,,,**.....
Sample3 33 A,$,,,,*,,*,****,*,,,,,,,,*...,^].^].^].
Sample4 31 ,,,,,,,,,,,a,aaAAAaaaaaa,A..^]A^]A^]a
Sample5 31 **,$,,,,,,,,,,,,,,,aaaaAaAaa...^].
Sample6 29 **,$,,,$,,*,,****,,,*,,,,,,..,^].

And another one

  • 1:5696972
  • Indel error
In [5]:
# show the saved IGV snapshot of the alignments around 1:5696972
Image('/home/ribli/DT40/SNP/varscan/igv_snapshots/sample4_hits/1-5696972.png') 
Out[5]:
In [10]:
i=5
print best_pos_list[i]
print
run_pileup(1,best_pos_list[i].split('\t')[1])
1	5696972	A	G	47	1	2.08%	A	19	15	44.12%	R	Somatic	1.0	2.1872192065986977E-6	8	11	11	4	27	20	1	0

1 5696972 A
DS10 18 ,$,.....,,.,.,..,^].^],
DS11 25 ,,,,.,..,.,.,,,......,.^].^],
DS14 23 ,$,,,,..,...,.,,..,.,.^].^I,
DS15 14 ,$,,,,.,..,.,.^],
DS16 22 ..,.,,,..,.,,,,.,.*,^].^g,
DS18 26 ,,,,...,G.,.,.,,,..,....^].^I.
DS1 33 ...,...*,..,..,,,,,,.,,....,,,^].^].^].
DS26 16 gGGg,....*,,,,,,
DS27 18 ,,,,,,,...,.,,,G^],^],
DS2 18 .,,,,,,,...,*.^].^],^],^],
DS33 20 ....,...,.......,.^].^],
DS34 17 ,$,,,,,.,.,.,..,^I.^],
DS35 22 ,.,,,,,.,,.,,,,.,..^g.^].^],
DS36 17 ,,,,........,..,^],
DS37 21 GGgG,G,G.,G,.G....G.^].
DS3 17 G..,,..,G.,,...^T,^],
DS41 7 GG.G..^I.
DS42 9 G$GG,,.G^],^],
DS43 14 GGGGGGG.,,^],^],^],^],
DS44 10 gGGG,..^I.^],^],
DS45 14 ,.,...,...,^],^I,^],
DS46 17 gGGG.GGG,.GG,,G.^g,
DS47 24 .,.,....,*,,...,G,...*,.
DS48 13 G.G.,GGGG,.^I.^],
DS49 17 ...,......,..G,^],^],
DS4 22 ,,..,..,...,,,.,...^].^],^],
DS50 10 g$*,GG.,G.^],
DS51 14 g$,.,.,.,.^],^],^],^I,^],
DS52 11 ...,..,.^].^],^],
DS53 13 ,,..*...,^I.^].^I,^],
DS54 26 ,.........,..,.,.,,^],^],^],^I,^],^],^],
DS55 18 ,,..,...,...^],^],^],^],^X,^],
DS56 16 ,,GG,G,.G.G,^9.^I.^],^],
DS57 35 ,,....,,.,,.*..,,,,....,,,....,,^I.^],^],
DS58 39 ,$,,,...,,...,..,.,,,.,..,,,,..,.,G,^],^g,^I,^I,
DS59 14 ......,.,.^],^],^],^],
DS5 25 .,.....,,,G,..,.,.,,...^].^V.
DS60 11 .......,^],^I,^],
DS61 27 Gg.,GG,G,G,,GG,.GG,,G,,.G^],^],
DS62 14 ,$..,.....^I.^g.^9,^I,^],
DS63 13 g.GG.,.^I.^I.^I,^],^],^],
DS64 15 ,,..,.....,,,^],^Q,
DS65 40 ,,.,.,.,,..,........,,,.....,.,,.^I.^J.^].^].^],^],^],
DS66 11 ,,,..,..^J,^N,^],
DS67 15 ,.........,.^],^],^],
DS68 12 ...,,..^I.^I.^].^Z,^],
DS69 13 GG,G,G.,,,*^I.^],
DS6 18 ,.,.,,,.,..,..,,,,
DS70 16 ..,..,*.,,^],^],^I,^],^],^S,
DS71 16 ,,..,,G...,^].^],^],^I,^],
DS72 12 gG,GGGG.,^L.^],^],
DS73 5 ....^],
DS74 3 G^],^],
DS7 21 ,,,,,.,.,,..,....,,^],^g,
DS9 12 ..,,....,...
Sample1 48 .$,$...,,,.,..,.,,..,,.G,...,,.....,......,^].^I.^],^I,^],^I,^],
Sample2 37 ,$.,,,..,...,..,,.......,..,,..,^].^].^R,^I,^],^],
Sample3 39 g$ggggGG..*,..GG..,.,,..G.G..,GG^8,^],^I,^:,^],^],^],^I,
Sample4 34 GGGgGgggGGGGG..,G.,..G,..,,,.^],^],^],^],^],
Sample5 40 ......,G..,.,....,.,,.,.G,,......,G^].^],^I,^],^],
Sample6 24 ,,........,...,.,...G^].^1,^J,

And another one

  • 1:5966182
  • Del error
In [6]:
# show the saved IGV snapshot of the alignments around 1:5966182
Image('/home/ribli/DT40/SNP/varscan/igv_snapshots/sample4_hits/1-5966182.png') 
Out[6]:
In [11]:
i=6
print best_pos_list[i]
print
run_pileup(1,best_pos_list[i].split('\t')[1])
1	5966182	C	A	42	0	0%	C	27	23	46%	M	Somatic	1.0	3.842678650588233E-8	13	14	11	12	21	21	0	0

1 5966182 C
DS10 18 .,.,..,..,,,......
DS11 18 ....,.,,,.....,,,,
DS14 32 a...,,,,,,,,,....,,,.,,,,,,,.,.^],
DS15 22 ,,,..,,.,,,,,,.,..,,..
DS16 18 ..,,.....,..,,...,
DS18 20 ,.....,,,,,.,,,,.,.,
DS1 20 .,,,..,.,,,,,..,....
DS26 25 a,..,,,.,,,.,,..,.,....,.
DS27 24 .,,,,.,.,.,,,.,..,,,.,.^],
DS2 23 A.,..,,,....,.,,...,,,^].
DS33 32 .,,,,,,,..,,.,.,,,,.,.,.....,..^],
DS34 18 a$,,.,,,..,,..,,..,
DS35 19 ,,,.,,,,,,,,,.,,.,,
DS36 18 .,,..,,,,,,,..,.,,
DS37 20 ...,.,.,..,....,,,..
DS3 20 Aa.,..,..,,,.,.....^],
DS41 27 ,..,,.,.,,..,,.,,...,.,..,,
DS42 29 .,,,.,,,,,.,..,.,,.,,,,..,,^],^],
DS43 31 A..,..,.,,.....,,,.,,,.,,,,.,..
DS44 28 .a,.,,,,.,..,,.,,.,,,.,,...,
DS45 29 ,$a.,.,,.,...,.,.,...,.,....,,
DS46 26 ..,..,,.,,,,,.,..,,,..,.,,
DS47 47 aA,,...,.......,,,,,,,..,,,,..,,,...,,,,.,.....
DS48 24 ,.,.,.,,,..,,.,,.,.,,..,
DS49 22 ,$,.....,,..,..,,,.....
DS4 25 a,.,...,.,,.,,,,,....,,.^],
DS50 14 ....,...,,.,,.
DS51 32 ..,,.....,.,,..,,,.,,,.,,..,.,.,
DS52 22 ,.,.,,,,.,,..,,,..,..^],
DS53 19 .,,,,..,.,........^].
DS54 26 ,,.,,..,.,..,.,,,..,,,.,.,
DS55 29 a,,,,.,,,..,.,,.,.,,..,.....,
DS56 35 .......,.....,,..,.,,..,,,....,..,,
DS57 39 ,$,..,.,,....,.,..,,,,....,,...,....,..,
DS58 54 A$,$,$A,,...,,.,.,,,,,,,..,..,,,.,..,..,..,,.,,,,.,,,..,.
DS59 17 ,,,...,.,..,,,..,
DS5 27 ,.,...,,,,,.,..,.........,.
DS60 12 a.,,,.,,,...
DS61 43 ,$.,...,,.,,.,,..,.,..,.,,.,,,...,,....,.,.,
DS62 27 .,...,,,,,,...,,.,,,.,.....
DS63 28 ,......,,,.,,,,,,,.,,.,,,...
DS64 28 a$,..,.,,.,,.,,..,..,....,..^],
DS65 49 .$,$..,.,,.,.,.,,,.,,,.,,,.,.,.........,,,,...,,,,^].
DS66 30 ,..,.,..,,,..,,...,.,.,,,..,,,
DS67 18 ,,.,.......,,....^],
DS68 19 ,..,T,........,....
DS69 15 .,,.,.,.,,,,...
DS6 24 a...,,.,,..,......,,,..,
DS70 22 ,$Aa,,.,.,..,,,,,..,.,^].
DS71 27 ,.,,.,,..,,,.,,..,,,..,.,.,
DS72 14 ...,...,...,..
DS73 19 ,..,.,,..,,.,,.,,,^],
DS74 32 ,,,..,,,,,..,.,..,,.,.,.,..,.,,.
DS7 33 .,.,.,,..,.,.,,,.,.,,,,..,..,,.,.
DS9 11 ,,,,,....,,
Sample1 42 .,.,.,,...,,,.,.,..,.,,,.,,....,..,.,.,.,,
Sample2 69 .,,.,.,,,..,.,,.....,...,..,.,..,.,,,..,.,.,,.,..,...,.,...,..,...,,^],
Sample3 66 A$a.,.,..,.,,,.,,,...,....,..,,,,,.,....,,.,..,,.,g...,.,.,,.,.....
Sample4 50 ,$A,...A-1Ga-1g.A-1G,a-1ga-1g,A-1GA-1G,.a-1g.a-1ga-1ga-1g,A-1GA-1G,.A-1G.A-1Ga-1gA-1G....,,,a-1g,.a-1gA-1G,a-1ga-1g,,
Sample5 41 A$.,.,,...,,.,.,,..,,....,,...,.,..,,...,^],
Sample6 29 ,,.,.,,,,,..,,,.,,...,,.,,...

And another one

  • 1:6008344
  • Deletion error
In [7]:
# show the saved IGV snapshot of the alignments around 1:6008344
Image('/home/ribli/DT40/SNP/varscan/igv_snapshots/sample4_hits/1-6008344.png') 
Out[7]:
In [12]:
i=7
print best_pos_list[i]
print
run_pileup(1,best_pos_list[i].split('\t')[1])
1	6008344	T	A	45	1	2.17%	T	25	17	40.48%	W	Somatic	1.0	4.98980219077517E-6	14	11	8	9	28	17	0	1

1 6008344 T
DS10 15 a$a$a.,,*,.,,,A.,
DS11 23 Aaaa.a,..,.*..,....a*..
DS14 28 A$AA.*,.,,,**.,..,,...,.*,A.,
DS15 19 ,.,.,,....,,*,.,.*^],
DS16 24 ,,.,,,........,..*.,,,,A
DS18 19 aA,,a..A,..*,,*.,,*
DS1 17 AAa,A.,a,.,.,a.^].^].
DS26 26 a,**.,.,,..,***,..,.aA*.,A
DS27 18 A,.....a..,..*....
DS2 23 aA.,,,*,*....,....A.,,^],
DS33 24 aa.a..,.,.A.,.......*,.,
DS34 17 a.,,,,*.,Aa..*,,.
DS35 22 aaaA..,,,.a*.,,,....A^],
DS36 22 Aaaa,A*,.*,,*.*,,,....
DS37 24 A$AAA*a..,,a*..,,,.*.,.,.
DS3 14 ,.,.,.,,,,*,.,
DS41 13 ,,........a,,
DS42 7 A..A**A
DS43 14 aaaA*..**,,a.A
DS44 10 a**,A.,,**
DS45 20 a$aAAAA.,A,A****..*,^].
DS46 7 .*.*.,*
DS47 31 a,*..*,,*..,.*..**..*.*.A.,...,
DS48 10 ,,*.*.,.,*
DS49 12 ,,,,*.*A....
DS4 24 aa,,.,.,,,..A,,..*.*,.A^].
DS50 16 a$aaaA,.A.,A*,.,.
DS51 15 a$AAA..A....*.a.
DS52 13 A$aaAaA...**.^].
DS53 5 aaA.A
DS54 11 ,,..,,...*,
DS55 16 ,...**..***,,,..
DS56 17 a$.*.A**.A,....*a,
DS57 42 AAAAaA.,A.*..A.,.,...a..*,A.*Aa,.......,^].^],
DS58 38 a$aaA*....A,aA.*,,...,,..*,,,,.,.*.A...
DS59 17 aaaAA.....*.*.Aa^],
DS5 20 AA,,.*.....A,.,,*..^O,
DS60 7 ,*.*...
DS61 33 ,,,*,.A,.a...**,.*,..,.,...,..,,,
DS62 17 ,,,..,..*...*.,.*
DS63 11 a$aA.*.*a..,
DS64 16 ,,......**..,.,,
DS65 46 aaaaaA,..,A,*A,,,...,*.,,.*.,*..,,..,...,.,.*^],
DS66 18 a,,A**,*.*..*A,^].^].^],
DS67 9 A$*,..A**.
DS68 7 ..*....
DS69 10 .,,..*,...
DS6 22 ..,**,*..,a*,.,.,...,.
DS70 11 *..,...Aa,.
DS71 12 A*.,,aA,.,*.
DS72 8 ,,*,,**.
DS73 16 a$a,,a.,..,..aA.^]A
DS74 10 .,,..*.*..
DS7 40 ,,..,*,,*.,..*,..,,.*,,,.,**,.a.**,.,*..
DS9 14 ,...,A,....*a*
Sample1 48 ,,,...,..,..*.,.,.,,,,.,...,..*.....,,a....,.,..
Sample2 83 a$a$aaaAaAAaaAaAAAAA*,,,,,.,.A,..,,,,,.,............*...a,.,....,,.,................,
Sample3 43 .,...,,.,,.,.,,,.....,..,,.,.,,..*,**....^].^],
Sample4 43 AaaaaaAaAaaAAA..,.,,,,.,A.*.A,a,,.......,.,
Sample5 54 a$a$aAaaAAAA,.,,..,..A,,,.,.,,..,..A,..*.*....,....,,..^].
Sample6 27 ,.,,,,...,,.,..,,,..,..,..,

And another one

  • 1:7860898
  • Del error
In [8]:
# show the saved IGV snapshot of the alignments around 1:7860898
Image('/home/ribli/DT40/SNP/varscan/igv_snapshots/sample4_hits/1-7860898.png') 
Out[8]:
In [13]:
i=8
print best_pos_list[i]
print
run_pileup(1,best_pos_list[i].split('\t')[1])
1	7860898	T	A	35	0	0%	T	12	13	52%	W	Somatic	1.0	1.0064713482112522E-6	6	6	6	7	20	15	0	0

1 7860898 T
DS10 17 ,,,.....,....a.,.
DS11 22 ,.,....,,.,,.,..,.,.,.
DS14 15 ,,,,..,,.,.,,,.
DS15 16 .,.,.,....,.,,.,
DS16 13 ,$,.,.,......,
DS18 18 ,,,..,.,,,.,,,...,
DS1 16 .,,,,..,.,...,,,
DS26 16 ,..,A,,..,.,,..,
DS27 21 ,$..,,.,,,.,,,.,,.,,..
DS2 17 aaaaa..A,,.,.,.,.
DS33 17 ,,..,,,,..,,.,,.,
DS34 16 ,..,.,,,,.,.,..,
DS35 14 aAaAAAa,,,,,.,
DS36 15 ,.,..,,.....,.,
DS37 10 ,,,.,.,,..
DS3 9 ,.,,.,,..
DS41 14 ,$.,,,,.,a.....
DS42 14 ,....,,.,....,
DS43 19 aaaaa.,,.,.,,..,,.,
DS44 18 ,,..,,.,.,,.a,..,,
DS45 14 ,,,,,.,,..,..A
DS46 23 .,.,,,,.,,.,.,.,,,,A..,
DS47 35 ,.,,,,,,.....,...,.,...,,,...,.,...
DS48 27 .,,.,,...,,,,,..,,.......,.
DS49 15 ....,.,...,,A.,
DS4 19 ,.,a,,...,..A,.,,,a
DS50 21 .,,,,.,.,,.a.,,....,.
DS51 27 ,,.....,,,...,...,..,,....,
DS52 17 ,,,,.,,.,.A.,..,,
DS53 15 ,,..,,..,..,..,
DS54 17 .$,..,,.,,,.,..,,.
DS55 24 AaAaAaaaAaA,..,,,.,,...,
DS56 18 .,,,,....,,,..,...
DS57 40 ,$.,.,.,.,,,,..,.,.,..,,,.....,,,.......,
DS58 39 .,.,..,,,,,..,......,,.,,.,,,,,...,,..,
DS59 16 ,..,,,.,.,.....,
DS5 19 ..,,.,,,.,..,,...,.
DS60 13 ,,,,..A,.,...
DS61 36 ..,,..,,,..a,,...,,,.,..,..,,,,....,
DS62 7 ,,,.a..
DS63 24 AaAAaAaaaA,,,.,.,.,..,.,
DS64 22 ,,...,,.,,,.a...,,,..,
DS65 44 ,,,,,,,,,,.,,,.,..,,.,.,,...,,.,.,,.,.,,,..,
DS66 15 ,.,..,,...,....
DS67 9 ,$,...,...
DS68 17 ,,,..,.,,,..,,..,
DS69 15 ,,,,,,,..,,,.,.
DS6 15 .,.,,.,,,.,a..,
DS70 21 Aa.,,.,,.,,,..,,.,..,
DS71 26 ,..,,.,,,,,.,,...,.,.+3TAA.....
DS72 22 .$.,,,.,,...,,,.,,..,,,
DS73 12 ,,,,,,.,.,,,
DS74 18 ,,,,.,,,,.,.,...,.
DS7 14 aa,,,,,,,..,..
DS9 19 ,,,a,,..,,,,.,,.,,.
Sample1 35 ...,,..,,,,,.,........,,,,,..,....,
Sample2 55 ,,,.,,,,,.,..,...,,...,..,,,,.,,......,,..,...,,.....,.
Sample3 32 AaaaAAaAaaA...A,....,......A,...
Sample4 25 AAAaAaaaaaaA,.,,.,..A,,..
Sample5 35 .$,,..,,.,,,.,,...,,,..,,..,.,.,....
Sample6 26 aAaaAAaAaaaAAaaaA,.,,....,

And another one

  • 1:8672283
  • Deletion, sometimes misaligned -> SNP
    • Why is it consistently misaligned in 2 samples, and not in the others? A nearby SNP? Does the local realigner cause this?
In [9]:
# show the saved IGV snapshot of the alignments around 1:8672283
Image('/home/ribli/DT40/SNP/varscan/igv_snapshots/sample4_hits/1-8672283.png') 
Out[9]:
In [14]:
i=9
print best_pos_list[i]
print
run_pileup(1,best_pos_list[i].split('\t')[1])
1	8672283	T	G	19	0	0%	T	18	17	48.57%	K	Somatic	1.0	9.622999863760226E-5	8	10	9	8	10	9	0	0

1 8672283 T
DS10 16 .**.***...*.*,.*
DS11 16 .*.********,.,.,
DS14 22 ,***,,*.,.*.,.*,.***.*
DS15 22 **,***,,*,,**,*.*****^],
DS16 17 g$,,*,,*,**,.*.**.
DS18 13 **,***,,**,.,
DS1 26 ,*.,*,***..*,**,.***,..*,,
DS26 18 ,,,*,*,**.**.***,.
DS27 16 *..*,..*,****,,^].
DS2 15 ,$*,***,.**..*..
DS33 26 .....**,,*****.**.*.*,.**,
DS34 18 *.**.*,*,.*,*,***^].
DS35 22 ..*.*,,**.*.*..,**...,
DS36 19 ,**,*.,.**,**.,***.
DS37 12 ,*****.,,.*,
DS3 13 ..*.,*,,**,..
DS41 14 *..,,*,**.,,,,
DS42 12 ,.****,..**.
DS43 16 ,,,**..,,,.,*,..
DS44 20 G$,$.,****.,,*,.*.*,,,
DS45 22 .,.,..*...**.***.,*,,*
DS46 20 .,***,.,***..,*,.*.,
DS47 24 ,.**,*,,.***.****.*,****
DS48 13 .$.,...,,*.**.
DS49 15 .*.**..,,***,,,
DS4 20 ..**,*.*.,,**.*.*,*.
DS50 24 ***,***..,..****..*..*,.
DS51 10 ,,.**,.**,
DS52 21 G$,*,.*,***,,**,,.**,.
DS53 18 ,*,.*.*,.***,*...,
DS54 22 ,.*.,*,*****.***,,***.
DS55 24 ,,***..***,***.***,,****
DS56 17 ,,,,.,*,.*...**..
DS57 31 ,$..,**.**,,..*.*,.*..,.*,,.*,,.
DS58 41 ****,*.,,.*.**,*,.*,*..,.**,****.*..,**,,
DS59 8 .,,**.,*
DS5 24 ,**.*..*****.**,.**,*.*.
DS60 13 .$*****,*****.
DS61 34 .,,*,.**,*,**.*,.*..*.*,*,,*,**.,*
DS62 20 .$*,.*.,.,,*,****,,..
DS63 18 *.*,,****.*..*,,,.
DS64 17 .$,*,*,*,**,.***..
DS65 41 ,,,G-5AGGTCg-5aggtc.,G-5AGGTCg-5aggtcG-5AGGTCg-5aggtc,,g-5aggtcG-5AGGTCg-5aggtc,g-5aggtcg-5aggtc.,,,G-5AGGTC.,.G-5AGGTCg-5aggtc,,,...G-5AGGTC.,G-5AGGTC,-5aggtc,
DS66 24 ..**,.***,,..****.,*....
DS67 11 *..,,.*.*,,
DS68 13 ,,*,.***,..*,
DS69 12 ..*.,*.,**.,
DS6 20 .*,..,**....*,*,,*,,
DS70 18 ,**,.*.,,*.,*,*..,
DS71 24 .***,*,*..*,***,***,**.,
DS72 16 ****.**.,..*,*,,
DS73 20 ..**,*..*.,*..*.,*,^],
DS74 11 ..**,**.***
DS7 16 .**,.*,*,..,*,,,
DS9 15 .,,**,.,*.*****
Sample1 33 ,,*,,*******.*,*.**,,*,....,*....
Sample2 51 ,*.*.***.***,*,,,...*,**,*,*.***,.*.**,,.,***.,*,.^],
Sample3 48 ,****,***.*,*,,***..**.,.,***,.*....*.***,**...^],
Sample4 35 G$.g-5aggtc,g-5aggtc.,,g-5aggtc,.,.,g-5aggtc.G-5AGGTCG-5AGGTCG-5AGGTCG-5AGGTC.G-5AGGTCG-5AGGTC,,G-5AGGTCg-5aggtc.g-5aggtcG-5AGGTC,,g-5aggtcg-5aggtc.
Sample5 32 ,,*,..*.**,.*.**,,.,,**.*.**....
Sample6 41 ,$,,,,.**.,**,**,.****,*****.***,,.**,**,.

And another one

  • 1:10936409
  • Indel error
In [10]:
# show the saved IGV snapshot of the alignments around 1:10936409
Image('/home/ribli/DT40/SNP/varscan/igv_snapshots/sample4_hits/1-10936409.png') 
Out[10]:
In [15]:
i=10
print best_pos_list[i]
print
run_pileup(1,best_pos_list[i].split('\t')[1])
1	10936409	A	T	55	0	0%	A	20	14	41.18%	W	Somatic	1.0	1.8237211328043387E-7	10	10	7	7	31	24	0	0

1 10936409 A
DS10 12 ...,,,,.,,,.
DS11 20 ,,,,.,..,...,,..,..,
DS14 35 ,$,.,,,,..,.,..,,,...,,,,,..,.,..,.,
DS15 23 .....,,.,..,...,,....,,
DS16 16 .,.,,.,.,.,,,..,
DS18 24 ,$,..,.,,.,.,...,,,,,.,..
DS1 22 .,....,..,,,.,,..,,...
DS26 17 ,........,,,..,,,
DS27 31 ,,..,.,.,,.,..,,,,,,,,..,,...,^],
DS2 21 ..,.,,,,,....,,,....,
DS33 24 .$.,.,.,.,,,.,,,,..,,,.,,
DS34 15 ,$.,,.,,.,..,,..
DS35 16 ,,...,..,..,...,
DS36 23 ,$,,,..,..,,,,,..,,,,,,,
DS37 12 ,,.,..,.,...
DS3 17 .,,,,,.,,,..,..,.
DS41 15 .,,,..,......,,
DS42 13 .....,,.,,,..
DS43 25 ,,,.,t.....,,.,..,,,,,,.^],
DS44 19 ,.,,,.....,,,.,.,,,
DS45 31 ,...,,.,,.,...,,,,,,.,..,.,....
DS46 29 ,,,.,,.,,,,,...,,...,.,,,....
DS47 36 ,.,.,..,..,..,,.,.,..,,....,,,,....^],
DS48 29 ,.,.,,,,.......,,..,,,,,.,.,.
DS49 27 ,.,,,..,,,,.,,..,,...,...,^],
DS4 19 ,.,,..,,,.....,,.,.
DS50 26 ,$,,,...,,,.,,....,,,..,,,,
DS51 31 .T,....,,.,.,,,.,.,..,,.,,,.,..
DS52 23 ,..,..,,,...,,,..,,,,..
DS53 23 ,,,,.,.,,..,,...,,,..,^].
DS54 27 ,,,,..,,.,.,..,..,,,.,,.,,.
DS55 30 ,..,,.,..,.,..,.....,.,,..,..^],
DS56 30 .,...,...,,,,,...,,,,,.,,,...,
DS57 40 ,,,,,.,,,,,.....,,.,......,,,,,.....,,.,
DS58 44 ,,,.,.,,,,..,,,.,,,,,,..,,,,.....,.,...,,..^].
DS59 21 ,,,,.,.,,,,...,...,,.
DS5 31 ,.,.,,.,.,.,,,,.,,..,....,..,,.
DS60 22 .,,.,...,...,.,,..,,,.
DS61 30 .,,,,,.,,,.,......,,,...,..,.,
DS62 18 ,,,..,..,,,,,..,.,
DS63 26 .,.,,,,.,....,,,,...,..,.,
DS64 29 ,.,.,,,,..,,...,.,,.,,....,,.
DS65 49 ,.,,,,,,,,,.,,...,,...,,,,.,..,.,..,,.,...,,,..,.
DS66 23 .,,......,,,.,.,....,.^].
DS67 20 ,,,,.,...,..,..,..,^],
DS68 18 ,..,.,...,....,.,,
DS69 22 ....,,.,,..,,,,.,..,.^],
DS6 15 ,.,.,,,,,..,.,,
DS70 23 .,,..,..,.,,.,..,.,....
DS71 17 ,.,,.,.,T..,.,,,.
DS72 25 .,.....,,..,....,.,.,..t,
DS73 15 ,...,.,...,.,,.
DS74 19 ,,,.......,,,,,.,..
DS7 27 ,,.,,.,,...,.,,,,....,,..,^].
DS9 24 ...,,.,,.,..,...,,,.....
Sample1 55 .,,,..,.,.,...,,...,..........,,,,,.,..,,,..,...,,,,..,
Sample2 76 ttttTTTTttTtttTTtT.......,,,,,,,,..,,.,.....,,....,,.,,,....,...,,...,,,.,,^],
Sample3 56 .$,..,,..,.,...,,..,,,....,,,..,,,...,...,.,...,,,,...,,,
Sample4 34 TttTTtTtTTTttt....,,..,,,,...,,,,^].
Sample5 51 .,,,,.,.,..,.....,,,,,,.,..,...,...,,,.,,....,,,.,,
Sample6 26 ,,..,,.,...,,,,,..,,...,,,

Conclusions:

Almost all indel errors

  • Why are the indel errors so homogeneous? Does local realignment make them homogeneous?

All positions could be filtered looking at all the samples

  • A more robust germline avoids false positive calls caused by fluctuations or germline LOHs.
  • At strange positions, more than one sample shows the strange behaviour.