Postprocessing the results of CREST on data received in 2016 august


I take the results of CREST (predSV.txt files) and filter the result with the following criteria:

Filters:

  • No scaffolds
  • No N-s in reference in +- 150 bp
  • No indels in overlapping reads
    • Too strict?
  • Enough evidence
    • How many soft+hard clips? Now at least 5
  • Filter Germline events
    • all other samples used as control
  • Event covered in germline samples too

In [1]:
import pandas as pd
from pandas import DataFrame as df

from Bio import SeqIO
from Bio.Seq import Seq

import fnmatch
import os
import subprocess
import re
import time

from IPython.core.display import Image 
from IPython.display import display
In [2]:
#paths
datadir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/2016_08/sv_res/'
bam_link_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/bam_links_2/'
refFasta="/home/ribli/input/index/gallus/Gallus_gallus.Galgal4.74.dna.toplevel.fa"

#bam file postfix
postfix='_RMdup_IR.bam'

Check how many SV-s CREST reported:

In [3]:
%%bash
echo The number of SVs found by CREST:
cat /nagyvinyok/adat83/sotejedlik/ribli/dt40/2016_08/sv_res/* | wc -l
The number of SVs found by CREST:
5623

Load chicken reference genome to memory

In [4]:
chromosomes=map(str,range(1,29))+['32','W','Z']
chrom_dict=dict()
for seqin in SeqIO.parse(refFasta,"fasta"):
    if(seqin.id in chromosomes):
        chrom_dict[seqin.id]=seqin.seq

Define a class for reads from a region

In [5]:
class data_from_region:
    
    def __init__(self,samples,chrom,pos):
        self.samples,self.chrom,self.pos=samples,chrom,pos
             
    def get_sam_region(self):
        self.sam_region=''
        for bam_file in [bam_link_dir+sample+postfix for sample in self.samples]:
            cmd='samtools view ' + bam_file + ' '+ self.chrom + ':' +str(int(self.pos))+'-'+str(int(self.pos))
            self.sam_region+=subprocess.check_output(cmd,shell=True)
            
    def get_clips_indels(self):
        self.hard_clips,self.soft_clips,self.ins,self.dels=[],[],[],[]
        
        if (self.sam_region.strip()==''): # no mapped read!
            return
        
        for line in self.sam_region.strip().split('\n'):
            line_list=line.split('\t')
            try:
                start,cigar=int(line_list[3]),line_list[5]
            except:
                print line_list
                
            self.parse_cigar_to_list(start,cigar)
            
    def parse_cigar_to_list(self,start,cigar):
        first_clip=True
        pos=0 # position in the genome (w offset)
        i_bef=0 #start of a number
        for i in xrange(len(cigar)):
            if cigar[i]=='M':
                pos+=int(cigar[i_bef:i])
                first_clip=False
                i_bef=i+1
            elif cigar[i]=='S':
                if(first_clip):
                    self.soft_clips.append(start)
                else:
                    self.soft_clips.append(start+pos-1)
                i_bef=i+1           
            elif cigar[i]=='H':
                if(first_clip):
                    self.hard_clips.append(start)
                else:
                    self.hard_clips.append(start+pos-1)
                i_bef=i+1
            elif cigar[i]=='D':
                self.dels.append(start+pos)
                pos+=int(cigar[i_bef:i])
                i_bef=i+1
            elif cigar[i]=='I':
                self.ins.append(start+pos)
                i_bef=i+1       
      
    def get_evidence(self,pos):
        return self.soft_clips.count(self.pos),self.hard_clips.count(self.pos)

Define a class for CREST result entries

In [6]:
class crest_hit:
    
    def __init__(self,fname,line):
        self.parse_sample_from_filename(fname)
        
        self.dataf=df([[self.sample] + line.split('\t')[0:17]],
                      columns=['sample','chr1','pos1','direction1','n_sup_reads1',
                               'chr2','pos2','direction2','n_sup_reads2','type',
                               'cov1','cov2','ass_leng1','ass_leng2','av_id_percent1',
                               'perc_non_uniq_map1','av_id_percent2','perc_non_uniq_map2'])
        
        self.chr1=self.dataf.loc[0,'chr1']
        self.chr2=self.dataf.loc[0,'chr2']
        self.pos1=int(self.dataf.loc[0,'pos1'])
        self.pos2=int(self.dataf.loc[0,'pos2'])
        
        self.filtered=False
        self.filter_causes=[]
        
    def parse_sample_from_filename(self,fname):
        #R1
        if (fname[:6]=='Sample'):
            self.sample=fname[:7]
        #other rounds
        else:
            self.sample=fname[:5]
        
    
    ####
    #read the data
    
    def get_data_from_sample_regions(self):
        self.data_from_sample=[]
        self.data_from_sample.append(data_from_region([self.sample],self.chr1,self.pos1))
        self.data_from_sample[0].get_sam_region()
        self.data_from_sample[0].get_clips_indels()
        
        self.data_from_sample.append(data_from_region([self.sample],self.chr2,self.pos2))
        self.data_from_sample[1].get_sam_region()
        self.data_from_sample[1].get_clips_indels()
        
        
    def get_data_from_germline_regions(self,germ_samples):
        self.data_from_germline=[]
        self.data_from_germline.append(data_from_region(germ_samples,self.chr1,self.pos1))
        self.data_from_germline[0].get_sam_region()
        self.data_from_germline[0].get_clips_indels()
        
        self.data_from_germline.append(data_from_region(germ_samples,self.chr2,self.pos2))
        self.data_from_germline[1].get_sam_region()
        self.data_from_germline[1].get_clips_indels()
        
    
    ################################################
    ##filters    
        
    ##################
    #without read data
    
    def filter_4_chromosomes(self):
        if ( (not self.chr1 in chromosomes) or
            (not self.chr2 in chromosomes)):
            self.filtered=True
            self.filter_causes.append('not on real chromosome')
                     
    def filter_4_Ns(self,chrom_dict):
        if (len(re.findall('[Nn]',str(chrom_dict[self.chr1][self.pos1-150:self.pos1+150])))!=0 or
           len(re.findall('[Nn]',str(chrom_dict[self.chr2][self.pos2-150:self.pos2+150])))!=0):
            self.filtered=True
            self.filter_causes.append('N-s in reference')
        
    ##################
    #with read data
    
    def filter_4_indels(self):
        if ( len(self.data_from_sample[0].ins+ self.data_from_sample[0].dels +
               self.data_from_sample[1].ins+ self.data_from_sample[1].dels ) != 0 ):
            self.filtered=True
            self.filter_causes.append('indels')
            
    def filter_4_evidence(self,threshold):
        if(sum(self.data_from_sample[0].get_evidence(self.pos1)) < threshold or
             sum(self.data_from_sample[1].get_evidence(self.pos2)) < threshold):
            self.filtered=True
            self.filter_causes.append('not enough evidence')
               
    def filter_4_germline(self,threshold):
        if(sum(self.data_from_germline[0].get_evidence(self.pos1)) > threshold or
             sum(self.data_from_germline[1].get_evidence(self.pos2)) > threshold):
            self.filtered=True
            self.filter_causes.append('germline event')
            
    def filter_4_germline_coverage(self):
        if( (self.data_from_germline[0].sam_region.strip()=='') or
            (self.data_from_germline[1].sam_region.strip()=='')):
            self.filtered=True
            self.filter_causes.append('germline sample not covered')

Load input

In [7]:
crest_hits={}
input_dir=datadir
for fname in os.listdir(input_dir):
    f_handle=open(input_dir+fname)
    for line in f_handle:
        key=' '.join( [fname.split('.')[0][:5]]+ line.split('\t')[:2])
        crest_hits[key]=crest_hit(fname,line)
    f_handle.close()

Filter for scaffolds, and N-s in reference

In [8]:
#filter for chromosome, and Ns in genome

for key in crest_hits.keys():
    if(crest_hits[key].filtered==True):
        continue
    crest_hits[key].filter_4_chromosomes()
    
for key in crest_hits.keys():
    if(crest_hits[key].filtered==True):
        continue
    crest_hits[key].filter_4_Ns(chrom_dict)
In [9]:
#count whats left

i=0
for key in sorted(crest_hits.keys()):
    if (crest_hits[key].filtered==False):
        i+=1
print 'After filtering for scaffolds, and Ns is reference we have:', i, 'results'
After filtering for scaffolds, and Ns is reference we have: 4091 results

Read data from sample

In [10]:
#Read the bams files overlapping the positions

start=time.time()
for key in sorted(crest_hits.keys()):
    if(crest_hits[key].filtered==True):
        continue
    crest_hits[key].get_data_from_sample_regions()
print 'It took ',time.time()-start,'s'
It took  1335.49195194 s

Filter for indels

In [11]:
#filter for indels

for key in sorted(crest_hits.keys()):
    if(crest_hits[key].filtered==True):
        continue
    crest_hits[key].filter_4_indels()
In [12]:
#count whats left

i=0
for key in sorted(crest_hits.keys()):
    if (crest_hits[key].filtered==False):
        i+=1
print 'After filtering for indels we have:', i, 'results'
After filtering for indels we have: 1099 results

Filter for enough evidence

In [13]:
#filter for enough evidence

for key in sorted(crest_hits.keys()):
    if(crest_hits[key].filtered==True):
        continue
    crest_hits[key].filter_4_evidence(threshold=5)
In [14]:
#count whats left

i=0
for key in sorted(crest_hits.keys()):
    if (crest_hits[key].filtered==False):
        i+=1
print 'After filtering for enough evidence we have:', i, 'results'
After filtering for enough evidence we have: 202 results

Filter for germline events


In [15]:
samples=pd.read_excel('DT40 WGS Sample pairs.xlsx')['Sample ID'].values

Read data from germline samples

  • germline samples are all the samples except for the analysed
In [16]:
#Read the bams files overlapping the positions
start=time.time()
for key in sorted(crest_hits.keys()):
    if(crest_hits[key].filtered==True):
        continue
    germ_samples=set(samples)
    germ_samples.remove(crest_hits[key].sample)
    crest_hits[key].get_data_from_germline_regions(list(germ_samples))
print 'It took ',time.time()-start,'s'
It took  4609.49930596 s

Filter for germline events

In [17]:
#filter for germline events

for key in sorted(crest_hits.keys()):
    if(crest_hits[key].filtered==True):
        continue
    crest_hits[key].filter_4_germline(threshold=5)
In [18]:
#count whats left

i=0
for key in sorted(crest_hits.keys()):
    if (crest_hits[key].filtered==False):
        i+=1
print 'After filtering for germline events we have:', i, 'results'
After filtering for germline events we have: 78 results

Filter for germline coverage

In [19]:
#filter for germline coverage

for key in sorted(crest_hits.keys()):
    if(crest_hits[key].filtered==True):
        continue
    crest_hits[key].filter_4_germline_coverage()
In [20]:
#count whats left

i=0
for key in sorted(crest_hits.keys()):
    if (crest_hits[key].filtered==False):
        i+=1
print 'After filtering for germline coverage we have:', i, 'results'
After filtering for germline coverage we have: 78 results

Aggregate results by samples

In [21]:
temp_df_list=[crest_hits[x].dataf for x in sorted(crest_hits.keys()) if not crest_hits[x].filtered ]
sample_agg_df=pd.DataFrame(pd.concat(temp_df_list).groupby(['sample']).agg('size')).reset_index()
sample_agg_df.columns=['sample','#events']
sample_agg_df.to_csv('sample_agg.csv',sep='\t',index=False)
sample_agg_df
Out[21]:
sample #events
0 CH001 1
1 CH002 2
2 CH003 1
3 CH007 3
4 CH008 1
5 CH009 4
6 CH010 6
7 CH011 3
8 DS163 1
9 DS166 1
10 DS170 1
11 DS179 1
12 DS180 1
13 DS183 4
14 DS184 5
15 DS185 1
16 DS189 2
17 DS200 2
18 DS202 2
19 DS206 1
20 DS207 1
21 DS208 1
22 DS209 1
23 DS211 1
24 DS212 1
25 DS213 2
26 DS214 1
27 DS215 3
28 DS216 1
29 DS223 2
30 DS224 2
31 DS225 3
32 DS226 3
33 DS227 2
34 DS228 2
35 DS229 2
36 DS230 3
37 DS231 4

List all results

In [22]:
temp_df_list=[crest_hits[x].dataf for x in sorted(crest_hits.keys())  if not crest_hits[x].filtered ]
all_events=pd.concat(temp_df_list).reset_index().iloc[:,1:]
all_events.to_csv('all_events.csv',sep='\t',index=False)
all_events.sort_values(['sample'])
Out[22]:
sample chr1 pos1 direction1 n_sup_reads1 chr2 pos2 direction2 n_sup_reads2 type cov1 cov2 ass_leng1 ass_leng2 av_id_percent1 perc_non_uniq_map1 av_id_percent2 perc_non_uniq_map2
0 CH001 1 68006331 + 0 Z 14388034 + 5 CTX 22 13 0 74 0.691666666666667 0 1 0
1 CH002 21 6384544 - 0 19 2646449 + 6 CTX 57 18 0 42 1 0 0.846904761904762 0
2 CH002 6 12852578 + 11 6 2294481 + 4 INS 34 37 64 77 0.971481481481481 0 0.889523809523809 0
3 CH003 2 2947275 + 7 27 2806714 - 0 CTX 24 20 57 0 0.972857142857143 0 0.963333333333333 0
4 CH007 10 1333145 + 8 10 1362683 + 8 DEL 35 30 77 59 0.912727272727273 0 0.973777777777778 0
5 CH007 15 2062980 - 3 15 2062886 + 2 INV 25 15 61 21 0.897777777777778 0 0.927272727272727 0
6 CH007 7 21166342 + 6 7 21166821 + 9 DEL 28 24 70 71 0.905641025641026 0 0.941333333333333 0
7 CH008 18 8237534 + 15 18 8386298 + 4 DEL 40 33 72 70 0.8425 0 0.972549019607843 0
11 CH009 Z 42468587 - 1 Z 42472680 + 6 ITX 5 7 30 68 0.701666666666667 0 1 0
10 CH009 28 1444461 + 10 28 1444836 + 11 DEL 33 52 62 73 0.725964912280702 0 0.859444444444445 0
9 CH009 2 87934928 + 6 2 22973581 - 2 ITX 42 16 74 62 0.946666666666667 0 0.886666666666667 0
8 CH009 1 345357 + 6 1 353425 + 6 DEL 22 26 66 78 0.905238095238095 0 0.887619047619048 0
12 CH010 2 24444081 + 7 2 24444429 + 7 DEL 33 31 72 72 0.863333333333333 0 0.924166666666667 0
13 CH010 2 39463541 + 8 2 39470572 + 7 DEL 24 19 73 66 0.888888888888889 0 0.866666666666667 0
14 CH010 20 3467972 + 5 20 3468077 + 3 DEL 23 21 75 36 0.798095238095238 0 0.879166666666667 0
15 CH010 21 5920463 + 1 21 5920573 + 5 DEL 5 6 53 70 0.695 0 0.929333333333333 0
16 CH010 3 10720566 + 5 3 14197189 + 5 DEL 21 15 64 54 0.946666666666667 0 0.9 0
17 CH010 4 52322362 + 0 1 133821966 + 6 CTX 35 34 0 57 0.948095238095238 0 0.971111111111111 0
20 CH011 5 58896229 - 6 5 58883697 + 9 ITX 16 21 91 78 0.837777777777778 0 0.836666666666667 0
19 CH011 2 139321318 + 3 2 127151830 + 5 INS 38 45 36 62 0.933333333333333 0 1 0
18 CH011 1 29139465 + 2 1 29140265 + 5 DEL 20 24 46 68 0.897222222222222 0 0.963333333333333 0
21 DS163 8 13086111 + 9 8 13086938 + 6 DEL 55 32 81 67 0.80375 0 0.921234567901235 0
22 DS166 2 45038460 + 7 2 45038405 + 0 INS 41 57 76 0 0.879230769230769 0 0.881234567901235 0
23 DS170 3 70082618 + 11 3 70083143 + 10 DEL 42 29 70 62 0.925614035087719 0 0.898181818181818 0
24 DS179 4 30753212 + 12 4 31232474 + 4 DEL 28 30 73 51 0.757777777777778 0 0.981111111111111 0
25 DS180 10 17726969 + 5 10 17727192 + 0 DEL 18 17 66 0 0.798333333333333 0 0.949166666666667 0
28 DS183 4 57261813 + 1 4 57299595 + 3 DEL 20 11 66 59 0.763809523809524 0 0.945 0
29 DS183 7 24184982 + 5 7 24185454 + 3 DEL 9 19 50 61 0.888888888888889 0 0.906666666666667 0
26 DS183 14 2503992 + 19 14 2487066 + 9 INS 54 81 77 57 0.964086021505376 0 0.820579710144927 0
27 DS183 19 1410469 + 6 19 1343358 + 6 INS 49 41 61 76 0.923333333333333 0 0.97968253968254 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
49 DS213 8 22774299 + 5 8 22774487 + 8 DEL 26 23 76 71 0.908484848484848 0 0.874285714285714 0
48 DS213 14 8552647 + 3 14 8552127 + 7 INS 23 53 40 66 0.860476190476191 0 0.995384615384615 0
50 DS214 4 40586196 + 4 4 40647015 + 5 DEL 14 24 72 63 0.853333333333333 0 0.898333333333333 0
51 DS215 21 1310941 - 9 21 1310169 + 7 ITX 31 34 71 65 0.932 0 0.882962962962963 0
52 DS215 21 641821 - 5 21 641124 + 7 ITX 15 26 42 66 0.748888888888889 0 0.897777777777778 0
53 DS215 4 49778895 + 7 4 49780107 + 8 DEL 21 28 72 70 0.878095238095238 0 0.8625 0
54 DS216 14 7179887 + 5 14 7179968 + 7 DEL 25 27 64 66 0.935 0 0.895384615384615 0
55 DS223 1 78587807 + 9 1 78578866 + 6 INS 42 36 74 57 0.965396825396825 0 0.949411764705882 0
56 DS223 2 86956939 + 3 2 86957027 + 1 DEL 36 41 50 20 0.926315789473684 0 0.880952380952381 0
58 DS224 5 29060018 + 10 5 29058727 + 2 INS 40 36 75 54 0.843030303030303 0 0.9775 0
57 DS224 2 103021061 + 10 2 102627111 + 7 INS 49 54 72 74 0.948965517241379 0 0.927380952380952 0
59 DS225 1 136112598 + 5 1 136112951 + 6 DEL 24 23 74 71 0.873333333333333 0 0.843076923076923 0
60 DS225 1 155236846 + 3 1 155092003 + 3 INS 38 32 66 65 0.918 0 0.898181818181818 0
61 DS225 4 74440294 + 5 4 74440386 + 7 DEL 28 25 69 57 0.892666666666667 0 0.968333333333333 0
62 DS226 1 170525139 + 4 1 170525226 + 7 DEL 22 21 71 65 0.940833333333333 0 0.850833333333333 0
63 DS226 5 41411601 + 3 5 41377555 + 3 INS 20 32 71 66 0.951764705882353 0 0.906666666666667 0
64 DS226 9 8689636 + 7 9 8689906 + 4 DEL 19 18 59 28 0.901666666666667 0 0.994444444444444 0
66 DS227 3 80040618 + 9 3 81085845 + 8 DEL 42 46 71 37 0.82974358974359 0 1 0
65 DS227 1 109980147 + 4 1 109980778 + 9 DEL 23 28 73 73 0.937435897435898 0 0.852121212121212 0
68 DS228 5 51858227 + 9 5 51887705 + 4 DEL 34 28 73 64 0.820740740740741 0 0.961212121212121 0
67 DS228 2 35158167 + 9 2 35061928 + 5 INS 42 48 75 74 0.949473684210526 0 0.9288 0
69 DS229 15 9576388 + 7 15 9503038 + 11 INS 53 43 76 53 0.94 0 0.912549019607843 0
70 DS229 4 14551051 + 8 4 14552596 + 15 DEL 25 40 71 72 0.917333333333333 0 0.926296296296296 0
71 DS230 3 87956068 + 5 3 89203619 - 5 ITX 27 23 70 60 0.873939393939394 0 0.804761904761905 0
72 DS230 3 89203621 - 7 3 87956068 + 6 ITX 23 27 79 73 0.873939393939394 0 0.804761904761905 0
73 DS230 Z 31596051 + 2 Z 31597432 + 7 DEL 8 11 67 71 0.686666666666667 0 0.868571428571429 0
75 DS231 23 4745789 + 5 23 4740010 + 5 INS 32 49 47 51 0.98 0 1 0
76 DS231 5 31552860 + 4 5 31513744 + 6 INS 39 41 47 47 0.976 0 1 0
74 DS231 1 83469609 + 9 1 83474490 + 2 DEL 27 29 75 62 0.76 0 0.93859649122807 0
77 DS231 8 1105853 + 4 8 1105985 + 1 DEL 18 19 73 40 0.894166666666667 0 0.92952380952381 0

78 rows × 18 columns