Postprocessing the results of CREST on DT40 R4


Filters:

  • DONE No scaffolds
  • DONE No N-s in reference in +- 150 bp
  • DONE No indels in overlapping reads
    • Too strict?
  • DONE Enough evidence
    • How many soft+hard clips? Now at least 5
  • DONE Germline events
    • done in pairs
  • DONE Event covered in germline sample

In [1]:
import pandas as pd
from pandas import DataFrame as df

from Bio import SeqIO
from Bio.Seq import Seq

import fnmatch
import os
import subprocess
import re
import time

from IPython.core.display import Image 
from IPython.display import display

Quick look at data

In [2]:
%%bash
#Check how many SV-s CREST reported:
echo The number of SVs found by CREST:
cat data/* | wc -l
The number of SVs found by CREST:
3814
In [3]:
#some global useful values

bam_link_dir='/nagyvinyok/adat83/sotejedlik/orsi/bam_all_links/'
postfix='_RMdup_picard_realign.bam'
chromosomes=map(str,range(1,29))+['32','W','Z']

#for testing
LIMIT=3815
In [4]:
#load chicken reference genome to memory

refFasta="/home/ribli/input/index/gallus/Gallus_gallus.Galgal4.74.dna.toplevel.fa"
chrom_dict=dict()
for seqin in SeqIO.parse(refFasta,"fasta"):
    if(seqin.id in chromosomes):
        chrom_dict[seqin.id]=seqin.seq
In [5]:
#define a class for reads from a region
 
class data_from_region:
    
    def __init__(self,samples,chrom,pos):
        self.samples,self.chrom,self.pos=samples,chrom,pos
             
    def get_sam_region(self):
        self.sam_region=''
        for bam_file in [bam_link_dir+sample+postfix for sample in self.samples]:
            cmd='samtools view ' + bam_file + ' '+ self.chrom + ':' +str(int(self.pos))+'-'+str(int(self.pos))
            self.sam_region+=subprocess.check_output(cmd,shell=True)
            
    def get_clips_indels(self):
        self.hard_clips,self.soft_clips,self.ins,self.dels=[],[],[],[]
        
        if (self.sam_region.strip()==''): # no mapped read!
            return
        
        for line in self.sam_region.strip().split('\n'):
            line_list=line.split('\t')
            try:
                start,cigar=int(line_list[3]),line_list[5]
            except:
                print line_list
                
            self.parse_cigar_to_list(start,cigar)
            
    def parse_cigar_to_list(self,start,cigar):
        first_clip=True
        pos=0 # position in the genome (w offset)
        i_bef=0 #start of a number
        for i in xrange(len(cigar)):
            if cigar[i]=='M':
                pos+=int(cigar[i_bef:i])
                first_clip=False
                i_bef=i+1
            elif cigar[i]=='S':
                if(first_clip):
                    self.soft_clips.append(start)
                else:
                    self.soft_clips.append(start+pos-1)
                i_bef=i+1           
            elif cigar[i]=='H':
                if(first_clip):
                    self.hard_clips.append(start)
                else:
                    self.hard_clips.append(start+pos-1)
                i_bef=i+1
            elif cigar[i]=='D':
                self.dels.append(start+pos)
                pos+=int(cigar[i_bef:i])
                i_bef=i+1
            elif cigar[i]=='I':
                self.ins.append(start+pos)
                i_bef=i+1       
      
    def get_evidence(self,pos):
        return self.soft_clips.count(self.pos),self.hard_clips.count(self.pos)
In [6]:
#define a class for CREST result entries

class crest_hit:
    
    def __init__(self,fname,line):
        self.parse_sample_from_filename(fname)
        
        self.dataf=df([[self.sample] + line.split('\t')[0:17]],
                      columns=['sample','chr1','pos1','direction1','n_sup_reads1',
                               'chr2','pos2','direction2','n_sup_reads2','type',
                               'cov1','cov2','ass_leng1','ass_leng2','av_id_percent1',
                               'perc_non_uniq_map1','av_id_percent2','perc_non_uniq_map2'])
        
        self.chr1=self.dataf.loc[0,'chr1']
        self.chr2=self.dataf.loc[0,'chr2']
        self.pos1=int(self.dataf.loc[0,'pos1'])
        self.pos2=int(self.dataf.loc[0,'pos2'])
        
        self.filtered=False
        self.filter_causes=[]
        
    def parse_sample_from_filename(self,fname):
        #R1
        if (fname[:6]=='Sample'):
            self.sample=fname[:7]
        #other rounds
        else:
            self.sample=fname[:5]
        
    
    ####
    #read the data
    
    def get_data_from_sample_regions(self):
        self.data_from_sample=[]
        self.data_from_sample.append(data_from_region([self.sample],self.chr1,self.pos1))
        self.data_from_sample[0].get_sam_region()
        self.data_from_sample[0].get_clips_indels()
        
        self.data_from_sample.append(data_from_region([self.sample],self.chr2,self.pos2))
        self.data_from_sample[1].get_sam_region()
        self.data_from_sample[1].get_clips_indels()
        
        
    def get_data_from_germline_regions(self,germ_samples):
        self.data_from_germline=[]
        self.data_from_germline.append(data_from_region(germ_samples,self.chr1,self.pos1))
        self.data_from_germline[0].get_sam_region()
        self.data_from_germline[0].get_clips_indels()
        
        self.data_from_germline.append(data_from_region(germ_samples,self.chr2,self.pos2))
        self.data_from_germline[1].get_sam_region()
        self.data_from_germline[1].get_clips_indels()
        
    
    ################################################
    ##filters    
        
    ##################
    #without read data
    
    def filter_4_chromosomes(self):
        if ( (not self.chr1 in chromosomes) or
            (not self.chr2 in chromosomes)):
            self.filtered=True
            self.filter_causes.append('not on real chromosome')
                     
    def filter_4_Ns(self,chrom_dict):
        if (len(re.findall('[Nn]',str(chrom_dict[self.chr1][self.pos1-150:self.pos1+150])))!=0 or
           len(re.findall('[Nn]',str(chrom_dict[self.chr2][self.pos2-150:self.pos2+150])))!=0):
            self.filtered=True
            self.filter_causes.append('N-s in reference')
        
    ##################
    #with read data
    
    def filter_4_indels(self):
        if ( len(self.data_from_sample[0].ins+ self.data_from_sample[0].dels +
               self.data_from_sample[1].ins+ self.data_from_sample[1].dels ) != 0 ):
            self.filtered=True
            self.filter_causes.append('indels')
            
    def filter_4_evidence(self,threshold):
        if(sum(self.data_from_sample[0].get_evidence(self.pos1)) < threshold or
             sum(self.data_from_sample[1].get_evidence(self.pos2)) < threshold):
            self.filtered=True
            self.filter_causes.append('not enough evidence')
               
    def filter_4_germline(self,threshold):
        if(sum(self.data_from_germline[0].get_evidence(self.pos1)) > threshold or
             sum(self.data_from_germline[1].get_evidence(self.pos2)) > threshold):
            self.filtered=True
            self.filter_causes.append('germline event')
            
    def filter_4_germline_coverage(self):
        if( (self.data_from_germline[0].sam_region.strip()=='') or
            (self.data_from_germline[1].sam_region.strip()=='')):
            self.filtered=True
            self.filter_causes.append('germline sample not covered')
In [7]:
#load every entry (only 2 positions!!!)

crest_hits={}
input_dir='data/'
for fname in os.listdir(input_dir):
    f_handle=open(input_dir+fname)
    for line in f_handle:
        key=' '.join( [fname.split('.')[0][:5]]+ line.split('\t')[:2])
        crest_hits[key]=crest_hit(fname,line)
    f_handle.close()

Filter for scaffolds, and N-s in reference

In [8]:
#filter for chromosome, and Ns in genome

for key in crest_hits.keys():
    if(crest_hits[key].filtered==True):
        continue
    crest_hits[key].filter_4_chromosomes()
    
for key in crest_hits.keys():
    if(crest_hits[key].filtered==True):
        continue
    crest_hits[key].filter_4_Ns(chrom_dict)
In [9]:
#count whats left

i=0
for key in sorted(crest_hits.keys())[:LIMIT]:
    if (crest_hits[key].filtered==False):
        i+=1
print 'After filtering for scaffolds, and Ns is reference we have:', i, 'results'
After filtering for scaffolds, and Ns is reference we have: 3274 results

Read data from sample

In [10]:
#Read the bams files overlapping the positions

start=time.time()
for key in sorted(crest_hits.keys())[:LIMIT]:
    if(crest_hits[key].filtered==True):
        continue
    crest_hits[key].get_data_from_sample_regions()
print 'It took ',time.time()-start,'s'
It took  669.438032866 s

Filter for indels

In [11]:
#filter for indels

for key in sorted(crest_hits.keys())[:LIMIT]:
    if(crest_hits[key].filtered==True):
        continue
    crest_hits[key].filter_4_indels()
In [12]:
#count whats left

i=0
for key in sorted(crest_hits.keys())[:LIMIT]:
    if (crest_hits[key].filtered==False):
        i+=1
print 'After filtering for indels we have:', i, 'results'
After filtering for indels we have: 838 results

Filter for enough evidence

In [13]:
#filter for enough evidence

for key in sorted(crest_hits.keys())[:LIMIT]:
    if(crest_hits[key].filtered==True):
        continue
    crest_hits[key].filter_4_evidence(threshold=5)
In [14]:
#count whats left

i=0
for key in sorted(crest_hits.keys())[:LIMIT]:
    if (crest_hits[key].filtered==False):
        i+=1
print 'After filtering for enough evidence we have:', i, 'results'
After filtering for enough evidence we have: 273 results

Filter for germline events


In [15]:
##All sample codes
#r1
samples=[ 'Sample'+str(i) for i in xrange(1,7)]

#r2
samples+=['DS001','DS002','DS003','DS004','DS005','DS006','DS007','DS009',
         'DS010','DS011','DS014','DS015','DS016','DS018','DS026','DS027',
         'DS033','DS034','DS035','DS036','DS037']
#r3
samples+=[ 'DS0'+str(i) for i in xrange(41,73)]

#r4
samples+=['DS073', 'DS074', 'DS081', 'DS082', 'DS083', 'DS084', 'DS085',
         'DS086', 'DS087', 'DS088', 'DS089', 'DS090', 'DS091', 'DS092',
         'DS093', 'DS094', 'DS095', 'DS096', 'DS097', 'DS098', 'DS099',
         'DS100', 'DS101', 'DS102', 'DS104', 'DS105', 'DS106', 'DS107',
         'DS108', 'DS109', 'DS110', 'DS111', 'DS112', 'DS113', 'DS114',
         'DS115', 'DS116', 'DS117', 'DS118', 'DS119', 'DS120', 'DS121',
         'DS122', 'DS123', 'DS124', 'DS125', 'DS126', 'DS127', 'DS128',
         'DS129', 'DS130', 'DS131', 'DS132', 'DS133', 'DS134', 'DS135',
         'DS136', 'DS137']

#r5
samples+=[ 'DS'+str(i) for i in xrange(141,149)]

Read data from germline samples

  • germline samples are all the samples except for the analysed
In [16]:
#Read the bams files overlapping the positions
start=time.time()
for key in sorted(crest_hits.keys())[:LIMIT]:
    if(crest_hits[key].filtered==True):
        continue
    germ_samples=set(samples)
    germ_samples.remove(crest_hits[key].sample)
    crest_hits[key].get_data_from_germline_regions(list(germ_samples))
print 'It took ',time.time()-start,'s'
It took  9873.00809312 s

Filter for germline events

In [17]:
#filter for germline events

for key in sorted(crest_hits.keys())[:LIMIT]:
    if(crest_hits[key].filtered==True):
        continue
    crest_hits[key].filter_4_germline(threshold=5)
In [18]:
#count whats left

i=0
for key in sorted(crest_hits.keys())[:LIMIT]:
    if (crest_hits[key].filtered==False):
        i+=1
print 'After filtering for germline events we have:', i, 'results'
After filtering for germline events we have: 113 results

Filter for germline coverage

In [19]:
#filter for germline coverage

for key in sorted(crest_hits.keys())[:LIMIT]:
    if(crest_hits[key].filtered==True):
        continue
    crest_hits[key].filter_4_germline_coverage()
In [20]:
#count whats left

i=0
for key in sorted(crest_hits.keys())[:LIMIT]:
    if (crest_hits[key].filtered==False):
        i+=1
print 'After filtering for germline coverage we have:', i, 'results'
After filtering for germline coverage we have: 113 results

Aggregate results by samples

In [22]:
temp_df_list=[crest_hits[x].dataf for x in sorted(crest_hits.keys())[:LIMIT] if not crest_hits[x].filtered ]
sample_agg_df=pd.DataFrame(pd.concat(temp_df_list).groupby(['sample']).agg('size')).reset_index()
sample_agg_df.columns=['sample','#events']
sample_agg_df.to_csv('/home/ribli/DT40/SV/CREST/R_1-5/results/sample_agg.csv',sep='\t',index=False)
sample_agg_df
Out[22]:
sample #events
0 DS015 2
1 DS016 1
2 DS018 4
3 DS053 1
4 DS054 1
5 DS055 3
6 DS056 2
7 DS057 1
8 DS058 4
9 DS067 1
10 DS068 1
11 DS087 1
12 DS090 1
13 DS094 1
14 DS099 1
15 DS101 2
16 DS106 1
17 DS109 8
18 DS110 2
19 DS111 5
20 DS112 10
21 DS113 6
22 DS114 9
23 DS115 9
24 DS116 15
25 DS117 9
26 DS122 1
27 DS127 1
28 DS131 1
29 DS132 1
30 DS142 1
31 DS144 1
32 DS146 1
33 DS148 1
34 Sample1 1
35 Sample2 3

List all results

In [23]:
temp_df_list=[crest_hits[x].dataf for x in sorted(crest_hits.keys())[:LIMIT] if not crest_hits[x].filtered ]
all_events=pd.concat(temp_df_list).reset_index().iloc[:,1:]
all_events.to_csv('/home/ribli/DT40/SV/CREST/R_1-5/results/all_events.csv',sep='\t',index=False)
all_events.sort_values(['sample'])
Out[23]:
sample chr1 pos1 direction1 n_sup_reads1 chr2 pos2 direction2 n_sup_reads2 type cov1 cov2 ass_leng1 ass_leng2 av_id_percent1 perc_non_uniq_map1 av_id_percent2 perc_non_uniq_map2
0 DS015 8 11829820 + 5 8 11780449 + 5 INS 35 24 68 32 1 0 0.927037037037037 0
1 DS015 Z 72678553 + 3 Z 72678834 + 2 DEL 9 11 68 32 0.773333333333333 0 0.864761904761905 0
2 DS016 2 80384148 + 1 2 80339126 + 4 INS 44 32 42 76 0.895897435897436 0 1 0
3 DS018 15 7091709 + 4 15 7097274 + 4 DEL 20 17 75 65 0.856190476190476 0 0.895151515151515 0
4 DS018 5 26812765 + 2 5 26663620 + 4 INS 21 31 78 59 0.928333333333333 0 0.910909090909091 0
5 DS018 5 35880263 + 5 5 30147925 + 5 INS 24 26 49 71 0.906666666666667 0 0.953333333333333 0
6 DS018 7 21042677 + 6 2 137440243 + 0 CTX 23 56 58 0 0.951666666666667 0 0.964722222222222 0
7 DS053 4 35232312 + 4 4 35189694 + 3 INS 30 23 61 52 0.963294117647059 0 0.937666666666667 0
8 DS054 4 58011446 + 11 4 57694945 + 3 INS 39 35 68 64 0.953454545454545 0 0.917333333333333 0
9 DS055 1 125228043 + 6 1 125103059 + 3 INS 25 33 56 50 0.9676 0 0.874666666666667 0
10 DS055 24 3214419 + 8 24 2710269 + 3 INS 58 68 45 60 0.964085106382979 0 0.961142857142857 0
11 DS055 24 4343450 + 4 24 4343565 + 3 DEL 48 45 45 59 0.957714285714286 0 0.967238095238095 0
13 DS056 7 15064708 + 5 7 15064800 + 3 DEL 22 29 50 15 0.903529411764706 0 0.886315789473684 0
12 DS056 26 5208763 + 6 26 5208852 + 3 DEL 15 14 54 28 0.852 0 0.972 0
14 DS057 2 30238003 + 3 2 30238261 + 1 DEL 32 28 71 28 0.911666666666667 0 0.908 0
15 DS058 14 7582254 + 16 14 7595150 + 19 DEL 63 68 71 72 0.890114942528736 0 0.917543859649123 0
16 DS058 2 64639806 + 13 2 64639950 + 4 DEL 99 70 76 61 0.887619047619048 0 0.926231884057971 0
17 DS058 20 4229428 + 8 20 4229522 + 2 DEL 87 85 74 27 0.932777777777778 0 0.912820512820513 0
18 DS058 Z 33339107 + 10 Z 33339338 - 8 ITX 15 15 71 61 0.62 0 0.872592592592593 0
19 DS067 9 4471533 + 6 9 4472428 + 3 DEL 19 11 55 60 0.846545454545455 0 0.896 0
20 DS068 2 22332899 + 7 2 22332952 + 2 DEL 32 24 45 39 0.937230769230769 0 0.9805 0
21 DS087 7 7284020 + 7 7 7283933 + 2 INS 39 31 56 47 0.8868 0 0.866909090909091 0
22 DS090 6 3284035 + 3 6 3283952 + 2 INS 36 37 60 39 0.920827586206897 0 0.929714285714286 0
23 DS094 2 40671150 + 6 2 40674603 + 5 DEL 59 40 35 52 0.951804878048781 0 0.930074074074074 0
24 DS099 5 42214904 + 3 5 42215986 + 8 DEL 21 25 50 51 0.8552 0 0.938666666666667 0
25 DS101 5 56016310 + 6 5 56016558 + 9 DEL 42 40 61 55 0.896761904761905 0 0.943448275862069 0
26 DS101 7 9308799 + 2 7 9308937 + 6 DEL 27 28 41 54 0.885176470588235 0 0.965473684210526 0
27 DS106 8 6774485 + 8 8 6774569 + 2 DEL 21 19 66 19 0.839333333333333 0 0.963692307692308 0
28 DS109 1 175023214 + 9 1 175032190 + 9 DEL 40 38 50 56 0.925866666666667 0 0.907466666666667 0
29 DS109 1 87422820 + 9 1 87436109 + 10 DEL 34 35 55 48 0.876 0 0.899130434782609 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
86 DS116 3 41656603 + 8 3 41657377 + 5 DEL 31 41 65 52 0.888470588235294 0 0.934 0
84 DS116 2 52941263 + 8 2 52941565 + 8 DEL 50 38 48 55 0.912761904761905 0 0.95625 0
82 DS116 1 24014582 + 4 1 24017529 + 4 DEL 30 28 59 44 0.921454545454545 0 0.9204 0
81 DS116 1 159442168 + 7 1 159442427 + 1 DEL 21 23 47 55 0.907636363636363 0 0.889714285714286 0
80 DS116 1 141178110 - 14 1 45346483 + 4 ITX 34 33 56 48 0.888888888888889 0 0.829241379310345 0
79 DS116 1 135043966 + 6 1 135069177 + 5 DEL 24 23 38 58 0.942666666666667 0 0.913714285714286 0
78 DS116 1 135006515 + 13 1 135025436 + 8 DEL 33 40 56 61 0.870181818181818 0 0.920296296296296 0
77 DS116 1 116547564 - 1 1 65864349 + 7 ITX 36 43 33 48 0.936533333333333 0 0.916740740740741 0
83 DS116 1 65867665 + 10 1 116547564 - 8 ITX 46 36 51 46 0.942344827586207 0 0.916740740740741 0
97 DS117 3 1657778 + 16 3 1657929 + 8 DEL 44 56 55 60 0.869333333333333 0 0.898051282051282 0
100 DS117 Z 42636126 + 14 Z 42636405 + 9 DEL 17 16 62 57 0.6888 0 0.8608 0
99 DS117 Z 2310614 + 4 Z 2311705 + 2 DEL 5 6 59 37 0.764 0 0.926 0
98 DS117 9 23002156 + 7 9 23002215 + 2 DEL 22 27 57 34 0.8275 0 0.944 0
96 DS117 2 99514768 + 6 2 99527318 + 6 DEL 31 57 59 52 0.88608 0 0.967157894736842 0
95 DS117 1 90188037 + 11 1 90188303 + 4 DEL 29 28 64 51 0.87 0 0.8764 0
94 DS117 1 73785831 + 9 1 73786354 - 5 ITX 17 17 42 57 0.887 0 0.893333333333333 0
93 DS117 1 36236460 + 10 1 36236724 + 3 DEL 24 18 59 47 0.813714285714286 0 0.897684210526316 0
92 DS117 1 109667255 + 6 1 109667741 + 9 DEL 31 28 55 58 0.9428 0 0.867636363636364 0
101 DS122 19 1911217 + 4 19 2400860 + 6 DEL 42 40 66 55 0.94784 0 0.93824 0
102 DS127 4 45506958 + 7 4 45528662 + 5 DEL 48 40 58 41 0.94175 0 0.9648 0
103 DS131 5 40005568 + 4 5 40006324 + 3 DEL 22 14 49 49 0.894 0 0.9608 0
104 DS132 1 69228608 + 5 1 69236012 + 3 DEL 18 11 53 34 0.888470588235294 0 0.930285714285714 0
105 DS142 3 19274224 + 4 3 19297772 + 4 DEL 21 15 46 28 0.927428571428571 0 0.912 0
106 DS144 5 13466640 + 5 5 13466804 + 5 DEL 23 20 61 42 0.879466666666667 0 0.918285714285714 0
107 DS146 15 5706170 + 5 15 5631455 + 4 INS 19 32 45 62 0.958909090909091 0 0.922461538461539 0
108 DS148 9 16036941 + 9 9 15999834 + 8 INS 51 40 58 62 0.912285714285714 0 0.953052631578947 0
112 Sample1 20 13777112 + 9 20 13777180 + 9 DEL 64 74 46 29 0.837227722772277 0 0.859358224979124 0
110 Sample2 1 164316313 + 6 1 164316367 + 5 DEL 50 44 43 28 0.934790030727211 0 0.909209788903419 0
109 Sample2 1 102363762 + 9 1 102360719 + 9 INS 79 52 43 42 0.897969457962745 0 0.918857402981678 0
111 Sample2 19 9016891 + 13 19 9017300 + 9 DEL 63 66 44 29 0.8849532840608 0 0.871015868710159 0

113 rows × 18 columns