Phylogeny on the DT40 samples

  • The goal of this notebook is to create the phylogeny tree of the DT40 samples based on SNV-s.
  • created on 2015.09.22

Code is originally hidden in this notebook, if you are interested in details, please click on the button below.¶

In [1]:
#Toggle code

from IPython.display  import HTML
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Out[1]:
In [1]:
#import modules

import subprocess
import sys
import re
import numpy as np
import pandas as pd
import fnmatch
import os
import time
import copy

from Bio import AlignIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

Collect the positions which have non-uniform genotype among samples


Definition of "genotype" :

  • the reference base if the sample has homozygous reference bases
  • the mutation base if the sample has heterozygous or homozygous mutation
    • limit for mutation is reffreq < 0.7

To avoid false genotype calls, I only keep positions which are nice:

  • no samples between 0.7 and 1 reference base frequency
  • coverage filter: minimum coverage > 5 in every sample
In [4]:
%%writefile collect_genotype.py
#!/usr/bin/python

import sys
import re
import numpy as np

cov_limit=5 #coverage limit for all samples
mut_threshold=0.7 #threshold for mutations

in_fname=sys.argv[1] #input filename comes from stdin
in_f=open(in_fname,'r')
output_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/phylogeny/test/'
out_f=open(output_dir+in_fname.split('/')[-1].split('.')[0]+'.geno','w')

for line in in_f: #loop over positions

    if (line[0]=='#'): # skip Orsis lines
        continue

    #parse pileup line
    linelist=line.strip().upper().split(' ')
    
    covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
    if(min(covs) < cov_limit ): #coverage filter
        continue
    
    bases=linelist[4::2]
    ref_count=[]
    for i in xrange(len(bases)):
        ref_count.append(len(re.findall('[\.\,]',bases[i]))) 
    ref_count=np.array(ref_count,dtype=np.int32)
    
    ref_freq= ref_count / np.float64(covs)

    
    #the usable positions: there is a het position, and there is a homo too
    #and there are no low frequency positions
    if (min(ref_freq) < mut_threshold and #ther is mutation
        max(ref_freq) == 1 and  #there is reference too
        list(np.logical_and(ref_freq<1,ref_freq >mut_threshold)).count(True)==0): #nice pos

        #"genotyping"
        genotype=[]
        for i in xrange(len(bases)): #reference
            if (ref_freq[i]==1):
                genotype.append(linelist[2])
            else: #mutation
                a_count=len(re.findall('A',bases[i]))
                c_count=len(re.findall('C',bases[i]))
                g_count=len(re.findall('G',bases[i]))
                t_count=len(re.findall('T',bases[i]))
                genotype.append(['A','C','G','T'][np.argmax([a_count,c_count,g_count,t_count])])
        out_f.write(' '.join(genotype)+'\n')
        
in_f.close()
out_f.close()
Overwriting collect_genotype.py
In [ ]:
#Run them in slurm

input_dir= '/nagyvinyok/adat83/sotejedlik/orsi/SNV/SNV_list_withB_allsamples/'
for filename in os.listdir(input_dir):
    try:
        print subprocess.check_output([ 'sbatch', '-C','jimgray83',
                                       '--mem',str(1000),'./collect_genotype.py' ,
                                       input_dir + filename],stderr=subprocess.STDOUT),
    except subprocess.CalledProcessError, e:
        print e.output,
In [29]:
input_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/phylogeny/test/'
genotype=[]

for filename in os.listdir(input_dir):
    try:
        genotype.append(pd.read_csv(input_dir+filename,sep=' ',header=None))
    except Exception,e:
        pass
genotype=pd.concat(genotype)
genotype=np.array(genotype)

print 'The number of usable positions:' ,len(genotype[:,0])
The number of usable positions: 43244

Tree construction


Tree is made using the neighbour joining method of BioPython Phylo package, using all selected positions

  • NJ is not the most sophisticated phylogeny-tree building method, but it is much faster than the sophisticated ones
In [30]:
#Load sample names for annotation

#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/orsi/bam_all_links'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
    if (fnmatch.fnmatch(fname, '*.bam') and 
        not fnmatch.fnmatch(fname,"*.bai")):
        fnames.append(fname)
fnames=sorted(fnames)

for i in xrange(len(fnames)):
    fnames[i]=(fnames[i].split('_'))[0]
In [31]:
#Create the multiple sequence alignement records

test=AlignIO.MultipleSeqAlignment([])
for i in xrange(len(genotype[0,])):
    test.extend([SeqRecord(Seq(''.join(genotype[:,i])),id=fnames[i])])
In [32]:
#Construct the tree

calculator = DistanceCalculator('identity')
constructor = DistanceTreeConstructor(calculator, 'nj')


start=time.time()
tree = constructor.build_tree(test)
print 'Tree construction took: ', time.time()-start,'s'
Tree construction took:  653.09604311 s

Plotting the tree


  • Tree is plotted using BioPython + matplotlib
    • this is a really simple plotting setup, more advanced ones can be used for publication quality figures (ETE2)
  • Leaves are annotated using samplenames.
  • Cell lines are colored
In [33]:
#select the WT grou samples set
group=['Sample1','Sample4'] #R1
for i in range(1,8)+[9]: #R2
    group.append('DS00'+str(i))
for i in range(10,12): #R2
    group.append('DS0'+str(i))
for i in range(41,51): #R3
    group.append('DS0'+str(i))
for i in [73,74]: #R4test
    group.append('DS0'+str(i))
for i in range(81,98): #R4
    group.append('DS0'+str(i))
wt_set=set(group)

#select the BRCA1 homo group samples set
group=['Sample2','Sample5'] #R1
for i in [14,15,16,18]: #R2
    group.append('DS0'+str(i))
for i in range(51,59): #R3
    group.append('DS0'+str(i))
for i in [101,102,103]: #R4
    group.append('DS'+str(i))
brca1_homo_set=set(group)


#select the BRCA1 hetero
group=[]
for i in [26,27]: #R2
    group.append('DS0'+str(i))
for i in [59,60,61]: #R3
    group.append('DS0'+str(i))
for i in range(104,108): #R4
    group.append('DS'+str(i))
brca1_hetero_set=set(group)

#select the BRCA2 homo group samples set
group=[]
for i in range(108,118): #R4
    group.append('DS'+str(i))
brca2_homo_set=set(group)

#select the BRCA2 hetero group samples set
group=[]
for i in range(118,127): #R4
    group.append('DS'+str(i))
brca2_hetero_set=set(group)

#select the spartan group samples set
group=[]
for i in range(135,138): #R4
    group.append('DS'+str(i))
spartan_set=set(group)

#select the PCNA group samples set
group=['Sample3','Sample6'] #R1
for i in range(66,73): #R3
    group.append('DS0'+str(i))
pcna_set=set(group)
    
#select the PCNA spartan group samples set
group=[]
for i in [98,99]: #R4
    group.append('DS0'+str(i)) 
for i in [100]: #R4
    group.append('DS'+str(i))
pcna_spartan_set=set(group)


#select the Ku70 group samples set
group=[]
for i in range(33,38): #R2
    group.append('DS0'+str(i))
for i in [62]: #R3
    group.append('DS0'+str(i))
for i in range(128,135): #R4
    group.append('DS'+str(i))
ku70_set=set(group)

#select the blm group samples set
group=[]
for i in [63,64,65]: #R3
    group.append('DS0'+str(i))
blm_set=set(group)

#new WT
group=[]
for i in range(141,145): 
    group.append('DS'+str(i))
wt2_set=set(group)

#db1
group=[]
for i in range(145,149): 
    group.append('DS'+str(i))
db1_set=set(group)
In [34]:
for clade in tree.find_clades():
    if( clade.name[:5]=='Inner'):
        clade.color='gray'
    else: #internal branches
        if (clade.name in wt_set):
            clade.color='blue'
        if (clade.name in brca1_homo_set):
            clade.color='orange'
        if (clade.name in brca1_hetero_set):
            clade.color='purple'
        if (clade.name in brca2_homo_set):
            clade.color='magenta'
        if (clade.name in brca2_hetero_set):
            clade.color='navy'
        if (clade.name in pcna_set):
            clade.color='green'
        if (clade.name in pcna_spartan_set):
            clade.color='salmon'
        if (clade.name in ku70_set):
            clade.color='red'
        if (clade.name in blm_set):
            clade.color='yellow'
        if (clade.name in spartan_set):
            clade.color='cyan'
        if (clade.name in wt2_set):
            clade.color='gold'
        if (clade.name in db1_set):
            clade.color='lime'
In [35]:
#some settings
matplotlib.rc('font', size=12)
matplotlib.rcParams['lines.linewidth'] = 3
matplotlib.rcParams['figure.figsize'] =(16,26) 

fig,ax=plt.subplots()

#custom legend
wt_proxy = plt.Rectangle((0, 0), 1, 1, fc="blue")
brca1_homo_proxy = plt.Rectangle((0, 0), 1, 1, fc="orange")
brca1_hetero_proxy = plt.Rectangle((0, 0), 1, 1, fc="purple")
brca2_homo_proxy = plt.Rectangle((0, 0), 1, 1, fc="magenta")
brca2_hetero_proxy = plt.Rectangle((0, 0), 1, 1, fc="navy")
pcna_proxy = plt.Rectangle((0, 0), 1, 1, fc="green")
pcna_spartan_proxy = plt.Rectangle((0, 0), 1, 1, fc="salmon")
ku70_proxy = plt.Rectangle((0, 0), 1, 1, fc="red")
blm_proxy = plt.Rectangle((0, 0), 1, 1, fc="yellow")
spartan_proxy = plt.Rectangle((0, 0), 1, 1, fc="cyan")
wt2_proxy = plt.Rectangle((0, 0), 1, 1, fc="gold")
db1_proxy = plt.Rectangle((0, 0), 1, 1, fc="lime")


ax.legend([wt_proxy,brca1_homo_proxy,brca1_hetero_proxy,
           brca2_homo_proxy,brca2_hetero_proxy,pcna_proxy,
           pcna_spartan_proxy,ku70_proxy,blm_proxy,spartan_proxy,
          wt2_proxy,db1_proxy],
          ['wt','brca1 homo','brca1 hetero','brca2 homo','brca2 hetero',
           'pcna','pcna spartan','ku70','blm','spartan','wt2','db1'],
         fancybox=True)

#draw tree
def my_label(clade):
    if( clade.name[:5]=='Inner'):
        return None
    else:
        return clade.name
    
Phylo.draw(tree,my_label,axes=ax)

Force topology

In [36]:
#force topology
topo_tree = copy.deepcopy(tree)

for clade in topo_tree.find_clades():
    clade.branch_length=0.1
In [37]:
#some settings
matplotlib.rc('font', size=12)
matplotlib.rcParams['lines.linewidth'] = 3
matplotlib.rcParams['figure.figsize'] =(16,26) 


fig,ax=plt.subplots()

#custom legend
wt_proxy = plt.Rectangle((0, 0), 1, 1, fc="blue")
brca1_homo_proxy = plt.Rectangle((0, 0), 1, 1, fc="orange")
brca1_hetero_proxy = plt.Rectangle((0, 0), 1, 1, fc="purple")
brca2_homo_proxy = plt.Rectangle((0, 0), 1, 1, fc="magenta")
brca2_hetero_proxy = plt.Rectangle((0, 0), 1, 1, fc="navy")
pcna_proxy = plt.Rectangle((0, 0), 1, 1, fc="green")
pcna_spartan_proxy = plt.Rectangle((0, 0), 1, 1, fc="salmon")
ku70_proxy = plt.Rectangle((0, 0), 1, 1, fc="red")
blm_proxy = plt.Rectangle((0, 0), 1, 1, fc="yellow")
spartan_proxy = plt.Rectangle((0, 0), 1, 1, fc="cyan")
wt2_proxy = plt.Rectangle((0, 0), 1, 1, fc="gold")
db1_proxy = plt.Rectangle((0, 0), 1, 1, fc="lime")


ax.legend([wt_proxy,brca1_homo_proxy,brca1_hetero_proxy,
           brca2_homo_proxy,brca2_hetero_proxy,pcna_proxy,
           pcna_spartan_proxy,ku70_proxy,blm_proxy,spartan_proxy,
          wt2_proxy,db1_proxy],
          ['wt','brca1 homo','brca1 hetero','brca2 homo','brca2 hetero',
           'pcna','pcna spartan','ku70','blm','spartan','wt2','db1'],
         fancybox=True)

#draw tree
def my_label(clade):
    if( clade.name[:5]=='Inner'):
        return None
    else:
        return clade.name
    
Phylo.draw(topo_tree,my_label,axes=ax)

Conclusion


  • The cell lines are very nicely separated, altough there are some strange sample positions (e.g.: DS048)
  • Most mutations are generated during the treatment phases
  • Note that the mixed-up samples were easily and clearly identified using this method
  • PCNA, and BRCA2 has very high number of cell line speficic mutations