Run CREST on samples in the all rounds of DT40 sequencing


Pipeline

  • define 'tumor' - 'normal' pairs (treated, untreated)
  • start a gfserver on used machines
  • run CREST on all samples, and all chromosomes in parallel. (as parallel as possible).
  • collect results

In [1]:
#import modules

import os
import subprocess
import time

Defining 'tumor', 'normal' pairs for CREST

  • normal samples are always the 'starting clones' using David's all_samples.txt
In [2]:
#define sample pairs
pairs=dict()

######################
#R1
pairs['Sample1']='Sample4'
pairs['Sample4']='Sample1'

pairs['Sample2']='Sample5'
pairs['Sample5']='Sample2'

pairs['Sample3']='Sample6'
pairs['Sample6']='Sample3'

######################
#R2
#WT
pairs['DS001']='DS002'
for i in range(1,8)+[9,10,11]:
    pairs['DS'+str(i).zfill(3)]='DS001'
    
#BRCA -/-
pairs['DS014']='DS015'
for i in [15,16,17,18]:
    pairs['DS'+str(i).zfill(3)]='DS014'

#BRCA +/-
pairs['DS026']='DS027'
pairs['DS027']='DS026'

#KU70
pairs['DS033']='DS034'
for i in range(34,38):
    pairs['DS'+str(i).zfill(3)]='DS033'

#####################
#R3
#WT
pairs['DS050']='DS041'
for i in range(41,50):
    pairs['DS'+str(i).zfill(3)]='DS050'
    
#BRCA -/-
pairs['DS051']='DS052'
for i in range(52,59):
    pairs['DS'+str(i).zfill(3)]='DS051'

#BRCA +/-
pairs['DS059']='DS060'
for i in range(60,62):
    pairs['DS'+str(i).zfill(3)]='DS059'

#KU70
pairs['DS062']='DS033'

#???
pairs['DS063']='DS064'
for i in range(64,66):
    pairs['DS'+str(i).zfill(3)]='DS063'
    
#???
pairs['DS066']='DS067'
for i in range(67,69):
    pairs['DS'+str(i).zfill(3)]='DS066'
    
#
pairs['DS069']='DS070'
pairs['DS070']='DS069'

#
pairs['DS071']='DS072'
pairs['DS072']='DS071'

#
pairs['DS073']='DS074'
pairs['DS074']='DS073'

#############################
#R4
#WT
for i in [73,74]+range(81,98):
    pairs['DS0'+str(i)]='DS041'

#PCNA164R spartan
pairs['DS098']='DS099'
for i in ['099','100']:
    pairs['DS'+i]='DS098'
    
#BRCA1 homo del
for i in ['101','102']:
    pairs['DS'+i]='DS014'
    
#BRCA1 hetero del
for i in xrange(104,108):
    pairs['DS'+str(i)]='DS026'
    
#BRCA2 homo del
pairs['DS108']='DS109'
for i in xrange(109,118):
    pairs['DS'+str(i)]='DS108'
    
#BRCA2 hetero del?
pairs['DS118']='DS119'
for i in xrange(119,127):
    pairs['DS'+str(i)]='DS118'
    
#PCNA164R
pairs['DS127']='DS066'

#Ku70
pairs['DS128']='DS129'
for i in xrange(129,135):
    pairs['DS'+str(i)]='DS128'

#spartan
pairs['DS135']='DS136'
for i in xrange(136,138):
    pairs['DS'+str(i)]='DS135'
    
############################
#R5

#WT
pairs['DS141']='DS142'
for i in xrange(142,145):
    pairs['DS'+str(i)]='DS141'
    
#DB1
pairs['DS145']='DS146'
for i in xrange(146,149):
    pairs['DS'+str(i)]='DS145'

Starting gfServer on a jimgray85 machine in slurm

In [5]:
%%writefile startblat.py
#!/usr/bin/python

import subprocess
#gfserver
gfserver_hostname,gfserver_port=" localhost "," 50000 "
gfServer="gfServer"

#reference genome
ref_fa="/home/ribli/input/index/gallus/Gallus_gallus.Galgal4.74.dna.toplevel.fa"
ref_2bit="/home/ribli/input/index/gallus/Gallus_gallus.Galgal4.74.dna.toplevel.2bit"

#starting gfserver for blasting
cmd0=gfServer + " start "+gfserver_hostname+gfserver_port
cmd0+=" -stepSize=5 -canStop -log=blatServer.log " + ref_2bit + " &"
subprocess.check_output(cmd0,shell=True)
Overwriting startblat.py
In [6]:
try:
    subprocess.check_output(['sbatch','-C','jimgray85','--mem','1500','startblat.py' ])
except subprocess.CalledProcessError,e:
    print e

Run CREST

  • parallel on samples, and chromosomes too
In [7]:
%%writefile dt40_chr_CREST.py
#!/usr/bin/python
import os
import subprocess
import sys

#tmpdir for CREST, because it leaves garbage
os.environ['TMPDIR']='/nagyvinyok/adat83/sotejedlik/ribli/temp/'

tumbam=sys.argv[1]
germbam=sys.argv[2]
chromos=sys.argv[3]

#gfServer
gfserver_hostname,gfserver_port=" localhost "," 50000 "

#exec path
CREST="/nagyvinyok/adat88/kozos/sotejedlik/usr/CREST/CREST.pl"
bam2html="/nagyvinyok/adat88/kozos/sotejedlik/usr/CREST/bam2html.pl"
extractSClip="/nagyvinyok/adat88/kozos/sotejedlik/usr/CREST/extractSClip.pl"
cap3="/nagyvinyok/adat88/kozos/sotejedlik/usr/CAP3/cap3"

#reference
ref_fa="/home/ribli/input/index/gallus/Gallus_gallus.Galgal4.74.dna.toplevel.fa"
ref_2bit="/home/ribli/input/index/gallus/Gallus_gallus.Galgal4.74.dna.toplevel.2bit"

#directory
output_dir="/home/ribli/DT40/SV/CREST/R_1-5/results/raw/R_1235"

#run extractsclip, it finds soft clipping positions in bams
cmd_extr=extractSClip + " -i "+tumbam+" --ref_genome "+ref_fa
cmd_extr+=" -o "+output_dir + " -r "+chromos
print
print cmd_extr
subprocess.check_output(cmd_extr,shell=True)

#run CREST 
cmd_CREST= CREST + " -f "+output_dir+"/"+os.path.basename(tumbam)+"."+chromos+".cover "
cmd_CREST+=" -d "+tumbam+" -g " + germbam + " --ref_genome "+ref_fa+" -t "+ref_2bit 
cmd_CREST+="  -o "+output_dir+" --blatserver "+gfserver_hostname
cmd_CREST+=" --blatport "+gfserver_port+" --cap3 " + cap3 + " -r "+chromos 
cmd_CREST+=" -p " + os.path.basename(tumbam)+"."+chromos
print
print cmd_CREST
subprocess.check_output(cmd_CREST,shell=True)
Writing dt40_chr_CREST.py
In [ ]:
#submit jobs

#rename the .bam.bais
bampos='/nagyvinyok/adat83/sotejedlik/ribli/dt40/bam_links_2/'
machine='jimgray85'
chromosomes=map(str,range(1,29)+[32,'W','Z'])

for tum_sample,germ_sample in pairs.iteritems():
    
    tum_bam= bampos + tum_sample + '_RMdup_picard_realign.bam'
    germ_bam= bampos + germ_sample + '_RMdup_picard_realign.bam'

    for chromosome in chromosomes:  
        cmd_CREST= ['sbatch','-C',machine,'--mem','2000',
                    'dt40_chr_CREST.py',tum_bam,germ_bam,chromosome]
        try:
            print subprocess.check_output(cmd_CREST,stderr=subprocess.STDOUT),
        except subprocess.CalledProcessError,e:
            print e