Warning!!! SKipping DS26,DS35

In [1]:
%%writefile indel_list_temp.py
#!/usr/bin/python

import os
import sys
import subprocess
import fnmatch
import re
import numpy as np

#flags
flags=" -Q30 -B "

#neighborhood data read from cmdline
chrom=sys.argv[1]
posfrom=sys.argv[2]
posto='-'+sys.argv[3]

#ouput files
#ouput files
output_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/indel_list/'
#subprocess.call(['mkdir',output_dir])
output_fname= str(chrom) + '-' + str(posfrom) +  str(posto) + '.pup'

#tools
bwa="/nagyvinyok/adat87/home/ribli/bwa-0.7.10/bwa"
samtools="/nagyvinyok/adat87/home/orsi/DT40/tools/samtools-1.2/samtools"
ref_fa="/nagyvinyok/adat87/home/ribli/input/index/gallus/Gallus_gallus.Galgal4.74.dna.toplevel.fa"

#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/dt40/rmdup/'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
    if (fnmatch.fnmatch(fname, '*.bam') and 
        not fnmatch.fnmatch(fname,"*.bam.bai") and 
        (not fnmatch.fnmatch(fname,"DS26*")) and
        (not fnmatch.fnmatch(fname,"DS35*"))):
        fnames.append(fname)
fnames=sorted(fnames)

#create pileup command
cmd_mpileup = samtools + ' mpileup ' + flags + ' -f ' + ref_fa
cmd_mpileup += ' -r ' + str(chrom) +':'+ str(posfrom) +  str(posto) + ' '
cmd_mpileup += ' '.join([rm_dup_dir+x for x in fnames]) + ' | '

#filtering:
# throw away quality
cmd_q_filt= ' awk \' BEGIN{FS=\"\t\"} { print $1,$2,$3'
for i in xrange(len(fnames)):
        cmd_q_filt+=',$'+str(4+i*3)+',$'+str(5+i*3)
cmd_q_filt+='} \' | '

#keep lines with indels
cmd_del_ins_filt= " grep [\+\-] | "

#min cov filter (>0)
cmd_cov_filt= ' awk \' BEGIN{FS=\" \"} '
cmd_cov_filt += ' && '.join([ '$'+str(4+i*2)+'>0' for i in xrange(len(fnames))])
cmd_cov_filt += ' {print} \' | '

# delete first base quality (it can be mistaken for a base)
cmd_first_base_filt= " sed  's/\^.//g' "

###########################################
#create full command and execute
cmd_full= cmd_mpileup + cmd_q_filt + cmd_del_ins_filt
cmd_full += cmd_cov_filt + cmd_first_base_filt

#run the pipeline
output_f=open(output_dir + output_fname,'w')
from subprocess import Popen, PIPE
p = Popen(cmd_full, stdout=PIPE, bufsize=1,executable='/bin/bash',shell=True)
with p.stdout:
    for line in iter(p.stdout.readline, b''):
    #parse line
    #no strip needed?
        linelist=line.upper().split(' ')
        covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
        bases=linelist[4::2]

        #count ins & del
        insertion=[]
        deletion=[]
        for i in xrange(len(bases)):
            insertion.append(len(re.findall('[\+]',bases[i])))
            deletion.append(len(re.findall('[\-]',bases[i])))
        insertion=np.array(insertion,dtype=np.double)
        deletion=np.array(deletion,dtype=np.double)

        #make freq from count 
        #refFreq,  0/0 = 0 not sure if thats good...
        inFreq=np.zeros(len(insertion),dtype=np.double)
        delFreq=np.zeros(len(deletion),dtype=np.double)
        inFreq[covs !=0]=insertion[covs !=0]/covs[covs != 0]
        delFreq[covs !=0]=deletion[covs !=0]/covs[covs != 0]
        inFreq_id=np.argsort(inFreq)
        delFreq_id=np.argsort(delFreq)

        #suspicious filter
        if ( (inFreq[inFreq_id[-1]] <  0.1)
            and (delFreq[delFreq_id[-1]] <  0.1) ) :
            continue

        #print pileup and Orsi's format too, 
        output_f.write(line)
        
output_f.close()
Writing indel_list_temp.py
In [3]:
BLOCKNO=100

from Bio import SeqIO
from Bio.Seq import Seq

#genome
refFasta="/home/ribli/input/index/gallus/Gallus_gallus.Galgal4.74.dna.toplevel.fa"

#genome lengths
chromDict=dict()
fullLeng=0

#parsing fasta file
for seqin in SeqIO.parse(refFasta,"fasta"):
	# scaffolds and mitochondrium not used
        if (len(seqin.id) < 3 and seqin.id!='MT' ):
            #save lengths
            chromDict[seqin.id]=len(seqin.seq)
            fullLeng+=len(seqin.seq)

#set maximum block size
BLOCKSIZE=(fullLeng/BLOCKNO)

#calculate blocks
blocks=[]
#loop pover chroms
for chrom in chromDict.keys():
	pointer=0
	#until chrom is chopped into pieces
	while (pointer < chromDict[chrom]):
		blockSize=min(BLOCKSIZE,chromDict[chrom]-pointer)
	        blocks.append([chrom,pointer,pointer+blockSize])
	        pointer += blockSize   
In [7]:
import subprocess
for block in blocks:
    try:
        print subprocess.check_output(
            [ 'sbatch', '--mem',str(1000),'-C','jimgray83' , './indel_list_temp.py' ,
             block[0],str(block[1]),str(block[2])],stderr=subprocess.STDOUT),
    except subprocess.CalledProcessError, e:
        print e.output,
Submitted batch job 2207
Submitted batch job 2208
Submitted batch job 2209
Submitted batch job 2210
Submitted batch job 2211
Submitted batch job 2212
Submitted batch job 2213
Submitted batch job 2214
Submitted batch job 2215
Submitted batch job 2216
Submitted batch job 2217
Submitted batch job 2218
Submitted batch job 2219
Submitted batch job 2220
Submitted batch job 2221
Submitted batch job 2222
Submitted batch job 2223
Submitted batch job 2224
Submitted batch job 2225
Submitted batch job 2226
Submitted batch job 2227
Submitted batch job 2228
Submitted batch job 2229
Submitted batch job 2230
Submitted batch job 2231
Submitted batch job 2232
Submitted batch job 2233
Submitted batch job 2234
Submitted batch job 2235
Submitted batch job 2236
Submitted batch job 2237
Submitted batch job 2238
Submitted batch job 2239
Submitted batch job 2240
Submitted batch job 2241
Submitted batch job 2242
Submitted batch job 2243
Submitted batch job 2244
Submitted batch job 2245
Submitted batch job 2246
Submitted batch job 2247
Submitted batch job 2248
Submitted batch job 2249
Submitted batch job 2250
Submitted batch job 2251
Submitted batch job 2252
Submitted batch job 2253
Submitted batch job 2254
Submitted batch job 2255
Submitted batch job 2256
Submitted batch job 2257
Submitted batch job 2258
Submitted batch job 2259
Submitted batch job 2260
Submitted batch job 2261
Submitted batch job 2262
Submitted batch job 2263
Submitted batch job 2264
Submitted batch job 2265
Submitted batch job 2266
Submitted batch job 2267
Submitted batch job 2268
Submitted batch job 2269
Submitted batch job 2270
Submitted batch job 2271
Submitted batch job 2272
Submitted batch job 2273
Submitted batch job 2274
Submitted batch job 2275
Submitted batch job 2276
Submitted batch job 2277
Submitted batch job 2278
Submitted batch job 2279
Submitted batch job 2280
Submitted batch job 2281
Submitted batch job 2282
Submitted batch job 2283
Submitted batch job 2284
Submitted batch job 2285
Submitted batch job 2286
Submitted batch job 2287
Submitted batch job 2288
Submitted batch job 2289
Submitted batch job 2290
Submitted batch job 2291
Submitted batch job 2292
Submitted batch job 2293
Submitted batch job 2294
Submitted batch job 2295
Submitted batch job 2296
Submitted batch job 2297
Submitted batch job 2298
Submitted batch job 2299
Submitted batch job 2300
Submitted batch job 2301
Submitted batch job 2302
Submitted batch job 2303
Submitted batch job 2304
Submitted batch job 2305
Submitted batch job 2306
Submitted batch job 2307
Submitted batch job 2308
Submitted batch job 2309
Submitted batch job 2310
Submitted batch job 2311
Submitted batch job 2312
Submitted batch job 2313
Submitted batch job 2314
Submitted batch job 2315
Submitted batch job 2316
Submitted batch job 2317
Submitted batch job 2318
Submitted batch job 2319
Submitted batch job 2320
Submitted batch job 2321
In [ ]: