%%writefile indel_list_temp.py
#!/usr/bin/python
import os
import sys
import subprocess
import fnmatch
import re
import numpy as np
#flags
flags=" -Q30 -B "
#neighborhood data read from cmdline
chrom=sys.argv[1]
posfrom=sys.argv[2]
posto='-'+sys.argv[3]
#ouput files
#ouput files
output_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/indel/indel_list/'
#subprocess.call(['mkdir',output_dir])
output_fname= str(chrom) + '-' + str(posfrom) + str(posto) + '.pup'
#tools
bwa="/nagyvinyok/adat87/home/ribli/bwa-0.7.10/bwa"
samtools="/nagyvinyok/adat87/home/orsi/DT40/tools/samtools-1.2/samtools"
ref_fa="/nagyvinyok/adat87/home/ribli/input/index/gallus/Gallus_gallus.Galgal4.74.dna.toplevel.fa"
#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/dt40/rmdup/'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
if (fnmatch.fnmatch(fname, '*.bam') and
not fnmatch.fnmatch(fname,"*.bam.bai") and
(not fnmatch.fnmatch(fname,"DS26*")) and
(not fnmatch.fnmatch(fname,"DS35*"))):
fnames.append(fname)
fnames=sorted(fnames)
#create pileup command
cmd_mpileup = samtools + ' mpileup ' + flags + ' -f ' + ref_fa
cmd_mpileup += ' -r ' + str(chrom) +':'+ str(posfrom) + str(posto) + ' '
cmd_mpileup += ' '.join([rm_dup_dir+x for x in fnames]) + ' | '
#filtering:
# throw away quality
cmd_q_filt= ' awk \' BEGIN{FS=\"\t\"} { print $1,$2,$3'
for i in xrange(len(fnames)):
cmd_q_filt+=',$'+str(4+i*3)+',$'+str(5+i*3)
cmd_q_filt+='} \' | '
#keep lines with indels
cmd_del_ins_filt= " grep [\+\-] | "
#min cov filter (>0)
cmd_cov_filt= ' awk \' BEGIN{FS=\" \"} '
cmd_cov_filt += ' && '.join([ '$'+str(4+i*2)+'>0' for i in xrange(len(fnames))])
cmd_cov_filt += ' {print} \' | '
# delete first base quality (it can be mistaken for a base)
cmd_first_base_filt= " sed 's/\^.//g' "
###########################################
#create full command and execute
cmd_full= cmd_mpileup + cmd_q_filt + cmd_del_ins_filt
cmd_full += cmd_cov_filt + cmd_first_base_filt
#run the pipeline
output_f=open(output_dir + output_fname,'w')
from subprocess import Popen, PIPE
p = Popen(cmd_full, stdout=PIPE, bufsize=1,executable='/bin/bash',shell=True)
with p.stdout:
for line in iter(p.stdout.readline, b''):
#parse line
#no strip needed?
linelist=line.upper().split(' ')
covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
bases=linelist[4::2]
#count ins & del
insertion=[]
deletion=[]
for i in xrange(len(bases)):
insertion.append(len(re.findall('[\+]',bases[i])))
deletion.append(len(re.findall('[\-]',bases[i])))
insertion=np.array(insertion,dtype=np.double)
deletion=np.array(deletion,dtype=np.double)
#make freq from count
#refFreq, 0/0 = 0 not sure if thats good...
inFreq=np.zeros(len(insertion),dtype=np.double)
delFreq=np.zeros(len(deletion),dtype=np.double)
inFreq[covs !=0]=insertion[covs !=0]/covs[covs != 0]
delFreq[covs !=0]=deletion[covs !=0]/covs[covs != 0]
inFreq_id=np.argsort(inFreq)
delFreq_id=np.argsort(delFreq)
#suspicious filter
if ( (inFreq[inFreq_id[-1]] < 0.1)
and (delFreq[delFreq_id[-1]] < 0.1) ) :
continue
#print pileup and Orsi's format too,
output_f.write(line)
output_f.close()