%%writefile snp_list_temp.py
#!/usr/bin/python
#import modules
import os
import sys
import subprocess
import fnmatch
import re
import numpy as np
#flags
flags=" -Q30 -B "
#neighborhood data read from cmdline
chrom=sys.argv[1]
posfrom=sys.argv[2]
posto='-'+sys.argv[3]
#ouput files
output_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/snp_list/'
#subprocess.call(['mkdir',output_dir])
output_fname= str(chrom) + '-' + str(posfrom) + str(posto) + '.pup'
#tools
bwa="/nagyvinyok/adat87/home/ribli/bwa-0.7.10/bwa"
samtools="/nagyvinyok/adat87/home/ribli/tools/samtools-0.1.19/samtools"
ref_fa="/nagyvinyok/adat87/home/ribli/input/index/gallus/Gallus_gallus.Galgal4.74.dna.toplevel.fa"
#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/dt40/rmdup/'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
if (fnmatch.fnmatch(fname, '*.bam') and
not fnmatch.fnmatch(fname,"*.bam.bai") and
(not fnmatch.fnmatch(fname,"DS26*")) and
(not fnmatch.fnmatch(fname,"DS35*"))):
fnames.append(fname)
fnames=sorted(fnames)
#create pileup command
cmd_mpileup = samtools + ' mpileup ' + flags + ' -f ' + ref_fa
cmd_mpileup += ' -r ' + str(chrom) +':'+ str(posfrom) + str(posto) + ' '
cmd_mpileup += ' '.join([rm_dup_dir+x for x in fnames]) + ' | '
#filtering:
# throw away quality
cmd_q_filt= ' awk \' BEGIN{FS=\"\t\"} { print $1,$2,$3'
for i in xrange(len(fnames)):
cmd_q_filt+=',$'+str(4+i*3)+',$'+str(5+i*3)
cmd_q_filt+='} \' | '
#delete lines with ins, or del
#their bases are not at the position
# or they are not clean
cmd_del_ins_filt= " grep -v [\*\+\-] | "
#min cov filter (>0)
cmd_cov_filt= ' awk \' BEGIN{FS=\" \"} '
cmd_cov_filt += ' && '.join([ '$'+str(4+i*2)+'>0' for i in xrange(len(fnames))])
cmd_cov_filt += ' {print} \' | '
# delete first base quality (it can be mistaken for a base)
cmd_first_base_filt= " sed 's/\^.//g' "
###########################################
#create full command and execute
cmd_full= cmd_mpileup + cmd_q_filt + cmd_del_ins_filt
cmd_full += cmd_cov_filt + cmd_first_base_filt
#run the pipeline
output_f=open(output_dir + output_fname,'w')
from subprocess import Popen, PIPE
p = Popen(cmd_full, stdout=PIPE, bufsize=1,executable='/bin/bash',shell=True)
with p.stdout:
for line in iter(p.stdout.readline, b''):
#parse line
#no strip needed?
linelist=line.upper().split(' ')
covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
bases=linelist[4::2]
#count ref
ref_count=[]
for i in xrange(len(bases)):
ref_count.append(len(re.findall('[\.\,]',bases[i])))
ref_count=np.array(ref_count,dtype=np.double)
#make freq from count
ref_freq=np.zeros(len(ref_count),dtype=np.double)
ref_freq[covs !=0]=ref_count[covs !=0]/covs[covs != 0]
ref_freq_id=np.argsort(ref_freq)
#suspicious filter
if(ref_freq[ref_freq_id[0]] >= 0.9):
continue
#create Orsi list format
#for interesting positions
# count all bases
format_orsi='# '+ ' '.join(linelist[0:3])
for i in xrange(len(bases)):
tempbase=re.sub('[\,\.]',linelist[2],bases[i])
format_orsi+=' '+fnames[i].split('.')[0]+' '+str(covs[i])
format_orsi+=' '+str(len(re.findall('A',tempbase)))
format_orsi+=' '+str(len(re.findall('C',tempbase)))
format_orsi+=' '+str(len(re.findall('G',tempbase)))
format_orsi+=' '+str(len(re.findall('T',tempbase)))
#print pileup and Orsi's format too,
output_f.write(line)
output_f.write(format_orsi+'\n')
output_f.close()
Set approxiamte number of paralellized blocks
BLOCKNO=100
from Bio import SeqIO
from Bio.Seq import Seq
#genome
refFasta="/home/ribli/input/index/gallus/Gallus_gallus.Galgal4.74.dna.toplevel.fa"
#genome lengths
chromDict=dict()
fullLeng=0
#parsing fasta file
for seqin in SeqIO.parse(refFasta,"fasta"):
# scaffolds and mitochondrium not used
if (len(seqin.id) < 3 and seqin.id!='MT' ):
#save lengths
chromDict[seqin.id]=len(seqin.seq)
fullLeng+=len(seqin.seq)
#set maximum block size
BLOCKSIZE=(fullLeng/BLOCKNO)
#calculate blocks
blocks=[]
#loop pover chroms
for chrom in chromDict.keys():
pointer=0
#until chrom is chopped into pieces
while (pointer < chromDict[chrom]):
blockSize=min(BLOCKSIZE,chromDict[chrom]-pointer)
blocks.append([chrom,pointer,pointer+blockSize])
pointer += blockSize
import subprocess
for block in blocks:
try:
print subprocess.check_output(
[ 'sbatch', '--mem',str(1000),'-C','jimgray83' , './snp_list_temp.py' ,
block[0],str(block[1]),str(block[2])],stderr=subprocess.STDOUT),
except subprocess.CalledProcessError, e:
print e.output,