Warning!!! skipping DS26,DS35

Creating filtered mpileup, and list from bams


Filters:

  • Q 30 base quality filter
  • all samples must be covered at the position (cov>0)
  • no deletion or insertions at the position
  • minimum reference nucleotide frequency < 0.9 (if all of them are above 0.9 there is no reliable calable mutation)

formats:

  • modified mpileup format: no base qualities kept
  • extra lines added in Orsi's format (which is:'# chr pos ref samplename cov acount ccount gcount tcount samplename cov acount ccount ... ')

Notes:

  • -B causes no local realignement in samtools
    • should try to asses the effect
  • The python script might be the bottleneck! if this is true, it can be augmented with a C scipt i have almost ready.

Script for creating pileup and filtering pileup lines


Paralellization scheme

  • genome is cut into pieces, they run in parallel (etc chr1:0-1000000 )

Pileup generation

  • samtools mpileup on all samples

Filtering

  • intially with grep, awk, sed
  • later more complicated things in python

In [1]:
%%writefile snp_list_temp.py
#!/usr/bin/python

#import modules
import os
import sys
import subprocess
import fnmatch
import re
import numpy as np

#flags
flags=" -Q30 -B "

#neighborhood data read from cmdline
chrom=sys.argv[1]
posfrom=sys.argv[2]
posto='-'+sys.argv[3]

#ouput files
output_dir='/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/snp_list/'
#subprocess.call(['mkdir',output_dir])
output_fname= str(chrom) + '-' + str(posfrom) +  str(posto) + '.pup'

#tools
bwa="/nagyvinyok/adat87/home/ribli/bwa-0.7.10/bwa"
samtools="/nagyvinyok/adat87/home/ribli/tools/samtools-0.1.19/samtools"
ref_fa="/nagyvinyok/adat87/home/ribli/input/index/gallus/Gallus_gallus.Galgal4.74.dna.toplevel.fa"

#filenames for samplenames
rm_dup_dir='/nagyvinyok/adat83/sotejedlik/dt40/rmdup/'
#collect filenames
fnames=[]
for fname in os.listdir(rm_dup_dir):
    if (fnmatch.fnmatch(fname, '*.bam') and 
        not fnmatch.fnmatch(fname,"*.bam.bai") and 
        (not fnmatch.fnmatch(fname,"DS26*")) and
        (not fnmatch.fnmatch(fname,"DS35*"))):
        fnames.append(fname)
fnames=sorted(fnames)

#create pileup command
cmd_mpileup = samtools + ' mpileup ' + flags + ' -f ' + ref_fa
cmd_mpileup += ' -r ' + str(chrom) +':'+ str(posfrom) +  str(posto) + ' ' 
cmd_mpileup += ' '.join([rm_dup_dir+x for x in fnames]) + ' | '

#filtering:
# throw away quality
cmd_q_filt= ' awk \' BEGIN{FS=\"\t\"} { print $1,$2,$3'
for i in xrange(len(fnames)):
    cmd_q_filt+=',$'+str(4+i*3)+',$'+str(5+i*3)
cmd_q_filt+='} \' | '

#delete lines with ins, or del 
#their bases are not at the position 
# or they are not clean
cmd_del_ins_filt= " grep -v  [\*\+\-] | "

#min cov filter (>0)
cmd_cov_filt= ' awk \' BEGIN{FS=\" \"} ' 
cmd_cov_filt += ' && '.join([ '$'+str(4+i*2)+'>0' for i in xrange(len(fnames))])
cmd_cov_filt += ' {print} \' | '

# delete first base quality (it can be mistaken for a base)
cmd_first_base_filt= " sed  's/\^.//g' " 

###########################################
#create full command and execute
cmd_full= cmd_mpileup + cmd_q_filt + cmd_del_ins_filt
cmd_full += cmd_cov_filt + cmd_first_base_filt 

#run the pipeline
output_f=open(output_dir + output_fname,'w')
from subprocess import Popen, PIPE
p = Popen(cmd_full, stdout=PIPE, bufsize=1,executable='/bin/bash',shell=True)
with p.stdout:
    for line in iter(p.stdout.readline, b''):
        #parse line
        #no strip needed?
        linelist=line.upper().split(' ')
        covs=np.array(map(int,linelist[3::2]),dtype=np.int32)
        bases=linelist[4::2]

        #count ref
        ref_count=[]
        for i in xrange(len(bases)):
            ref_count.append(len(re.findall('[\.\,]',bases[i]))) 
        ref_count=np.array(ref_count,dtype=np.double)

        #make freq from count 
        ref_freq=np.zeros(len(ref_count),dtype=np.double)
        ref_freq[covs !=0]=ref_count[covs !=0]/covs[covs != 0]
        ref_freq_id=np.argsort(ref_freq)

        #suspicious filter
        if(ref_freq[ref_freq_id[0]] >=  0.9):
            continue

        #create Orsi list format
        #for interesting positions
        # count all bases
        format_orsi='# '+ ' '.join(linelist[0:3])
        for i in xrange(len(bases)):
            tempbase=re.sub('[\,\.]',linelist[2],bases[i])
            format_orsi+=' '+fnames[i].split('.')[0]+' '+str(covs[i])
            format_orsi+=' '+str(len(re.findall('A',tempbase))) 
            format_orsi+=' '+str(len(re.findall('C',tempbase))) 
            format_orsi+=' '+str(len(re.findall('G',tempbase))) 
            format_orsi+=' '+str(len(re.findall('T',tempbase))) 

        #print pileup and Orsi's format too, 
        output_f.write(line)
        output_f.write(format_orsi+'\n')
output_f.close()
Writing snp_list_temp.py

Calculate the blocks of parallelization

Set approxiamte number of paralellized blocks

  • there will be more blocks
In [2]:
BLOCKNO=100
In [3]:
from Bio import SeqIO
from Bio.Seq import Seq

#genome
refFasta="/home/ribli/input/index/gallus/Gallus_gallus.Galgal4.74.dna.toplevel.fa"

#genome lengths
chromDict=dict()
fullLeng=0

#parsing fasta file
for seqin in SeqIO.parse(refFasta,"fasta"):
	# scaffolds and mitochondrium not used
        if (len(seqin.id) < 3 and seqin.id!='MT' ):
            #save lengths
            chromDict[seqin.id]=len(seqin.seq)
            fullLeng+=len(seqin.seq)

#set maximum block size
BLOCKSIZE=(fullLeng/BLOCKNO)

#calculate blocks
blocks=[]
#loop pover chroms
for chrom in chromDict.keys():
	pointer=0
	#until chrom is chopped into pieces
	while (pointer < chromDict[chrom]):
		blockSize=min(BLOCKSIZE,chromDict[chrom]-pointer)
	        blocks.append([chrom,pointer,pointer+blockSize])
	        pointer += blockSize   

Run the jobs in slurm

In [4]:
import subprocess
for block in blocks:
    try:
        print subprocess.check_output(
            [ 'sbatch', '--mem',str(1000),'-C','jimgray83' , './snp_list_temp.py' ,
             block[0],str(block[1]),str(block[2])],stderr=subprocess.STDOUT),
    except subprocess.CalledProcessError, e:
        print e.output,
Submitted batch job 2437
Submitted batch job 2438
Submitted batch job 2439
Submitted batch job 2440
Submitted batch job 2441
Submitted batch job 2442
Submitted batch job 2443
Submitted batch job 2444
Submitted batch job 2445
Submitted batch job 2446
Submitted batch job 2447
Submitted batch job 2448
Submitted batch job 2449
Submitted batch job 2450
Submitted batch job 2451
Submitted batch job 2452
Submitted batch job 2453
Submitted batch job 2454
Submitted batch job 2455
Submitted batch job 2456
Submitted batch job 2457
Submitted batch job 2458
Submitted batch job 2459
Submitted batch job 2460
Submitted batch job 2461
Submitted batch job 2462
Submitted batch job 2463
Submitted batch job 2464
Submitted batch job 2465
Submitted batch job 2466
Submitted batch job 2467
Submitted batch job 2468
Submitted batch job 2469
Submitted batch job 2470
Submitted batch job 2471
Submitted batch job 2472
Submitted batch job 2473
Submitted batch job 2474
Submitted batch job 2475
Submitted batch job 2476
Submitted batch job 2477
Submitted batch job 2478
Submitted batch job 2479
Submitted batch job 2480
Submitted batch job 2481
Submitted batch job 2482
Submitted batch job 2483
Submitted batch job 2484
Submitted batch job 2485
Submitted batch job 2486
Submitted batch job 2487
Submitted batch job 2488
Submitted batch job 2489
Submitted batch job 2490
Submitted batch job 2491
Submitted batch job 2492
Submitted batch job 2493
Submitted batch job 2494
Submitted batch job 2495
Submitted batch job 2496
Submitted batch job 2497
Submitted batch job 2498
Submitted batch job 2499
Submitted batch job 2500
Submitted batch job 2501
Submitted batch job 2502
Submitted batch job 2503
Submitted batch job 2504
Submitted batch job 2505
Submitted batch job 2506
Submitted batch job 2507
Submitted batch job 2508
Submitted batch job 2509
Submitted batch job 2510
Submitted batch job 2511
Submitted batch job 2512
Submitted batch job 2513
Submitted batch job 2514
Submitted batch job 2515
Submitted batch job 2516
Submitted batch job 2517
Submitted batch job 2518
Submitted batch job 2519
Submitted batch job 2520
Submitted batch job 2521
Submitted batch job 2522
Submitted batch job 2523
Submitted batch job 2524
Submitted batch job 2525
Submitted batch job 2526
Submitted batch job 2527
Submitted batch job 2528
Submitted batch job 2529
Submitted batch job 2530
Submitted batch job 2531
Submitted batch job 2532
Submitted batch job 2533
Submitted batch job 2534
Submitted batch job 2535
Submitted batch job 2536
Submitted batch job 2537
Submitted batch job 2538
Submitted batch job 2539
Submitted batch job 2540
Submitted batch job 2541
Submitted batch job 2542
Submitted batch job 2543
Submitted batch job 2544
Submitted batch job 2545
Submitted batch job 2546
Submitted batch job 2547
Submitted batch job 2548
Submitted batch job 2549
Submitted batch job 2550
Submitted batch job 2551