Mutational spectra of samples with VarScan


In [1]:
#load modules

import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import gc
In [2]:
#go to working directory, where the data is

os.chdir("/nagyvinyok/adat83/sotejedlik/ribli/dt40/snp/varscan")
In [3]:
#define a spectrum counter/plotter function

def plot_spectrum(fname):
    temp_result=pd.read_csv(fname,sep='\t')
    temp_result=temp_result.convert_objects(convert_numeric=True)
    temp_result[['ref','var','normal_reads1','normal_reads2',
            'tumor_reads1','tumor_reads2',
           'somatic_status','variant_p_value','somatic_p_value']].iloc[:10]

    grouped = pd.DataFrame(temp_result[ (temp_result['somatic_status'] == 'Somatic') &
        (temp_result['somatic_p_value']<0.008)].groupby(['ref','var']).size()).reset_index()

    grouped.loc[0,0]+=grouped.loc[11,0] 
    grouped.loc[1,0]+=grouped.loc[10,0] 
    grouped.loc[2,0]+=grouped.loc[9,0] 
    grouped.loc[3,0]+=grouped.loc[8,0] 
    grouped.loc[4,0]+=grouped.loc[7,0] 
    grouped.loc[5,0]+=grouped.loc[6,0] 

    spectrum = grouped.loc[:5]
    #print spectrum

    fig,ax=plt.subplots()
    fig.set_size_inches(9,6)

    r, g, b = (23, 190, 207)
    c = (r / 255., g / 255., b / 255.)

    ax.bar(np.arange(6),spectrum[0],facecolor=c,edgecolor='white',label=fname[:-7])

    ax.grid(True,c=c,lw=1,linestyle='dotted')
    ax.set_frame_on(False)

    tics=ax.xaxis.set_ticks(np.arange(6)+0.4)
    ax.set_xlim(-0.2,6)
    labs=ax.set_xticklabels(['A>C','A>G','A>T','C>A','C>G','C>T',], rotation='horizontal')

    ax.legend(fancybox=True,loc='upper left')
    
    gc.collect() #this line solves memory issues
In [4]:
#loop over all files, and plot their spectra

for fname in os.listdir('./'):
    if (fname[-3:]!='snp' ):
        continue
    plot_spectrum(fname)
/home/ribli/.local/lib/python2.7/site-packages/pandas-0.14.1-py2.7-linux-x86_64.egg/pandas/io/parsers.py:1139: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)
/usr/lib/python2.7/dist-packages/matplotlib/pyplot.py:424: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

Notes

  • Theoretically this notebook only reads in one result file at once but garbage collection works quite randomly, so memory usage could go up to 70% before it frees old stuff,( normally around 10% is the size of one file in the memory ). Thats why I need to call the garbage collector by hand, It keeps memory usage at 10%.