In [1]:
# Import the pandasvcf package from ../src/ and configure inline plotting
%matplotlib inline
%pylab inline
import sys
sys.path.append( '../src/' )
from pandasvcf import *
%config InlineBackend.figure_format = 'retina'
pd.options.mode.chained_assignment = None # suppress pandas chained-assignment warnings
In [2]:
# Path (relative to this notebook) of the gzip-compressed VCF file read by the cells below.
vcf_path = '../test_data/ALL.chr22.phase3_shapeit2_mvncall_integrated_v4.20130502.genotypes_10k.vcf.gz'
In [3]:
# Create the reader object used by the single-chunk cells below.
# The open parenthesis already continues the statement, so no backslash is needed.
vcf_chunk = VCF(
    vcf_path,
    sample_id='all',
    cols=['#CHROM', 'POS', 'REF', 'ALT', 'FORMAT', 'INFO', 'FILTER'],
    chunksize=1000,
    n_cores=20
)
In [4]:
# Read one chunk of the VCF (chunksize was set when vcf_chunk was created) and time the call;
# the result is inspected through vcf_chunk.df in the following cells.
%time vcf_chunk.get_vcf_df_chunk()
Out[4]:
In [5]:
# Summarise the chunk dataframe, then report its total cell count (rows x columns).
# print is called with a single parenthesised string so the cell produces the same
# text under Python 2 and Python 3 (the statement form is a SyntaxError on Python 3).
vcf_chunk.df.info()
print('')
print('%d Genotypes read' % vcf_chunk.df.size)
In [6]:
# Preview the first rows of the chunk dataframe.
vcf_chunk.df.head()
Out[6]:
In [7]:
# Inspect the reader's stopIteration flag; get_whole_file (defined later in this
# notebook) leaves its read loop once this flag is True.
vcf_chunk.stopIteration
Out[7]:
In [8]:
# Add variant annotations to the current chunk with inplace=True and time the call.
# split_columns is not passed here; the trailing comment keeps an example value.
%time vcf_chunk.add_variant_annotations(inplace=True) # example, unused: split_columns={'AD':2, 'HQ':2}
Out[8]:
In [9]:
# Re-inspect the dataframe summary after add_variant_annotations(inplace=True).
vcf_chunk.df.info()
In [10]:
# Unstack the dataframe by sample - QUITE SPARSE DUE TO RARE VARIANTS.
# The appended index level is addressed by its name instead of position 4, so the
# cell pivots on the sample level regardless of how many index levels the frame
# already has.
(
    vcf_chunk.df
    .set_index('sample_ids', append=True)
    .unstack(level='sample_ids')
    .tail()
)
Out[10]:
In [11]:
def get_whole_file(vcf_path, sample_ids='all', columns=['#CHROM', 'POS', 'REF', 'ALT', 'FORMAT'],
                   add_variant_annotations=True, split_columns='', chunksize=5000, inplace=True, n_cores=1):
    '''
    Parse a whole multi-sample vcf file chunk by chunk and return
    a single dataframe built from all chunks.

    Parameters
    ----------
    vcf_path : path of the vcf file, passed to VCF().
    sample_ids : forwarded to VCF() as sample_id.
    columns : forwarded to VCF() as cols (this function does not modify it).
    add_variant_annotations : when True, add_variant_annotations() is called
        on every chunk before the chunk is collected.
    split_columns : forwarded to add_variant_annotations().
    chunksize : forwarded to VCF().
    inplace : forwarded to add_variant_annotations(); when True the annotated
        chunk is taken from the reader's .df, otherwise from its .df_annot.
    n_cores : forwarded to VCF().

    Returns
    -------
    pandas.DataFrame
        pd.concat() of every collected chunk, or an empty DataFrame when
        the reader signals the end of the file before any chunk is collected.

    Note using multiple cores with add_variant_annotations will be
    very memory intensive as the parsed dataframe is copied to each process.

    NOTE(review): the reader class is referenced as VCF, the same name the
    earlier cell of this notebook instantiates (this function previously
    spelled it Vcf) - confirm the name exported by pandasvcf.
    '''
    vcf_reader = VCF(vcf_path, sample_id=sample_ids, cols=columns,
                     chunksize=chunksize, n_cores=n_cores)  # initiate reader object
    chunks = []  # one dataframe per parsed chunk

    while True:
        vcf_reader.get_vcf_df_chunk()  # retrieving df chunk
        # NOTE(review): assumes the flag is only set when no new chunk was read,
        # otherwise the final chunk would be dropped here - confirm against the reader.
        if vcf_reader.stopIteration:
            break  # end of file

        if add_variant_annotations:
            # parsing df and adding annotations
            vcf_reader.add_variant_annotations(split_columns=split_columns, inplace=inplace)
            chunks.append(vcf_reader.df if inplace else vcf_reader.df_annot)
        else:
            # Collect the raw chunk in the aggregation list (the reader object
            # itself was the append target before, which never filled the list).
            chunks.append(vcf_reader.df)

    if not chunks:
        # pd.concat() raises ValueError on an empty list; return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(chunks)
In [12]:
# Parse the entire file into one dataframe (master_df) and time the call.
# add_variant_annotations is left at its default (True) while n_cores=20, so the
# memory caveat in get_whole_file's docstring applies to this cell.
%time master_df = get_whole_file(vcf_path, sample_ids='all', \
columns=['#CHROM', 'POS', 'REF', 'ALT','FORMAT', 'INFO'], \
chunksize=5000, n_cores=20)
In [13]:
# Bar chart, on a log-scaled axis, of the number of rows per zygosity value.
master_df['zygosity'].value_counts().plot(
    kind='bar', log=True, grid=True, color='seagreen'
)
Out[13]:
In [14]:
# Bar chart, on a log-scaled axis, of the number of rows per vartype2 value.
master_df['vartype2'].value_counts().plot(
    kind='bar', log=True, grid=True, color='seagreen'
)
Out[14]:
In [15]:
# Same vartype2 counts as the chart above, shown as a table.
master_df['vartype2'].value_counts()
Out[15]:
In [16]:
# Number of rows in the combined dataframe.
master_df.shape[0]
Out[16]:
In [17]:
# Show the first 20 rows of the combined dataframe.
master_df.head(20)
Out[17]:
In [18]:
# Print the DataFrame.info() summary of the combined dataframe.
master_df.info()