In [1]:
import os
import sys
from variantannotation import annotate_batch
from variantannotation import myvariant_parsing_utils
from variantannotation import mongo_DB_export
from variantannotation import utilities



In [ ]:
filepath = ".../CSV to be tested"
csv_file = "test_file.csv"
vcf_file = "test_file_.vcf"
os.chdir(filepath)

ANNOVAR_PATH = '/database/annovar/'
IN_PATH = '.../file.vcf'
OUT_PATH = '.../annovar_results'

In [ ]:
#Run only if ANNOVAR is properly installed and its databases are available locally
utilities.run_annovar(ANNOVAR_PATH, IN_PATH, OUT_PATH)

METHOD 1: export data to MongoDB in chunks, iteratively.

This method is well suited for large files. Only 1000 documents are held in memory and processed at a time, instead of attempting to parse and process the entire CSV file at once.
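Conceptually, the chunked iteration boils down to something like this minimal sketch (iter_chunks is a hypothetical helper for illustration, not part of variantannotation):

def iter_chunks(items, chunksize=1000):
    #Yield successive slices of at most chunksize items, so only one
    #chunk has to be annotated and exported at a time
    for start in range(0, len(items), chunksize):
        yield items[start:start + chunksize]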

As soon as you run the scripts from variantannotation, the data will automatically be stored to the database. The database and collection names must be specified, and a MongoDB instance must be running. The script sets up a client (through pymongo) to communicate between Python and the database.

In general, the shell command:

mongod --dbpath ../data/db

(where data/db is the designated location for the stored data) will start MongoDB. After this, the script will store data there automatically.

For pymongo, and for more information on how to set up a MongoDB database, see: https://docs.mongodb.com/getting-started/python/
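
For reference, the client setup that the scripts perform internally amounts to roughly the following pymongo calls (a sketch assuming the default host and port; the package handles this for you):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)     #default host/port of a local mongod
db = client['My_Variant_Database']           #created lazily on first write
collection = db['ANNOVAR_MyVariant_chunks']  #same for the collection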


In [5]:
chunksize = 1000
step = 0

#Get variant list. Should always be the first step after running ANNOVAR
open_file = myvariant_parsing_utils.VariantParsing()
list_file = open_file.get_variants_from_vcf(vcf_file)

#Name Collection & DB
collection_name = 'ANNOVAR_MyVariant_chunks'
db_name = 'My_Variant_Database'

#Run process, and export (export happens every time 1000 variants are processed and joined)
as_batch = annotate_batch.AnnotationMethods()
as_batch.by_chunks(list_file, chunksize, step, csv_file, collection_name, db_name)


Processing knownGene info ...
Processing nci60 info ...
Processing tfbsConsSites info ...
Processing genomicSuperDups info ...
Processing cytoBand info ...
Creating hgvs key ...
Processing genotype call info ...
Transforming to JSON from dataFrame
cleaning up...
Done
querying 1-1000...done.
Joining lists ...
Parsing to MongoDB ...
Step: 1 of 22
...
Processing knownGene info ...
Processing nci60 info ...
Processing tfbsConsSites info ...
Processing genomicSuperDups info ...
Processing cytoBand info ...
Creating hgvs key ...
Processing genotype call info ...
Transforming to JSON from dataFrame
cleaning up...
Done
querying 1-252...done.
Joining lists ...
Parsing to MongoDB ...
Step: 22 of 22
Out[5]:
'Finished!'

METHOD 2: using the full file and holding it in memory

Works well for small files.


In [6]:
#Get variant list. Should always be the first step after running ANNOVAR
open_file = myvariant_parsing_utils.VariantParsing()
list_file = open_file.get_variants_from_vcf(vcf_file)

#Run process, data saved to joint_list
as_one_file = annotate_batch.AnnotationMethods()
joint_list = as_one_file.full_file(list_file, csv_file)

#Name Collection & DB
collection_name = 'ANNOVAR_MyVariant_full'
db_name = 'My_Variant_Database'

#Export, all at once
exporting_function = mongo_DB_export.export
exporting_function(joint_list, collection_name, db_name)


Processing knownGene info ...
Processing nci60 info ...
Processing tfbsConsSites info ...
Processing genomicSuperDups info ...
Processing cytoBand info ...
Creating hgvs key ...
Processing genotype call info ...
Transforming to JSON from dataFrame
cleaning up...
Done
querying 1-1000...done.
...
querying 21001-21252...done.
Joining lists ...
Finished!

METHOD 3: ignore ANNOVAR, get data solely from MyVariant

Easier to run since it doesn't require ANNOVAR. The results will, however, be incomplete (some variants will have no information).
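
To see which variants came back empty, you can inspect the results yourself: the myvariant client marks unmatched queries with a notfound key. A minimal sketch, where hgvs_list stands in for the list returned by get_variants_from_vcf:

import myvariant

mv = myvariant.MyVariantInfo()
results = mv.getvariants(hgvs_list)   #one dict per queried variant
#Unmatched variants come back as {'query': <hgvs_id>, 'notfound': True}
missing = [r['query'] for r in results if r.get('notfound')]
print('%d of %d variants had no annotation' % (len(missing), len(results)))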


In [7]:
#Get variant list from vcf file
open_file = myvariant_parsing_utils.VariantParsing()
list_file = open_file.get_variants_from_vcf(vcf_file)

#Run process
my_variants = annotate_batch.AnnotationMethods()
myvariant_data = my_variants.my_variant_at_once(list_file)

#Name Collection & DB
collection_name = 'My_Variant_Info_Collection_Full'
db_name = 'My_Variant_Database'

#Export
exporting_function = mongo_DB_export.export
exporting_function(myvariant_data, collection_name, db_name)


querying 1-1000...done.
...
querying 21001-21252...done.
Finished!

METHOD 4: ignore ANNOVAR, get data solely from MyVariant, in chunks

Easier to run since it doesn't require ANNOVAR. The results will, however, be incomplete (some variants will have no information). This method processes the data BY CHUNKS; the export function is built into the method myvariant_chunks.
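
Internally, the chunked querying amounts to a loop along these lines (a sketch only; hgvs_list and collection are stand-ins for the variant list and a pymongo collection set up as shown earlier):

import myvariant

mv = myvariant.MyVariantInfo()
chunksize = 1000
total = (len(hgvs_list) + chunksize - 1) // chunksize   #number of chunks, rounded up
for step, start in enumerate(range(0, len(hgvs_list), chunksize), 1):
    chunk = mv.getvariants(hgvs_list[start:start + chunksize])
    collection.insert_many(chunk)   #export each chunk as soon as it is retrieved
    print('Step: %d of %d' % (step, total))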


In [8]:
chunksize = 1000
step = 0

#Get variant list from vcf file
open_file = myvariant_parsing_utils.VariantParsing()
list_file = open_file.get_variants_from_vcf(vcf_file)

#Name Collection & DB
collection_name = 'My_Variant_Info_Collection_Chunks'
db_name = 'My_Variant_Database'

#Run process, export to MongoDB in-built
my_variants = annotate_batch.AnnotationMethods()
myvariant_data = my_variants.myvariant_chunks(list_file, chunksize, step, collection_name, db_name)


querying 1-1000...done.
Step: 1 of 22
...
querying 1-252...done.
Step: 22 of 22