You just need to do this only once
In [5]:
!rm -f genotypes.vcf.gz 2>/dev/null
!tabix -fh ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20130502/supporting/vcf_with_sample_level_annotation/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5_extra_anno.20130502.genotypes.vcf.gz 22:1-17000000|bgzip -c > genotypes.vcf.gz
!tabix -p vcf genotypes.vcf.gz
In [2]:
from collections import defaultdict
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import vcf
In [3]:
v = vcf.Reader(filename='genotypes.vcf.gz')
print('Variant Level information')
infos = v.infos
for info in infos:
print(info)
print('Sample Level information')
fmts = v.formats
for fmt in fmts:
print(fmt)
In [4]:
v = vcf.Reader(filename='genotypes.vcf.gz')
rec = next(v)
print(rec.CHROM, rec.POS, rec.ID, rec.REF, rec.ALT, rec.QUAL, rec.FILTER)
print(rec.INFO)
print(rec.FORMAT)
samples = rec.samples
print(len(samples))
sample = samples[0]
print(sample.called, sample.gt_alleles, sample.is_het, sample.is_variant, sample.phased)
print(int(sample['DP']))
In [5]:
f = vcf.Reader(filename='genotypes.vcf.gz')
my_type = defaultdict(int)
num_alts = defaultdict(int)
for rec in f:
my_type[rec.var_type, rec.var_subtype] += 1
if rec.is_snp:
num_alts[len(rec.ALT)] += 1
print(my_type)
print(num_alts)
In [6]:
f = vcf.Reader(filename='genotypes.vcf.gz')
sample_dp = defaultdict(int)
for rec in f:
if not rec.is_snp or len(rec.ALT) != 1:
continue
for sample in rec.samples:
dp = sample['DP']
if dp is None:
dp = 0
dp = int(dp)
sample_dp[dp] += 1
In [8]:
dps = sample_dp.keys()
dps.sort()
dp_dist = [sample_dp[x] for x in dps]
fig, ax = plt.subplots(figsize=(16, 9))
ax.plot(dp_dist[:50], 'r')
ax.axvline(dp_dist.index(max(dp_dist)))
Out[8]:
In [ ]: