Getting the necessary data

You just need to do this only once


In [2]:
!rm -f NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam 2>/dev/null
!rm -f NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai 2>/dev/null
!wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/exome_alignment/NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam
!wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/exome_alignment/NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai


--2015-06-26 14:36:30--  ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/exome_alignment/NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam
           => ‘NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam’
Resolving ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)... 193.62.192.8
Connecting to ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)|193.62.192.8|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /vol1/ftp/phase3/data/NA18489/exome_alignment ... done.
==> SIZE NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam ... 327067172
==> PASV ... done.    ==> RETR NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam ... done.
Length: 327067172 (312M) (unauthoritative)

NA18489.chrom20.ILL 100%[=====================>] 311.92M  5.02MB/s   in 31s    

2015-06-26 14:37:02 (9.98 MB/s) - ‘NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam’ saved [327067172]

--2015-06-26 14:37:02--  ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/exome_alignment/NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai
           => ‘NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai’
Resolving ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)... 193.62.192.8
Connecting to ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)|193.62.192.8|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /vol1/ftp/phase3/data/NA18489/exome_alignment ... done.
==> SIZE NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai ... 170688
==> PASV ... done.    ==> RETR NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai ... done.
Length: 170688 (167K) (unauthoritative)

NA18489.chrom20.ILL 100%[=====================>] 166.69K  --.-KB/s   in 0.08s  

2015-06-26 14:37:02 (2.02 MB/s) - ‘NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai’ saved [170688]

The recipe


In [1]:
from collections import defaultdict

import numpy as np

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

import pysam

In [2]:
bam = pysam.AlignmentFile('NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam', 'rb')

In [3]:
headers = bam.header
for record_type, records in headers.items():
    print (record_type)
    for i, record in enumerate(records):
        print('\t%d' % (i + 1))
        for field, value in record.items():
            print('\t\t%s\t%s' % (field, value))


SQ
	1
		LN	249250621
		M5	1b22b98cdeb4a9304cb5d48026a85128
		SN	1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	2
		LN	243199373
		M5	a0d9851da00400dec1098a9255ac712e
		SN	2
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	3
		LN	198022430
		M5	fdfd811849cc2fadebc929bb925902e5
		SN	3
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	4
		LN	191154276
		M5	23dccd106897542ad87d2765d28a19a1
		SN	4
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	5
		LN	180915260
		M5	0740173db9ffd264d728f32784845cd7
		SN	5
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	6
		LN	171115067
		M5	1d3a93a248d92a729ee764823acbbc6b
		SN	6
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	7
		LN	159138663
		M5	618366e953d6aaad97dbe4777c29375e
		SN	7
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	8
		LN	146364022
		M5	96f514a9929e410c6651697bded59aec
		SN	8
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	9
		LN	141213431
		M5	3e273117f15e0a400f01055d9f393768
		SN	9
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	10
		LN	135534747
		M5	988c28e000e84c26d552359af1ea2e1d
		SN	10
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	11
		LN	135006516
		M5	98c59049a2df285c76ffb1c6db8f8b96
		SN	11
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	12
		LN	133851895
		M5	51851ac0e1a115847ad36449b0015864
		SN	12
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	13
		LN	115169878
		M5	283f8d7892baa81b510a015719ca7b0b
		SN	13
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	14
		LN	107349540
		M5	98f3cae32b2a2e9524bc19813927542e
		SN	14
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	15
		LN	102531392
		M5	e5645a794a8238215b2cd77acb95a078
		SN	15
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	16
		LN	90354753
		M5	fc9b1a7b42b97a864f56b348b06095e6
		SN	16
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	17
		LN	81195210
		M5	351f64d4f4f9ddd45b35336ad97aa6de
		SN	17
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	18
		LN	78077248
		M5	b15d4b2d29dde9d3e4f93d1d0f2cbc9c
		SN	18
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	19
		LN	59128983
		M5	1aacd71f30db8e561810913e0b72636d
		SN	19
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	20
		LN	63025520
		M5	0dec9660ec1efaaf33281c0d5ea2560f
		SN	20
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	21
		LN	48129895
		M5	2979a6085bfe28e3ad6f552f361ed74d
		SN	21
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	22
		LN	51304566
		M5	a718acaa6135fdca8357d5bfe94211dd
		SN	22
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	23
		LN	155270560
		M5	7e0e2e580297b7764e31dbc80c2540dd
		SN	X
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	24
		LN	59373566
		M5	1fa3474750af0948bdf97d5a0ee52e51
		SN	Y
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	25
		LN	16569
		M5	c68f52674c9fb33aef52dcf399755519
		SN	MT
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	26
		LN	4262
		M5	f3814841f1939d3ca19072d9e89f3fd7
		SN	GL000207.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	27
		LN	15008
		M5	1c1b2cd1fccbc0a99b6a447fa24d1504
		SN	GL000226.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	28
		LN	19913
		M5	d0f40ec87de311d8e715b52e4c7062e1
		SN	GL000229.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	29
		LN	27386
		M5	ba8882ce3a1efa2080e5d29b956568a4
		SN	GL000231.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	30
		LN	27682
		M5	851106a74238044126131ce2a8e5847c
		SN	GL000210.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	31
		LN	33824
		M5	99795f15702caec4fa1c4e15f8a29c07
		SN	GL000239.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	32
		LN	34474
		M5	118a25ca210cfbcdfb6c2ebb249f9680
		SN	GL000235.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	33
		LN	36148
		M5	dfb7e7ec60ffdcb85cb359ea28454ee9
		SN	GL000201.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	34
		LN	36422
		M5	7de00226bb7df1c57276ca6baabafd15
		SN	GL000247.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	35
		LN	36651
		M5	89bc61960f37d94abf0df2d481ada0ec
		SN	GL000245.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	36
		LN	37175
		M5	6f5efdd36643a9b8c8ccad6f2f1edc7b
		SN	GL000197.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	37
		LN	37498
		M5	96358c325fe0e70bee73436e8bb14dbd
		SN	GL000203.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	38
		LN	38154
		M5	e4afcd31912af9d9c2546acf1cb23af2
		SN	GL000246.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	39
		LN	38502
		M5	1d78abec37c15fe29a275eb08d5af236
		SN	GL000249.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	40
		LN	38914
		M5	d92206d1bb4c3b4019c43c0875c06dc0
		SN	GL000196.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	41
		LN	39786
		M5	5a8e43bec9be36c7b49c84d585107776
		SN	GL000248.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	42
		LN	39929
		M5	0996b4475f353ca98bacb756ac479140
		SN	GL000244.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	43
		LN	39939
		M5	131b1efc3270cc838686b54e7c34b17b
		SN	GL000238.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	44
		LN	40103
		M5	06cbf126247d89664a4faebad130fe9c
		SN	GL000202.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	45
		LN	40531
		M5	93f998536b61a56fd0ff47322a911d4b
		SN	GL000234.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	46
		LN	40652
		M5	3e06b6741061ad93a8587531307057d8
		SN	GL000232.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	47
		LN	41001
		M5	43f69e423533e948bfae5ce1d45bd3f1
		SN	GL000206.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	48
		LN	41933
		M5	445a86173da9f237d7bcf41c6cb8cc62
		SN	GL000240.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	49
		LN	41934
		M5	fdcd739913efa1fdc64b6c0cd7016779
		SN	GL000236.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	50
		LN	42152
		M5	ef4258cdc5a45c206cea8fc3e1d858cf
		SN	GL000241.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	51
		LN	43341
		M5	cc34279a7e353136741c9fce79bc4396
		SN	GL000243.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	52
		LN	43523
		M5	2f8694fc47576bc81b5fe9e7de0ba49e
		SN	GL000242.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	53
		LN	43691
		M5	b4eb71ee878d3706246b7c1dbef69299
		SN	GL000230.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	54
		LN	45867
		M5	e0c82e7751df73f4f6d0ed30cdc853c0
		SN	GL000237.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	55
		LN	45941
		M5	7fed60298a8d62ff808b74b6ce820001
		SN	GL000233.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	56
		LN	81310
		M5	efc49c871536fa8d79cb0a06fa739722
		SN	GL000204.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	57
		LN	90085
		M5	868e7784040da90d900d2d1b667a1383
		SN	GL000198.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	58
		LN	92689
		M5	aa81be49bf3fe63a79bdc6a6f279abf6
		SN	GL000208.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	59
		LN	106433
		M5	d75b436f50a8214ee9c2a51d30b2c2cc
		SN	GL000191.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	60
		LN	128374
		M5	a4aead23f8053f2655e468bcc6ecdceb
		SN	GL000227.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	61
		LN	129120
		M5	c5a17c97e2c1a0b6a9cc5a6b064b714f
		SN	GL000228.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	62
		LN	137718
		M5	46c2032c37f2ed899eb41c0473319a69
		SN	GL000214.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	63
		LN	155397
		M5	3238fb74ea87ae857f9c7508d315babb
		SN	GL000221.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	64
		LN	159169
		M5	f40598e2a5a6b26e84a3775e0d1e2c81
		SN	GL000209.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	65
		LN	161147
		M5	1d708b54644c26c7e01c2dad5426d38c
		SN	GL000218.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	66
		LN	161802
		M5	fc35de963c57bf7648429e6454f1c9db
		SN	GL000220.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	67
		LN	164239
		M5	9d424fdcc98866650b58f004080a992a
		SN	GL000213.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	68
		LN	166566
		M5	7daaa45c66b288847b9b32b964e623d3
		SN	GL000211.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	69
		LN	169874
		M5	569af3b73522fab4b40995ae4944e78e
		SN	GL000199.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	70
		LN	172149
		M5	6d243e18dea1945fb7f2517615b8f52e
		SN	GL000217.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	71
		LN	172294
		M5	642a232d91c486ac339263820aef7fe0
		SN	GL000216.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	72
		LN	172545
		M5	5eb3b418480ae67a997957c909375a73
		SN	GL000215.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	73
		LN	174588
		M5	d22441398d99caf673e9afb9a1908ec5
		SN	GL000205.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	74
		LN	179198
		M5	f977edd13bac459cb2ed4a5457dba1b3
		SN	GL000219.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	75
		LN	179693
		M5	d5b2fc04f6b41b212a4198a07f450e20
		SN	GL000224.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	76
		LN	180455
		M5	399dfa03bf32022ab52a846f7ca35b30
		SN	GL000223.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	77
		LN	182896
		M5	5d9ec007868d517e73543b005ba48535
		SN	GL000195.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	78
		LN	186858
		M5	563531689f3dbd691331fd6c5730a88b
		SN	GL000212.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	79
		LN	186861
		M5	6fe9abac455169f50470f5a6b01d0f59
		SN	GL000222.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	80
		LN	187035
		M5	75e4c8d17cd4addf3917d1703cacaf25
		SN	GL000200.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	81
		LN	189789
		M5	dbb6e8ece0b5de29da56601613007c2a
		SN	GL000193.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	82
		LN	191469
		M5	6ac8f815bf8e845bb3031b73f812c012
		SN	GL000194.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	83
		LN	211173
		M5	63945c3e6962f28ffd469719a747e73c
		SN	GL000225.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	84
		LN	547496
		M5	325ba9e808f669dfeee210fdd7b470ac
		SN	GL000192.1
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	85
		LN	171823
		M5	6743bd63b3ff2b5b8985d8933c53290a
		SN	NC_007605
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
	86
		LN	35477943
		M5	5b6a4b3a81a2d3c134b7d14bf6ad39f1
		SN	hs37d5
		UR	ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
RG
	1
		LB	Solexa-51039
		CN	BI
		DS	SRP004074
		SM	NA18489
		PI	220
		ID	SRR100025
		PL	ILLUMINA
CO
	1
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-3-3adad14901f7> in <module>()
      4     for i, record in enumerate(records):
      5         print('\t%d' % (i + 1))
----> 6         for field, value in record.items():
      7             print('\t\t%s\t%s' % (field, value))

AttributeError: 'str' object has no attribute 'items'

In [4]:
#0-based
for rec in bam:
    if rec.cigarstring.find('M') > -1 and rec.cigarstring.find('S') > -1 and not rec.is_unmapped and not rec.mate_is_unmapped:
        break
print(rec.query_name, rec.reference_id, bam.getrname(rec.reference_id), rec.reference_start, rec.reference_end)
print(rec.cigarstring)
print(rec.query_alignment_start, rec.query_alignment_end, rec.query_alignment_length)
print(rec.next_reference_id, rec.next_reference_start, rec.template_length)
print(rec.is_paired, rec.is_proper_pair, rec.is_unmapped, rec.mapping_quality)
print(rec.query_qualities)
print(rec.query_alignment_qualities)
print(rec.query_sequence)


('SRR100025.62130839', 19, '20', 59996, 60048)
52M24S
(0, 52, 76)
(19, 60228, 295)
(True, True, False, 60)
array('B', [33, 34, 36, 33, 39, 34, 33, 38, 39, 34, 40, 35, 40, 40, 32, 40, 38, 33, 35, 38, 33, 39, 40, 34, 37, 39, 36, 30, 36, 37, 34, 35, 34, 40, 37, 34, 38, 28, 40, 40, 38, 32, 33, 32, 36, 34, 37, 24, 34, 35, 31, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
array('B', [33, 34, 36, 33, 39, 34, 33, 38, 39, 34, 40, 35, 40, 40, 32, 40, 38, 33, 35, 38, 33, 39, 40, 34, 37, 39, 36, 30, 36, 37, 34, 35, 34, 40, 37, 34, 38, 28, 40, 40, 38, 32, 33, 32, 36, 34, 37, 24, 34, 35, 31, 2])
CTCAGATCCAGAGGTGGAAGAGGAAGGAAGCTTGGAACCCTATAGAGTTGCTGAGTGCCAGGACCAGATACTGGGC

In [5]:
counts = [0] * 76
for n, rec in enumerate(bam.fetch('20', 0, 10000000)):
    for i in range(rec.query_alignment_start, rec.query_alignment_end):
        counts[i] += 1
freqs = [x / (n + 1.) for x in counts]
plt.plot(range(1, 77), freqs)


Out[5]:
[<matplotlib.lines.Line2D at 0x7fc9c50789d0>]

In [ ]:
phreds = defaultdict(list)
for rec in bam.fetch('20', 0, None):
    for i in range(rec.query_alignment_start, rec.query_alignment_end):
        phreds[i].append(rec.query_qualities[i])

In [ ]:
maxs = [max(phreds[i]) for i in range(76)]
tops = [np.percentile(phreds[i], 95) for i in range(76)]
medians = [np.percentile(phreds[i], 50) for i in range(76)]
bottoms = [np.percentile(phreds[i], 5) for i in range(76)]
medians_fig = [x - y for x, y in zip(medians, bottoms)]
tops_fig = [x - y for x, y in zip(tops, medians)]
maxs_fig = [x - y for x, y in zip(maxs, tops)]

In [ ]:
fig, ax = plt.subplots(figsize=(16,9))
ax.stackplot(range(1, 77), (bottoms, medians_fig, tops_fig, maxs_fig))
ax.plot(range(1, 77), maxs, 'k-')

In [ ]: