You just need to do this only once
In [2]:
!rm -f NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam 2>/dev/null
!rm -f NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai 2>/dev/null
!wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/exome_alignment/NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam
!wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/exome_alignment/NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai
--2015-06-26 14:36:30-- ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/exome_alignment/NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam
=> ‘NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam’
Resolving ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)... 193.62.192.8
Connecting to ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)|193.62.192.8|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done. ==> PWD ... done.
==> TYPE I ... done. ==> CWD (1) /vol1/ftp/phase3/data/NA18489/exome_alignment ... done.
==> SIZE NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam ... 327067172
==> PASV ... done. ==> RETR NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam ... done.
Length: 327067172 (312M) (unauthoritative)
NA18489.chrom20.ILL 100%[=====================>] 311.92M 5.02MB/s in 31s
2015-06-26 14:37:02 (9.98 MB/s) - ‘NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam’ saved [327067172]
--2015-06-26 14:37:02-- ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/exome_alignment/NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai
=> ‘NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai’
Resolving ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)... 193.62.192.8
Connecting to ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)|193.62.192.8|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done. ==> PWD ... done.
==> TYPE I ... done. ==> CWD (1) /vol1/ftp/phase3/data/NA18489/exome_alignment ... done.
==> SIZE NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai ... 170688
==> PASV ... done. ==> RETR NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai ... done.
Length: 170688 (167K) (unauthoritative)
NA18489.chrom20.ILL 100%[=====================>] 166.69K --.-KB/s in 0.08s
2015-06-26 14:37:02 (2.02 MB/s) - ‘NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai’ saved [170688]
In [1]:
from collections import defaultdict
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import pysam
In [2]:
bam = pysam.AlignmentFile('NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam', 'rb')
In [3]:
headers = bam.header
for record_type, records in headers.items():
print (record_type)
for i, record in enumerate(records):
print('\t%d' % (i + 1))
for field, value in record.items():
print('\t\t%s\t%s' % (field, value))
SQ
1
LN 249250621
M5 1b22b98cdeb4a9304cb5d48026a85128
SN 1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
2
LN 243199373
M5 a0d9851da00400dec1098a9255ac712e
SN 2
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
3
LN 198022430
M5 fdfd811849cc2fadebc929bb925902e5
SN 3
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
4
LN 191154276
M5 23dccd106897542ad87d2765d28a19a1
SN 4
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
5
LN 180915260
M5 0740173db9ffd264d728f32784845cd7
SN 5
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
6
LN 171115067
M5 1d3a93a248d92a729ee764823acbbc6b
SN 6
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
7
LN 159138663
M5 618366e953d6aaad97dbe4777c29375e
SN 7
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
8
LN 146364022
M5 96f514a9929e410c6651697bded59aec
SN 8
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
9
LN 141213431
M5 3e273117f15e0a400f01055d9f393768
SN 9
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
10
LN 135534747
M5 988c28e000e84c26d552359af1ea2e1d
SN 10
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
11
LN 135006516
M5 98c59049a2df285c76ffb1c6db8f8b96
SN 11
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
12
LN 133851895
M5 51851ac0e1a115847ad36449b0015864
SN 12
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
13
LN 115169878
M5 283f8d7892baa81b510a015719ca7b0b
SN 13
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
14
LN 107349540
M5 98f3cae32b2a2e9524bc19813927542e
SN 14
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
15
LN 102531392
M5 e5645a794a8238215b2cd77acb95a078
SN 15
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
16
LN 90354753
M5 fc9b1a7b42b97a864f56b348b06095e6
SN 16
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
17
LN 81195210
M5 351f64d4f4f9ddd45b35336ad97aa6de
SN 17
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
18
LN 78077248
M5 b15d4b2d29dde9d3e4f93d1d0f2cbc9c
SN 18
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
19
LN 59128983
M5 1aacd71f30db8e561810913e0b72636d
SN 19
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
20
LN 63025520
M5 0dec9660ec1efaaf33281c0d5ea2560f
SN 20
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
21
LN 48129895
M5 2979a6085bfe28e3ad6f552f361ed74d
SN 21
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
22
LN 51304566
M5 a718acaa6135fdca8357d5bfe94211dd
SN 22
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
23
LN 155270560
M5 7e0e2e580297b7764e31dbc80c2540dd
SN X
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
24
LN 59373566
M5 1fa3474750af0948bdf97d5a0ee52e51
SN Y
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
25
LN 16569
M5 c68f52674c9fb33aef52dcf399755519
SN MT
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
26
LN 4262
M5 f3814841f1939d3ca19072d9e89f3fd7
SN GL000207.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
27
LN 15008
M5 1c1b2cd1fccbc0a99b6a447fa24d1504
SN GL000226.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
28
LN 19913
M5 d0f40ec87de311d8e715b52e4c7062e1
SN GL000229.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
29
LN 27386
M5 ba8882ce3a1efa2080e5d29b956568a4
SN GL000231.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
30
LN 27682
M5 851106a74238044126131ce2a8e5847c
SN GL000210.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
31
LN 33824
M5 99795f15702caec4fa1c4e15f8a29c07
SN GL000239.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
32
LN 34474
M5 118a25ca210cfbcdfb6c2ebb249f9680
SN GL000235.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
33
LN 36148
M5 dfb7e7ec60ffdcb85cb359ea28454ee9
SN GL000201.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
34
LN 36422
M5 7de00226bb7df1c57276ca6baabafd15
SN GL000247.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
35
LN 36651
M5 89bc61960f37d94abf0df2d481ada0ec
SN GL000245.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
36
LN 37175
M5 6f5efdd36643a9b8c8ccad6f2f1edc7b
SN GL000197.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
37
LN 37498
M5 96358c325fe0e70bee73436e8bb14dbd
SN GL000203.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
38
LN 38154
M5 e4afcd31912af9d9c2546acf1cb23af2
SN GL000246.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
39
LN 38502
M5 1d78abec37c15fe29a275eb08d5af236
SN GL000249.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
40
LN 38914
M5 d92206d1bb4c3b4019c43c0875c06dc0
SN GL000196.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
41
LN 39786
M5 5a8e43bec9be36c7b49c84d585107776
SN GL000248.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
42
LN 39929
M5 0996b4475f353ca98bacb756ac479140
SN GL000244.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
43
LN 39939
M5 131b1efc3270cc838686b54e7c34b17b
SN GL000238.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
44
LN 40103
M5 06cbf126247d89664a4faebad130fe9c
SN GL000202.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
45
LN 40531
M5 93f998536b61a56fd0ff47322a911d4b
SN GL000234.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
46
LN 40652
M5 3e06b6741061ad93a8587531307057d8
SN GL000232.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
47
LN 41001
M5 43f69e423533e948bfae5ce1d45bd3f1
SN GL000206.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
48
LN 41933
M5 445a86173da9f237d7bcf41c6cb8cc62
SN GL000240.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
49
LN 41934
M5 fdcd739913efa1fdc64b6c0cd7016779
SN GL000236.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
50
LN 42152
M5 ef4258cdc5a45c206cea8fc3e1d858cf
SN GL000241.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
51
LN 43341
M5 cc34279a7e353136741c9fce79bc4396
SN GL000243.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
52
LN 43523
M5 2f8694fc47576bc81b5fe9e7de0ba49e
SN GL000242.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
53
LN 43691
M5 b4eb71ee878d3706246b7c1dbef69299
SN GL000230.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
54
LN 45867
M5 e0c82e7751df73f4f6d0ed30cdc853c0
SN GL000237.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
55
LN 45941
M5 7fed60298a8d62ff808b74b6ce820001
SN GL000233.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
56
LN 81310
M5 efc49c871536fa8d79cb0a06fa739722
SN GL000204.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
57
LN 90085
M5 868e7784040da90d900d2d1b667a1383
SN GL000198.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
58
LN 92689
M5 aa81be49bf3fe63a79bdc6a6f279abf6
SN GL000208.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
59
LN 106433
M5 d75b436f50a8214ee9c2a51d30b2c2cc
SN GL000191.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
60
LN 128374
M5 a4aead23f8053f2655e468bcc6ecdceb
SN GL000227.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
61
LN 129120
M5 c5a17c97e2c1a0b6a9cc5a6b064b714f
SN GL000228.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
62
LN 137718
M5 46c2032c37f2ed899eb41c0473319a69
SN GL000214.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
63
LN 155397
M5 3238fb74ea87ae857f9c7508d315babb
SN GL000221.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
64
LN 159169
M5 f40598e2a5a6b26e84a3775e0d1e2c81
SN GL000209.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
65
LN 161147
M5 1d708b54644c26c7e01c2dad5426d38c
SN GL000218.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
66
LN 161802
M5 fc35de963c57bf7648429e6454f1c9db
SN GL000220.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
67
LN 164239
M5 9d424fdcc98866650b58f004080a992a
SN GL000213.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
68
LN 166566
M5 7daaa45c66b288847b9b32b964e623d3
SN GL000211.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
69
LN 169874
M5 569af3b73522fab4b40995ae4944e78e
SN GL000199.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
70
LN 172149
M5 6d243e18dea1945fb7f2517615b8f52e
SN GL000217.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
71
LN 172294
M5 642a232d91c486ac339263820aef7fe0
SN GL000216.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
72
LN 172545
M5 5eb3b418480ae67a997957c909375a73
SN GL000215.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
73
LN 174588
M5 d22441398d99caf673e9afb9a1908ec5
SN GL000205.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
74
LN 179198
M5 f977edd13bac459cb2ed4a5457dba1b3
SN GL000219.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
75
LN 179693
M5 d5b2fc04f6b41b212a4198a07f450e20
SN GL000224.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
76
LN 180455
M5 399dfa03bf32022ab52a846f7ca35b30
SN GL000223.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
77
LN 182896
M5 5d9ec007868d517e73543b005ba48535
SN GL000195.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
78
LN 186858
M5 563531689f3dbd691331fd6c5730a88b
SN GL000212.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
79
LN 186861
M5 6fe9abac455169f50470f5a6b01d0f59
SN GL000222.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
80
LN 187035
M5 75e4c8d17cd4addf3917d1703cacaf25
SN GL000200.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
81
LN 189789
M5 dbb6e8ece0b5de29da56601613007c2a
SN GL000193.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
82
LN 191469
M5 6ac8f815bf8e845bb3031b73f812c012
SN GL000194.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
83
LN 211173
M5 63945c3e6962f28ffd469719a747e73c
SN GL000225.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
84
LN 547496
M5 325ba9e808f669dfeee210fdd7b470ac
SN GL000192.1
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
85
LN 171823
M5 6743bd63b3ff2b5b8985d8933c53290a
SN NC_007605
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
86
LN 35477943
M5 5b6a4b3a81a2d3c134b7d14bf6ad39f1
SN hs37d5
UR ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human
RG
1
LB Solexa-51039
CN BI
DS SRP004074
SM NA18489
PI 220
ID SRR100025
PL ILLUMINA
CO
1
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-3-3adad14901f7> in <module>()
4 for i, record in enumerate(records):
5 print('\t%d' % (i + 1))
----> 6 for field, value in record.items():
7 print('\t\t%s\t%s' % (field, value))
AttributeError: 'str' object has no attribute 'items'
In [4]:
#0-based
for rec in bam:
if rec.cigarstring.find('M') > -1 and rec.cigarstring.find('S') > -1 and not rec.is_unmapped and not rec.mate_is_unmapped:
break
print(rec.query_name, rec.reference_id, bam.getrname(rec.reference_id), rec.reference_start, rec.reference_end)
print(rec.cigarstring)
print(rec.query_alignment_start, rec.query_alignment_end, rec.query_alignment_length)
print(rec.next_reference_id, rec.next_reference_start, rec.template_length)
print(rec.is_paired, rec.is_proper_pair, rec.is_unmapped, rec.mapping_quality)
print(rec.query_qualities)
print(rec.query_alignment_qualities)
print(rec.query_sequence)
('SRR100025.62130839', 19, '20', 59996, 60048)
52M24S
(0, 52, 76)
(19, 60228, 295)
(True, True, False, 60)
array('B', [33, 34, 36, 33, 39, 34, 33, 38, 39, 34, 40, 35, 40, 40, 32, 40, 38, 33, 35, 38, 33, 39, 40, 34, 37, 39, 36, 30, 36, 37, 34, 35, 34, 40, 37, 34, 38, 28, 40, 40, 38, 32, 33, 32, 36, 34, 37, 24, 34, 35, 31, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
array('B', [33, 34, 36, 33, 39, 34, 33, 38, 39, 34, 40, 35, 40, 40, 32, 40, 38, 33, 35, 38, 33, 39, 40, 34, 37, 39, 36, 30, 36, 37, 34, 35, 34, 40, 37, 34, 38, 28, 40, 40, 38, 32, 33, 32, 36, 34, 37, 24, 34, 35, 31, 2])
CTCAGATCCAGAGGTGGAAGAGGAAGGAAGCTTGGAACCCTATAGAGTTGCTGAGTGCCAGGACCAGATACTGGGC
In [5]:
counts = [0] * 76
for n, rec in enumerate(bam.fetch('20', 0, 10000000)):
for i in range(rec.query_alignment_start, rec.query_alignment_end):
counts[i] += 1
freqs = [x / (n + 1.) for x in counts]
plt.plot(range(1, 77), freqs)
Out[5]:
[<matplotlib.lines.Line2D at 0x7fc9c50789d0>]
In [ ]:
phreds = defaultdict(list)
for rec in bam.fetch('20', 0, None):
for i in range(rec.query_alignment_start, rec.query_alignment_end):
phreds[i].append(rec.query_qualities[i])
In [ ]:
maxs = [max(phreds[i]) for i in range(76)]
tops = [np.percentile(phreds[i], 95) for i in range(76)]
medians = [np.percentile(phreds[i], 50) for i in range(76)]
bottoms = [np.percentile(phreds[i], 5) for i in range(76)]
medians_fig = [x - y for x, y in zip(medians, bottoms)]
tops_fig = [x - y for x, y in zip(tops, medians)]
maxs_fig = [x - y for x, y in zip(maxs, tops)]
In [ ]:
fig, ax = plt.subplots(figsize=(16,9))
ax.stackplot(range(1, 77), (bottoms, medians_fig, tops_fig, maxs_fig))
ax.plot(range(1, 77), maxs, 'k-')
In [ ]:
Content source: tiagoantao/bioinf-python
Similar notebooks: