In [1]:
%run ../../shared_setup.ipynb
docker image cggh/biipy:v1.6.0
In [13]:
# Sample manifest for the 20141022 public release of the crosses data.
# The raw 'cross' values are passed through LABELS (not defined in this
# notebook -- presumably provided by shared_setup; confirm there), and the
# lower-case column names are replaced with capitalised titles.
tbl_samples = etl.fromtsv(
    '/data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/samples.txt'
).convert(
    'cross', LABELS
).rename(
    dict(
        cross='Cross',
        clone='Clone',
        sample='Sample',
        run='Run',
        instrument='Instrument',
        coverage='Coverage',
    )
)
# Show every row, numbered, without the positional index prefix in the header.
tbl_samples.addrownumbers().displayall(index_header=False)
row
Cross
Clone
Sample
Run
Instrument
Coverage
1
3D7 x HB3
3D7
PG0051-C
ERR019061
Illumina Genome Analyzer II
122X
2
3D7 x HB3
C01
PG0065-C
ERR019064
Illumina Genome Analyzer II
163X
3
3D7 x HB3
C01
PG0062-C
ERR019070
Illumina Genome Analyzer II
108X
4
3D7 x HB3
C02
PG0055-C
ERR019066
Illumina Genome Analyzer II
102X
5
3D7 x HB3
C02
PG0053-C
ERR019067
Illumina Genome Analyzer II
73X
6
3D7 x HB3
C02
PG0056-C
ERR019068
Illumina Genome Analyzer II
84X
7
3D7 x HB3
C02
PG0067-C
ERR019073
Illumina Genome Analyzer II
126X
8
3D7 x HB3
C03
PG0066-C
ERR019072
Illumina Genome Analyzer II
79X
9
3D7 x HB3
C04
PG0061-C
ERR019059
Illumina Genome Analyzer II
165X
10
3D7 x HB3
C05
PG0068-C
ERR019065
Illumina Genome Analyzer II
41X
11
3D7 x HB3
C06
PG0069-C
ERR019055
Illumina Genome Analyzer II
135X
12
3D7 x HB3
C07
PG0070-C
ERR019056
Illumina Genome Analyzer II
144X
13
3D7 x HB3
C08
PG0071-C
ERR019074
Illumina Genome Analyzer II
120X
14
3D7 x HB3
C09
PG0072-C
ERR019057
Illumina Genome Analyzer II
173X
15
3D7 x HB3
C10
PG0063-C
ERR019060
Illumina Genome Analyzer II
108X
16
3D7 x HB3
C11
PG0064-C
ERR019071
Illumina Genome Analyzer II
48X
17
3D7 x HB3
C12
PG0058-C
ERR019063
Illumina Genome Analyzer II
51X
18
3D7 x HB3
C13
PG0054-C
ERR019062
Illumina Genome Analyzer II
95X
19
3D7 x HB3
C14
PG0060-C
ERR019058
Illumina Genome Analyzer II
102X
20
3D7 x HB3
C15
PG0057-C
ERR019069
Illumina Genome Analyzer II
56X
21
3D7 x HB3
HB3
PG0052-C
ERR019054
Illumina Genome Analyzer II
100X
22
7G8 x GB4
7G8
PG0083-C
ERR027099
Illumina Genome Analyzer II
87X
23
7G8 x GB4
AL2
PG0103-CW
ERR045627
Illumina HiSeq 2000
127X
24
7G8 x GB4
AUD
PG0112-C
ERR029406
Illumina Genome Analyzer II
129X
25
7G8 x GB4
AUD
PG0112-CW
ERR045639
Illumina HiSeq 2000
88X
26
7G8 x GB4
D2
PG0094-CW
ERR045632
Illumina HiSeq 2000
153X
27
7G8 x GB4
DAN
PG0098-C
ERR027110
Illumina Genome Analyzer II
140X
28
7G8 x GB4
DEV
PG0081-CW
ERR045633
Illumina HiSeq 2000
89X
29
7G8 x GB4
GB4
PG0084-C
ERR027100
Illumina Genome Analyzer II
104X
30
7G8 x GB4
JB12
PG0099-C
ERR029146
Illumina Genome Analyzer II
120X
31
7G8 x GB4
JB8
PG0087-C
ERR029091
Illumina Genome Analyzer II
103X
32
7G8 x GB4
JC3
PG0077-CW
ERR045636
Illumina HiSeq 2000
94X
33
7G8 x GB4
JC9
PG0111-C
ERR029409
Illumina Genome Analyzer II
122X
34
7G8 x GB4
JC9
PG0111-CW
ERR045634
Illumina HiSeq 2000
121X
35
7G8 x GB4
JE11
PG0100-C
ERR029404
Illumina Genome Analyzer II
134X
36
7G8 x GB4
JE11
PG0100-CW
ERR045630
Illumina HiSeq 2000
55X
37
7G8 x GB4
JF6
PG0079-C
ERR027102
Illumina Genome Analyzer II
181X
38
7G8 x GB4
JF6
PG0079-CW
ERR045637
Illumina HiSeq 2000
94X
39
7G8 x GB4
JON
PG0107-C
ERR029408
Illumina Genome Analyzer II
180X
40
7G8 x GB4
KA6
PG0091-C
ERR027117
Illumina Genome Analyzer II
80X
41
7G8 x GB4
KB8
PG0104-C
ERR029148
Illumina Genome Analyzer II
116X
42
7G8 x GB4
KB8
PG0104-CW
ERR045642
Illumina HiSeq 2000
81X
43
7G8 x GB4
KH7
PG0088-C
ERR027111
Illumina Genome Analyzer II
96X
44
7G8 x GB4
LA10
PG0086-C
ERR029090
Illumina Genome Analyzer II
119X
45
7G8 x GB4
LA10
PG0086-CW
ERR045629
Illumina HiSeq 2000
66X
46
7G8 x GB4
NF10
PG0096-C
ERR027108
Illumina Genome Analyzer II
75X
47
7G8 x GB4
NIC
PG0095-C
ERR027107
Illumina Genome Analyzer II
70X
48
7G8 x GB4
NIC
PG0095-CW
ERR045631
Illumina HiSeq 2000
80X
49
7G8 x GB4
QF5
PG0078-C
ERR029092
Illumina Genome Analyzer II
147X
50
7G8 x GB4
QF5
PG0078-CW
ERR045638
Illumina HiSeq 2000
82X
51
7G8 x GB4
TF1
PG0080-C
ERR027103
Illumina Genome Analyzer II
73X
52
7G8 x GB4
WC4
PG0082-C
ERR029093
Illumina Genome Analyzer II
78X
53
7G8 x GB4
WE2
PG0085-C
ERR027101
Illumina Genome Analyzer II
124X
54
7G8 x GB4
WF12
PG0097-C
ERR027109
Illumina Genome Analyzer II
109X
55
7G8 x GB4
XB3
PG0093-C
ERR029105
Illumina Genome Analyzer II
214X
56
7G8 x GB4
XD8
PG0105-C
ERR029144
Illumina Genome Analyzer II
121X
57
7G8 x GB4
XD8
PG0105-CW
ERR045628
Illumina HiSeq 2000
122X
58
7G8 x GB4
XE7
PG0106-C
ERR029407
Illumina Genome Analyzer II
250X
59
7G8 x GB4
XF12
PG0102-C
ERR029143
Illumina Genome Analyzer II
141X
60
7G8 x GB4
XF12
PG0102-CW
ERR045635
Illumina HiSeq 2000
96X
61
7G8 x GB4
XG10
PG0109-C
ERR029405
Illumina Genome Analyzer II
61X
62
HB3 x Dd2
1BB5
PG0023-C
ERR015449
Illumina Genome Analyzer II
22X
63
HB3 x Dd2
3BA6
PG0022-Cx
ERR126027
Illumina HiSeq 2000
32X
64
HB3 x Dd2
3BD5
PG0024-C
ERR019053
Illumina Genome Analyzer II
92X
65
HB3 x Dd2
7C101
PG0074-C
ERR019048
Illumina Genome Analyzer II
98X
66
HB3 x Dd2
7C111
PG0038-C
ERR015457
Illumina Genome Analyzer II
148X
67
HB3 x Dd2
7C12
PG0035-Cx
ERR037704
Illumina HiSeq 2000
637X
68
HB3 x Dd2
7C126
PG0047-C
ERR015452
Illumina Genome Analyzer II
187X
69
HB3 x Dd2
7C140
PG0039-C
ERR015454
Illumina Genome Analyzer II
78X
70
HB3 x Dd2
7C159
PG0040-Cx
ERR107475
Illumina HiSeq 2000
59X
71
HB3 x Dd2
7C16
PG0036-C
ERR015455
Illumina Genome Analyzer II
26X
72
HB3 x Dd2
7C170
PG0041-C
ERR015446
Illumina Genome Analyzer II
130X
73
HB3 x Dd2
7C183
PG0042-C
ERR015448
Illumina Genome Analyzer II
118X
74
HB3 x Dd2
7C188
PG0030-C
ERR019046
Illumina Genome Analyzer II
171X
75
HB3 x Dd2
7C20
PG0037-C
ERR015451
Illumina Genome Analyzer II
82X
76
HB3 x Dd2
7C3
PG0034-C
ERR019047
Illumina Genome Analyzer II
142X
77
HB3 x Dd2
7C408
PG0031-C
ERR015458
Illumina Genome Analyzer II
51X
78
HB3 x Dd2
7C421
PG0043-C
ERR015459
Illumina Genome Analyzer II
164X
79
HB3 x Dd2
7C424
PG0044-C
ERR019043
Illumina Genome Analyzer II
172X
80
HB3 x Dd2
7C46
PG0046-Cx
ERR107476
Illumina HiSeq 2000
62X
81
HB3 x Dd2
7C7
PG0048-C
ERR019049
Illumina Genome Analyzer II
110X
82
HB3 x Dd2
B1SD
PG0015-C
ERR019044
Illumina Genome Analyzer II
91X
83
HB3 x Dd2
B4R3
PG0018-C
ERR019042
Illumina Genome Analyzer II
115X
84
HB3 x Dd2
CH3_116
PG0032-Cx
ERR037703
Illumina HiSeq 2000
186X
85
HB3 x Dd2
CH3_61
PG0033-Cx
ERR175544
Illumina HiSeq 2000
68X
86
HB3 x Dd2
D43
PG0029-Cx
ERR107474
Illumina HiSeq 2000
34X
87
HB3 x Dd2
DD2
PG0008-CW
ERR012840
Illumina Genome Analyzer II
122X
88
HB3 x Dd2
GC03
PG0021-C
ERR015447
Illumina Genome Analyzer II
152X
89
HB3 x Dd2
GC06
PG0028-C
ERR015456
Illumina Genome Analyzer II
54X
90
HB3 x Dd2
HB3
PG0004-CW
ERR012788
Illumina Genome Analyzer II
80X
91
HB3 x Dd2
QC01
PG0017-C
ERR019050
Illumina Genome Analyzer II
117X
92
HB3 x Dd2
QC13
PG0016-C
ERR012895
Illumina Genome Analyzer II
68X
93
HB3 x Dd2
QC23
PG0045-C
ERR012892
Illumina Genome Analyzer II
115X
94
HB3 x Dd2
QC34
PG0026-C
ERR015453
Illumina Genome Analyzer II
55X
95
HB3 x Dd2
SC01
PG0025-C
ERR019045
Illumina Genome Analyzer II
149X
96
HB3 x Dd2
SC05
PG0019-C
ERR019051
Illumina Genome Analyzer II
97X
97
HB3 x Dd2
TC05
PG0027-C
ERR015450
Illumina Genome Analyzer II
115X
98
HB3 x Dd2
TC08
PG0020-C
ERR019052
Illumina Genome Analyzer II
144X
In [4]:
# Comma-separated list of every run accession in the manifest, in sorted order.
runs = ','.join(sorted(tbl_samples.values('Run')))
runs
Out[4]:
'ERR012788,ERR012840,ERR012892,ERR012895,ERR015446,ERR015447,ERR015448,ERR015449,ERR015450,ERR015451,ERR015452,ERR015453,ERR015454,ERR015455,ERR015456,ERR015457,ERR015458,ERR015459,ERR019042,ERR019043,ERR019044,ERR019045,ERR019046,ERR019047,ERR019048,ERR019049,ERR019050,ERR019051,ERR019052,ERR019053,ERR019054,ERR019055,ERR019056,ERR019057,ERR019058,ERR019059,ERR019060,ERR019061,ERR019062,ERR019063,ERR019064,ERR019065,ERR019066,ERR019067,ERR019068,ERR019069,ERR019070,ERR019071,ERR019072,ERR019073,ERR019074,ERR027099,ERR027100,ERR027101,ERR027102,ERR027103,ERR027107,ERR027108,ERR027109,ERR027110,ERR027111,ERR027117,ERR029090,ERR029091,ERR029092,ERR029093,ERR029105,ERR029143,ERR029144,ERR029146,ERR029148,ERR029404,ERR029405,ERR029406,ERR029407,ERR029408,ERR029409,ERR037703,ERR037704,ERR045627,ERR045628,ERR045629,ERR045630,ERR045631,ERR045632,ERR045633,ERR045634,ERR045635,ERR045636,ERR045637,ERR045638,ERR045639,ERR045642,ERR107474,ERR107475,ERR107476,ERR126027,ERR175544'
In [5]:
# ENA metadata export stored alongside the sample manifest. In the preview
# below each paired-end run appears once per FASTQ file.
tbl_ena = etl.fromtsv('/data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/ena_metadata.txt')
tbl_ena
Out[5]:
0|Study
1|Sample
2|Experiment
3|Run
4|Organism
5|Instrument Platform
6|Instrument Model
7|Library Name
8|Library Layout
9|Library Source
10|Library Selection
11|Run Read Count
12|Run Base Count
13|File Name
14|File Size
15|md5
16|Ftp
ERP000190
ERS010539
ERX005179
ERR012788
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0004_CW_200 1
PAIRED
GENOMIC
RANDOM
13591264
2065872128
ERR012788_1.fastq.gz
973Mb
c5e9454eb30ea782d48f0b14e78eb184
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR012/ERR012788/ERR012788_1.fastq.gz
ERP000190
ERS010539
ERX005179
ERR012788
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0004_CW_200 1
PAIRED
GENOMIC
RANDOM
13591264
2065872128
ERR012788_2.fastq.gz
1006Mb
31215fc9e29e99d3e759e63dd339736f
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR012/ERR012788/ERR012788_2.fastq.gz
ERP000190
ERS010540
ERX005100
ERR012840
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0008_CW_200 1
PAIRED
GENOMIC
RANDOM
20276091
3081965832
ERR012840_1.fastq.gz
1Gb
80f8291daa0433920f5be0d1724d9c8c
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR012/ERR012840/ERR012840_1.fastq.gz
ERP000190
ERS010540
ERX005100
ERR012840
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0008_CW_200 1
PAIRED
GENOMIC
RANDOM
20276091
3081965832
ERR012840_2.fastq.gz
1Gb
e05eee4c9bde2126be1cb7db3d6ed92e
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR012/ERR012840/ERR012840_2.fastq.gz
ERP000199
ERS010147
ERX007548
ERR019044
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0015-C 138666
PAIRED
GENOMIC
RANDOM
18646034
2834197168
ERR019044_1.fastq.gz
1Gb
3d41ff287a6ac012fecfd8fa43ab6913
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR019/ERR019044/ERR019044_1.fastq.gz
...
In [9]:
# NOTE(review): interactive IPython help lookup left over from exploration;
# nothing later in this notebook calls mergeduplicates, so this cell can be dropped.
etl.mergeduplicates?
In [10]:
# Keep only the first 13 columns (Study .. Run Base Count), i.e. drop the
# per-file columns (File Name, File Size, md5, Ftp), then collapse the
# resulting duplicate rows.
tbl_ena_samples = etl.distinct(etl.cut(tbl_ena, *range(13)))
tbl_ena_samples
Out[10]:
0|Study
1|Sample
2|Experiment
3|Run
4|Organism
5|Instrument Platform
6|Instrument Model
7|Library Name
8|Library Layout
9|Library Source
10|Library Selection
11|Run Read Count
12|Run Base Count
ERP000190
ERS010133
ERX009555
ERR023683
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
473_578_mal_move
PAIRED
GENOMIC
RANDOM
1952973
312475680
ERP000190
ERS010134
ERX009555
ERR023673
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
473_578_mal_move
PAIRED
GENOMIC
RANDOM
2834488
453518080
ERP000190
ERS010149
ERX009555
ERR023675
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
473_578_mal_move
PAIRED
GENOMIC
RANDOM
801376
128220160
ERP000190
ERS010539
ERX005179
ERR012788
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0004_CW_200 1
PAIRED
GENOMIC
RANDOM
13591264
2065872128
ERP000190
ERS010540
ERX005100
ERR012840
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0008_CW_200 1
PAIRED
GENOMIC
RANDOM
20276091
3081965832
...
In [12]:
# Number of distinct rows left after dropping the per-file columns.
etl.nrows(tbl_ena_samples)
Out[12]:
124
In [19]:
# Attach the ENA metadata to each manifest row via the run accession.
# This is a left join, so every manifest row is kept; in the output below the
# run with no ENA match (ERR175544) carries None in all ENA columns.
# NOTE(review): the result has two columns named 'Sample' (manifest sample id
# and ENA sample accession), so later steps must select them by position.
tbl_combined_samples = etl.leftjoin(
    tbl_samples.cut('Cross', 'Clone', 'Sample', 'Run'),
    tbl_ena_samples,
    key='Run',
)
tbl_combined_samples.addrownumbers().displayall()
# Persist the joined table next to the inputs.
tbl_combined_samples.totsv('/data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/ena_samples.txt')
0|row
1|Cross
2|Clone
3|Sample
4|Run
5|Study
6|Sample
7|Experiment
8|Organism
9|Instrument Platform
10|Instrument Model
11|Library Name
12|Library Layout
13|Library Source
14|Library Selection
15|Run Read Count
16|Run Base Count
1
HB3 x Dd2
HB3
PG0004-CW
ERR012788
ERP000190
ERS010539
ERX005179
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0004_CW_200 1
PAIRED
GENOMIC
RANDOM
13591264
2065872128
2
HB3 x Dd2
DD2
PG0008-CW
ERR012840
ERP000190
ERS010540
ERX005100
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0008_CW_200 1
PAIRED
GENOMIC
RANDOM
20276091
3081965832
3
HB3 x Dd2
QC23
PG0045-C
ERR012892
ERP000199
ERS010544
ERX005281
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0045_C_200 1
PAIRED
GENOMIC
RANDOM
19652775
2987221800
4
HB3 x Dd2
QC13
PG0016-C
ERR012895
ERP000199
ERS010541
ERX005283
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0016_C_200 1
PAIRED
GENOMIC
RANDOM
11709350
1779821200
5
HB3 x Dd2
7C170
PG0041-C
ERR015446
ERP000199
ERS009988
ERX005817
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG41-C-200 1 89067
PAIRED
GENOMIC
RANDOM
21541102
3274247504
6
HB3 x Dd2
GC03
PG0021-C
ERR015447
ERP000199
ERS009986
ERX005829
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG21-C-200 1 89069
PAIRED
GENOMIC
RANDOM
25454751
3869122152
7
HB3 x Dd2
7C183
PG0042-C
ERR015448
ERP000199
ERS009989
ERX005819
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG42-C-200 1 89066
PAIRED
GENOMIC
RANDOM
19451382
2956610064
8
HB3 x Dd2
1BB5
PG0023-C
ERR015449
ERP000199
ERS010581
ERX005822
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0023-C_200 113212
PAIRED
GENOMIC
RANDOM
13874263
2108887976
9
HB3 x Dd2
TC05
PG0027-C
ERR015450
ERP000199
ERS009992
ERX005821
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG27-C-200 1 89063
PAIRED
GENOMIC
RANDOM
19142976
2909732352
10
HB3 x Dd2
7C20
PG0037-C
ERR015451
ERP000199
ERS009997
ERX005816
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG37-C-200 1 89937
PAIRED
GENOMIC
RANDOM
13505157
2052783864
11
HB3 x Dd2
7C126
PG0047-C
ERR015452
ERP000199
ERS010545
ERX005828
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0047_C_200 1
PAIRED
GENOMIC
RANDOM
31260467
4751590984
12
HB3 x Dd2
QC34
PG0026-C
ERR015453
ERP000199
ERS009991
ERX005824
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG26-C-200 1 89064
PAIRED
GENOMIC
RANDOM
9386451
1426740552
13
HB3 x Dd2
7C140
PG0039-C
ERR015454
ERP000199
ERS009987
ERX005820
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG39-C-200 1 89068
PAIRED
GENOMIC
RANDOM
13113083
1993188616
14
HB3 x Dd2
7C16
PG0036-C
ERR015455
ERP000199
ERS009996
ERX005826
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG36-C-200 1 88950
PAIRED
GENOMIC
RANDOM
5014565
762213880
15
HB3 x Dd2
GC06
PG0028-C
ERR015456
ERP000199
ERS010580
ERX005818
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0028-C_200 113211
PAIRED
GENOMIC
RANDOM
16118219
2449969288
16
HB3 x Dd2
7C111
PG0038-C
ERR015457
ERP000199
ERS009998
ERX005823
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG38-C-200 1 89070
PAIRED
GENOMIC
RANDOM
24401190
3708980880
17
HB3 x Dd2
7C408
PG0031-C
ERR015458
ERP000199
ERS009993
ERX005825
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG31-C-200 1 88953
PAIRED
GENOMIC
RANDOM
10601637
1611448824
18
HB3 x Dd2
7C421
PG0043-C
ERR015459
ERP000199
ERS009990
ERX005827
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG43-C-200 1 89065
PAIRED
GENOMIC
RANDOM
26938658
4094676016
19
HB3 x Dd2
B4R3
PG0018-C
ERR019042
ERP000199
ERS009985
ERX007552
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG18-C-200 1 89936
PAIRED
GENOMIC
RANDOM
19618379
2981993608
20
HB3 x Dd2
7C424
PG0044-C
ERR019043
ERP000199
ERS010144
ERX007553
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0044-C 138663
PAIRED
GENOMIC
RANDOM
30075256
4571438912
21
HB3 x Dd2
B1SD
PG0015-C
ERR019044
ERP000199
ERS010147
ERX007548
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0015-C 138666
PAIRED
GENOMIC
RANDOM
18646034
2834197168
22
HB3 x Dd2
SC01
PG0025-C
ERR019045
ERP000199
ERS010151
ERX007555
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0025-C 138669
PAIRED
GENOMIC
RANDOM
28577692
4343809184
23
HB3 x Dd2
7C188
PG0030-C
ERR019046
ERP000199
ERS010135
ERX007551
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0030-C 138649
PAIRED
GENOMIC
RANDOM
31544237
4794724024
24
HB3 x Dd2
7C3
PG0034-C
ERR019047
ERP000199
ERS010136
ERX007557
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0034-C 138650
PAIRED
GENOMIC
RANDOM
27320404
4152701408
25
HB3 x Dd2
7C101
PG0074-C
ERR019048
ERP000199
ERS010137
ERX007556
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0074-C 138651
PAIRED
GENOMIC
RANDOM
22748694
3457801488
26
HB3 x Dd2
7C7
PG0048-C
ERR019049
ERP000199
ERS010138
ERX007550
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0048-C 138652
PAIRED
GENOMIC
RANDOM
24661508
3748549216
27
HB3 x Dd2
QC01
PG0017-C
ERR019050
ERP000199
ERS010145
ERX007547
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0017-C 138664
PAIRED
GENOMIC
RANDOM
20048933
3047437816
28
HB3 x Dd2
SC05
PG0019-C
ERR019051
ERP000199
ERS010146
ERX007549
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0019-C 138665
PAIRED
GENOMIC
RANDOM
16951497
2576627544
29
HB3 x Dd2
TC08
PG0020-C
ERR019052
ERP000199
ERS010148
ERX007554
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0020-C 138667
PAIRED
GENOMIC
RANDOM
23767530
3612664560
30
HB3 x Dd2
3BD5
PG0024-C
ERR019053
ERP000199
ERS010150
ERX007558
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0024-C 138668
PAIRED
GENOMIC
RANDOM
15776947
2398095944
31
3D7 x HB3
HB3
PG0052-C
ERR019054
ERP000200
ERS010000
ERX007566
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG052-C-200 89368
PAIRED
GENOMIC
RANDOM
16414335
2494978920
32
3D7 x HB3
C06
PG0069-C
ERR019055
ERP000200
ERS010001
ERX007573
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG069-C-200 89369
PAIRED
GENOMIC
RANDOM
21797779
3313262408
33
3D7 x HB3
C07
PG0070-C
ERR019056
ERP000200
ERS010002
ERX007574
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG070-C-200 89370
PAIRED
GENOMIC
RANDOM
23027954
3500249008
34
3D7 x HB3
C09
PG0072-C
ERR019057
ERP000200
ERS010003
ERX007578
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG072-C-200 89371
PAIRED
GENOMIC
RANDOM
27803347
4226108744
35
3D7 x HB3
C14
PG0060-C
ERR019058
ERP000200
ERS010006
ERX007577
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG060-C-200 89374
PAIRED
GENOMIC
RANDOM
16413305
2494822360
36
3D7 x HB3
C04
PG0061-C
ERR019059
ERP000200
ERS010007
ERX007568
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG061-C-200 89375
PAIRED
GENOMIC
RANDOM
26407638
4013960976
37
3D7 x HB3
C10
PG0063-C
ERR019060
ERP000200
ERS010008
ERX007572
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG063-C-200 89376
PAIRED
GENOMIC
RANDOM
17421950
2648136400
38
3D7 x HB3
3D7
PG0051-C
ERR019061
ERP000200
ERS009999
ERX007569
Plasmodium falciparum 3D7
ILLUMINA
Illumina Genome Analyzer II
PG051-C-200 89367
PAIRED
GENOMIC
RANDOM
19372784
2944663168
39
3D7 x HB3
C13
PG0054-C
ERR019062
ERP000200
ERS010004
ERX007564
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG054-C-200 89372
PAIRED
GENOMIC
RANDOM
15521956
2359337312
40
3D7 x HB3
C12
PG0058-C
ERR019063
ERP000200
ERS010005
ERX007576
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG058-C-200 89373
PAIRED
GENOMIC
RANDOM
8226236
1250387872
41
3D7 x HB3
C01
PG0065-C
ERR019064
ERP000200
ERS010009
ERX007567
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG065-C-200 89377
PAIRED
GENOMIC
RANDOM
26009794
3953488688
42
3D7 x HB3
C05
PG0068-C
ERR019065
ERP000200
ERS010010
ERX007579
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG068-C-200 89378
PAIRED
GENOMIC
RANDOM
6563653
997675256
43
3D7 x HB3
C02
PG0055-C
ERR019066
ERP000200
ERS010011
ERX007561
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG055-C_200 89379
PAIRED
GENOMIC
RANDOM
16350891
2485335432
44
3D7 x HB3
C02
PG0053-C
ERR019067
ERP000200
ERS010012
ERX007559
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG53-C_200 113389
PAIRED
GENOMIC
RANDOM
12185112
1852137024
45
3D7 x HB3
C02
PG0056-C
ERR019068
ERP000200
ERS010013
ERX007563
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG56-C_200 113390
PAIRED
GENOMIC
RANDOM
13963526
2122455952
46
3D7 x HB3
C15
PG0057-C
ERR019069
ERP000200
ERS010014
ERX007580
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG57-C_200 113391
PAIRED
GENOMIC
RANDOM
9324276
1417289952
47
3D7 x HB3
C01
PG0062-C
ERR019070
ERP000200
ERS010015
ERX007575
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG62-C_200 113393
PAIRED
GENOMIC
RANDOM
18348018
2788898736
48
3D7 x HB3
C11
PG0064-C
ERR019071
ERP000200
ERS010016
ERX007570
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG64-C_200 113394
PAIRED
GENOMIC
RANDOM
8025849
1219929048
49
3D7 x HB3
C03
PG0066-C
ERR019072
ERP000200
ERS010017
ERX007560
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG66-C_200 113395
PAIRED
GENOMIC
RANDOM
13171442
2002059184
50
3D7 x HB3
C02
PG0067-C
ERR019073
ERP000200
ERS010018
ERX007565
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG67-C_200 113396
PAIRED
GENOMIC
RANDOM
20862274
3171065648
51
3D7 x HB3
C08
PG0071-C
ERR019074
ERP000200
ERS010019
ERX007571
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG71-C_200 113397
PAIRED
GENOMIC
RANDOM
20206948
3071456096
52
7G8 x GB4
7G8
PG0083-C
ERR027099
ERP000190
ERS016318
ERX009951
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0083--C 1361748
PAIRED
GENOMIC
RANDOM
14577812
2215827424
53
7G8 x GB4
GB4
PG0084-C
ERR027100
ERP000190
ERS016319
ERX009943
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0084-C 1361749
PAIRED
GENOMIC
RANDOM
17511851
2661801352
54
7G8 x GB4
WE2
PG0085-C
ERR027101
ERP000190
ERS016320
ERX009935
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0085-C 1361750
PAIRED
GENOMIC
RANDOM
21480480
3265032960
55
7G8 x GB4
JF6
PG0079-C
ERR027102
ERP000190
ERS016326
ERX009920
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0079-C 1361756
PAIRED
GENOMIC
RANDOM
29670560
4509925120
56
7G8 x GB4
TF1
PG0080-C
ERR027103
ERP000190
ERS016327
ERX009948
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0080-C 1361757
PAIRED
GENOMIC
RANDOM
12134419
1844431688
57
7G8 x GB4
NIC
PG0095-C
ERR027107
ERP000190
ERS016694
ERX009950
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0095-C 1375572
PAIRED
GENOMIC
RANDOM
11465600
1742771200
58
7G8 x GB4
NF10
PG0096-C
ERR027108
ERP000190
ERS016695
ERX009923
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0096-C 1375573
PAIRED
GENOMIC
RANDOM
12428858
1889186416
59
7G8 x GB4
WF12
PG0097-C
ERR027109
ERP000190
ERS016696
ERX009921
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0097-C 1375574
PAIRED
GENOMIC
RANDOM
18037923
2741764296
60
7G8 x GB4
DAN
PG0098-C
ERR027110
ERP000190
ERS016697
ERX009930
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0098-C 1375575
PAIRED
GENOMIC
RANDOM
23120028
3514244256
61
7G8 x GB4
KH7
PG0088-C
ERR027111
ERP000190
ERS016323
ERX009944
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0088-C 1361753
PAIRED
GENOMIC
RANDOM
15825010
2405401520
62
7G8 x GB4
KA6
PG0091-C
ERR027117
ERP000190
ERS016690
ERX009952
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0091-C 1375568
PAIRED
GENOMIC
RANDOM
13598392
2066955584
63
7G8 x GB4
LA10
PG0086-C
ERR029090
ERP000190
ERS016321
ERX010157
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0086-C 1375614
PAIRED
GENOMIC
RANDOM
19467950
2959128400
64
7G8 x GB4
JB8
PG0087-C
ERR029091
ERP000190
ERS016322
ERX010161
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0087-C 1375615
PAIRED
GENOMIC
RANDOM
16751120
2546170240
65
7G8 x GB4
QF5
PG0078-C
ERR029092
ERP000190
ERS016325
ERX010152
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0078-C 1375616
PAIRED
GENOMIC
RANDOM
23859076
3626579552
66
7G8 x GB4
WC4
PG0082-C
ERR029093
ERP000190
ERS016329
ERX010164
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0082-C 1375617
PAIRED
GENOMIC
RANDOM
12751131
1938171912
67
7G8 x GB4
XB3
PG0093-C
ERR029105
ERP000190
ERS016692
ERX010160
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0093-C 1375737
PAIRED
GENOMIC
RANDOM
35340883
5371814216
68
7G8 x GB4
XF12
PG0102-C
ERR029143
ERP000190
ERS016787
ERX010193
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0102-C 1399445
PAIRED
GENOMIC
RANDOM
22988529
3494256408
69
7G8 x GB4
XD8
PG0105-C
ERR029144
ERP000190
ERS016790
ERX010196
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0105-C 1399448
PAIRED
GENOMIC
RANDOM
19983456
3037485312
70
7G8 x GB4
JB12
PG0099-C
ERR029146
ERP000190
ERS016784
ERX010192
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0099--C 1399442
PAIRED
GENOMIC
RANDOM
19858507
3018493064
71
7G8 x GB4
KB8
PG0104-C
ERR029148
ERP000190
ERS016789
ERX010191
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0104-C 1399447
PAIRED
GENOMIC
RANDOM
19049292
2895492384
72
7G8 x GB4
JE11
PG0100-C
ERR029404
ERP000190
ERS016785
ERX010255
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0100-C 1399443
PAIRED
GENOMIC
RANDOM
22407082
3405876464
73
7G8 x GB4
XG10
PG0109-C
ERR029405
ERP000190
ERS016794
ERX010261
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0109-C 1413751
PAIRED
GENOMIC
RANDOM
10174810
1546571120
74
7G8 x GB4
AUD
PG0112-C
ERR029406
ERP000190
ERS016797
ERX010262
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0112-C 1413754
PAIRED
GENOMIC
RANDOM
21456241
3261348632
75
7G8 x GB4
XE7
PG0106-C
ERR029407
ERP000190
ERS016791
ERX010253
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0106-C 1399449
PAIRED
GENOMIC
RANDOM
41547346
6315196592
76
7G8 x GB4
JON
PG0107-C
ERR029408
ERP000190
ERS016792
ERX010265
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0107-C 1399450
PAIRED
GENOMIC
RANDOM
29930708
4549467616
77
7G8 x GB4
JC9
PG0111-C
ERR029409
ERP000190
ERS016796
ERX010258
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0111-C 1413753
PAIRED
GENOMIC
RANDOM
20311874
3087404848
78
HB3 x Dd2
CH3_116
PG0032-Cx
ERR037703
ERP000199
ERS023540
ERX015006
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
2334345
PAIRED
GENOMIC
RANDOM
32249095
4837364250
79
HB3 x Dd2
7C12
PG0035-Cx
ERR037704
ERP000199
ERS023541
ERX015005
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
2334346
PAIRED
GENOMIC
RANDOM
105158627
15773794050
80
7G8 x GB4
AL2
PG0103-CW
ERR045627
ERP000190
ERS035455
ERX022689
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
2735069
PAIRED
GENOMIC
RANDOM
20911727
3136759050
81
7G8 x GB4
XD8
PG0105-CW
ERR045628
ERP000190
ERS035465
ERX022690
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
2735070
PAIRED
GENOMIC
RANDOM
20011618
3001742700
82
7G8 x GB4
LA10
PG0086-CW
ERR045629
ERP000190
ERS035475
ERX022691
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
2735071
PAIRED
GENOMIC
RANDOM
10905660
1635849000
83
7G8 x GB4
JE11
PG0100-CW
ERR045630
ERP000190
ERS035485
ERX022692
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
2735072
PAIRED
GENOMIC
RANDOM
9082756
1362413400
84
7G8 x GB4
NIC
PG0095-CW
ERR045631
ERP000190
ERS035495
ERX022693
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
2735073
PAIRED
GENOMIC
RANDOM
13152821
1972923150
85
7G8 x GB4
D2
PG0094-CW
ERR045632
ERP000190
ERS035504
ERX022694
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
2735074
PAIRED
GENOMIC
RANDOM
25333186
3799977900
86
7G8 x GB4
DEV
PG0081-CW
ERR045633
ERP000190
ERS035514
ERX022695
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
2735075
PAIRED
GENOMIC
RANDOM
14649909
2197486350
87
7G8 x GB4
JC9
PG0111-CW
ERR045634
ERP000190
ERS035528
ERX022696
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
2735076
PAIRED
GENOMIC
RANDOM
19748516
2962277400
88
7G8 x GB4
XF12
PG0102-CW
ERR045635
ERP000190
ERS035537
ERX022697
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
2735077
PAIRED
GENOMIC
RANDOM
15828845
2374326750
89
7G8 x GB4
JC3
PG0077-CW
ERR045636
ERP000190
ERS035291
ERX022698
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
2735123
PAIRED
GENOMIC
RANDOM
15768458
2365268700
90
7G8 x GB4
JF6
PG0079-CW
ERR045637
ERP000190
ERS035292
ERX022699
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
2735124
PAIRED
GENOMIC
RANDOM
15540616
2331092400
91
7G8 x GB4
QF5
PG0078-CW
ERR045638
ERP000190
ERS035293
ERX022700
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
2735125
PAIRED
GENOMIC
RANDOM
13504442
2025666300
92
7G8 x GB4
AUD
PG0112-CW
ERR045639
ERP000190
ERS035294
ERX022701
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
2735126
PAIRED
GENOMIC
RANDOM
14324763
2148714450
93
7G8 x GB4
KB8
PG0104-CW
ERR045642
ERP000190
ERS035323
ERX022704
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
2735129
PAIRED
GENOMIC
RANDOM
13319411
1997911650
94
HB3 x Dd2
D43
PG0029-Cx
ERR107474
ERP000190
ERS074129
ERX084908
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
4021941
PAIRED
GENOMIC
RANDOM
4500516
900103200
95
HB3 x Dd2
7C159
PG0040-Cx
ERR107475
ERP000190
ERS074130
ERX084909
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
4021942
PAIRED
GENOMIC
RANDOM
7477345
1495469000
96
HB3 x Dd2
7C46
PG0046-Cx
ERR107476
ERP000190
ERS074131
ERX084910
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
4021943
PAIRED
GENOMIC
RANDOM
7875308
1575061600
97
HB3 x Dd2
3BA6
PG0022-Cx
ERR126027
ERP000190
ERS088714
ERX102176
Plasmodium falciparum
ILLUMINA
Illumina HiSeq 2000
4432326
PAIRED
GENOMIC
RANDOM
4248156
849631200
98
HB3 x Dd2
CH3_61
PG0033-Cx
ERR175544
None
None
None
None
None
None
None
None
None
None
None
None
In [20]:
# Preview of the joined table, this time without the added row-number column.
tbl_combined_samples
Out[20]:
0|Cross
1|Clone
2|Sample
3|Run
4|Study
5|Sample
6|Experiment
7|Organism
8|Instrument Platform
9|Instrument Model
10|Library Name
11|Library Layout
12|Library Source
13|Library Selection
14|Run Read Count
15|Run Base Count
HB3 x Dd2
HB3
PG0004-CW
ERR012788
ERP000190
ERS010539
ERX005179
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0004_CW_200 1
PAIRED
GENOMIC
RANDOM
13591264
2065872128
HB3 x Dd2
DD2
PG0008-CW
ERR012840
ERP000190
ERS010540
ERX005100
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0008_CW_200 1
PAIRED
GENOMIC
RANDOM
20276091
3081965832
HB3 x Dd2
QC23
PG0045-C
ERR012892
ERP000199
ERS010544
ERX005281
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0045_C_200 1
PAIRED
GENOMIC
RANDOM
19652775
2987221800
HB3 x Dd2
QC13
PG0016-C
ERR012895
ERP000199
ERS010541
ERX005283
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG0016_C_200 1
PAIRED
GENOMIC
RANDOM
11709350
1779821200
HB3 x Dd2
7C170
PG0041-C
ERR015446
ERP000199
ERS009988
ERX005817
Plasmodium falciparum
ILLUMINA
Illumina Genome Analyzer II
PG41-C-200 1 89067
PAIRED
GENOMIC
RANDOM
21541102
3274247504
...
In [42]:
def _eva_sample_title(row):
    """Return the sample title '<Strain>/<Sample ID>/<Run>' for one table row."""
    return '%s/%s/%s' % (row['Strain'], row['Sample ID'], row['Run'])


# Build the EVA sample sheet from the joined manifest/ENA table.
tbl_eva_samples = (
    tbl_combined_samples
    # Position 5 is the ENA sample accession; it shares the name 'Sample' with
    # the manifest column at position 2, so the columns are picked by position.
    .cut(5, 0, 1, 2, 3)
    .setheader(['Sample Accession', 'Description', 'Strain', 'Sample ID', 'Run'])
    # 'Run' is only needed to compose the title and is removed right after.
    .addfield('Sample Title', _eva_sample_title)
    .cutout('Run')
    # The insertions below are order-dependent: each index refers to the
    # header as it stands after the previous insertion.
    .addfield('Sampleset Accession', '', index=1)
    .addfield(
        'Analysis Alias',
        'pfx-20141022-gatk,pfx-20141022-cortex,pfx-20141022-combined',
        index=2,
    )
    .addfield('Gender', '', index=4)
    .addfield('Links', '', index=5)
    .addfield('Attributes', '', index=6)
    .addfield('Phenotypes', '', index=7)
    .addfield('Disease Sites', '', index=8)
    .addfield('Breed', '', index=10)
)
tbl_eva_samples.totsv('../../EVA_samples.txt')
tbl_eva_samples
Out[42]:
0|Sample Accession
1|Sampleset Accession
2|Analysis Alias
3|Description
4|Gender
5|Links
6|Attributes
7|Phenotypes
8|Disease Sites
9|Strain
10|Breed
11|Sample ID
12|Sample Title
ERS010539
pfx-20141022-gatk,pfx-20141022-cortex,pfx-20141022-combined
HB3 x Dd2
HB3
PG0004-CW
HB3/PG0004-CW/ERR012788
ERS010540
pfx-20141022-gatk,pfx-20141022-cortex,pfx-20141022-combined
HB3 x Dd2
DD2
PG0008-CW
DD2/PG0008-CW/ERR012840
ERS010544
pfx-20141022-gatk,pfx-20141022-cortex,pfx-20141022-combined
HB3 x Dd2
QC23
PG0045-C
QC23/PG0045-C/ERR012892
ERS010541
pfx-20141022-gatk,pfx-20141022-cortex,pfx-20141022-combined
HB3 x Dd2
QC13
PG0016-C
QC13/PG0016-C/ERR012895
ERS009988
pfx-20141022-gatk,pfx-20141022-cortex,pfx-20141022-combined
HB3 x Dd2
7C170
PG0041-C
7C170/PG0041-C/ERR015446
...
In [56]:
import hashlib

eva_dir = '/data/plasmodium/pfalciparum/pf-crosses/data/public/20141022-eva/'


def _md5_hexdigest(path, chunk_size=1 << 20):
    """Return the hex MD5 digest of the file at `path`.

    The file is read in `chunk_size`-byte pieces so large VCFs are never held
    in memory at once. Any I/O error propagates instead of being recorded as a
    bogus checksum.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def _classify_eva_file(fn):
    """Return (analysis alias, file type) for file name `fn`, or None to skip it.

    The analysis alias is chosen by the first of 'gatk', 'cortex', 'combined'
    found in the name; the file type is 'vcf' for names ending in '.vcf.gz'
    and 'tabix' for names ending in '.tbi'. Anything else is skipped.
    """
    if 'gatk' in fn:
        analysis = 'pfx-20141022-gatk'
    elif 'cortex' in fn:
        analysis = 'pfx-20141022-cortex'
    elif 'combined' in fn:
        analysis = 'pfx-20141022-combined'
    else:
        return None
    if fn.endswith('.vcf.gz'):
        ft = 'vcf'
    elif fn.endswith('.tbi'):
        ft = 'tabix'
    else:
        return None
    return analysis, ft


# Collect one row per submission file. The checksum is computed in-process and
# only for files that pass the filters, rather than shelling out to md5sum for
# every directory entry.
file_rows = [['Analysis Alias', 'File Name', 'File Type', 'MD5']]
for fn in os.listdir(eva_dir):
    classification = _classify_eva_file(fn)
    if classification is None:
        continue
    analysis, ft = classification
    file_rows.append([analysis, fn, ft, _md5_hexdigest(os.path.join(eva_dir, fn))])

# Sort by analysis alias, then file name, so the output order is stable
# regardless of directory listing order.
tbl_files = etl.wrap(file_rows).sort((0, 1))
tbl_files.totsv('../../EVA_files.txt')
tbl_files.displayall()
0|Analysis Alias
1|File Name
2|File Type
3|MD5
pfx-20141022-combined
3d7_hb3.combined.final.vcf.gz
vcf
25a7d1a316295e25d1193b031ed3348e
pfx-20141022-combined
3d7_hb3.combined.final.vcf.gz.tbi
tabix
f4a461ceaa0e4d46557d049f1f80c064
pfx-20141022-combined
7g8_gb4.combined.final.vcf.gz
vcf
b39dee94fb8fb3d69434753cdd3370f5
pfx-20141022-combined
7g8_gb4.combined.final.vcf.gz.tbi
tabix
d6d5efeb6c5abd3caf5af31194cd7186
pfx-20141022-combined
hb3_dd2.combined.final.vcf.gz
vcf
b0d5b7a5eacb615b1f4a12e5a7856fab
pfx-20141022-combined
hb3_dd2.combined.final.vcf.gz.tbi
tabix
caedd35dca723e533bbce6ca6db1a5af
pfx-20141022-cortex
3d7_hb3.cortex.final.vcf.gz
vcf
188236e04c7768947192eebf7065c420
pfx-20141022-cortex
3d7_hb3.cortex.final.vcf.gz.tbi
tabix
ced20606b109126f92b9b3a59df18576
pfx-20141022-cortex
7g8_gb4.cortex.final.vcf.gz
vcf
46b27670948cc325233d649e6a9d577c
pfx-20141022-cortex
7g8_gb4.cortex.final.vcf.gz.tbi
tabix
5399e24b34cbfb360fb29d87a098a5fb
pfx-20141022-cortex
hb3_dd2.cortex.final.vcf.gz
vcf
66bd37eeedbcdfbb7fd0cb4288885563
pfx-20141022-cortex
hb3_dd2.cortex.final.vcf.gz.tbi
tabix
4dd018a3ab9388d460c89092bc4822fb
pfx-20141022-gatk
3d7_hb3.gatk.final.vcf.gz
vcf
922bb61fe92ce358a63bb83b85c6d5c5
pfx-20141022-gatk
3d7_hb3.gatk.final.vcf.gz.tbi
tabix
8f51f19913c365aebf537440e3121f48
pfx-20141022-gatk
7g8_gb4.gatk.final.vcf.gz
vcf
56c805623e024bd2b51422b4851af754
pfx-20141022-gatk
7g8_gb4.gatk.final.vcf.gz.tbi
tabix
d25e20e4d742d73e13e0c17a6c917c4c
pfx-20141022-gatk
hb3_dd2.gatk.final.vcf.gz
vcf
503ea320b05c31a87d17f217d2f06689
pfx-20141022-gatk
hb3_dd2.gatk.final.vcf.gz.tbi
tabix
044a2c7b8aaecbd3fbdaeace2b0dc346
In [ ]:
Content source: wtchg-kwiatkowski/pfx-paper-2015
Similar notebooks: