In [1]:
import glob
import pandas as pd
from collections import Counter
In [2]:
# Find the read files in the working directory and extract the numeric sample
# id from each name: the text before the first "_" with its first four
# characters (the "PLCL" prefix matched by the glob) stripped off.
files = glob.glob('PLCL*R1*')
sample_prefixes = {path.partition('_')[0] for path in files}
# Several files may share a prefix, hence the set: one id per sample.
existed_samples = [int(prefix[4:]) for prefix in sample_prefixes]
len(existed_samples)
Out[2]:
87
In [3]:
# Load the tab-separated sample sheet and discard its last two rows.
# NOTE(review): presumably those rows are non-sample footer lines - confirm
# against the file before relying on this.
df = (
    pd.read_csv('sample_template_GA_20161123.txt', sep='\t')
    .pipe(lambda sheet: sheet.drop(sheet.index[-2:]))
)
df
Out[3]:
sample_name
anonymized_name
description
elevation
description.1
env_biome
env_feature
env_material
env_package
host_common_name
...
Interval_course_of_antibiotics
Interval_surgery
Probiotics_Supplements_reported
PSC
change_in_therapy
Subsequent_surgery
Subsequent_response_to_therapy
Unnamed: 76
Unnamed: 77
Unnamed: 78
0
100001-01
17524.0
100001.0
193.0
stool sample 100001-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
1
100001-02
19615.0
100001.0
194.0
stool sample 100001-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
2
100005-01
17532.0
100005.0
195.0
stool sample 100005-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
3
100005-02
19620.0
100005.0
196.0
stool sample 100005-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
4
100016-01
17707.0
100016.0
197.0
stool sample 100016-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
5
100016-02
18757.0
100016.0
198.0
stool sample 100016-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
1
0
Missing: Not provided
NaN
NaN
NaN
6
100022-01
17855.0
100022.0
199.0
stool sample 100022-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
1
1
started on steroids and dose escalation of IFX
NaN
NaN
Developed ATI and anti-vedo ab
7
100022-02
19714.0
100022.0
200.0
stool sample 100022-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
1
1
Started on vedolizumab with clinical and endso...
NaN
NaN
NaN
8
100026-01
18249.0
100026.0
201.0
stool sample 100026-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
? Ileitis
NaN
NaN
NaN
9
100026-02
19017.0
100026.0
202.0
stool sample 100026-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
10
100026-03
21472.0
100026.0
203.0
stool sample 100026-03
urban biome
human-associated habitat
stool
human-gut
human
...
1
0
0
0
0
0
azithromycin
NaN
NaN
NaN
11
100029-01
18228.0
100029.0
204.0
stool sample 100029-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
1
0
started simponi (no baseline)
NaN
NaN
NaN
12
100029-02
19273.0
100029.0
205.0
stool sample 100029-02
urban biome
human-associated habitat
stool
human-gut
human
...
1
0
0
0
1
0
Missing: Not provided
NaN
NaN
NaN
13
100029-03
19276.0
100029.0
206.0
stool sample 100029-03
urban biome
human-associated habitat
stool
human-gut
human
...
1
0
0
0
1
0
started minocycline, stopped 2 months prior to...
NaN
NaN
NaN
14
100029-04
23408.0
100029.0
207.0
stool sample 100029-04
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
15
100056-01
18839.0
100056.0
208.0
stool sample 100056-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
16
100056-02
18818.0
100056.0
209.0
stool sample 100056-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
subsequent mild flare
NaN
NaN
NaN
17
100056-03
21521.0
100056.0
210.0
stool sample 100056-03
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
18
100056-04
23134.1
100056.0
211.0
stool sample 100056-04
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Flare --> subsequent increase in IFX
NaN
NaN
NaN
19
100059-01
18850.0
100059.0
212.0
stool sample 100059-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
1
0
On simponi + Pred, no response
NaN
NaN
NaN
20
100059-02_asterisk
20535.0
100059.0
213.0
stool sample 100059-02_asterisk
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
1
0
1
0
Started vedolizumab
NaN
NaN
NaN
21
100059-03
21561.0
100059.0
214.0
stool sample 100059-03
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
22
100059-04
25736.1
100059.0
215.0
stool sample 100059-04
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
23
100072-01
18302.0
100072.0
216.0
stool sample 100072-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
1
0
Flare, responded to steroid enema
NaN
NaN
NaN
24
100072-02
19995.0
100072.0
217.0
stool sample 100072-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
1
0
long course of uceris (steroid)
NaN
NaN
NaN
25
100072-03
22244.0
100072.0
218.0
stool sample 100072-03
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
26
100111-01
19600.0
100111.0
219.0
stool sample 100111-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
1
0
0
0
Missing: Not provided
NaN
NaN
NaN
27
100111-02
19730.0
100111.0
220.0
stool sample 100111-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
1
0
1
0
single dose of entyvio
NaN
NaN
NaN
28
100111-03
22998.0
100111.0
221.0
stool sample 100111-03
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
1
0
0
0
Missing: Not provided
NaN
NaN
NaN
29
100111-04
23466.2
100111.0
222.0
stool sample 100111-04
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
1
0
0
0
Missing: Not provided
NaN
NaN
NaN
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
321
100092-02
20935.0
100092.0
514.0
stool sample 100092-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
1
0
0
Missing: Not provided
NaN
NaN
NaN
322
100092-03
21394.0
100092.0
515.0
stool sample 100092-03
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
1
0
0
Missing: Not provided
NaN
NaN
NaN
323
100092-04
25732.1
100092.0
516.0
stool sample 100092-04
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
1
0
0
Missing: Not provided
NaN
NaN
NaN
324
100094-01
18801.0
100094.0
517.0
stool sample 100094-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
1
0
0
Missing: Not provided
NaN
NaN
NaN
325
100094-02
19616.0
100094.0
518.0
stool sample 100094-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
1
0
0
Missing: Not provided
NaN
NaN
NaN
326
100094-03
21450.0
100094.0
519.0
stool sample 100094-03
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
1
0
0
Missing: Not provided
NaN
NaN
NaN
327
100102-01
18899.0
100102.0
520.0
stool sample 100102-01
urban biome
human-associated habitat
stool
human-gut
human
...
1
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
328
100102-02
20975.0
100102.0
521.0
stool sample 100102-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
329
100102-03
22136.0
100102.0
522.0
stool sample 100102-03
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
330
100108-01
18794.0
100108.0
523.0
stool sample 100108-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
331
100108-02
20123.0
100108.0
524.0
stool sample 100108-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
332
100108-03
22140.0
100108.0
525.0
stool sample 100108-03
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
333
100108-04
25732.1
100108.0
526.0
stool sample 100108-04
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
334
100110-01
18250.0
100110.0
527.0
stool sample 100110-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
335
100110-02
20129.0
100110.0
528.0
stool sample 100110-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
336
100110-03
23082.0
100110.0
529.0
stool sample 100110-03
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
337
100115-01
19603.0
100115.0
530.0
stool sample 100115-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
338
100115-02
20431.0
100115.0
531.0
stool sample 100115-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
339
100115-03
23082.0
100115.0
532.0
stool sample 100115-03
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
340
100124-01
19580.0
100124.0
533.0
stool sample 100124-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
341
100124-02
20051.0
100124.0
534.0
stool sample 100124-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
342
100148-01
18894.0
100148.0
535.0
stool sample 100148-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
343
100148-02
20155.0
100148.0
536.0
stool sample 100148-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
344
100148-03
23046.0
100148.0
537.0
stool sample 100148-03
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
345
100156-01
19006.0
100156.0
538.0
stool sample 100156-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
346
100156-02
21514.0
100156.0
539.0
stool sample 100156-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
347
100156-03
25751.1
100156.0
540.0
stool sample 100156-03
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
348
100169-01
20003.0
100169.0
541.0
stool sample 100169-01
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
0
0
Missing: Not provided
NaN
NaN
NaN
349
100169-02
22654.0
100169.0
542.0
stool sample 100169-02
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
1
0
etrolizumab clinical trial
NaN
NaN
NaN
350
100169-03
22328.1
100169.0
543.0
stool sample 100169-03
urban biome
human-associated habitat
stool
human-gut
human
...
0
0
0
0
1
0
OL etrolizumab
NaN
NaN
NaN
351 rows × 79 columns
In [4]:
# Keep only the sample id and patient id columns and cast both to integers,
# so the sample ids can be matched against the integer ids in `existed_samples`.
# NOTE(review): the cast truncates fractional ids shown in the table above
# (e.g. 23134.1 -> 23134) - confirm that this is intended.
id_columns = ['anonymized_name', 'host_subject_id']
df = df.loc[:, id_columns].astype('int')
df.head()
Out[4]:
anonymized_name
host_subject_id
0
17524
100001
1
19615
100001
2
17532
100005
3
19620
100005
4
17707
100016
In [5]:
# Restrict the table to samples whose id was found among the globbed files.
has_reads = df['anonymized_name'].isin(existed_samples)
df = df[has_reads]
df
Out[5]:
anonymized_name
host_subject_id
3
19620
100005
7
19714
100022
9
19017
100026
10
21472
100026
12
19273
100029
13
19276
100029
19
18850
100059
21
21561
100059
30
19621
100119
36
20093
100151
48
18791
100006
50
19219
100007
51
21419
100007
54
18281
100008
58
17528
100009
63
17710
100017
64
20166
100017
66
20099
100018
70
19621
100019
71
19707
100019
72
21485
100019
75
19726
100023
78
21555
100028
80
19718
100031
83
19700
100032
86
18283
100033
88
21385
100038
94
19990
100045
96
19699
100046
99
18886
100046
...
...
...
278
21559
100167
279
20981
100181
280
21376
100181
285
19278
100201
287
21433
100207
289
21384
100212
292
21429
100214
295
17693
100021
296
21445
100021
297
18257
100034
298
19024
100034
301
18867
100051
302
19628
100051
305
18290
100055
306
19866
100055
307
19246
100055
308
18295
100076
311
18883
100080
312
20980
100080
315
18291
100082
316
18800
100082
320
19011
100092
321
20935
100092
330
18794
100108
334
18250
100110
337
19603
100115
340
19580
100124
341
20051
100124
342
18894
100148
345
19006
100156
86 rows × 2 columns
In [6]:
# Distribution of "samples per patient": first count the samples of every
# patient, then count how many patients share each sample count.
samples_per_patient = Counter(df['host_subject_id'])
patients_per_sample_count = Counter(samples_per_patient.values())
# Report in ascending order of the number of samples.
for num_samples in sorted(patients_per_sample_count):
    num_patients = patients_per_sample_count[num_samples]
    print("{} patients with {} samples".format(num_patients, num_samples))
40 patients with 1 samples
18 patients with 2 samples
2 patients with 3 samples
1 patients with 4 samples
In [15]:
# Select the three patients that have the most samples.
patient_sample_counts = Counter(df['host_subject_id'])
selected_patients = [
    patient for patient, _n_samples in patient_sample_counts.most_common(3)
]
selected_patients
Out[15]:
[100013, 100019, 100055]
In [19]:
# Rows of the selected patients, with every value converted to a string so
# that suffixes can be appended to the sample names later on.
from_selected_patient = df['host_subject_id'].isin(selected_patients)
selected_df = df.loc[from_selected_patient].astype('str')
selected_df
Out[19]:
anonymized_name
host_subject_id
70
19621
100019
71
19707
100019
72
21485
100019
248
17712
100013
249
18303
100013
250
19708
100013
251
17712
100013
305
18290
100055
306
19866
100055
307
19246
100055
In [20]:
# Rows 248 and 251 carry the same anonymized name (17712) for patient 100013;
# tag them with distinct suffixes so the two samples stay distinguishable.
#
# The write goes through a single `.loc[row, column]` call. The previous
# chained form (`.loc[row][column] += ...`) assigns into a temporary row
# Series, which pandas does not guarantee to propagate back to the frame and
# which is a silent no-op under Copy-on-Write.
# The `endswith` guard keeps the cell idempotent: re-running it does not
# append the suffix a second time.
for row_label, suffix in ((248, '_A'), (251, '_B')):
    current_name = selected_df.loc[row_label, 'anonymized_name']
    if not current_name.endswith(suffix):
        selected_df.loc[row_label, 'anonymized_name'] = current_name + suffix
selected_df
Out[20]:
anonymized_name
host_subject_id
70
19621
100019
71
19707
100019
72
21485
100019
248
17712_A
100013
249
18303
100013
250
19708
100013
251
17712_B
100013
305
18290
100055
306
19866
100055
307
19246
100055
In [38]:
# Write one text file per selected patient, listing that patient's sample
# names one per line.
# NOTE(review): the output directory is a hard-coded absolute home path -
# consider making it configurable.
for patient_id in selected_patients:
    out_path = "".join(["/home/makseshina/", str(patient_id), '_samples.txt'])
    # `selected_df` holds strings, so compare against the stringified id.
    belongs_to_patient = selected_df['host_subject_id'] == str(patient_id)
    sample_names = selected_df.loc[belongs_to_patient, 'anonymized_name']
    with open(out_path, 'w') as f:
        for sample_name in sample_names:
            f.write(sample_name)
            f.write('\n')
In [ ]:
Content source: snurk/meta-strains
Similar notebooks: