In [1]:
import glob
import pandas as pd
from collections import Counter

In [2]:
# List the files in the working directory matching 'PLCL*R1*' and derive one
# integer sample id per distinct file-name prefix: take the text before the
# first underscore and drop its first four characters (presumably the 'PLCL'
# tag — verify against the actual file names).
files = glob.glob('PLCL*R1*')
sample_prefixes = {f_name.split('_')[0] for f_name in files}
existed_samples = [int(prefix[4:]) for prefix in sample_prefixes]
len(existed_samples)


Out[2]:
87

In [3]:
# Load the tab-separated sample template, then discard its last two rows
# (presumably trailer lines that are not real samples — TODO confirm).
raw_template = pd.read_csv('sample_template_GA_20161123.txt', sep='\t')
trailing_labels = raw_template.index[-2:]
df = raw_template.drop(trailing_labels)
df


Out[3]:
sample_name anonymized_name description elevation description.1 env_biome env_feature env_material env_package host_common_name ... Interval_course_of_antibiotics Interval_surgery Probiotics_Supplements_reported PSC change_in_therapy Subsequent_surgery Subsequent_response_to_therapy Unnamed: 76 Unnamed: 77 Unnamed: 78
0 100001-01 17524.0 100001.0 193.0 stool sample 100001-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
1 100001-02 19615.0 100001.0 194.0 stool sample 100001-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
2 100005-01 17532.0 100005.0 195.0 stool sample 100005-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
3 100005-02 19620.0 100005.0 196.0 stool sample 100005-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
4 100016-01 17707.0 100016.0 197.0 stool sample 100016-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
5 100016-02 18757.0 100016.0 198.0 stool sample 100016-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 1 0 Missing: Not provided NaN NaN NaN
6 100022-01 17855.0 100022.0 199.0 stool sample 100022-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 1 1 started on steroids and dose escalation of IFX NaN NaN Developed ATI and anti-vedo ab
7 100022-02 19714.0 100022.0 200.0 stool sample 100022-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 1 1 Started on vedolizumab with clinical and endso... NaN NaN NaN
8 100026-01 18249.0 100026.0 201.0 stool sample 100026-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 ? Ileitis NaN NaN NaN
9 100026-02 19017.0 100026.0 202.0 stool sample 100026-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
10 100026-03 21472.0 100026.0 203.0 stool sample 100026-03 urban biome human-associated habitat stool human-gut human ... 1 0 0 0 0 0 azithromycin NaN NaN NaN
11 100029-01 18228.0 100029.0 204.0 stool sample 100029-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 1 0 started simponi (no baseline) NaN NaN NaN
12 100029-02 19273.0 100029.0 205.0 stool sample 100029-02 urban biome human-associated habitat stool human-gut human ... 1 0 0 0 1 0 Missing: Not provided NaN NaN NaN
13 100029-03 19276.0 100029.0 206.0 stool sample 100029-03 urban biome human-associated habitat stool human-gut human ... 1 0 0 0 1 0 started minocycline, stopped 2 months prior to... NaN NaN NaN
14 100029-04 23408.0 100029.0 207.0 stool sample 100029-04 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
15 100056-01 18839.0 100056.0 208.0 stool sample 100056-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
16 100056-02 18818.0 100056.0 209.0 stool sample 100056-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 subsequent mild flare NaN NaN NaN
17 100056-03 21521.0 100056.0 210.0 stool sample 100056-03 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
18 100056-04 23134.1 100056.0 211.0 stool sample 100056-04 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Flare --> subsequent increase in IFX NaN NaN NaN
19 100059-01 18850.0 100059.0 212.0 stool sample 100059-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 1 0 On simponi + Pred, no response NaN NaN NaN
20 100059-02_asterisk 20535.0 100059.0 213.0 stool sample 100059-02_asterisk urban biome human-associated habitat stool human-gut human ... 0 0 1 0 1 0 Started vedolizumab NaN NaN NaN
21 100059-03 21561.0 100059.0 214.0 stool sample 100059-03 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
22 100059-04 25736.1 100059.0 215.0 stool sample 100059-04 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
23 100072-01 18302.0 100072.0 216.0 stool sample 100072-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 1 0 Flare, responded to steroid enema NaN NaN NaN
24 100072-02 19995.0 100072.0 217.0 stool sample 100072-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 1 0 long course of uceris (steroid) NaN NaN NaN
25 100072-03 22244.0 100072.0 218.0 stool sample 100072-03 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
26 100111-01 19600.0 100111.0 219.0 stool sample 100111-01 urban biome human-associated habitat stool human-gut human ... 0 0 1 0 0 0 Missing: Not provided NaN NaN NaN
27 100111-02 19730.0 100111.0 220.0 stool sample 100111-02 urban biome human-associated habitat stool human-gut human ... 0 0 1 0 1 0 single dose of entyvio NaN NaN NaN
28 100111-03 22998.0 100111.0 221.0 stool sample 100111-03 urban biome human-associated habitat stool human-gut human ... 0 0 1 0 0 0 Missing: Not provided NaN NaN NaN
29 100111-04 23466.2 100111.0 222.0 stool sample 100111-04 urban biome human-associated habitat stool human-gut human ... 0 0 1 0 0 0 Missing: Not provided NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
321 100092-02 20935.0 100092.0 514.0 stool sample 100092-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 1 0 0 Missing: Not provided NaN NaN NaN
322 100092-03 21394.0 100092.0 515.0 stool sample 100092-03 urban biome human-associated habitat stool human-gut human ... 0 0 0 1 0 0 Missing: Not provided NaN NaN NaN
323 100092-04 25732.1 100092.0 516.0 stool sample 100092-04 urban biome human-associated habitat stool human-gut human ... 0 0 0 1 0 0 Missing: Not provided NaN NaN NaN
324 100094-01 18801.0 100094.0 517.0 stool sample 100094-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 1 0 0 Missing: Not provided NaN NaN NaN
325 100094-02 19616.0 100094.0 518.0 stool sample 100094-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 1 0 0 Missing: Not provided NaN NaN NaN
326 100094-03 21450.0 100094.0 519.0 stool sample 100094-03 urban biome human-associated habitat stool human-gut human ... 0 0 0 1 0 0 Missing: Not provided NaN NaN NaN
327 100102-01 18899.0 100102.0 520.0 stool sample 100102-01 urban biome human-associated habitat stool human-gut human ... 1 0 0 0 0 0 Missing: Not provided NaN NaN NaN
328 100102-02 20975.0 100102.0 521.0 stool sample 100102-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
329 100102-03 22136.0 100102.0 522.0 stool sample 100102-03 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
330 100108-01 18794.0 100108.0 523.0 stool sample 100108-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
331 100108-02 20123.0 100108.0 524.0 stool sample 100108-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
332 100108-03 22140.0 100108.0 525.0 stool sample 100108-03 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
333 100108-04 25732.1 100108.0 526.0 stool sample 100108-04 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
334 100110-01 18250.0 100110.0 527.0 stool sample 100110-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
335 100110-02 20129.0 100110.0 528.0 stool sample 100110-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
336 100110-03 23082.0 100110.0 529.0 stool sample 100110-03 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
337 100115-01 19603.0 100115.0 530.0 stool sample 100115-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
338 100115-02 20431.0 100115.0 531.0 stool sample 100115-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
339 100115-03 23082.0 100115.0 532.0 stool sample 100115-03 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
340 100124-01 19580.0 100124.0 533.0 stool sample 100124-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
341 100124-02 20051.0 100124.0 534.0 stool sample 100124-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
342 100148-01 18894.0 100148.0 535.0 stool sample 100148-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
343 100148-02 20155.0 100148.0 536.0 stool sample 100148-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
344 100148-03 23046.0 100148.0 537.0 stool sample 100148-03 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
345 100156-01 19006.0 100156.0 538.0 stool sample 100156-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
346 100156-02 21514.0 100156.0 539.0 stool sample 100156-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
347 100156-03 25751.1 100156.0 540.0 stool sample 100156-03 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
348 100169-01 20003.0 100169.0 541.0 stool sample 100169-01 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 0 0 Missing: Not provided NaN NaN NaN
349 100169-02 22654.0 100169.0 542.0 stool sample 100169-02 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 1 0 etrolizumab clinical trial NaN NaN NaN
350 100169-03 22328.1 100169.0 543.0 stool sample 100169-03 urban biome human-associated habitat stool human-gut human ... 0 0 0 0 1 0 OL etrolizumab NaN NaN NaN

351 rows × 79 columns


In [4]:
# Keep only the sample id and patient id columns and cast both to integers.
# NOTE(review): some anonymized_name values are fractional in the loaded table
# (e.g. 23134.1); the integer cast truncates them, so distinct samples may end
# up sharing an id — confirm this is intended.
id_columns = ['anonymized_name', 'host_subject_id']
df = df.loc[:, id_columns].astype('int')
df.head()


Out[4]:
anonymized_name host_subject_id
0 17524 100001
1 19615 100001
2 17532 100005
3 19620 100005
4 17707 100016

In [5]:
# Keep only the rows whose anonymized_name is one of the sample ids that were
# derived from the file names found on disk.
has_file = df['anonymized_name'].isin(existed_samples)
df = df[has_file]
df


Out[5]:
anonymized_name host_subject_id
3 19620 100005
7 19714 100022
9 19017 100026
10 21472 100026
12 19273 100029
13 19276 100029
19 18850 100059
21 21561 100059
30 19621 100119
36 20093 100151
48 18791 100006
50 19219 100007
51 21419 100007
54 18281 100008
58 17528 100009
63 17710 100017
64 20166 100017
66 20099 100018
70 19621 100019
71 19707 100019
72 21485 100019
75 19726 100023
78 21555 100028
80 19718 100031
83 19700 100032
86 18283 100033
88 21385 100038
94 19990 100045
96 19699 100046
99 18886 100046
... ... ...
278 21559 100167
279 20981 100181
280 21376 100181
285 19278 100201
287 21433 100207
289 21384 100212
292 21429 100214
295 17693 100021
296 21445 100021
297 18257 100034
298 19024 100034
301 18867 100051
302 19628 100051
305 18290 100055
306 19866 100055
307 19246 100055
308 18295 100076
311 18883 100080
312 20980 100080
315 18291 100082
316 18800 100082
320 19011 100092
321 20935 100092
330 18794 100108
334 18250 100110
337 19603 100115
340 19580 100124
341 20051 100124
342 18894 100148
345 19006 100156

86 rows × 2 columns


In [6]:
# Histogram of samples per patient: first count the rows for each
# host_subject_id, then count how many patients share each row count, and
# print the result in increasing order of samples-per-patient.
samples_per_patient = Counter(df['host_subject_id'])
patients_per_sample_count = Counter(samples_per_patient.values())
for num_samples in sorted(patients_per_sample_count):
    num_patients = patients_per_sample_count[num_samples]
    print("{} patients with {} samples".format(num_patients, num_samples))


40 patients with 1 samples
18 patients with 2 samples
2 patients with 3 samples
1 patients with 4 samples

In [15]:
# Select the three patients that have the most rows (samples) in df.
top_three = Counter(df['host_subject_id']).most_common(3)
selected_patients = [patient_id for patient_id, _n_samples in top_three]
selected_patients


Out[15]:
[100013, 100019, 100055]

In [19]:
# Rows belonging to the selected patients, with every value converted to a
# string so that suffixes can be appended to the sample names afterwards.
is_selected = df['host_subject_id'].isin(selected_patients)
selected_df = df.loc[is_selected].astype('str')
selected_df


Out[19]:
anonymized_name host_subject_id
70 19621 100019
71 19707 100019
72 21485 100019
248 17712 100013
249 18303 100013
250 19708 100013
251 17712 100013
305 18290 100055
306 19866 100055
307 19246 100055

In [20]:
# Rows 248 and 251 carry the same anonymized_name, so tag them with '_A' / '_B'
# to keep the two samples distinguishable.
#
# The assignment goes through a single .loc[row, column] call: the chained form
# selected_df.loc[row][column] += ... writes to an intermediate Series, which
# only reaches the frame when pandas happens to return a view and is silently
# lost otherwise (always lost under Copy-on-Write).
#
# The endswith() guard makes the cell safe to re-run: a name that already has
# its suffix is left alone instead of growing to e.g. '17712_A_A'.
for row_label, suffix in ((248, '_A'), (251, '_B')):
    current_name = selected_df.loc[row_label, 'anonymized_name']
    if not current_name.endswith(suffix):
        selected_df.loc[row_label, 'anonymized_name'] = current_name + suffix
selected_df


Out[20]:
anonymized_name host_subject_id
70 19621 100019
71 19707 100019
72 21485 100019
248 17712_A 100013
249 18303 100013
250 19708 100013
251 17712_B 100013
305 18290 100055
306 19866 100055
307 19246 100055

In [38]:
# Write one text file per selected patient, listing that patient's sample
# names one per line. host_subject_id holds strings in selected_df, hence the
# comparison against str(patient).
for patient in selected_patients:
    out_path = "/home/makseshina/" + str(patient) + '_samples.txt'
    with open(out_path, 'w') as out_file:
        patient_rows = selected_df.loc[selected_df['host_subject_id'] == str(patient)]
        for sample_name in patient_rows['anonymized_name']:
            out_file.write(sample_name + '\n')

In [ ]: