author: lukethompson@gmail.com
date: 5 October 2017
language: Python 3.5
conda environment: emp-py3
license: BSD3
ORDER OF SCRIPTS:
STUDIES WITH MAPPING FILES PROCESSED IN STEP 2 NOTEBOOK THAT REQUIRE SPECIAL HANDLING:
In [ ]:
# from qiita_db.study import Study
# from shutil import copy
# from os import mkdir
# ffp = '/home/qiita/emp-sample-info-files'
# study_ids = [ 550, 632, 638, 659, 662, 678, 713, 714, 722, 723,
# 755, 776, 804, 805, 807, 808, 809, 810, 829, 846,
# 861, 864, 865, 889, 894, 895, 905, 910, 925, 933,
# 940, 945, 958, 963, 990, 1001, 1024, 1030, 1031, 1033,
# 1034, 1035, 1036, 1037, 1038, 1039, 1041, 1043, 1056, 1064,
# 1098, 1197, 1198, 1222, 1235, 1240, 1242, 1288, 1289, 1453,
# 1481, 1521, 1526, 1578, 1579, 1580, 1621, 1622, 1627, 1632,
# 1642, 1665, 1673, 1674, 1692, 1694, 1696, 1702, 1711, 1713,
# 1714, 1715, 1716, 1717, 1721, 1734, 1736, 1747, 1748, 1773,
# 1774, 1795, 1799, 1883, 1889, 2080, 2182, 2192, 2229, 2300,
# 2318, 2338, 2382, 10145, 10146, 10156, 10171, 10172, 10180, 10245,
# 10246, 10247, 10248, 10273, 10278, 10308, 10323, 10346, 10363, 10522,
# 10533, 10581]
# studies = [Study(s) for s in study_ids]
# mkdir(ffp)
# [copy(s.sample_template.get_filepaths()[0][1], ffp)
# for s in studies if s.sample_template is not None]
In [1]:
import pandas as pd
import numpy as np
In [2]:
# Widen pandas' console display so the wide metadata tables print in full.
for _opt, _val in [("display.max_rows", 200), ("display.max_columns", 80)]:
    pd.set_option(_opt, _val)
In [3]:
# Input locations -- everything lives under the metadata-refine data directory.
_refine_dir = '../../data/metadata-refine'
# Refined EMP master mapping file (date-stamped placeholder name).
path_refined = _refine_dir + '/emp_qiime_mapping_refined_YYYYMMDD.tsv'
# List of the 112 candidate study IDs to refine.
path_ids = _refine_dir + '/refine_emp_studies_ct112.txt'
# Spreadsheet describing which columns to add/replace, and when.
path_plan = _refine_dir + '/qiita_add_replace_columns.xlsx'
# Directory holding the per-study Qiita sample-info TSVs.
path_sample_info = _refine_dir + '/metadata-sample-info'
In [4]:
# Read the candidate study IDs, one per line. Use a context manager so the
# file handle is closed deterministically (the original left it open).
with open(path_ids) as fh:
    studies = set(line.rstrip('\n') for line in fh)
# remove studies where mapping file has fewer samples than sample info file -- ignore
studies -= {'10246', '10278', '10346'}
# remove studies where mapping file has fewer samples than sample info file -- these will be fixed manually
studies -= {'1033', '1696', '2229'}
# remove studies where sample names don't match (10146 is prepended twice!)
studies -= {'10146'}
# NOW: DON'T remove these studies bc sample info files ARE in Qiita (studies not in EMP paper)
#studies = studies - {'1889'}
# convert to a list of ID strings sorted numerically (so '632' sorts before '10145')
studies = sorted(studies, key=int)
In [5]:
# Studies excluded from the automated pass above; handled separately at the end.
problem_studies = '1033 1696 2229 10146 10246 10278 10346'.split()
In [6]:
# Load the refined EMP master mapping file: samples indexed by the first
# column, with every value kept as a string (dtype=object) so nothing is
# numerically coerced on the way in.
df_refined = pd.read_csv(
    path_refined,
    sep='\t',
    index_col=0,
    dtype=object,
    low_memory=False,
)
In [7]:
# Column add/replace plan, one row per operation. The loop below reads the
# fields 'action' ('replace always' / 'replace if' / 'add always' / 'add if'),
# 'old_column' (comma-separated column names), and 'new_column' -- assumes the
# spreadsheet provides exactly those headers (TODO confirm against the file).
df_plan = pd.read_excel(path_plan)
In [8]:
# For each study: load its Qiita sample-info file, apply every add/replace
# operation in df_plan (pulling refined values from df_refined by sample ID),
# then write the refined file plus a "diff" file holding only the columns
# that were touched.
for study_id in studies:
    df = pd.read_csv('%s/%s_sample_info.tsv' % (path_sample_info, study_id),
                     sep='\t', index_col=0)
    df_new = df.copy(deep=True)
    # df_diff records only the columns that were added or replaced.
    df_diff = pd.DataFrame(index=df.index)
    for index, row in df_plan.iterrows():
        old_cols = row.old_column.split(',')
        # does this study's file contain any of the plan row's old columns?
        has_old = any(c in df_new.columns for c in old_cols)
        if row.action in ('replace always', 'replace if'):
            # 'replace always' fires unconditionally; 'replace if' only when
            # at least one old column is present.
            if row.action == 'replace always' or has_old:
                # drop whichever old columns exist, then add the refined column
                df_new.drop([c for c in old_cols if c in df_new.columns],
                            axis=1, inplace=True)
                newcol = [df_refined.loc[i, row.new_column] for i in df_new.index]
                df_new[row.new_column] = newcol
                df_diff[row.new_column] = newcol
        elif row.action == 'add always' or (row.action == 'add if' and has_old):
            # add the refined column (for 'add if', only when an old column exists)
            newcol = [df_refined.loc[i, row.new_column] for i in df_new.index]
            df_new[row.new_column] = newcol
            df_diff[row.new_column] = newcol
    # fill NaNs with 'Not applicable' (Qiita terminology)
    df_new.fillna('Not applicable', inplace=True)
    df_diff.fillna('Not applicable', inplace=True)
    # reorder columns alphabetically (Qiita style)
    df_new = df_new[df_new.columns.sort_values()]
    df_diff = df_diff[df_diff.columns.sort_values()]
    # write to tsv (study_id is already a string)
    df_new.to_csv('../../data/metadata-refine/metadata-sample-info-refined/%s_sample_info.tsv' % study_id, sep='\t', index=True)
    df_diff.to_csv('../../data/metadata-refine/metadata-sample-info-diff/%s_sample_info_diff.tsv' % study_id, sep='\t', index=True)
In [9]:
# All refined column names defined by the plan, in spreadsheet order.
new_cols = df_plan.new_column.tolist()
In [10]:
# Export the refined values for the manually handled ("problem") studies so
# their sample-info files can be patched by hand.
is_problem = df_refined.study_id.isin(problem_studies)
df_refined.loc[is_problem, new_cols].to_csv(
    '../../data/metadata-refine/qiita_metadata_for_problem_studies.tsv', sep='\t')
In [ ]: