Before running this code, the output from Fauzi was modified with the shell script below (fix_unknown.sh), because the "UNKNOWN" row in each CSV file is missing a field:
# fix_unknown.sh: prepend the missing "zzzUnknown" field to the UNKNOWN row of
# every CSV file in the current directory.
# NOTE: not idempotent -- the replacement text still contains
# "UNKNOWN,UNKNOWN", so a second run prepends another "zzzUnknown," field.
for f in *.csv; do
  # When nothing matches, the glob stays literal; skip it instead of failing.
  [ -e "$f" ] || continue
  # Per-file scratch name so an unrelated file called "temp" is never clobbered.
  tmp="${f}.tmp.$$"
  # Replace the original only if sed succeeded, so a failure cannot leave a
  # truncated CSV behind.
  if sed -E 's/UNKNOWN,UNKNOWN/zzzUnknown,UNKNOWN,UNKNOWN/g' < "$f" > "$tmp"; then
    mv -- "$tmp" "$f"
  else
    rm -f -- "$tmp"
    printf 'fix_unknown: could not rewrite %s\n' "$f" >&2
  fi
done
In [7]:
# import libraries
import pandas as pd
In [8]:
# file paths
# -- input: sample metadata table
path_metadata = '/Users/luke/krse2011/db/krse2011_v5_metadata.csv'
# -- genus level: input directory and merged output file
dir_clark_genus = '/Users/luke/krse2011/clark/specificitymode-genus'
outfile_genus = '/Users/luke/krse2011/clark/clark_genus.csv'
# -- species level: input directory and merged output file
dir_clark_species = '/Users/luke/krse2011/clark/fullmode-species'
outfile_species = '/Users/luke/krse2011/clark/clark_species.csv'
In [9]:
# column to export and merge
# read_clark() selects this column (together with 'Name') from every
# per-sample CSV file it reads.
# Alternative value left by the author: u'Proportion_All(%)'
exportcol = 'Count'
In [10]:
# read and merge clark taxonomy data
def read_clark(directory, column=None, n_samples=45):
    """Combine the per-sample abundance tables found in ``directory``.

    Reads ``abundanceHC.results.sampleNN.csv`` for NN = 01 .. ``n_samples``
    (zero-padded to two digits), takes one value column from each file
    indexed by that file's ``Name`` column, outer-joins the columns on
    ``Name`` and returns the transposed result.

    Parameters
    ----------
    directory : str
        Folder containing the per-sample CSV files.
    column : str, optional
        Column to extract from every file.  When omitted, the module-level
        ``exportcol`` is used (looked up at call time).
    n_samples : int, optional
        Number of sample files to read, numbered from 1.  Defaults to 45.

    Returns
    -------
    pandas.DataFrame
        One row per sample (``sample01``, ``sample02``, ...) and one column
        per ``Name``.  Values are the strings read from the files
        (``dtype=object``); a name absent from a sample is NaN.

    Raises
    ------
    KeyError
        If a file lacks the ``Name`` column or the requested column.
    Any exception raised by ``pandas.read_csv`` for a missing or unreadable
    file is propagated unchanged.
    """
    if column is None:
        column = exportcol

    # Start from an empty frame and outer-join one sample at a time, so the
    # join/sort behaviour matches a plain chain of index merges.
    df_merged = pd.DataFrame()
    for number in range(1, n_samples + 1):
        # Two-digit, zero-padded sample number used in both the file name
        # and the column label (1 -> '01', 10 -> '10').
        suffix = '%02d' % number
        path = '%s/abundanceHC.results.sample%s.csv' % (directory, suffix)
        df_full = pd.read_csv(path, dtype=object)

        # Single-column frame: values of `column`, indexed by taxon name.
        df_indexed = df_full[[column]].copy()
        df_indexed.index = df_full['Name']
        df_indexed.columns = ['sample%s' % suffix]

        df_merged = pd.merge(df_merged, df_indexed, how='outer',
                             left_index=True, right_index=True)

    # Samples become rows, names become columns.
    return df_merged.T
In [11]:
# read metadata and merge with clark data
def merge_metadata_clark(path_metadata, dir_clark, outfile):
    """Join the metadata table with the merged CLARK table and write a CSV.

    Parameters
    ----------
    path_metadata : str
        CSV file whose first column is used as the index; all values are
        read as strings (``dtype=object``).
    dir_clark : str
        Directory passed to ``read_clark`` to build the per-sample table.
    outfile : str
        Destination path of the merged CSV.

    Returns
    -------
    None
        The result is only written to ``outfile``.
    """
    df_metadata = pd.read_csv(path_metadata, index_col=0, dtype=object)
    df_clark = read_clark(dir_clark)

    # Outer join on the index: rows present in only one of the two tables
    # are kept, with the other side's columns left empty.
    df_final = pd.merge(df_metadata, df_clark, how='outer',
                        left_index=True, right_index=True)

    # NOTE(review): this fills every missing cell with 0, including gaps in
    # the metadata columns -- confirm that is intended.
    df_final = df_final.fillna(value=0)
    df_final.to_csv(outfile)
In [12]:
# execute code
# Build one merged table per taxonomic level: (CLARK directory, output file).
for clark_dir, merged_csv in (
    (dir_clark_genus, outfile_genus),
    (dir_clark_species, outfile_species),
):
    merge_metadata_clark(path_metadata, clark_dir, merged_csv)
In [ ]: