In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
# Location of the OTU summary table. The file must be gunzipped beforehand:
# the path points at the decompressed .tsv.
path_otus = '../../data/sequence-lookup/otu_summary.emp_deblur_90bp.subset_2k.rare_5000.tsv'
# Tab-separated file; its first column becomes the row index.
df_otus = pd.read_csv(
    path_otus,
    index_col=0,
    sep='\t',
)
In [3]:
# Entropy table, produced by
# 09-specificity-entropy/entropy_environment_by_taxon.ipynb
path_entropy = '../../data/entropy/otu_entropy_empo.csv'
# Comma-separated file; its first column becomes the row index.
df_otu_entropy = pd.read_csv(
    path_entropy,
    index_col=0,
)
In [4]:
# Column labels of the entropy table with the first column skipped; the later
# cells iterate over these as per-category (empo_3) relative-abundance columns.
# NOTE(review): presumably the skipped first column is 'entropy' -- confirm
# against the CSV header, since this relies on column position, not name.
empo3 = df_otu_entropy.columns[1:]
In [5]:
# Attach the entropy record to every OTU summary row by matching the summary's
# 'sequence' column against the entropy table's index. This is an inner join
# (the merge default), so rows without a match on either side are dropped.
df_merged = df_otus.merge(
    df_otu_entropy,
    how='inner',
    left_on='sequence',
    right_index=True,
)
In [6]:
# For each empo_3 category, print the shape of the sub-table of OTUs with
# relative abundance in that category >= 25%, entropy < 1 and total obs >= 1000
# (original author's note: 15 categories have at least one such OTU).
# The entropy and total_obs cut-offs do not depend on the category, so they are
# evaluated once up front.
meets_global_cutoffs = (df_merged['entropy'] < 1) & (df_merged['total_obs'] >= 1000)
for empo in empo3:
    enriched_here = df_merged[empo] >= 0.25
    n_rows_cols = df_merged[enriched_here & meets_global_cutoffs].shape
    print(empo, '\t', n_rows_cols)
In [7]:
# Now get the most abundant OTU that meets the criteria above: for each empo_3
# category keep the single row with the highest total_obs among the OTUs with
# relative abundance >= 25% in that category, entropy < 1 and total_obs >= 1000.
# Categories with no qualifying OTU are skipped.

# These two cut-offs do not depend on the category, so evaluate them once.
low_entropy_and_common = (df_merged['entropy'] < 1) & (df_merged['total_obs'] >= 1000)

top_rows = []   # winning row (a Series) for each category that has one
list_empo = []  # category labels, kept parallel to top_rows
for empo in empo3:
    # sort_values returns a new frame, so the filtered slice is never sorted
    # in place (the in-place sort on a slice raised SettingWithCopyWarning).
    df_empo = df_merged[(df_merged[empo] >= 0.25) & low_entropy_and_common].sort_values(
        'total_obs', ascending=False)
    if df_empo.shape[0] > 0:
        top_rows.append(df_empo.iloc[0, :])
        list_empo.append(empo)

# Build the result once from the collected rows: DataFrame.append was removed
# in pandas 2.0 and re-copied the whole frame on every call. Columns follow
# df_merged's column order; rows are labelled by category.
df_top_entropy = pd.DataFrame(top_rows)
df_top_entropy.index = list_empo
In [8]:
# Save the "most abundant, sample type-specific (>=25%), low-entropy OTU (<1)"
# for each sample type as a CSV file.
path_top_otus = '../../data/sequence-lookup/top_specialized_otu_per_empo.csv'
df_top_entropy.to_csv(path_top_otus)
In [ ]: