For some reason, Janet's virtualenv is much happier with the matplotlib backend set to TkAgg.
In [1]:
import matplotlib as mpl
mpl.use('TkAgg')
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
import bacteriopop_utils
import feature_selection_utils
import load_data
In [3]:
# Load the data once; `loaded_data` and `data` are two names bound to the same object.
# NOTE(review): no other cell in view reads `data` -- confirm the alias is still needed.
loaded_data = data = load_data.load_data()
In [4]:
loaded_data.shape
Out[4]:
Make sure none of the phyla are NA (checking the 160304 update to load_data.py).
In [5]:
loaded_data[loaded_data['phylum'].isnull()].head(3)
Out[5]:
In [6]:
loaded_data.head()
Out[6]:
Test filter and reduce functions using a high threshold, which selects for genus==Methylobacter
In [7]:
# Preview the rows that survive a 0.6 lower bound passed to the abundance filter.
bacteriopop_utils.filter_by_abundance(
    dataframe=loaded_data,
    low=0.6
).head()
Out[7]:
In [8]:
# Same 0.6 threshold as the filter cell, now through reduce_data at genus level.
# NOTE(review): 'high' is lowercase here while the later reduce_data call passes
# 'Low' -- confirm reduce_data treats the oxygen label case-insensitively.
bacteriopop_utils.reduce_data(
    dataframe=loaded_data,
    min_abundance=0.6,
    phylo_column='genus',
    oxygen='high'
).head()
Out[8]:
In [9]:
# Genus-level reduction with a 0.01 minimum abundance and oxygen='Low';
# the result feeds the experiment split and the DMD matrix preparation below.
raw_dmd_data = bacteriopop_utils.reduce_data(
    dataframe=loaded_data,
    min_abundance=0.01,
    phylo_column='genus',
    oxygen='Low')
Errors are thrown by the functions below if you drop min_abundance any lower. I think it is getting hung up on multiple "other" rows.
In [10]:
data_dict = bacteriopop_utils.break_apart_experiments(raw_dmd_data)
In [11]:
data_dict.keys()
Out[11]:
In [12]:
# Can't view iterators very easily!!! The cell output is just the iterator object.
# dict.itervalues() exists only on Python 2; iter(d.values()) gives an iterator
# on both Python 2 and Python 3.
iter(data_dict.values())
Out[12]:
In [13]:
# But we can make a list from the values and grab the 0th item.
# .values() is used instead of .itervalues() so the cell runs on Python 2 and 3.
first_df = list(data_dict.values())[0]
In [14]:
first_df.head(3)
Out[14]:
In [15]:
first_df[first_df['genus'] == 'other'].head()
Out[15]:
In [16]:
# Drop rows whose genus is the empty string, then lay the abundances out
# with one row per genus and one column per week.
(first_df[first_df['genus'] != '']
 .pivot(index='genus', columns='week', values='abundance'))
Out[16]:
In [17]:
raw_dmd_data.columns
Out[17]:
In [18]:
# Build the DMD inputs from the reduced data, grouped at genus level.
DMD_input_dict = bacteriopop_utils.prepare_DMD_matrices(
    raw_dmd_data,
    groupby_level="genus")
In [19]:
type(DMD_input_dict)
Out[19]:
We can get each dataframe out like this:
In [20]:
DMD_input_dict[('Low', 1)]
Out[20]:
In [21]:
DMD_input_dict[('Low', 1)].shape
Out[21]:
In [22]:
DMD_input_dict[('Low', 1)].groupby('week')['abundance'].sum()
Out[22]:
TODO: test DMD on this abundance matrix.
In [23]:
DMD_test_matrix = DMD_input_dict[('Low', 1)]
In [24]:
# Who is in there?
In [25]:
DMD_test_matrix.reset_index()['genus'].unique()
Out[25]:
I'm stuck at the installation of modred :(
In [ ]:
# following example 1: https://pythonhosted.org/modred/tutorial_modaldecomp.html
import modred as MR
In [ ]:
# Ask for the first `num_modes` modes, i.e. mode indices 0 .. num_modes - 1.
num_modes = 1
# NOTE(review): other cells call .reset_index() / .groupby() on this same object, so it
# looks like a DataFrame rather than a plain array -- confirm modred accepts that.
modes, eig_vals = MR.compute_POD_matrices_snaps_method(DMD_test_matrix, range(num_modes))
In [ ]:
modes
In [ ]:
eig_vals
In [ ]:
# Extract features from the full table using an explicit column list.
# default list was: ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'length', 'abundance', 'project']
extracted_features = bacteriopop_utils.extract_features(
    dataframe=loaded_data,
    column_list=['kingdom', 'phylum', 'class', 'order',
                 'family', 'genus', 'oxygen', 'abundance'])
In [ ]:
extracted_features.head()
In [ ]:
extracted_features.shape
Just do PCA on a tiny bit of the data as a demo
In [ ]:
# Demo only: PCA with 10 components on the first 100 rows of the extracted features.
pca_results = feature_selection_utils.pca_bacteria(
    data=extracted_features.head(100),
    n_components=10)
In [ ]:
pca_results.components_
Do correlations for a tiny subset of the data.
In [ ]:
# Correlate the features with 'abundance' on the first 100 rows only.
# NOTE(review): `features` is every column of extracted_features, unfiltered; if that
# includes the target 'abundance' itself, it is correlated with itself -- confirm intended.
feature_selection_utils.calculate_features_target_correlation(
    data=extracted_features.head(100),
    features=extracted_features.columns.tolist(),
    target='abundance',
    method="Pearson")
In [ ]:
In [ ]: