In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Locations of the two CSV inputs that the following cell loads.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run where this exact directory layout exists.
path_pro, path_sar = (
    '/Users/luke/singlecell/krse2013-MT/fpkm_groups_all_pro.binary.Sta3Surf.Gareth.csv',
    '/Users/luke/singlecell/krse2013-MT/fpkm_groups_all_sar.binary.Sta3Surf.csv',
)
In [3]:
# Read each CSV into its own DataFrame; the first column of the file becomes
# the row index.
df_pro = pd.read_csv(filepath_or_buffer=path_pro, index_col=0)
df_sar = pd.read_csv(filepath_or_buffer=path_sar, index_col=0)
In [4]:
# Preview the first five rows of the table.
df_pro.head(n=5)
Out[4]:
In [5]:
# Count how many rows fall into each value of the Group column.
df_pro['Group'].value_counts()
Out[5]:
In [6]:
# Display the column labels; later cells select columns by position (0:13)
# and drop one column by name, so the order shown here matters.
df_pro.columns
Out[6]:
In [7]:
# Sum over OGs for each timepoint to find out which timepoint is low: T0!
# The first 13 columns are selected by position. `.iloc` replaces the original
# `.ix` indexer, which was deprecated in pandas 0.20 and removed in pandas 1.0;
# with non-integer column labels `.ix[:, 0:13]` was already positional, so the
# selection is unchanged.
df_pro.iloc[:, 0:13].sum(axis=0)
Out[7]:
In [8]:
# Remove the '4415_Sta3_T0_Surf_20' column (presumably the low T0 timepoint
# noted in the previous cell) from the table.
# Rebinding the result instead of using inplace=True, together with
# errors='ignore', makes this cell safe to re-run: a second execution is a
# no-op instead of raising KeyError because the column is already gone.
df_pro = df_pro.drop('4415_Sta3_T0_Surf_20', axis=1, errors='ignore')
In [9]:
# Sum over timepoints for each OG to get the total number of timepoints it is
# observed in, stored as a new 'Sum' column.
# `.iloc` replaces the original `.ix` indexer (deprecated in pandas 0.20,
# removed in pandas 1.0); the positional selection 0:13 is unchanged.
# NOTE(review): one column is dropped from df_pro before this cell, yet the
# slice is still 0:13, so it may now reach one column past the remaining
# timepoints. Confirm against df_pro.columns; older pandas silently skipped
# non-numeric columns in row sums, newer versions may raise instead.
df_pro['Sum'] = df_pro.iloc[:, 0:13].sum(axis=1)
In [10]:
# Tabulate how many rows have each value of the per-row 'Sum' column.
df_pro['Sum'].value_counts()
Out[10]:
In [11]:
# Keep only the rows whose 'Sum' is strictly positive.
has_positive_sum = df_pro['Sum'] > 0
df_pro_nonzero = df_pro.loc[has_positive_sum]
In [12]:
# Box plot of 'Sum' for each 'Group', restricted to rows with a positive Sum.
sns.boxplot(x='Group', y='Sum', data=df_pro_nonzero)
Out[12]:
In [15]:
# Overlay kernel density estimates of 'Sum' for the 'RedSea' and 'Other' groups.
# NOTE(review): this plots df_pro (rows with Sum == 0 included), not
# df_pro_nonzero as used for the box plot. Confirm that is intended.
is_red_sea = df_pro['Group'] == 'RedSea'
is_other = df_pro['Group'] == 'Other'
sns.kdeplot(data=df_pro.loc[is_red_sea, 'Sum'], label='Red Sea')
sns.kdeplot(data=df_pro.loc[is_other, 'Sum'], label='Other')
# Request the legend explicitly: recent seaborn releases attach `label` to the
# curve but do not draw a legend on their own, which would leave the two
# curves indistinguishable.
plt.legend()
# Fixed view window: x from -1.99 to 13.99, y from 0 to 0.4.
plt.axis([-1.99, 13.99, 0, 0.4])
plt.xlabel('Number of timepoints observed in (out of 12)')
plt.ylabel('Proportion of gene ortholog groups')
Out[15]:
In [ ]: