In [1]:
import sys
import numpy as np
import pandas as pd

sys.path.append('/home/will/PySeqUtils/')
import PlottingTools

fname = '/home/will/KrebbsData/Results.csv'

In [2]:
from sklearn.covariance import EllipticEnvelope

data = pd.read_csv(fname, sep = '\t')
def safe_float(val):
    try:
        return float(val)
    except ValueError:
        return np.nan

cytos = ['VEGF','IL-1beta','G-CSF','EGF','IL-10','HGF','FGF-basic',
'IFN-alpha','IL-6','IL-12','Rantes','Eotaxin','IL-13','IL-15',
'IL-17','MIP-1alpha','GM-CSF','MIP-1beta','MCP-1','IL-5',
'IFN-gamma','TNF-alpha','IL-RA','IL-2','IL-7',
'IP-10','IL-2R','MIG','IL-4','IL-8']

for col in cytos:
    data[col] = data[col].map(safe_float)
    try:
        env = EllipticEnvelope().fit(data[col].dropna().values.reshape(-1,1))
        mask = env.predict(data[col].values.reshape(-1,1))
        data[col][mask == -1] = np.nan
    except:
        pass
        
    
    #print mask
    #break

In [3]:
pos = dict(zip('ABCDEFGH', range(8)))
def xpos(val):
    _, p = val.split('(')
    return pos[p.split(',')[1][0]]

def ypos(val):
    _, p = val.split('(')
    _, r = p.split(',')
    return int(''.join(l for l in r[:-1] if not l.isalpha()))
    
data['xpos'] = data['Location'].map(xpos)
data['ypos'] = data['Location'].map(ypos)

In [4]:
data['NumBad'] = data[cytos].applymap(np.isnan).sum(axis=1)

In [5]:
counts = pd.pivot_table(data, rows = 'xpos', cols = 'ypos', values = 'NumBad', aggfunc = 'sum').fillna(len(cytos))

In [6]:
import matplotlib.pyplot as plt

fig = PlottingTools.make_heatmap_df(counts, figsize = (10,4), colormap = 'copper_r')
plt.colorbar()


Out[6]:
<matplotlib.colorbar.Colorbar instance at 0x513a0e0>

In [7]:
data.head().T


Out[7]:
0 1 2 3 4
Location 1(1,A1) 2(1,B1) 3(1,C1) 4(1,D1) 5(1,E1)
Sample Standard Standard Standard Standard Standard
Age skip skip skip skip skip
Patient skip skip skip skip skip
Exposed skip skip skip skip skip
CellLine skip skip skip skip skip
WellSide skip skip skip skip skip
TimePoint skip skip skip skip skip
VEGF NaN NaN 53.18195 5.638355 1.565731
IL-1beta NaN NaN NaN NaN 39.5339
G-CSF NaN NaN NaN NaN NaN
EGF NaN NaN NaN NaN NaN
IL-10 NaN NaN NaN NaN NaN
HGF NaN 797.8447 212.5651 19.65983 5.840041
FGF-basic NaN NaN NaN 40.03513 16.44776
IFN-alpha NaN NaN NaN 162.2822 59.11853
IL-6 NaN NaN NaN 68.45924 19.877
IL-12 NaN NaN NaN 169.1865 64.43608
Rantes NaN NaN NaN 74.29918 27.08544
Eotaxin NaN NaN NaN 18.17663 5.845376
IL-13 NaN NaN NaN NaN NaN
IL-15 NaN NaN NaN 332.4042 127.094
IL-17 NaN NaN NaN NaN 109.8124
MIP-1alpha NaN NaN NaN NaN 65.67461
GM-CSF NaN NaN NaN NaN NaN
MIP-1beta NaN NaN NaN 81.03217 23.25415
MCP-1 NaN NaN 2819.832 264.4408 83.74506
IL-5 NaN NaN NaN 72.03586 21.33605
IFN-gamma NaN NaN NaN NaN 67.72917
TNF-alpha NaN NaN NaN NaN NaN
IL-RA NaN NaN NaN 492.8668 191.1588
IL-2 NaN NaN NaN NaN 82.19325
IL-7 NaN NaN 755.6191 84.30869 26.54168
IP-10 NaN 67.14169 27.58456 2.790785 0.9082555
IL-2R NaN NaN NaN 227.0333 75.35008
MIG NaN 433.8882 151.1393 NaN NaN
IL-4 NaN NaN NaN NaN NaN
IL-8 NaN NaN 1187.953 144.4454 45.30223
Total Events 4698 3905 4423 4878 4395
xpos 0 1 2 3 4
ypos 1 1 1 1 1
NumBad 30 27 23 13 8

In [8]:
grouped = data.groupby(['CellLine', 'Age', 'Patient', 'Exposed', 'WellSide', 'TimePoint'], as_index=False).mean()

In [9]:
from itertools import combinations
from scipy.stats import ttest_ind
sf_data = data[(data['CellLine'] == 'Seminal Fluid')] 
old_sf =  sf_data['Age'] == 'Aged'
young_sf = sf_data['Age'] == 'Young'
pooled_sf = sf_data['Age'] == 'Pooled'

comp = [('Old', old_sf),
        ('Young', young_sf),
        ('Pooled', pooled_sf)]

results = []
for (g1_name, g1), (g2_name, g2) in combinations(comp, 2):
    for cyto in cytos:
        g1vals = sf_data[g1][cyto].dropna()
        g2vals = sf_data[g2][cyto].dropna()
        if (len(g1vals) > 2) and (len(g2vals) > 2):
            tstat, pval = ttest_ind(g1vals, g2vals)
            results.append({
                            'Group1':g1_name,
                            'Group2':g2_name,
                            'Tstat': tstat,
                            'Pval': pval,
                            'G1N':len(g1vals),
                            'G2N':len(g2vals),
                            'G1mean':g1vals.mean(),
                            'G2mean':g2vals.mean(),
                            'Cyto': cyto
                            })
    
res = pd.DataFrame(results)

In [10]:
from statsmodels.graphics.boxplots import beanplot
labels = ['Old', 'Young', 'Pooled']
for cyto in cytos:
    bins = []
    labs = []
    for gname, gmask in comp:
        td = sf_data[cyto][gmask].dropna()
    
        if len(td) > 2:
            bins.append(td.copy())
            labs.append(gname)
    if len(bins) > 1:
        fig = plt.figure()
        ax = plt.subplot(111)
        beanplot(bins, labels=labs, ax=ax)
        ax.set_ylim([0, ax.get_ylim()[1]])
        ax.set_ylabel(cyto)
        ax.set_title(cyto)
        plt.savefig('/home/will/KrebbsData/draft_figures/violin_plot_%s.png' % cyto)
        plt.close()

In [11]:
print res[(res['Pval'] < 0.1)][['Cyto','Group1', 'G1mean','Group2','G2mean','Pval']]


    Cyto Group1       G1mean  Group2       G2mean      Pval
11  IL-5    Old    53.775253   Young    33.959186  0.084954
16   MIG    Old   769.645050   Young   494.497445  0.070580
17  IL-8    Old  2513.067616   Young  1611.837118  0.080708
47  IL-5  Young    33.959186  Pooled    88.801680  0.000028

In [12]:
wanted_cl = set(['Ect1', 'End1', 'VK2'])
cl_data = data[data['CellLine'].map(lambda x: x in wanted_cl)]
exposed = cl_data['Exposed'].unique()
exposed_colors = 'kbrgm'
wellsides = ['Top', 'Bottom']

In [13]:
for cyto in cytos:
    fig, axs = plt.subplots(3,2, figsize = (10,10), sharey = True)
    for row, cl in enumerate(wanted_cl):
        for col, side in enumerate(wellsides):
            ax = axs[row, col]
            if ax.is_first_row():
                ax.set_title('%s %s' % (cyto, side))
            if ax.is_first_col():
                ax.set_ylabel('log(%s exp)' % cl)
            tmask = data['CellLine'] == cl
            tmask &= data['WellSide'] == side
            data['TimePoint'][tmask] = data['TimePoint'][tmask].map(safe_float)
            tmp = pd.pivot_table(data[tmask], cols = ['Exposed', 'TimePoint'], values = cyto, rows = 'Location')
            tmp.applymap(np.log10).boxplot(ax=ax, rot = 90)
            #data[tmask].pivot('Exposed', 'TimePoint', cyto).boxplot(ax=ax)
            #data[tmask].boxplot(column = cyto, by = ['Exposed', 'TimePoint'], ax = ax)
    fig.tight_layout()
    plt.savefig('/home/will/KrebbsData/draft_figures/boxplot_%s.png' % cyto)
    plt.close()

In [14]:
cl_data['nTimePoint'] = cl_data['TimePoint'].map(str)
gdata = cl_data.groupby(['nTimePoint', 'Exposed', 'CellLine', 'WellSide']).mean()

In [15]:
from scipy.stats import norm
effect = np.log2(gdata.ix['4']/gdata.ix['24'])[cytos]
mock_effects = effect.ix['Mock'].values.reshape(-1,1)
mock_effects[np.isnan(mock_effects)] = []
rvs = norm(*norm.fit(np.abs(mock_effects)))

pval_func = lambda x: rvs.sf(abs(x))
pvals = effect.applymap(pval_func)

In [16]:
from statsmodels.stats.multitest import multipletests
tpvals = pvals.values.flatten()
tpvals[np.isnan(tpvals)] = []
_, bh_pvals, _, _ = multipletests(tpvals, alpha = 0.05, method = 'bonferroni')
tdict = dict(zip(tpvals, bh_pvals))
tdict[np.nan] = np.nan

bon_pvals = pvals.applymap(tdict.get)
wmask = bon_pvals < 0.1

print effect.where(wmask)['IFN-gamma'].dropna()
print bon_pvals.where(wmask)['IFN-gamma'].dropna()


Exposed  CellLine  WellSide
Mock     Ect1      Bottom     -11.611741
         End1      Top         -9.283438
         VK2       Top         -9.698107
SF15     Ect1      Bottom      11.505369
SFV      Ect1      Bottom       8.699213
Name: IFN-gamma, dtype: float64
Exposed  CellLine  WellSide
Mock     Ect1      Bottom      0.000011
         End1      Top         0.005511
         VK2       Top         0.002012
SF15     Ect1      Bottom      0.000014
SFV      Ect1      Bottom      0.021059
Name: IFN-gamma, dtype: float64

In [18]:
fname = '/home/will/KrebbsData/MouseResults.csv'
mouse_data = pd.read_csv(fname, sep = '\t')

mouse_cytos = ['IL-1beta','IL-10','IL-6','IL-12','GM-CSF','IL-5', 'IFN-gamma','TNF-alpha','IL-2','IL-4']

for col in mouse_cytos:
    mouse_data[col] = mouse_data[col].map(safe_float)
    try:
        env = EllipticEnvelope().fit(mouse_data[col].dropna().values.reshape(-1,1))
        mask = env.predict(mouse_data[col].values.reshape(-1,1))
        mouse_data[col][mask == -1] = np.nan
    except:
        pass

In [37]:
check_cytos = ['IL-1beta','IL-6','IL-5','TNF-alpha','IL-2','IL-4']
wanted_reag = ['CpG', 'N-9', 'Imiquimod'] #PBS on all
wanted_time = ['2', '6', '12', '24']
control_mask = mouse_data['Reagent'] == 'PBS'
for cyto in check_cytos:
    fig, axs = plt.subplots(1,3, figsize = (10,5), sharey = True)
    for ax, reag in zip(axs.flatten(), wanted_reag):
        boxes = [mouse_data[cyto][control_mask].dropna().values]
        reag_mask = mouse_data['Reagent'] == reag
        for tm in wanted_time:
            boxes.append(mouse_data[cyto][reag_mask & (mouse_data['TimeGroup']==tm)].values)
        ax.boxplot(boxes)
        if ax.is_first_col():
            ax.set_ylabel(cyto)
        ax.set_title(reag)
        ax.set_xticklabels(['PBS']+wanted_time)
        
    fig.tight_layout()
    plt.savefig('/home/will/KrebbsData/draft_figures/mouse_boxplot_%s.png' % cyto)
    plt.close()

In [33]:



Out[33]:
True

In [ ]: