In [2]:
import lifelines
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
%matplotlib inline
plt.style.available
plt.style.use("bmh")

In [4]:
cns_all = pd.read_csv("survival.csv")

In [5]:
del cns_all["Unnamed: 0"]

In [6]:
C = cns_all["Status (Dead or alive)"].apply(lambda x : True if x == "DEAD" else False)

In [7]:
T = cns_all["Duration from diagnosis to death or last follow up"]

In [8]:
kmf = KaplanMeierFitter()

In [9]:
kmf.fit(T, event_observed=C )


Out[9]:
<lifelines.KaplanMeierFitter: fitted with 259 observations, 198 censored>

In [10]:
kmf.plot(figsize=(12, 8))


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x11304e978>

In [11]:
kmf.survival_function_.plot(figsize=(12, 8))
plt.title('Survival function of CNS');



In [12]:
cns_all["Age"] = cns_all["Age"].apply(lambda x : 1 if x > 60 else 0)
cns_all["Race"] = cns_all["Race"].apply(lambda x : "White" if x == "White" else "Other")

def stage_mask(stage):
    if stage == "I" or stage == "II":
        return "I/II"
    elif stage == "III" or stage == "IV":
        return "III/IV"
    else:
        return "Missing"
    
def LDH_mask(ldh):
    if ldh == "NO":
        return 0
    elif ldh == "YES":
        return 1
    else:
        return None
    
cns_all["Stage"] = cns_all["Stage"].apply(stage_mask)
cns_all["IPI Score"] = cns_all["IPI Score"].apply(lambda x: "L/LI" if x < 3 else "H/HI")
cns_all["PS"] = cns_all["PS"].apply(lambda x: 1 if x > 1 else 0)

In [13]:
cns_all["LDH"] = cns_all["LDH"].apply(LDH_mask)
cns_all["B Symp 1"] = cns_all["B Symp 1"].apply(lambda x : 0 if x == "NO" else 1)
cns_all["B symp 2"] = cns_all["B symp 2"].apply(lambda x : 0 if x == "NO" else 1)
cns_all["B symp 3"] = cns_all["B symp 3"].apply(lambda x : 0 if x == "NO" else 1)

In [14]:
b_symp_any = cns_all["B Symp 1"] + cns_all["B symp 2"] + cns_all["B symp 3"]
cns_all["B Symp"] = b_symp_any.apply(lambda x: 1 if x > 0 else 0)

In [15]:
del cns_all["B Symp 1"]
del cns_all["B symp 2"]
del cns_all["B symp 3"]

In [16]:
cns_all[">1 extranodal"] = cns_all[">1 extranodal"].apply(lambda x : 0 if x == "NO" else 1)
cns_all["BM Involv"] = cns_all["BM Involv"].apply(LDH_mask)
cns_all["PB Involv"] = cns_all["PB Involv"].apply(LDH_mask)
cns = cns_all["CNS "].apply(LDH_mask)
cns_all["cns"] = cns
del cns_all["CNS "]
cns_all["cns"] = cns_all["cns"].apply(lambda x: 1 if x == 1 else 0)
cns_all


Out[16]:
Age Gender Race Stage IPI Score PS LDH >1 extranodal BM Involv PB Involv Status (Dead or alive) Date of death or last follow up Duration from diagnosis to death or last follow up Date of diagnosis B Symp cns
0 1 M White III/IV L/LI 0 NaN 0 1 NaN UNKNOWN 1/23/2013 11.18 2/18/2012 0 0
1 0 F White III/IV H/HI 1 0 1 0 NaN ALIVE 3/31/2014 17.62 10/11/2012 1 0
2 0 M White I/II L/LI 0 0 0 0 0 ALIVE 5/22/2014 183.60 2/5/1999 0 0
3 0 F Other Missing L/LI 0 1 0 1 1 ALIVE 4/28/2014 60.03 4/28/2009 0 0
4 0 M White III/IV H/HI 1 1 0 1 0 UNKNOWN 5/3/2013 82.84 6/9/2006 1 0
5 0 M Other III/IV H/HI 0 1 1 1 NaN UNKNOWN 10/3/2011 2.66 7/14/2011 1 0
6 0 M Other III/IV L/LI 0 NaN 1 0 NaN ALIVE 4/25/2014 11.08 5/23/2013 1 0
7 1 M White III/IV L/LI 0 NaN 0 0 NaN DEAD 3/27/2014 26.20 1/20/2012 1 0
8 1 F White III/IV L/LI 0 NaN 0 1 1 ALIVE 1/16/2014 41.12 8/14/2010 1 0
9 1 M Other I/II L/LI 0 0 0 NaN 1 UNKNOWN 10/26/2009 2.43 8/13/2009 0 0
10 1 M White I/II L/LI 0 0 0 NaN NaN ALIVE 7/31/2014 27.19 4/25/2012 1 0
11 0 F White Missing L/LI 0 NaN 1 NaN NaN ALIVE 6/19/2014 49.80 4/26/2010 1 0
12 1 F White I/II L/LI 0 0 0 NaN NaN UNKNOWN 5/30/2013 32.54 9/13/2010 0 0
13 0 M White III/IV L/LI 0 1 0 1 1 ALIVE 6/9/2014 25.51 4/24/2012 0 0
14 0 M White III/IV H/HI 0 1 1 1 1 UNKNOWN 5/10/2011 7.73 9/17/2010 1 0
15 0 F White I/II L/LI 0 0 0 NaN 1 DEAD 1/31/2011 8.12 5/29/2010 1 0
16 0 M White III/IV L/LI 0 NaN 0 0 NaN UNKNOWN 11/2/2012 83.07 12/2/2005 0 0
17 1 M White III/IV H/HI 1 1 0 1 NaN UNKNOWN 1/23/2009 19.59 6/7/2007 1 0
18 0 M Other III/IV L/LI 0 NaN 1 1 NaN ALIVE 2/13/2014 11.37 3/4/2013 1 0
19 0 F White I/II L/LI 0 NaN 0 NaN NaN ALIVE 7/9/2014 57.56 9/22/2009 1 0
20 0 F Other Missing L/LI 0 NaN 0 1 1 UNKNOWN 5/18/2009 0.66 4/28/2009 0 0
21 1 M White I/II L/LI 0 NaN 0 NaN NaN UNKNOWN 6/10/2013 8.61 9/21/2012 0 0
22 1 M White III/IV L/LI 0 NaN 0 0 NaN DEAD 10/31/2012 41.39 5/21/2009 0 1
23 1 M Other I/II L/LI 0 NaN 0 0 0 UNKNOWN 1/30/2013 38.72 11/9/2009 0 0
24 0 F White I/II L/LI 0 0 0 0 0 ALIVE 2/13/2014 61.54 12/29/2008 0 0
25 1 M White III/IV H/HI 0 NaN 1 1 1 UNKNOWN 8/13/2012 32.61 11/25/2009 0 0
26 1 F White III/IV H/HI 0 1 0 0 0 ALIVE 10/2/2013 22.16 11/28/2011 0 0
27 0 M Other III/IV L/LI 0 0 1 0 0 ALIVE 7/14/2014 93.33 10/5/2006 0 1
28 0 F Other I/II L/LI 0 0 0 0 0 UNKNOWN 6/17/2013 18.80 11/23/2011 1 0
29 0 M White Missing L/LI 0 0 0 0 0 UNKNOWN 2/20/2013 9.73 4/30/2012 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
229 1 M Other III/IV L/LI 0 0 0 1 1 UNKNOWN 3/3/2008 4.27 10/25/2007 0 0
230 1 M Other Missing L/LI 0 0 0 1 1 UNKNOWN 7/24/2008 1.84 5/29/2008 1 0
231 1 M White Missing L/LI 0 NaN 0 1 1 UNKNOWN 7/18/2011 34.35 9/6/2008 0 0
232 0 M Other Missing L/LI 0 0 0 NaN 0 ALIVE 7/7/2014 9.53 9/20/2013 1 0
233 0 M Other III/IV L/LI 0 0 1 1 1 ALIVE 7/31/2014 5.33 2/19/2014 1 1
234 1 M White III/IV H/HI 0 1 0 0 NaN ALIVE 7/31/2014 6.67 12/9/2011 1 0
235 1 F White III/IV H/HI 0 0 1 0 0 ALIVE 7/24/2014 31.49 6/11/2007 1 0
236 0 M White III/IV L/LI 0 1 0 0 NaN ALIVE 7/31/2014 85.70 10/21/2008 0 0
237 0 F Other Missing L/LI 0 NaN 0 0 NaN ALIVE 4/7/2014 80.93 9/20/2011 0 0
238 0 F Other III/IV L/LI 0 1 0 1 1 ALIVE 7/17/2014 33.89 7/11/2010 1 0
239 0 F Other III/IV L/LI 0 1 0 NaN 1 ALIVE 5/1/2014 45.69 4/3/2014 1 0
240 0 F White Missing L/LI 0 1 0 1 NaN ALIVE 6/19/2014 2.53 5/13/2014 0 0
241 0 M White I/II L/LI 0 1 0 0 NaN ALIVE 5/29/2014 0.53 8/19/2009 1 0
242 1 M Other III/IV H/HI 0 NaN 1 0 NaN ALIVE 3/19/2014 55.00 7/21/2011 0 0
243 0 F White I/II L/LI 0 0 0 0 0 DEAD 7/8/2013 23.60 8/28/2009 1 0
244 0 M White III/IV H/HI 0 1 1 0 NaN ALIVE 7/21/2014 58.78 4/29/2014 1 0
245 1 F White III/IV H/HI 0 1 1 NaN NaN ALIVE 5/6/2014 0.23 2/10/2014 1 0
246 0 M Other III/IV H/HI 0 1 1 0 NaN ALIVE 7/31/2014 5.62 12/30/2007 1 0
247 1 F White III/IV L/LI 0 0 0 NaN 1 ALIVE 4/28/2014 73.08 8/1/2006 1 0
248 1 M Other III/IV H/HI 0 1 1 1 NaN ALIVE 7/29/2014 95.96 4/14/2007 1 0
249 0 M White III/IV L/LI 0 0 1 0 NaN ALIVE 5/15/2014 85.08 12/4/2010 0 0
250 0 M White III/IV L/LI 0 NaN 0 1 1 ALIVE 6/30/2014 42.87 9/21/2006 0 0
251 1 F Other I/II L/LI 0 0 0 0 0 UNKNOWN 1/28/2013 76.30 12/22/2010 1 0
252 0 M White I/II L/LI 0 0 0 0 NaN ALIVE 7/31/2014 43.29 7/22/2011 1 0
253 0 M Other III/IV L/LI 0 1 0 1 1 ALIVE 6/12/2014 41.68 5/30/2014 1 0
254 1 F White III/IV H/HI 0 1 0 1 NaN ALIVE 7/30/2014 36.29 8/19/2013 1 0
255 1 M Other Missing L/LI 0 0 0 NaN NaN ALIVE 7/2/2014 1.08 5/22/2014 1 1
256 0 M Other III/IV L/LI 0 0 0 NaN 1 ALIVE 7/31/2014 2.30 6/9/2014 1 0
257 1 F White III/IV L/LI 0 0 0 0 NaN ALIVE 7/31/2014 6.77 11/13/2012 1 0
258 1 M White I/II L/LI 0 NaN 0 NaN NaN ALIVE 7/31/2014 1.71 11/16/2011 1 0

259 rows × 16 columns


In [17]:
cns_all["Observed"] = C
cns_all["Duration"] = T
del cns_all["Status (Dead or alive)"]
del cns_all["Duration from diagnosis to death or last follow up"]

In [18]:
ax = plt.subplot(111)
cns_pos = (cns_all["cns"] == 1)
kmf.fit(T[cns_pos], event_observed=C[cns_pos], label="CNS Positive")
kmf.plot(ax=ax, figsize=(14, 9), flat=False, show_censors=True, ci_show=False)
print(kmf.median_)

kmf.fit(T[~cns_pos], event_observed=C[~cns_pos], label="CNS Negative")
kmf.plot(ax=ax, figsize=(14, 9), show_censors=True, ci_show=False)
print(kmf.median_)

plt.ylim(0,1);

plt.xlabel("Time (months)", **font)
plt.ylabel("Overall survival probability", **font)


59.37
146.68
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-18-902137ba1bdc> in <module>()
     11 plt.ylim(0,1);
     12 
---> 13 plt.xlabel("Time (months)", **font)
     14 plt.ylabel("Overall survival probability", **font)

NameError: name 'font' is not defined

In [19]:
kmf.fit(T[cns_pos], event_observed=C[cns_pos], label="CNS Positive").median_


Out[19]:
59.369999999999997

In [20]:
from lifelines.statistics import logrank_test

logrank_test(T[cns_pos], T[~cns_pos], C[cns_pos], C[~cns_pos], alpha=.99 )


Out[20]:
<lifelines.StatisticalResult: 
Results
   t 0: -1
   alpha: 0.99
   df: 1
   null distribution: chi squared
   test: logrank

   __ p-value ___|__ test statistic __|____ test result ____|__ is significant __
         0.00522 |              7.803 |      Reject Null    |        True       
>

In [41]:
categories = cns_all['Race'].unique()

for i,category in enumerate(categories):
    ax = plt.subplot(2,3,i+1)
    ix = cns_all['Race'] == category
    kmf.fit( T[ix], C[ix], label=category )
    kmf.plot(ax=ax, legend=False, figsize=(15, 10))
    plt.title(category)
    plt.xlim(0,50)
plt.tight_layout()



In [63]:
def PB_mask(pb):
    if pb == 0:
        return "No"
    elif pb == 1:
        return "Yes"
    else:
        return "Missing"

In [65]:
cns_all["PB Involv"] = cns_all["PB Involv"].apply(PB_mask)

In [98]:
categories = cns_all['PB Involv'].unique()

for i,category in enumerate(categories):
    ax = plt.subplot(2,3,i+1)
    ix = cns_all['PB Involv'] == category
    kmf.fit( T[ix], C[ix], label=category )
    kmf.plot(ax=ax, legend=False, figsize=(15, 10))
    plt.title(category)
    plt.xlim(0,50)
plt.tight_layout()



In [125]:
cns_pos = cns_all[cns_all['cns'] == 1]
cns_pos = cns_pos.reset_index()

In [126]:
column = 'Gender'
categories = cns_pos[column].unique()

for i,category in enumerate(categories):
    ax = plt.subplot(2,3,i+1)
    ix = cns_pos[column] == category
    kmf.fit( T[ix], C[ix], label=category )
    kmf.plot(ax=ax, legend=False, figsize=(15, 10))
    plt.title(category)
    plt.xlim(0,50)
plt.tight_layout()


---------------------------------------------------------------------------
IndexingError                             Traceback (most recent call last)
<ipython-input-126-4e2d34c36f2e> in <module>()
      5     ax = plt.subplot(2,3,i+1)
      6     ix = cns_pos[column] == category
----> 7     kmf.fit( T[ix], C[ix], label=category )
      8     kmf.plot(ax=ax, legend=False, figsize=(15, 10))
      9     plt.title(category)

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
    545 
    546         if _is_bool_indexer(key):
--> 547             key = _check_bool_indexer(self.index, key)
    548 
    549         return self._get_with(key)

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/indexing.pyc in _check_bool_indexer(ax, key)
   1637         mask = com.isnull(result.values)
   1638         if mask.any():
-> 1639             raise IndexingError('Unalignable boolean Series key provided')
   1640 
   1641         result = result.astype(bool).values

IndexingError: Unalignable boolean Series key provided

In [118]:
summary, p_value, test_results = logrank_test(T[cns_pos], T[~cns_pos], C[cns_pos], C[~cns_pos], alpha=.99 )
print summary


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-118-10664fe0fc4f> in <module>()
----> 1 summary, p_value, test_results = logrank_test(T[cns_pos], T[~cns_pos], C[cns_pos], C[~cns_pos], alpha=.99 )
      2 print summary

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
    547             key = _check_bool_indexer(self.index, key)
    548 
--> 549         return self._get_with(key)
    550 
    551     def _get_with(self, key):

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/series.pyc in _get_with(self, key)
    555             return self._get_values(indexer)
    556         elif isinstance(key, ABCDataFrame):
--> 557             raise TypeError('Indexing a Series with DataFrame is not supported, '\
    558                             'use the appropriate DataFrame column')
    559         else:

TypeError: Indexing a Series with DataFrame is not supported, use the appropriate DataFrame column

In [128]:
cns_pos.describe()


Out[128]:
index Age PS LDH >1 extranodal BM Involv B Symp cns Observed Duration
count 17.000000 17.000000 17 13.000000 17.000000 14.000000 17.000000 17 17 17.000000
mean 140.294118 0.352941 0 0.384615 0.294118 0.500000 0.411765 1 0.4705882 31.428824
std 70.354251 0.492592 0 0.506370 0.469668 0.518875 0.507300 0 0.5144958 26.081535
min 22.000000 0.000000 0 0.000000 0.000000 0.000000 0.000000 1 False 1.080000
25% 97.000000 0.000000 0 0.000000 0.000000 0.000000 0.000000 1 0 12.360000
50% 150.000000 0.000000 0 0.000000 0.000000 0.500000 0.000000 1 0 24.620000
75% 192.000000 1.000000 0 1.000000 1.000000 1.000000 1.000000 1 1 44.770000
max 255.000000 1.000000 0 1.000000 1.000000 1.000000 1.000000 1 True 93.330000

In [ ]: