In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import sys
sys.path.append('/home/will/PatientPicker/')
import LoadingTools

In [2]:
# Load the REDCap visit records and order them by patient, then by ascending
# visit date, so each patient's rows are contiguous and date-ordered.
redcap_data = LoadingTools.load_redcap_data().sort(['Patient ID', 'Date Of Visit'])

# Baseline date per patient: the first 'Date Of Visit' value within each
# patient's group, taken after the sort above.
visits_by_patient = redcap_data.groupby('Patient ID')
baseline_date = visits_by_patient['Date Of Visit'].first()

In [3]:
def safe_days(inval):
    """Return ``inval.days``, or NaN when ``inval`` has no ``days`` attribute.

    Intended for mapping over the result of a date subtraction, where some
    entries may be missing values rather than timedelta-like objects.

    Parameters
    ----------
    inval : object
        Any value; timedelta-like objects expose a ``days`` attribute.

    Returns
    -------
    The value of ``inval.days`` when present, otherwise ``np.nan``.
    """
    # getattr with a default only covers a missing attribute, so unrelated
    # errors (and KeyboardInterrupt) are no longer silently swallowed the way
    # a bare ``except:`` would.
    return getattr(inval, 'days', np.nan)
    
# Patient ID -> baseline date lookup table.
baseline_dict = baseline_date.to_dict()
# Vectorised dict lookup on the ID column instead of a row-wise
# DataFrame.apply; an ID absent from the dict yields NaN rather than KeyError.
redcap_data['BaseLineDate'] = redcap_data['Patient ID'].map(baseline_dict)

# Elapsed days from the baseline visit and from the seropositive date to each
# visit; safe_days yields NaN for entries that have no ``days`` attribute.
redcap_data['DaysSinceBaseline'] = (redcap_data['Date Of Visit'] - redcap_data['BaseLineDate']).map(safe_days)
redcap_data['DaysSeropositive'] = (redcap_data['Date Of Visit'] - redcap_data['HIV Seropositive Date']).map(safe_days)

# True only where Gender is exactly 'Male'; every other value compares False.
# NOTE(review): a missing Gender therefore becomes False, not missing - confirm
# that is acceptable for the downstream dropna().
redcap_data['IsMale'] = redcap_data['Gender'] == 'Male'

# NOTE(review): the /2.2 looks like a pounds -> kilograms conversion, which
# would require Height in metres for a standard BMI - units are not visible
# here, TODO confirm.
redcap_data['BMI'] = (redcap_data['Weight']/2.2)/redcap_data['Height']**2

In [4]:
# Columns for which a visit-to-visit change is computed.
delta_cols = ['DaysSinceBaseline', 'TMHDS', 'Constructional Score', 'Memory Recall Score', 'Psychomotor Speed Score']

# Within each patient, subtract the previous row's value from the current one.
# The first row of every patient (and any pair involving a missing value) is
# NaN. ``diff()`` is the built-in form of a 2-wide rolling ``np.diff`` and does
# not depend on the module-level ``pd.rolling_apply``, which later pandas
# releases no longer provide.
# Row order within a patient is whatever order redcap_data currently has.
delta_scores = redcap_data.groupby('Patient ID')[delta_cols].transform(lambda x: x.diff())

# Prefix the differenced columns with 'Delta ' so they can sit next to the
# original columns without name clashes.
rename_dict = {col: 'Delta ' + col for col in delta_cols}
delta_scores = delta_scores.rename(columns=rename_dict)

In [5]:
# Place the 'Delta ...' columns next to the original visit columns; concat
# along axis=1 lines the two frames up on their row index.
train_redcap = pd.concat(
    [redcap_data, delta_scores],
    axis=1,
)

In [6]:
# Display the combined frame as the cell output to check its size and dtypes.
train_redcap


Out[6]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1419 entries, 0 to 1418
Columns: 440 entries, Patient ID to Delta Psychomotor Speed Score
dtypes: bool(173), float64(157), object(110)

In [7]:
def _columns_with_prefix(prefix):
    """List the redcap_data column names that begin with ``prefix``."""
    return [name for name in redcap_data.columns if name.startswith(prefix)]


# Column groups, selected by naming prefix or listed explicitly.
test_cols = _columns_with_prefix('Test-')
admit_cols = _columns_with_prefix('Admit-')
clinic_cols = [
    'Latest CD4 count (cells/uL)',
    'Latest CD8 count (cells/uL)',
    'Latest viral load',
]
pat_cols = ['Age', 'IsMale', 'BMI']

# Names of the visit-to-visit change columns.
pred_cols = [
    'Delta TMHDS',
    'Delta Constructional Score',
    'Delta Memory Recall Score',
    'Delta Psychomotor Speed Score',
]

In [25]:
# Modelling table: the target, the time gap between visits, and the patient
# and 'Admit-' columns, keeping only rows with no missing value in any of them.
model_cols = ['Delta TMHDS', 'Delta DaysSinceBaseline'] + pat_cols + admit_cols
tdata = train_redcap[model_cols].dropna(axis=0)

# Target vector: the 'Delta TMHDS' column as floats.
y = tdata['Delta TMHDS'].values.astype(float)

# Feature matrix: every remaining column, cast to float.
X = tdata.drop(['Delta TMHDS'], axis=1).values.astype(float)

In [23]:


In [28]:
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import Bootstrap
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.ensemble import AdaBoostRegressor
from sklearn.dummy import DummyRegressor

# Baseline pipeline: normalisation followed by a DummyRegressor with default
# settings, giving a reference error level for real regressors to beat.
# NOTE(review): Normalizer rescales each sample (row) to unit norm; if
# per-feature scaling was intended, a different preprocessor is needed - confirm.
pipe = Pipeline(steps=[('Norm', Normalizer()),
                       ('Regress', DummyRegressor())])

# Mean absolute error over 10 bootstrap splits, training on 60% of the rows.
# make_scorer is used with its defaults, so the values are raw MAE (lower is
# better). A fixed random_state makes the bootstrap splits, and therefore the
# scores, reproducible across kernel restarts.
cross_val_score(pipe, X, y, scoring=make_scorer(mean_absolute_error),
                cv=Bootstrap(len(y), n_iter=10, train_size=0.6, random_state=0),
                verbose=2)


score: 1.576474
score: 1.667163
score: 1.566930
score: 1.548607
score: 1.384625
score: 1.971408
score: 1.299009
score: 1.378101
score: 1.537975
score: 1.487758
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
Out[28]:
array([ 1.5764739 ,  1.66716334,  1.56692977,  1.54860664,  1.38462468,
        1.97140786,  1.29900934,  1.37810084,  1.53797519,  1.48775773])

In [ ]: