In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import sys
sys.path.append('/home/will/PatientPicker/')
import LoadingTools
In [2]:
# Load the REDCap export and order it by patient and then by visit date
# (ascending), so rows of the same patient appear in visit order.
# NOTE(review): `DataFrame.sort` only exists in old pandas releases; newer
# releases spell this `sort_values` -- keep in mind when upgrading.
redcap_data = LoadingTools.load_redcap_data().sort(['Patient ID', 'Date Of Visit'])

# Per patient, the first non-null visit date in the sorted frame; this is
# used further down as each patient's baseline date.
baseline_date = redcap_data.groupby('Patient ID')['Date Of Visit'].first()
In [3]:
def safe_days(inval):
    """Return ``inval.days``, or NaN when the value has no ``days`` attribute.

    Parameters
    ----------
    inval : object
        Usually a timedelta-like object, but any value is accepted.

    Returns
    -------
    Whatever ``inval.days`` evaluates to, or ``np.nan`` if ``inval`` does not
    expose a ``days`` attribute (e.g. ``None`` or a float).
    """
    try:
        return inval.days
    except AttributeError:
        # Only the "no such attribute" case is treated as missing data; any
        # other exception (including KeyboardInterrupt) is allowed to
        # propagate instead of being silently turned into NaN.
        return np.nan
# Look-up table: patient ID -> that patient's baseline visit date.
baseline_dict = baseline_date.to_dict()

# Attach the baseline date to every visit row. `Series.map` performs the
# dictionary look-up column-wise, which avoids building a row object per
# visit (as `apply(axis=1)` does) and yields a missing value instead of a
# KeyError for a patient ID that is absent from `baseline_dict`.
redcap_data['BaseLineDate'] = redcap_data['Patient ID'].map(baseline_dict)

# Elapsed time, expressed in days, from the baseline visit and from the
# recorded seropositive date to each visit. `safe_days` converts entries
# that have no `.days` attribute into NaN.
redcap_data['DaysSinceBaseline'] = (redcap_data['Date Of Visit'] - redcap_data['BaseLineDate']).map(safe_days)
redcap_data['DaysSeropositive'] = (redcap_data['Date Of Visit'] - redcap_data['HIV Seropositive Date']).map(safe_days)

# Boolean flag: True where Gender equals the string 'Male'.
redcap_data['IsMale'] = redcap_data['Gender'] == 'Male'

# NOTE(review): dividing by 2.2 looks like a pounds -> kilograms conversion,
# in which case Height must be in metres for this to be a BMI -- TODO confirm
# the units of both columns.
redcap_data['BMI'] = (redcap_data['Weight']/2.2)/redcap_data['Height']**2
In [4]:
# Columns for which the visit-to-visit change is computed.
delta_cols = ['DaysSinceBaseline', 'TMHDS', 'Constructional Score', 'Memory Recall Score', 'Psychomotor Speed Score']

# Within each patient, subtract the previous row's value from the current
# one. The first row of every patient (and any row adjacent to a missing
# value) becomes NaN. `diff()` gives the same numbers as a rolling window of
# two fed to `np.diff`, without relying on the module-level
# `pd.rolling_apply` helper, which is not available in newer pandas.
delta_scores = (redcap_data.groupby('Patient ID')[delta_cols]
                .transform(lambda x: x.diff()))

# Prefix the differenced columns so they can sit next to the originals.
rename_dict = {col: 'Delta ' + col for col in delta_cols}
delta_scores = delta_scores.rename(columns=rename_dict)
In [5]:
# Place the 'Delta ...' columns side by side with the original visit data;
# rows are aligned on the index.
train_redcap = pd.concat(
    [redcap_data, delta_scores],
    axis=1,
)
In [6]:
# Preview only the first rows: rendering the whole frame bloats the saved
# notebook and exposes row-level patient records in its output.
train_redcap.head()
Out[6]:
In [7]:
def _columns_with_prefix(prefix):
    """Return the `redcap_data` column names starting with `prefix`, in column order."""
    return [col for col in redcap_data.columns if col.startswith(prefix)]


# Column groups selected by name prefix.
test_cols = _columns_with_prefix('Test-')
admit_cols = _columns_with_prefix('Admit-')

# Clinical lab measurements.
clinic_cols = [
    'Latest CD4 count (cells/uL)',
    'Latest CD8 count (cells/uL)',
    'Latest viral load',
]

# Patient-level covariates.
pat_cols = ['Age', 'IsMale', 'BMI']

# Visit-to-visit score changes that can serve as prediction targets.
pred_cols = [
    'Delta TMHDS',
    'Delta Constructional Score',
    'Delta Memory Recall Score',
    'Delta Psychomotor Speed Score',
]
In [25]:
# Target column plus the predictors: time between visits, patient
# covariates and the 'Admit-' columns.
target_col = 'Delta TMHDS'
model_cols = [target_col, 'Delta DaysSinceBaseline'] + pat_cols + admit_cols

# Keep only rows in which every selected column is present.
tdata = train_redcap[model_cols].dropna(axis=0)

# Plain float arrays: `y` is the target, `X` holds every remaining column.
y = tdata[target_col].values.astype(float)
X = tdata.drop([target_col], axis=1).values.astype(float)
In [23]:
In [28]:
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import Bootstrap
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.ensemble import AdaBoostRegressor
from sklearn.dummy import DummyRegressor
# Baseline pipeline: a normalisation step followed by a DummyRegressor,
# giving a reference error that real models have to beat.
# NOTE(review): `Normalizer` rescales each sample (row), not each feature
# (column); if per-feature scaling was intended a scaler such as
# StandardScaler is the usual choice -- confirm.
pipe = Pipeline(steps=[('Norm', Normalizer()),
                       ('Regress', DummyRegressor())])

# Mean absolute error over 10 bootstrap resamples, each training on 60% of
# the rows. `random_state` is fixed so that re-running the notebook draws the
# same resamples and reproduces the same scores.
cross_val_score(pipe, X, y, scoring=make_scorer(mean_absolute_error),
                cv=Bootstrap(len(y), n_iter=10, train_size=0.6, random_state=0),
                verbose=2)
Out[28]:
In [ ]: