Important! Make sure you have added your email and name here before proceeding further: https://tinyurl.com/y76vk384
In [2]:
# To support both python 2 and python 3
# from __future__ import division, print_function, unicode_literals
# NOTE(review): imports are scattered through this notebook (pandas, numpy,
# matplotlib, seaborn appear in later cells); consolidating them here would
# make a Restart-&-Run-All re-run easier to audit. `urllib` is only used by
# a commented-out download line below.
import os
from zipfile import ZipFile
from six.moves import urllib
import sys
# Record the interpreter version for reproducibility.
print(sys.version)
In [3]:
# Create the data directory (shell escape; on Windows this would need
# os.makedirs instead of `mkdir -p`).
!mkdir -p ../data
# All raw and derived data files live under ../data relative to this notebook.
TADPOLE_PATH = os.path.join("..", "data")
Next, manually download the zip file from: https://ida.loni.usc.edu/pages/access/studyData.jsp?categoryId=43&subCategoryId=94 and place it in the data folder.
In [4]:
def fetch_tadpole_data(tadpole_path=TADPOLE_PATH):
    """Extract the TADPOLE challenge zip archive into `tadpole_path`.

    The archive must have been downloaded manually (ADNI credentials are
    required) and placed at `<tadpole_path>/tadpole_challenge.zip`.

    Args:
        tadpole_path: folder that holds the zip and receives the extracted files.

    Raises:
        ValueError: if the expected zip file is not present.
    """
    if not os.path.isdir(tadpole_path):
        os.makedirs(tadpole_path)
    zip_path = os.path.join(tadpole_path, "tadpole_challenge.zip")
    if not os.path.isfile(zip_path):
        # BUG FIX: report the folder actually searched (the parameter),
        # not the module-level default TADPOLE_PATH.
        raise ValueError("please move the downloaded zipfile to %s folder" % tadpole_path)
    print("extracting from %s" % zip_path)
    # urllib.request.urlretrieve(tadpole_url, zip_path)
    with ZipFile(zip_path) as tadpole_zip:
        # The `with` block closes the archive on exit; the explicit
        # close() call the original made here was redundant.
        tadpole_zip.extractall(path=tadpole_path)
fetch_tadpole_data()
In [5]:
# NOTE(review): star import pollutes the namespace — prefer
# `from makeLeaderboardDataset import generateLBdatasets` so readers can
# see where the name comes from.
from makeLeaderboardDataset import *
import pandas as pd
# Presumably writes the TADPOLE_LB*.csv files consumed below into ../data/
# — confirm against makeLeaderboardDataset.py.
generateLBdatasets(inputFolder='../data/', outputFolder='../data/')
This training dataset contains medical data, including cognitive test scores, MRI, PET and CSF measures, and risk factors.
LB2 is a subset of LB1: it lists the subjects whose outcomes must be predicted in the final submission.
See the GitHub README file ["https://github.com/swhustla/pycon2017-alzheimers-hack/blob/master/README.md"] for more information and explanations of the data sources.
In [6]:
def load_tadpole_data(tadpole_path=TADPOLE_PATH):
    """Read the combined LB1/LB2 spreadsheet from `tadpole_path` into a DataFrame."""
    return pd.read_csv(os.path.join(tadpole_path, "TADPOLE_LB1_LB2.csv"))
tadpole_lb1_lb2 = load_tadpole_data()
In [7]:
# Quick look at the first rows of the merged LB1/LB2 table.
tadpole_lb1_lb2.head()
Out[7]:
In [8]:
# The table has many columns; show only the first 30 names.
print(list(tadpole_lb1_lb2.columns)[:30])
In [9]:
# Dtypes/non-null counts, then summary statistics of the numeric columns.
print(tadpole_lb1_lb2.info())
tadpole_lb1_lb2.describe()
Out[9]:
In [10]:
# Distribution of the diagnosis label DX.
tadpole_lb1_lb2["DX"].value_counts()
Out[10]:
In [11]:
# Distribution of DX_bl — presumably the diagnosis at baseline; confirm
# against the ADNI/TADPOLE data dictionary.
tadpole_lb1_lb2["DX_bl"].value_counts()
Out[11]:
In [12]:
# Visit codes — presumably baseline/month-offset identifiers; confirm
# against the ADNI documentation.
tadpole_lb1_lb2["VISCODE"].value_counts()
Out[12]:
In [13]:
%matplotlib inline
# NOTE(review): plotting imports belong in the notebook's top import cell.
import matplotlib.pyplot as plt
import seaborn as sns
# Histogram every numeric column to eyeball ranges, skew and outliers.
tadpole_lb1_lb2.hist(bins=50, figsize=(20,15))
plt.show()
In [14]:
# Pairwise linear correlations between all numeric columns.
corr_matrix = tadpole_lb1_lb2.corr()
In [15]:
# Columns most (and least) correlated with ADAS13 — one of the three
# prediction targets declared below.
correlations_with_ADAS13 = corr_matrix['ADAS13'].sort_values(ascending=False)
print(correlations_with_ADAS13[:10], correlations_with_ADAS13[-10:])
In [16]:
# The three quantities the challenge asks us to forecast.
prediction_variables = ["ADAS13", "DX", "Ventricles"]
cog_tests_attributes = ["CDRSB", "ADAS11", "MMSE", "RAVLT_immediate"]
# NOTE(review): "FDG" and "AV45" are repeated in pet_measures below;
# keeping them in mri_measures too duplicates those columns when the
# lists are concatenated later (useful_numerical_attribs).
mri_measures = ['Hippocampus', 'WholeBrain', 'Entorhinal', 'MidTemp' , "FDG", "AV45"]
pet_measures = ["FDG", "AV45"]
csf_measures = ["ABETA_UPENNBIOMK9_04_19_17", "TAU_UPENNBIOMK9_04_19_17", "PTAU_UPENNBIOMK9_04_19_17"]
risk_factors = ["APOE4", "AGE"]
In [17]:
from pandas.plotting import scatter_matrix
# NOTE(review): the four near-identical cells below would be cleaner as a
# single helper, e.g. plot_against_targets(attribs, title). Also,
# plt.title() only titles the last subplot of the scatter matrix, not the
# whole figure (plt.suptitle would label the figure).
scatter_matrix(tadpole_lb1_lb2[prediction_variables+cog_tests_attributes], figsize=(12,8), alpha=0.1)
plt.title("cog_tests_attributes")
plt.show()
In [18]:
scatter_matrix(tadpole_lb1_lb2[prediction_variables+mri_measures], figsize=(12,8), alpha=0.1)
plt.title("mri_measures")
plt.show()
In [19]:
scatter_matrix(tadpole_lb1_lb2[prediction_variables+pet_measures], figsize=(12,8), alpha=0.1)
plt.title("pet_measures")
plt.show()
In [20]:
scatter_matrix(tadpole_lb1_lb2[prediction_variables+risk_factors], alpha=0.1, figsize=(12,8))
plt.title("risk_factors")
plt.show()
In [21]:
# Subjects with the most visits (RID is the per-subject identifier).
tadpole_lb1_lb2.RID.value_counts()[:5]
Out[21]:
In [22]:
# Parse exam dates so per-visit time offsets can be computed.
tadpole_lb1_lb2.EXAMDATE = pd.to_datetime(tadpole_lb1_lb2.EXAMDATE)
In [23]:
# Per subject: age at each exam = baseline AGE + years elapsed since that
# subject's first exam (365.25 days/year accounts for leap years).
tadpole_grouped = tadpole_lb1_lb2.groupby("RID").apply(lambda x:(x["EXAMDATE"]-x["EXAMDATE"].min()).dt.days/365.25 + x["AGE"].min())
In [24]:
tadpole_grouped.sort_index(inplace=True)
In [25]:
tadpole_grouped.values
Out[25]:
In [26]:
tadpole_lb1_lb2.sort_values(by=["RID", "EXAMDATE"], inplace=True)
In [27]:
# NOTE(review): this pastes the grouped result back by *position*, relying
# on the sort orders of tadpole_grouped (by RID, then original row index)
# and tadpole_lb1_lb2 (by RID, then EXAMDATE) lining up row-for-row.
# Fragile — confirm, or merge on the (RID, index) MultiIndex instead.
tadpole_lb1_lb2["AGE_AT_EXAM"] = tadpole_grouped.values
In [28]:
# Longitudinal ADAS13 trajectory for a single example subject.
tadpole_lb1_lb2[tadpole_lb1_lb2.RID==259].plot(kind="scatter", x="AGE_AT_EXAM", y="ADAS13")
plt.show()
In [29]:
# Sanity-check the derived ages for a slice of high-RID subjects.
tadpole_lb1_lb2[tadpole_lb1_lb2['RID'] > 5000].plot(kind="scatter", x="RID", y="AGE_AT_EXAM")
Out[29]:
In [30]:
# Integer age bucket used for the group-by plots below.
tadpole_lb1_lb2['AGE_INT'] = tadpole_lb1_lb2['AGE_AT_EXAM'].apply(int)
In [31]:
# Number of non-null ADAS13 measurements per integer age.
tadpole_lb1_lb2[tadpole_lb1_lb2['ADAS13'].notnull()]\
.groupby('AGE_INT')['ADAS13']\
.count().plot()
Out[31]:
In [32]:
# Mean ADAS13 score per integer age.
tadpole_lb1_lb2[tadpole_lb1_lb2['ADAS13'].notnull()]\
.groupby('AGE_INT')['ADAS13']\
.mean().plot()
Out[32]:
In [33]:
# categorical example
tadpole_lb1_lb2["DX"].value_counts()
Out[33]:
In [34]:
# Prediction targets: two numeric regression targets and the categorical
# diagnosis label.
y_num_cols = ["ADAS13", "Ventricles"]
y_cat_cols = ["DX"]
In [35]:
# NOTE(review): these five lists duplicate the definitions from the earlier
# exploration section — a re-run-safe notebook should define them once.
cog_tests_attributes = ["CDRSB", "ADAS11", "MMSE", "RAVLT_immediate"]
mri_measures = ['Hippocampus', 'WholeBrain', 'Entorhinal', 'MidTemp' , "FDG", "AV45"]
pet_measures = ["FDG", "AV45"]
csf_measures = ["ABETA_UPENNBIOMK9_04_19_17", "TAU_UPENNBIOMK9_04_19_17", "PTAU_UPENNBIOMK9_04_19_17"]
risk_factors = ["APOE4", "AGE"]
In [36]:
def convert_float(val):
    """Coerce `val` to float, mapping unconvertible values to NaN.

    Used below to clean CSF biomarker columns that mix numbers with
    non-numeric placeholder entries.

    Args:
        val: anything `float()` accepts, or a junk value.

    Returns:
        float(val), or NaN when conversion fails.
    """
    try:
        return float(val)
    except (ValueError, TypeError):
        # ValueError: non-numeric string; TypeError: None or other
        # non-castable objects. The original caught only ValueError, so a
        # None in the column crashed the map; it also returned np.nan even
        # though numpy is never imported in this notebook — float("nan")
        # is the same value with no extra dependency.
        return float("nan")
In [37]:
# Clean the CSF biomarker columns: coerce mixed string/number entries to floats.
for col in csf_measures:
    tadpole_lb1_lb2[col] = tadpole_lb1_lb2[col].map(convert_float)
In [38]:
# Final feature lists fed to the preprocessing pipelines below.
# NOTE(review): mri_measures already contains "FDG"/"AV45", so concatenating
# it with pet_measures duplicates those two columns in the feature matrix.
useful_numerical_attribs = cog_tests_attributes + mri_measures + pet_measures + csf_measures + ["AGE_AT_EXAM", 'AGE']
useful_numerical_attribs
Out[38]:
In [39]:
tadpole_lb1_lb2.columns[:20]
Out[39]:
In [40]:
useful_categorical_attribs = ['RID', 'SITE', 'DXCHANGE', 'PTGENDER',
'PTEDUCAT', 'PTETHCAT', 'PTRACCAT', 'PTMARRY', 'APOE4']
useful_categorical_attribs
Out[40]:
In [41]:
# Cast the categorical columns to strings for label binarization.
# NOTE(review): astype(str) turns missing values into the literal string
# "nan", which then becomes its own category — confirm that is intended.
tadpole_lb1_lb2[useful_categorical_attribs] = tadpole_lb1_lb2[useful_categorical_attribs].astype(str)
In [42]:
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified 80/20 split so train and test share the same
# marital-status / gender / ethnicity mix.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index \
        in split.split(tadpole_lb1_lb2, tadpole_lb1_lb2[["PTMARRY", "PTGENDER","PTETHCAT"]]):
    # BUG FIX: split() yields *positional* indices, and the frame was
    # sorted by RID/EXAMDATE above, so its labels are no longer in
    # positional order. .loc selected rows by label — a different subset
    # than the splitter chose, destroying the stratification. .iloc picks
    # the intended rows.
    strat_train_set = tadpole_lb1_lb2.iloc[train_index]
    strat_test_set = tadpole_lb1_lb2.iloc[test_index]
From this point on, work with the training set only.
In [43]:
tadpole = strat_train_set.copy()
print(strat_train_set.AGE)
In [44]:
print(tadpole.head())
print(tadpole_lb1_lb2.head())
print(strat_train_set.keys()[:10])
tadpole = strat_train_set.drop(y_num_cols + y_num_cols, axis=1)
#tadpole_labels_categorical = strat_train_set[useful_categorical_attribs].copy()
#'AGE_AT_EXAM' in strat_train_set.keys()
In [45]:
tadpole = strat_train_set.drop(y_num_cols + y_num_cols, axis=1)
tadpole_labels_categorical = strat_train_set[useful_categorical_attribs].copy()
tadpole_labels_continuous = strat_train_set[useful_numerical_attribs].copy()
In [ ]:
In [46]:
from sklearn.pipeline import Pipeline
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn
# 0.22 (use sklearn.impute.SimpleImputer instead), so this cell pins the
# notebook to an old scikit-learn version.
from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer
In [47]:
class LabelBinarizerPipelineFriendly(LabelBinarizer):
    """LabelBinarizer with the (X, y) signatures that sklearn Pipeline expects.

    Pipeline calls transformers as fit(X, y) / transform(X); this subclass
    adapts LabelBinarizer to accept (and ignore) the extra argument.
    """
    def fit(self, X, y=None):
        """Fit the binarizer on X (y is ignored) and return self."""
        super(LabelBinarizerPipelineFriendly, self).fit(X)
        # BUG FIX: fit() must return self (sklearn estimator contract);
        # the original returned None, breaking `est.fit(X).transform(X)`.
        return self
    def transform(self, X, y=None):
        """Binarize X; y is accepted for Pipeline compatibility only."""
        return super(LabelBinarizerPipelineFriendly, self).transform(X)
    def fit_transform(self, X, y=None):
        """Fit on X, then return the binarized X."""
        return super(LabelBinarizerPipelineFriendly, self).fit(X).transform(X)
In [48]:
from sklearn.base import BaseEstimator, TransformerMixin
# Pipeline step that selects a fixed list of DataFrame columns and returns
# them as a plain numpy array (bridges pandas frames into sklearn pipelines).
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        # Columns to extract, in output order.
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self
    def transform(self, X):
        return X[self.attribute_names].values
In [49]:
# Numeric branch: select the numeric features, median-impute missing
# values, then standardize to zero mean / unit variance.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(useful_numerical_attribs)),
    ('imputer', Imputer(strategy="median")),
    # ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
# Categorical branch: one-hot encode APOE4 (cast to str earlier).
# NOTE(review): the other useful_categorical_attribs are never fed to the
# pipeline — confirm that only APOE4 is intended here.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(['APOE4'])),
    ('label_binarizer', LabelBinarizerPipelineFriendly()),
])
In [50]:
from sklearn.pipeline import FeatureUnion
# Concatenate the numeric and categorical branches column-wise into a
# single feature matrix.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
In [51]:
# Fit the preprocessing on the training features and produce the
# model-ready array.
tadpole_prepared = full_pipeline.fit_transform(tadpole)
In [52]:
tadpole_prepared
Out[52]:
Next, generate a simple forecast from the training data and save it as TADPOLE_Submission_Pycon_TeamName1.csv.
In [53]:
# Run the baseline forecasting script shipped with the repo; it writes
# TADPOLE_Submission_Pycon_TeamName1.csv into ../data/ (renamed below).
!python3 TADPOLE_SimpleForecast1.py
Replace TeamName1 with your team name and a submission index (no underscores allowed), e.g. TADPOLE_Submission_Pycon_TeamAwesome3.csv.
In [54]:
team_name = "TeamFrank1" ## add your own team name here
In [55]:
import os
oldFile = '../data/TADPOLE_Submission_Pycon_TeamName1.csv'
newFile = '../data/TADPOLE_Submission_Pycon_%s.csv' % team_name
os.system('mv %s %s' % (oldFile, newFile))
Out[55]:
Evaluate the forecasts from your renamed submission file against TADPOLE_LB4_dummy.csv (the held-out dataset) using the evaluation function.
In [59]:
# Score the forecast against the dummy held-out set (LB4) with the official
# evaluation script.
# NOTE(review): os.system sends output to the terminal that launched
# jupyter, not the notebook — subprocess.run(..., capture_output=True)
# would surface it here. Also treat shell commands built from interpolated
# strings with care if any component is untrusted.
cmd = 'python3 evalOneSubmission.py --leaderboard --d4File %s --forecastFile %s' % ("../data/TADPOLE_LB4_dummy.csv", newFile)
print(cmd)
os.system(cmd)
# check the console where you launched jupyter from, it should show the outputs.
# Otherwise, run the command from the command line
Out[59]:
In [57]:
# Submit (renamed version of) TADPOLE_Submission_Leaderboard_TeamName1.csv to TADPOLE website via the Submit page
In [ ]: