In [1]:
import os
import statistics

import nibabel as nb
import pandas as pd
from natsort import natsorted
from tqdm import tqdm

# Open each image in `subdir` and construct per-image intensity features
def get_data(subdir):
    rows = []
    for filename in tqdm(natsorted(os.listdir(subdir))):
        # Load the first volume of the NIfTI image
        img = nb.load(os.path.join(subdir, filename)).get_fdata()[..., 0]
        # Keep only foreground (non-zero) voxels as a 1-D intensity sample
        hist = img[img > 0].flatten()
        rows.append({'mean': hist.mean(),
                     'median': statistics.median(hist),
                     'stdev': statistics.stdev(hist)})
    return pd.DataFrame(rows, columns=['mean', 'median', 'stdev'])
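Note that get_data is defined but never called in this notebook; the CSVs loaded in the next cell were presumably produced offline. A minimal sketch of what that step might look like, assuming hypothetical ../data/set_train/ and ../data/set_test/ image directories (the loaded tables also carry a gm column that get_data does not compute, so an extra feature-extraction step is implied):

# Hypothetical offline step: extract features once and cache them as CSV
get_data('../data/set_train/').to_csv('../data/train_data.csv')
get_data('../data/set_test/').to_csv('../data/test_data.csv')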
In [2]:
# Load the precomputed feature tables (gm and age were added outside get_data)
train_features = pd.read_csv('../data/train_data.csv', index_col=0)[["mean", "median", "gm", "age"]]
test_features = pd.read_csv('../data/test_data.csv', index_col=0)[["mean", "median", "gm"]]
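A quick optional sanity check that the expected columns and row counts came through:

print(train_features.shape, test_features.shape)
print(train_features.head())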
In [3]:
# Show pairwise relations between features
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb
sb.pairplot(train_features, hue='age', palette='Blues')
Out[3]:
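Since age is continuous, hue='age' gives one legend entry (and one color) per distinct age, which can be hard to read. A minimal sketch that bins age into decades first; the age_bin name and the 10-100 year range are assumptions:

# Hypothetical: bin continuous age into decade labels for a cleaner hue
train_features['age_bin'] = pd.cut(train_features['age'],
                                   bins=list(range(10, 101, 10)),
                                   labels=['%ds' % lo for lo in range(10, 100, 10)])
sb.pairplot(train_features.drop('age', axis=1), hue='age_bin', palette='Blues')
train_features = train_features.drop('age_bin', axis=1)  # remove before modeling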
In [5]:
# Fit an order-2 polynomial regression of each feature against age
order = 2
sb.lmplot(y='mean', x='age', data=train_features, order=order)
sb.lmplot(y='median', x='age', data=train_features, order=order)
sb.lmplot(y='gm', x='age', data=train_features, order=order)
Out[5]:
In [6]:
# Alternative: a single hold-out split instead of K-fold CV
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

best_estimator_name, best_estimator, best_order, min_error = '', None, 0, float('inf')

# Grid-search polynomial order and regularization strength with 5-fold CV
X = train_features.drop("age", axis=1).values
y = train_features["age"].values
kf = KFold(n_splits=5)
folds = list(kf.split(X))

for order in range(1, 6):
    for alpha in range(50, 2001, 50):
        estimators = [
            # LinearRegression ignores alpha, so it is re-evaluated redundantly
            ('LinearRegression', LinearRegression()),
            ('Ridge(alpha=%d)' % alpha, Ridge(alpha=alpha)),
            ('Lasso(alpha=%d)' % alpha, Lasso(alpha=alpha))
        ]
        for estimator_name, estimator in estimators:
            model = make_pipeline(PolynomialFeatures(order), estimator)
            errors = []
            for train, test in folds:
                X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
                model.fit(X_train, y_train)
                predictions = model.predict(X_test)
                errors.append(mean_squared_error(y_test, predictions))
            errors_mean = sum(errors) / len(errors)
            print('%s [%d]: %f' % (estimator_name, order, errors_mean))
            if errors_mean < min_error:
                best_estimator_name = estimator_name
                best_estimator = estimator
                min_error = errors_mean
                best_order = order

print("\nBest result:")
print(best_estimator_name, best_order, min_error)
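The same sweep can be written more compactly with scikit-learn's built-in GridSearchCV; a minimal sketch, restricted to Ridge for brevity (scoring uses negated MSE, so the sign is flipped back when printing):

from sklearn.model_selection import GridSearchCV

pipe = make_pipeline(PolynomialFeatures(), Ridge())
param_grid = {
    'polynomialfeatures__degree': list(range(1, 6)),
    'ridge__alpha': list(range(50, 2001, 50)),
}
search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error')
search.fit(X, y)
print(search.best_params_, -search.best_score_)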
In [7]:
# Chosen manually; this overrides whatever the search above selected
best_order = 3
best_estimator = Ridge(alpha=1000)
best_estimator_name = "Ridge"
best_model = make_pipeline(PolynomialFeatures(best_order), best_estimator)
best_model.fit(X, y)
Out[7]:
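To inspect what the fitted pipeline learned, the steps created by make_pipeline can be addressed by their lower-cased class names; a minimal sketch (get_feature_names_out requires scikit-learn >= 1.0; older versions use get_feature_names):

poly = best_model.named_steps['polynomialfeatures']
ridge = best_model.named_steps['ridge']
print(poly.get_feature_names_out())   # expanded polynomial terms
print(ridge.coef_, ridge.intercept_)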
In [8]:
predictions = best_model.predict(test_features)
# Round to whole years and clamp to a plausible age range (18-96)
predictions = [min(max(int(round(p)), 18), 96) for p in predictions]
result = pd.DataFrame({'ID': range(1, len(predictions) + 1), 'Prediction': predictions})
result.to_csv('../data/result_%s-order-%d.csv' % (best_estimator_name, best_order), index=False)
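A quick read-back of the exported file to confirm the submission format, assuming the same relative ../data path:

submission = pd.read_csv('../data/result_%s-order-%d.csv' % (best_estimator_name, best_order))
print(submission.shape)   # expect one (ID, Prediction) row per test image
print(submission.head())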
In [10]:
print(predictions)