In [ ]:
%load_ext autoreload
%autoreload 2
import numpy as np
import sys
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
import pandas as pd
from os.path import join
from src.dump_results import dump

%matplotlib inline

from src.pre_process import process_level2, load_binary

In [ ]:
# Load the dataset through the project helper. `data` is later passed to
# process_level2() and indexed as data['users'] when dumping predictions.
data = load_binary()

Analysing the dataset


In [ ]:
# Read the tracking CSV; the cells below group it by `day_in_cycle` and count
# the `symptom` column.
df = pd.read_csv('data/tracking.csv')

In [ ]:
# df.columns = ['user_id', 'symptom', 'day_in_cycle', 'probability']

In [ ]:
# Number of non-null `symptom` entries per day of the cycle, kept as a
# one-column DataFrame indexed by `day_in_cycle`.
df2 = df.groupby('day_in_cycle')['symptom'].count().to_frame()

In [ ]:
# Plot the per-day symptom counts held in df2.
# NOTE(review): this cell was originally labelled "Happy", but df2 is built
# from all rows of the tracking table without filtering on a symptom — confirm
# whether a filter on 'happy' was intended.
df2.plot()

In [ ]:
import scipy as sp
def logloss(act, pred):
    """Vectorised mean binary cross-entropy (log loss).

    Parameters
    ----------
    act : array-like
        Ground-truth labels, expected to be 0/1 (or probabilities in [0, 1]).
    pred : array-like
        Predicted probabilities, same shape as ``act``.

    Returns
    -------
    float or numpy.ndarray
        ``-mean(act * log(pred) + (1 - act) * log(1 - pred))`` taken over the
        first axis. For 1-D input this is a scalar; for 2-D input it is one
        value per column.

    Raises
    ------
    ValueError
        If ``act`` is empty (the mean would be undefined).
    """
    act = np.asarray(act, dtype=float)
    pred = np.asarray(pred, dtype=float)

    n_samples = len(act)
    if n_samples == 0:
        raise ValueError("logloss is undefined for empty input")

    # Cap predictions away from exactly 0 and 1 so that log() stays finite
    # (same cap as the official Kaggle implementation,
    # per forums/t/1576/r-code-for-logloss).
    epsilon = 1e-15
    pred = np.clip(pred, epsilon, 1 - epsilon)

    # Sum over the sample axis only, so 2-D input yields one loss per column.
    total = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred), axis=0)
    return -total / n_samples

Experimenting with building a model


In [ ]:
# Extract features.
# process_level2 returns a mapping that is read here under the keys 'X', 'Y'
# and 'X_all'. The frames it returns are left untouched: every matrix below is
# derived as a new object, so re-running this cell (or anything else holding a
# reference to those frames) always sees the same inputs.
user_feat_matrix = process_level2(data)

# Feature matrix used for fitting: drop the identifier column and zero-fill
# NaNs. `.copy()` guarantees a writable array that does not alias the frame.
X = user_feat_matrix['X'].drop('user_id', axis=1).values.copy()
X[np.isnan(X)] = 0

# Targets, with missing values treated as 0.
Y = user_feat_matrix['Y'].fillna(0)

# Second feature matrix, prepared the same way as X; it is the one handed to
# dump() later in the notebook.
# NOTE(review): presumably covers all users rather than only the training
# ones — confirm against process_level2.
X_all = user_feat_matrix['X_all'].drop('user_id', axis=1).values.copy()
X_all[np.isnan(X_all)] = 0

# Column labels of Y; later cells select targets with `x[1] == symptom`, so
# each label is expected to be indexable with the symptom name at position 1.
cols = list(Y.columns.values)
symptoms = ['happy', 'pms', 'sad', 'sensitive_emotion', 'energized', 'exhausted',
            'high_energy', 'low_energy', 'cramps', 'headache', 'ovulation_pain',
            'tender_breasts', 'acne_skin', 'good_skin', 'oily_skin', 'dry_skin']

In [ ]:
from sklearn.model_selection import GridSearchCV
symptoms = ['cramps']
for symptom in symptoms:
    print(symptom)
    # Keep only the target columns whose label has this symptom at position 1.
    s_Y = Y[[x for x in cols if x[1] == symptom]]
    print("Lasso")
    pipeline = Pipeline([
        ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
        ('standard_scale', StandardScaler()),
        ('estimator', Lasso()),
    ])

    # Single-point "grid": this only cross-validates alpha=0.5 with 2 folds.
    model = GridSearchCV(pipeline,
                         {'estimator__alpha': [0.5]},
                         verbose=10,
                         cv=2
             )

    model.fit(X, s_Y.values)

    # A bare expression inside a loop body is never displayed by the notebook,
    # so the score has to be printed explicitly.
    print("best CV score: {}".format(model.best_score_))

In [ ]:
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
symptoms = ['cramps']
for symptom in symptoms:
    print(symptom)
    # Keep only the target columns whose label has this symptom at position 1.
    s_Y = Y[[x for x in cols if x[1] == symptom]]
    pipeline = Pipeline([
        ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
        ('standard_scale', StandardScaler()),
        ('estimator', DecisionTreeRegressor(max_depth=5))
    ])

    # `None` means "consider all features", which is what 'auto' meant for
    # DecisionTreeRegressor; 'auto' itself is deprecated and rejected by
    # recent scikit-learn releases.
    param_grid = {'estimator__max_depth': [3, 5, 7],
                  'estimator__max_features': [None, 'sqrt', 'log2']}
    model = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=4,
                         verbose=2)
    model.fit(X, s_Y.values)

In [ ]:
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# Cross-validate a scaled Lasso over a small range of alphas for 'cramps'.
symptoms = ['cramps']
for symptom in symptoms:
    print(symptom)

    # Target columns whose label carries this symptom at position 1.
    s_Y = Y[[label for label in cols if label[1] == symptom]]

    pipeline = Pipeline(steps=[
        ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
        ('standard_scale', StandardScaler()),
        ('estimator', Lasso()),
    ])

    param_grid = dict(estimator__alpha=[0.1, 0.3, 0.5, 0.7])
    model = GridSearchCV(estimator=pipeline,
                         param_grid=param_grid,
                         cv=2,
                         verbose=2)
    model.fit(X, s_Y.values)

In [ ]:
from sklearn.model_selection import GridSearchCV
symptoms = ['happy']
for symptom in symptoms:
    print(symptom)
    s_Y = Y[[x for x in cols if x[1] == symptom]]
    print("Lasso")
    pipeline = Pipeline([
        ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
        ('standard_scale', StandardScaler()),
        ('estimator', Lasso()),
    ])

    model = GridSearchCV(pipeline,
                         {'estimator__alpha': np.array(list(range(5))) /5 + 0.1},
                         verbose=10,
                         cv=2
             )

    model.fit(X, s_Y.values)

    model.best_score_
    print("dumping...")
    data_dir = 'data'
    cycles0 = pd.read_csv(join(data_dir, 'cycles0.csv'))
    c_length = {k:v for k,v in zip(cycles0.user_id.values, cycles0.expected_cycle_length)}
    %time dump(symptom, pipeline, X_all, c_length, data['users'].user_id)

In [ ]:


In [ ]: