In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn import grid_search
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.grid_search import GridSearchCV

from sklearn.cross_validation import train_test_split

train = pd.read_csv('act_train.csv', nrows = 500)
test = pd.read_csv('act_test.csv', nrows = 500)
people = pd.read_csv('people.csv', nrows = 500)

In [2]:
test_ids = test['activity_id']
target = train['outcome']

train.drop(['outcome', 'activity_id'], axis=1, inplace = True) # date to work with
test.drop(['activity_id'], axis=1, inplace = True) # date to work with

    
## Split off _ from people_id
train['people_id'] = train['people_id'].apply(lambda x: x.split('_')[1])
train['people_id'] = pd.to_numeric(train['people_id']).astype(int)

test['people_id'] = test['people_id'].apply(lambda x: x.split('_')[1])
test['people_id'] = pd.to_numeric(test['people_id']).astype(int)
    
columns = list(train.columns)
columns.remove("date")
    
for col in columns[1:]:
    train[col] = train[col].fillna(-1)
    train[col] = train[col].apply(lambda x: x if x == -1 else x.split(' ')[1])
    train[col] = pd.to_numeric(train[col]).astype(int)
    
    test[col] = test[col].fillna(-1)
    test[col] = test[col].apply(lambda x: x if x == -1 else x.split(' ')[1])
    test[col] = pd.to_numeric(test[col]).astype(int)

In [3]:
people['people_id'] = people['people_id'].apply(lambda x: x.split('_')[1])
people['people_id'] = pd.to_numeric(people['people_id']).astype(int)
    
#  Values in the people df is Booleans and Strings    
columns = list(people.columns)
columns.remove("char_38")
bools = columns[12:]
strings = columns[1:12]
strings.remove("date")
    
for col in bools:
    people[col] = pd.to_numeric(people[col]).astype(int)   
    
for col in strings:
    people[col] = people[col].fillna(-2)
    people[col] = people[col].apply(lambda x: x if x == -2 else x.split(' ')[1])
    people[col] = pd.to_numeric(people[col]).astype(int)

In [4]:
def fix_date_time(df):
    def extract_field(_df, start, stop):
        return _df['date'].map(lambda dt: int(dt[start:stop]))
    df['Year'] = extract_field(df,0,4)
    df['Month'] = extract_field(df,5,7)
    df['Day'] = extract_field(df,8,10)
    return df.drop(['date'], axis = 1)
train = fix_date_time(train)
test = fix_date_time(test)
people = fix_date_time(people)

In [5]:
train = train.merge(people, how='left', on='people_id')

test = test.merge(people, how='left', on='people_id')

# Check it out...
train.sample(10)


Out[5]:
people_id activity_category char_1_x char_2_x char_3_x char_4_x char_5_x char_6_x char_7_x char_8_x ... char_32 char_33 char_34 char_35 char_36 char_37 char_38 Year_y Month_y Day_y
439 100099 2 -1 -1 -1 -1 -1 -1 -1 -1 ... 0 0 0 0 0 0 16 2022 12 2
345 100075 2 -1 -1 -1 -1 -1 -1 -1 -1 ... 0 0 0 0 0 0 66 2021 10 27
315 100075 2 -1 -1 -1 -1 -1 -1 -1 -1 ... 0 0 0 0 0 0 66 2021 10 27
16 100003 2 -1 -1 -1 -1 -1 -1 -1 -1 ... 1 1 1 0 1 1 99 2022 6 10
141 100035 2 -1 -1 -1 -1 -1 -1 -1 -1 ... 0 0 0 0 0 1 100 2022 1 22
64 100025 5 -1 -1 -1 -1 -1 -1 -1 -1 ... 0 0 0 0 0 0 76 2022 8 26
328 100075 4 -1 -1 -1 -1 -1 -1 -1 -1 ... 0 0 0 0 0 0 66 2021 10 27
279 10007 1 7 5 6 3 6 2 3 5 ... 1 1 1 1 1 1 73 2023 5 31
200 100045 5 -1 -1 -1 -1 -1 -1 -1 -1 ... 0 0 0 0 0 0 7 2022 9 21
169 100035 2 -1 -1 -1 -1 -1 -1 -1 -1 ... 0 0 0 0 0 1 100 2022 1 22

10 rows × 57 columns


In [6]:
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size = 0.20, random_state=23)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)


Out[6]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [7]:
proba = clf.predict_proba(X_test)
preds = proba[:,1]
score = roc_auc_score(y_test, preds)
print("Area under ROC {0}".format(score))


Area under ROC 0.999782986111

In [8]:
# Test Set Predictions
test_proba = clf.predict_proba(test)
test_preds = test_proba[:,1]

# Format for submission
output = pd.DataFrame({ 'activity_id' : test_ids, 'outcome': test_preds })
output.head()
output.to_csv('redhat.csv', index = False)

In [9]:
target.unique()


Out[9]:
array([0, 1], dtype=int64)