In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn import grid_search
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
# Read only the first 500 rows of each input file; the three frames are
# unpacked in the same order as the file names listed below.
train, test, people = (
    pd.read_csv(csv_name, nrows=500)
    for csv_name in ('act_train.csv', 'act_test.csv', 'people.csv')
)
In [2]:
# Set the submission ids and the labels aside, then remove them from the
# frames so that only the data to work with is left.
test_ids = test['activity_id']
target = train['outcome']
train.drop(['outcome', 'activity_id'], axis=1, inplace=True)
test.drop(['activity_id'], axis=1, inplace=True)

# Keep the second '_'-separated token of people_id and store it as an int
# (assumes ids look like '<prefix>_<number>' -- TODO confirm against the CSVs).
for frame in (train, test):
    id_token = frame['people_id'].apply(lambda raw: raw.split('_')[1])
    frame['people_id'] = pd.to_numeric(id_token).astype(int)

# Every remaining column except the first one and 'date' is converted to int:
# missing values become -1, anything else is replaced by its second
# space-separated token.
columns = list(train.columns)
columns.remove("date")
for col in columns[1:]:
    for frame in (train, test):
        filled = frame[col].fillna(-1)
        token = filled.apply(lambda v: v if v == -1 else v.split(' ')[1])
        frame[col] = pd.to_numeric(token).astype(int)
In [3]:
# Keep the second '_'-separated token of people_id and store it as an int.
id_token = people['people_id'].apply(lambda raw: raw.split('_')[1])
people['people_id'] = pd.to_numeric(id_token).astype(int)

# Column groups are picked by position once 'char_38' is taken out of the list.
# NOTE(review): presumably positions 1-11 hold label strings (plus 'date') and
# positions 12+ hold boolean flags -- confirm against the people.csv header.
columns = list(people.columns)
columns.remove("char_38")
bools = columns[12:]
strings = columns[1:12]
strings.remove("date")

# Flag columns: straight numeric cast to int.
for col in bools:
    as_number = pd.to_numeric(people[col])
    people[col] = as_number.astype(int)

# Label columns: missing values become -2, anything else is replaced by its
# second space-separated token, then the column is cast to int.
for col in strings:
    filled = people[col].fillna(-2)
    token = filled.apply(lambda v: v if v == -2 else v.split(' ')[1])
    people[col] = pd.to_numeric(token).astype(int)
In [4]:
def fix_date_time(df):
    """Replace the 'date' column with integer 'Year', 'Month' and 'Day' columns.

    Each value of ``df['date']`` is sliced by character position
    ([0:4], [5:7], [8:10]) and every slice is converted with ``int``
    (assumes 'YYYY-MM-DD...' style strings -- TODO confirm against the CSVs).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a sliceable string column named 'date'.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'date' and with 'Year', 'Month' and 'Day' set.
        The frame passed in is left unchanged.

    Raises
    ------
    KeyError
        If ``df`` has no 'date' column.
    """
    dates = df['date']
    # Build the new columns on the dropped copy rather than on `df`, so the
    # caller's frame is never modified as a side effect.
    result = df.drop(['date'], axis=1)
    fields = (('Year', 0, 4), ('Month', 5, 7), ('Day', 8, 10))
    for name, start, stop in fields:
        # Bind the slice bounds as defaults so each lambda keeps its own pair.
        result[name] = dates.map(lambda dt, lo=start, hi=stop: int(dt[lo:hi]))
    return result
# Run the date expansion on each of the three frames and rebind the names
# to the returned frames.
train, test, people = map(fix_date_time, (train, test, people))
In [5]:
# Left-join the people frame onto each activity frame by people_id.
# Column names present on both sides (e.g. Year/Month/Day) are kept apart
# by pandas' default '_x' / '_y' suffixes.
train = pd.merge(train, people, how='left', on='people_id')
test = pd.merge(test, people, how='left', on='people_id')
# Show ten randomly chosen rows of the merged training frame.
train.sample(10)
Out[5]:
In [6]:
# Hold out 20% of the training rows for validation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    train, target, test_size=0.20, random_state=23
)
# Seed the forest too: without random_state the fitted trees, the validation
# score and the submission file differ on every run of the notebook.
clf = RandomForestClassifier(random_state=23)
clf.fit(X_train, y_train)
Out[6]:
In [7]:
# Score the held-out rows: predict_proba gives one column per class, and
# column index 1 is the one used as the ranking score for ROC AUC.
proba = clf.predict_proba(X_test)
preds = proba[:, 1]
score = roc_auc_score(y_test, preds)
print("Area under ROC {0}".format(score))
In [8]:
# Test set predictions: column index 1 of predict_proba is used as the outcome.
test_proba = clf.predict_proba(test)
test_preds = test_proba[:, 1]
# Format for submission. The column order is pinned explicitly so the CSV
# layout does not depend on dict ordering.
output = pd.DataFrame(
    {'activity_id': test_ids, 'outcome': test_preds},
    columns=['activity_id', 'outcome'],
)
output.to_csv('redhat.csv', index=False)
# Preview goes last: a notebook cell only displays its final expression, so
# the head() call in the middle of the cell was never shown.
output.head()
In [9]:
# List the distinct label values present in the sampled target column.
pd.unique(target)
Out[9]: