In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn import grid_search
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.grid_search import GridSearchCV
import xgboost as xgb
%matplotlib inline
from sklearn.cross_validation import train_test_split
# Load the raw inputs. The activity id column is dropped from the training
# frame right away; the test frame keeps it for now.
# NOTE: duplicate training rows are not removed (the drop_duplicates call
# was left disabled).
train = pd.read_csv('act_train.csv').drop('activity_id', axis=1)
test = pd.read_csv('act_test.csv')
people = pd.read_csv('people.csv')
In [2]:
# Fraction of rows in `people` whose char_1 value differs from char_2.
mismatch_count = (people.char_1 != people.char_2).sum()
1.0 * mismatch_count / people.shape[0]
Out[2]:
In [3]:
# Ratio of distinct group_1 values to the total number of group_1 entries.
distinct_groups = people.group_1.unique()
1.0 * len(distinct_groups) / len(people.group_1)
Out[3]:
In [4]:
# Pull the submission ids and the labels out of the feature frames
# (pop returns the column and removes it from the frame in place).
test_ids = test.pop('activity_id')
target = train.pop('outcome')

# people_id looks like '<prefix>_<number>': keep the part after the
# underscore and store it as an integer.
for frame in (train, test):
    id_suffix = frame['people_id'].apply(lambda x: x.split('_')[1])
    frame['people_id'] = pd.to_numeric(id_suffix).astype(int)

# Every remaining column except people_id (position 0) and date holds
# '<word> <number>' strings: keep the number, and encode missing values as -1.
columns = list(train.columns)
columns.remove("date")
for col in columns[1:]:
    for frame in (train, test):
        filled = frame[col].fillna(-1)
        numbers = filled.apply(lambda x: x if x == -1 else x.split(' ')[1])
        frame[col] = pd.to_numeric(numbers).astype(int)
In [5]:
# people_id looks like '<prefix>_<number>': keep the numeric suffix as an int.
people_id_suffix = people['people_id'].apply(lambda x: x.split('_')[1])
people['people_id'] = pd.to_numeric(people_id_suffix).astype(int)

# Apart from char_38 (left untouched), the people columns are split by
# position: columns 1..11 are treated as '<word> <number>' strings (date is
# handled separately) and columns 12 onward as booleans.
columns = list(people.columns)
columns.remove("char_38")
strings = columns[1:12]
strings.remove("date")
bools = columns[12:]

# Booleans become 0/1 integers.
for col in bools:
    as_number = pd.to_numeric(people[col])
    people[col] = as_number.astype(int)

# String columns keep only the number after the space; missing values are
# encoded as -2.
for col in strings:
    filled = people[col].fillna(-2)
    numbers = filled.apply(lambda x: x if x == -2 else x.split(' ')[1])
    people[col] = pd.to_numeric(numbers).astype(int)
In [6]:
def fix_date_time(df):
    """Replace the 'date' string column with integer Year/Month/Day columns.

    The fields are cut out of each date string by fixed character positions
    (0:4, 5:7 and 8:10), so the values are presumably 'YYYY-MM-DD...' strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'date' column of strings. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'date' and with 'Year', 'Month' and 'Day'
        appended (in that order) as integers.

    Raises
    ------
    KeyError
        If `df` has no 'date' column.
    ValueError
        If one of the sliced fields cannot be parsed by int().
    """
    def extract_field(_df, start, stop):
        # Slice characters [start:stop) of every date string and parse as int.
        return _df['date'].map(lambda dt: int(dt[start:stop]))

    # Work on the frame returned by drop() so the caller's frame is left
    # untouched; previously the new columns were also written into `df`.
    result = df.drop(['date'], axis=1)
    result['Year'] = extract_field(df, 0, 4)
    result['Month'] = extract_field(df, 5, 7)
    result['Day'] = extract_field(df, 8, 10)
    return result
# Expand the date column into Year/Month/Day on all three frames.
train, test, people = [fix_date_time(frame) for frame in (train, test, people)]
In [7]:
# Attach the per-person features to every activity row (left join keeps all
# activities), then discard the join key so it is not used as a feature.
train = train.merge(people, how='left', on='people_id').drop("people_id", axis=1)
test = test.merge(people, how='left', on='people_id').drop("people_id", axis=1)
In [ ]:
# Hold out 20% of the labelled rows for local evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    train, target, test_size=0.20, random_state=23)

# Report the size and class balance of each split.
train_counts = (len(y_train), (y_train == 0).sum(), (y_train == 1).sum())
test_counts = (len(y_test), (y_test == 0).sum(), (y_test == 1).sum())
print('# train: %5d (0s: %5d, 1s: %4d)' % train_counts)
print('# test: %5d (0s: %5d, 1s: %4d)' % test_counts)
In [ ]:
# XGBoost classifier: many shallow-learning-rate rounds with row and column
# subsampling; the seed is fixed for reproducibility.
clf_xgb = xgb.XGBClassifier(missing=np.nan, max_depth=10, min_child_weight=0,
                            n_estimators=1000, learning_rate=0.01,
                            subsample=0.7, colsample_bytree=0.7, seed=4242,
                            nthread=4)

# Early stopping needs a held-out evaluation set to monitor; calling fit with
# early_stopping_rounds but no eval_set fails. Fit on the training split and
# track AUC on the hold-out split instead of fitting on all labelled rows.
clf_xgb.fit(X_train, y_train,
            early_stopping_rounds=50, eval_metric="auc",
            eval_set=[(X_test, y_test)])

# Score the competition test set; the second predict_proba column is used as
# the predicted outcome (presumably the probability of outcome == 1).
test_proba = clf_xgb.predict_proba(test)
test_preds = test_proba[:, 1]

# Format for submission and write it to disk.
output = pd.DataFrame({'activity_id': test_ids, 'outcome': test_preds})
output.to_csv('xgb.csv', index=False)

# Last expression of the cell, so the preview is actually displayed.
output.head()
In [ ]:
# Re-score the competition test set with the fitted XGBoost model and
# rewrite the submission file.
test_proba = clf_xgb.predict_proba(test)
test_preds = test_proba[:, 1]  # second column is used as the predicted outcome

# Format for submission
submission_columns = {'activity_id': test_ids, 'outcome': test_preds}
output = pd.DataFrame(submission_columns)
output.head()  # not the cell's last expression, so nothing is displayed
output.to_csv('xgb.csv', index=False)
In [ ]:
%matplotlib qt
xgb.plot_importance(clf_xgb)
In [11]:
%matplotlib qt
xgb.to_graphviz(clf_xgb)
In [ ]:
# Baseline random forest on the training split. The seed is fixed so that the
# hold-out score and the submission are reproducible across kernel restarts.
clfRF = RandomForestClassifier(random_state=23)
clfRF.fit(X_train, y_train)
In [ ]:
# Evaluate the random forest on the hold-out split using ROC AUC computed
# from the second predict_proba column.
proba = clfRF.predict_proba(X_test)
preds = proba[:, 1]
score = roc_auc_score(y_true=y_test, y_score=preds)
print("Area under ROC {0}".format(score))
In [ ]:
# Score the competition test set with the random forest.
test_proba = clfRF.predict_proba(test)
test_preds = test_proba[:, 1]  # second column is used as the predicted outcome

# Format for submission and write it to disk.
submission_columns = {'activity_id': test_ids, 'outcome': test_preds}
output = pd.DataFrame(submission_columns)
output.head()  # not the cell's last expression, so nothing is displayed
output.to_csv('randomForest.csv', index=False)
In [ ]:
# Histogram of the training labels.
plt.hist(x=target)
In [ ]:
# Histogram of the most recently computed test-set predictions.
plt.hist(x=test_preds)
In [ ]: