In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn import grid_search
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.grid_search import GridSearchCV

import xgboost as xgb

%matplotlib inline

from sklearn.cross_validation import train_test_split

train = pd.read_csv('act_train.csv')
train.drop('activity_id', axis=1, inplace = True) 
#train.drop_duplicates(inplace = True)

test = pd.read_csv('act_test.csv')
people = pd.read_csv('people.csv')

In [2]:
1.0*(people.char_1 != people.char_2).sum()/people.shape[0]


Out[2]:
0.5105436817225224

In [3]:
1.0*len(people.group_1.unique())/len(people.group_1)


Out[3]:
0.1809663807781385

In [4]:
test_ids = test['activity_id']
target = train['outcome']

train.drop('outcome', axis=1, inplace = True) 
test.drop(['activity_id'], axis=1, inplace = True) 

    
## Split off _ from people_id
train['people_id'] = train['people_id'].apply(lambda x: x.split('_')[1])
train['people_id'] = pd.to_numeric(train['people_id']).astype(int)

test['people_id'] = test['people_id'].apply(lambda x: x.split('_')[1])
test['people_id'] = pd.to_numeric(test['people_id']).astype(int)
    
columns = list(train.columns)
columns.remove("date")
    
for col in columns[1:]:
    train[col] = train[col].fillna(-1)
    train[col] = train[col].apply(lambda x: x if x == -1 else x.split(' ')[1])
    train[col] = pd.to_numeric(train[col]).astype(int)
    
    test[col] = test[col].fillna(-1)
    test[col] = test[col].apply(lambda x: x if x == -1 else x.split(' ')[1])
    test[col] = pd.to_numeric(test[col]).astype(int)

In [5]:
people['people_id'] = people['people_id'].apply(lambda x: x.split('_')[1])
people['people_id'] = pd.to_numeric(people['people_id']).astype(int)
    
#  Values in the people df is Booleans and Strings    
columns = list(people.columns)
columns.remove("char_38")
bools = columns[12:]
strings = columns[1:12]
strings.remove("date")
    
for col in bools:
    people[col] = pd.to_numeric(people[col]).astype(int)   
    
for col in strings:
    people[col] = people[col].fillna(-2)
    people[col] = people[col].apply(lambda x: x if x == -2 else x.split(' ')[1])
    people[col] = pd.to_numeric(people[col]).astype(int)

In [6]:
def fix_date_time(df):
    """Split the ISO-prefixed string in df['date'] ('YYYY-MM-DD...') into
    integer Year/Month/Day columns and return df without 'date'.

    Note: the new columns are added to df in place; only the 'date' drop
    produces a new frame.
    """
    fields = (('Year', 0, 4), ('Month', 5, 7), ('Day', 8, 10))
    for name, start, stop in fields:
        # Bind the slice bounds as defaults so each lambda keeps its own.
        df[name] = df['date'].map(lambda s, a=start, b=stop: int(s[a:b]))
    return df.drop(['date'], axis = 1)
# Expand the date column into Year/Month/Day on all three frames.
train, test, people = [fix_date_time(frame) for frame in (train, test, people)]

In [7]:
train = train.merge(people, how='left', on='people_id')
test = test.merge(people, how='left', on='people_id')

train.drop("people_id", axis = 1, inplace= True)
test.drop("people_id", axis = 1, inplace= True)

In [ ]:
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size = 0.20, random_state=23)

print('# train: %5d (0s: %5d, 1s: %4d)'%(len(y_train), sum(y_train==0), sum(y_train==1)))
print('# test:  %5d (0s: %5d, 1s: %4d)'%((len(y_test), sum(y_test==0), sum(y_test==1))))


# train: 1757832 (0s: 977474, 1s: 780358)
# test:  439459 (0s: 244320, 1s: 195139)

In [ ]:
# xgb
clf_xgb = xgb.XGBClassifier(missing=np.nan, max_depth=10, min_child_weight = 0,
                        n_estimators=1000, learning_rate=0.01, 
                        subsample=0.7, colsample_bytree=0.7, seed=4242,
                           nthread = 4)

# fitting
#clf_xgb.fit(X_train, y_train, early_stopping_rounds=50,print_every_n = 10, eval_metric="auc", eval_set=[(X_test, y_test)])
clf_xgb.fit(train, target, early_stopping_rounds=50,eval_metric="auc")

test_proba = clf_xgb.predict_proba(test)
test_preds = test_proba[:,1]

# Format for submission
output = pd.DataFrame({ 'activity_id' : test_ids, 'outcome': test_preds })
output.head()
output.to_csv('xgb.csv', index = False)

In [ ]:
# NOTE(review): this cell is an exact duplicate of the prediction/submission
# code at the end of the previous cell; re-running it only rewrites xgb.csv
# with the same contents. Consider deleting one copy.
test_proba = clf_xgb.predict_proba(test)
test_preds = test_proba[:,1]

# Format for submission
output = pd.DataFrame({ 'activity_id' : test_ids, 'outcome': test_preds })
output.head()
output.to_csv('xgb.csv', index = False)

In [ ]:
# %matplotlib qt switches to the interactive Qt backend (requires a local Qt
# install); figures after this no longer render inline in the notebook.
%matplotlib qt
xgb.plot_importance(clf_xgb)

In [11]:
%matplotlib qt
# NOTE(review): the NameError recorded below shows this cell was executed on
# a kernel where clf_xgb had not been fitted -- out-of-order execution /
# hidden state. Under Restart & Run All the earlier fit cell defines clf_xgb
# first.
xgb.to_graphviz(clf_xgb)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-c3ed417e2dc6> in <module>()
      1 get_ipython().magic(u'matplotlib qt')
----> 2 xgb.to_graphviz(clf_xgb)

NameError: name 'clf_xgb' is not defined

In [ ]:
clfRF = RandomForestClassifier()
clfRF.fit(X_train, y_train)

In [ ]:
proba = clfRF.predict_proba(X_test)
preds = proba[:,1]
score = roc_auc_score(y_test, preds)
print("Area under ROC {0}".format(score))

In [ ]:
# Test Set Predictions
test_proba = clfRF.predict_proba(test)
test_preds = test_proba[:,1]

# Format for submission
output = pd.DataFrame({ 'activity_id' : test_ids, 'outcome': test_preds })
output.head()
output.to_csv('randomForest.csv', index = False)

In [ ]:
# Class balance of the training labels (0 vs 1).
plt.hist(target)

In [ ]:
# Distribution of predicted positive-class probabilities on the test set.
plt.hist(test_preds)

In [ ]: