In [1]:
from lib.modules_hotstar import describe_data, add_timeandday_features, transform_countdata
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.svm import SVC
In [2]:
# Load the raw Hotstar data through the project helper. It returns four
# values; `details` is not used again in the visible part of this notebook.
train_X, train_Y, test, details = describe_data('data/hotstar')

# Capture the test IDs now: `test` is rebound to the feature matrix later on,
# so this is the only place the ID column is taken from the raw frame.
label = test['ID']

# NOTE(review): absolute, machine-specific directory -- adjust when running
# this notebook on another machine.
HACKATHON_DIR = '/home/suraj/Repositories/IndiaHacks ML Hackathon'

try:
    new_pd = pd.read_csv(HACKATHON_DIR + '/train_data.csv')
    test_pd = pd.read_csv(HACKATHON_DIR + '/test_data.csv')
except Exception:
    # The following cells all need new_pd / test_pd. Report the failure and
    # re-raise so the real cause (missing file, parse error, ...) is shown
    # here instead of surfacing later as a NameError.
    print("Read Failed")
    raise
else:
    print("Files Read")
In [7]:
# Sanity check: report (rows, columns) of the two CSV-backed frames.
# Disabled clean-up kept for reference (removes stray index columns):
#   new_pd.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1, inplace=True)
#   test_pd.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1, inplace=True)
for loaded_frame in (new_pd, test_pd):
    print(loaded_frame.shape)
In [11]:
# Plain-text preview of the first five rows of the test CSV frame.
test_preview = test_pd.head(5)
print(test_preview)
In [9]:
# Derive extra features for the train and test sets with the project helper
# add_timeandday_features (imported from lib.modules_hotstar). Each call gets
# the raw frame from describe_data plus the matching CSV-backed frame.
# NOTE(review): presumably returns time-of-day / day-of-week columns row-aligned
# with the second argument -- confirm against the helper's implementation.
train_pd = add_timeandday_features(train_X, new_pd)
# `test` must still be the raw frame here; a later cell rebinds that name, so
# re-running this cell afterwards would pass the combined feature matrix instead.
test_pd_new = add_timeandday_features(test, test_pd)
In [11]:
# Build the final feature matrices by placing the CSV-backed columns and the
# derived columns side by side (axis=1 joins on the row index).
train_parts = [new_pd, train_pd]
test_parts = [test_pd, test_pd_new]

train = pd.concat(train_parts, axis=1)
# From here on `test` refers to the feature matrix, not the raw frame
# returned by describe_data.
test = pd.concat(test_parts, axis=1)
In [14]:
# Fit a gradient-boosted tree classifier (100 boosting stages) on the combined
# training features. The seed is pinned so that repeated runs of the notebook
# build the same model and therefore the same submission; without it the
# per-tree random state differs from run to run.
MODEL_SEED = 42
model = GradientBoostingClassifier(
    n_estimators=100,
    verbose=True,  # print training progress while fitting
    random_state=MODEL_SEED,
)
# fit() returns the estimator, so the cell output is the fitted model's repr.
model.fit(train, train_Y)
Out[14]:
In [16]:
# Predicted class probabilities for the test rows: one column per class, in
# the order of model.classes_.
proba = model.predict_proba(test)

# The submission keeps only the second column, so the problem must be binary.
if proba.shape[1] != 2:
    raise ValueError(
        'expected 2 probability columns, got %d' % proba.shape[1]
    )

# Assemble the submission frame in one step. IDs are attached positionally
# (label.values) rather than by index label: prediction rows follow the row
# order of the test set, and index-aligned assignment would silently yield
# NaN or misassigned IDs if the test frame's index is not the default 0..n-1.
results = pd.DataFrame(
    {'ID': label.values, 'segment': proba[:, 1]},
    columns=['ID', 'segment'],
)
results.head()
Out[16]:
In [17]:
# Write the submission file (ID, segment) without the DataFrame index.
# NOTE(review): absolute, machine-specific path.
submission_path = '/home/suraj/Repositories/IndiaHacks ML Hackathon/submissions/hotstarsub1.csv'
results.to_csv(submission_path, index=False)
In [18]:
# Persist the engineered feature matrices and the training target to the
# current working directory, each without its index column.
csv_outputs = (
    (train, 'train_X.csv'),
    (test, 'test_X.csv'),
    (train_Y, 'train_Y.csv'),
)
for frame_to_save, csv_name in csv_outputs:
    frame_to_save.to_csv(csv_name, index=False)
In [ ]: