In [6]:
from lib.modules_hotstar import describe_data, processinput_data, extract_data_fields, process_target, transform_countdata
In [7]:
train_X, train_Y, test, details = describe_data('data/hotstar')
In [5]:
genre_list = extract_data_fields(train_X, 'genres')
city_list = extract_data_fields(train_X, 'cities')
train_Y = process_target(train_Y)
daytime_list = range(1, 25)
weekday_list = range(1, 8)
In [9]:
label = test['ID']
label.to_csv('label.csv', index=False, header=['label'])
In [7]:
import pandas as pd
df = pd.DataFrame()
for col in genre_list:
    df[col] = 0
for col in city_list:
    df[col] = 0
for col in daytime_list:
    df[col] = 0
for col in weekday_list:
    df[col] = 0
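The loops above only set up a column schema on an empty frame. A compact equivalent (a sketch; dict.fromkeys drops the integer hour/weekday labels that repeat across the two ranges, matching the overwrite behaviour of the loops):
all_feature_cols = list(dict.fromkeys(
    list(genre_list) + list(city_list) + list(daytime_list) + list(weekday_list)))
df = pd.DataFrame(columns=all_feature_cols)  # empty frame, one column per feature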
In [8]:
train_X['genres'][1]
Out[8]:
In [9]:
new_pd = processinput_data(train_X, df)
test_pd = processinput_data(test, df)
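processinput_data is defined in lib.modules_hotstar and is not reproduced here. As a rough, hypothetical illustration of the kind of per-record expansion such a helper might perform (the 'name:seconds' field format, the dict-like records, and the function name are assumptions, not the library code):
def expand_record_sketch(record, columns):
    # Flatten one raw record's "name:seconds,name:seconds" strings into a dict
    # keyed by the pre-built feature columns; names outside the schema are skipped.
    row = dict.fromkeys(columns, 0)
    for field in ('genres', 'cities'):
        for item in record.get(field, '').split(','):
            name, _, seconds = item.partition(':')
            if name in row and seconds.isdigit():
                row[name] += int(seconds)
    return row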
In [28]:
new_pd = pd.read_csv('/home/suraj/Repositories/IndiaHacks ML Hackathon/train_data.csv')
test_pd = pd.read_csv('/home/suraj/Repositories/IndiaHacks ML Hackathon/test_data.csv')
In [85]:
title_train = transform_countdata(train_X, 'titles')
title_test = transform_countdata(test, 'titles')
In [86]:
new_pd['title_count'] = title_train
In [87]:
test_pd['title_count'] = title_test
In [89]:
tod_train = transform_countdata(train_X, 'tod')
tod_test = transform_countdata(test, 'tod')
dow_train = transform_countdata(train_X, 'dow')
dow_test = transform_countdata(test, 'dow')
new_pd['tod'] = tod_train
test_pd['tod'] = tod_test
new_pd['dow'] = dow_train
test_pd['dow'] = dow_test
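transform_countdata also comes from lib.modules_hotstar; the way it is used above suggests it reduces a multi-valued field to a single count per record. A hypothetical stand-in with that behaviour (assumes the raw records iterate as dicts with comma-separated string fields; the name is made up):
def count_entries_sketch(records, field):
    # Count the comma-separated entries of `field` in every record; empty or missing fields count as 0.
    return [len(rec[field].split(',')) if rec.get(field) else 0 for rec in records]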
In [91]:
new_pd.to_csv('train_X.csv')
test_pd.to_csv('test_X.csv')
In [4]:
import pandas as pd
new_pd = pd.read_csv('/home/suraj/Repositories/IndiaHacks ML Hackathon/train_X.csv')
test_pd = pd.read_csv('/home/suraj/Repositories/IndiaHacks ML Hackathon/test_X.csv')
In [ ]:
from lib.modules_hotstar import add_timeandday_features
new_pd = add_timeandday_features(train_X, new_pd)
test_pd = add_timeandday_features(test, test_pd)
In [14]:
city_train = transform_countdata(train_X, 'cities')
city_test = transform_countdata(test, 'cities')
genre_train = transform_countdata(train_X, 'genres')
genre_test = transform_countdata(test, 'genres')
new_pd['city'] = city_train
test_pd['city'] = city_test
new_pd['genre'] = genre_train
test_pd['genre'] = genre_test
In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.svm import SVC
import pandas as pd
In [16]:
model = GradientBoostingClassifier(n_estimators=100, verbose=True)
model.fit(new_pd, train_Y)
Out[16]:
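Before scoring the test set, a quick cross-validated estimate on the training features can catch problems early. A sketch, not part of the original run; ROC AUC is assumed to be the competition metric, as in the grid-search scorer further down:
from sklearn.model_selection import cross_val_score

# 3-fold ROC AUC on the engineered training features; few folds because GBM training is slow
cv_auc = cross_val_score(model, new_pd, train_Y, scoring='roc_auc', cv=3)
print(cv_auc.mean(), cv_auc.std())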
In [17]:
results = model.predict_proba(test_pd)
In [18]:
results
Out[18]:
In [19]:
# # uncomment to do grid search - could get a better score
# from sklearn.model_selection import train_test_split
# # split the engineered features (new_pd), not the raw train_X
# X_train, X_val, y_train, y_val = train_test_split(new_pd, train_Y, train_size=0.7, random_state=1)
#
# # grid search over GBM parameters, scored with ROC AUC
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import roc_auc_score, make_scorer
# clf_scorer = make_scorer(roc_auc_score, needs_threshold=True)  # score on probabilities, not hard labels
# rfc = GradientBoostingClassifier(verbose=True)
# param_grid = {
#     'max_depth': [4, 8, 12],
#     'max_features': ['sqrt', 'log2', None],
#     'n_estimators': [250, 500, 1000]
# }
# cv_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, scoring=clf_scorer, n_jobs=-1)
# cv_rfc.fit(X_train, y_train)
In [20]:
# predict_proba returns one probability column per class; name them and keep
# only the positive-class column as 'segment'
results = pd.DataFrame(columns=['seg', 'segment'], data=results)
In [21]:
results.drop('seg', axis=1, inplace=True)
In [22]:
results.head()
results['ID'] = label
results = results[['ID', 'segment']]
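Two cheap sanity checks before writing the submission (a sketch; assumes the expected file has one row per test ID with a probability in [0, 1]):
assert len(results) == len(label), 'one prediction per test ID'
assert results['segment'].between(0, 1).all(), 'segment should be a probability'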
In [23]:
results.head()
Out[23]:
In [24]:
results.to_csv('/home/suraj/Repositories/IndiaHacks ML Hackathon/submissions/hotstarsub1.csv', index=False)
In [1]:
label.to_csv('label.csv')