In [1]:
from lib.modules_hotstar import describe_data, add_timeandday_features, transform_countdata
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.svm import SVC

In [2]:
train_X, train_Y, test, details = describe_data('data/hotstar')
label = test['ID']
try:
    # Pre-built feature matrices saved from an earlier feature-engineering run
    new_pd = pd.read_csv('/home/suraj/Repositories/IndiaHacks ML Hackathon/train_data.csv')
    test_pd = pd.read_csv('/home/suraj/Repositories/IndiaHacks ML Hackathon/test_data.csv')
    print("Files Read")
except IOError:
    print("Read Failed")


Directory Read from:data/hotstar/*

Files Read:

data/hotstar/train_data.json
data/hotstar/test_data.json
data/hotstar/sample_submission.csv
Files Read

In [7]:
# new_pd.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1, inplace=True)
# test_pd.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1, inplace=True)

print(new_pd.shape)
print(test_pd.shape)


(200000, 1398)
(100000, 1398)

In [11]:
print(test_pd.head(5))


   Cricket  Kabaddi  Reality  Wildlife  LiveTV  Football  TalkShow   Drama  \
0    702.0      0.0      0.0       0.0     0.0       0.0       0.0     0.0   
1      0.0      0.0      0.0       0.0     0.0       0.0       0.0     0.0   
2      0.0      0.0      0.0       0.0     0.0       0.0    2247.0     0.0   
3      0.0      0.0    276.0       0.0     0.0       0.0       0.0  6459.0   
4      0.0      0.0      0.0       0.0     0.0       0.0       0.0  1204.0   

   Action  Romance   ...    minhang qu  streatham  zhenruzhen  pavlodar  \
0     0.0    182.0   ...           0.0        0.0         0.0       0.0   
1  3501.0     50.0   ...           0.0        0.0         0.0       0.0   
2     0.0      0.0   ...           0.0        0.0         0.0       0.0   
3     0.0      0.0   ...           0.0        0.0         0.0       0.0   
4     0.0      0.0   ...           0.0        0.0         0.0       0.0   

   louisville  title_count  tod  dow  genre  cities  
0         0.0            8    2    2      2       2  
1         0.0           10    8    6      3       3  
2         0.0            2   11    5      2       3  
3         0.0            4    6    3      2       3  
4         0.0            1    1    1      1       1  

[5 rows x 1398 columns]

In [9]:
train_pd = add_timeandday_features(train_X, new_pd)
test_pd_new = add_timeandday_features(test, test_pd)


monday       0.0
tuesday      0.0
wednesday    0.0
thursday     0.0
friday       0.0
saturday     0.0
sunday       0.0
1            0.0
2            0.0
...
23           0.0
24           0.0
Name: 0, dtype: float64
0 Row completed
10000 Row completed
...
190000 Row completed
[progress log truncated: the 0-190000 pass repeats once more for the 200,000 training rows,
then the same template Series and two 0-90000 passes follow for the 100,000 test rows]
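
add_timeandday_features lives in lib.modules_hotstar and is not shown here; the template Series above suggests it expands each record into 7 day-of-week and 24 hour-of-day watch-time columns. A rough stand-alone sketch of that idea, assuming the raw 'dow'/'tod' fields are comma-separated "key:seconds" strings with dow keys 1-7 mapping Monday-Sunday (an assumption about the data format, not the module's actual code):

DAYS = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']

def expand_time_features(records):
    # records: DataFrame with 'dow' and 'tod' columns like "1:2000,7:350"
    cols = DAYS + [str(h) for h in range(1, 25)]
    out = pd.DataFrame(0.0, index=records.index, columns=cols)
    for idx, row in records.iterrows():
        # day-of-week watch time
        for key, val in (item.split(':') for item in str(row['dow']).split(',') if ':' in item):
            out.at[idx, DAYS[int(key) - 1]] = float(val)
        # hour-of-day watch time (columns '1'..'24')
        for key, val in (item.split(':') for item in str(row['tod']).split(',') if ':' in item):
            out.at[idx, key] = float(val)
    return out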

In [11]:
train = pd.concat([new_pd, train_pd], axis=1)
test = pd.concat([test_pd, test_pd_new], axis=1)
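
Before fitting, it is worth confirming the two concatenated matrices share the same columns in the same order; a small sanity check added here as a sketch, not a cell from the original run:

print(train.shape, test.shape)
assert list(train.columns) == list(test.columns), "train/test feature columns differ"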

In [14]:
model = GradientBoostingClassifier(n_estimators=100, verbose=True)
model.fit(train, train_Y)


      Iter       Train Loss   Remaining Time 
         1           0.5244           11.73m
         2           0.5134           11.41m
         3           0.5045           11.25m
         4           0.4970           11.06m
         5           0.4907           10.94m
         6           0.4854           10.77m
         7           0.4809           10.66m
         8           0.4772           10.57m
         9           0.4739           10.59m
        10           0.4711           10.46m
        20           0.4563            8.67m
        30           0.4505            7.54m
        40           0.4474            6.78m
        50           0.4455            5.63m
        60           0.4441            4.26m
        70           0.4428            3.02m
        80           0.4417            1.97m
        90           0.4408           58.14s
       100           0.4402            0.00s
Out[14]:
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=True, warm_start=False)
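
The fitted model exposes feature_importances_ (a standard attribute of sklearn's GradientBoostingClassifier); a quick look at the top-ranked columns, shown here as an illustrative sketch rather than part of the original run:

print(pd.Series(model.feature_importances_, index=train.columns)
      .sort_values(ascending=False)
      .head(20))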

In [16]:
results = model.predict_proba(test)
# predict_proba returns one column per class in model.classes_ order;
# keep only the positive-class probability as 'segment'
results = pd.DataFrame(columns=['seg', 'segment'], data=results)
results.drop('seg', axis=1, inplace=True)
results['ID'] = label
results = results[['ID', 'segment']]
results.head()


Out[16]:
           ID   segment
0      test-1  0.025113
1     test-10  0.061844
2    test-100  0.166906
3   test-1000  0.142373
4  test-10000  0.129130
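
A more explicit way to pick the positive-class column, using the fact that predict_proba columns follow model.classes_ order (sketch only; assumes the positive label in train_Y is 1):

pos_col = list(model.classes_).index(1)
results = pd.DataFrame({'ID': label, 'segment': model.predict_proba(test)[:, pos_col]})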

In [17]:
results.to_csv('/home/suraj/Repositories/IndiaHacks ML Hackathon/submissions/hotstarsub1.csv', index=False)

In [18]:
train.to_csv('train_X.csv', index=False)
test.to_csv('test_X.csv', index=False)
train_Y.to_csv('train_Y.csv', index=False)

In [ ]: