In [1]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
drama_df = pd.read_csv('jung/score.csv')
# print(drama_df.head())
drama_df = drama_df.set_index(pd.DatetimeIndex(drama_df['date']))
drama_ratings_df = drama_df['ratings']
drama_ratings_df.plot(figsize=(12,8), style='o--');
# print(drama_ratings_df.head(50))
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(drama_df['ratings'].values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(drama_df['ratings'], lags=40, ax=ax2)
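With the ACF/PACF in hand, the usual next step is to fit a low-order ARMA-family model. A minimal sketch, assuming a recent statsmodels and an illustrative (1, 0, 1) order rather than one read off this particular series:

In [ ]:
from statsmodels.tsa.arima.model import ARIMA
# order=(1, 0, 1) is a placeholder; pick p and q from the ACF/PACF above
model = ARIMA(drama_ratings_df, order=(1, 0, 1))
result = model.fit()
print(result.summary())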
In [12]:
# 3- and 7-episode moving averages of the ratings (pd.rolling_mean is gone; use .rolling())
drama_df['ratings_wa3'] = drama_df['ratings'].rolling(window=3).mean()
drama_df['ratings_wa7'] = drama_df['ratings'].rolling(window=7).mean()
# print(drama_df.head())
drama_df.plot(figsize=(12,8), style='o--')
Out[12]:
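An exponentially weighted average is a common alternative to the flat windows above, weighting recent episodes more heavily. A minimal sketch; the span of 3 is an assumption, not a tuned value:

In [ ]:
# EWMA alternative to the flat rolling windows above
drama_df['ratings_ewm'] = drama_df['ratings'].ewm(span=3).mean()
drama_df[['ratings', 'ratings_ewm']].plot(figsize=(12,8), style='o--')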
In [2]:
# print(drama_df.head())
drama_df = pd.read_csv('jung/score.csv')
# print(drama_df.head())
drama_df = drama_df.set_index(pd.DatetimeIndex(drama_df['date']))
dates = [str(d) for d in drama_df.index]
comments_df = pd.read_csv('jung/jung.csv')
# comments['date'] = comments['date'].apply(lambda d: datetime.datetime.strptime(d, '%Y-%m-%d').date())
comments_df['date'] = pd.to_datetime(comments_df['date'])
score_comments = comments_df[['date', 'score']].sort_values('date')
# per-episode-date summary of the comment scores: count, spread, and average
rows = []
for d in dates:
    day_scores = score_comments[score_comments['date'] == d]['score']
    rows.append([d, day_scores.count(), day_scores.std(), day_scores.mean()])
s = pd.DataFrame(rows, columns=['date', 'count', 'std', 'mean'])
s = s.set_index('date')
drama_df = drama_df.join(s)
# print(drama_df.corr())
# print(drama_df)
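The per-date loop above can be collapsed into a single groupby aggregation. A sketch, assuming each comment's date column holds exactly the broadcast date (s2 is a hypothetical name):

In [ ]:
# one-pass equivalent of the loop: count/std/mean of scores per date
s2 = comments_df.groupby('date')['score'].agg(['count', 'std', 'mean'])
print(s2.head())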
In [10]:
mbc_df = pd.read_csv('jung/mbc.csv', sep='\t', names=['date', 'mbc ratings'])
mbc_df = mbc_df.set_index(pd.DatetimeIndex(mbc_df['date']))
mbc_df.drop('date', axis=1, inplace=True)
# print(mbc_df.shape)
sbs_df = pd.read_csv('jung/sbs.csv', sep='\t', names=['date', 'sbs ratings'])
sbs_df = sbs_df.set_index(pd.DatetimeIndex(sbs_df['date']))
sbs_df.drop('date', axis=1, inplace=True)
# print(sbs_df.head())
# ratings_df = kbs_df.join([sbs_df, mbc_df], how='inner')[['ratings', 'sbs ratings', 'mbc ratings']]
data_df = drama_df.join([sbs_df, mbc_df], how='inner')[['ratings', 'sbs ratings', 'mbc ratings', 'count', 'std', 'mean']]
data_df.plot(figsize=(12,8), style='o--')
# data_df
data_df['target'] = data_df['ratings'] - data_df['ratings'].shift(1)
data_df.plot(figsize=(12,8), style='o--')
data_df = data_df.dropna()
# data.columns = ['ratings', 'std', 'mean', 'count', 'Y']
X = data_df[['ratings', 'sbs ratings', 'mbc ratings', 'count', 'std', 'mean']]
y = data_df['target']
print(data_df.corr())
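Before the tree ensembles below, a linear baseline on the same features gives a sense of how much of the episode-over-episode ratings change is linearly explainable. A sketch using statsmodels OLS:

In [ ]:
import statsmodels.api as sm
# OLS with an intercept on the same X/y used by the ensembles below
ols_result = sm.OLS(y, sm.add_constant(X)).fit()
print(ols_result.summary())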
In [16]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# grid over tree depth (k) and ensemble size (m)
for k, m in [(2, 100), (2, 200), (2, 300), (2, 400),
             (3, 100), (3, 200), (3, 300), (3, 400),
             (4, 100), (4, 200), (4, 300), (4, 400),
             (5, 100), (5, 200), (5, 300), (5, 400)]:
    print('**********************************')
    print(k, m)
    rs = ShuffleSplit(n_splits=4, test_size=.2, random_state=0)
    clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=k), n_estimators=m)
    for train, test in rs.split(X):
        print('========================')
        clf.fit(X.iloc[train], y.iloc[train])
        predict_y = clf.predict(X.iloc[test])
        print(explained_variance_score(y.iloc[test], predict_y))
        print(r2_score(y.iloc[test], predict_y))
        print(mean_squared_error(y.iloc[test], predict_y))
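The hand-rolled (depth, n_estimators) loop above can be folded into GridSearchCV. A sketch assuming scikit-learn 1.2+, where the nested parameter is spelled estimator__max_depth (older releases use base_estimator__max_depth):

In [ ]:
from sklearn.model_selection import GridSearchCV
param_grid = {'estimator__max_depth': [2, 3, 4, 5],
              'n_estimators': [100, 200, 300, 400]}
search = GridSearchCV(AdaBoostRegressor(DecisionTreeRegressor()),
                      param_grid, scoring='neg_mean_squared_error', cv=4)
search.fit(X, y)
print(search.best_params_, search.best_score_)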
In [3]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.svm import SVR
kf = KFold(n_splits=3)
estimators = [('svr', SVR())]
parameters = {'svr__C': (0.8, 1.0, 1.2, 1.4), 'svr__epsilon': (0.2, 0.3, 0.4, 0.5, 0.6)}
pipeline = Pipeline(estimators)
# print(pipeline)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=kf)
# print("Performing grid search...")
# print("pipeline:", [name for name, _ in pipeline.steps])
grid_search.fit(X, y)
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
In [48]:
import numpy as np
import pandas as pd
drama_df = pd.read_csv('score.csv')
drama_df = drama_df.set_index(pd.DatetimeIndex(drama_df['date']))
kbs_df = drama_df
# print(kbs_df.head(10))
mbc_df = pd.read_csv('mbc.csv', sep='\t', names=['date', 'mbc ratings'])
mbc_df = mbc_df.set_index(pd.DatetimeIndex(mbc_df['date']))
mbc_df.drop('date', axis=1, inplace=True)
# print(mbc_df.shape)
sbs_df = pd.read_csv('sbs.csv', sep='\t', names=['date', 'sbs ratings'])
sbs_df = sbs_df.set_index(pd.DatetimeIndex(sbs_df['date']))
sbs_df.drop('date', axis=1, inplace=True)
# print(sbs_df.head())
ratings_df = kbs_df.join([sbs_df, mbc_df], how='inner')[['ratings', 'sbs ratings', 'mbc ratings']]
ratings_df['total rating'] = ratings_df.sum(axis=1)
ratings_df.plot(figsize=(12,8), style='o--')
In [45]:
ratings_df['total rating'] = ratings_df.sum(axis=1)
ratings_df.head()
Out[45]:
In [52]:
import pandas as pd
from datetime import datetime, timedelta
start_day = datetime.strptime('4 Jan 14', '%d %b %y')
end_day = datetime.strptime('29 Jun 14', '%d %b %y')
# build consecutive one-week (start, end) windows spanning the show's run
weeks = list()
target_sday = start_day
target_eday = start_day
while True:
    if target_eday > end_day:
        break
    target_sday = target_eday
    target_eday = target_sday + timedelta(days=7)
    weeks.append((target_sday, target_eday))
# print(weeks)
drama_df = pd.read_csv('jung/score.csv')
# print(drama_df.head(100))
drama_df = drama_df.set_index(pd.DatetimeIndex(drama_df['date']))
comments_df = pd.read_csv('jung/jung.csv')
comments_df = comments_df.set_index(pd.DatetimeIndex(comments_df['date']))
# summarise each week's comment scores and attach the summary to the two
# broadcast dates (days 7 and 8 after the window start) that follow it
weeks_score = list()
for week in weeks:
    w = week[0].strftime('%Y-%m-%d')
    t = week[1].strftime('%Y-%m-%d')
    week_stats = comments_df.loc[w:t, 'score'].describe().loc[['count', 'mean', 'std']]
    w = week[0] + timedelta(days=7)
    t = week[0] + timedelta(days=8)
    l1 = [w]
    l2 = [t]
    l1.extend(week_stats.tolist())
    l2.extend(week_stats.tolist())
    weeks_score.append(l1)
    weeks_score.append(l2)
score_df = pd.DataFrame(weeks_score, columns=['date', 'count', 'mean', 'std'])
score_df = score_df.set_index(pd.DatetimeIndex(score_df['date']))
score_df.drop('date', axis=1, inplace=True)
# print(drama_df.head())
data_df = drama_df.join(score_df, how='inner')[['ratings', 'count', 'std', 'mean']]
mbc_df = pd.read_csv('jung/mbc.csv', sep='\t', names=['date', 'mbc ratings'])
mbc_df = mbc_df.set_index(pd.DatetimeIndex(mbc_df['date']))
mbc_df.drop('date', axis=1, inplace=True)
# # print(mbc_df.shape)
sbs_df = pd.read_csv('jung/sbs.csv', sep='\t', names=['date', 'sbs ratings'])
sbs_df = sbs_df.set_index(pd.DatetimeIndex(sbs_df['date']))
sbs_df.drop('date', axis=1, inplace=True)
# print(sbs_df.head())
# ratings_df = kbs_df.join([sbs_df, mbc_df], how='inner')[['ratings', 'sbs ratings', 'mbc ratings']]
data_df = data_df.join([sbs_df, mbc_df], how='inner')[['ratings', 'sbs ratings', 'mbc ratings', 'count', 'std', 'mean']]
# data_df
# data_df.plot(figsize=(12,8), style='o--')
data_df['target'] = data_df['ratings'] - data_df['ratings'].shift(1)
data_df['target2'] = data_df['target'].apply(lambda x: 1 if x>=0 else 0)
# data_df.plot(figsize=(12,8), style='o--')
data_df = data_df.dropna()
# # data.columns = ['ratings', 'std', 'mean', 'count', 'Y']
X = data_df[['ratings', 'sbs ratings', 'mbc ratings', 'count', 'std', 'mean']]
y = data_df['target']
y2 = data_df['target2']
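One caveat before the models below: shuffled splits let later episodes inform predictions for earlier ones. A time-ordered alternative sketch with scikit-learn's TimeSeriesSplit, which only ever tests on episodes after the training window:

In [ ]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=4)
for train, test in tscv.split(X):
    # each test block airs strictly after its training block
    print(train.min(), '-', train.max(), '->', test.min(), '-', test.max())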
In [9]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# grid over tree depth (k) and ensemble size (m)
for k, m in [(2, 100), (2, 200), (2, 300), (2, 400),
             (3, 100), (3, 200), (3, 300), (3, 400),
             (4, 100), (4, 200), (4, 300), (4, 400),
             (1, 50), (1, 70), (1, 100), (2, 50)]:
    print('**********************************')
    print(k, m)
    rs = ShuffleSplit(n_splits=4, test_size=.2, random_state=0)
    clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=k), n_estimators=m)
    for train, test in rs.split(X):
        print('========================')
        clf.fit(X.iloc[train], y.iloc[train])
        predict_y = clf.predict(X.iloc[test])
        print(explained_variance_score(y.iloc[test], predict_y))
        print(r2_score(y.iloc[test], predict_y))
        print(mean_squared_error(y.iloc[test], predict_y))
In [80]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import KFold, GridSearchCV
# classification variant: predict whether ratings go up (y2) rather than the change itself
kf = KFold(n_splits=4)
# alternative tried: [('mnb', MultinomialNB())] with an 'mnb__alpha' grid in place of the SVC
estimators = [('pca', PCA()), ('svc', SVC())]
parameters = {'pca__n_components': [2, 3],
              'svc__C': (0.001, 0.01, 0.03, 0.05, 0.08, 0.1, 0.3, 0.5, 0.8, 1.0, 1.2, 1.4)}
pipeline = Pipeline(estimators)
# print(pipeline)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=kf)
grid_search.fit(X, y2)
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))