In [1]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
drama_df = pd.read_csv('jung/score.csv')
# print(drama_df.head())
drama_df = drama_df.set_index(pd.DatetimeIndex(drama_df['date']))
drama_ratings_df = drama_df['ratings']
drama_ratings_df.plot(figsize=(12,8), style='o--');
# print(drama_ratings_df.head(50))
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(drama_df['ratings'].values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(drama_df['ratings'], lags=40, ax=ax2)
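With the ACF/PACF in hand, the usual next step is to fit a low-order ARMA-family model. A minimal sketch, assuming a recent statsmodels and an illustrative (1, 0, 1) order rather than one read off this particular series:

In [ ]:
from statsmodels.tsa.arima.model import ARIMA
# order=(1, 0, 1) is a placeholder; pick p and q from the ACF/PACF above
model = ARIMA(drama_ratings_df, order=(1, 0, 1))
result = model.fit()
print(result.summary())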
In [12]:
# 3- and 7-episode moving averages of the ratings (pd.rolling_mean is gone; use .rolling())
drama_df['ratings_wa3'] = drama_df['ratings'].rolling(window=3).mean()
drama_df['ratings_wa7'] = drama_df['ratings'].rolling(window=7).mean()
# print(drama_df.head())
drama_df.plot(figsize=(12,8), style='o--')
Out[12]:
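An exponentially weighted average is a common alternative to the flat windows above, weighting recent episodes more heavily. A minimal sketch; the span of 3 is an assumption, not a tuned value:

In [ ]:
# EWMA alternative to the flat rolling windows above
drama_df['ratings_ewm'] = drama_df['ratings'].ewm(span=3).mean()
drama_df[['ratings', 'ratings_ewm']].plot(figsize=(12,8), style='o--')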
In [2]:
# print(drama_df.head())
drama_df = pd.read_csv('jung/score.csv')
# print(drama_df.head())
drama_df = drama_df.set_index(pd.DatetimeIndex(drama_df['date']))
dates = [str(d) for d in drama_df.index]
comments_df = pd.read_csv('jung/jung.csv')
# comments['date'] = comments['date'].apply(lambda d: datetime.datetime.strptime(d, '%Y-%m-%d').date())
comments_df['date'] = pd.to_datetime(comments_df['date'])
score_comments = comments_df[['date', 'score']].sort_values('date')
# per-episode-date summary of the comment scores: count, spread, and average
rows = []
for d in dates:
    day_scores = score_comments[score_comments['date'] == d]['score']
    rows.append([d, day_scores.count(), day_scores.std(), day_scores.mean()])
s = pd.DataFrame(rows, columns=['date', 'count', 'std', 'mean'])
s = s.set_index('date')
drama_df = drama_df.join(s)
# print(drama_df.corr())
# print(drama_df)
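The per-date loop above can be collapsed into a single groupby aggregation. A sketch, assuming each comment's date column holds exactly the broadcast date (s2 is a hypothetical name):

In [ ]:
# one-pass equivalent of the loop: count/std/mean of scores per date
s2 = comments_df.groupby('date')['score'].agg(['count', 'std', 'mean'])
print(s2.head())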
In [10]:
mbc_df = pd.read_csv('jung/mbc.csv', sep='\t', names=['date', 'mbc ratings'])
mbc_df = mbc_df.set_index(pd.DatetimeIndex(mbc_df['date']))
mbc_df.drop('date', axis=1, inplace=True)
# print(mbc_df.shape)
sbs_df = pd.read_csv('jung/sbs.csv', sep='\t', names=['date', 'sbs ratings'])
sbs_df = sbs_df.set_index(pd.DatetimeIndex(sbs_df['date']))
sbs_df.drop('date', axis=1, inplace=True)
# print(sbs_df.head())
# ratings_df = kbs_df.join([sbs_df, mbc_df], how='inner')[['ratings', 'sbs ratings', 'mbc ratings']]
data_df = drama_df.join([sbs_df, mbc_df], how='inner')[['ratings', 'sbs ratings', 'mbc ratings', 'count', 'std', 'mean']]
data_df.plot(figsize=(12,8), style='o--')
# data_df
data_df['target'] = data_df['ratings'] - data_df['ratings'].shift(1)
data_df.plot(figsize=(12,8), style='o--')
data_df = data_df.dropna()
# data.columns = ['ratings', 'std', 'mean', 'count', 'Y']
X = data_df[['ratings', 'sbs ratings', 'mbc ratings', 'count', 'std', 'mean']]
y = data_df['target']
print(data_df.corr())
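Before the tree ensembles below, a linear baseline on the same features gives a sense of how much of the episode-over-episode ratings change is linearly explainable. A sketch using statsmodels OLS:

In [ ]:
import statsmodels.api as sm
# OLS with an intercept on the same X/y used by the ensembles below
ols_result = sm.OLS(y, sm.add_constant(X)).fit()
print(ols_result.summary())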
In [16]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# grid over tree depth (k) and ensemble size (m)
for k, m in [(2, 100), (2, 200), (2, 300), (2, 400),
             (3, 100), (3, 200), (3, 300), (3, 400),
             (4, 100), (4, 200), (4, 300), (4, 400),
             (5, 100), (5, 200), (5, 300), (5, 400)]:
    print('**********************************')
    print(k, m)
    rs = ShuffleSplit(n_splits=4, test_size=.2, random_state=0)
    clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=k), n_estimators=m)
    for train, test in rs.split(X):
        print('========================')
        clf.fit(X.iloc[train], y.iloc[train])
        predict_y = clf.predict(X.iloc[test])
        print(explained_variance_score(y.iloc[test], predict_y))
        print(r2_score(y.iloc[test], predict_y))
        print(mean_squared_error(y.iloc[test], predict_y))
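The hand-rolled (depth, n_estimators) loop above can be folded into GridSearchCV. A sketch assuming scikit-learn 1.2+, where the nested parameter is spelled estimator__max_depth (older releases use base_estimator__max_depth):

In [ ]:
from sklearn.model_selection import GridSearchCV
param_grid = {'estimator__max_depth': [2, 3, 4, 5],
              'n_estimators': [100, 200, 300, 400]}
search = GridSearchCV(AdaBoostRegressor(DecisionTreeRegressor()),
                      param_grid, scoring='neg_mean_squared_error', cv=4)
search.fit(X, y)
print(search.best_params_, search.best_score_)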
In [3]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.svm import SVR
kf = KFold(n_splits=3)
estimators = [('svr', SVR())]
parameters = {'svr__C': (0.8, 1.0, 1.2, 1.4), 'svr__epsilon': (0.2, 0.3, 0.4, 0.5, 0.6)}
pipeline = Pipeline(estimators)
# print(pipeline)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=kf)
# print("Performing grid search...")
# print("pipeline:", [name for name, _ in pipeline.steps])
grid_search.fit(X, y)
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
In [48]:
import numpy as np
import pandas as pd
drama_df = pd.read_csv('score.csv')
drama_df = drama_df.set_index(pd.DatetimeIndex(drama_df['date']))
kbs_df = drama_df
# print(kbs_df.head(10))
mbc_df = pd.read_csv('mbc.csv', sep='\t', names=['date', 'mbc ratings'])
mbc_df = mbc_df.set_index(pd.DatetimeIndex(mbc_df['date']))
mbc_df.drop('date', axis=1, inplace=True)
# print(mbc_df.shape)
sbs_df = pd.read_csv('sbs.csv', sep='\t', names=['date', 'sbs ratings'])
sbs_df = sbs_df.set_index(pd.DatetimeIndex(sbs_df['date']))
sbs_df.drop('date', axis=1, inplace=True)
# print(sbs_df.head())
ratings_df = kbs_df.join([sbs_df, mbc_df], how='inner')[['ratings', 'sbs ratings', 'mbc ratings']]
ratings_df['total rating'] = ratings_df.sum(axis=1)
ratings_df.plot(figsize=(12,8), style='o--')
In [45]:
ratings_df['total rating'] = ratings_df.sum(axis=1)
ratings_df.head()
Out[45]:
In [52]:
import pandas as pd
from datetime import datetime, timedelta
start_day = datetime.strptime('4 Jan 14', '%d %b %y')
end_day = datetime.strptime('29 Jun 14', '%d %b %y')
# build consecutive one-week (start, end) windows spanning the show's run
weeks = list()
target_sday = start_day
target_eday = start_day
while True:
    if target_eday > end_day:
        break
    target_sday = target_eday
    target_eday = target_sday + timedelta(days=7)
    weeks.append((target_sday, target_eday))
# print(weeks)
drama_df = pd.read_csv('jung/score.csv')
# print(drama_df.head(100))
drama_df = drama_df.set_index(pd.DatetimeIndex(drama_df['date']))
comments_df = pd.read_csv('jung/jung.csv')
comments_df = comments_df.set_index(pd.DatetimeIndex(comments_df['date']))
# summarise each week's comment scores and attach the summary to the two
# broadcast dates (days 7 and 8 after the window start) that follow it
weeks_score = list()
for week in weeks:
    w = week[0].strftime('%Y-%m-%d')
    t = week[1].strftime('%Y-%m-%d')
    week_stats = comments_df.loc[w:t, 'score'].describe().loc[['count', 'mean', 'std']]
    w = week[0] + timedelta(days=7)
    t = week[0] + timedelta(days=8)
    l1 = [w]
    l2 = [t]
    l1.extend(week_stats.tolist())
    l2.extend(week_stats.tolist())
    weeks_score.append(l1)
    weeks_score.append(l2)
score_df = pd.DataFrame(weeks_score, columns=['date', 'count', 'mean', 'std'])
score_df = score_df.set_index(pd.DatetimeIndex(score_df['date']))
score_df.drop('date', axis=1, inplace=True)
# print(drama_df.head())
data_df = drama_df.join(score_df, how='inner')[['ratings', 'count', 'std', 'mean']]
mbc_df = pd.read_csv('jung/mbc.csv', sep='\t', names=['date', 'mbc ratings'])
mbc_df = mbc_df.set_index(pd.DatetimeIndex(mbc_df['date']))
mbc_df.drop('date', axis=1, inplace=True)
# # print(mbc_df.shape)
sbs_df = pd.read_csv('jung/sbs.csv', sep='\t', names=['date', 'sbs ratings'])
sbs_df = sbs_df.set_index(pd.DatetimeIndex(sbs_df['date']))
sbs_df.drop('date', axis=1, inplace=True)
# print(sbs_df.head())
# ratings_df = kbs_df.join([sbs_df, mbc_df], how='inner')[['ratings', 'sbs ratings', 'mbc ratings']]
data_df = data_df.join([sbs_df, mbc_df], how='inner')[['ratings', 'sbs ratings', 'mbc ratings', 'count', 'std', 'mean']]
# data_df
# data_df.plot(figsize=(12,8), style='o--')
data_df['target'] = data_df['ratings'] - data_df['ratings'].shift(1)
data_df['target2'] = data_df['target'].apply(lambda x: 1 if x>=0 else 0)
# data_df.plot(figsize=(12,8), style='o--')
data_df = data_df.dropna()
# # data.columns = ['ratings', 'std', 'mean', 'count', 'Y']
X = data_df[['ratings', 'sbs ratings', 'mbc ratings', 'count', 'std', 'mean']]
y = data_df['target']
y2 = data_df['target2']
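One caveat before the models below: shuffled splits let later episodes inform predictions for earlier ones. A time-ordered alternative sketch with scikit-learn's TimeSeriesSplit, which only ever tests on episodes after the training window:

In [ ]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=4)
for train, test in tscv.split(X):
    # each test block airs strictly after its training block
    print(train.min(), '-', train.max(), '->', test.min(), '-', test.max())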
In [9]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# grid over tree depth (k) and ensemble size (m)
for k, m in [(2, 100), (2, 200), (2, 300), (2, 400),
             (3, 100), (3, 200), (3, 300), (3, 400),
             (4, 100), (4, 200), (4, 300), (4, 400),
             (1, 50), (1, 70), (1, 100), (2, 50)]:
    print('**********************************')
    print(k, m)
    rs = ShuffleSplit(n_splits=4, test_size=.2, random_state=0)
    clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=k), n_estimators=m)
    for train, test in rs.split(X):
        print('========================')
        clf.fit(X.iloc[train], y.iloc[train])
        predict_y = clf.predict(X.iloc[test])
        print(explained_variance_score(y.iloc[test], predict_y))
        print(r2_score(y.iloc[test], predict_y))
        print(mean_squared_error(y.iloc[test], predict_y))
In [80]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import KFold, GridSearchCV
# classification variant: predict whether ratings go up (y2) rather than the change itself
kf = KFold(n_splits=4)
# alternative tried: [('mnb', MultinomialNB())] with an 'mnb__alpha' grid in place of the SVC
estimators = [('pca', PCA()), ('svc', SVC())]
parameters = {'pca__n_components': [2, 3],
              'svc__C': (0.001, 0.01, 0.03, 0.05, 0.08, 0.1, 0.3, 0.5, 0.8, 1.0, 1.2, 1.4)}
pipeline = Pipeline(estimators)
# print(pipeline)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=kf)
grid_search.fit(X, y2)
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))