In [84]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def load_ratings(broadcaster, data_dir='2013'):
    """Load one broadcaster's ratings file into a date-indexed DataFrame.

    Reads ``<data_dir>/<broadcaster>-web.tsv`` as a headerless, tab-separated
    file and names its three columns ``'<broadcaster> title'``, ``'date'`` and
    ``'<broadcaster> ratings'``.

    Parameters
    ----------
    broadcaster : str
        Broadcaster code used in both the file name and the column names
        (this notebook uses 'kbs', 'mbc' and 'sbs').
    data_dir : str, optional
        Directory that contains the ``*-web.tsv`` files.

    Returns
    -------
    pandas.DataFrame
        Title and ratings columns, indexed by a ``DatetimeIndex`` built from
        the ``date`` column (which is then dropped from the columns).
    """
    raw = pd.read_csv(
        '%s/%s-web.tsv' % (data_dir, broadcaster),
        sep='\t',
        names=['%s title' % broadcaster, 'date', '%s ratings' % broadcaster],
    )
    # Non-inplace drop: re-running the cell never mutates an existing frame.
    return raw.set_index(pd.DatetimeIndex(raw['date'])).drop('date', axis=1)


mbc_df = load_ratings('mbc')
sbs_df = load_ratings('sbs')
kbs_df = load_ratings('kbs')

# The inner join keeps only the dates that appear in all three frames.
ratings_df = kbs_df.join([sbs_df, mbc_df], how='inner')[
    ['kbs title', 'kbs ratings', 'mbc title', 'mbc ratings', 'sbs title', 'sbs ratings']
]

# Vectorised row sums (replaces a per-row apply(np.sum, axis=1)).
ratings_df['total rating'] = ratings_df[['kbs ratings', 'mbc ratings', 'sbs ratings']].sum(axis=1)
ratings_df['kbs mbc total rating'] = ratings_df[['kbs ratings', 'mbc ratings']].sum(axis=1)

print(ratings_df.describe())

# The line plot is created last; the plt.axvline / plt.axhline calls further
# down this cell go through pyplot and therefore target its axes.
ratings_df.boxplot(figsize=(12,8))
ratings_df.plot(figsize=(16, 10), style='o--', grid=True)
######################################
## draw starting new drama's date
# For each broadcaster, walk its title column in row order and record the
# first row of every run of identical titles: (date, title, broadcaster code).
start_rows = []
for broadcast in ['kbs title', 'mbc title', 'sbs title']:
    # Reset for every broadcaster.  Initialising this once before the outer
    # loop let the previous broadcaster's last title leak into this scan, which
    # would hide this broadcaster's first drama if the two titles matched.
    title = ''
    for index, current_title in zip(ratings_df.index, ratings_df[broadcast]):
        if title != current_title:
            # 'kbs title' -> 'kbs'
            start_rows.append([index, current_title, broadcast.split(' ')[0]])
            title = current_title

startdate_df = pd.DataFrame(start_rows, columns=['date', 'title', 'broadcast'])
startdate_df = startdate_df.set_index(pd.DatetimeIndex(startdate_df['date'])).drop('date', axis=1)
def get_extra_info(broadcast):
    """Look up the plotting attributes associated with a broadcaster code.

    Parameters
    ----------
    broadcast : str
        Broadcaster code; 'kbs', 'sbs' and 'mbc' are recognised.

    Returns
    -------
    tuple
        ``(number, colour)``.  Callers in this notebook use the number as the
        y coordinate of the broadcaster's text label and the colour as the
        matplotlib colour of its lines.  Any unrecognised code yields
        ``(0, 'b')``.
    """
    known = (
        ('kbs', (20, 'b')),
        ('sbs', (23, 'r')),
        ('mbc', (25, 'g')),
    )
    for code, info in known:
        if broadcast == code:
            return info
    # Fallback for anything that is not one of the three codes above.
    return 0, 'b'
# One vertical line plus a text label per recorded drama start, coloured by
# broadcaster and drawn through pyplot on the current axes.
for start_date, station in zip(startdate_df.index, startdate_df['broadcast']):
    broadcast = str(station)
    label_y, line_colour = get_extra_info(broadcast)
    plt.axvline(start_date, color=line_colour, alpha=0.5)
    plt.annotate(broadcast, (start_date, label_y), color=line_colour)
######################################
## draw mean line
# One horizontal line per numeric column of ratings_df, at that column's mean.
mean_df = ratings_df.describe().T['mean']
for column, mean_value in zip(mean_df.index, mean_df.values):
    name_parts = str(column).split(' ')
    if name_parts[1:] == ['ratings']:
        # '<broadcaster> ratings': reuse that broadcaster's colour.
        _, c = get_extra_info(name_parts[0])
    else:
        # Aggregate columns ('total rating', 'kbs mbc total rating').  Taking
        # the first word of the name used to colour both of them like KBS, so
        # three mean lines were indistinguishable; draw them in black instead.
        c = 'k'
    plt.axhline(mean_value, color=c, alpha=0.5)
In [72]:
# ratings_df[ratings_df['kbs title']=='전우치'].plot(figsize=(12,8), style='o--')
타 방송국 프로그램의 시작과 끝이 시청률에 영향을 미치는가?
타 방송국의 프로그램이 다른 프로그램의 시청률에 영향을 미치는가?
프로그램이 시작할 때의 시청률은 방송국 평균과 비슷한가?
SBS의 프로그램이 전체 시청률을 이끈다.
SBS가 평균 15%, KBS와 MBC는 합산 평균 15%이기에 SBS가 절대적이다.
In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def _indexed_by_date(frame):
    """Return `frame` indexed by a DatetimeIndex of its 'date' column, with that column dropped."""
    return frame.set_index(pd.DatetimeIndex(frame['date'])).drop('date', axis=1)


# Each file is read as headerless, tab-separated text with three named columns.
mbc_df = _indexed_by_date(
    pd.read_csv('2013/mbc-web.tsv', sep='\t', names=['mbc title', 'date', 'mbc ratings']))
sbs_df = _indexed_by_date(
    pd.read_csv('2013/sbs-web.tsv', sep='\t', names=['sbs title', 'date', 'sbs ratings']))
kbs_df = _indexed_by_date(
    pd.read_csv('2013/kbs-web.tsv', sep='\t', names=['kbs title', 'date', 'kbs ratings']))

# Keep only dates present in all three frames, and only the ratings columns.
joined = kbs_df.join([sbs_df, mbc_df], how='inner')
ratings_df = joined[['kbs ratings', 'mbc ratings', 'sbs ratings']]

# Target: change in the SBS rating relative to the previous row.  The first
# row has no predecessor, so its target is NaN and dropna() removes it.
# NOTE(review): X below contains the same row's 'sbs ratings' level while the
# target is that row's change -- confirm this is the intended setup rather
# than predicting the following row's change.
ratings_df['target'] = ratings_df['sbs ratings'].diff()
ratings_df = ratings_df.dropna()

X = ratings_df[['kbs ratings', 'mbc ratings', 'sbs ratings']]
y = ratings_df['target']
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn import cross_validation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.svm import SVR
# Sweep AdaBoost over base-tree depth (k) and number of boosting rounds (m).
# For every (k, m) pair the model is refit on each of three random 75/25
# splits, and the explained variance and R^2 on the held-out part are printed.
#
# NOTE(review): `sklearn.cross_validation` is the legacy module (deprecated in
# scikit-learn 0.18, removed in 0.20); this cell needs an older scikit-learn.

# The splitter does not depend on k or m, so build it once.  With a fixed
# random_state it yields the same three splits every time it is iterated.
rs = cross_validation.ShuffleSplit(len(X), n_iter=3, test_size=.25, random_state=0)

for k in (2, 3, 4, 5):
    for m in (100, 200, 300, 400):
        print('**********************************')
        print(k, m)
        # Fixed seed: without it the boosting resampling differs on every run,
        # so scores for different (k, m) pairs could not be compared or
        # reproduced.
        clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=k),
                                n_estimators=m, random_state=0)
        for train, test in rs:
            print('========================')
            clf.fit(X.iloc[train], y.iloc[train])
            predict_y = clf.predict(X.iloc[test])
            print(explained_variance_score(y.iloc[test], predict_y))
            print(r2_score(y.iloc[test], predict_y))
In [8]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score, KFold, ShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVR
# Grid-search C and epsilon for an SVR wrapped in a one-step pipeline, scored
# with 3-fold cross-validation over X / y.
# NOTE(review): KFold(len(y), n_folds=3) and sklearn.grid_search are the legacy
# (pre-0.20) scikit-learn API -- this cell needs an older scikit-learn.
kf = KFold(len(y), n_folds=3)

estimators = [('svr', SVR())]
pipeline = Pipeline(estimators)

# Keys use the '<step name>__<parameter>' form so they reach the 'svr' step.
parameters = {
    'svr__C': (0.8, 1.0, 1.2, 1.4),
    'svr__epsilon': (0.2, 0.3, 0.4, 0.5, 0.6),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=kf)
grid_search.fit(X, y)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
# Report only the parameters that were searched, in alphabetical order.
for param_name in sorted(parameters):
    chosen_value = best_parameters[param_name]
    print("\t%s: %r" % (param_name, chosen_value))
In [ ]: