In [84]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def load_channel_ratings(channel):
    """Load one broadcaster's ratings file as a date-indexed DataFrame.

    Reads ``2013/<channel>-web.tsv`` as a tab-separated file without a header
    row; its three columns are named ``'<channel> title'``, ``'date'`` and
    ``'<channel> ratings'``.

    Parameters
    ----------
    channel : str
        Broadcaster prefix used in both the file name and the column names
        (this notebook uses 'kbs', 'mbc' and 'sbs').

    Returns
    -------
    pandas.DataFrame
        The title and ratings columns, indexed by a DatetimeIndex built from
        the 'date' column (which is then dropped).
    """
    frame = pd.read_csv('2013/%s-web.tsv' % channel, sep='\t',
                        names=['%s title' % channel, 'date', '%s ratings' % channel])
    # Passing an index object (not a column name) leaves the 'date' column in
    # place, so it is dropped explicitly afterwards.
    return frame.set_index(pd.DatetimeIndex(frame['date'])).drop('date', axis=1)


mbc_df = load_channel_ratings('mbc')
sbs_df = load_channel_ratings('sbs')
kbs_df = load_channel_ratings('kbs')

# Inner join on the date index keeps only dates present in all three frames.
# The explicit copy makes ratings_df an independent frame, so adding columns
# below cannot trigger pandas' SettingWithCopyWarning.
ratings_df = kbs_df.join([sbs_df, mbc_df], how='inner')[
    ['kbs title', 'kbs ratings', 'mbc title', 'mbc ratings', 'sbs title', 'sbs ratings']
].copy()

# Row-wise totals: all three channels, and KBS + MBC only.
ratings_df['total rating'] = ratings_df[['kbs ratings', 'mbc ratings', 'sbs ratings']].sum(axis=1)
ratings_df['kbs mbc total rating'] = ratings_df[['kbs ratings', 'mbc ratings']].sum(axis=1)
print(ratings_df.describe())
ratings_df.boxplot(figsize=(12,8))

# Time-series plot; the premiere markers and mean lines drawn later in this
# cell are added through pyplot on top of this figure.
ratings_df.plot(figsize=(16, 10), style='o--', grid=True)

######################################
## Find the date on which each new drama starts
# For every channel, walk its title column in index order and record a row
# each time the title differs from the previous row's title.
start_rows = []
for broadcast in ['kbs title', 'mbc title', 'sbs title']:
    channel = broadcast.split(' ')[0]
    # Reset per channel: otherwise the last title of the previous channel
    # leaks into this one, and this channel's first drama would be skipped
    # if the two titles happened to be equal.
    previous_title = None
    titles = ratings_df[broadcast]
    for index, title in zip(titles.index, titles):
        if title != previous_title:
            start_rows.append([index, title, channel])
            previous_title = title

# One row per premiere: indexed by date, with the drama title and channel name.
startdate_df = pd.DataFrame(start_rows, columns=['date', 'title', 'broadcast'])
startdate_df = startdate_df.set_index(pd.DatetimeIndex(startdate_df['date'])).drop('date', axis=1)

def get_extra_info(broadcast):
    """Return the ``(position, colour)`` pair used when drawing a channel.

    Parameters
    ----------
    broadcast : str
        Channel name; 'kbs', 'sbs' and 'mbc' have dedicated entries.

    Returns
    -------
    tuple
        ``(number, matplotlib colour code)``. Callers in this notebook use the
        number as the y coordinate of the channel's annotation and the colour
        for its lines. Any other name yields the fallback ``(0, 'b')``.
    """
    known_channels = (
        ('kbs', (20, 'b')),
        ('sbs', (23, 'r')),
        ('mbc', (25, 'g')),
    )
    for name, info in known_channels:
        if broadcast == name:
            return info
    # Unknown channel: fall back to height 0 and blue.
    return 0, 'b'

# Mark every premiere with a vertical line in the channel's colour and label
# it with the channel name at that channel's annotation height.
for start_date, channel in zip(startdate_df.index, startdate_df['broadcast']):
    broadcast = str(channel)
    loc, c = get_extra_info(broadcast)
    plt.axvline(start_date, color=c, alpha=0.5)
    plt.annotate(broadcast, (start_date, loc), color=c)

######################################
## Draw one horizontal line per column mean
# describe() summarises the numeric columns; its 'mean' row is a Series keyed
# by column name.
mean_df = ratings_df.describe().loc['mean']
for column_name, mean_value in zip(mean_df.index, mean_df.values):
    broadcast = str(column_name)
    # The colour is looked up from the first word of the column name; names
    # whose first word is not a known channel get get_extra_info's fallback.
    loc, c = get_extra_info(broadcast.split(' ')[0])
    plt.axhline(mean_value, color=c, alpha=0.5)


       kbs ratings  mbc ratings  sbs ratings  total rating  \
count   152.000000   152.000000   152.000000    152.000000   
mean      8.461184     8.536842    14.326974     31.325000   
std       3.476869     2.327170     5.755515      4.789631   
min       2.300000     3.900000     3.900000     22.600000   
25%       5.100000     7.100000    11.075000     28.100000   
50%       8.800000     8.250000    12.800000     30.850000   
75%      10.600000     9.625000    17.725000     34.225000   
max      17.000000    17.100000    28.100000     42.400000   

       kbs mbc total rating  
count            152.000000  
mean              16.998026  
std                4.849776  
min                7.800000  
25%               13.475000  
50%               16.850000  
75%               19.325000  
max               30.300000  

[8 rows x 5 columns]

In [72]:
# ratings_df[ratings_df['kbs title']=='전우치'].plot(figsize=(12,8), style='o--')
  • 타 방송국 프로그램의 시작과 끝이 시청률에 영향을 미치는가?

    • MBC와 SBS는 별로 영향을 받지 않음. KBS는 영향을 받는다.
  • 타 방송국의 프로그램이 다른 프로그램의 시청률에 영향을 미치는가?

    • 일정 부분까지 약 %는 고정적이나 영향을 많이 미친다.
  • 프로그램이 시작할 때의 시청률은 방송국 평균과 비슷한가?

  • SBS의 프로그램이 전체 시청률을 이끈다.

  • SBS는 단독으로 평균 약 14%, KBS와 MBC는 둘을 합쳐 평균 약 17%이기에 SBS가 절대적이다.


In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the three broadcasters' rating files with one shared recipe: a
# tab-separated file without a header row, columns (title, date, ratings),
# re-indexed by a DatetimeIndex built from 'date', which is then dropped.
channel_frames = {}
for channel in ('mbc', 'sbs', 'kbs'):
    raw = pd.read_csv('2013/%s-web.tsv' % channel, sep='\t',
                      names=['%s title' % channel, 'date', '%s ratings' % channel])
    channel_frames[channel] = raw.set_index(pd.DatetimeIndex(raw['date'])).drop('date', axis=1)
mbc_df = channel_frames['mbc']
sbs_df = channel_frames['sbs']
kbs_df = channel_frames['kbs']

# Inner join keeps only dates present in all three frames. The explicit copy
# makes ratings_df an independent frame, so adding 'target' below cannot
# trigger pandas' SettingWithCopyWarning.
ratings_df = kbs_df.join([sbs_df, mbc_df], how='inner')[['kbs ratings', 'mbc ratings', 'sbs ratings']].copy()

# Target: change in the SBS rating relative to the previous row.
# NOTE(review): X below contains the *current* SBS rating, and this target is
# the change that produced it. If the goal is to forecast the next change,
# the target presumably needs shift(-1) instead - confirm intent.
ratings_df['target'] = ratings_df['sbs ratings'].diff()

# The first row has no predecessor, so its target is NaN and it is dropped
# (together with any other row containing a missing value).
ratings_df = ratings_df.dropna()
X = ratings_df[['kbs ratings', 'mbc ratings', 'sbs ratings']]
y = ratings_df['target']


from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn import cross_validation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.svm import SVR


# Evaluate AdaBoost over a grid of base-tree depths (k) and numbers of
# boosting rounds (m), printing explained variance and R^2 for each split.
#
# The splitter does not depend on (k, m), so it is built once. With a fixed
# random_state it yields the same three 75/25 shuffled splits for every
# configuration.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20.
# On a current install this needs sklearn.model_selection.ShuffleSplit
# (n_splits=..., iterated via .split(X)) - migrate together with the imports.
rs = cross_validation.ShuffleSplit(len(X), n_iter=3,  test_size=.25, random_state=0)

for k in (2, 3, 4, 5):
    for m in (100, 200, 300, 400):
        print('**********************************')
        print(k, m)
        # Seed the ensemble so the reported scores are reproducible from run
        # to run; without random_state each execution gives different numbers.
        clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=k), n_estimators=m, random_state=0)
        for train, test in rs:
            print('========================')
            clf.fit(X.iloc[train], y.iloc[train])
            predict_y = clf.predict(X.iloc[test])
            print(explained_variance_score(y.iloc[test], predict_y))
            print(r2_score(y.iloc[test], predict_y))


**********************************
(2, 100)
========================
-0.305120235251
-0.515021340498
========================
-0.310165665228
-0.378290714773
========================
-0.0540004739299
-0.0963897909661
**********************************
(2, 200)
========================
-0.51279713935
-0.63870468738
========================
-0.231036402393
-0.34629383646
========================
-0.067712629445
-0.115944783564
**********************************
(2, 300)
========================
-0.175793249251
-0.269374394339
========================
-0.113175127053
-0.201790435896
========================
-0.115142221184
-0.127340168587
**********************************
(2, 400)
========================
-0.367980575103
-0.52836898992
========================
-0.0488249214423
-0.0897630144115
========================
-0.000693079450956
-0.0143428040609
**********************************
(3, 100)
========================
-0.43668028182
-0.436786470391
========================
-0.318976175736
-0.32570024715
========================
-0.032144546635
-0.103998903627
**********************************
(3, 200)
========================
-0.45805597922
-0.459717705705
========================
-0.304538556623
-0.32570084304
========================
-0.0717521340301
-0.15821501891
**********************************
(3, 300)
========================
-0.432025821871
-0.433731878884
========================
0.135387827502
0.109601867645
========================
-0.0406012755226
-0.120831909558
**********************************
(3, 400)
========================
-0.772812021491
-0.77775958772
========================
-0.680201615691
-0.688752749539
========================
-0.0857916249196
-0.161508288366
**********************************
(4, 100)
========================
-1.16667015111
-1.17561076693
========================
-0.11032981141
-0.201381667574
========================
-0.0155154595573
-0.108898412447
**********************************
(4, 200)
========================
-1.17026108622
-1.17806356979
========================
-0.262012070364
-0.342409905134
========================
0.00179893129807
-0.0769013051854
**********************************
(4, 300)
========================
-1.18061237971
-1.18826581974
========================
-0.18315918064
-0.304801550952
========================
-0.0623033017543
-0.158402999821
**********************************
(4, 400)
========================
-0.843300000338
-0.848414030492
========================
-0.141149353305
-0.246894226318
========================
-0.0349308953548
-0.136977752314
**********************************
(5, 100)
========================
-1.14103710813
-1.16216163286
========================
-0.348022365697
-0.45657415092
========================
-0.0108251095226
-0.0939211584734
**********************************
(5, 200)
========================
-1.17181525296
-1.18000020167
========================
-0.298612370833
-0.40378156333
========================
-0.00680773627617
-0.0937219366809
**********************************
(5, 300)
========================
-1.18861663057
-1.20100657459
========================
-0.28993586549
-0.405923274448
========================
-0.0125090511113
-0.0939785913494
**********************************
(5, 400)
========================
-1.16946302949
-1.18202274343
========================
-0.339058224997
-0.447171441502
========================
-0.00515477469363
-0.089169806279

In [8]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score, KFold, ShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVR

# Grid-search the SVR's C and epsilon with 3-fold cross-validation.
kf = KFold(len(y), n_folds=3)

# Single-step pipeline; the 'svr__' prefix in the grid keys routes each
# parameter to the step named 'svr'.
estimators = [('svr', SVR())]
parameters = {
    'svr__C': (0.8, 1.0, 1.2, 1.4),
    'svr__epsilon': (0.2, 0.3, 0.4, 0.5, 0.6),
}

pipeline = Pipeline(estimators)
grid_search = GridSearchCV(
    pipeline,
    parameters,
    n_jobs=-1,
    verbose=1,
    cv=kf,
)
grid_search.fit(X, y)

# Report the best score, then the winning value of each searched parameter in
# alphabetical order.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.1s finished
Best score: -0.026
Best parameters set:
	svr__C: 0.8
	svr__epsilon: 0.6

In [ ]: