notebook.community

Edit and run



In [12]:

    
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

mbc_df = pd.read_csv('2013/mbc-web.tsv', sep='\t', names=['mbc title', 'date', 'mbc ratings'])
mbc_df = mbc_df.set_index(pd.DatetimeIndex(mbc_df['date']))
mbc_df.drop('date', axis=1,  inplace=True)

sbs_df = pd.read_csv('2013/sbs-web.tsv', sep='\t', names=['sbs title', 'date', 'sbs ratings'])
sbs_df = sbs_df.set_index(pd.DatetimeIndex(sbs_df['date']))
sbs_df.drop('date', axis=1,  inplace=True)
# print(sbs_df.head())

kbs_df = pd.read_csv('2013/kbs-web.tsv', sep='\t', names=['kbs title', 'date', 'kbs ratings'])
kbs_df = kbs_df.set_index(pd.DatetimeIndex(kbs_df['date']))
kbs_df.drop('date', axis=1,  inplace=True)
# print(sbs_df.head(100))

ratings_df = kbs_df.join([sbs_df, mbc_df], how='inner')[['kbs title', 'kbs ratings', 'mbc title', 'mbc ratings', 'sbs title',  'sbs ratings']]
# print(ratings_df.head(100))

ratings_df['total rating'] = ratings_df[['kbs ratings', 'mbc ratings', 'sbs ratings']].apply(np.sum, axis=1)
ratings_df['kbs mbc total rating'] = ratings_df[['kbs ratings', 'mbc ratings']].apply(np.sum, axis=1)
print(ratings_df.describe())
# print(ratings_df.corr())
ratings_df.boxplot(figsize=(12,8))

ratings_df.plot(figsize=(16, 10), style='o--', grid=True)

######################################
## draw starting new drama's date
title = ''
l = []
for broadcast in ['kbs title', 'mbc title', 'sbs title']:
    for (index, row) in ratings_df[[broadcast]].iterrows():
#         print(row.values[0])
        if title != row.values[0]:
            l.append([index, row.values[0], broadcast.split(' ')[0]])
            title = row.values[0]
startdate_df = pd.DataFrame(l, columns=['date', 'title', 'broadcast'])
startdate_df = startdate_df.set_index(pd.DatetimeIndex(startdate_df['date']))
startdate_df.drop('date', axis=1,  inplace=True)

def get_extra_info(broadcast):
    if broadcast == 'kbs':
        return 20, 'b'
    elif broadcast == 'sbs':
        return 23, 'r'
    elif broadcast == 'mbc':
        return 25, 'g'
    else:
        pass
    return 0, 'b'

for i in range(0, len(startdate_df.index)):
    broadcast = str(startdate_df.iloc[i]['broadcast'])
    loc, c = get_extra_info(broadcast)
    plt.axvline(startdate_df.index[i],  color=c, alpha=0.5)
    plt.annotate(broadcast, (startdate_df.index[i], loc), color=c)

######################################
## draw mean line
mean_df = ratings_df.describe().T['mean']
for i in range(0, len(mean_df)):
    broadcast = str(mean_df.index[i])
    loc, c = get_extra_info(broadcast.split(' ')[0])
    plt.axhline(mean_df.iloc[i],  color=c, alpha=0.5)









    



       kbs ratings  mbc ratings  sbs ratings  total rating  \
count   152.000000   152.000000   152.000000    152.000000   
mean      8.461184     8.536842    14.326974     31.325000   
std       3.476869     2.327170     5.755515      4.789631   
min       2.300000     3.900000     3.900000     22.600000   
25%       5.100000     7.100000    11.075000     28.100000   
50%       8.800000     8.250000    12.800000     30.850000   
75%      10.600000     9.625000    17.725000     34.225000   
max      17.000000    17.100000    28.100000     42.400000   

       kbs mbc total rating  
count            152.000000  
mean              16.998026  
std                4.849776  
min                7.800000  
25%               13.475000  
50%               16.850000  
75%               19.325000  
max               30.300000  

[8 rows x 5 columns]



In [15]:

    
# ratings_df



In [14]:

    
'칼과 꽃'
ratings_df[ratings_df['kbs title']=='전우치'].plot(figsize=(12,8), style='o--')









    Out[14]:





<matplotlib.axes.AxesSubplot at 0x5053890>

타방송국의 프로그램이 시작과 끝이 영향을 미치는가?
- MBC와 SBS는 별로 영향을 받지 않음. KBS는 영향을 받는다.
타방송국의 프로그램이 다른 프로그램에 시청율을 미치는가?
- 일정 부분까지 약 %는 고정적이나 영향을 많이 미친다.
프로그램이 시작할 때 방송국 평균과 비슷하다?
SBS의 프로그램이 전체 시청율을 이끈다.
SBS가 평균 15% KBS, MBC는 평균 15%이기에 SBS가 절대적이다.



In [7]:

    
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

mbc_df = pd.read_csv('2013/mbc-web.tsv', sep='\t', names=['mbc title', 'date', 'mbc ratings'])
mbc_df = mbc_df.set_index(pd.DatetimeIndex(mbc_df['date']))
mbc_df.drop('date', axis=1,  inplace=True)

sbs_df = pd.read_csv('2013/sbs-web.tsv', sep='\t', names=['sbs title', 'date', 'sbs ratings'])
sbs_df = sbs_df.set_index(pd.DatetimeIndex(sbs_df['date']))
sbs_df.drop('date', axis=1,  inplace=True)
# print(sbs_df.head())

kbs_df = pd.read_csv('2013/kbs-web.tsv', sep='\t', names=['kbs title', 'date', 'kbs ratings'])
kbs_df = kbs_df.set_index(pd.DatetimeIndex(kbs_df['date']))
kbs_df.drop('date', axis=1,  inplace=True)
# print(sbs_df.head(100))

ratings_df = kbs_df.join([sbs_df, mbc_df], how='inner')[['kbs ratings', 'mbc ratings', 'sbs ratings']]
ratings_df['target'] = ratings_df['sbs ratings'] - ratings_df['sbs ratings'].shift(1)
# print(ratings_df.values)
ratings_df = ratings_df.dropna()
X = ratings_df[['kbs ratings', 'mbc ratings', 'sbs ratings']]
y = ratings_df['target']


from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn import cross_validation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.svm import SVR


for k, m in [(2, 100), (2, 200), (2, 300), (2, 400), 
             (3, 100), (3, 200), (3, 300), (3, 400),
             (4, 100), (4, 200), (4, 300), (4, 400),
             (5, 100), (5, 200), (5, 300), (5, 400)]:
    print('**********************************')
    print(k, m)
    rs = cross_validation.ShuffleSplit(len(X), n_iter=4,  test_size=.25, random_state=0)
    clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=k), n_estimators=m)
    for train, test in rs:
        print('========================')
        clf.fit(X.iloc[train], y.iloc[train])
        predict_y = clf.predict(X.iloc[test])
        print(explained_variance_score(y.iloc[test], predict_y))
        print(r2_score(y.iloc[test], predict_y))









    



**********************************
(2, 100)
========================
-0.223998664223
-0.301195025973
========================
-0.268705113264
-0.395092308843
========================
-0.0434562419467
-0.120488804409
========================
-0.029648491802
-0.168178185628
**********************************
(2, 200)
========================
-0.16449140626
-0.338307891861
========================
-0.234155518941
-0.274369798381
========================
-0.0353807912595
-0.0396275055194
========================
0.00956744819585
-0.088331183316
**********************************
(2, 300)
========================
-0.190436048287
-0.329859458628
========================
-0.160439991598
-0.179315527534
========================
-0.13345412694
-0.148927921291
========================
0.0249697204441
-0.103248204821
**********************************
(2, 400)
========================
-0.406202207939
-0.557234793173
========================
-0.145020545385
-0.18212219139
========================
-0.0969220771282
-0.114870798226
========================
-0.113864175829
-0.279020175849
**********************************
(3, 100)
========================
-0.581759988643
-0.585921230107
========================
-0.133105665791
-0.139973077809
========================
-0.0500553124419
-0.132905441498
========================
0.122547339814
0.121738736413
**********************************
(3, 200)
========================
-0.457396952936
-0.457983030208
========================
-0.06168112418
-0.0861476360559
========================
-0.0479625851121
-0.126426438311
========================
0.177817043528
0.177639699479
**********************************
(3, 300)
========================
-0.433698833157
-0.436627001128
========================
-0.295123752397
-0.300696563912
========================
-0.0529296890956
-0.129681354951
========================
0.101321881848
0.101313914865
**********************************
(3, 400)
========================
-0.69372913903
-0.699465718599
========================
-0.193857454858
-0.201782708433
========================
-0.0681909369183
-0.142011080577
========================
0.0584975321719
0.0557672697757
**********************************
(4, 100)
========================
-1.15303231859
-1.15992890677
========================
-0.223422416582
-0.342971319897
========================
-0.0295467704358
-0.122807920088
========================
0.152774406084
0.148463214915
**********************************
(4, 200)
========================
-1.17813131071
-1.18805837267
========================
-0.233857922881
-0.364664920385
========================
-0.0174374691924
-0.0986143879913
========================
0.143878976592
0.137165950543
**********************************
(4, 300)
========================
-1.15661912391
-1.16681567765
========================
-0.161920013311
-0.260631777616
========================
-0.0520393288425
-0.150110594831
========================
0.118198038834
0.112744685862
**********************************
(4, 400)
========================
-1.15978572534
-1.17097416136
========================
-0.175667578466
-0.271742243867
========================
-0.0176294012577
-0.102605501314
========================
0.154566901738
0.152387173618
**********************************
(5, 100)
========================
-0.699662244565
-0.704103808881
========================
-0.412303516888
-0.554533091271
========================
0.0016509013081
-0.0862960924492
========================
0.121754685048
0.118835024122
**********************************
(5, 200)
========================
-0.363268658992
-0.36417717876
========================
-0.316789869777
-0.45076573354
========================
-0.0428505803789
-0.135668268445
========================
0.155958526912
0.151228694539
**********************************
(5, 300)
========================
-1.16926806464
-1.18102482674
========================
-0.415371079577
-0.537553030683
========================
-0.0193216052049
-0.104883548593
========================
0.131511272791
0.130346281997
**********************************
(5, 400)
========================
-1.16602415864
-1.17784233483
========================
-0.34647806534
-0.466244920235
========================
-0.0283467227147
-0.117163216269
========================
0.191869273337
0.1831077046



In [8]:

    
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score, KFold, ShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVR

kf = KFold(len(y), n_folds=3)

estimators = [('svr', SVR())]
parameters = {'svr__C':(0.8, 1.0, 1.2, 1.4), 'svr__epsilon':(0.2,0.3,0.4,0.5,0.6)}

pipeline = Pipeline(estimators)
# print(pipeline)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=kf)
# print("Performing grid search...")
# print("pipeline:", [name for name, _ in pipeline.steps])
grid_search.fit(X, y)
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))









    



Fitting 3 folds for each of 20 candidates, totalling 60 fits






    



[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.1s finished
Exception in thread Thread-4:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
    self.run()
  File "/media/riemann/dedekind/local/lib/python2.7/site-packages/zmq/utils/garbage.py", line 46, in run
    s.bind(self.gc.url)
  File "socket.pyx", line 448, in zmq.backend.cython.socket.Socket.bind (zmq/backend/cython/socket.c:4127)
  File "checkrc.pxd", line 21, in zmq.backend.cython.checkrc._check_rc (zmq/backend/cython/socket.c:6083)
ZMQError: Address already in use

Exception in thread Thread-4:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
    self.run()
  File "/media/riemann/dedekind/local/lib/python2.7/site-packages/zmq/utils/garbage.py", line 46, in run
    s.bind(self.gc.url)
  File "socket.pyx", line 448, in zmq.backend.cython.socket.Socket.bind (zmq/backend/cython/socket.c:4127)
  File "checkrc.pxd", line 21, in zmq.backend.cython.checkrc._check_rc (zmq/backend/cython/socket.c:6083)
ZMQError: Address already in use

Exception in thread Thread-4:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
    self.run()
  File "/media/riemann/dedekind/local/lib/python2.7/site-packages/zmq/utils/garbage.py", line 46, in run
    s.bind(self.gc.url)
  File "socket.pyx", line 448, in zmq.backend.cython.socket.Socket.bind (zmq/backend/cython/socket.c:4127)
  File "checkrc.pxd", line 21, in zmq.backend.cython.checkrc._check_rc (zmq/backend/cython/socket.c:6083)
ZMQError: Address already in use

Exception in thread Thread-4:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
    self.run()
  File "/media/riemann/dedekind/local/lib/python2.7/site-packages/zmq/utils/garbage.py", line 46, in run
    s.bind(self.gc.url)
  File "socket.pyx", line 448, in zmq.backend.cython.socket.Socket.bind (zmq/backend/cython/socket.c:4127)
  File "checkrc.pxd", line 21, in zmq.backend.cython.checkrc._check_rc (zmq/backend/cython/socket.c:6083)
ZMQError: Address already in use







    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-8-3329e8f34c3b> in <module>()
     17 # print("Performing grid search...")
     18 # print("pipeline:", [name for name, _ in pipeline.steps])
---> 19 grid_search.fit(X, y)
     20 print("Best score: %0.3f" % grid_search.best_score_)
     21 print("Best parameters set:")

/media/riemann/dedekind/local/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y, **params)
    705                           " The params argument will be removed in 0.15.",
    706                           DeprecationWarning)
--> 707         return self._fit(X, y, ParameterGrid(self.param_grid))
    708 
    709 

/media/riemann/dedekind/local/lib/python2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable)
    491                     X, y, base_estimator, parameters, train, test,
    492                     self.scorer_, self.verbose, **self.fit_params)
--> 493                 for parameters in parameter_iterable
    494                 for train, test in cv)
    495 

/media/riemann/dedekind/local/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    529             if n_jobs > 1:
    530                 self._pool.close()
--> 531                 self._pool.join()
    532                 os.environ.pop('__JOBLIB_SPAWNED_PARALLEL__', 0)
    533             self._jobs = list()

/usr/lib/python2.7/multiprocessing/pool.pyc in join(self)
    436         self._result_handler.join()
    437         for p in self._pool:
--> 438             p.join()
    439 
    440     @staticmethod

/usr/lib/python2.7/multiprocessing/process.pyc in join(self, timeout)
    143         assert self._parent_pid == os.getpid(), 'can only join a child process'
    144         assert self._popen is not None, 'can only join a started process'
--> 145         res = self._popen.wait(timeout)
    146         if res is not None:
    147             _current_process._children.discard(self)

/usr/lib/python2.7/multiprocessing/forking.pyc in wait(self, timeout)
    146         def wait(self, timeout=None):
    147             if timeout is None:
--> 148                 return self.poll(0)
    149             deadline = time.time() + timeout
    150             delay = 0.0005

/usr/lib/python2.7/multiprocessing/forking.pyc in poll(self, flag)
    131             if self.returncode is None:
    132                 try:
--> 133                     pid, sts = os.waitpid(self.pid, flag)
    134                 except os.error:
    135                     # Child process not yet created. See #1731717

KeyboardInterrupt:



In [ ]: