In [1]:
import pandas as pd

In [93]:
main_df = pd.read_csv("data/webnovel/main_df.csv", index_col=0, encoding="cp949")

In [94]:
main_df.head()


Out[94]:
level genre ID name episode_total is_fin main_likeit main_score concern_count comments_count
0 webnovel 101 466391 노승아 97 0.0 20028 9.98 86342 181
1 webnovel 101 398090 윤이수 140 0.0 17547 9.98 71965 188
2 webnovel 101 552533 웹소설 작가 9 0.0 0 9.92 16989 34
3 webnovel 101 523286 플아다 45 0.0 3243 9.97 27875 31
4 webnovel 101 514809 이경하 53 0.0 8742 9.96 43191 125

In [95]:
X_data = main_df[["genre", "name", "episode_total", "is_fin"]]

Authors with only one work are merged into a single "기타" (other) category.


In [96]:
name_size = dict(main_df.groupby("name").size())

In [97]:
X_data["name"] = X_data["name"].apply(lambda x: "기타" if name_size[x] == 1 else x)


C:\Users\kms\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
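
The SettingWithCopyWarning appears because `X_data` is a slice of `main_df`. A minimal way to avoid it, assuming an independent frame is what is wanted, is to copy at selection time and assign through `.loc`:

In [ ]:
# Copying makes X_data independent of main_df, so later column
# assignments no longer trigger SettingWithCopyWarning.
X_data = main_df[["genre", "name", "episode_total", "is_fin"]].copy()
X_data.loc[:, "name"] = X_data["name"].apply(lambda x: "기타" if name_size[x] == 1 else x)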

Preprocess the label (categorical) data.


In [98]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [99]:
lab_name = LabelEncoder()

In [100]:
X_data["name"] = lab_name.fit_transform(X_data["name"])


C:\Users\kms\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [101]:
lab_genre = LabelEncoder()

In [102]:
X_data["genre"] = lab_genre.fit_transform(X_data["genre"])


C:\Users\kms\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
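
A fitted LabelEncoder keeps the original labels in `classes_`, so the integer codes can be mapped back at any point (a quick sketch with the encoders fitted above):

In [ ]:
# classes_ lists the original labels in encoded order;
# inverse_transform recovers labels from integer codes.
print(lab_genre.classes_)
print(lab_genre.inverse_transform([0, 1, 2]))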

In [103]:
import statsmodels.api as sm

In [104]:
y = main_df[["concern_count"]]

In [105]:
Xy_data = pd.concat([X_data, y], axis=1)

In [108]:
model = sm.OLS.from_formula("concern_count ~ C(genre) + C(name) + episode_total + is_fin", Xy_data)

In [109]:
result = model.fit()

In [110]:
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:          concern_count   R-squared:                       0.565
Model:                            OLS   Adj. R-squared:                  0.426
Method:                 Least Squares   F-statistic:                     4.074
Date:                Wed, 03 Aug 2016   Prob (F-statistic):           1.65e-13
Time:                        23:15:18   Log-Likelihood:                -2643.6
No. Observations:                 245   AIC:                             5407.
Df Residuals:                     185   BIC:                             5617.
Df Model:                          59                                         
Covariance Type:            nonrobust                                         
=================================================================================
                    coef    std err          t      P>|t|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------
Intercept      2.128e+04   9855.826      2.159      0.032      1835.482  4.07e+04
C(genre)[T.1] -9200.8397   3076.538     -2.991      0.003     -1.53e+04 -3131.230
C(genre)[T.2] -9540.1197   4079.722     -2.338      0.020     -1.76e+04 -1491.359
C(genre)[T.3] -1.459e+04   3787.370     -3.851      0.000     -2.21e+04 -7113.328
C(genre)[T.4] -6741.3330   6021.180     -1.120      0.264     -1.86e+04  5137.672
C(genre)[T.5] -1.414e+04   4530.169     -3.122      0.002     -2.31e+04 -5206.251
C(name)[T.1]   1.368e+04   1.24e+04      1.107      0.270     -1.07e+04  3.81e+04
C(name)[T.2]  -2989.0750   1.49e+04     -0.201      0.841     -3.23e+04  2.63e+04
C(name)[T.3]   -962.0007   1.48e+04     -0.065      0.948     -3.03e+04  2.83e+04
C(name)[T.4]  -3773.9716   1.24e+04     -0.305      0.760     -2.82e+04  2.06e+04
C(name)[T.5]  -5101.5518   9763.748     -0.522      0.602     -2.44e+04  1.42e+04
C(name)[T.6]   1254.7209   1.39e+04      0.091      0.928     -2.61e+04  2.86e+04
C(name)[T.7]  -4807.3060   1.41e+04     -0.341      0.733     -3.26e+04   2.3e+04
C(name)[T.8]  -5645.5332   1.39e+04     -0.407      0.684      -3.3e+04  2.17e+04
C(name)[T.9]   4.273e+04   1.36e+04      3.149      0.002       1.6e+04  6.95e+04
C(name)[T.10]  2.733e+04   1.36e+04      2.016      0.045       589.916  5.41e+04
C(name)[T.11] -6139.5168   1.36e+04     -0.453      0.651     -3.29e+04  2.06e+04
C(name)[T.12] -1.217e+04   1.39e+04     -0.876      0.382     -3.96e+04  1.52e+04
C(name)[T.13] -1712.8099   1.24e+04     -0.139      0.890     -2.61e+04  2.27e+04
C(name)[T.14]  5467.0297   1.35e+04      0.405      0.686     -2.12e+04  3.21e+04
C(name)[T.15]  2.125e+04   1.36e+04      1.568      0.119     -5490.125   4.8e+04
C(name)[T.16] -5815.0000   1.35e+04     -0.430      0.668     -3.25e+04  2.08e+04
C(name)[T.17]  1.462e+04   1.17e+04      1.246      0.214     -8523.891  3.78e+04
C(name)[T.18] -8566.3193   1.37e+04     -0.624      0.533     -3.56e+04  1.85e+04
C(name)[T.19] -1.115e+04   1.35e+04     -0.825      0.410     -3.78e+04  1.55e+04
C(name)[T.20] -1.438e+04   1.41e+04     -1.017      0.311     -4.23e+04  1.35e+04
C(name)[T.21] -3484.4703   1.35e+04     -0.258      0.797     -3.01e+04  2.32e+04
C(name)[T.22]  1.294e+04   1.24e+04      1.047      0.296     -1.14e+04  3.73e+04
C(name)[T.23]  1.109e+04   1.41e+04      0.788      0.432     -1.67e+04  3.89e+04
C(name)[T.24] -4045.3907   1.24e+04     -0.327      0.744     -2.84e+04  2.03e+04
C(name)[T.25] -4675.4871   1.36e+04     -0.345      0.731     -3.14e+04  2.21e+04
C(name)[T.26]  2618.6714   1.18e+04      0.222      0.825     -2.07e+04  2.59e+04
C(name)[T.27]  1.474e+04   1.23e+04      1.195      0.234     -9603.214  3.91e+04
C(name)[T.28]  6.108e+04   1.36e+04      4.486      0.000      3.42e+04  8.79e+04
C(name)[T.29] -7424.3058   1.39e+04     -0.534      0.594     -3.49e+04     2e+04
C(name)[T.30]  1.563e+04   1.24e+04      1.265      0.208     -8751.301     4e+04
C(name)[T.31]  1528.2951   1.36e+04      0.113      0.910     -2.52e+04  2.83e+04
C(name)[T.32]  3.062e+04   1.41e+04      2.177      0.031      2877.459  5.84e+04
C(name)[T.33] -1.124e+04   1.27e+04     -0.888      0.376     -3.62e+04  1.37e+04
C(name)[T.34] -1281.7159   1.41e+04     -0.091      0.928     -2.91e+04  2.66e+04
C(name)[T.35]   700.2239   1.39e+04      0.050      0.960     -2.67e+04  2.81e+04
C(name)[T.36] -5940.3060   1.41e+04     -0.421      0.674     -3.37e+04  2.19e+04
C(name)[T.37]  9533.1136   1.24e+04      0.769      0.443     -1.49e+04   3.4e+04
C(name)[T.38]  4.408e+04   1.41e+04      3.116      0.002      1.62e+04   7.2e+04
C(name)[T.39] -1.656e+04   1.35e+04     -1.226      0.222     -4.32e+04  1.01e+04
C(name)[T.40] -7244.6396   1.36e+04     -0.533      0.595     -3.41e+04  1.96e+04
C(name)[T.41] -9894.4851   1.35e+04     -0.732      0.465     -3.66e+04  1.68e+04
C(name)[T.42] -3722.7693   1.36e+04     -0.274      0.784     -3.05e+04   2.3e+04
C(name)[T.43] -9872.9079   1.36e+04     -0.728      0.467     -3.66e+04  1.69e+04
C(name)[T.44]   -56.3201   1.27e+04     -0.004      0.996     -2.52e+04  2.51e+04
C(name)[T.45] -4233.4920   1.36e+04     -0.312      0.755      -3.1e+04  2.25e+04
C(name)[T.46]  7997.1862   1.39e+04      0.577      0.564     -1.93e+04  3.53e+04
C(name)[T.47] -5462.8163   1.29e+04     -0.423      0.673     -3.09e+04     2e+04
C(name)[T.48]  2536.6698   1.17e+04      0.217      0.829     -2.06e+04  2.56e+04
C(name)[T.49]  1.121e+04   1.24e+04      0.907      0.365     -1.32e+04  3.56e+04
C(name)[T.50]   496.4257   1.35e+04      0.037      0.971     -2.62e+04  2.72e+04
C(name)[T.51]  -1.48e+04   1.23e+04     -1.200      0.232     -3.91e+04  9540.059
C(name)[T.52]  1.186e+04   1.35e+04      0.878      0.381     -1.48e+04  3.85e+04
episode_total    43.0297     28.066      1.533      0.127       -12.341    98.400
is_fin         3383.0754   2321.221      1.457      0.147     -1196.391  7962.542
==============================================================================
Omnibus:                       63.717   Durbin-Watson:                   1.091
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              223.152
Skew:                           1.048   Prob(JB):                     3.49e-49
Kurtosis:                       7.179   Cond. No.                     4.73e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.73e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
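
With 52 author dummies most terms are insignificant; a small follow-up sketch that keeps only the coefficients with p < 0.05 from the fitted `result`:

In [ ]:
# params and pvalues are index-aligned Series, so a boolean
# mask filters to the statistically significant coefficients.
print(result.params[result.pvalues < 0.05])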

In [22]:
episode_df = pd.read_csv("data/episode_df.csv", index_col=0)
episode_df["ID"] = episode_df["ID"].astype("int64")

In [63]:
novel_19_df = pd.read_csv("data/novel_19.csv", encoding="cp949", index_col=0).dropna()

In [64]:
text_df = main_df.merge(episode_df, on="ID")[["ID", "genre", "text"]]

In [65]:
text_df = pd.concat([text_df, novel_19_df]).reset_index(drop=True)

In [66]:
X_train = text_df.loc[text_df["ID"] != 466391, "text"]
y_train = text_df.loc[text_df["ID"] != 466391, "genre"]
X_test = text_df.loc[text_df["ID"] == 466391, "text"]
y_test = text_df.loc[text_df["ID"] == 466391, "genre"]

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in scikit-learn >= 0.18
from sklearn.cross_validation import LabelKFold         # renamed GroupKFold in scikit-learn >= 0.18
from sklearn.metrics import classification_report

In [89]:
vect = TfidfVectorizer()

In [90]:
%%time
vect.fit(text_df["text"])


Wall time: 3min 10s
Out[90]:
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
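
As a quick sanity check on the fitted vectorizer (`vocabulary_` is set by `fit`):

In [ ]:
# Number of distinct terms the TF-IDF vectorizer extracted
print(len(vect.vocabulary_))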

In [133]:
def make_predict_proba(ID, vect):

    # Leave-one-novel-out: train on every other novel, predict this one.
    X_train = text_df.loc[text_df["ID"] != ID, "text"]
    y_train = text_df.loc[text_df["ID"] != ID, "genre"]
    X_test = text_df.loc[text_df["ID"] == ID, "text"]

    model = MultinomialNB()
    model.fit(vect.transform(X_train), y_train)
    predict = model.predict_proba(vect.transform(X_test))

    genres = [101, 102, 103, 104, 106, 108, 109]
    predict_df = pd.DataFrame(predict, columns=genres)

    # Probability the model assigns to this novel's true genre.
    genre = text_df.loc[text_df["ID"] == ID, "genre"].unique()[0]
    predict_df["genre_proba"] = predict[:, genres.index(genre)]

    predict_df["ID"] = ID

    return predict_df

In [134]:
def make_predict_proba_df():

    vect = TfidfVectorizer()
    vect.fit(text_df["text"])

    proba_df = pd.DataFrame(columns=["ID", 101, 102, 103, 104, 106, 108, 109])

    for ID in main_df["ID"]:
        predict_proba_df = make_predict_proba(ID, vect)
        proba_df = pd.concat([proba_df, predict_proba_df])

    # reset_index returns a new frame, so the result must be assigned back
    proba_df = proba_df.reset_index(drop=True)
    proba_df.to_csv("data/proba_df.csv")

    return proba_df

In [135]:
%%time
proba_df = make_predict_proba_df()


Wall time: 1h 35min 41s

In [247]:
proba_df = proba_df.reset_index()

In [250]:
text_df.head()


Out[250]:
ID genre text
0 466391 101.0 \n프롤로그 철컥.현관문이 닫히는 소리와 함께 짙은 어둠이 내려앉았다.“아, 센서등...
1 466391 101.0 결혼해 보라. 당신은 후회할 것이다.그러면 결혼하지 말라. 당신은 더욱 후회할 것이...
2 466391 101.0 \n-하이네>야간자율학습 시간 내내, 해수는 노트에 낙서를 했다.결혼. 겨울방학. ...
3 466391 101.0 \n-안톤 체호프>\n해수를 집 앞에 내려주고 돌아가는 길.혁준은 붉은색으로 바뀐 ...
4 466391 101.0 \n-조지 고든 바이런>\n혁준은 정호, 유리와의 통화를 끝내고 휴대폰을 내려놓았다...

In [252]:
import numpy as np

predict_list = []
for i in range(len(proba_df)):
    # take the column label (genre code) with the highest probability in this row
    predict_list.append(proba_df.loc[i][2:].astype(float).idxmax())
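
The loop can also be vectorized (a sketch; like the slice above, it assumes the columns from position 2 onward are the genre-probability columns):

In [ ]:
# idxmax over axis=1 returns, per row, the column label with the largest value
predict_list = proba_df.iloc[:, 2:].astype(float).idxmax(axis=1).tolist()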

In [257]:
print(classification_report(text_df[text_df["genre"] != 109]["genre"], predict_list))


             precision    recall  f1-score   support

      101.0       0.34      1.00      0.51      2792
      102.0       0.81      0.02      0.05      2025
      103.0       0.87      0.03      0.06      1829
      104.0       0.00      0.00      0.00       605
      106.0       0.00      0.00      0.00       467
      108.0       0.00      0.00      0.00       629

avg / total       0.50      0.35      0.19      8347

C:\Users\kms\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [179]:
group_proba_df = proba_df.groupby("ID", as_index=False).agg({101: np.mean, 102: np.mean, 103: np.mean, 104: np.mean, 106: np.mean, 108: np.mean, 109: np.mean})

In [180]:
all_df = main_df.merge(group_proba_df, on="ID")

In [181]:
all_df.rename(columns={101:"romance", 102:"SFfantasy", 103:"matial", 104:"mystery", 106:"lightnovel", 108:"fusion", 109:"adult",}, inplace=True)

In [192]:
X_data = all_df[["genre", "episodes_count", "romance", "SFfantasy", "matial", "mystery", "lightnovel", "fusion", "adult"]]

In [153]:
y_data = all_df[["concern_count"]]

In [188]:
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [189]:
scaler = StandardScaler()

In [194]:
X_data = pd.DataFrame(scaler.fit_transform(X_data), columns=["genre", "episodes_count", "romance", "SFfantasy", "matial", "mystery", "lightnovel", "fusion", "adult"])

In [196]:
model2 = sm.OLS(y_data, sm.add_constant(X_data))

In [197]:
print(model2.fit().summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:          concern_count   R-squared:                       0.148
Model:                            OLS   Adj. R-squared:                  0.115
Method:                 Least Squares   F-statistic:                     4.476
Date:                Thu, 28 Jul 2016   Prob (F-statistic):           1.98e-05
Time:                        06:46:08   Log-Likelihood:                -2682.9
No. Observations:                 241   AIC:                             5386.
Df Residuals:                     231   BIC:                             5421.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [95.0% Conf. Int.]
----------------------------------------------------------------------------------
const           1.946e+04   1088.247     17.886      0.000      1.73e+04  2.16e+04
genre          -5633.9358   1137.429     -4.953      0.000     -7874.997 -3392.874
episodes_count  1608.3709   1192.666      1.349      0.179      -741.524  3958.265
romance         2.376e+16   2.45e+16      0.969      0.334     -2.46e+16  7.21e+16
SFfantasy       1.476e+16   1.52e+16      0.969      0.334     -1.53e+16  4.48e+16
matial          1.319e+16   1.36e+16      0.969      0.334     -1.36e+16     4e+16
mystery         2.815e+13   2.91e+13      0.969      0.334     -2.91e+13  8.54e+13
lightnovel      1.787e+13   1.84e+13      0.969      0.334     -1.85e+13  5.42e+13
fusion           3.64e+13   3.76e+13      0.969      0.334     -3.76e+13   1.1e+14
adult           3.472e+13   3.58e+13      0.969      0.334     -3.59e+13  1.05e+14
==============================================================================
Omnibus:                      106.084   Durbin-Watson:                   0.597
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              342.359
Skew:                           1.937   Prob(JB):                     4.55e-75
Kurtosis:                       7.369   Cond. No.                     6.15e+13
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 2.8e-25. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
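
The seven genre-probability columns sum to one for every row, so together with the intercept the design matrix is exactly collinear; that is what the smallest-eigenvalue warning is reporting. A sketch of the usual remedy, dropping one of the collinear columns (here `adult`), assuming `X_data` and `y_data` as above:

In [ ]:
# Dropping one probability column removes the exact linear dependence
# (romance + SFfantasy + ... + adult = 1) and restores full rank.
X_reduced = X_data.drop("adult", axis=1)
print(sm.OLS(y_data, sm.add_constant(X_reduced)).fit().summary())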

In [208]:
len(episode_df)


Out[208]:
8347

In [209]:
len(proba_df)


Out[209]:
8347

In [221]:
stack_df = pd.DataFrame(np.hstack([episode_df, proba_df]), columns=list(episode_df.columns)+list(proba_df.columns))
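
Note that `np.hstack` over two DataFrames upcasts every column to a single object dtype. A dtype-preserving alternative is a column-wise concat (a sketch over the same two frames):

In [ ]:
# axis=1 concat keeps each column's dtype; resetting the indexes
# first makes the rows align positionally, as hstack does.
stack_df = pd.concat([episode_df.reset_index(drop=True),
                      proba_df.reset_index(drop=True)], axis=1)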

In [227]:
stack_df.rename(columns={101:"romance", 102:"SFfantasy", 103:"matial", 104:"mystery", 106:"lightnovel", 108:"fusion", 109:"adult",}, inplace=True)

In [231]:
stack_df.iloc[:, [2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14]]


Out[231]:
is_first score score_count episode_comments_count romance SFfantasy matial mystery lightnovel fusion adult
0 1 9.92 3889 1219 0.996761 0.00255418 0.000682002 9.6311e-07 2.32203e-07 8.7567e-07 1.18855e-06
1 0 9.93 3530 171 0.993403 0.00552969 0.00106461 8.25206e-07 2.91303e-07 8.75303e-07 6.88777e-07
2 0 9.96 3493 223 0.993623 0.00544217 0.000932973 4.32594e-07 1.31084e-07 4.8975e-07 4.01862e-07
3 0 9.96 3629 223 0.995663 0.00368673 0.000647183 1.15869e-06 2.55982e-07 9.03143e-07 1.09214e-06
4 0 9.97 3830 205 0.986212 0.0117887 0.00197172 8.94634e-06 2.76763e-06 7.55279e-06 8.65453e-06
5 0 9.96 3769 257 0.991155 0.00745319 0.0013853 1.82403e-06 5.89243e-07 1.80845e-06 2.20027e-06
6 0 9.98 3681 142 0.992543 0.00627144 0.00117913 1.83714e-06 6.84919e-07 1.76284e-06 2.42631e-06
7 0 9.99 3271 161 0.997269 0.00218077 0.000544956 1.59446e-06 6.25787e-07 1.50678e-06 1.76338e-06
8 0 9.98 3430 200 0.998006 0.00181251 0.000179985 4.67465e-07 2.11563e-07 4.18389e-07 5.28671e-07
9 0 9.98 3827 195 0.995896 0.00377251 0.000329461 7.33249e-07 2.45185e-07 6.28282e-07 7.12114e-07
10 0 9.98 3436 123 0.9926 0.00623276 0.00116063 1.51443e-06 4.22008e-07 1.59556e-06 2.58777e-06
11 0 9.99 3663 133 0.996407 0.00298809 0.000603271 4.54692e-07 1.38176e-07 4.26662e-07 4.58839e-07
12 0 9.98 3807 200 0.9956 0.00379493 0.000602983 6.39851e-07 2.92518e-07 7.51962e-07 6.87168e-07
13 0 9.97 3664 212 0.997896 0.00187465 0.000227785 2.82566e-07 8.56061e-08 3.10612e-07 4.88926e-07
14 0 9.98 3707 132 0.99515 0.00378571 0.00106067 1.12368e-06 2.97781e-07 1.04835e-06 1.35326e-06
15 0 9.97 3605 136 0.988233 0.00973499 0.00201433 4.73666e-06 1.4279e-06 4.87323e-06 6.81464e-06
16 0 9.99 3851 189 0.993411 0.00542851 0.0011545 1.77943e-06 3.67386e-07 1.57478e-06 2.07981e-06
17 0 9.99 4049 214 0.996826 0.00294038 0.000231951 4.14665e-07 1.64471e-07 4.97131e-07 7.24426e-07
18 0 9.98 3888 199 0.995052 0.00402527 0.000918271 1.03484e-06 3.05909e-07 1.19111e-06 2.13834e-06
19 0 9.97 3750 194 0.99483 0.00429366 0.000874005 5.73387e-07 1.72124e-07 5.5877e-07 6.08037e-07
20 0 9.97 3646 164 0.995309 0.00391581 0.000771982 8.98116e-07 3.33077e-07 8.52505e-07 1.04376e-06
21 0 9.97 3537 190 0.986334 0.0114737 0.00217827 3.8508e-06 1.18821e-06 3.83248e-06 4.82937e-06
22 0 9.97 3806 243 0.989295 0.00873679 0.00195442 4.36069e-06 1.6241e-06 4.24306e-06 3.82923e-06
23 0 9.98 3694 279 0.985699 0.0117683 0.00251421 4.70771e-06 1.50291e-06 4.53625e-06 7.51987e-06
24 0 9.98 3645 184 0.997118 0.00243468 0.000447044 1.91556e-07 6.10126e-08 2.19668e-07 1.31913e-07
25 0 9.97 3704 147 0.99451 0.00461526 0.00086783 2.02288e-06 5.28754e-07 1.83329e-06 2.08171e-06
26 0 9.98 4242 293 0.993395 0.00565843 0.000942015 1.17481e-06 4.19605e-07 1.35907e-06 1.31414e-06
27 0 9.98 3968 274 0.997454 0.00211021 0.000434157 3.27386e-07 1.65028e-07 3.66975e-07 4.98158e-07
28 0 9.98 3610 187 0.995357 0.00350873 0.00113064 1.05238e-06 3.17867e-07 1.06167e-06 1.59615e-06
29 0 9.99 4106 266 0.997388 0.0021576 0.000452812 5.12701e-07 2.16948e-07 5.25071e-07 7.87142e-07
... ... ... ... ... ... ... ... ... ... ... ...
8317 0 9.63 198 39 0.978824 0.0201162 0.00103777 6.94255e-06 4.74901e-06 6.00854e-06 4.4528e-06
8318 0 9.49 159 34 0.950871 0.037493 0.0114417 5.52819e-05 4.0407e-05 5.06164e-05 4.77088e-05
8319 0 9.37 137 38 0.95589 0.0356363 0.00833857 3.90109e-05 2.86723e-05 4.41421e-05 2.2943e-05
8320 0 9.22 115 31 0.973007 0.0261689 0.000802065 7.81133e-06 4.06847e-06 5.64804e-06 4.85713e-06
8321 0 9.16 118 26 0.985751 0.0133676 0.000867831 4.30493e-06 2.3077e-06 3.70835e-06 2.97135e-06
8322 0 9.31 130 28 0.97815 0.0193737 0.00243072 1.28054e-05 8.24686e-06 1.30193e-05 1.18927e-05
8323 0 9.41 108 31 0.986655 0.0127349 0.000598482 3.52972e-06 2.25995e-06 2.66755e-06 2.84763e-06
8324 0 9.39 103 28 0.933666 0.0613874 0.00484929 3.11642e-05 1.69548e-05 3.08076e-05 1.85089e-05
8325 0 9.43 95 26 0.946423 0.0517725 0.00177548 8.75156e-06 5.43013e-06 8.33919e-06 6.45058e-06
8326 0 9.32 92 29 0.983977 0.0154722 0.000520721 1.10151e-05 5.07253e-06 6.60171e-06 6.92474e-06
8327 0 9.36 91 19 0.969337 0.0292618 0.00135326 1.43735e-05 1.04729e-05 1.22095e-05 1.059e-05
8328 0 9.06 100 20 0.98552 0.0139271 0.000542893 3.45296e-06 1.37912e-06 2.58344e-06 2.58277e-06
8329 0 9.05 108 19 0.982169 0.016834 0.000969114 9.66214e-06 5.83884e-06 7.38129e-06 5.3863e-06
8330 0 9.14 87 30 0.965639 0.0304209 0.00386229 2.41426e-05 1.5781e-05 2.18124e-05 1.60854e-05
8331 0 8.83 92 32 0.930697 0.0591658 0.00979211 9.52837e-05 8.25959e-05 9.76118e-05 6.94451e-05
8332 0 8.86 78 25 0.960317 0.036018 0.00359266 2.05953e-05 1.50884e-05 2.21989e-05 1.45918e-05
8333 0 9.24 82 26 0.974095 0.0244182 0.0014309 1.53591e-05 9.85645e-06 1.18919e-05 1.90719e-05
8334 0 8.92 89 23 0.96745 0.0313858 0.00109354 2.46829e-05 9.98356e-06 1.60205e-05 2.0016e-05
8335 0 8.92 74 17 0.990672 0.00904011 0.000284364 1.40981e-06 5.0027e-07 9.83181e-07 1.07025e-06
8336 0 9.28 87 39 0.964277 0.0336699 0.00201119 1.27614e-05 8.2706e-06 1.11565e-05 1.0082e-05
8337 0 8.92 76 18 0.970633 0.0277353 0.00157405 1.85727e-05 1.05733e-05 1.2513e-05 1.5724e-05
8338 0 8.94 81 16 0.967806 0.029744 0.002413 1.08616e-05 7.29567e-06 9.7356e-06 9.10651e-06
8339 0 9.03 73 21 0.890563 0.0943081 0.0150841 1.2562e-05 3.69864e-06 1.19503e-05 1.67741e-05
8340 0 9.23 66 20 0.90827 0.0878207 0.00389947 2.72787e-06 1.24456e-06 3.17398e-06 2.45373e-06
8341 0 9.36 104 33 0.932414 0.0584751 0.009084 7.9239e-06 4.61802e-06 7.69809e-06 6.53069e-06
8342 1 8.76 851 246 0.916136 0.0680145 0.0158295 5.53178e-06 3.17836e-06 6.7428e-06 4.17631e-06
8343 0 8.25 293 76 0.959396 0.0364027 0.00417791 6.34824e-06 3.86028e-06 6.27671e-06 7.30174e-06
8344 0 7.65 205 33 0.9829 0.0154141 0.00168368 5.46146e-07 3.04465e-07 6.2754e-07 8.26238e-07
8345 0 7.65 161 26 0.971998 0.0249012 0.0030851 3.66783e-06 2.11937e-06 4.18493e-06 5.28245e-06
8346 0 7.44 170 29 0.956794 0.0389744 0.00421631 6.88631e-06 1.58935e-06 4.6474e-06 2.41662e-06

8347 rows × 11 columns


In [237]:
model = sm.OLS.from_formula("score ~ is_first + romance + SFfantasy + matial + mystery + lightnovel + fusion + adult", stack_df)

In [240]:
result = model.fit()


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-240-57cc4af9e2c4> in <module>()
----> 1 result = model.fit()

C:\Users\kms\Anaconda3\lib\site-packages\statsmodels\regression\linear_model.py in fit(self, method, cov_type, cov_kwds, use_t, **kwargs)
    172                 (not hasattr(self, 'rank'))):
    173 
--> 174                 self.pinv_wexog, singular_values = pinv_extended(self.wexog)
    175                 self.normalized_cov_params = np.dot(self.pinv_wexog,
    176                                         np.transpose(self.pinv_wexog))

C:\Users\kms\Anaconda3\lib\site-packages\statsmodels\tools\tools.py in pinv_extended(X, rcond)
    390     X = np.asarray(X)
    391     X = X.conjugate()
--> 392     u, s, vt = np.linalg.svd(X, 0)
    393     s_orig = np.copy(s)
    394     m = u.shape[0]

C:\Users\kms\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in svd(a, full_matrices, compute_uv)
   1357 
   1358         signature = 'D->DdD' if isComplexType(t) else 'd->ddd'
-> 1359         u, s, vt = gufunc(a, signature=signature, extobj=extobj)
   1360         u = u.astype(result_t, copy=False)
   1361         s = s.astype(_realType(result_t), copy=False)

MemoryError: 
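
One likely culprit: `stack_df` was built with `np.hstack`, so every column is object dtype by the time `fit` runs its SVD. Converting the regression columns to numeric before refitting is a reasonable first step (a sketch; column names as used in the formula above):

In [ ]:
cols = ["score", "is_first", "romance", "SFfantasy", "matial",
        "mystery", "lightnovel", "fusion", "adult"]
# to_numeric restores the float dtypes that np.hstack flattened to object
stack_df[cols] = stack_df[cols].apply(pd.to_numeric)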

In [ ]:
print(result.summary())

In [259]:
all_df


Out[259]:
ID level genre main_score concern_count episodes_count comments_count romance SFfantasy matial mystery lightnovel fusion adult
0 466391 webnovel 101.0 9.98 86094 95 179 0.993969 0.004980 0.001046 1.445343e-06 4.803589e-07 1.434219e-06 1.883553e-06
1 398090 webnovel 101.0 9.98 71748 138 189 0.960641 0.013836 0.025512 2.390496e-06 6.062998e-07 5.936056e-06 1.502595e-06
2 514809 webnovel 101.0 9.96 43120 51 125 0.998369 0.001436 0.000190 9.660413e-07 5.004053e-07 1.100571e-06 1.373403e-06
3 505096 webnovel 101.0 9.95 48354 59 76 0.984675 0.013187 0.002107 7.836291e-06 3.831237e-06 8.578221e-06 1.020554e-05
4 523286 webnovel 101.0 9.97 27443 43 29 0.996526 0.002925 0.000544 1.519839e-06 4.883162e-07 1.427922e-06 1.901099e-06
5 552533 webnovel 101.0 9.92 16120 7 27 0.985303 0.012440 0.002148 3.113375e-05 1.320339e-05 2.737527e-05 3.720595e-05
6 514807 webnovel 101.0 9.96 36196 51 56 0.998335 0.001381 0.000282 5.093238e-07 2.542612e-07 4.404917e-07 8.493947e-07
7 466374 webnovel 101.0 9.86 34715 95 92 0.974345 0.013147 0.012492 3.980548e-06 9.503504e-07 6.043869e-06 4.812399e-06
8 483047 webnovel 101.0 9.98 33926 77 29 0.996668 0.002617 0.000711 5.325223e-07 1.852298e-07 2.596288e-06 4.936195e-07
9 514808 webnovel 101.0 9.92 31654 51 33 0.999400 0.000522 0.000078 9.013786e-08 3.820966e-08 7.226822e-08 1.099518e-07
10 554027 webnovel 101.0 9.97 19749 7 25 0.996832 0.002797 0.000366 1.546845e-06 4.466064e-07 1.248492e-06 2.397591e-06
11 473138 webnovel 101.0 9.94 29064 85 37 0.991668 0.006818 0.001491 6.611833e-06 2.543757e-06 6.067613e-06 7.449167e-06
12 532196 webnovel 101.0 9.96 22466 33 26 0.999019 0.000835 0.000145 2.813590e-07 8.690416e-08 2.632304e-07 3.141294e-07
13 505101 webnovel 101.0 9.96 28100 59 35 0.983621 0.013435 0.002892 1.689737e-05 4.588671e-06 1.491927e-05 1.556475e-05
14 538920 webnovel 101.0 9.97 19175 24 46 0.991078 0.008286 0.000634 9.331366e-07 2.607075e-07 7.256550e-07 6.727087e-07
15 514812 webnovel 101.0 9.95 20360 50 11 0.992427 0.006692 0.000863 3.953257e-06 1.388956e-06 4.116991e-06 7.970439e-06
16 554029 webnovel 101.0 9.66 11761 7 24 0.970096 0.020605 0.009216 1.993610e-05 1.120622e-05 2.300775e-05 2.824555e-05
17 485251 webnovel 101.0 9.94 22943 77 38 0.934641 0.022862 0.042461 9.159519e-06 2.337529e-06 1.412679e-05 1.062857e-05
18 494448 webnovel 101.0 9.96 23982 69 57 0.987416 0.010239 0.002302 1.350049e-05 5.765558e-06 1.347839e-05 1.035987e-05
19 545434 webnovel 101.0 9.97 16647 17 18 0.991897 0.006700 0.001396 2.203359e-06 7.935207e-07 2.185550e-06 2.534446e-06
20 514814 webnovel 101.0 9.84 18739 51 21 0.999364 0.000533 0.000102 2.781139e-07 1.337080e-07 2.718451e-07 2.242584e-07
21 538919 webnovel 101.0 9.96 16788 25 23 0.990412 0.006177 0.003408 6.147525e-07 2.443061e-07 6.587420e-07 5.641166e-07
22 545432 webnovel 101.0 9.93 14708 16 13 0.991868 0.006279 0.001716 6.407981e-05 1.513034e-05 2.630259e-05 3.087830e-05
23 554028 webnovel 101.0 9.96 9649 7 4 0.996700 0.002729 0.000568 2.691083e-06 9.589671e-08 2.911591e-07 3.547408e-07
24 485250 webnovel 101.0 9.97 23096 76 18 0.984949 0.012058 0.002922 1.992631e-05 8.694231e-06 1.838083e-05 2.312191e-05
25 532198 webnovel 101.0 9.92 15906 33 36 0.984644 0.007595 0.007750 2.690181e-06 9.754488e-07 5.378394e-06 2.551465e-06
26 523290 webnovel 101.0 9.95 17589 43 27 0.998415 0.001303 0.000277 1.444165e-06 5.716618e-07 1.455408e-06 1.480961e-06
27 545435 webnovel 101.0 9.97 13839 16 34 0.980865 0.015089 0.004043 6.880727e-07 1.936134e-07 1.023595e-06 5.827077e-07
28 532199 webnovel 101.0 9.96 15875 33 58 0.993566 0.005684 0.000744 1.826625e-06 8.443799e-07 1.904041e-06 1.271043e-06
29 494450 webnovel 101.0 9.96 19397 69 15 0.978974 0.012022 0.008959 9.993998e-06 3.586207e-06 1.587473e-05 1.535894e-05
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
211 50520 webnovel 104.0 9.95 6437 45 1 0.934258 0.053829 0.011867 1.022589e-05 4.631214e-06 1.815727e-05 1.251620e-05
212 113903 webnovel 104.0 9.77 7325 9 2 0.892085 0.080280 0.026448 3.136395e-04 9.599328e-05 2.733141e-04 5.034985e-04
213 99001 webnovel 104.0 9.79 7117 60 0 0.967893 0.022156 0.009834 3.069187e-05 1.175605e-05 3.902217e-05 3.695452e-05
214 50518 webnovel 104.0 9.77 4011 5 0 0.940350 0.045789 0.013799 2.039404e-05 5.975756e-06 1.984347e-05 1.549793e-05
215 153309 webnovel 104.0 9.71 2784 5 7 0.964349 0.030288 0.005325 2.309729e-05 1.291176e-06 6.719014e-06 6.932062e-06
216 113904 webnovel 104.0 9.40 2095 5 0 0.873610 0.099661 0.024855 5.848567e-04 1.660454e-04 4.531664e-04 6.694488e-04
217 63071 webnovel 104.0 9.39 1769 1 0 0.823737 0.114589 0.061429 7.033829e-05 1.012966e-05 7.752577e-05 8.628442e-05
218 215394 webnovel 104.0 9.72 1701 50 0 0.908446 0.070461 0.020744 1.287177e-04 3.737242e-05 1.194759e-04 6.398999e-05
219 291829 webnovel 104.0 9.72 1179 5 0 0.985270 0.012600 0.002081 1.955910e-05 6.003208e-06 1.267583e-05 1.075962e-05
220 126775 webnovel 104.0 9.83 2417 5 1 0.981565 0.015594 0.002839 3.142884e-07 1.169013e-07 2.786594e-07 5.830829e-07
221 494452 webnovel 106.0 9.96 18765 68 17 0.988963 0.009431 0.001571 1.158039e-05 1.645350e-06 8.777677e-06 1.313621e-05
222 494458 webnovel 106.0 9.95 18428 69 27 0.990813 0.006093 0.003090 1.370683e-06 2.179424e-07 1.499826e-06 1.164900e-06
223 423620 webnovel 106.0 9.92 12605 121 40 0.972061 0.024273 0.003652 3.708277e-06 5.145892e-07 4.059308e-06 5.151478e-06
224 456728 webnovel 106.0 9.95 10327 104 104 0.851440 0.118342 0.030159 1.583590e-05 3.290750e-06 2.543312e-05 1.373876e-05
225 361654 webnovel 106.0 9.91 19946 7 24 0.966827 0.022858 0.008386 4.354335e-04 2.045180e-04 9.995898e-04 2.894717e-04
226 398093 webnovel 106.0 9.94 25847 5 75 0.940230 0.048091 0.011542 3.585632e-05 1.218167e-05 4.055629e-05 4.784122e-05
227 252939 webnovel 106.0 9.83 20019 20 28 0.968589 0.027097 0.004278 9.468333e-06 8.616355e-06 9.277670e-06 9.709161e-06
228 466372 webnovel 106.0 9.89 5850 68 28 0.953480 0.035924 0.010524 1.897680e-05 3.361584e-06 2.820739e-05 2.183960e-05
229 252941 webnovel 106.0 9.42 6466 5 11 0.723633 0.230537 0.044655 2.830989e-04 1.556915e-04 3.803953e-04 3.556454e-04
230 456727 webnovel 108.0 9.97 18706 103 34 0.955125 0.026745 0.018106 7.944327e-06 2.712969e-06 4.717050e-06 8.910317e-06
231 505107 webnovel 108.0 9.95 9158 59 37 0.990124 0.003831 0.006034 3.103803e-06 1.045507e-06 2.688798e-06 4.778717e-06
232 538923 webnovel 108.0 9.94 4672 25 5 0.999208 0.000626 0.000165 2.781051e-07 7.057160e-08 2.239911e-07 6.901216e-07
233 538925 webnovel 108.0 9.87 4097 25 21 0.989412 0.008955 0.001612 7.481042e-06 1.908479e-06 4.624563e-06 7.182610e-06
234 466392 webnovel 108.0 9.95 6825 94 19 0.820782 0.133148 0.046002 2.147574e-05 8.155202e-06 1.274992e-05 2.643142e-05
235 433837 webnovel 108.0 9.93 4468 113 12 0.759214 0.189596 0.051058 5.150152e-05 1.763464e-05 3.074302e-05 3.230948e-05
236 538926 webnovel 108.0 9.96 3445 25 9 0.955100 0.022902 0.021963 1.161190e-05 2.796723e-06 1.068454e-05 8.813088e-06
237 398095 webnovel 108.0 9.91 5477 100 14 0.856304 0.092580 0.050810 1.169083e-04 3.112743e-05 5.423470e-05 1.034936e-04
238 272505 webnovel 108.0 9.94 4801 53 15 0.979590 0.018526 0.001858 9.510890e-06 4.843141e-06 4.906995e-06 6.791485e-06
239 252942 webnovel 108.0 9.28 1093 27 0 0.960285 0.035858 0.003792 1.955862e-05 1.291997e-05 1.783311e-05 1.468656e-05
240 291783 webnovel 108.0 8.29 662 5 1 0.957445 0.036741 0.005798 4.596061e-06 2.210364e-06 4.495877e-06 4.000672e-06

241 rows × 14 columns


In [267]:
model = sm.OLS.from_formula("concern_count ~ C(genre) + episodes_count + romance + SFfantasy + matial", all_df)

In [268]:
result = model.fit()

In [269]:
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:          concern_count   R-squared:                       0.181
Model:                            OLS   Adj. R-squared:                  0.160
Method:                 Least Squares   F-statistic:                     8.604
Date:                Thu, 28 Jul 2016   Prob (F-statistic):           1.84e-08
Time:                        11:00:26   Log-Likelihood:                -2678.3
No. Observations:                 241   AIC:                             5371.
Df Residuals:                     234   BIC:                             5395.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
=====================================================================================
                        coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------
Intercept          2.443e+04   1557.194     15.688      0.000      2.14e+04  2.75e+04
C(genre)[T.102.0] -1.368e+04   2918.347     -4.688      0.000     -1.94e+04 -7930.796
C(genre)[T.103.0] -7988.9954   4043.764     -1.976      0.049      -1.6e+04   -22.158
C(genre)[T.104.0] -1.992e+04   3726.756     -5.344      0.000     -2.73e+04 -1.26e+04
C(genre)[T.106.0] -1.156e+04   5734.693     -2.015      0.045     -2.29e+04  -258.229
C(genre)[T.108.0] -2.141e+04   5260.028     -4.070      0.000     -3.18e+04  -1.1e+04
episodes_count       47.9583     27.133      1.768      0.078        -5.499   101.415
==============================================================================
Omnibus:                      112.312   Durbin-Watson:                   0.575
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              414.656
Skew:                           1.992   Prob(JB):                     9.09e-91
Kurtosis:                       8.041   Cond. No.                         317.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [150]:
all_df.head(0)


Out[150]:
ID level genre main_score concern_count episodes_count comments_count 101 102 103 104 106 108 109

In [28]:
from konlpy.tag import Twitter
pos_tagger = Twitter()


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-28-8e76075a56e7> in <module>()
----> 1 from konlpy.tag import Twitter
      2 pos_tagger = Twitter()

ImportError: No module named 'konlpy'
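
konlpy is simply not installed in this environment; it can typically be installed with pip install konlpy (it also needs a JVM through JPype), after which the import succeeds, as the In [45] cell below shows.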

In [45]:
from konlpy.tag import Twitter
pos_tagger = Twitter()

def tokenize_pos(doc):
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]

model = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize_pos)), 
            ('clf', MultinomialNB()),
        ])
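
`tokenize_pos` feeds the vectorizer morpheme/POS-tag tokens (normalized and stemmed) instead of raw surface forms. Roughly, on a hypothetical input (exact output depends on the konlpy version):

In [ ]:
# Each token is "morpheme/POS"; with norm=True, stem=True the verb is lemmatized.
tokenize_pos("소설을 읽는다")
# e.g. ['소설/Noun', '을/Josa', '읽다/Verb']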

In [98]:
X_data = new_df["text"]
y_data = new_df["genre"]
label_data = new_df["ID"]

In [99]:
cv = LabelKFold(label_data, n_folds=2)  # GroupKFold(n_splits=2) in scikit-learn >= 0.18

In [101]:
for train_index, test_index in cv:
    model.fit(X_data.loc[train_index], y_data.loc[train_index])
    print(classification_report(y_data.loc[test_index], model.predict(X_data[test_index]), digits=4))


/home/minsoo/anaconda2/lib/python2.7/site-packages/sklearn/metrics/classification.py:1076: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
             precision    recall  f1-score   support

      101.0     0.6486    0.9318    0.7648       513
      102.0     0.9406    0.6404    0.7620       470
      103.0     0.9912    0.9741    0.9826       464
      104.0     0.4111    0.2176    0.2846       170
      106.0     0.0000    0.0000    0.0000         0
      108.0     0.6000    0.4615    0.5217        52

avg / total     0.8004    0.7741    0.7681      1669

             precision    recall  f1-score   support

      101.0     0.6751    0.9216    0.7793       523
      102.0     0.6364    0.7000    0.6667       320
      103.0     0.9713    0.9854    0.9783       480
      104.0     0.5000    0.1607    0.2432       168
      106.0     0.0000    0.0000    0.0000        12
      108.0     0.1897    0.0663    0.0982       166

avg / total     0.6821    0.7292    0.6876      1669

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-101-8bf038cc7042> in <module>()
      1 for train_index, test_index in cv:
----> 2     model.fit(X_data.loc[train_index], y_data.loc[train_index])
      3     print(classification_report(y_data.loc[test_index], model.predict(X_data[test_index]), digits=4))
      4 

/home/minsoo/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit(self, X, y, **fit_params)
    162             the pipeline.
    163         """
--> 164         Xt, fit_params = self._pre_transform(X, y, **fit_params)
    165         self.steps[-1][-1].fit(Xt, y, **fit_params)
    166         return self

/home/minsoo/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.pyc in _pre_transform(self, X, y, **fit_params)
    143         for name, transform in self.steps[:-1]:
    144             if hasattr(transform, "fit_transform"):
--> 145                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
    146             else:
    147                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

/home/minsoo/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in fit_transform(self, raw_documents, y)
    815 
    816         vocabulary, X = self._count_vocab(raw_documents,
--> 817                                           self.fixed_vocabulary_)
    818 
    819         if self.binary:

/home/minsoo/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in _count_vocab(self, raw_documents, fixed_vocab)
    750         indptr.append(0)
    751         for doc in raw_documents:
--> 752             for feature in analyze(doc):
    753                 try:
    754                     j_indices.append(vocabulary[feature])

/home/minsoo/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in <lambda>(doc)
    236 
    237             return lambda doc: self._word_ngrams(
--> 238                 tokenize(preprocess(self.decode(doc))), stop_words)
    239 
    240         else:

<ipython-input-45-907c4a8c27af> in tokenize_pos(doc)
      3 
      4 def tokenize_pos(doc):
----> 5     return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]
      6 
      7 model = Pipeline([

/home/minsoo/anaconda2/lib/python2.7/site-packages/konlpy/tag/_twitter.pyc in pos(self, phrase, norm, stem)
     49                     phrase,
     50                     jpype.java.lang.Boolean(norm),
---> 51                     jpype.java.lang.Boolean(stem)).toArray()
     52         return [tuple(t.rsplit('/', 1)) for t in tokens]
     53 

/home/minsoo/anaconda2/lib/python2.7/site-packages/jpype/_jclass.pyc in _getClassFor(javaClass)
     58 
     59 
---> 60 def _getClassFor(javaClass):
     61     name = javaClass.getName()
     62     if name in _CLASSES:

KeyboardInterrupt: 

In [68]:
%%time
predict = model.predict(X_test)
print(classification_report(y_test, predict, digits=4))


CPU times: user 4min 4s, sys: 668 ms, total: 4min 5s
Wall time: 3min 58s
Out[68]:
'             precision    recall  f1-score   support\n\n      101.0     0.9751    0.9955    0.9852       670\n      102.0     0.9943    0.9796    0.9869       538\n      103.0     1.0000    1.0000    1.0000       455\n      104.0     1.0000    0.9686    0.9840       159\n      106.0     1.0000    1.0000    1.0000       103\n      108.0     1.0000    0.9938    0.9969       162\n\navg / total     0.9906    0.9904    0.9904      2087\n'

In [72]:
print(_)


             precision    recall  f1-score   support

      101.0     0.9751    0.9955    0.9852       670
      102.0     0.9943    0.9796    0.9869       538
      103.0     1.0000    1.0000    1.0000       455
      104.0     1.0000    0.9686    0.9840       159
      106.0     1.0000    1.0000    1.0000       103
      108.0     1.0000    0.9938    0.9969       162

avg / total     0.9906    0.9904    0.9904      2087


In [74]:
model.predict_proba(X_test)


Out[74]:
array([[  0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   1.00000000e+000],
       [  0.00000000e+000,   0.00000000e+000,   1.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000],
       [  1.00000000e+000,   3.74575540e-225,   0.00000000e+000,
          1.94507552e-247,   9.73069138e-289,   2.52901497e-315],
       ..., 
       [  0.00000000e+000,   3.41306971e-318,   1.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   4.39046198e-271],
       [  3.19829346e-111,   1.47690568e-201,   0.00000000e+000,
          1.00000000e+000,   1.45816618e-182,   8.68007817e-260],
       [  1.00000000e+000,   2.34850085e-236,   0.00000000e+000,
          5.28621093e-299,   3.83921116e-266,   0.00000000e+000]])

In [91]:
predict[2]


Out[91]:
array([  1.00000000e+000,   3.74575540e-225,   0.00000000e+000,
         1.94507552e-247,   9.73069138e-289,   2.52901497e-315])

In [272]:
sum(all_df["comments_count"])


Out[272]:
6310
