Load the Dataset

  • The annotated JSON files must be stored in the data/processed directory (a quick check is sketched below).
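
A minimal sketch of that check, assuming the notebook runs from its own directory one level below the project root (the exact file schema is whatever scripts.features.reader.Reader expects):

from pathlib import Path

# hypothetical sanity check: list the annotated json files under data/processed
processed_dir = Path("..") / "data" / "processed"
json_files = sorted(processed_dir.glob("*.json"))
print(len(json_files), "annotated json files found")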

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join("..")))
import pandas as pd
import numpy as np
from scripts.features.reader import Reader
from scripts.features.post_feature import PostFeature
from scripts.features.length_extractor import (
    TitleLengthExtractor, SectionCountExtractor,
    Header1MeanLengthExtractor, Header2MeanLengthExtractor,
    SentenceMeanLengthExtractor, SentenceMaxLengthExtractor, SentenceMinLengthExtractor,
)
from scripts.features.charactor_extractor import (
    RenderedBodyPreprocessor, KanjiRatioExtractor, HiraganaRatioExtractor,
    KatakanaRatioExtractor, NumberRatioExtractor, PunctuationRatioExtractor,
)
from scripts.features.structure_extractor import (
    ItemizationCountExtractor, FormulaCountExtractor, ImageCountExtractor,
    ItemizationRatioExtractor, ImageRatioExtractor, FormulaRatioExtractor,
)

reader = Reader()
pf_dicts = []

for p in reader.post_iterator():
    pf = PostFeature(p)
    cleaned_rendered_body = RenderedBodyPreprocessor().clean_rendered_body(p.rendered_body)
    
    pf.add(TitleLengthExtractor())
    pf.add(SectionCountExtractor())
    
    pf.add(KanjiRatioExtractor(cleaned_rendered_body))
    pf.add(HiraganaRatioExtractor(cleaned_rendered_body))
    pf.add(KatakanaRatioExtractor(cleaned_rendered_body))
    pf.add(NumberRatioExtractor(cleaned_rendered_body))
    pf.add(PunctuationRatioExtractor(cleaned_rendered_body))
#    pf.add(Header1MeanLengthExtractor())
#    pf.add(Header2MeanLengthExtractor())
    pf.add(SentenceMeanLengthExtractor(cleaned_rendered_body))
    pf.add(SentenceMaxLengthExtractor(cleaned_rendered_body))
#    pf.add(SentenceMinLengthExtractor(cleaned_rendered_body))
    
#    pf.add(ItemizationCountExtractor())
#    pf.add(FormulaCountExtractor())
    pf.add(ImageCountExtractor())
#    pf.add(ItemizationRatioExtractor(cleaned_rendered_body))
#    pf.add(FormulaRatioExtractor(cleaned_rendered_body))
    pf.add(ImageRatioExtractor(cleaned_rendered_body))
    
    pf_d = pf.to_dict(drop_disused_feature=False)  # the default (True) drops fields such as title and body
    pf_dicts.append(pf_d)
    
pf_df = pd.DataFrame(pf_dicts)
pf_df.head(5)


Out[2]:
body hiragana_ratio image_count image_ratio kanji_ratio katakana_ratio number_ratio post_id punctuation_ratio quality rendered_body section_count sentence_max_length sentence_mean_length title title_length url user_followers_count user_id
0 [Unsupervised Riemannian Metric Learning for H... 0.253035 0 0.000000 0.206349 0.105509 0.011204 02f3e59fac3bd042121a 0.013072 0 <p><a href="http://jmlr.org/proceedings/papers... 0 10.294118 62.058824 機械学習論文読みメモ_9 12 http://qiita.com/festa78/items/02f3e59fac3bd04... 36 festa78
1 https://www.youtube.com/watch?v=cSKfRcEDGUs&li... 0.302677 0 0.000000 0.224434 0.108442 0.012354 041ed3339c68d1231655 0.009609 0 <p><a href="https://www.youtube.com/watch?v=cS... 12 4.090909 43.181818 機械学習レシピ#6 Tensorflow for Poetsでイメージ分類器 38 http://qiita.com/t-yotsu/items/041ed3339c68d12... 14 t-yotsu
2 ## モチベーション\n\n- Pythonで機械学習アルゴリズムを1から書く系のことがした... 0.455031 5 0.454545 0.364203 0.075467 0.010686 05a884354741bd9ca82b 0.016028 0 \n<h2>\n<span id="モチベーション" class="fragment"></... 36 93.090909 407.454545 Pythonで「線形回帰」と"確率版の線形回帰"である「ベイズ線形回帰」 36 http://qiita.com/ysdyt/items/05a884354741bd9ca82b 7 ysdyt
3 \n\n#作ったもの\n物体認識の例として、お菓子を判別するアプリ(iOS)を作ってみました... 0.247520 4 0.125000 0.138876 0.134152 0.027870 070ff2901c3d95e90470 0.012282 1 \n<h1>\n<span id="作ったもの" class="fragment"></sp... 14 6.468750 65.187500 ディープラーニングが分からなくてもいい感じに物体認識させるサービスを試す(iOS Swift... 76 http://qiita.com/Godai_Aoki/items/070ff2901c3d... 13 Godai_Aoki
4 Courseraのマシンラーニング講座を勉強しています。\nweek8は動画を見ながらメモを... 0.403159 0 0.000000 0.211195 0.131696 0.024210 085e30c206e08fadd86f 0.016140 0 <p>Courseraのマシンラーニング講座を勉強しています。<br>\nweek8は動画を... 18 0.671569 27.553922 Coursera Machine Learning Week8のメモ 34 http://qiita.com/tackey/items/085e30c206e08fad... 63 tackey

In [3]:
# drop disused (non-feature) columns; guard so the cell can be re-run safely
disused = ["post_id", "title", "body", "url", "user_id", "rendered_body"]
pf_df.drop([c for c in disused if c in pf_df.columns], axis=1, inplace=True)

quality = pf_df["quality"]
pf_df.drop("quality", axis=1, inplace=True)
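
An optional quick check before normalizing: every remaining column should now be a numeric feature, with the label held separately in quality:

# optional sanity check: all remaining columns should be numeric
print(pf_df.dtypes)
assert pf_df.select_dtypes(include=[np.number]).shape[1] == pf_df.shape[1]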

In [4]:
# normalize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
pf_df_n = scaler.fit_transform(pf_df)
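
StandardScaler rescales each feature column to zero mean and unit variance; the statistics it learns here are what needs to be persisted with the classifier so new posts can be transformed the same way. A minimal sketch of inspecting and reusing the fitted scaler:

# per-column statistics learned by the fitted scaler
print(scaler.mean_)
print(scaler.scale_)

# new feature rows must go through transform() on this same fitted scaler, never a fresh fit
print(scaler.transform(pf_df[:1]))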

In [5]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# estimate accuracy by cross validation
clf = RandomForestClassifier(n_estimators=15, max_features=0.5)
scores = cross_val_score(clf, pf_df_n, quality, cv=2, scoring="f1")
print(scores)

# train and show score
train_f, test_f, train_lb, test_lb = train_test_split(pf_df_n, quality, test_size=0.2, random_state=42)
clf.fit(train_f, train_lb)
pred = clf.predict(test_f)
print(classification_report(test_lb, pred, target_names=["bad", "good"]))

# show feature importance
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

# Plot the feature importances of the forest
labels = np.array(pf_df.columns.values.tolist())
plt.figure()
plt.title("Feature importances")
plt.bar(range(len(labels)), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(len(labels)), labels[indices], rotation="vertical")
plt.xlim([-1, len(labels)])
plt.ylim([0, 1])
plt.tight_layout()
plt.show()


[ 0.42857143  0.46666667]
             precision    recall  f1-score   support

        bad       1.00      0.93      0.97        15
       good       0.83      1.00      0.91         5

avg / total       0.96      0.95      0.95        20
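
The bar chart can be hard to read when many labels overlap; the same ranking can be printed as text from the labels, importances, and indices computed above:

# print the feature importance ranking as text
for rank, i in enumerate(indices, start=1):
    print("{:2d}. {:<25s} {:.3f}".format(rank, labels[i], importances[i]))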


In [6]:
from scripts.models.save_models import SaveModelsScalor

# save model
SaveModelsScalor(clf, scaler, pf_df)


Out[6]:
<scripts.models.save_models.SaveModelsScalor at 0x106c3e2b0>
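
As a minimal sketch of why the classifier and the scaler are saved together: at prediction time, a new post's features (assumed to be built with the same PostFeature pipeline and column order as pf_df) must pass through the fitted scaler before clf.predict. Using the in-memory objects from this notebook:

# stand-in for a freshly extracted feature row of a new post
new_features = pf_df.iloc[[0]]
new_features_n = scaler.transform(new_features)
print(clf.predict(new_features_n))        # 0 = "bad", 1 = "good" (as in the report above)
print(clf.predict_proba(new_features_n))  # class probabilities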

In [ ]: