Load the Dataset

  • Annotated JSON files must be stored in the `data/processed` directory.

In [2]:
%load_ext autoreload
%autoreload 2

In [4]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join("..")))
import pandas as pd
import numpy as np
from scripts.features.reader import Reader
from scripts.features.post_feature import PostFeature
from scripts.features.length_extractor import TitleLengthExtractor, SectionCountExtractor, SentenceMeanLengthExtractor, SentenceMinLengthExtractor, SentenceMaxLengthExtractor, KanjiRatioExtractor, HiraganaRatioExtractor, KatakanaRatioExtractor, NumberRatioExtractor, Header1MeanLengthExtractor, Header2MeanLengthExtractor, SentenceInfo

# Run every extractor over each post and collect one feature dict per post.
reader = Reader()
pf_dicts = []

for post in reader.post_iterator():
    post_feature = PostFeature(post)

    # Sentence-level statistics are shared by several extractors,
    # so analyse the body once up front.
    sentence_info = SentenceInfo(post.body)
    sentence_info.analyse()

    extractors = [
        TitleLengthExtractor(),
        SectionCountExtractor(),
        SentenceMeanLengthExtractor(sentence_info),
        SentenceMinLengthExtractor(sentence_info),
        SentenceMaxLengthExtractor(sentence_info),
        KanjiRatioExtractor(sentence_info),
        HiraganaRatioExtractor(sentence_info),
        KatakanaRatioExtractor(sentence_info),
        NumberRatioExtractor(sentence_info),
        Header1MeanLengthExtractor(),
        Header2MeanLengthExtractor(),
    ]
    for extractor in extractors:
        post_feature.add(extractor)

    # Keep the raw fields (title, body, url, ...) for inspection here;
    # they are dropped before modelling in a later cell.
    pf_dicts.append(post_feature.to_dict(drop_disused_feature=False))

pf_df = pd.DataFrame(pf_dicts)
pf_df.head(5)


Out[4]:
body header1_mean_length header2_mean_length hiragana_ratio kanji_ratio katakana_ratio number_ratio post_id quality rendered_body section_count sentence_max_length sentence_mean_length sentence_min_length title title_length url user_followers_count user_id
0 [Unsupervised Riemannian Metric Learning for H... 0.0 0.000000 0.218725 0.178370 0.091203 0.017756 02f3e59fac3bd042121a 0 <p><a href="http://jmlr.org/proceedings/papers... 0 147 65.210526 21 機械学習論文読みメモ_9 12 http://qiita.com/festa78/items/02f3e59fac3bd04... 36 festa78
1 https://www.youtube.com/watch?v=cSKfRcEDGUs&li... 17.0 9.500000 0.133051 0.096462 0.046265 0.020562 041ed3339c68d1231655 0 <p><a href="https://www.youtube.com/watch?v=cS... 12 136 33.744898 3 機械学習レシピ#6 Tensorflow for Poetsでイメージ分類器 38 http://qiita.com/t-yotsu/items/041ed3339c68d12... 14 t-yotsu
2 ## モチベーション\n\n- Pythonで機械学習アルゴリズムを1から書く系のことがした... 0.0 15.636364 0.176226 0.136950 0.028549 0.095683 05a884354741bd9ca82b 0 \n<h2>\n<span id="モチベーション" class="fragment"></... 36 167 40.844523 1 Pythonで「線形回帰」と"確率版の線形回帰"である「ベイズ線形回帰」 36 http://qiita.com/ysdyt/items/05a884354741bd9ca82b 7 ysdyt
3 \n\n#作ったもの\n物体認識の例として、お菓子を判別するアプリ(iOS)を作ってみました... 13.2 27.750000 0.086724 0.048866 0.049533 0.045364 070ff2901c3d95e90470 1 \n<h1>\n<span id="作ったもの" class="fragment"></sp... 14 408 41.068493 1 ディープラーニングが分からなくてもいい感じに物体認識させるサービスを試す(iOS Swift... 76 http://qiita.com/Godai_Aoki/items/070ff2901c3d... 13 Godai_Aoki
4 Courseraのマシンラーニング講座を勉強しています。\nweek8は動画を見ながらメモを... 18.5 32.416667 0.434976 0.227862 0.142090 0.028159 085e30c206e08fadd86f 0 <p>Courseraのマシンラーニング講座を勉強しています。<br>\nweek8は動画を... 18 77 19.141844 2 Coursera Machine Learning Week8のメモ 34 http://qiita.com/tackey/items/085e30c206e08fad... 63 tackey

In [5]:
# drop disused features
if "post_id" in pf_df.columns:
    pf_df.drop(pf_df[["post_id", "title", "body", "url", "user_id", 'rendered_body']], axis=1, inplace=True)

quality = pf_df["quality"]
pf_df.drop("quality", axis=1, inplace=True)

In [6]:
# Standardize each feature to zero mean / unit variance.
# The fitted scaler is kept (not just the transformed array) because a
# later cell serializes it for use at prediction time.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(pf_df)
pf_df_n = scaler.transform(pf_df)

In [8]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report

# FIX: seed the forest so the CV scores, the classification report and the
# importance plot are reproducible under Restart & Run All.
# (Also removed the unused `from sklearn.svm import SVC` import.)
clf = RandomForestClassifier(n_estimators=15, max_features=0.5, random_state=42)

# Estimate generalization F1 by cross-validation.
# NOTE(review): cv=2 is very coarse — presumably chosen because the dataset
# is small; consider stratified k>=5 folds once more posts are annotated.
scores = cross_val_score(clf, pf_df_n, quality, cv=2, scoring="f1")
print(scores)

# Hold-out evaluation: train on 80 % and report per-class precision/recall.
train_f, test_f, train_lb, test_lb = train_test_split(
    pf_df_n, quality, test_size=0.2, random_state=42)
clf.fit(train_f, train_lb)
pred = clf.predict(test_f)
print(classification_report(test_lb, pred, target_names=["bad", "good"]))

# Feature importances, with the std across trees as error bars.
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]  # most important first

# Plot the feature importances of the forest (explicit Axes API instead of
# the pyplot state machine, so the figure object is reusable).
labels = np.array(pf_df.columns.tolist())
fig, ax = plt.subplots()
ax.set_title("Feature importances")
ax.bar(range(len(labels)), importances[indices],
       color="r", yerr=std[indices], align="center")
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels[indices], rotation="vertical")
ax.set_xlim([-1, len(labels)])
ax.set_ylim([0, 1])
fig.tight_layout()
plt.show()


[ 0.35294118  0.58333333]
             precision    recall  f1-score   support

        bad       0.79      1.00      0.88        15
       good       1.00      0.20      0.33         5

avg / total       0.84      0.80      0.75        20


In [12]:
from sklearn.externals import joblib
joblib.dump(clf, "../models/banana.pkl") 
joblib.dump(scaler, "../models/banana_scaler.pkl")

with open("../models/banana_list.txt", "w") as f:
    f.write(" ".join(pf_df.columns.tolist()))

In [ ]: