Load the Dataset

  • Annotated JSON files must be stored in the `data/processed` directory.

In [2]:
%load_ext autoreload
%autoreload 2

In [4]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join("..")))
import pandas as pd
import numpy as np
from scripts.features.reader import Reader
from scripts.features.post_feature import PostFeature
from scripts.features.length_extractor import TitleLengthExtractor, SectionCountExtractor, SentenceMeanLengthExtractor, SentenceMinLengthExtractor, SentenceMaxLengthExtractor, KanjiRatioExtractor, HiraganaRatioExtractor, KatakanaRatioExtractor, NumberRatioExtractor, Header1MeanLengthExtractor, Header2MeanLengthExtractor, SentenceInfo

# Run every extractor over each post and collect one feature dict per post.
reader = Reader()
pf_dicts = []

for post in reader.post_iterator():
    post_feature = PostFeature(post)

    # Sentence-level statistics are shared by several extractors,
    # so analyse the body once up front.
    sentence_info = SentenceInfo(post.body)
    sentence_info.analyse()

    extractors = [
        TitleLengthExtractor(),
        SectionCountExtractor(),
        SentenceMeanLengthExtractor(sentence_info),
        SentenceMinLengthExtractor(sentence_info),
        SentenceMaxLengthExtractor(sentence_info),
        KanjiRatioExtractor(sentence_info),
        HiraganaRatioExtractor(sentence_info),
        KatakanaRatioExtractor(sentence_info),
        NumberRatioExtractor(sentence_info),
        Header1MeanLengthExtractor(),
        Header2MeanLengthExtractor(),
    ]
    for extractor in extractors:
        post_feature.add(extractor)

    # Keep the raw fields (title, body, url, ...) for inspection here;
    # they are dropped before modelling in a later cell.
    pf_dicts.append(post_feature.to_dict(drop_disused_feature=False))

pf_df = pd.DataFrame(pf_dicts)
pf_df.head(5)


Out[4]:
body header1_mean_length header2_mean_length hiragana_ratio kanji_ratio katakana_ratio number_ratio post_id quality rendered_body section_count sentence_max_length sentence_mean_length sentence_min_length title title_length url user_followers_count user_id
0 [Unsupervised Riemannian Metric Learning for H... 0.0 0.000000 0.218725 0.178370 0.091203 0.017756 02f3e59fac3bd042121a 0 <p><a href="http://jmlr.org/proceedings/papers... 0 147 65.210526 21 機械学習論文読みメモ_9 12 http://qiita.com/festa78/items/02f3e59fac3bd04... 36 festa78
1 https://www.youtube.com/watch?v=cSKfRcEDGUs&li... 17.0 9.500000 0.133051 0.096462 0.046265 0.020562 041ed3339c68d1231655 0 <p><a href="https://www.youtube.com/watch?v=cS... 12 136 33.744898 3 機械学習レシピ#6 Tensorflow for Poetsでイメージ分類器 38 http://qiita.com/t-yotsu/items/041ed3339c68d12... 14 t-yotsu
2 ## モチベーション\n\n- Pythonで機械学習アルゴリズムを1から書く系のことがした... 0.0 15.636364 0.176226 0.136950 0.028549 0.095683 05a884354741bd9ca82b 0 \n<h2>\n<span id="モチベーション" class="fragment"></... 36 167 40.844523 1 Pythonで「線形回帰」と"確率版の線形回帰"である「ベイズ線形回帰」 36 http://qiita.com/ysdyt/items/05a884354741bd9ca82b 7 ysdyt
3 \n\n#作ったもの\n物体認識の例として、お菓子を判別するアプリ(iOS)を作ってみました... 13.2 27.750000 0.086724 0.048866 0.049533 0.045364 070ff2901c3d95e90470 1 \n<h1>\n<span id="作ったもの" class="fragment"></sp... 14 408 41.068493 1 ディープラーニングが分からなくてもいい感じに物体認識させるサービスを試す(iOS Swift... 76 http://qiita.com/Godai_Aoki/items/070ff2901c3d... 13 Godai_Aoki
4 Courseraのマシンラーニング講座を勉強しています。\nweek8は動画を見ながらメモを... 18.5 32.416667 0.434976 0.227862 0.142090 0.028159 085e30c206e08fadd86f 0 <p>Courseraのマシンラーニング講座を勉強しています。<br>\nweek8は動画を... 18 77 19.141844 2 Coursera Machine Learning Week8のメモ 34 http://qiita.com/tackey/items/085e30c206e08fad... 63 tackey

In [5]:
# drop disused features
if "post_id" in pf_df.columns:
    pf_df.drop(pf_df[["post_id", "title", "body", "url", "user_id", 'rendered_body']], axis=1, inplace=True)

quality = pf_df["quality"]
pf_df.drop("quality", axis=1, inplace=True)

In [6]:
# Standardize each feature to zero mean / unit variance.
# The fitted scaler is kept (not just the transformed array) because a
# later cell serializes it for use at prediction time.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(pf_df)
pf_df_n = scaler.transform(pf_df)

In [8]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report

# FIX: seed the forest so the CV scores, the classification report and the
# importance plot are reproducible under Restart & Run All.
# (Also removed the unused `from sklearn.svm import SVC` import.)
clf = RandomForestClassifier(n_estimators=15, max_features=0.5, random_state=42)

# Estimate generalization F1 by cross-validation.
# NOTE(review): cv=2 is very coarse — presumably chosen because the dataset
# is small; consider stratified k>=5 folds once more posts are annotated.
scores = cross_val_score(clf, pf_df_n, quality, cv=2, scoring="f1")
print(scores)

# Hold-out evaluation: train on 80 % and report per-class precision/recall.
train_f, test_f, train_lb, test_lb = train_test_split(
    pf_df_n, quality, test_size=0.2, random_state=42)
clf.fit(train_f, train_lb)
pred = clf.predict(test_f)
print(classification_report(test_lb, pred, target_names=["bad", "good"]))

# Feature importances, with the std across trees as error bars.
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]  # most important first

# Plot the feature importances of the forest (explicit Axes API instead of
# the pyplot state machine, so the figure object is reusable).
labels = np.array(pf_df.columns.tolist())
fig, ax = plt.subplots()
ax.set_title("Feature importances")
ax.bar(range(len(labels)), importances[indices],
       color="r", yerr=std[indices], align="center")
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels[indices], rotation="vertical")
ax.set_xlim([-1, len(labels)])
ax.set_ylim([0, 1])
fig.tight_layout()
plt.show()


[ 0.35294118  0.58333333]
             precision    recall  f1-score   support

        bad       0.79      1.00      0.88        15
       good       1.00      0.20      0.33         5

avg / total       0.84      0.80      0.75        20


In [12]:
from sklearn.externals import joblib
joblib.dump(clf, "../models/banana.pkl") 
joblib.dump(scaler, "../models/banana_scaler.pkl")

with open("../models/banana_list.txt", "w") as f:
    f.write(" ".join(pf_df.columns.tolist()))

In [ ]: