In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2
In [2]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join("..")))
import pandas as pd
import numpy as np
from scripts.features.reader import Reader
from scripts.features.post_feature import PostFeature
from scripts.features.length_extractor import TitleLengthExtractor, SectionCountExtractor, Header1MeanLengthExtractor, Header2MeanLengthExtractor, SentenceMeanLengthExtractor, SentenceMaxLengthExtractor, SentenceMinLengthExtractor
from scripts.features.charactor_extractor import RenderedBodyPreprocessor, KanjiRatioExtractor, HiraganaRatioExtractor, KatakanaRatioExtractor, NumberRatioExtractor, PunctuationRatioExtractor
from scripts.features.structure_extractor import ItemizationCountExtractor, FormulaCountExtractor, ImageCountExtractor, ItemizationRatioExtractor, ImageRatioExtractor, FormulaRatioExtractor
# Build one feature dict per post and collect them into a DataFrame.
# The order of the pf.add(...) calls is kept stable on purpose: the resulting
# column order is what the scaler and classifier are later fitted on.
reader = Reader()
pf_dicts = []
for p in reader.post_iterator():
    pf = PostFeature(p)
    cleaned_rendered_body = RenderedBodyPreprocessor().clean_rendered_body(p.rendered_body)

    # --- length / section features ---
    pf.add(TitleLengthExtractor())
    pf.add(SectionCountExtractor())

    # --- character-class ratios, computed on the cleaned rendered body ---
    pf.add(KanjiRatioExtractor(cleaned_rendered_body))
    pf.add(HiraganaRatioExtractor(cleaned_rendered_body))
    pf.add(KatakanaRatioExtractor(cleaned_rendered_body))
    pf.add(NumberRatioExtractor(cleaned_rendered_body))
    pf.add(PunctuationRatioExtractor(cleaned_rendered_body))

    # --- header / sentence length features ---
    # SentenceMeanLengthExtractor used to be registered twice (once before the
    # disabled header extractors and once after); it is now added exactly once.
    # pf.add(Header1MeanLengthExtractor())
    # pf.add(Header2MeanLengthExtractor())
    pf.add(SentenceMeanLengthExtractor(cleaned_rendered_body))
    pf.add(SentenceMaxLengthExtractor(cleaned_rendered_body))
    # pf.add(SentenceMinLengthExtractor(cleaned_rendered_body))

    # --- structural features (disabled ones are kept for reference) ---
    # pf.add(ItemizationCountExtractor())
    # pf.add(FormulaCountExtractor())
    pf.add(ImageCountExtractor())
    # pf.add(ItemizationRatioExtractor(cleaned_rendered_body))
    # pf.add(FormulaRatioExtractor(cleaned_rendered_body))
    pf.add(ImageRatioExtractor(cleaned_rendered_body))

    # drop_disused_feature defaults to True, which drops title, body etc.;
    # False keeps those fields in the dict so they show up in the preview below.
    pf_d = pf.to_dict(drop_disused_feature=False)
    pf_dicts.append(pf_d)

# Build the frame once from the list of dicts rather than growing it row by row.
pf_df = pd.DataFrame(pf_dicts)
pf_df.head(5)
Out[2]:
In [3]:
# Drop the disused (non-feature) columns and split off the target.
#
# This cell is idempotent: running it a second time, after the columns are
# already gone, is a no-op instead of a KeyError on pf_df["quality"].
# After this cell `pf_df` holds only feature columns and `quality` holds the
# target labels.
disused_columns = ["post_id", "title", "body", "url", "user_id", "rendered_body"]

# Only (re)bind the target while it is still part of the frame; on a re-run the
# previously extracted `quality` Series is left untouched.
if "quality" in pf_df.columns:
    quality = pf_df["quality"]

# Pass plain labels instead of a DataFrame slice (which copied the text columns
# just to read their names). errors="ignore" skips labels that are already absent.
pf_df = pf_df.drop(columns=disused_columns + ["quality"], errors="ignore")
In [4]:
# Standardize the feature columns; the fitted `scaler` is kept because it is
# handed to the model-saving step together with the classifier.
# NOTE(review): the scaler is fitted on every row, including rows that are
# later held out by train_test_split — confirm this is acceptable.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(pf_df)
pf_df_n = scaler.transform(pf_df)
In [5]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# One seed shared by the forest and the hold-out split, so that a fresh
# Restart & Run All reproduces the same scores, importances and fitted model.
SEED = 42

# estimate accuracy by cross validation (2 folds, F1 score per fold)
clf = RandomForestClassifier(n_estimators=15, max_features=0.5, random_state=SEED)
scores = cross_val_score(clf, pf_df_n, quality, cv=2, scoring="f1")
print(scores)

# train on 80% of the rows and report precision/recall on the remaining 20%
train_f, test_f, train_lb, test_lb = train_test_split(
    pf_df_n, quality, test_size=0.2, random_state=SEED
)
clf.fit(train_f, train_lb)
pred = clf.predict(test_f)
# NOTE(review): target_names assumes two classes whose sorted order is
# (bad, good) — confirm against the values stored in `quality`.
print(classification_report(test_lb, pred, target_names=["bad", "good"]))

# feature importance: forest-level mean, with the spread across the individual
# trees used as error bars
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]  # most important feature first

# Plot the feature importances of the forest
labels = np.array(pf_df.columns.values.tolist())
positions = range(len(labels))
fig, ax = plt.subplots()
ax.set_title("Feature importances")
ax.bar(positions, importances[indices], color="r", yerr=std[indices], align="center")
ax.set_xticks(positions)
ax.set_xticklabels(labels[indices], rotation="vertical")
ax.set_ylabel("importance")
ax.set_xlim([-1, len(labels)])
ax.set_ylim([0, 1])
fig.tight_layout()
plt.show()
In [6]:
from scripts.models.save_models import SaveModelsScalor
# Hand the fitted classifier, the fitted scaler and the feature-only frame to
# the project's SaveModelsScalor helper. The call is the last expression of the
# cell, so the returned object is what the cell displays.
# NOTE(review): presumably pf_df is passed so the feature column names/order are
# stored alongside the model — confirm against scripts/models/save_models.
SaveModelsScalor(clf, scaler, pf_df)
Out[6]:
In [ ]: