In [3]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
Read input from JSON records.
In [2]:
lines = []
for part in ("00000", "00001"):
    with open("../output/2017-01-03_13.57.34/part-%s" % part) as f:
        lines += f.readlines()
print(lines[0])
Create a pandas DataFrame
In [4]:
import pandas as pd
df = pd.read_json('[%s]' % ','.join(lines))
print(df.info())
df.head()
Out[4]:
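As an aside, since each part file contains one JSON record per line, pandas can also parse the files directly as JSON Lines instead of joining the strings by hand. This is a minimal sketch using the same part files; df_alt is just an illustrative name:
In [ ]:
# Read each part file as JSON Lines and stack the results into one DataFrame.
parts = ["../output/2017-01-03_13.57.34/part-%s" % part for part in ("00000", "00001")]
df_alt = pd.concat([pd.read_json(path, lines=True) for path in parts], ignore_index=True)
df_alt.info()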
Create new features
In [5]:
df["has_qmark"] = df.Body.apply(lambda s: "?" in s)
df["num_qmarks"] = df.Body.apply(lambda s: s.count("?"))
df["body_length"] = df.Body.apply(lambda s: len(s))
In [6]:
df
Out[6]:
Filter for PostTypeId == 1 or PostTypeId == 2
In [7]:
df = df.loc[df.PostTypeId.isin([1, 2]), :]
df = df.reset_index(drop=True)
df.head()
Out[7]:
In [8]:
n_questions = np.sum(df.PostTypeId == 1)
n_answers = np.sum(df.PostTypeId == 2)
print("No. questions {0} / No. answers {1}".format(n_questions, n_answers))
Are any relationships apparent between PostTypeId and the obvious features in the raw data?
In [9]:
df.plot.scatter(x="num_qmarks",y="PostTypeId")
df.plot.scatter(x="body_length",y="PostTypeId")
Out[9]:
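The same comparison can be made numerically with a grouped summary of the derived features per post type (a quick sketch; 1 = question, 2 = answer):
In [ ]:
# Average of each derived feature per post type; the mean of has_qmark is the fraction of posts containing "?".
df.groupby("PostTypeId")[["num_qmarks", "body_length", "has_qmark"]].mean()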
Can PostTypeId be predicted from the post body? Here, we try the linear RidgeClassifier and the nonlinear RandomForestClassifier and compare the accuracy on the training set to the accuracy on the test set.
From the results, we see that the RandomForestClassifier is more accurate than the linear model, but it is overfitting the data. The overfitting is likely to improve with more training examples, so let's choose the RF classifier.
In [10]:
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
X = df.loc[:, ['num_qmarks', 'body_length']]
y = df.loc[:, 'PostTypeId']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
classifiers = [("Ridge", RidgeClassifier()), ("RandomForest", RandomForestClassifier())]
for name, classifier in classifiers:
    classifier.fit(X_train, y_train)
    print(name + " " + "-" * (60 - len(name)))
    # .score() on a classifier returns mean accuracy, not R^2.
    print("acc_train: {0}, acc_test: {1}".format(classifier.score(X_train, y_train),
                                                 classifier.score(X_test, y_test)))
    print()
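The claim that more training examples should shrink the train/test gap can be checked with a learning curve. This is a minimal sketch, assuming the X (num_qmarks, body_length) and y defined in the cell above:
In [ ]:
from sklearn.model_selection import learning_curve

# Score the random forest on progressively larger training subsets.
train_sizes, train_scores, test_scores = learning_curve(
    RandomForestClassifier(random_state=42), X, y,
    cv=5, train_sizes=np.linspace(0.1, 1.0, 5))

plt.plot(train_sizes, train_scores.mean(axis=1), label="train accuracy")
plt.plot(train_sizes, test_scores.mean(axis=1), label="cross-validated accuracy")
plt.xlabel("number of training examples")
plt.ylabel("accuracy")
plt.legend()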
In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
class FSTransformer(BaseEstimator, TransformerMixin):
    """
    Selects the named feature columns from a DataFrame and returns them as an array.
    """
    def __init__(self, features):
        self.features = features

    def fit(self, X, y=None):
        return self

    def transform(self, df):
        return df[self.features].values


class CountVecTransformer(BaseEstimator, TransformerMixin):
    """
    Wraps CountVectorizer to build a dense word-count matrix from the Body column.
    """
    def __init__(self):
        self.vectorizer = CountVectorizer(binary=False)

    def fit(self, df, y=None):
        self.vectorizer.fit(df.Body)
        return self

    def transform(self, df):
        return self.vectorizer.transform(df.Body).toarray()
In [12]:
df.head()
Out[12]:
In [13]:
fst = FSTransformer(["has_qmark"])
fst.transform(df)
Out[13]:
In [14]:
CountVecTransformer().fit_transform(df)
Out[14]:
Let's use the word frequency vectors in combination with the obvious features we created previously to try to predict PostTypeId.
In [15]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import f1_score
model_pipe = Pipeline([
    ("features", FeatureUnion([
        ("derived", FSTransformer(["has_qmark", "num_qmarks", "body_length"])),
        ("count_vec", CountVecTransformer())
    ])),
    ("clf", RandomForestClassifier())
])
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
X = df
y = df.PostTypeId
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model_pipe.fit(X_train, y_train)
    acc_train = model_pipe.score(X_train, y_train)
    acc_test = model_pipe.score(X_test, y_test)
    y_pred = model_pipe.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print("acc_train: {0} acc_test: {1} f1: {2}".format(acc_train, acc_test, f1))
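For context on how many features that model is using, the width of the FeatureUnion output can be inspected directly; beyond the three derived columns, every column corresponds to a word in the count vectorizer's vocabulary (a sketch, assuming the model_pipe and X_train left over from the cell above):
In [ ]:
# Rows are posts; columns are the 3 derived features plus one column per vocabulary word.
combined = model_pipe.named_steps["features"].transform(X_train)
print(combined.shape)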
Since we're overfitting, we can try reducing the total number of features. One approach is to perform the $\chi^2$ statistical test of independence on each feature with respect to the label (PostTypeId) and remove the features that are most independent of the label. Here, we put SelectKBest into the model pipeline and keep only the 10 features most dependent on the label.
In [16]:
from sklearn.feature_selection import SelectKBest, chi2
model_pipe = Pipeline([
    ("features", FeatureUnion([
        ("derived", FSTransformer(["has_qmark", "num_qmarks", "body_length"])),
        ("count_vec", CountVecTransformer())
    ])),
    ("best_features", SelectKBest(chi2, k=10)),
    ("clf", RandomForestClassifier())
])
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
X = df
y = df.PostTypeId
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model_pipe.fit(X_train, y_train)
    acc_train = model_pipe.score(X_train, y_train)
    acc_test = model_pipe.score(X_test, y_test)
    y_pred = model_pipe.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print("acc_train: {0} acc_test: {1} f1: {2}".format(acc_train, acc_test, f1))
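It is also informative to see which features SelectKBest kept. The selector's boolean mask can be mapped back onto the derived-feature names followed by the vectorizer's vocabulary, which is the column order FeatureUnion produces (a sketch, assuming the fitted model_pipe from the cell above):
In [ ]:
# Combined feature names in the order the FeatureUnion emits them.
derived_names = ["has_qmark", "num_qmarks", "body_length"]
count_vec = model_pipe.named_steps["features"].transformer_list[1][1].vectorizer
vocab = sorted(count_vec.vocabulary_, key=count_vec.vocabulary_.get)
feature_names = np.array(derived_names + vocab)

# Boolean mask of the k columns SelectKBest retained.
mask = model_pipe.named_steps["best_features"].get_support()
print(feature_names[mask])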
So the generalization of the model improved, but what is the right number of features to keep? For this, we can use cross-validation. The class GridSearchCV allows us to vary a hyperparameter of the model and compute the cross-validated score for each candidate value (or combination of values).
In [18]:
from sklearn.model_selection import GridSearchCV
modelCV = GridSearchCV(model_pipe, {"best_features__k": [3 ** i for i in range(1, 7)]})
modelCV.fit(X, y)
# cv_results_ holds one entry per candidate k with its mean cross-validated score.
cv_accuracy = pd.DataFrame({
    "best_features__k": list(modelCV.cv_results_["param_best_features__k"]),
    "mean_validation_score": modelCV.cv_results_["mean_test_score"],
})
cv_accuracy.plot(x="best_features__k", y="mean_validation_score")
cv_accuracy
Out[18]:
We can refine our range of hyperparameter values to home in on the best number of features.
In [19]:
modelCV = GridSearchCV(model_pipe, {"best_features__k": list(range(80, 120, 10))})
modelCV.fit(X, y)
cv_accuracy = pd.DataFrame({
    "best_features__k": list(modelCV.cv_results_["param_best_features__k"]),
    "mean_validation_score": modelCV.cv_results_["mean_test_score"],
})
cv_accuracy.plot(x="best_features__k", y="mean_validation_score")
cv_accuracy
Out[19]:
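By default, GridSearchCV refits the pipeline on all of the data with the best-scoring parameters, so the chosen k and its mean cross-validated score can be read off directly (a quick sketch):
In [ ]:
# Best value of k found by the grid search and its mean cross-validated accuracy.
print(modelCV.best_params_)
print(modelCV.best_score_)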
Finally, we can inspect the confusion matrix to see what kinds of errors the model makes. scikit-learn's convention is that rows are actual classes and columns are predicted classes, so with the question class (PostTypeId = 1) treated as positive, the layout is:
| | Predicted question (1) | Predicted answer (2) |
|---|---|---|
| Actual question (1) | True Positive | False Negative |
| Actual answer (2) | False Positive | True Negative |
In [24]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, modelCV.predict(X_test))
# Normalize each column: entries show, for each predicted class, the fraction coming from each actual class.
cm = cm / cm.sum(axis=0)
print(cm)
In [23]:
cm = confusion_matrix(y_test, modelCV.predict(X_test))
cm
Out[23]:
In [25]:
plt.imshow(cm, interpolation="nearest", cmap="Blues")
plt.colorbar()
Out[25]:
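As a final check, classification_report summarizes the same predictions as per-class precision, recall, and F1 (a sketch, assuming y_test, X_test, and modelCV from the cells above):
In [ ]:
from sklearn.metrics import classification_report

# Precision, recall, and F1 for questions (1) and answers (2).
print(classification_report(y_test, modelCV.predict(X_test)))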