In [3]:
"""Fit SGDClassifier on a Tf-Idf matrix, evaluate it against the validation set
and plot the most and least important words per category"""
%load_ext autoreload
%autoreload 2
from app.evaluation import eval_classifier
from app.training import get_best_text_pipeline, get_undersample_df
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy
# Load the processed training data and pass it through the project's
# undersampling helper (presumably to balance the classes -- confirm in
# app.training).
train_df = pd.read_csv("data/processed_data.csv")
train_df = get_undersample_df(train_df)

# DataFrame.from_csv is deprecated (removed in pandas 1.0); this is its
# documented read_csv equivalent (first column as index, dates parsed).
val_df = pd.read_csv("data/validation_data.csv", index_col=0, parse_dates=True)

# Split the targets off so the frames only hold features.
y_train = train_df.pop("label")
y_val = val_df.pop("label")

# The encoder is fitted only to obtain a stable, sorted list of class names.
le = LabelEncoder().fit(y_train)
classes = le.classes_

for text_feature in ["readme"]:
    # Assign the filled column back explicitly: an in-place fillna on a
    # column selection is not guaranteed to modify the parent frame.
    train_df[text_feature] = train_df[text_feature].fillna("")
    val_df[text_feature] = val_df[text_feature].fillna("")

    X_train = train_df[text_feature].values
    X_val = val_df[text_feature].values

    # NOTE(review): the original called `find_best_text_pipeline`, which is
    # never imported; the name imported from app.training is
    # `get_best_text_pipeline` -- confirm this is the intended function.
    ppl = get_best_text_pipeline(X_train, y_train)

    print("Accuracy for {} feature:".format(text_feature))
    acc = eval_classifier(ppl, X_val, y_val, le.classes_)
    print(acc)
In [7]:
# Hand the pipeline chosen above to the project's pickle helper under a
# fixed artifact name.
from app.training import save_pickle

pickle_name = "best_readme_pipeline_5161"
save_pickle(ppl, pickle_name)
In [6]:
# Display the pipeline's steps keyed by name; the "vect" and "clf" entries
# are looked up by these keys in a later cell.
ppl.named_steps
Out[6]:
In [2]:
# Reload the validation set: the copy used for evaluation had its "label"
# column popped off, and the labels are needed again here.
# read_csv(index_col=0, parse_dates=True) is the documented replacement for
# the deprecated DataFrame.from_csv.
val_df = pd.read_csv("data/validation_data.csv", index_col=0, parse_dates=True)

# Only the text column needs a placeholder, and it must be the empty string.
# The original `val_df.fillna("description")` wrote the literal word
# "description" into every missing cell of every column.
val_df["description"] = val_df["description"].fillna("")

# NOTE(review): `ppl` is fitted on the "readme" column in the training cell
# but is applied to "description" here -- confirm this is intended.
# Predict once for the whole column instead of once per row.
predictions = ppl.predict(val_df["description"].values)

# Print every validation row whose predicted label differs from the true one.
for (_, row), pred in zip(val_df.iterrows(), predictions):
    expected = row["label"]
    if expected != pred:
        print("{}: label: {}\tpredicted: {} for\n{}".format(
            row["repository"], expected, pred, row["description"]
        ))
In [3]:
from operator import itemgetter

# How many of the strongest positive / negative words to list per class.
n = 20

vect = ppl.named_steps["vect"]
clf = ppl.named_steps["clf"]
tvec = clf.coef_

# The vocabulary does not change between classes, so fetch it once rather
# than on every loop iteration. `get_feature_names` was removed in
# scikit-learn 1.2; prefer its replacement when the vectorizer provides it.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

output = []
# NOTE(review): assumes `tvec` has one coefficient row per entry of
# `classes`, in the same order -- confirm for the classifier in use.
for i in range(len(tvec)):
    output.append(classes[i])

    # (coefficient, word) pairs, largest coefficient first.
    coefs = sorted(zip(tvec[i], feature_names), key=itemgetter(0), reverse=True)

    # Pair the n largest coefficients with the n smallest (most negative
    # first) so they can be printed side by side.
    topn = zip(coefs[:n], coefs[:-(n + 1):-1])

    output.append("{:>15} {: >20}".format("Positive Words", "Negative Words"))
    for (cp, fnp), (cn, fnn) in topn:
        output.append(u"{:0.4f}{: >15} {:0.4f}{: >15}".format(cp, fnp, cn, fnn))
    output.append("--------------------")

print("\n".join(output))
In [ ]: