In [1]:
%load_ext autoreload
%autoreload 2
from app.evaluation import print_boolean_matrix, get_accuracy_and_plot_confusion
import pandas as pd
from app.training import get_undersample_df
from app.classifier import normalize, EnsembleAllNumeric, get_text_pipeline, get_voting_classifier, DescriptionClassifier, ReadmeClassifier, NumericEnsembleClassifier, normalize, EnsembleAllNumeric, keep_useful_features
from app.constants import VALIDATION_DATA_PATH, PROCESSED_DATA_PATH
from sklearn.ensemble import VotingClassifier
In [2]:
# Training data: read the processed CSV, then hand it to get_undersample_df.
df = pd.read_csv(PROCESSED_DATA_PATH)
df = get_undersample_df(df)

# Drop the two bookkeeping columns in place before normalizing
# (presumably index columns left over from earlier CSV round-trips -- TODO confirm).
del df["index"]
del df["Unnamed: 0"]

df = normalize(df)
# Detach the target column so `df` holds only the model inputs.
y = df.pop("label")

# Validation data: same normalize + label split, but no undersampling and
# no column drops.
val_df = pd.read_csv(VALIDATION_DATA_PATH)
val_df = normalize(val_df)
y_val = val_df.pop("label")
In [3]:
# Combine the three project classifiers into one scikit-learn VotingClassifier.
# voting="soft" combines the members' predicted class probabilities rather
# than their hard class votes, so each member must expose predict_proba.
clf = VotingClassifier(
    estimators=[
        ("description", DescriptionClassifier()),
        ("readme", ReadmeClassifier()),
        ("ensemble", NumericEnsembleClassifier()),
    ],
    voting="soft",
)
# Kept as the last expression so the fitted estimator's repr is the cell output.
clf.fit(df, y)
Out[3]:
In [4]:
# Predict a label for every row of the normalized validation frame.
predicted = clf.predict(X=val_df)
# Compare predictions against the held-out validation labels
# (output format is defined by app.evaluation.print_boolean_matrix).
print_boolean_matrix(y_val, predicted)
In [5]:
# Show the true validation labels and the predicted labels one after the
# other for a quick visual comparison.
# The parenthesised single-argument form prints the same text on Python 2
# (where it is a print statement with a parenthesised expression) and on
# Python 3 (where the bare `print x` statement is a SyntaxError).
print(list(y_val))
print(predicted)
In [6]:
# Evaluate the validation predictions against the true labels; clf.classes_
# supplies the class labels learned by the fitted VotingClassifier.
# Going by the helper's name it reports accuracy and draws a confusion
# matrix -- see app.evaluation for the exact output. The call is the last
# expression, so whatever it returns becomes the cell output.
get_accuracy_and_plot_confusion(y_val, predicted, clf.classes_)
Out[6]:
In [12]:
# Build a display table of predictions next to the raw validation rows.
# The raw CSV is re-read because the frame used for prediction had its
# "label" column popped and was passed through normalize().
# A separate name is used on purpose: rebinding `val_df` here would replace
# the normalized feature frame that clf.predict(val_df) relies on, so
# re-running the prediction cell afterwards would feed the model raw data
# that also contains the "label" and "predicted" columns.
val_results_df = pd.read_csv(VALIDATION_DATA_PATH)
# Assumes normalize() kept the row order and row count of the CSV, so the
# predictions line up with the raw rows -- TODO confirm.
val_results_df["predicted"] = predicted
val_results_df[["repository", "label", "predicted", "stargazers"]]
Out[12]:
In [ ]: