In [1]:
%load_ext autoreload
%autoreload 2
from app.evaluation import print_boolean_matrix, get_accuracy_and_plot_confusion
import pandas as pd
from app.training import get_undersample_df
from app.classifier import normalize, EnsembleAllNumeric, get_text_pipeline, get_voting_classifier, DescriptionClassifier, ReadmeClassifier, NumericEnsembleClassifier, normalize, EnsembleAllNumeric, keep_useful_features
from app.constants import VALIDATION_DATA_PATH, PROCESSED_DATA_PATH
from sklearn.ensemble import VotingClassifier

In [2]:
# Load the processed training data and balance the classes by undersampling.
df = get_undersample_df(pd.read_csv(PROCESSED_DATA_PATH))
# Drop bookkeeping columns left over from CSV round-trips. errors="ignore"
# makes this idempotent (the original pop() calls raise KeyError on re-run
# or when the columns are absent from the CSV).
df = df.drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
df = normalize(df)
y = df.pop("label")

# Validation set: normalized the same way, with the target split off.
val_df = normalize(pd.read_csv(VALIDATION_DATA_PATH))
y_val = val_df.pop("label")

In [3]:
# Soft-voting ensemble over the three feature-specific classifiers:
# each estimator's class probabilities are averaged to pick the label.
estimators = [
    ('description', DescriptionClassifier()),
    ('readme', ReadmeClassifier()),
    ('ensemble', NumericEnsembleClassifier()),
]
clf = VotingClassifier(estimators=estimators, voting='soft')
clf.fit(df, y)


Out[3]:
VotingClassifier(estimators=[('description', <app.classifier.DescriptionClassifier object at 0x7fa34b6191d0>), ('readme', <app.classifier.ReadmeClassifier object at 0x7fa344612f90>), ('ensemble', <app.classifier.NumericEnsembleClassifier object at 0x7fa34b619490>)],
         n_jobs=1, voting='soft', weights=None)

In [4]:
# Predict labels for the validation set and print per-class
# precision/recall as a LaTeX table (rendered in the output below).
predicted = clf.predict(val_df)
print_boolean_matrix(y_val, predicted)


\begin{table}[h]
\centering
\caption{Boolean Matrix}
\label{boolean_matrix}
\begin{tabular}{|r|r|r|r|r|}
 \hline
Label & Predicted Correctly & Predicted Incorrectly & Precision & Recall \\ \hline
WEB & 2 & 2 & 0.40 & 0.50 \\ \hline
DOCS & 2 & 2 & 0.50 & 0.50 \\ \hline
HW & 5 & 0 & 0.71 & 1.00 \\ \hline
DEV & 8 & 3 & 0.89 & 0.73 \\ \hline
EDU & 2 & 2 & 1.00 & 0.50 \\ \hline
DATA & 2 & 1 & 0.50 & 0.67 \\ \hline
\multicolumn{3}{|l|}{Weighted Average} & 0.72 & 0.68 \\ \hline
\end{tabular}
\end{table}

In [5]:
# Side-by-side dump of true vs. predicted labels.
# Single-argument print(...) is valid and behaves identically under
# both Python 2 and Python 3, unlike the statement form `print x`.
print(list(y_val))
print(predicted)


['HW', 'HW', 'EDU', 'EDU', 'DEV', 'WEB', 'EDU', 'DOCS', 'DOCS', 'DATA', 'DEV', 'HW', 'DEV', 'DEV', 'HW', 'DEV', 'DOCS', 'DEV', 'EDU', 'DEV', 'DEV', 'WEB', 'HW', 'DATA', 'DATA', 'WEB', 'DEV', 'DEV', 'WEB', 'DEV', 'DOCS']
['HW' 'HW' 'EDU' 'EDU' 'DEV' 'WEB' 'DEV' 'WEB' 'DOCS' 'DOCS' 'DEV' 'HW'
 'DEV' 'DEV' 'HW' 'DEV' 'DOCS' 'DEV' 'WEB' 'DEV' 'DEV' 'WEB' 'HW' 'DATA'
 'DATA' 'DATA' 'DATA' 'WEB' 'HW' 'DOCS' 'HW']

In [6]:
# Overall validation accuracy plus a confusion-matrix plot
# (class order taken from the fitted ensemble's classes_).
get_accuracy_and_plot_confusion(y_val, predicted, clf.classes_)


Out[6]:
0.67741935483870963

In [12]:
# Reload the raw validation rows (un-normalized, with repository names) to
# inspect predictions per repository. Use a fresh name instead of clobbering
# `val_df`: the earlier cells predict from the *normalized* val_df, so
# overwriting it would break a Restart-&-Run-All re-execution of those cells.
results_df = pd.read_csv(VALIDATION_DATA_PATH)
results_df["predicted"] = predicted
results_df[["repository", "label", "predicted", "stargazers"]]


Out[12]:
repository label predicted stargazers
0 ga-chicago/wdi5-homework HW HW 0.0
1 Aggregates/MI_HW2 HW HW 0.0
2 datasciencelabs/2016 EDU EDU 38.0
3 githubteacher/intro-november-2015 EDU EDU 13.0
4 atom/atom DEV DEV 31999.0
5 jmcglone/jmcglone.github.io WEB WEB 69.0
6 hpi-swt2-exercise/java-tdd-challenge EDU DEV 1.0
7 alphagov/performanceplatform-documentation DOCS WEB 5.0
8 harvesthq/how-to-walkabout DOCS DOCS 64.0
9 vhf/free-programming-books DATA DOCS 69085.0
10 d3/d3 DEV DEV 56332.0
11 carlosmn/CoMa-II HW HW 0.0
12 git/git-scm.com DEV DEV 1258.0
13 PowerDNS/pdns DEV DEV 704.0
14 cmrberry/cs6300-git-practice HW HW 0.0
15 Sefaria/Sefaria-Project DEV DEV 134.0
16 mongodb/docs DOCS DOCS 445.0
17 sindresorhus/eslint-config-xo DEV DEV 67.0
18 e-books/backbone.en.douceur EDU WEB 101.0
19 erikflowers/weather-icons DEV DEV 3737.0
20 tensorflow/tensorflow DEV DEV 35812.0
21 cs231n/cs231n.github.io WEB WEB 1632.0
22 m2mtech/smashtag-2015 HW HW 19.0
23 openaddresses/openaddresses DATA DATA 754.0
24 benbalter/congressional-districts DATA DATA 94.0
25 Chicago/food-inspections-evaluation WEB DATA 131.0
26 OpenInstitute/OpenDuka DEV DATA 40.0
27 torvalds/linux DEV WEB 38443.0
28 bhuga/bhuga.net WEB HW 1.0
29 macloo/just_enough_code DEV DOCS 1.0
30 hughperkins/howto-jenkins-ssl DOCS HW 25.0

In [ ]: