In [11]:
import pandas as pd
pd.set_option('display.max_columns', 500)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from app.evaluation import get_cleaned_processed_df

# Load the cleaned, pre-processed repository dataset (cleaning logic lives in
# app.evaluation); this frame is the working dataset for the whole notebook.
data_frame = get_cleaned_processed_df()

In [12]:
# Import explicitly instead of `from app.process import *`: star imports hide
# where names come from and can silently shadow earlier definitions. Only
# process_links is used from this module anywhere in the notebook.
from app.process import process_links

# Crawl/feature-extract the repositories listed in the links file; returns a
# DataFrame with the same columns as the main training frame.
additional_df = process_links("data/edu_links.txt")


 71%|███████▏  | 5/7 [00:06<00:05,  2.74s/it]

In [13]:
# DataFrame.append was deprecated and removed in pandas 2.0; pd.concat with
# default arguments is the drop-in, forward-compatible equivalent here.
data_frame = pd.concat([data_frame, additional_df])
# Keep the first occurrence of each repository, so rows from the original
# dataset win over freshly crawled duplicates.
data_frame = data_frame.drop_duplicates(["repository"])
len(data_frame)


Out[13]:
1412

In [14]:
# Use assignment instead of fillna(..., inplace=True) on a column selection:
# the in-place form mutates through chained indexing, which newer pandas
# warns about and pandas 3.x no longer supports.
data_frame["description"] = data_frame["description"].fillna("")
data_frame["readme"] = data_frame["readme"].fillna("")
# Remaining (numeric) columns: treat missing as zero.
data_frame = data_frame.fillna(0)
data_frame.to_csv("data/processed_data.csv", encoding="utf-8")

In [15]:
import pandas as pd
from collections import Counter
import numpy as np

# Frequency of each class label in the dataset. keys() and values() of the
# same Counter are guaranteed to be in matching order.
label_counts = Counter(data_frame['label'])
labels = list(label_counts.keys())
values = [label_counts[label] for label in labels]

indexes = np.arange(len(labels))
width = 1

# Bar chart of the label distribution, one tick per class.
plt.bar(indexes, values, width)
plt.xticks(indexes + width * 0.5, labels)
plt.show()



In [16]:
"""Compare 9 classifiers"""

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
# NOTE(review): sklearn.cross_validation is the pre-0.18 module; on modern
# scikit-learn this import (and the iterable StratifiedShuffleSplit API used
# below) must be ported to sklearn.model_selection.
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from tqdm import tqdm
sns.set_color_codes("muted")

#data_frame = pd.read_csv('data/processed_data.csv')


# Encode the string labels as integers and build 10 stratified 70/30 splits.
le = LabelEncoder().fit(data_frame.label)
labels = le.transform(data_frame.label)
sss = StratifiedShuffleSplit(labels, 10, test_size=0.3, random_state=23)

# Drop free-text and identifier columns (plus the target) so only numeric
# feature columns remain; one drop call instead of six in-place mutations.
data_frame.drop(['readme', 'description', 'repository', 'owner', 'name', 'label'],
                axis=1, inplace=True)
data_frame.fillna(0, inplace=True)

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    #NuSVC(probability=True),
    DecisionTreeClassifier(class_weight='balanced'),
    RandomForestClassifier(class_weight='balanced'),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    #LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    XGBClassifier()]

# Per-classifier accumulators for accuracy and log-loss across all splits.
classifier_accs = {clf.__class__.__name__: 0 for clf in classifiers}
classifier_loss = {clf.__class__.__name__: 0 for clf in classifiers}

for train_index, test_index in tqdm(sss):
    X_train, X_test = data_frame.values[train_index], data_frame.values[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    for clf in classifiers:
        clf.fit(X_train, y_train)
        name = clf.__class__.__name__
        # Separate names for hard predictions vs. class probabilities
        # (the original reused one variable for both).
        y_pred = clf.predict(X_test)
        classifier_accs[name] += accuracy_score(y_test, y_pred)

        y_proba = clf.predict_proba(X_test)
        classifier_loss[name] += log_loss(y_test, y_proba)

# Average over the number of splits. .items() instead of the Python-2-only
# .iteritems() keeps this cell runnable on both Python 2 and 3.
avg_acc = {k: v / len(sss) for k, v in classifier_accs.items()}
avg_loss = {k: v / len(sss) for k, v in classifier_loss.items()}
avg_df = pd.DataFrame([[name, acc, avg_loss[name]] for name, acc in avg_acc.items()],
                      columns=['Classifier', 'Avg Accuracy', 'Avg Loss'])

sns.barplot(x='Avg Accuracy', y='Classifier', data=avg_df, color="b")
plt.xlabel('Average Accuracy %')
plt.title('Average Classifier Accuracy')
plt.show()

sns.barplot(x='Avg Loss', y='Classifier', data=avg_df, color="g")
plt.xlabel('Average Loss')
plt.title('Average Classifier Loss')
plt.show()

100%|██████████| 10/10 [00:48<00:00,  4.86s/it]

In [17]:
"""Learn one classifier on the training set and evaluate it on the validation set"""

from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsOneClassifier
from app.evaluation import get_accuracy_and_plot_confusion, get_training_and_validation_df

X_train, y_train, X_valid, y_valid = get_training_and_validation_df()

# The encoder is only needed for the ordered class names used as confusion-
# matrix axis labels; the classifier itself trains on the raw labels.
# (Removed dead code: an unused `labels = le.transform(...)` and `acc = 0`.)
le = LabelEncoder().fit(y_train)
classes = le.classes_

clf = XGBClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_valid)
get_accuracy_and_plot_confusion(y_valid, y_pred, classes)


Out[17]:
0.12903225806451613

In [18]:
# Re-load the full cleaned dataset (earlier cells mutated data_frame).
data_frame = get_cleaned_processed_df()
print(len(data_frame))

# Export <repository URL, label> pairs as the training-data file.
training = data_frame[["repository", "label"]].copy()
training["repository"] = "https://github.com/" + training["repository"]
training.to_csv("data/training_data.csv", index=False)


1412

In [ ]: