In [77]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from app.classifier import EnsembleAllNumeric, normalize, get_voting_classifier
from app.main import normalize
from app.training import get_undersample_df
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

df = pd.read_csv("data/processed_data.csv")
df = get_undersample_df(df)
df = normalize(df)

val_df = pd.read_csv("data/validation_data_processed.csv")
val_df = normalize(val_df)

val_add_df = pd.read_csv("data/validation_additional_processed_data.csv")
val_add_df = normalize(val_add_df)

_ = df.pop("readme")
_ = val_df.pop("readme")
_ = val_add_df.pop("readme")

y = df.pop("label")
y_val = val_df.pop("label")
y_val_add = val_add_df.pop("label")

#_ = df.pop("Unnamed: 0")
_ = df.pop("index")
_ = val_df.pop("Unnamed: 0")
_ = val_add_df.pop("Unnamed: 0")

val_df = ensemble_clf.transform(val_df)
val_df = ensemble_clf.keep_useful_features(val_df, useful)
val_add_df = ensemble_clf.transform(val_add_df)
val_add_df = ensemble_clf.keep_useful_features(val_add_df, useful)

ensemble_clf = EnsembleAllNumeric()
ensemble_clf.fit(df, y)
df = ensemble_clf.transform(df)
useful = ensemble_clf.useful_features
from sklearn.ensemble import RandomForestClassifier
nb = RandomForestClassifier()
rfe = RFE(estimator=nb, n_features_to_select=120, step=1)
rfe.fit(df, y)

print df.columns[rfe.support_]
print rfe.ranking_[rfe.support_]

print rfe.score(val_df, y_val)
print rfe.score(val_add_df, y_val_add)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Index([              u'watchers',       u'mentionableUsers',
         u'closed_pull_requests',          u'closed_issues',
                  u'open_issues',                  u'forks',
         u'merged_pull_requests',             u'stargazers',
           u'open_pull_requests',               u'projects',
       ...
        u'description_solutions',     u'description_source',
              u'description_the',       u'description_this',
            u'description_tools',   u'description_training',
        u'description_tutorials', u'description_university',
            u'description_video',        u'description_web'],
      dtype='object', length=120)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-77-f1b5c5c7568d> in <module>()
     54 print rfe.ranking_[rfe.support_]
     55 
---> 56 print rfe.score(val_df, y_val)
     57 print rfe.score(val_add_df, y_val_add)

/home/madness/Documents/Projects/git_better/venv2/local/lib/python2.7/site-packages/sklearn/utils/metaestimators.pyc in <lambda>(*args, **kwargs)
     52 
     53         # lambda, but not partial, allows help() to work with update_wrapper
---> 54         out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
     55         # update the docstring of the returned function
     56         update_wrapper(out, self.fn)

/home/madness/Documents/Projects/git_better/venv2/local/lib/python2.7/site-packages/sklearn/feature_selection/rfe.pyc in score(self, X, y)
    243             The target values.
    244         """
--> 245         return self.estimator_.score(self.transform(X), y)
    246 
    247     def _get_support_mask(self):

/home/madness/Documents/Projects/git_better/venv2/local/lib/python2.7/site-packages/sklearn/feature_selection/base.pyc in transform(self, X)
     81             return np.empty(0).reshape((X.shape[0], 0))
     82         if len(mask) != X.shape[1]:
---> 83             raise ValueError("X has a different shape than during fitting.")
     84         return X[:, safe_mask(X, mask)]
     85 

ValueError: X has a different shape than during fitting.

In [78]:
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(df, y)
print model.feature_importances_
print model.score(val_df, y_val)
print model.score(val_add_df, y_val_add)


[  7.52175446e-03   1.19605493e-02   1.22449032e-02 ...,   0.00000000e+00
   9.33706816e-06   0.00000000e+00]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-78-f8adae1175b7> in <module>()
      3 model.fit(df, y)
      4 print model.feature_importances_
----> 5 print model.score(val_df, y_val)
      6 print model.score(val_add_df, y_val_add)

/home/madness/Documents/Projects/git_better/venv2/local/lib/python2.7/site-packages/sklearn/base.pyc in score(self, X, y, sample_weight)
    347         """
    348         from .metrics import accuracy_score
--> 349         return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
    350 
    351 

/home/madness/Documents/Projects/git_better/venv2/local/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in predict(self, X)
    532             The predicted classes.
    533         """
--> 534         proba = self.predict_proba(X)
    535 
    536         if self.n_outputs_ == 1:

/home/madness/Documents/Projects/git_better/venv2/local/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in predict_proba(self, X)
    571         """
    572         # Check data
--> 573         X = self._validate_X_predict(X)
    574 
    575         # Assign chunk of trees to jobs

/home/madness/Documents/Projects/git_better/venv2/local/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in _validate_X_predict(self, X)
    353                                  "call `fit` before exploiting the model.")
    354 
--> 355         return self.estimators_[0]._validate_X_predict(X, check_input=True)
    356 
    357     @property

/home/madness/Documents/Projects/git_better/venv2/local/lib/python2.7/site-packages/sklearn/tree/tree.pyc in _validate_X_predict(self, X, check_input)
    374                              "match the input. Model n_features is %s and "
    375                              "input n_features is %s "
--> 376                              % (self.n_features_, n_features))
    377 
    378         return X

ValueError: Number of features of the model must match the input. Model n_features is 1991 and input n_features is 1984 

In [79]:
%matplotlib inline
import matplotlib.pyplot as plt
l = list(model.feature_importances_)
l = sorted(l)
plt.plot(l)
print len(l)
plt.show()


1991

In [80]:
zipped = zip(useful, model.feature_importances_)

In [82]:
zipped.sort(key=lambda x: x[1], reverse=True)
[x[0] for x in zipped[:85]]


Out[82]:
['isOwnerHomepage',
 u'description_homework',
 'hasHomepage',
 u'description_materials',
 u'description_course',
 u'description_curated',
 'stargazers',
 u'description_list',
 u'description_lecture',
 'hasTravisConfig',
 'open_issues',
 'hasLicense',
 u'description_for',
 u'description_solutions',
 'commitsCount',
 'LANGUAGE_Python',
 'hasCiConfig',
 u'description_solution',
 'closed_pull_requests',
 'mentionableUsers',
 'forks',
 'tagsCount',
 u'description_material',
 'merged_pull_requests',
 'branchesCount',
 u'description_awesome',
 u'description_data',
 u'description_resources',
 'releasesCount',
 u'description_assignments',
 'size',
 'watchers',
 'closed_issues',
 u'description_framework',
 'open_pull_requests',
 u'description_language',
 u'description_and',
 u'description_tools',
 u'description_assignment',
 u'description_libraries',
 u'description_class',
 u'description_from',
 u'description_codes',
 'LANGUAGE_Haskell',
 u'description_collection',
 'LANGUAGE_Java',
 u'description_application',
 u'description_countries',
 u'description_the',
 'LANGUAGE_Shell',
 u'description_web',
 u'description_global',
 u'description_coursera',
 u'description_used',
 u'description_css',
 u'description_python',
 u'description_frameworks',
 u'description_package',
 u'description_blog',
 u'description_game',
 u'description_programming',
 u'description_projects',
 u'description_lectures',
 u'description_repository',
 u'description_that',
 u'description_submission',
 u'description_all',
 u'description_javascript',
 u'description_development',
 u'description_metadata',
 u'description_2016',
 u'description_open',
 u'description_engine',
 u'description_documentation',
 u'description_module',
 u'description_series',
 u'description_software',
 u'description_website',
 u'description_summer',
 u'description_natural',
 'LANGUAGE_JavaScript',
 u'description_site',
 u'description_free',
 u'description_slides',
 u'description_resources.']

In [ ]:
clfs = [clf[1] for clf in get_voting_classifier().estimators]
clfs.append(ensemble_clf)

loops = 10
for clf in clfs:
    val_score = 0
    val_add_score = 0
    print clf.__class__
    for i in range(loops):
        clf.fit(df, y)
        val_score += clf.score(val_df, y_val)
        val_add_score += clf.score(val_add_df, y_val_add)
    print "Validation: " + str(val_score/loops)
    print "Additional: " + str(val_add_score/loops)

In [20]:
from sklearn.tree import export_graphviz
export_graphviz(clf.clf, out_file='tree.dot', class_names=clf.clf.classes_, feature_names=clf.useful_features)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-20-e07a27e3342d> in <module>()
      1 from sklearn.tree import export_graphviz
----> 2 export_graphviz(clf.clf, out_file='tree.dot', class_names=clf.clf.classes_, feature_names=clf.useful_features)

/home/wgie/Documents/git_better/venv/local/lib/python2.7/site-packages/sklearn/tree/export.pyc in export_graphviz(decision_tree, out_file, max_depth, feature_names, class_names, label, filled, leaves_parallel, impurity, node_ids, proportion, rotate, rounded, special_characters)
    431             recurse(decision_tree, 0, criterion="impurity")
    432         else:
--> 433             recurse(decision_tree.tree_, 0, criterion=decision_tree.criterion)
    434 
    435         # If required, draw leaf nodes at same depth as each other

AttributeError: 'RandomForestClassifier' object has no attribute 'tree_'

In [ ]:
!dot -Tpng tree.dot -o tree.png

In [ ]:
!eog tree.png

In [18]:
df.loc[df["branchesCount"] == 0]


Out[18]:
repository owner name watchers mentionableUsers closed_pull_requests closed_issues open_issues forks merged_pull_requests ... LANGUAGE_ANTLR LANGUAGE_Scilab LANGUAGE_Module Management System LANGUAGE_SAS LANGUAGE_Nemerle LANGUAGE_Csound Document LANGUAGE_Agda LANGUAGE_XQuery LANGUAGE_Ada LANGUAGE_Arduino
69 tekkub/tekkub.github.com tekkub tekkub.github.com 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
126 hacke2/hacke2.github. hacke2 hacke2.github. 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
327 cdwanze/cdwanze.github.io cdwanze cdwanze.github.io 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
486 datasets/datasets.github.com datasets datasets.github.com 6.0 23.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
593 meebert/Dynamic-Programming-EggDrop meebert Dynamic-Programming-EggDrop 1.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
613 myhomeworkhelp/Accounts-Assignment-Help myhomeworkhelp Accounts-Assignment-Help 1.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
634 theironyard-rails-atl/jamesdab theironyard-rails-atl jamesdab 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1019 XuezhengMa/Math XuezhengMa Math 1.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1043 studentalpha/PRML-solutions studentalpha PRML-solutions 1.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1044 bachkiko/SoftUni bachkiko SoftUni 1.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1058 karlahernandez/Homework-Solution-Template-LaTeX karlahernandez Homework-Solution-Template-LaTeX 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

11 rows × 178 columns