notebook.community

Edit and run



In [77]:

    
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from app.classifier import EnsembleAllNumeric, normalize, get_voting_classifier
from app.main import normalize
from app.training import get_undersample_df
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

df = pd.read_csv("data/processed_data.csv")
df = get_undersample_df(df)
df = normalize(df)

val_df = pd.read_csv("data/validation_data_processed.csv")
val_df = normalize(val_df)

val_add_df = pd.read_csv("data/validation_additional_processed_data.csv")
val_add_df = normalize(val_add_df)

_ = df.pop("readme")
_ = val_df.pop("readme")
_ = val_add_df.pop("readme")

y = df.pop("label")
y_val = val_df.pop("label")
y_val_add = val_add_df.pop("label")

#_ = df.pop("Unnamed: 0")
_ = df.pop("index")
_ = val_df.pop("Unnamed: 0")
_ = val_add_df.pop("Unnamed: 0")

val_df = ensemble_clf.transform(val_df)
val_df = ensemble_clf.keep_useful_features(val_df, useful)
val_add_df = ensemble_clf.transform(val_add_df)
val_add_df = ensemble_clf.keep_useful_features(val_add_df, useful)

ensemble_clf = EnsembleAllNumeric()
ensemble_clf.fit(df, y)
df = ensemble_clf.transform(df)
useful = ensemble_clf.useful_features
from sklearn.ensemble import RandomForestClassifier
nb = RandomForestClassifier()
rfe = RFE(estimator=nb, n_features_to_select=120, step=1)
rfe.fit(df, y)

print df.columns[rfe.support_]
print rfe.ranking_[rfe.support_]

print rfe.score(val_df, y_val)
print rfe.score(val_add_df, y_val_add)









    



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Index([              u'watchers',       u'mentionableUsers',
         u'closed_pull_requests',          u'closed_issues',
                  u'open_issues',                  u'forks',
         u'merged_pull_requests',             u'stargazers',
           u'open_pull_requests',               u'projects',
       ...
        u'description_solutions',     u'description_source',
              u'description_the',       u'description_this',
            u'description_tools',   u'description_training',
        u'description_tutorials', u'description_university',
            u'description_video',        u'description_web'],
      dtype='object', length=120)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1]






    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-77-f1b5c5c7568d> in <module>()
     54 print rfe.ranking_[rfe.support_]
     55 
---> 56 print rfe.score(val_df, y_val)
     57 print rfe.score(val_add_df, y_val_add)

/home/madness/Documents/Projects/git_better/venv2/local/lib/python2.7/site-packages/sklearn/utils/metaestimators.pyc in <lambda>(*args, **kwargs)
     52 
     53         # lambda, but not partial, allows help() to work with update_wrapper
---> 54         out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
     55         # update the docstring of the returned function
     56         update_wrapper(out, self.fn)

/home/madness/Documents/Projects/git_better/venv2/local/lib/python2.7/site-packages/sklearn/feature_selection/rfe.pyc in score(self, X, y)
    243             The target values.
    244         """
--> 245         return self.estimator_.score(self.transform(X), y)
    246 
    247     def _get_support_mask(self):

/home/madness/Documents/Projects/git_better/venv2/local/lib/python2.7/site-packages/sklearn/feature_selection/base.pyc in transform(self, X)
     81             return np.empty(0).reshape((X.shape[0], 0))
     82         if len(mask) != X.shape[1]:
---> 83             raise ValueError("X has a different shape than during fitting.")
     84         return X[:, safe_mask(X, mask)]
     85 

ValueError: X has a different shape than during fitting.



In [78]:

    
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(df, y)
print model.feature_importances_
print model.score(val_df, y_val)
print model.score(val_add_df, y_val_add)









    



[  7.52175446e-03   1.19605493e-02   1.22449032e-02 ...,   0.00000000e+00
   9.33706816e-06   0.00000000e+00]






    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-78-f8adae1175b7> in <module>()
      3 model.fit(df, y)
      4 print model.feature_importances_
----> 5 print model.score(val_df, y_val)
      6 print model.score(val_add_df, y_val_add)

/home/madness/Documents/Projects/git_better/venv2/local/lib/python2.7/site-packages/sklearn/base.pyc in score(self, X, y, sample_weight)
    347         """
    348         from .metrics import accuracy_score
--> 349         return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
    350 
    351 

/home/madness/Documents/Projects/git_better/venv2/local/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in predict(self, X)
    532             The predicted classes.
    533         """
--> 534         proba = self.predict_proba(X)
    535 
    536         if self.n_outputs_ == 1:

/home/madness/Documents/Projects/git_better/venv2/local/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in predict_proba(self, X)
    571         """
    572         # Check data
--> 573         X = self._validate_X_predict(X)
    574 
    575         # Assign chunk of trees to jobs

/home/madness/Documents/Projects/git_better/venv2/local/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in _validate_X_predict(self, X)
    353                                  "call `fit` before exploiting the model.")
    354 
--> 355         return self.estimators_[0]._validate_X_predict(X, check_input=True)
    356 
    357     @property

/home/madness/Documents/Projects/git_better/venv2/local/lib/python2.7/site-packages/sklearn/tree/tree.pyc in _validate_X_predict(self, X, check_input)
    374                              "match the input. Model n_features is %s and "
    375                              "input n_features is %s "
--> 376                              % (self.n_features_, n_features))
    377 
    378         return X

ValueError: Number of features of the model must match the input. Model n_features is 1991 and input n_features is 1984



In [79]:

    
%matplotlib inline
import matplotlib.pyplot as plt
l = list(model.feature_importances_)
l = sorted(l)
plt.plot(l)
print len(l)
plt.show()



In [80]:

    
zipped = zip(useful, model.feature_importances_)



In [82]:

    
zipped.sort(key=lambda x: x[1], reverse=True)
[x[0] for x in zipped[:85]]









    Out[82]:





['isOwnerHomepage',
 u'description_homework',
 'hasHomepage',
 u'description_materials',
 u'description_course',
 u'description_curated',
 'stargazers',
 u'description_list',
 u'description_lecture',
 'hasTravisConfig',
 'open_issues',
 'hasLicense',
 u'description_for',
 u'description_solutions',
 'commitsCount',
 'LANGUAGE_Python',
 'hasCiConfig',
 u'description_solution',
 'closed_pull_requests',
 'mentionableUsers',
 'forks',
 'tagsCount',
 u'description_material',
 'merged_pull_requests',
 'branchesCount',
 u'description_awesome',
 u'description_data',
 u'description_resources',
 'releasesCount',
 u'description_assignments',
 'size',
 'watchers',
 'closed_issues',
 u'description_framework',
 'open_pull_requests',
 u'description_language',
 u'description_and',
 u'description_tools',
 u'description_assignment',
 u'description_libraries',
 u'description_class',
 u'description_from',
 u'description_codes',
 'LANGUAGE_Haskell',
 u'description_collection',
 'LANGUAGE_Java',
 u'description_application',
 u'description_countries',
 u'description_the',
 'LANGUAGE_Shell',
 u'description_web',
 u'description_global',
 u'description_coursera',
 u'description_used',
 u'description_css',
 u'description_python',
 u'description_frameworks',
 u'description_package',
 u'description_blog',
 u'description_game',
 u'description_programming',
 u'description_projects',
 u'description_lectures',
 u'description_repository',
 u'description_that',
 u'description_submission',
 u'description_all',
 u'description_javascript',
 u'description_development',
 u'description_metadata',
 u'description_2016',
 u'description_open',
 u'description_engine',
 u'description_documentation',
 u'description_module',
 u'description_series',
 u'description_software',
 u'description_website',
 u'description_summer',
 u'description_natural',
 'LANGUAGE_JavaScript',
 u'description_site',
 u'description_free',
 u'description_slides',
 u'description_resources.']



In [ ]:

    
clfs = [clf[1] for clf in get_voting_classifier().estimators]
clfs.append(ensemble_clf)

loops = 10
for clf in clfs:
    val_score = 0
    val_add_score = 0
    print clf.__class__
    for i in range(loops):
        clf.fit(df, y)
        val_score += clf.score(val_df, y_val)
        val_add_score += clf.score(val_add_df, y_val_add)
    print "Validation: " + str(val_score/loops)
    print "Additional: " + str(val_add_score/loops)



In [20]:

    
from sklearn.tree import export_graphviz
export_graphviz(clf.clf, out_file='tree.dot', class_names=clf.clf.classes_, feature_names=clf.useful_features)









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-20-e07a27e3342d> in <module>()
      1 from sklearn.tree import export_graphviz
----> 2 export_graphviz(clf.clf, out_file='tree.dot', class_names=clf.clf.classes_, feature_names=clf.useful_features)

/home/wgie/Documents/git_better/venv/local/lib/python2.7/site-packages/sklearn/tree/export.pyc in export_graphviz(decision_tree, out_file, max_depth, feature_names, class_names, label, filled, leaves_parallel, impurity, node_ids, proportion, rotate, rounded, special_characters)
    431             recurse(decision_tree, 0, criterion="impurity")
    432         else:
--> 433             recurse(decision_tree.tree_, 0, criterion=decision_tree.criterion)
    434 
    435         # If required, draw leaf nodes at same depth as each other

AttributeError: 'RandomForestClassifier' object has no attribute 'tree_'



In [ ]:

    
!dot -Tpng tree.dot -o tree.png



In [ ]:

    
!eog tree.png



In [18]:

    
df.loc[df["branchesCount"] == 0]









    Out[18]:






  
    
      
      repository
      owner
      name
      watchers
      mentionableUsers
      closed_pull_requests
      closed_issues
      open_issues
      forks
      merged_pull_requests
      ...
      LANGUAGE_ANTLR
      LANGUAGE_Scilab
      LANGUAGE_Module Management System
      LANGUAGE_SAS
      LANGUAGE_Nemerle
      LANGUAGE_Csound Document
      LANGUAGE_Agda
      LANGUAGE_XQuery
      LANGUAGE_Ada
      LANGUAGE_Arduino
    
  
  
    
      69
      tekkub/tekkub.github.com
      tekkub
      tekkub.github.com
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      126
      hacke2/hacke2.github.
      hacke2
      hacke2.github.
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      327
      cdwanze/cdwanze.github.io
      cdwanze
      cdwanze.github.io
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      486
      datasets/datasets.github.com
      datasets
      datasets.github.com
      6.0
      23.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      593
      meebert/Dynamic-Programming-EggDrop
      meebert
      Dynamic-Programming-EggDrop
      1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      613
      myhomeworkhelp/Accounts-Assignment-Help
      myhomeworkhelp
      Accounts-Assignment-Help
      1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      634
      theironyard-rails-atl/jamesdab
      theironyard-rails-atl
      jamesdab
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      1019
      XuezhengMa/Math
      XuezhengMa
      Math
      1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      1043
      studentalpha/PRML-solutions
      studentalpha
      PRML-solutions
      1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      1044
      bachkiko/SoftUni
      bachkiko
      SoftUni
      1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      1058
      karlahernandez/Homework-Solution-Template-LaTeX
      karlahernandez
      Homework-Solution-Template-LaTeX
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
  

11 rows × 178 columns

	repository	owner	name	watchers	mentionableUsers	...
69	tekkub/tekkub.github.com	tekkub	tekkub.github.com	0.0	0.0	...
126	hacke2/hacke2.github.	hacke2	hacke2.github.	0.0	0.0	...
327	cdwanze/cdwanze.github.io	cdwanze	cdwanze.github.io	0.0	0.0	...
486	datasets/datasets.github.com	datasets	datasets.github.com	6.0	23.0	...
593	meebert/Dynamic-Programming-EggDrop	meebert	Dynamic-Programming-EggDrop	1.0	1.0	...
613	myhomeworkhelp/Accounts-Assignment-Help	myhomeworkhelp	Accounts-Assignment-Help	1.0	1.0	...
634	theironyard-rails-atl/jamesdab	theironyard-rails-atl	jamesdab	0.0	0.0	...
1019	XuezhengMa/Math	XuezhengMa	Math	1.0	1.0	...
1043	studentalpha/PRML-solutions	studentalpha	PRML-solutions	1.0	1.0	...
1044	bachkiko/SoftUni	bachkiko	SoftUni	1.0	1.0	...
1058	karlahernandez/Homework-Solution-Template-LaTeX	karlahernandez	Homework-Solution-Template-LaTeX	0.0	0.0	...