In [67]:
#%writefile learner1.py
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import binarize
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.feature_selection import (SelectKBest, SelectPercentile, chi2,
                                       f_classif, f_regression, mutual_info_classif)

class RubygemLearner(object):
    def __init__(self, fn):
        with open(fn) as f:
            data = json.load(f)
        self.data = data["data"]
        self.spec = data["spec"]
        self.df = pd.json_normalize(self.data)
        self.features = self.exclude_unwanted_features()
        self._rename_columns()
        # Keep only records with an average NLOC below 20.
        self.df = self.df[self.df["avg_nloc"] < 20]
            
    def exclude_unwanted_features(self):
        exclude = []  # e.g. ['nloc'] + reek_columns + lizard_columns
        return [k for k, v in self.data[0]['stat'].items() if k not in exclude]
        
    def remove_too_small_changes(self):
        # Drop gems whose total change volume (delta_nloc) is too small to be informative.
        try:
            self.df = self.df.groupby('name').filter(lambda x: x['delta_nloc'].sum() > 2000)
        except KeyError:
            # Tolerate datasets that lack a 'delta_nloc' column.
            pass
    
    def summary(self):
        print(self.spec)
        print('nb_records:', len(self.df))
        print("abandoned:", (self.df['label'] != 'maintained').sum())
        
    def feature_evaluation(self):
        # Rank features by their ANOVA F-score against the maintained/abandoned label.
        X = self.df[self.features]
        y = (self.df.label == 'maintained').values.astype(int)
        selector = SelectKBest(f_classif, k=10).fit(X, y)  # only the scores are used, so k is irrelevant here
        feature_list = [[float(score), name] for score, name in zip(selector.scores_, self.features)]
        return sorted(feature_list, key=lambda entry: entry[0], reverse=True)
        
    def _rename_columns(self):
        # Flatten json_normalize's 'stat.<metric>' column names down to '<metric>'.
        old_new_cols = {}
        for name in self.df.columns:
            if name.startswith('stat'):
                old_new_cols[name] = name.split('.')[1]
        self.df.rename(columns=old_new_cols, inplace=True)
        
    def my_train_test_split(self, test_size=0.3):
        # Split by gem name so all records of a gem land on the same side,
        # avoiding leakage between the train and test sets.
        Xgems = self.df[["name"]].drop_duplicates()
        X_train_gems, X_test_gems = train_test_split(Xgems, test_size=test_size)
        X_train = self.df[self.df['name'].isin(X_train_gems['name'])]
        y_train = (X_train.label == 'maintained').values.astype(int)
        X_train = X_train[self.features]
        dftest = self.df[self.df['name'].isin(X_test_gems['name'])]

        return X_train, y_train, dftest

In [70]:
fn = 'RubygemDigger--Steps--GenerateJsonForLastVersions--1.data.json'
#fn = 'RubygemDigger--Steps--GenerateJsonForAllVersions--1.data.json.all.json'

learner = RubygemLearner(fn)
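
A quick sanity check of what was loaded can be done with the summary() method defined above (not executed in the original notebook):

In [ ]:
learner.summary()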

In [73]:
learner.feature_evaluation()[:20]


Out[73]:
[[46.16000204454106, 'style_'],
 [38.03630385465227, 'lint_'],
 [22.82784427240693, 'metrics_'],
 [15.09687869019009, 'lint_duplicate'],
 [14.954064018153039, 'metrics_perceivedcomplexity'],
 [13.580174991072534, 'metrics_cyclomaticcomplexity'],
 [12.864751105747185, 'ClassVariable'],
 [10.985229061642315, 'performance_'],
 [10.763269893506074, 'metrics_classlength'],
 [9.062464297122212, 'metrics_abcsize'],
 [8.700433362849248, 'InstanceVariableAssumption'],
 [8.68876107343239, 'metrics_methodlength'],
 [6.229381722984, 'metrics_linelength'],
 [5.106207790958623, 'TooManyStatements'],
 [4.9592937039708005, 'metrics_blocklength'],
 [4.83327620552429, 'warning_count'],
 [4.762727855322185, 'reek_total'],
 [4.429191897632879, 'security_'],
 [4.233816937769375, 'fun_count'],
 [4.1352090269238015, 'TooManyInstanceVariables']]
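
A possible follow-up, sketched here rather than taken from the original notebook: feed the gem-level split from my_train_test_split() into one of the already-imported classifiers and score it on the held-out gems. The classifier choice and metrics below are assumptions.

In [ ]:
# Sketch only: train LogisticRegression on the gem-level split and evaluate
# on the held-out gems returned as dftest.
X_train, y_train, dftest = learner.my_train_test_split(test_size=0.3)
X_test = dftest[learner.features]
y_test = (dftest.label == 'maintained').values.astype(int)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_prob = clf.predict_proba(X_test)[:, 1]
print('accuracy:', metrics.accuracy_score(y_test, clf.predict(X_test)))
print('roc_auc :', metrics.roc_auc_score(y_test, y_prob))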

In [ ]: