In [67]:
#%writefile learner1.py
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import binarize
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,f_classif, mutual_info_classif, f_regression, SelectPercentile
class RubygemLearner(object):
    """Load a RubygemDigger JSON export and prepare it for feature scoring.

    The JSON file must contain a top-level object with a ``"data"`` list of
    records and a ``"spec"`` entry. Each record is flattened into one
    DataFrame row; entries nested under a record's ``"stat"`` key become the
    candidate feature columns.

    Attributes set by ``__init__``:
        data: the raw ``"data"`` list from the JSON file.
        spec: the raw ``"spec"`` entry from the JSON file.
        df: flattened records with the ``stat.`` prefix stripped from column
            names, filtered on ``avg_nloc``.
        features: names of the columns used as model features.
    """

    def __init__(self, fn, max_avg_nloc=20):
        """Read ``fn`` and build the working DataFrame.

        Args:
            fn: path of the JSON file to load.
            max_avg_nloc: rows whose ``avg_nloc`` is not strictly below this
                value are dropped. Pass ``None`` to keep every row.
        """
        with open(fn) as f:
            data = json.load(f)
        self.data = data["data"]
        self.spec = data["spec"]

        # Newer pandas exposes json_normalize at the top level; the older
        # pd.io.json location is kept as a fallback for older installs.
        normalize = getattr(pd, "json_normalize", None)
        if normalize is None:
            normalize = pd.io.json.json_normalize
        self.df = normalize(self.data)

        self.features = self.exclude_unwanted_features()
        self._rename_columns()
        if max_avg_nloc is not None:
            self.df = self.df[self.df["avg_nloc"] < max_avg_nloc]

    def exclude_unwanted_features(self):
        """Return the ``stat`` keys of the first record, minus excluded ones.

        The exclusion list is currently empty, so every key is returned.
        """
        exclude = []
        return [k for k in self.data[0]['stat'] if k not in exclude]

    def remove_too_small_changes(self, min_delta_nloc=2000):
        """Drop every gem whose summed ``delta_nloc`` is not above the threshold.

        Rows are grouped by ``name``; a group is kept only when the sum of its
        ``delta_nloc`` values is strictly greater than ``min_delta_nloc``.
        When the required columns are absent the frame is left untouched.

        Args:
            min_delta_nloc: minimum (exclusive) total ``delta_nloc`` per gem.
        """
        try:
            self.df = self.df.groupby('name').filter(
                lambda group: group['delta_nloc'].sum() > min_delta_nloc
            )
        except KeyError:
            # Best effort: data sets without 'name'/'delta_nloc' are not filtered.
            pass

    def summary(self):
        """Print the spec, the number of rows and the count of non-maintained rows."""
        print(self.spec)
        print('nb_records:', len(self.df))
        print("abandoned:", (self.df['label'] != 'maintained').sum())

    def feature_evaluation(self):
        """Score every feature against the maintained/not-maintained label.

        Uses the ANOVA F-value (``f_classif``) via ``SelectKBest``.

        Returns:
            A list of ``[score, feature_name]`` pairs sorted by descending
            score. Features whose score is NaN are placed last.
        """
        X = self.df[self.features]
        y = (self.df.label == 'maintained').values.astype(int)
        # Only scores_ is consumed, so k='all' avoids the error raised when
        # fewer than 10 features are available.
        selector = SelectKBest(f_classif, k='all').fit(X, y)
        scored = [
            [float(score), name]
            for score, name in zip(selector.scores_, self.features)
        ]
        # NaN compares false against everything; the leading flag keeps the
        # ordering well defined and pushes NaN scores to the end.
        return sorted(
            scored,
            key=lambda entry: (not np.isnan(entry[0]), entry[0]),
            reverse=True,
        )

    def _rename_columns(self):
        """Strip the ``stat.`` prefix that JSON flattening adds to stat columns.

        Only columns starting with ``stat.`` are renamed, so unrelated columns
        such as ``status`` are left alone. The new name is the path segment
        directly after ``stat``.
        """
        old_new_cols = {
            name: name.split('.')[1]
            for name in self.df.columns
            if name.startswith('stat.')
        }
        self.df = self.df.rename(columns=old_new_cols)

    def my_train_test_split(self, test_size=0.3, random_state=None):
        """Split rows into train/test sets by gem name.

        The split is done on the distinct ``name`` values, so all rows of one
        gem land on the same side.

        Args:
            test_size: forwarded to ``train_test_split``.
            random_state: optional seed forwarded to ``train_test_split`` to
                make the split reproducible.

        Returns:
            ``(X_train, y_train, dftest)``: the training feature columns, the
            0/1 training labels (1 means ``label == 'maintained'``), and the
            complete test rows (all columns, labels included).
        """
        gems = self.df[["name"]].drop_duplicates()
        train_gems, test_gems = train_test_split(
            gems, test_size=test_size, random_state=random_state
        )
        train_rows = self.df[self.df['name'].isin(train_gems['name'])]
        y_train = (train_rows.label == 'maintained').values.astype(int)
        X_train = train_rows[self.features]
        dftest = self.df[self.df['name'].isin(test_gems['name'])]
        return X_train, y_train, dftest
In [70]:
# Path of the JSON export to analyse; RubygemLearner opens and parses it.
fn = 'RubygemDigger--Steps--GenerateJsonForLastVersions--1.data.json'
# Alternative input, presumably the all-versions export judging by its file name
# (TODO confirm); uncomment to use it instead of the file above.
#fn = 'RubygemDigger--Steps--GenerateJsonForAllVersions--1.data.json.all.json'
# Build the learner: loads the file, flattens the records and filters the rows.
learner = RubygemLearner(fn)
In [73]:
# Display the first 20 [score, feature] pairs of the score-sorted list.
learner.feature_evaluation()[:20]
Out[73]:
In [ ]: