In [ ]:
In [1]:
import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cPickle as pickle
from copy import deepcopy
from sklearn.utils import shuffle
import sklearn_mmadsen.graphs as skmg
%matplotlib inline
plt.style.use("fivethirtyeight")
sns.set()
In [2]:
all_graphs = pickle.load(open("train-sc-4-5-cont-graphs.pkl",'r'))
all_labels = pickle.load(open("train-sc-4-5-cont-labels.pkl",'r'))
The strategy, unlike our first attempt, requires a real train/test split in the dataset because we're going to fit an actual model (although a true LOO cross validation is still of course possible). But we need a train_test_split function which is able ot deal with lists of NetworkX objects.
In [ ]:
The goal here is to construct a standard training and test data matrix of numeric values, which will contain the sorted Laplacian eigenvalues of the graphs in each data set. One feature will thus represent the largest eigenvalue for each graph, a second feature will represent the second largest eigenvalue, and so on.
We do not necessarily assume that all of the graphs have the same number of vertices, although if there are marked differences, we would need to handle missing data for those graphs which had many fewer eigenvalues (or restrict our slice of the spectrum to the smallest number of eigenvalues present).
In [3]:
train_graphs, train_labels, test_graphs, test_labels = skmg.graph_train_test_split(all_graphs, all_labels, test_fraction=0.10)
print "train size: %s" % len(train_graphs)
print "test size: %s" % len(test_graphs)
In [ ]:
In [ ]:
We're going to be using a gradient boosted classifier, which has some of best accuracy of any of the standard classifier methods. Ultimately we'll figure out the best hyperparameters using cross-validation, but first we just want to see whether the approach gets us anywhere in the right ballpark -- remember, we can 80% accuracy with just eigenvalue distance, so we have to be in that neighborhood or higher to be worth the effort of switching to a more complex model.
In [4]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
In [5]:
train_matrix = skmg.graphs_to_eigenvalue_matrix(train_graphs, num_eigenvalues=10)
test_matrix = skmg.graphs_to_eigenvalue_matrix(test_graphs, num_eigenvalues=10)
In [6]:
clf = GradientBoostingClassifier(n_estimators = 250)
clf.fit(train_matrix, train_labels)
Out[6]:
In [7]:
pred_label = clf.predict(test_matrix)
In [8]:
cm = confusion_matrix(test_labels, pred_label)
cmdf = pd.DataFrame(cm)
cmdf.columns = map(lambda x: 'predicted {}'.format(x), cmdf.columns)
cmdf.index = map(lambda x: 'actual {}'.format(x), cmdf.index)
print cmdf
print classification_report(test_labels, pred_label)
print "Accuracy on test: %0.3f" % accuracy_score(test_labels, pred_label)
In [ ]:
In [9]:
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
In [10]:
pipeline = Pipeline([
('clf', GradientBoostingClassifier())
])
params = {
'clf__learning_rate': [5.0,2.0,1.0, 0.75, 0.5, 0.25, 0.1, 0.05, 0.01],
'clf__n_estimators': [10,25,50,100,250,500]
}
grid_search = GridSearchCV(pipeline, params, n_jobs = -1, verbose = 1)
In [11]:
grid_search.fit(train_matrix, train_labels)
Out[11]:
In [12]:
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters:")
best_params = grid_search.best_estimator_.get_params()
for param in sorted(params.keys()):
print("param: %s: %r" % (param, best_params[param]))
In [13]:
pred_label = grid_search.predict(test_matrix)
In [14]:
cm = confusion_matrix(test_labels, pred_label)
cmdf = pd.DataFrame(cm)
cmdf.columns = map(lambda x: 'predicted {}'.format(x), cmdf.columns)
cmdf.index = map(lambda x: 'actual {}'.format(x), cmdf.index)
print cmdf
print classification_report(test_labels, pred_label)
print "Accuracy on test: %0.3f" % accuracy_score(test_labels, pred_label)
In [ ]:
In [ ]: