This notebook explores potential correlations between the features of our UK schools dataset, then performs agglomerative clustering and saves the resulting cluster labels to disk for further visualisation.
In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.cluster import KMeans, AgglomerativeClustering
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
Load the UK schools dataset produced by cleaning.ipynb, then preview its first 5 rows and list its columns.
In [62]:
# Read the derived 2016-2017 England schools CSV into a DataFrame.
schools = pd.read_csv(
    filepath_or_buffer='/project/uk-schools-clustering/data/derived/2016-2017_england.csv',
)
In [76]:
# Only the last expression of a cell is rendered automatically, so the 5-row
# preview has to be displayed explicitly or it is silently discarded.
# `display` is the built-in injected by the IPython/Jupyter kernel.
display(schools.head(5))
# Last expression: the full list of column names.
schools.columns.tolist()
Out[76]:
In [64]:
# Summary statistics for the columns of the loaded dataset.
schools_summary = schools.describe()
schools_summary
Out[64]:
In [65]:
# Number of trailing columns of `schools` used as the feature matrix.
# NOTE(review): assumes these trailing columns are all numeric features - confirm against cleaning.ipynb.
n_feature_columns = 19

# Feature matrix: the trailing columns of `schools`, cast to float.
X = schools.iloc[:, -n_feature_columns:].values.astype(float)

# Column names matching X column-for-column, so positions looked up in
# `header` can be used directly to index into X.
header = schools.columns[-n_feature_columns:]
In [66]:
# Heat map of the pairwise Pearson correlation coefficients between the columns of X
# (rowvar=0: every column is a variable, every row an observation).
correlationMatrix = np.corrcoef(X, rowvar=0)

# Derive the tick layout from the matrix itself rather than a hard-coded count,
# so labels stay in sync with the number of columns in X.
n_features = correlationMatrix.shape[0]
tick_positions = np.arange(0.5, n_features)  # +0.5 centres each tick on its cell
tick_labels = list(range(n_features))

fig, ax = plt.subplots(figsize=(12, 8))
mesh = ax.pcolor(correlationMatrix, cmap='hot', vmin=-1, vmax=1)
fig.colorbar(mesh, ax=ax)
ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_labels)
ax.set_yticks(tick_positions)
ax.set_yticklabels(tick_labels)
ax.set_xlabel('feature index (column position in X)')
ax.set_ylabel('feature index (column position in X)')
ax.set_title('Feature correlation matrix')
plt.show()
In [67]:
# Standardise the feature matrix; the fitted scaler is kept in `scaler`
# and the transformed data in `X_scaled`.
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X)
In [68]:
# Names of the last 19 columns of `schools`, i.e. the columns X was built from.
header = schools.iloc[:, -19:].columns
header
Out[68]:
In [69]:
# The three columns plotted on the x, y and z axes of the 3D cluster scatter.
features = [
    'idaci score',
    'on free meal',
    'english not first language',
]
In [70]:
# Agglomerative clustering of the standardised features into two clusters
# (average linkage, cosine distance), followed by a 3D scatter of the three
# columns named in `features`, coloured by cluster label.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
# switch the keyword when upgrading - confirm against the installed version.
estimator = AgglomerativeClustering(n_clusters=2, linkage='average', affinity='cosine')
estimator.fit(X_scaled)
labels = estimator.labels_

# Positions of the plotted columns inside X_scaled (header and X share column order).
header_names = header.tolist()
x_index = header_names.index(features[0])
y_index = header_names.index(features[1])
z_index = header_names.index(features[2])

fig = plt.figure(1, figsize=(8, 7))
plt.clf()  # figure number 1 is reused when the cell is re-run, so start from a clean figure
# Create the 3D axes through the figure: constructing Axes3D(fig, ...) directly
# no longer attaches the axes to the figure on current Matplotlib, and a
# following plt.cla() would then create and clear a stray 2D axes instead.
ax = fig.add_axes([0, 0, .95, 1], projection='3d')
ax.view_init(elev=28, azim=134)

# Builtin `float` replaces the `np.float` alias, which was removed from NumPy.
ax.scatter(
    X_scaled[:, x_index],
    X_scaled[:, y_index],
    X_scaled[:, z_index],
    c=labels.astype(float),
)

# Hide tick labels: the standardised values carry no directly readable units.
# `xaxis`/`yaxis`/`zaxis` replace the removed `w_xaxis`/`w_yaxis`/`w_zaxis` aliases.
for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
    axis.set_ticklabels([])

ax.set_xlabel(features[0])
ax.set_ylabel(features[1])
ax.set_zlabel(features[2]);
Out[70]:
In [71]:
# Build a DataFrame of the (unscaled) features plus a trailing integer
# 'cluster' column holding each row's cluster label.
# column_stack always appends the labels after the last feature column,
# instead of relying on a hard-coded insert position matching X's width.
X_with_labels = np.column_stack((X, labels))
column_names = header.tolist() + ['cluster']
clustered_schools = pd.DataFrame(X_with_labels, columns=column_names)
# Stacking with the float matrix turned the labels into floats; restore ints.
clustered_schools['cluster'] = clustered_schools.cluster.astype(int)
clustered_schools
Out[71]:
In [72]:
# Prepend the identifying columns so the final column order is: urn, name, features..., cluster.
# Each insert goes to position 0, hence 'name' first and 'urn' second.
# The membership guard makes the cell safe to re-run: DataFrame.insert raises
# ValueError when the column already exists.
for id_column in ('name', 'urn'):
    if id_column not in clustered_schools.columns:
        clustered_schools.insert(loc=0, column=id_column, value=schools[id_column])
In [73]:
# Persist the labelled schools for later visualisation; the index is not written.
clustered_schools.to_csv(
    path_or_buf='/project/uk-schools-clustering/data/derived/2016-2017_england_clusters.csv',
    index=False,
)
In [ ]: