Data Source: https://data.cityofnewyork.us/Education/School-Progress-Reports-All-Schools-2009-10/ffnc-f3aa
In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn import cross_validation
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.cluster.hierarchical import AgglomerativeClustering
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("ticks")
sns.set_context("poster")
In [2]:
# Load the 2009-10 School Progress Reports export; the path is relative to the
# notebook's working directory.
df = pd.read_csv("data/School_Progress_Reports_-_All_Schools_-_2009-10.csv")
In [3]:
# Report the (rows, columns) dimensions, then preview the first rows as the
# cell's displayed output.
print(df.shape)
df.head()
Out[3]:
In [50]:
# Keep a handle on the column labels and print them with their positions, so
# the positional slices used in this notebook can be looked up by eye.
cols = df.columns
print("\n".join("[%s] %s" % (position, label) for position, label in enumerate(cols)))

# Columns at positions 9, 11, 13 and 15.
print(cols[9:16:2])

# Replace every missing value with 0 (applies to all columns of the frame).
df = df.fillna(0)
In [51]:
class MultiColumnExtractor(TransformerMixin):
    """Pipeline step that selects a fixed set of columns from a DataFrame.

    Parameters
    ----------
    colnames : column label or list-like of column labels
        Stored as-is and used as the key in ``X[colnames]`` by ``transform``.
    """

    def __init__(self, colnames):
        # Wrap in a 1-tuple so `%` treats colnames as ONE value. A bare tuple
        # on the right of `%` is unpacked into separate format arguments, which
        # raises TypeError for multi-element tuples; transform() already
        # formats this way.
        print("Initialized extractor for column %s" % (colnames,))
        self.colnames = colnames

    def get_feature_names(self):
        """Return the column labels this step selects."""
        return self.colnames

    def transform(self, X, **transform_params):
        """Return ``X[self.colnames]`` wrapped in a DataFrame.

        Extra keyword arguments are accepted and ignored.
        """
        print("Extracting columns [%s]" % (self.colnames,))
        return pd.DataFrame(X[self.colnames])

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self
In [53]:
# Feature pipeline: pick the columns at positions 9, 11, 13 and 15 of `df`,
# then pass them through StandardScaler.
# The labels are taken from df.columns directly instead of the shared `cols`
# variable, so the selection does not depend on whatever `cols` happens to hold
# when this cell is (re-)run.
pipeline = Pipeline([
    ("columns", MultiColumnExtractor(df.columns[9:16:2])),
    ("features_scaled", StandardScaler()),
])
In [54]:
# Clustering model configured to produce two clusters (n_clusters=2); all other
# settings are left at the library defaults.
cluster = AgglomerativeClustering(n_clusters=2)
In [55]:
# Run the feature pipeline on the frame and assign every row a cluster id.
X = pipeline.fit_transform(df)
print(X.shape)
cids = cluster.fit_predict(X)

cols = df.columns

# Assemble: first seven original columns + scaled features + cluster id.
# pd.concat(axis=1) aligns rows by index label, so the two derived pieces are
# built on df's own index; otherwise they would get a fresh 0..n-1 index and
# misalign (producing NaNs) whenever df's index is not the default one.
df_pred = pd.concat(
    (
        df[cols[:7]],
        pd.DataFrame(X, columns=cols[9:16:2], index=df.index),
        pd.Series(cids, name="ClusterID", index=df.index),
    ),
    axis=1,
)
df_pred.head()
Out[55]:
In [80]:
# Use a dedicated name for df_pred's labels rather than rebinding `cols`, which
# other cells use for the columns of `df`.
pred_cols = df_pred.columns
print(df_pred.ClusterID.value_counts())
print("\n".join("[%s] %s" % k for k in enumerate(pred_cols)))

# x axis: column at position 6, y axis: column at position 1 of df_pred.
xi, yi = pred_cols[6], pred_cols[1]

# Colour by cluster id (looked up by label, not by position) and size each
# marker by the x column; the +1 keeps zero-valued rows visible.
cax = plt.scatter(
    df_pred[xi],
    df_pred[yi],
    c=df_pred["ClusterID"],
    s=(1 + df_pred[xi]),
    cmap="Set1",
)
cbar = plt.colorbar(cax, ticks=[0, 1], orientation='horizontal')
cbar.ax.set_xticklabels(['Cluster 0', 'Cluster 1'])  # horizontal colorbar
plt.xlabel(xi)
plt.ylabel(yi)
Out[80]:
In [ ]: