Data downloaded from: https://data.cityofnewyork.us/Education/SAT-Results/f9bf-2cp4
In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn import cross_validation
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.cluster.hierarchical import AgglomerativeClustering
import matplotlib.pyplot as plt
import seaborn as sns
# Global plot appearance for every figure in this notebook:
# the "ticks" seaborn style and the "poster" context preset.
sns.set_style("ticks")
sns.set_context("poster")
In [2]:
# Load the raw SAT results table (source URL is given at the top of the notebook).
df = pd.read_csv(filepath_or_buffer="data/SAT_Results.csv")
In [14]:
# Keep only the rows where none of the four numeric fields holds the
# placeholder string "s"; copy so later assignments do not touch `df`.
df_filtered = df.loc[
    df["Num of SAT Test Takers"].ne("s")
    & df["SAT Critical Reading Avg. Score"].ne("s")
    & df["SAT Math Avg. Score"].ne("s")
    & df["SAT Writing Avg. Score"].ne("s")
].copy()
In [69]:
# Convert every column after the first two to floating point.
# The builtin `float` is used instead of the `np.float` alias, which is
# deprecated and removed in newer NumPy releases; the resulting dtype is the same.
df_filtered[df_filtered.columns[2:]] = df_filtered[df_filtered.columns[2:]].astype(float)
# Persist the cleaned table as a tab-separated file, without the index column.
df_filtered.to_csv("data/SAT_Results.filtered.txt", sep="\t", index=False)
In [54]:
# Show (rows, columns) of the filtered table.
df_filtered.shape
Out[54]:
In [55]:
# Preview the first rows of the filtered table.
df_filtered.head()
Out[55]:
In [56]:
class MultiColumnExtractor(TransformerMixin):
    """Pipeline step that selects a fixed set of columns from its input.

    Parameters
    ----------
    colnames
        Column label(s) used to index ``X`` in :meth:`transform`.
    """

    def __init__(self, colnames):
        # Wrap in a 1-tuple so that a tuple-valued ``colnames`` is formatted as
        # a single value instead of being unpacked into several ``%`` arguments
        # (which raises TypeError). This matches the formatting in ``transform``.
        # A parenthesised single-argument print behaves the same as a Python 2
        # statement and as a Python 3 function call.
        print("Initialized extractor for column %s" % (colnames,))
        self.colnames = colnames

    def get_feature_names(self):
        """Return the column labels this extractor selects."""
        return self.colnames

    def transform(self, X, **transform_params):
        """Return ``X[self.colnames]`` wrapped in a new ``pd.DataFrame``.

        Extra keyword arguments are accepted and ignored.
        """
        print("Extracting columns [%s]" % (self.colnames,))
        return pd.DataFrame(X[self.colnames])

    def fit(self, X, y=None, **fit_params):
        """No-op fit: nothing is learned; returns ``self`` for chaining."""
        return self
In [57]:
# Two-stage preprocessing: select every column after the first two,
# then pass the selection through a StandardScaler.
pipeline = Pipeline(steps=[
    ("columns", MultiColumnExtractor(colnames=df_filtered.columns[2:])),
    ("features_scaled", StandardScaler()),
])
In [65]:
# Agglomerative clustering estimator asked for 5 clusters; all other
# settings are left at the library defaults.
cluster = AgglomerativeClustering(n_clusters=5)
In [66]:
# Run the extractor + scaler over the filtered table to get the feature matrix.
X = pipeline.fit_transform(df_filtered)
print(X.shape)
# Assign a cluster id to every row of the feature matrix.
cids = cluster.fit_predict(X)
# `df_filtered` still carries the original row labels (with gaps where rows
# were filtered out). The scaled features and the cluster ids must reuse that
# index; otherwise `pd.concat(axis=1)` aligns on mismatched labels and
# produces misaligned, NaN-padded rows.
df_pred = pd.concat(
    (
        df_filtered[df_filtered.columns[:2]],
        pd.DataFrame(X, columns=df_filtered.columns[2:], index=df_filtered.index),
        pd.Series(cids, name="ClusterID", index=df_filtered.index),
    ),
    axis=1,
)
df_pred.head()
Out[66]:
In [68]:
cols = df_pred.columns
# x axis: first feature column; y axis: third feature column.
xi, yi = cols[2], cols[4]
# Marker areas must be non-negative, but the x column was standardized and is
# therefore centred on zero. Shift it so the smallest value maps to a small
# positive size before applying the original x100 scaling.
sizes = (df_pred[xi] - df_pred[xi].min() + 1) * 100
# Colour encodes the cluster id (cols[6]); size encodes the shifted x value.
plt.scatter(df_pred[xi], df_pred[yi], c=df_pred[cols[6]], s=sizes, cmap="Greys")
plt.xlabel(xi)
plt.ylabel(yi)
Out[68]:
In [ ]:
# NOTE(review): removed an unfinished scratch import
# (`from sklearn.cluster.hierarchical import p`). It named no real symbol and
# raised ImportError, breaking "Restart Kernel -> Run All".