In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the tagged talk-proposal data.
# NOTE(review): the file has a .csv extension but was read with
# pd.read_table, whose default separator is a tab. pd.read_table is
# deprecated, so keep the tab separator explicit with read_csv —
# confirm the file really is tab-delimited.
df = pd.read_csv("tagged.csv", sep="\t")
In [3]:
# Peek at the first rows to sanity-check the load.
df.head()
Out[3]:
In [4]:
# List the available columns to pick feature fields from.
df.columns
Out[4]:
In [5]:
# One-hot encode each categorical column into a dense indicator matrix.
categorical_cols = ["section", "target_audience", "type"]
categoricals = [pd.get_dummies(df[col]).values for col in categorical_cols]
In [6]:
# Shapes of the one-hot blocks (row counts should match across all of them).
[x.shape for x in categoricals]
Out[6]:
In [7]:
import numpy as np
In [8]:
# Stack the one-hot blocks column-wise into a single categorical matrix.
xCat = np.hstack(categoricals)
In [9]:
# Sanity-check the combined categorical matrix dimensions.
xCat.shape
Out[9]:
In [10]:
# Boolean indicator columns, cast to 0/1 integers.
xBool = df['speaker_link_present content_url_present'.split()].astype(int)
In [11]:
# Gather the free-text fields; missing values become a single space so the
# row-wise join below never sees NaN. Chained fillna returns a new frame,
# so the explicit .copy() and hidden-state-prone inplace=True are dropped
# (re-running this cell is now idempotent).
xText = df['title speaker_info prerequisites description'.split()].fillna(" ")
In [12]:
# Collapse the text columns into one space-joined document per row.
# " ".join can be passed directly — no lambda wrapper needed.
xText = xText.apply(" ".join, axis=1)
In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [14]:
# TF-IDF encode the joined documents (result is a scipy sparse matrix).
xText = TfidfVectorizer().fit_transform(xText.values)
In [15]:
# Numeric features: comment count and days-to-deadline difference.
xNum = df['n_comments deadlinediff'.split()].values
In [16]:
# NOTE(review): .toarray() densifies the TF-IDF matrix; fine for a small
# corpus, but for large vocabularies prefer scipy.sparse.hstack instead.
feats = [xCat, xBool.values, xText.toarray(), xNum]
In [17]:
# Final design matrix and binary target (1 = proposal selected).
X = np.hstack(feats)
y = df['selected'].astype(int).values
In [18]:
# Export the training data for external tooling (MATLAB/Octave format).
from scipy.io import savemat
savemat("trainData.mat", dict(X=X, y=y))
In [19]:
from sklearn.preprocessing import StandardScaler
In [20]:
# NOTE(review): xScaled is computed but never used below — the CV calls run
# on the unscaled X. Either pass xScaled to the models or drop this cell.
xScaled = StandardScaler().fit_transform(X)
In [21]:
from sklearn.linear_model import LogisticRegression
from jeeves.cross_validation import report_stratifiedKfold_cv
In [22]:
# Baseline: logistic regression with stratified k-fold CV (jeeves helper).
report_stratifiedKfold_cv(LogisticRegression(), X, y, shuffle=True)
Out[22]:
In [23]:
from sklearn.svm import SVC
In [24]:
# SVC baseline on the same (unscaled) full feature matrix.
report_stratifiedKfold_cv(SVC(), X, y, shuffle=True)
Out[24]:
In [25]:
# Rebuild a feature set that excludes the TF-IDF text block.
x_non_text = [xCat, xBool.values, xNum]
In [26]:
# Stack the non-text feature blocks into a single matrix.
x_non_text = np.hstack(x_non_text)
In [27]:
# Standardize (zero mean, unit variance per column) for PCA / linear models.
x_non_text_scaled = StandardScaler().fit_transform(x_non_text)
In [28]:
# Project the non-text features to 2-D for visualisation.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
In [29]:
# 2-D PCA projection coloured by the selection label, to eyeball class
# separability in the non-text features. Use the explicit fig/ax API and
# label the axes so the figure stands alone when the notebook is skimmed.
x_red = pca.fit_transform(x_non_text_scaled)
fig, ax = plt.subplots(figsize=(10, 8))
points = ax.scatter(x_red[:, 0], x_red[:, 1], c=y)
ax.set(title="PCA of non-text features", xlabel="PC 1", ylabel="PC 2")
fig.colorbar(points, ax=ax, label="selected")
Out[29]:
In [30]:
# Logistic regression on scaled non-text features only.
report_stratifiedKfold_cv(LogisticRegression(), x_non_text_scaled, y, shuffle=True)
Out[30]:
In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
In [32]:
# Random forest on non-text features, 10-fold stratified CV.
report_stratifiedKfold_cv(RandomForestClassifier(), x_non_text, y, shuffle=True, n_folds=10)
Out[32]:
In [33]:
# Single decision tree on the same features, for comparison.
report_stratifiedKfold_cv(DecisionTreeClassifier(), x_non_text, y, shuffle=True, n_folds=10)
Out[33]:
In [34]:
# Compare mean CV score for the random forest with and without text features.
# The bare `print x` statement is Python-2-only (a SyntaxError on Python 3);
# the print() call form below works on both.
print(report_stratifiedKfold_cv(RandomForestClassifier(), x_non_text, y, shuffle=True, n_folds=10).mean())
print(report_stratifiedKfold_cv(RandomForestClassifier(), X, y, shuffle=True, n_folds=10).mean())
In [35]:
# Same comparison for a single decision tree.
# Converted Python-2 `print` statements to print() calls (valid on 2 and 3).
print(report_stratifiedKfold_cv(DecisionTreeClassifier(), x_non_text, y, shuffle=True, n_folds=10).mean())
print(report_stratifiedKfold_cv(DecisionTreeClassifier(), X, y, shuffle=True, n_folds=10).mean())
In [36]:
# Grid-search decision-tree hyperparameters.
# NOTE(review): sklearn.grid_search was deprecated in 0.18 and removed in
# 0.20 — GridSearchCV lives in sklearn.model_selection (same interface).
from sklearn.model_selection import GridSearchCV
param_grid = dict(
    criterion="gini entropy".split(), splitter="best random".split(),
    max_features=[None, "auto", "sqrt", "log2"],
    class_weight=["balanced", None],
    # NOTE(review): `presort` was removed from DecisionTreeClassifier in
    # sklearn 0.24 — drop it from the grid on newer versions.
    presort=[True, False]
)
clf = DecisionTreeClassifier()
# NOTE(review): recall_score is imported but unused — scoring below is
# "accuracy". Either score with recall (scoring="recall") or drop the import.
from sklearn.metrics import recall_score
gcv = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1, scoring="accuracy")
In [37]:
# Run the grid search (n_jobs=-1 uses all available cores).
gcv.fit(x_non_text, y)
Out[37]:
In [39]:
# Best hyperparameter combination found by the search.
gcv.best_params_
Out[39]:
In [40]:
# Adopt the refit best estimator for further evaluation.
clf = gcv.best_estimator_
In [42]:
# Re-check the tuned tree with 10-fold stratified CV.
# NOTE(review): earlier CV calls pass shuffle=True; this one does not —
# confirm whether the difference is intentional.
report_stratifiedKfold_cv(clf, x_non_text, y, n_folds=10)
Out[42]:
In [43]:
from sklearn.tree import export_graphviz
In [44]:
# Dump the fitted tree to Graphviz .dot for offline rendering
# (e.g. `dot -Tpng cfp_classifier.dot -o cfp_classifier.png`).
with open("cfp_classifier.dot", "w") as f_out:
    export_graphviz(clf, out_file=f_out)
In [ ]:
# Render the tuned tree inline in the notebook.
# NOTE(review): this cell was pasted from the sklearn iris example and
# referenced names undefined here (StringIO, pydot were never imported;
# iris.feature_names / iris.target_names belong to a different dataset).
# Fixed to import what it uses and omit the iris-specific names — node
# labels fall back to feature indices. Supply the real feature/class names
# for this dataset if nicer labels are needed.
from io import StringIO
from IPython.display import Image
import pydot

dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True)
# pydot.graph_from_dot_data returns a list of graphs in pydot >= 1.2;
# take the first (and only) one.
graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]
Image(graph.create_png())