In [1]:
%matplotlib inline
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
In [2]:
# Apply matplotlib's 'ggplot' style sheet to every figure drawn after this cell.
plt.style.use('ggplot')
In [3]:
def tdm(words_unique, words_questions):
    """Build a term-document count matrix.

    Parameters
    ----------
    words_unique : sequence
        Vocabulary terms; row ``i`` of the result belongs to term ``i``.
    words_questions : sequence
        Documents; column ``j`` of the result belongs to document ``j``.
        Each document must provide ``.count(term)``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(words_unique), len(words_questions))``
        whose ``[i, j]`` entry is ``words_questions[j].count(words_unique[i])``.
    """
    counts = np.zeros((len(words_unique), len(words_questions)))
    for row, term in enumerate(words_unique):
        # Fill one vocabulary row at a time: one count per document.
        counts[row, :] = [doc.count(term) for doc in words_questions]
    return counts
In [4]:
def jitter(values, sd=0.25):
    """Return a noisy copy of ``values`` as a list.

    Each element ``v`` is replaced by one draw from a normal distribution
    with mean ``v`` and standard deviation ``sd``, taken from NumPy's global
    random state. Output order matches the iteration order of ``values``.
    """
    noisy = []
    for center in values:
        noisy.append(np.random.normal(center, sd))
    return noisy
In [5]:
# Load the data set; later cells read its `Text` and `Category` columns.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv('../data/newtrain.csv')
In [6]:
# Preview the first rows to sanity-check the load.
df.head()
Out[6]:
In [7]:
# Number of rows per distinct Category value.
df.Category.value_counts()
Out[7]:
In [8]:
# Bar chart of how many rows fall into each Category value.
category_counts = df['Category'].value_counts()
category_counts.plot(kind='bar', color='#348ABD')
Out[8]:
Lowercase the text, replace every run of non-alphanumeric characters with a single space, and trim extra whitespace.
In [9]:
# Normalise each question: lowercase it, turn every run of characters outside
# [A-Za-z0-9] into one space, then collapse whitespace and trim both ends.
# The patterns are raw strings so that `\s` reaches the regex engine as-is
# instead of being parsed as an (invalid) string escape, which Python warns on.
df['text_clean'] = (
    df['Text']
    .apply(lambda x: re.sub(r'[^A-Za-z0-9]+', ' ', x.lower()))
    .apply(lambda x: re.sub(r'\s+', ' ', x).strip())
)
In [10]:
# Tokenise: one list of whitespace-separated words per cleaned question,
# in the same order as the rows of `df`.
words_questions = []
for sentence in df['text_clean'].values:
    words_questions.append(sentence.split())
In [11]:
# Vocabulary: every distinct word across all questions, in sorted order.
vocabulary = set()
for sent in words_questions:
    vocabulary.update(sent)
words_unique = sorted(vocabulary)
In [12]:
# tdm() returns a terms-by-documents matrix; transpose it so each row is one
# question and each column is one vocabulary word.
X = tdm(words_unique, words_questions).T
In [13]:
# Target labels: the Category column as an array, in the same row order as X.
y = df.Category.values
In [14]:
# Cluster the questions on their word counts alone; `y` is not used here.
# NOTE(review): n_clusters=7 presumably mirrors the number of distinct
# Category values — confirm against the value_counts() output.
# random_state is fixed so reruns start from the same centroid initialisation.
kmeans = KMeans(n_clusters=7, random_state=42)
kmeans.fit(X)
Out[14]:
In [15]:
# Keep each question's assigned cluster index next to its Category in `df`.
df['k_means'] = kmeans.labels_
In [16]:
# Compare true categories with k-means cluster assignments. Both coordinates
# are jittered so that points sharing the same (category, cluster) pair spread
# out instead of stacking on one spot.
truth_jittered = jitter(df.Category)
cluster_jittered = jitter(df.k_means)

plt.figure(figsize=(8, 6))
plt.scatter(truth_jittered, cluster_jittered, color='#348ABD', alpha=0.25)
plt.xlabel('Ground Truth')
plt.ylabel('k-means Cluster')
Out[16]:
k-means assigns most questions to cluster #2.
Next, compare supervised classifiers: logistic regression, random forest, and naive Bayes (Gaussian and multinomial).
In [17]:
# Seed NumPy's global RNG before building the models.
# NOTE(review): none of the estimators below is given its own random_state, so
# this global seed is presumably what keeps the random forest repeatable —
# confirm, and consider passing random_state explicitly.
np.random.seed(123)
lr = LogisticRegression()
rf = RandomForestClassifier()
nb = GaussianNB()
mb = MultinomialNB()
In [18]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `cross_val_score` now lives in `sklearn.model_selection`. Import it
# from there, falling back to the old module on pre-0.18 installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# 7-fold cross-validated accuracy for each classifier on the word-count matrix.
for clf, label in zip([lr, rf, nb, mb], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'multinomial NB']):
    scores = cross_val_score(clf, X, y, cv=7, scoring='accuracy')
    # Reported as mean accuracy with one standard deviation across the folds.
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
In [ ]: