In [1]:
%matplotlib inline
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn import model_selection  # cross_validation was removed from scikit-learn
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
In [2]:
plt.style.use('ggplot')
In [3]:
def tdm(words_unique, words_questions):
    """Create a term-document matrix.

    Return the m (unique words, sorted) by n (documents) matrix, M.
    Each document in words_questions should be a list of word tokens.
    A single question may also be passed as a whitespace-delimited
    string, in which case its term-count vector is returned.
    """
    if isinstance(words_questions, str):
        words = words_questions.split()
        return np.array([words.count(term) for term in words_unique],
                        dtype=float)
    M = np.zeros([len(words_unique), len(words_questions)])
    for m, term in enumerate(words_unique):
        for n, doc in enumerate(words_questions):
            M[m, n] = doc.count(term)
    return M
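A quick sanity check on a toy corpus (hypothetical data, not part of the training set):

toy_vocab = ['cat', 'dog', 'fish']
toy_docs = [['cat', 'cat', 'dog'], ['dog', 'fish']]
tdm(toy_vocab, toy_docs)
# array([[ 2.,  0.],
#        [ 1.,  1.],
#        [ 0.,  1.]])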
In [4]:
def flatten_words(list1d, get_unique=False):
    """Split each string in list1d on whitespace and flatten the result.

    With get_unique=True, return the sorted unique words instead.
    """
    qa = [s.split() for s in list1d]
    if get_unique:
        return sorted(set(w for sent in qa for w in sent))
    return [w for sent in qa for w in sent]
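For example, on two made-up questions:

flatten_words(['what is a cat', 'is it a pet'])
# ['what', 'is', 'a', 'cat', 'is', 'it', 'a', 'pet']
flatten_words(['what is a cat', 'is it a pet'], get_unique=True)
# ['a', 'cat', 'is', 'it', 'pet', 'what']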
In [5]:
def jitter(values, sd=0.25):
    """Add Gaussian noise to each value (useful for scatter plots of integer labels)."""
    return [np.random.normal(v, sd) for v in values]
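For instance (output varies with the random draw):

jitter([1, 1, 2])
# e.g. [1.13, 0.87, 2.21]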
In [6]:
df = pd.read_csv('../data/newtrain.csv')
In [7]:
df.head()
Out[7]:
Remove non-alphanumeric characters and extra whitespace.
In [8]:
df['text_clean'] = df['Text'].apply(lambda x: re.sub(r'[^A-Za-z0-9]+', ' ', x.lower()))\
                             .apply(lambda x: re.sub(r'\s+', ' ', x).strip())
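For instance, on a made-up question string:

s = "What's  the best-kept secret?"
re.sub(r'\s+', ' ', re.sub(r'[^A-Za-z0-9]+', ' ', s.lower())).strip()
# 'what s the best kept secret'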
Get a list of all unique words, then create a term-document matrix with one column per category.
In [9]:
words_unique_all = flatten_words(df.text_clean.values, get_unique=True)
In [10]:
words_unique_all[-5:]
Out[10]:
Construct a list of lists: one list of questions per class.
In [11]:
df_ql = df.copy()
In [12]:
df_ql = df_ql[['Category', 'text_clean']]
In [13]:
df_ql['all_questions'] = df_ql.apply(
    lambda row: df.groupby('Category').get_group(row['Category'])['text_clean'].tolist(),
    axis=1)
In [14]:
df_ql.drop_duplicates(subset='Category', inplace=True)
df_ql.sort_values(by='Category', inplace=True)
df_ql.reset_index(drop=True, inplace=True)
In [15]:
df_ql = df_ql[['Category', 'all_questions']]
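The same per-category question lists can also be built in a single step with groupby; an equivalent sketch (not a cell from the original run):

questions_by_category = df.groupby('Category')['text_clean'].apply(list)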
In [16]:
df_ql.all_questions
Out[16]:
In [17]:
documents = []
for label in np.sort(df_ql.Category.unique()):
    tdf = df_ql[df_ql['Category'] == label]
    documents.append(flatten_words(tdf['all_questions'].values[0]))
In [18]:
M = tdm(words_unique_all, documents)
In [19]:
M.sum(axis=0)
Out[19]:
Normalize by document length: divide each column by its total word count so the columns sum to one.
In [20]:
M_norm = M / M.sum(axis=0)
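A quick check that the normalization worked:

M_norm.sum(axis=0)  # expect an array of ones, one entry per category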
Testing: compare a single question (represented as a vector of word counts) against the training-data matrix.
In [21]:
individual_question = df_ql[df_ql['Category'] == 1]['all_questions'].values[0][10]
In [22]:
M_test = tdm(words_unique_all, individual_question)
In [23]:
M_test_norm = M_test / M_test.sum()
In [24]:
results = M_norm.T.dot(M_test_norm)
results
Out[24]:
In [25]:
plt.plot(range(1, len(results)+1), results)
Out[25]:
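Under this scheme, the predicted label is the category whose column scores highest; a minimal sketch (not a cell from the original run), assuming documents was built in sorted-label order as above:

labels = np.sort(df_ql.Category.unique())
predicted_label = labels[np.argmax(results)]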
What should the matrix look like? All unique words (rows) by category (columns), with counts in each cell?
In [44]:
example_matrix = np.array([[1, 2, 3], [2, 5, 1], [3, 3, 7], [2, 2, 2], [1, 1, 4]])
In [45]:
example_matrix
Out[45]:
In [46]:
example_question = np.array([3, 2, 6, 4, 2])
In [47]:
example_matrix.shape, example_question.shape
Out[47]:
In [52]:
example_matrix.T
Out[52]:
In [53]:
example_question
Out[53]:
In [51]:
example_matrix.T.dot(example_question)
Out[51]:
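Each entry of the product is the dot product of the question vector with one category column. For the first column: 1*3 + 2*2 + 3*6 + 2*4 + 1*2 = 35, and likewise 44 and 69 for the other two, so the result is array([35, 44, 69]) and this toy question aligns best with the third category.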