In [1]:
%matplotlib inline
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn import model_selection  # cross_validation was removed from scikit-learn
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
In [2]:
plt.style.use('ggplot')
In [3]:
def tdm(words_unique, words_questions):
    """Create a term-document matrix.

    Return the m (unique words, sorted) by n (documents) matrix, M.
    Each document in words_questions should be a list of word tokens.
    A single question may also be passed as a whitespace-delimited
    string, in which case its term-count vector is returned.
    """
    if isinstance(words_questions, str):
        words = words_questions.split()
        return np.array([words.count(term) for term in words_unique],
                        dtype=float)
    M = np.zeros([len(words_unique), len(words_questions)])
    for m, term in enumerate(words_unique):
        for n, doc in enumerate(words_questions):
            M[m, n] = doc.count(term)
    return M
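A quick sanity check on a toy corpus (hypothetical data, not part of the training set):

toy_vocab = ['cat', 'dog', 'fish']
toy_docs = [['cat', 'cat', 'dog'], ['dog', 'fish']]
tdm(toy_vocab, toy_docs)
# array([[ 2.,  0.],
#        [ 1.,  1.],
#        [ 0.,  1.]])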
In [4]:
def flatten_words(list1d, get_unique=False):
    """Split each string in list1d on whitespace and flatten the result.

    With get_unique=True, return the sorted unique words instead.
    """
    qa = [s.split() for s in list1d]
    if get_unique:
        return sorted(set(w for sent in qa for w in sent))
    return [w for sent in qa for w in sent]
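For example, on two made-up questions:

flatten_words(['what is a cat', 'is it a pet'])
# ['what', 'is', 'a', 'cat', 'is', 'it', 'a', 'pet']
flatten_words(['what is a cat', 'is it a pet'], get_unique=True)
# ['a', 'cat', 'is', 'it', 'pet', 'what']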
In [5]:
def jitter(values, sd=0.25):
    """Add Gaussian noise to each value (useful for scatter plots of integer labels)."""
    return [np.random.normal(v, sd) for v in values]
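For instance (output varies with the random draw):

jitter([1, 1, 2])
# e.g. [1.13, 0.87, 2.21]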
In [6]:
df = pd.read_csv('../data/newtrain.csv')
In [7]:
df.head()
Out[7]:
Remove non-alphanumeric characters and extra whitespace.
In [8]:
df['text_clean'] = df['Text'].apply(lambda x: re.sub(r'[^A-Za-z0-9]+', ' ', x.lower()))\
                             .apply(lambda x: re.sub(r'\s+', ' ', x).strip())
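For instance, on a made-up question string:

s = "What's  the best-kept secret?"
re.sub(r'\s+', ' ', re.sub(r'[^A-Za-z0-9]+', ' ', s.lower())).strip()
# 'what s the best kept secret'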
Get a list of all unique words, then create a term-document matrix with one column per category.
In [9]:
words_unique_all = flatten_words(df.text_clean.values, get_unique=True)
In [10]:
words_unique_all[-5:]
Out[10]:
Construct a list of lists: one list of questions per class.
In [11]:
df_ql = df.copy()
In [12]:
df_ql = df_ql[['Category', 'text_clean']]
In [13]:
df_ql['all_questions'] = df_ql.apply(
    lambda row: df.groupby('Category').get_group(row['Category'])['text_clean'].tolist(),
    axis=1)
In [14]:
df_ql.drop_duplicates(subset='Category', inplace=True)
df_ql.sort_values(by='Category', inplace=True)
df_ql.reset_index(drop=True, inplace=True)
In [15]:
df_ql = df_ql[['Category', 'all_questions']]
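The same per-category question lists can also be built in a single step with groupby; an equivalent sketch (not a cell from the original run):

questions_by_category = df.groupby('Category')['text_clean'].apply(list)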
In [16]:
df_ql.all_questions
Out[16]:
In [17]:
documents = []
for label in np.sort(df_ql.Category.unique()):
    tdf = df_ql[df_ql['Category'] == label]
    documents.append(flatten_words(tdf['all_questions'].values[0]))
In [18]:
M = tdm(words_unique_all, documents)
In [19]:
M.sum(axis=0)
Out[19]:
Normalize by document length: divide each column by its total word count so the columns sum to one.
In [20]:
M_norm = M / M.sum(axis=0)
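A quick check that the normalization worked:

M_norm.sum(axis=0)  # expect an array of ones, one entry per category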
Testing: compare a single question (represented as a vector of word counts) against the training-data matrix.
In [21]:
individual_question = df_ql[df_ql['Category'] == 1]['all_questions'].values[0][10]
In [22]:
M_test = tdm(words_unique_all, individual_question)
In [23]:
M_test_norm = M_test / M_test.sum()
In [24]:
results = M_norm.T.dot(M_test_norm)
results
Out[24]:
In [25]:
plt.plot(range(1, len(results)+1), results)
Out[25]:
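Under this scheme, the predicted label is the category whose column scores highest; a minimal sketch (not a cell from the original run), assuming documents was built in sorted-label order as above:

labels = np.sort(df_ql.Category.unique())
predicted_label = labels[np.argmax(results)]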
What should the matrix look like? All unique words (rows) by category (columns), with counts in each cell?
In [44]:
example_matrix = np.array([[1, 2, 3], [2, 5, 1], [3, 3, 7], [2, 2, 2], [1, 1, 4]])
In [45]:
example_matrix
Out[45]:
In [46]:
example_question = np.array([3, 2, 6, 4, 2])
In [47]:
example_matrix.shape, example_question.shape
Out[47]:
In [52]:
example_matrix.T
Out[52]:
In [53]:
example_question
Out[53]:
In [51]:
example_matrix.T.dot(example_question)
Out[51]:
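Each entry of the product is the dot product of the question vector with one category column. For the first column: 1*3 + 2*2 + 3*6 + 2*4 + 1*2 = 35, and likewise 44 and 69 for the other two, so the result is array([35, 44, 69]) and this toy question aligns best with the third category.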