Imports



In [1]:

    
%matplotlib inline

import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [2]:

    
# Switch matplotlib to its 'ggplot' style sheet for figures drawn after this cell.
plt.style.use('ggplot')

Functions



In [3]:

    
def jitter(values, sd=0.25):
    """Return a randomly perturbed copy of ``values`` as a list.

    Every element is replaced by a single draw from a normal distribution
    centred on that element with standard deviation ``sd``.  The draws are
    taken one element at a time, in input order, from NumPy's global random
    state.
    """
    noisy = []
    for centre in values:
        noisy.append(np.random.normal(centre, sd))
    return noisy



In [4]:

    
def clean_text(df, col):
    """Normalise the text in ``df[col]`` and return it as a new Series.

    Each value is lower-cased, every run of characters other than ASCII
    letters and digits is treated as a word separator, and each remaining
    word is reduced to its Porter stem.  The stems are re-joined with single
    spaces, so the result has no leading, trailing or repeated whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text.  It is only read, never modified.
    col : str
        Name of the column to clean.  Values must be strings; a non-string
        value (e.g. NaN) raises ``AttributeError`` when ``.lower()`` is called.

    Returns
    -------
    pandas.Series
        The cleaned text, carrying the same index as ``df[col]``.
    """
    porter_stemmer = PorterStemmer()
    # Raw string, compiled once and reused for every row.
    non_alnum = re.compile(r'[^A-Za-z0-9]+')

    def _normalise(text):
        # Whitespace is itself non-alphanumeric, so this one substitution
        # collapses punctuation and whitespace runs alike; split() then
        # discards any leading/trailing blank left behind.
        words = non_alnum.sub(' ', text.lower()).split()
        return ' '.join(porter_stemmer.stem(word) for word in words)

    return df[col].apply(_normalise)



In [5]:

    
def flatten_words(list1d, get_unique=False):
    """Split every string in ``list1d`` on whitespace and pool the tokens.

    By default the tokens are returned as one flat list in order of
    appearance, duplicates included.  With ``get_unique=True`` the distinct
    tokens are returned instead, sorted in ascending order.
    """
    tokens = []
    for sentence in list1d:
        tokens.extend(sentence.split())
    if not get_unique:
        return tokens
    return sorted(set(tokens))

Data

Load



In [6]:

    
# Labelled training questions; the preview below shows a numeric `Category`
# label and the raw question `Text`.
training = pd.read_csv('../data/newtrain.csv')



In [7]:

    
# Preview the first rows to check which columns were loaded.
training.head()









    Out[7]:






  
    
      
      Category
      Text
    
  
  
    
      0
      5
      why are yawns contagious? when people yawn
    
    
      1
      6
      what is trans fat? how to reduce that? i heard...
    
    
      2
      1
      roth ira vs 401k? what is the difference betwe...
    
    
      3
      1
      how many planes fedex has? i heard that it is ...
    
    
      4
      2
      what is the best photo slideshow creation appl...



In [8]:

    
# Questions to classify; later cells read its `Text` and `Id` columns.
test = pd.read_csv('../data/newtest.csv')

Clean

Lowercase the text, replace non-alphanumeric characters and extra whitespace with single spaces, and stem each word.



In [9]:

    
# Add a normalised copy of each question (lower-cased, alphanumeric-only,
# stemmed); the raw `Text` column is left untouched.
training['text_clean'] = clean_text(training, 'Text')



In [10]:

    
# Apply the same normalisation to the test questions so both splits are
# tokenised identically.
test['text_clean'] = clean_text(test, 'Text')

Get a list of questions, by category.



In [11]:

    
# Work on a copy so the reshaping in the following cells does not alter `training`.
df_ql = training.copy()



In [12]:

    
# Keep only the label and the cleaned text; the raw `Text` column is not used
# for the grouping below.
df_ql = df_ql[['Category', 'text_clean']]



In [13]:

    
# Collect each category's cleaned questions once, instead of rebuilding the
# groupby for every row.  Within a group the questions keep their row order.
questions_by_category = df_ql.groupby('Category')['text_clean'].apply(list)

# Attach the full question list to every row, then keep a single row per
# category, ordered by category label with a fresh 0..k-1 index.
# DataFrame.sort() has been removed from pandas; sort_values() replaces it.
df_ql = (
    df_ql.assign(all_questions=df_ql['Category'].map(questions_by_category))
         .drop_duplicates(subset='Category')
         .sort_values(by='Category')
         .reset_index(drop=True)
)



In [14]:

    
# Drop `text_clean` (after de-duplication it holds just one question per
# category) and keep each label with its list of questions.
df_ql = df_ql[['Category', 'all_questions']]



In [15]:

    
# Inspect the grouped questions: one row per category, each holding a list of
# cleaned questions.
df_ql.all_questions









    Out[15]:





0    [roth ira vs 401k what is the differ between r...
1    [what is the best photo slideshow creation app...
2    [what wa the first dvd you ever bought what wa...
3    [what is the reason for the increas divorc per...
4    [whi are yawn contagi when peopl yawn, who sai...
5    [what is tran fat how to reduc that i heard th...
6    [what is an imaginari number what is an imagin...
Name: all_questions, dtype: object

Create a list of strings, one per category, where each string is the concatenation of all $n$ questions in that category.



In [16]:

    
# One document per category: all of its cleaned questions joined into a single
# space-separated string.  Iterating the column's values keeps the documents in
# df_ql's row order without relying on the index labels being exactly 0..k-1.
documents = [' '.join(questions) for questions in df_ql['all_questions']]

Classification

All Vocabulary



In [17]:

    
# Pool the cleaned questions from both splits (training rows first, then test
# rows) as one plain Python list.
all_text = training['text_clean'].values.tolist() + test['text_clean'].values.tolist()



In [18]:

    
# Sorted list of the distinct tokens found in the training and test text; it is
# passed to the vectorizer below as its fixed vocabulary.
vocab = flatten_words(all_text, get_unique=True)

TF-IDF



In [19]:

    
# Fix the feature space to `vocab` so every matrix produced by this vectorizer
# has the same columns.
# NOTE(review): `stop_words='english'` is matched against already-stemmed
# tokens, so stop words whose stem differs from the dictionary form (the
# output above shows "whi" and "wa") presumably slip through -- confirm this
# is intended.
tfidf = TfidfVectorizer(stop_words='english', vocabulary=vocab)



In [20]:

    
# Learn the IDF weights from the per-category documents and vectorise them:
# one row per entry of `documents`.
train_matrix = tfidf.fit_transform(documents)



In [21]:

    
# Reuse the IDF weights already learned from the category documents.
# transform() only applies the fitted model; fit_transform() would re-estimate
# IDF on the test questions and weight the two matrices differently, which
# distorts the cosine similarities computed between them.
test_matrix = tfidf.transform(test.text_clean.values)

Cosine Similarity



In [22]:

    
# Pairwise cosine similarity: rows follow `train_matrix` (category documents),
# columns follow `test_matrix` (test questions).
cos_scores = cosine_similarity(train_matrix, test_matrix)



In [23]:

    
# cos_scores has one row per category document and one column per test
# question, so argmax over axis 0 gives the best-matching row for each question.
# Look the label up in df_ql, whose rows are in the same order as `documents`,
# rather than assuming the labels are exactly 1..k.
test['Category'] = df_ql['Category'].values[np.argmax(cos_scores, axis=0)]



In [24]:

    
# Keep only the question id and its predicted category for the output file.
output = test[['Id', 'Category']]



In [25]:

    
# Write the predictions to CSV without the DataFrame index column.
output.to_csv('../data/solution01.csv', index=False)



In [ ]:

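# Leftover scratch: pasted preview of `training.head()`, kept as comments so
# that running this cell does not raise a SyntaxError.
# 	Category	Text
# 0	5	why are yawns contagious? when people yawn
# 1	6	what is trans fat? how to reduce that? i heard...
# 2	1	roth ira vs 401k? what is the difference betwe...
# 3	1	how many planes fedex has? i heard that it is ...
# 4	2	what is the best photo slideshow creation appl...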