Imports


In [1]:
%matplotlib inline

import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Apply matplotlib's "ggplot" style sheet to any figures drawn after this cell.
plt.style.use('ggplot')

Functions


In [3]:
def jitter(values, sd=0.25):
    """Return a list holding one normally distributed draw per input value.

    Each draw is centred on the corresponding element of ``values`` and uses
    ``sd`` as its standard deviation.
    """
    noisy = []
    for centre in values:
        noisy.append(np.random.normal(centre, sd))
    return noisy

In [4]:
def clean_text(df, col, stemmer=None):
    """Normalise a text column: lowercase, strip punctuation, and stem.

    Each value in ``df[col]`` is lowercased, every run of characters outside
    ``[A-Za-z0-9]`` (punctuation and whitespace alike) is collapsed to a
    single space, leading/trailing spaces are dropped, and each remaining
    word is stemmed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; it is not modified.
    col : str
        Name of the column to clean.
    stemmer : object, optional
        Any object exposing a ``stem(word)`` method. Defaults to a new
        ``PorterStemmer``.

    Returns
    -------
    pandas.Series
        Cleaned text with the same index as ``df``. Missing values become
        the empty string.
    """
    if stemmer is None:
        stemmer = PorterStemmer()
    # Raw string avoids invalid-escape warnings; compiled once for all rows.
    non_alnum = re.compile(r'[^A-Za-z0-9]+')

    def _clean(value):
        # A missing entry (None/NaN) has no .lower(); treat it as empty text.
        text = '' if pd.isnull(value) else value
        # split() discards the leading/trailing spaces left by the substitution.
        words = non_alnum.sub(' ', text.lower()).split()
        return ' '.join(stemmer.stem(word) for word in words)

    return df[col].apply(_clean)

In [5]:
def flatten_words(list1d, get_unique=False):
    """Split every string in ``list1d`` on whitespace and pool the words.

    With ``get_unique=False`` (the default) the words are returned in order
    of appearance, duplicates included. With ``get_unique=True`` the distinct
    words are returned as a sorted list.
    """
    words = []
    for sentence in list1d:
        words.extend(sentence.split())
    if not get_unique:
        return words
    distinct = list(set(words))
    distinct.sort()
    return distinct

Data

Load


In [6]:
# Training questions; the preview below shows a `Category` label and a `Text` column.
training = pd.read_csv('../data/newtrain.csv')

In [7]:
# Preview the first rows to check the columns that were loaded.
training.head()


Out[7]:
Category Text
0 5 why are yawns contagious? when people yawn
1 6 what is trans fat? how to reduce that? i heard...
2 1 roth ira vs 401k? what is the difference betwe...
3 1 how many planes fedex has? i heard that it is ...
4 2 what is the best photo slideshow creation appl...

In [8]:
# Questions to classify; the `Text` and `Id` columns are used later in the notebook.
test = pd.read_csv('../data/newtest.csv')

Clean

Lowercase the text, remove non-alphanumeric characters and extra whitespace, and stem each word.


In [9]:
# Store the normalised text (see `clean_text`) next to the raw `Text` column.
training['text_clean'] = clean_text(training, 'Text')

In [10]:
# Same normalisation as the training set, so both share one token space.
test['text_clean'] = clean_text(test, 'Text')

Get a list of questions, by category.


In [11]:
# Work on a copy so the per-category reshaping below leaves `training` untouched.
df_ql = training.copy()

In [12]:
# Keep only the label and the cleaned text.
df_ql = df_ql[['Category', 'text_clean']]

In [13]:
# Collect every cleaned question of a category into one list, leaving one row
# per category. `groupby` sorts by the group key, so rows come out ordered by
# `Category`, and questions keep their original row order inside each list.
# Grouping once replaces the removed `DataFrame.sort` call and the per-row
# `groupby`; reading from `training` instead of mutating `df_ql` in place
# keeps this cell safe to re-run.
questions_by_category = training.groupby('Category')['text_clean']
df_ql = pd.DataFrame({
    # First cleaned question of each category.
    'text_clean': questions_by_category.first(),
    'all_questions': questions_by_category.apply(list),
}).reset_index()[['Category', 'text_clean', 'all_questions']]

In [14]:
# Only the label and its list of questions are needed from here on.
df_ql = df_ql[['Category', 'all_questions']]

In [15]:
# Inspect the per-category question lists.
df_ql.all_questions


Out[15]:
0    [roth ira vs 401k what is the differ between r...
1    [what is the best photo slideshow creation app...
2    [what wa the first dvd you ever bought what wa...
3    [what is the reason for the increas divorc per...
4    [whi are yawn contagi when peopl yawn, who sai...
5    [what is tran fat how to reduc that i heard th...
6    [what is an imaginari number what is an imagin...
Name: all_questions, dtype: object

Create a list of strings: one document per category, formed by concatenating all $n$ of that category's questions.


In [16]:
# One space-joined string per category, in the same row order as `df_ql`.
# Iterating over the column is positional, so the result does not depend on
# the index labels being exactly 0..n-1.
documents = [' '.join(questions) for questions in df_ql['all_questions']]

Classification

All Vocabulary


In [17]:
# Cleaned text from both files, so the vocabulary built next also covers
# words that appear only in the test questions.
all_text = training['text_clean'].values.tolist() + test['text_clean'].values.tolist()

In [18]:
# Sorted list of the distinct whitespace-separated tokens in `all_text`.
vocab = flatten_words(all_text, get_unique=True)

tfidf


In [19]:
# Passing `vocabulary` fixes the feature columns to `vocab`, so every matrix
# produced by this vectorizer has the same column layout.
# NOTE(review): `text_clean` is Porter-stemmed while 'english' selects
# scikit-learn's built-in, unstemmed stop list, so some stop words may no
# longer match. Confirm this is intended.
tfidf = TfidfVectorizer(stop_words='english', vocabulary=vocab)

In [20]:
# Fit the IDF weights on `documents` and vectorise them: one row per document.
train_matrix = tfidf.fit_transform(documents)

In [21]:
# Reuse the weights already fitted on the category documents. Calling
# `fit_transform` here would re-estimate IDF from the test questions, so the
# two matrices would be weighted differently before they are compared.
test_matrix = tfidf.transform(test.text_clean.values)

Cosine Similarity


In [22]:
# Pairwise similarity: rows follow `train_matrix`, columns follow `test_matrix`.
cos_scores = cosine_similarity(train_matrix, test_matrix)

In [23]:
# For each test question (column) pick the best-scoring category document
# (row), then map that row position back to its label in `df_ql` instead of
# assuming the labels are exactly 1..n.
test['Category'] = df_ql['Category'].values[np.argmax(cos_scores, axis=0)]

In [24]:
# Keep only the identifier and the predicted label for the output file.
output = test[['Id', 'Category']]

In [25]:
# index=False keeps the DataFrame's row index out of the CSV.
output.to_csv('../data/solution01.csv', index=False)

In [ ]: