Imports


In [1]:
%matplotlib inline

import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
plt.style.use('ggplot')

Functions


In [3]:
def jitter(values, sd=0.25):
    """Add Gaussian noise to each value so that
    overlapping points separate in a scatter plot.
    """
    return [np.random.normal(v, sd) for v in values]
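
For example, with the default sd the jittered values stay near the originals (the output is random, so exact numbers vary):

jitter([1, 1, 1])
# e.g. [1.21, 0.93, 1.08] -- identical points spread apart in a scatter plot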

In [4]:
def clean_text(df, col):
    """Keep only alphanumeric characters and
    collapse all whitespace runs to a single space.
    """
    return df[col].apply(lambda x: re.sub(r'[^A-Za-z0-9]+', ' ', x.lower()))\
                  .apply(lambda x: re.sub(r'\s+', ' ', x).strip())
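
For example, on a toy frame (illustrative input, not from the data):

demo = pd.DataFrame({'Text': ["What's  the best ROTH IRA?!"]})
clean_text(demo, 'Text')
# 0    what s the best roth ira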

In [5]:
def count_pattern(df, col, pattern):
    """Count the occurrences of `pattern`
    in df[col].
    """
    return df[col].str.count(pattern)

In [6]:
def split_on_word(text):
    """Use a regular expression tokenizer.
    Keep apostrophes and hyphens inside words.
    For a list of sentences, returns a list of lists,
    one list of words per sentence:
        [[word, word], [word, word, ..., word], ...];
    for a single string, returns one flat list of words.
    """
    if isinstance(text, list):
        return [regexp_tokenize(sentence, pattern=r"\w+(?:[-']\w+)*") for sentence in text]
    else:
        return regexp_tokenize(text, pattern=r"\w+(?:[-']\w+)*")
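
For instance, apostrophes stay inside their words:

split_on_word("what's trans fat? how to reduce it?")
# ["what's", 'trans', 'fat', 'how', 'to', 'reduce', 'it']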

In [7]:
def normalize(tokenized_words):
    """Lowercase words and remove stop words.
    Returns a list of lists, one list for each sentence:
        [[word, word], [word, word, ..., word], ...].
    """
    stop_words = set(stopwords.words('english'))
    return [[w.lower() for w in sent
             if w.lower() not in stop_words]
            for sent in tokenized_words]
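
For example, stop words drop out and case folds:

normalize([['Why', 'are', 'yawns', 'contagious']])
# [['yawns', 'contagious']]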

In [8]:
def features(df):
    """Add punctuation counts, question-word counts,
    first-word indicator columns, and a word count.
    """
    df = df.copy()
    df['n_questionmarks'] = count_pattern(df, 'Text', r'\?')
    df['n_periods'] = count_pattern(df, 'Text', r'\.')
    df['n_apostrophes'] = count_pattern(df, 'Text', "'")
    df['first_word'] = df.text_clean.apply(lambda x: split_on_word(x)[0])
    question_words = ['what', 'how', 'why', 'is']
    for w in question_words:
        col_wc = 'n_' + w
        col_fw = 'fw_' + w
        # substring counts: e.g. 'is' also matches inside longer words
        df[col_wc] = count_pattern(df, 'text_clean', w)
        df[col_fw] = (df.first_word == w) * 1

    del df['first_word']

    df['n_words'] = df.Text.apply(lambda x: len(split_on_word(x)))
    return df
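
As a sanity check on a toy row (illustrative input):

demo = pd.DataFrame({'Text': ['Why are yawns contagious?'],
                     'text_clean': ['why are yawns contagious']})
features(demo)[['n_questionmarks', 'n_why', 'fw_why', 'n_words']]
#    n_questionmarks  n_why  fw_why  n_words
# 0                1      1       1        4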

In [9]:
def flatten_words(list1d, get_unique=False):
    """Flatten a list of strings into one list of words,
    optionally deduplicated and sorted.
    """
    qa = [s.split() for s in list1d]
    if get_unique:
        return sorted(set(w for sent in qa for w in sent))
    else:
        return [w for sent in qa for w in sent]
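
For instance:

flatten_words(['why are yawns', 'are yawns contagious'], get_unique=True)
# ['are', 'contagious', 'why', 'yawns']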

Data

Load

To even out the class balance (category 1 is the largest class), a random sample of 1,000 non-category-1 questions is drawn and appended to the training set below.


In [10]:
training = pd.read_csv('../data/newtrain.csv')

In [11]:
tr_non1 = training[training['Category'] != 1]

In [12]:
tr_non1.shape


Out[12]:
(1929, 2)

In [13]:
sample = tr_non1.sample(1000, random_state=1868)

In [14]:
training = pd.concat([training, sample])

In [15]:
training.reset_index(drop=True, inplace=True)

In [16]:
training.head()


Out[16]:
Category Text
0 5 why are yawns contagious? when people yawn
1 6 what is trans fat? how to reduce that? i heard...
2 1 roth ira vs 401k? what is the difference betwe...
3 1 how many planes fedex has? i heard that it is ...
4 2 what is the best photo slideshow creation appl...

In [17]:
training.Category.value_counts()


Out[17]:
1    769
2    681
3    611
4    538
5    374
7    372
6    353
dtype: int64

In [18]:
test = pd.read_csv('../data/newtest.csv')

Clean

Remove non-alphanumeric characters and extra whitespace.


In [19]:
training['text_clean'] = clean_text(training, 'Text')

In [20]:
test['text_clean'] = clean_text(test, 'Text')

Feature Engineering

tfidf


In [21]:
all_text = training['text_clean'].values.tolist() + test['text_clean'].values.tolist()

In [22]:
vocab = flatten_words(all_text, get_unique=True)

In [23]:
# a fixed vocabulary (every word in train + test) keeps the tf-idf columns aligned across both matrices
tfidf = TfidfVectorizer(stop_words='english', vocabulary=vocab)

In [24]:
training_matrix = tfidf.fit_transform(training.text_clean)

In [25]:
test_matrix = tfidf.transform(test.text_clean)  # transform only, so the idf weights fitted on the training text are reused

Other Features


In [26]:
training = features(training)

In [27]:
training = pd.concat([training, pd.DataFrame(training_matrix.todense())], axis=1)

In [28]:
test = features(test)

In [29]:
test = pd.concat([test, pd.DataFrame(test_matrix.todense())], axis=1)

Split the Training Data


In [30]:
train, dev = train_test_split(training, test_size=0.2, random_state=1868)

Training


In [31]:
svm = LinearSVC(dual=False, max_iter=5000)
logistic = LogisticRegression()
naivebayes = MultinomialNB()
bernoulli = BernoulliNB()

In [32]:
feature_cols = train.columns[3:]  # everything after Category, Text, and text_clean

In [33]:
X = train[feature_cols].values
y = train['Category'].values

In [34]:
features_dev = dev[feature_cols].values

Cross Validation


In [35]:
kf = KFold(n_splits=5)

In [36]:
for clf, label in zip([svm, logistic, naivebayes, bernoulli],
                      ['SVC', 'Logistic Regression', 'multinomial NB', 'Bernoulli NB']):
    scores = [clf.fit(X[tr], y[tr]).score(X[te], y[te]) for tr, te in kf.split(X)]
    print(np.mean(scores), label)


0.705532308959 SVC
0.610205446563 Logistic Regression
0.324894247039 multinomial NB
0.359380001829 Bernoulli NB

Testing on dev

SVM


In [37]:
svm.fit(X, y)
dev_predicted = svm.predict(features_dev)

In [38]:
accuracy_score(dev.Category, dev_predicted)


Out[38]:
0.74189189189189186

In [39]:
plt.figure(figsize=(5, 4))

plt.scatter(jitter(dev.Category, 0.15),
            jitter(dev_predicted, 0.15),
            color='#348ABD', alpha=0.25)

plt.xlabel('Ground Truth')
plt.ylabel('Predicted')


Out[39]:
<matplotlib.text.Text at 0x10bde77f0>

Logistic Regression


In [40]:
logistic.fit(X, y)
dev_predicted = logistic.predict(features_dev)

In [41]:
accuracy_score(dev.Category, dev_predicted)


Out[41]:
0.65135135135135136

In [42]:
plt.figure(figsize=(5, 4))

plt.scatter(jitter(dev.Category, 0.15),
            jitter(dev_predicted, 0.15),
            color='#348ABD', alpha=0.25)

plt.xlabel('Ground Truth')
plt.ylabel('Predicted')


Out[42]:
<matplotlib.text.Text at 0x10be01588>

Multinomial Naive Bayes


In [43]:
naivebayes.fit(X, y)
dev_predicted = naivebayes.predict(features_dev)

In [44]:
accuracy_score(dev.Category, dev_predicted)


Out[44]:
0.38108108108108107

In [45]:
plt.figure(figsize=(5, 4))

plt.scatter(jitter(dev.Category, 0.15),
            jitter(dev_predicted, 0.15),
            color='#348ABD', alpha=0.25)

plt.xlabel('Ground Truth')
plt.ylabel('Predicted')


Out[45]:
<matplotlib.text.Text at 0x10bdd6630>

Bernoulli Naive Bayes


In [46]:
bernoulli.fit(X, y)
dev_predicted = bernoulli.predict(features_dev)

In [47]:
accuracy_score(dev.Category, dev_predicted)


Out[47]:
0.4081081081081081

In [48]:
plt.figure(figsize=(5, 4))

plt.scatter(jitter(dev.Category, 0.15),
            jitter(dev_predicted, 0.15),
            color='#348ABD', alpha=0.25)

plt.xlabel('Ground Truth')
plt.ylabel('Predicted')


Out[48]:
<matplotlib.text.Text at 0x10be2cb70>

Testing


In [49]:
X = training[feature_cols].values
y = training['Category'].values

In [50]:
features_test = test[feature_cols].values

In [51]:
svm.fit(X, y)
test_predicted = svm.predict(features_test)

In [52]:
test['Category'] = test_predicted

In [53]:
output = test[['Id', 'Category']]

In [54]:
output.to_csv('../data/solution03.csv', index=False)
