In [1]:
%matplotlib inline
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
In [2]:
plt.style.use('ggplot')
In [3]:
def sample(df, n=1000, include_cats=[2, 3, 4, 5, 6, 7], random_state=1868):
df = df.copy()
subset = df[df.Category.isin(include_cats)]
sample = subset.sample(n, random_state=random_state)
return sample
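A quick sanity check of `sample` on a toy frame (hypothetical data; the real frame is read from newtrain.csv below):
In [ ]:
# Hypothetical toy frame, just to show the category filter and fixed seed.
toy = pd.DataFrame({'Category': [1, 2, 3, 5, 7, 2],
                    'Text': list('abcdef')})
sample(toy, n=3).Category.tolist()  # only categories 2-7 can appear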
In [4]:
def clean_text(df, col):
    """Keep only alphanumeric characters, lowercase the text,
    and collapse all whitespace to a single space.
    """
    df = df.copy()
    return df[col].apply(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x.lower()))\
                  .apply(lambda x: re.sub(r'\s+', ' ', x).strip())
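A small, hypothetical example of what `clean_text` does (lowercase, strip punctuation, collapse whitespace):
In [ ]:
# Hypothetical example; the real frames are cleaned below.
demo = pd.DataFrame({'Text': ["What's   the ETA?!"]})
clean_text(demo, 'Text').tolist()  # ['what s the eta']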
In [5]:
def count_pattern(df, col, pattern):
"""Count the occurrences of `pattern`
in df[col].
"""
df = df.copy()
return df[col].str.count(pattern)
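`count_pattern` is a thin wrapper around `Series.str.count`; a hypothetical check:
In [ ]:
# Hypothetical example: count literal question marks.
count_pattern(pd.DataFrame({'Text': ['Why? How? When?']}), 'Text', r'\?').tolist()  # [3]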
In [6]:
def split_on_word(text):
    """Use a regular expression tokenizer.
    Keeps apostrophes and hyphens inside words.
    Returns a list of lists, one list for each sentence:
    [[word, word], [word, word, ..., word], ...].
    """
    if isinstance(text, list):
        return [regexp_tokenize(sentence, pattern=r"\w+(?:[-']\w+)*") for sentence in text]
    else:
        return regexp_tokenize(text, pattern=r"\w+(?:[-']\w+)*")
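The tokenizer pattern keeps internal apostrophes and hyphens; a hypothetical check of both branches:
In [ ]:
# Hypothetical examples: a single string and a list of sentences.
split_on_word("don't re-order the list")          # ["don't", 're-order', 'the', 'list']
split_on_word(["it's fine", 'well-known words'])  # [["it's", 'fine'], ['well-known', 'words']]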
In [7]:
def normalize(tokenized_words):
    """Lowercase the tokens and remove English stop words.
    Returns a list of lists, one list for each sentence:
    [[word, word], [word, word, ..., word], ...].
    """
    stop_words = stopwords.words('english')
    return [[w.lower() for w in sent
             if w.lower() not in stop_words]
            for sent in tokenized_words]
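A hypothetical check of `normalize`: stop words drop out and the rest is lowercased (requires the NLTK stopwords corpus):
In [ ]:
# Hypothetical example on one tokenized sentence.
normalize([['How', 'do', 'I', 'reset', 'my', 'password']])  # [['reset', 'password']]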
In [8]:
def features(df):
    df = df.copy()
    # Punctuation counts are taken from the raw text, before cleaning.
    df['n_questionmarks'] = count_pattern(df, 'Text', r'\?')
    df['n_periods'] = count_pattern(df, 'Text', r'\.')
    df['n_apostrophes'] = count_pattern(df, 'Text', "'")
    df['first_word'] = df.text_clean.apply(lambda x: split_on_word(x)[0])
    question_words = ['what', 'how', 'why', 'is']
    for w in question_words:
        col_wc = 'n_' + w
        col_fw = 'fw_' + w
        df[col_wc] = count_pattern(df, 'text_clean', w)
        df[col_fw] = (df.first_word == w) * 1
    del df['first_word']
    df['n_words'] = df.Text.apply(lambda x: len(split_on_word(x)))
    return df
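A hypothetical smoke test of the feature builder; the real frames are built further down:
In [ ]:
# Hypothetical one-row frame with the columns features() expects.
toy = pd.DataFrame({'Text': ['How long is the wait?'], 'Category': [2]})
toy['text_clean'] = clean_text(toy, 'Text')
features(toy)[['n_questionmarks', 'n_words', 'fw_how', 'n_is']]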
In [9]:
def flatten_words(list1d, get_unique=False):
    qa = [s.split() for s in list1d]
    if get_unique:
        return sorted(set(w for sent in qa for w in sent))
    else:
        return [w for sent in qa for w in sent]
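A hypothetical check of `flatten_words` with `get_unique=True`, which yields the sorted vocabulary:
In [ ]:
# Hypothetical example: unique, sorted vocabulary across two strings.
flatten_words(['what is it', 'is it new'], get_unique=True)  # ['is', 'it', 'new', 'what']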
In [10]:
def tfidf_matrices(tr, te, col='text_clean'):
    """Return tfidf matrices for both the
    training and test DataFrames.
    The matrices have the same number of columns,
    which represent unique words, but not the same
    number of rows, which represent samples.
    """
    tr = tr.copy()
    te = te.copy()
    text = tr[col].values.tolist() + te[col].values.tolist()
    vocab = flatten_words(text, get_unique=True)
    tfidf = TfidfVectorizer(stop_words='english', vocabulary=vocab)
    tr_matrix = tfidf.fit_transform(tr[col])
    # Transform (not refit) the test text so both matrices share the
    # idf weights learned from the training set.
    te_matrix = tfidf.transform(te[col])
    return tr_matrix, te_matrix
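Because the vocabulary is built from the training and test text together, the two matrices share the same columns; a hypothetical shape check:
In [ ]:
# Hypothetical frames: column counts match, row counts differ.
a = pd.DataFrame({'text_clean': ['what is the eta', 'reset my password']})
b = pd.DataFrame({'text_clean': ['is the site down']})
m_a, m_b = tfidf_matrices(a, b)
m_a.shape, m_b.shape  # (2, n_vocab) and (1, n_vocab)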
In [11]:
def concat_tfidf(df, matrix):
df = df.copy()
df = pd.concat([df, pd.DataFrame(matrix.todense())], axis=1)
return df
In [12]:
def jitter(values, sd=0.25):
return [np.random.normal(v, sd) for v in values]
In [13]:
training = pd.read_csv('../data/newtrain.csv')
In [14]:
# training = training.append(sample(training))\
# .append(sample(training,
# n=300,
# include_cats=[5, 6, 7],
# random_state=1868))
In [15]:
# training.reset_index(drop=True, inplace=True)
In [16]:
# training.Category.value_counts()
In [17]:
test = pd.read_csv('../data/newtest.csv')
Remove non-alphanumeric characters and extra whitespace.
In [18]:
training['text_clean'] = clean_text(training, 'Text')
In [19]:
test['text_clean'] = clean_text(test, 'Text')
In [20]:
training = features(training)
test = features(test)
In [21]:
train, dev = train_test_split(training, test_size=0.2, random_state=1868)
In [22]:
training.shape[0] == train.shape[0] + dev.shape[0]
Out[22]:
In [23]:
train = pd.concat([train, sample(train, n=800)])
In [24]:
train.reset_index(drop=True, inplace=True)
dev.reset_index(drop=True, inplace=True)
In [25]:
train_matrix, dev_matrix = tfidf_matrices(train, dev)
In [26]:
train = concat_tfidf(train, train_matrix)
In [27]:
dev = concat_tfidf(dev, dev_matrix)
In [28]:
svm = LinearSVC(dual=False, max_iter=5000)
In [29]:
features = train.columns[3:]
In [30]:
X = train[features].values
y = train['Category'].values
In [31]:
features_dev = dev[features].values
In [32]:
kf = KFold(n_splits=5)
In [33]:
for clf, label in zip([svm],
                      ['SVM']):
    print(np.array([clf.fit(X[tr], y[tr]).score(X[te], y[te]) for tr, te in kf.split(X)]).mean(), label)
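The same 5-fold estimate can be written more compactly with cross_val_score; a sketch, not part of the original run:
In [ ]:
# Sketch: identical folds by reusing the KFold object defined above.
from sklearn.model_selection import cross_val_score
cross_val_score(svm, X, y, cv=kf).mean()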
In [34]:
svm.fit(X, y)
dev_predicted = svm.predict(features_dev)
In [35]:
accuracy_score(dev.Category, dev_predicted)
Out[35]:
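Accuracy alone hides which categories get confused with which; a sketch of a per-class view using sklearn's confusion_matrix, not part of the original run:
In [ ]:
# Sketch: rows are true dev-set categories, columns are predicted ones.
from sklearn.metrics import confusion_matrix
pd.DataFrame(confusion_matrix(dev.Category, dev_predicted))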
In [35]:
plt.figure(figsize=(5, 4))
plt.scatter(jitter(dev.Category, 0.15),
jitter(dev_predicted, 0.15),
color='#348ABD', alpha=0.25)
plt.xlabel('Ground Truth')
plt.ylabel('Predicted')
Out[35]:
In [37]:
training = pd.concat([training, sample(training)])
training.reset_index(drop=True, inplace=True)
In [38]:
training_matrix, test_matrix = tfidf_matrices(training, test)
In [39]:
training = concat_tfidf(training, training_matrix)
test = concat_tfidf(test, test_matrix)
In [40]:
features = training.columns[3:]
In [41]:
X = training[features].values
y = training['Category'].values
In [42]:
features_test = test[features].values
In [43]:
svm.fit(X, y)
test_predicted = svm.predict(features_test)
In [44]:
test['Category'] = test_predicted
In [45]:
output = test[['Id', 'Category']]
In [42]:
# output.to_csv('../data/solution05.csv', index=False)