In [1]:
%matplotlib inline
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
In [2]:
# Apply matplotlib's 'ggplot' style sheet to the figures drawn after this cell.
plt.style.use('ggplot')
In [3]:
def jitter(values, sd=0.25):
    """Return a list with Gaussian noise added to every entry of ``values``.

    Each output element is drawn from a normal distribution centred on the
    corresponding input value with standard deviation ``sd``.  Draws come from
    NumPy's global random state, so results repeat only if that state is
    seeded beforehand.

    Parameters
    ----------
    values : iterable of numbers
        Centres of the normal distributions.
    sd : float, optional
        Standard deviation of the noise (default 0.25).

    Returns
    -------
    list
        One noisy sample per input value, in input order.
    """
    noisy = []
    for center in values:
        noisy.append(np.random.normal(center, sd))
    return noisy
In [4]:
def clean_text(df, col):
    """Normalize and stem the text held in one column of a DataFrame.

    Every entry is lower-cased, each run of characters other than ASCII
    letters and digits (punctuation, whitespace, ...) is collapsed to a single
    space, leading/trailing blanks are dropped, and each remaining token is
    passed through the Porter stemmer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; it is only read, never modified.
    col : str
        Name of the column of strings to clean.

    Returns
    -------
    pandas.Series
        The cleaned text, aligned with ``df``'s index.
    """
    # Raw string: avoids invalid-escape warnings in regular string literals.
    non_alnum = re.compile(r'[^A-Za-z0-9]+')
    porter_stemmer = PorterStemmer()

    def _normalize(text):
        # Whitespace is itself non-alphanumeric, so this one substitution also
        # collapses runs of whitespace; split() then discards any leading or
        # trailing blank before the tokens are stemmed and re-joined.
        tokens = non_alnum.sub(' ', text.lower()).split()
        return ' '.join(porter_stemmer.stem(w) for w in tokens)

    return df[col].apply(_normalize)
In [5]:
def flatten_words(list1d, get_unique=False):
    """Split every string in ``list1d`` on whitespace and flatten the result.

    Parameters
    ----------
    list1d : iterable of str
        Sentences (or any whitespace-separated strings).
    get_unique : bool, optional
        If True, return each distinct word once, sorted ascending.
        If False (default), return every word occurrence in input order.

    Returns
    -------
    list of str
    """
    words = [token for sentence in list1d for token in sentence.split()]
    if not get_unique:
        return words
    return sorted(set(words))
In [6]:
# Load the training set; the path is relative to the notebook's working directory.
training = pd.read_csv('../data/newtrain.csv')
In [7]:
# Preview the first rows of the training data.
training.head()
Out[7]:
In [8]:
# Load the test set; the path is relative to the notebook's working directory.
test = pd.read_csv('../data/newtest.csv')
Lower-case the text, remove non-alphanumeric characters and extra whitespace, and stem each word.
In [9]:
# Store the cleaned version of 'Text' next to the original column.
training['text_clean'] = clean_text(training, 'Text')
In [10]:
# Apply the same cleaning to the test set so both share one token format.
test['text_clean'] = clean_text(test, 'Text')
Get a list of questions, by category.
In [11]:
# Work on a copy so the reshaping below leaves `training` untouched.
df_ql = training.copy()
In [12]:
# Keep only the label and the cleaned text.  The explicit copy makes this an
# independent frame, so adding a column to it later does not raise
# SettingWithCopyWarning.
df_ql = df_ql[['Category', 'text_clean']].copy()
In [13]:
# Collect every cleaned question of a category into one list.  The grouping is
# done once and mapped back onto the rows, instead of re-grouping the whole
# frame for each row.
questions_by_category = df_ql.groupby('Category')['text_clean'].apply(list)

# Keep one row per category, ordered by category label, with a fresh 0..K-1
# index.  `sort_values` replaces `DataFrame.sort`, which no longer exists in
# pandas; building a new frame (no `inplace`) avoids mutating a column subset.
df_ql = (
    df_ql
    .assign(all_questions=df_ql['Category'].map(questions_by_category))
    .drop_duplicates(subset='Category')
    .sort_values(by='Category')
    .reset_index(drop=True)
)
In [14]:
# Drop the per-row text; only the label and its list of questions are needed.
df_ql = df_ql[['Category', 'all_questions']]
In [15]:
# Display the per-category question lists.
df_ql.all_questions
Out[15]:
Create a list of strings with one entry per category, each entry being that category's $n$ questions concatenated into a single document.
In [16]:
# One "document" per category: all of that category's cleaned questions joined
# into a single space-separated string, in df_ql row order.  Iterating the
# column directly avoids relying on the index labels being exactly 0..K-1.
documents = [' '.join(questions) for questions in df_ql['all_questions']]
In [17]:
# Every cleaned question from both data sets: training first, then test.
all_text = training['text_clean'].tolist() + test['text_clean'].tolist()
In [18]:
# Sorted list of the distinct tokens appearing in the training and test text.
vocab = flatten_words(all_text, get_unique=True)
In [19]:
# Fix the feature space to `vocab` so every matrix built with this vectorizer
# has the same columns.
# NOTE(review): stop_words='english' lists unstemmed words, while the text fed
# in here has already been stemmed — confirm the filter behaves as intended.
tfidf = TfidfVectorizer(stop_words='english', vocabulary=vocab)
In [20]:
# Fit the vectorizer on the per-category documents and build their TF-IDF
# matrix (one row per entry of `documents`).
train_matrix = tfidf.fit_transform(documents)
In [21]:
# Reuse the vectorizer fitted on the category documents: `transform` keeps the
# IDF weights learned there, whereas calling `fit_transform` again would
# re-learn them from the test set and weight the two matrices inconsistently.
test_matrix = tfidf.transform(test.text_clean.values)
In [22]:
# cos_scores[i, j] is the similarity between category document i (row of
# train_matrix) and test question j (row of test_matrix).
cos_scores = cosine_similarity(train_matrix, test_matrix)
In [23]:
# Row i of cos_scores belongs to the i-th category document, i.e. row i of
# df_ql, so translate each best-scoring row index back to its actual label
# rather than assuming the labels are exactly 1..K.
test['Category'] = df_ql['Category'].values[np.argmax(cos_scores, axis=0)]
In [24]:
# Keep only the columns that are written to the output file.
output = test[['Id', 'Category']]
In [25]:
# Write the predictions to disk without the DataFrame index column.
output.to_csv('../data/solution01.csv', index=False)
In [ ]: