In [434]:
import re
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# NOTE(review): this rebinds the name `stopwords` from the imported corpus
# module to a plain Python list of English stopwords; the module itself is
# no longer reachable under that name after this line.
stopwords = stopwords.words('english')
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
# Brown-corpus information content for WordNet similarity measures.
# NOTE(review): loaded but never used in the visible code — confirm needed.
brown_ic = wordnet_ic.ic('ic-brown.dat')
# Multinomial Naive Bayes classifier, fit later on the feature frames.
nb = MultinomialNB()
In [435]:
# Training data: the code below expects at least 'Category' (label) and
# 'Text' (question string) columns.
df = pd.read_csv('../data/newtrain.csv')
In [436]:
# Quick sanity check of the raw frame.
df.head()
Out[436]:
In [437]:
# Shuffle the row labels. NOTE(review): no RNG seed is set anywhere visible,
# so the train/dev split is not reproducible across runs.
random_index = np.random.permutation(df.index)
In [438]:
# .ix was deprecated and then removed from pandas. The permuted values come
# from df.index, i.e. they are row labels, so label-based .loc selects the
# same rows.
df_shuffled = df.loc[random_index]
In [439]:
# Re-index the shuffled frame 0..n-1 and eyeball the first rows.
df_shuffled.reset_index(drop=True, inplace=True)
df_shuffled[:5]
Out[439]:
In [440]:
# 80/20 train/dev split sizes (rounded row counts).
rows, columns = df_shuffled.shape
train_size = round(rows*.8)
dev_size = round(rows*.2)
In [441]:
# Split the shuffled frame into train and dev partitions.
# Bug fix: .loc slicing is end-INclusive, so the original code placed row
# `train_size` in BOTH df_train and df_dev. .iloc is end-exclusive and gives
# a clean, disjoint 80/20 split.
df_train = df_shuffled.iloc[:train_size]
df_dev = df_shuffled.iloc[train_size:train_size + dev_size].reset_index(drop=True)
In [442]:
def flatten_words(list1d, get_unique=False):
    """Flatten an iterable of whitespace-separated strings into a word list.

    Parameters
    ----------
    list1d : iterable of str
        Strings to split on whitespace.
    get_unique : bool
        When True, return the sorted set of distinct words instead of the
        full (duplicated, in-order) token stream.
    """
    tokens = [word for sentence in list1d for word in sentence.split()]
    if get_unique:
        return sorted(set(tokens))
    return tokens
In [445]:
# Lowercase, replace runs of non-alphanumerics with a space, then collapse
# whitespace and strip. Raw strings fix the invalid escape sequence '\s'
# (a DeprecationWarning today, a SyntaxError in future Python versions).
df['text_clean'] = (df['Text']
                    .apply(lambda x: re.sub(r'[^A-Za-z0-9]+', ' ', x.lower()))
                    .apply(lambda x: re.sub(r'\s+', ' ', x).strip()))
# Full (non-unique) token stream over the whole corpus; used later for the
# corpus-frequency normalization in norm_unigram_fd.
words = flatten_words(df.text_clean.values)
In [482]:
def unigram_feature(x, unigrams):
    """Count how many tokens of `x` (lowercased, space-split) appear in `unigrams`.

    Generalization / bug guard: `unigrams` may now also be a single string.
    Previously a bare string was iterated character-by-character (so
    unigram_feature(x, ("what")) counted occurrences of the tokens 'w', 'h',
    'a', 't'); a string argument is now treated as one unigram.
    """
    if isinstance(unigrams, str):
        unigrams = [unigrams]
    word_list = x.lower().split(" ")
    return sum(word_list.count(unigram) for unigram in unigrams)
def numeric_feature(x):
    """Number of numeric characters in the raw text `x`.

    NOTE(review): this iterates the string itself, so it counts digit
    *characters*, not numeric tokens (e.g. "I have 12 cats" -> 2). Confirm
    that character-level counting is the intended feature.
    """
    return sum(1 for ch in x if ch.isnumeric())
def similarity_feature(x, word):
    """Maximum Wu-Palmer similarity between `word` (a WordNet synset) and any
    noun synset of any token in `x`.

    Returns 0 when no token has a noun synset.

    Bug fix: wup_similarity can return None (no common ancestor / no path);
    the original `max(similarity, word.wup_similarity(s))` raises TypeError
    on Python 3 when that happens. None results are now skipped.
    """
    similarity = 0
    for w in x.lower().split(" "):
        for s in wn.synsets(w, pos=wn.NOUN):
            sim = word.wup_similarity(s)
            if sim is not None and sim > similarity:
                similarity = sim
    return similarity
def length_feature(x):
    """Length of the raw question text, in characters."""
    text_length = len(x)
    return text_length
def pos_feature(x, pos):
    """Count tokens of `x` whose NLTK POS tag equals `pos` exactly (e.g. "VB").

    The text is lowercased and space-split before tagging, matching the other
    feature extractors in this notebook.
    """
    tagged = nltk.pos_tag(x.lower().split(" "))
    return sum(1 for _, tag in tagged if tag == pos)
In [447]:
# Working frame with just the label and the cleaned question text.
df_ql = df[['Category', 'text_clean']].copy()
In [448]:
# Attach to each row the list of ALL cleaned questions in its category.
# Perf fix: the original lambda re-ran df.groupby(...).get_group(...) for
# every single row (quadratic). Precompute the per-category lists once and
# map them in by label.
questions_by_category = df.groupby('Category')['text_clean'].apply(list)
df_ql['all_questions'] = df_ql['Category'].map(questions_by_category)
In [449]:
# Keep one row per category, sorted by category label, re-indexed 0..6.
df_ql.drop_duplicates(subset='Category', inplace=True)
# Bug fix: DataFrame.sort(columns=...) was removed from pandas;
# sort_values(by=...) is the replacement.
df_ql.sort_values(by='Category', inplace=True)
df_ql.reset_index(drop=True, inplace=True)
In [450]:
def get_norm_words(collection):
    """Tokenize one question string and return its lowercase content words.

    Keeps only tokens that are not in the module-level English `stopwords`
    list and are longer than 3 characters.
    """
    # Verbose regex: dotted abbreviations, word characters, currency/percent
    # numbers, and ellipses.
    pattern = r'''(?x)
    ([A-Z]\.)+
    | \w+
    | \$?\d+(\.\d+)?%?
    | \.\.\.
    '''
    tokens = nltk.regexp_tokenize(collection, pattern)
    return [t.lower() for t in tokens
            if t.lower() not in stopwords and len(t) > 3]
In [451]:
# One list of cleaned questions per category. df_ql was de-duplicated and
# sorted by Category above, so row i holds category i+1's questions.
# Cleanup: the original cell opened with a bare `df_ql.all_questions[0]`
# expression that displayed nothing (it was not the last statement) — dead
# code, removed. list(...) replaces the identity list comprehensions.
category1_questions = list(df_ql.all_questions[0])
category2_questions = list(df_ql.all_questions[1])
category3_questions = list(df_ql.all_questions[2])
category4_questions = list(df_ql.all_questions[3])
category5_questions = list(df_ql.all_questions[4])
category6_questions = list(df_ql.all_questions[5])
category7_questions = list(df_ql.all_questions[6])
In [452]:
def collect_category_words(questions):
    """Concatenate the normalized words of every question in `questions`."""
    collected = []
    for q in questions:
        collected += get_norm_words(q)
    return collected

category1_words = collect_category_words(category1_questions)
category2_words = collect_category_words(category2_questions)
# Bug fix: the original loop for category 3 iterated over the length of
# category3_questions but collected words from category1_questions, so
# category 3's vocabulary was a slice of category 1's.
category3_words = collect_category_words(category3_questions)
category4_words = collect_category_words(category4_questions)
category5_words = collect_category_words(category5_questions)
category6_words = collect_category_words(category6_questions)
category7_words = collect_category_words(category7_questions)
In [453]:
def norm_unigram_fd(collection_words):
    """Frequency distribution of `collection_words`, normalized by corpus frequency.

    Each word's in-category count is divided by (1 + its count in the global
    `words` token stream); the +1 smooths the denominator.
    """
    fd_collection = nltk.FreqDist(collection_words)
    fd_corpus = nltk.FreqDist(words)
    fd_norm = fd_collection.copy()
    for w, count in fd_collection.items():
        fd_norm[w] = float(count) / (1 + fd_corpus[w])
    return fd_norm
In [454]:
# One normalized frequency distribution per question category
# (df_ql rows are sorted by Category, so fdN corresponds to category N).
fd1 = norm_unigram_fd(category1_words)
fd2 = norm_unigram_fd(category2_words)
fd3 = norm_unigram_fd(category3_words)
fd4 = norm_unigram_fd(category4_words)
fd5 = norm_unigram_fd(category5_words)
fd6 = norm_unigram_fd(category6_words)
fd7 = norm_unigram_fd(category7_words)
In [455]:
# Topic synsets used as anchors for the WordNet similarity features.
finance = wn.synset('finance.n.01')
internet = wn.synset('internet.n.01')
entertainment = wn.synset('entertainment.n.01')
music = wn.synset('music.n.01')
relationship = wn.synset('relationship.n.01')
family = wn.synset('family.n.01')
education = wn.synset('education.n.01')
health = wn.synset('health.n.01')
science = wn.synset('science.n.01')
math = wn.synset('mathematics.n.01')
In [522]:
# Number of top category unigrams used for the fdN count features.
amt = 500

# Perf fix: the original lambdas recomputed fd.most_common(amt) inside
# apply, i.e. once per row; hoist the top-word lists out of the loops.
train_fd_top = [[w for w, _ in fd.most_common(amt)]
                for fd in (fd1, fd2, fd3, fd4, fd5, fd6, fd7)]

train_fd1_feature = df_train['Text'].apply(lambda x: unigram_feature(x, train_fd_top[0]))
train_fd2_feature = df_train['Text'].apply(lambda x: unigram_feature(x, train_fd_top[1]))
train_fd3_feature = df_train['Text'].apply(lambda x: unigram_feature(x, train_fd_top[2]))
train_fd4_feature = df_train['Text'].apply(lambda x: unigram_feature(x, train_fd_top[3]))
train_fd5_feature = df_train['Text'].apply(lambda x: unigram_feature(x, train_fd_top[4]))
train_fd6_feature = df_train['Text'].apply(lambda x: unigram_feature(x, train_fd_top[5]))
train_fd7_feature = df_train['Text'].apply(lambda x: unigram_feature(x, train_fd_top[6]))

# Bug fix: ("what") is not a tuple — it is the plain string "what" — so
# unigram_feature iterated its characters and the question-word features
# were effectively always zero. Pass one-element lists instead.
train_what_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ["what"]))
train_who_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ["who"]))
train_why_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ["why"]))
train_how_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ["how"]))

# Wu-Palmer similarity of each question to the topic anchor synsets.
train_finance_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, finance))
train_internet_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, internet))
train_entertainment_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, entertainment))
train_music_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, music))
train_relationship_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, relationship))
train_family_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, family))
train_education_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, education))
train_health_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, health))
train_science_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, science))
train_math_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, math))

# Surface features: POS counts, character length, digit-character count.
train_posV_feature = df_train['Text'].apply(lambda x: pos_feature(x, "VB"))
train_posJJ_feature = df_train['Text'].apply(lambda x: pos_feature(x, "JJ"))
train_length_feature = df_train['Text'].apply(lambda x: length_feature(x))
train_numeric_feature = df_train['Text'].apply(lambda x: numeric_feature(x))
In [567]:
# Assemble the training design matrix. The key set (and, under
# insertion-ordered pandas, the key order) here defines the column layout
# the model is fit on; the dev/test frames must match it.
# NOTE(review): the similarity and posJJ features are left commented out —
# presumably they hurt dev accuracy; confirm before re-enabling.
df_train_features = pd.DataFrame({'fd1': train_fd1_feature,
'fd2': train_fd2_feature,
'fd3': train_fd3_feature,
'fd4': train_fd4_feature,
'fd5': train_fd5_feature,
'fd6': train_fd6_feature,
'fd7': train_fd7_feature,
'what': train_what_feature,
'who': train_who_feature,
'why': train_why_feature,
'how': train_how_feature,
# 'finance': train_finance_sim_feature,
# 'internet': train_internet_sim_feature,
# 'entertainment': train_entertainment_sim_feature,
# 'music':train_music_sim_feature,
# 'relationship': train_relationship_sim_feature,
# 'family': train_family_sim_feature,
# 'education': train_education_sim_feature,
# 'health':train_health_sim_feature,
# 'science': train_science_sim_feature,
# 'math':train_math_sim_feature,
'numbers': train_numeric_feature,
'posV': train_posV_feature,
#'posJJ': train_posJJ_feature,
'length': train_length_feature})
In [520]:
# Dev-set features, mirroring the training feature cell.
# Perf fix: hoist fd.most_common(amt) out of the per-row lambdas.
dev_fd_top = [[w for w, _ in fd.most_common(amt)]
              for fd in (fd1, fd2, fd3, fd4, fd5, fd6, fd7)]

dev_fd1_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, dev_fd_top[0]))
dev_fd2_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, dev_fd_top[1]))
dev_fd3_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, dev_fd_top[2]))
dev_fd4_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, dev_fd_top[3]))
dev_fd5_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, dev_fd_top[4]))
dev_fd6_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, dev_fd_top[5]))
dev_fd7_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, dev_fd_top[6]))

# Bug fix: ("what") is the plain string "what", which unigram_feature
# iterated character-by-character; pass one-element lists instead.
dev_what_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ["what"]))
dev_who_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ["who"]))
dev_why_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ["why"]))
dev_how_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ["how"]))

dev_numeric_feature = df_dev['Text'].apply(lambda x: numeric_feature(x))
dev_length_feature = df_dev['Text'].apply(lambda x: length_feature(x))

# Wu-Palmer similarity of each question to the topic anchor synsets.
dev_finance_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, finance))
dev_internet_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, internet))
dev_entertainment_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, entertainment))
dev_music_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, music))
dev_relationship_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, relationship))
dev_family_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, family))
dev_education_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, education))
dev_health_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, health))
dev_science_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, science))
dev_math_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, math))

dev_posV_feature = df_dev['Text'].apply(lambda x: pos_feature(x, "VB"))
dev_posJJ_feature = df_dev['Text'].apply(lambda x: pos_feature(x, "JJ"))
In [568]:
# Dev design matrix. Consistency fix: keys are listed in the SAME order as
# df_train_features ('numbers', 'posV', 'length' last, in that order). The
# original cell listed 'length'/'posV'/'numbers' in a different order, which
# misaligns columns under insertion-order-preserving pandas when sklearn
# consumes the frames positionally.
df_dev_features = pd.DataFrame({'fd1': dev_fd1_feature,
                                'fd2': dev_fd2_feature,
                                'fd3': dev_fd3_feature,
                                'fd4': dev_fd4_feature,
                                'fd5': dev_fd5_feature,
                                'fd6': dev_fd6_feature,
                                'fd7': dev_fd7_feature,
                                'what': dev_what_feature,
                                'who': dev_who_feature,
                                'why': dev_why_feature,
                                'how': dev_how_feature,
                                # 'finance': dev_finance_sim_feature,
                                # 'internet': dev_internet_sim_feature,
                                # 'entertainment': dev_entertainment_sim_feature,
                                # 'music': dev_music_sim_feature,
                                # 'relationship': dev_relationship_sim_feature,
                                # 'family': dev_family_sim_feature,
                                # 'education': dev_education_sim_feature,
                                # 'health': dev_health_sim_feature,
                                # 'science': dev_science_sim_feature,
                                # 'math': dev_math_sim_feature,
                                'numbers': dev_numeric_feature,
                                'posV': dev_posV_feature,
                                # 'posJJ': dev_posJJ_feature,
                                'length': dev_length_feature})
In [569]:
# Fit Multinomial Naive Bayes on the training features, predict the dev set.
nb_model = nb.fit(df_train_features, df_train.Category)
nb_predictions = nb_model.predict(df_dev_features)
In [570]:
# Dev-set accuracy.
accuracy_score(df_dev.Category, nb_predictions)
Out[570]:
In [571]:
# Per-class precision/recall/F1 on the dev set. Labels are sorted so the
# names line up with classification_report's sorted label order.
class_labels = [str(label) for label in np.sort(df_train.Category.unique())]
print(classification_report(df_dev.Category, nb_predictions, target_names=class_labels))
In [572]:
# Unlabeled test set: the code below expects 'Id' and 'Text' columns.
test_set = pd.read_csv('../data/newtest.csv')
In [573]:
# Test-set features, mirroring the training feature cell.
# Perf fix: hoist fd.most_common(amt) out of the per-row lambdas.
test_fd_top = [[w for w, _ in fd.most_common(amt)]
               for fd in (fd1, fd2, fd3, fd4, fd5, fd6, fd7)]

test_fd1_feature = test_set['Text'].apply(lambda x: unigram_feature(x, test_fd_top[0]))
test_fd2_feature = test_set['Text'].apply(lambda x: unigram_feature(x, test_fd_top[1]))
test_fd3_feature = test_set['Text'].apply(lambda x: unigram_feature(x, test_fd_top[2]))
test_fd4_feature = test_set['Text'].apply(lambda x: unigram_feature(x, test_fd_top[3]))
test_fd5_feature = test_set['Text'].apply(lambda x: unigram_feature(x, test_fd_top[4]))
test_fd6_feature = test_set['Text'].apply(lambda x: unigram_feature(x, test_fd_top[5]))
test_fd7_feature = test_set['Text'].apply(lambda x: unigram_feature(x, test_fd_top[6]))

# Bug fix: ("what") is the plain string "what", which unigram_feature
# iterated character-by-character; pass one-element lists instead.
test_what_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ["what"]))
test_who_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ["who"]))
test_why_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ["why"]))
test_how_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ["how"]))

test_numeric_feature = test_set['Text'].apply(lambda x: numeric_feature(x))
test_pos_feature = test_set['Text'].apply(lambda x: pos_feature(x, "VB"))
test_length_feature = test_set['Text'].apply(lambda x: length_feature(x))
In [574]:
# Test design matrix. Consistency fixes vs the original cell:
# 1) the verb-POS column is named 'posV' to match the column the model was
#    trained on (it was 'pos' here, 'posV' in df_train_features);
# 2) keys are listed in the same order as df_train_features so the column
#    layout matches positionally as well as by name.
df_test_features = pd.DataFrame({'fd1': test_fd1_feature,
                                 'fd2': test_fd2_feature,
                                 'fd3': test_fd3_feature,
                                 'fd4': test_fd4_feature,
                                 'fd5': test_fd5_feature,
                                 'fd6': test_fd6_feature,
                                 'fd7': test_fd7_feature,
                                 'what': test_what_feature,
                                 'who': test_who_feature,
                                 'why': test_why_feature,
                                 'how': test_how_feature,
                                 'numbers': test_numeric_feature,
                                 'posV': test_pos_feature,
                                 'length': test_length_feature})
In [575]:
# Predict the test set and write the submission file.
nb_predictions = nb_model.predict(df_test_features)
# Cleanup: predict returns a plain ndarray already aligned with test_set's
# rows; the original `nb_predictions[test_set.index]` was an identity lookup
# that would silently mis-select if the index were ever not the default
# RangeIndex.
test_set["category"] = nb_predictions
output = test_set[['Id', 'category']]
output.to_csv('../data/solution.csv', index=False)
In [ ]:
In [ ]:
In [ ]: