In [434]:
import re
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
stopwords = stopwords.words('english')
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')
nb = MultinomialNB()

In [435]:
df = pd.read_csv('../data/newtrain.csv')

In [436]:
df.head()


Out[436]:
Category Text
0 5 why are yawns contagious? when people yawn
1 6 what is trans fat? how to reduce that? i heard...
2 1 roth ira vs 401k? what is the difference betwe...
3 1 how many planes fedex has? i heard that it is ...
4 2 what is the best photo slideshow creation appl...

In [437]:
random_index = np.random.permutation(df.index)

In [438]:
df_shuffled = df.ix[random_index]

In [439]:
df_shuffled.reset_index(drop=True, inplace=True)
df_shuffled[:5]


Out[439]:
Category Text
0 5 what does xoxo stand for?
1 5 what was james bond's wife's name? i seem to b...
2 7 how to solve magic cube ? could please tell me...
3 3 what is tia carrere's nationality?
4 7 what is the deepest trough in the world?

In [440]:
rows, columns = df_shuffled.shape
train_size = round(rows*.8)
dev_size   = round(rows*.2)

In [441]:
df_train = df_shuffled.loc[:train_size]
df_dev = df_shuffled.loc[train_size:dev_size+train_size].reset_index(drop=True)

In [442]:
def flatten_words(list1d, get_unique=False):
    qa = [s.split() for s in list1d]
    if get_unique:
        return sorted(list(set([w for sent in qa for w in sent])))
    else:
        return [w for sent in qa for w in sent]

In [445]:
df['text_clean'] = df['Text'].apply(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x.lower()))\
                             .apply(lambda x: re.sub('\s+', ' ', x).strip())
words = flatten_words(df.text_clean.values)

In [482]:
def unigram_feature(x, unigrams):
    word_list = x.lower().split(" ")
    count = 0
    for unigram in unigrams:
        count += word_list.count(unigram)
    return count
def numeric_feature(x):
    return sum(w.isnumeric() for w in x)
def similarity_feature(x, word):
    word_list = x.lower().split(" ")
    similarity = 0
    for w in word_list:
        for s in wn.synsets(w, pos=wn.NOUN):
            similarity = max(similarity, word.wup_similarity(s)) 
    return similarity
def length_feature(x):
    return len(x)
def pos_feature(x, pos):
    word_list = x.lower().split(" ")
    t = nltk.pos_tag(word_list)
    count = 0
    for w in t:
        if w[1] == pos:
                count+=1
    return count

In [447]:
df_ql = df.copy()
df_ql = df_ql[['Category', 'text_clean']]

In [448]:
df_ql['all_questions'] = df_ql.apply(lambda row:
                                     df.groupby('Category').get_group(row['Category'])['text_clean'].tolist(),
                                     axis=1)

In [449]:
df_ql.drop_duplicates(subset='Category', inplace=True)
df_ql.sort(columns='Category', inplace=True)
df_ql.reset_index(drop=True, inplace=True)

In [450]:
def get_norm_words(collection):
    pattern = r'''(?x)    
         ([A-Z]\.)+        
       | \w+        
       | \$?\d+(\.\d+)?%?  
       | \.\.\.            
     '''
    tokens = nltk.regexp_tokenize(collection, pattern)
    collection_words = [w.lower() for w in tokens if w.lower() not in stopwords and len(w) > 3 ]
    return collection_words

In [451]:
df_ql.all_questions[0]
category1_questions = [q for q in df_ql.all_questions[0]]
category2_questions = [q for q in df_ql.all_questions[1]]
category3_questions = [q for q in df_ql.all_questions[2]]
category4_questions = [q for q in df_ql.all_questions[3]]
category5_questions = [q for q in df_ql.all_questions[4]]
category6_questions = [q for q in df_ql.all_questions[5]]
category7_questions = [q for q in df_ql.all_questions[6]]

In [452]:
category1_words = []
for i in range(len(category1_questions)):
    category1_words += get_norm_words(category1_questions[i])
category2_words = []
for i in range(len(category2_questions)):
    category2_words += get_norm_words(category2_questions[i])
category3_words = []
for i in range(len(category3_questions)):
    category3_words += get_norm_words(category1_questions[i])
category4_words = []
for i in range(len(category4_questions)):
    category4_words += get_norm_words(category4_questions[i])
category5_words = []
for i in range(len(category5_questions)):
    category5_words += get_norm_words(category5_questions[i])
category6_words = []
for i in range(len(category6_questions)):
    category6_words += get_norm_words(category6_questions[i])
category7_words = []
for i in range(len(category7_questions)):
    category7_words += get_norm_words(category7_questions[i])

In [453]:
def norm_unigram_fd(collection_words):
    fd_collection = nltk.FreqDist(collection_words) 
    fd = nltk.FreqDist(words) 
    fd_norm = fd_collection.copy()
    for w in fd_collection.keys():
        fd_norm[w] = (float(fd_collection[w]) / (1+ fd[w]))
    return fd_norm

In [454]:
fd1 = norm_unigram_fd(category1_words)
fd2 = norm_unigram_fd(category2_words)
fd3 = norm_unigram_fd(category3_words)
fd4 = norm_unigram_fd(category4_words)
fd5 = norm_unigram_fd(category5_words)
fd6 = norm_unigram_fd(category6_words)
fd7 = norm_unigram_fd(category7_words)

In [455]:
finance = wn.synset('finance.n.01')
internet = wn.synset('internet.n.01')
entertainment = wn.synset('entertainment.n.01')
music = wn.synset('music.n.01')
relationship = wn.synset('relationship.n.01')
family = wn.synset('family.n.01')
education = wn.synset('education.n.01')
health = wn.synset('health.n.01')
science = wn.synset('science.n.01')
math = wn.synset('mathematics.n.01')

In [522]:
amt = 500
train_fd1_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd1.most_common(amt)])))
train_fd2_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd2.most_common(amt)])))
train_fd3_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd3.most_common(amt)])))
train_fd4_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd4.most_common(amt)])))
train_fd5_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd5.most_common(amt)])))
train_fd6_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd6.most_common(amt)])))
train_fd7_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd7.most_common(amt)])))

train_what_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ("what")))
train_who_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ("who")))
train_why_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ("why")))
train_how_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ("how")))

train_finance_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, finance))
train_internet_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, internet))
train_entertainment_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, entertainment))
train_music_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, music))
train_relationship_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, relationship))
train_family_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, family))
train_education_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, education))
train_health_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, health))
train_science_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, science))
train_math_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, math))

train_posV_feature = df_train['Text'].apply(lambda x: pos_feature(x, "VB"))
train_posJJ_feature = df_train['Text'].apply(lambda x: pos_feature(x, "JJ"))

train_length_feature = df_train['Text'].apply(lambda x: length_feature(x))
train_numeric_feature = df_train['Text'].apply(lambda x: numeric_feature(x))

In [567]:
df_train_features = pd.DataFrame({'fd1': train_fd1_feature, 
                                  'fd2': train_fd2_feature, 
                                  'fd3': train_fd3_feature,
                                  'fd4': train_fd4_feature, 
                                  'fd5': train_fd5_feature, 
                                  'fd6': train_fd6_feature,
                                  'fd7': train_fd7_feature,
                                  'what': train_what_feature, 
                                  'who': train_who_feature, 
                                  'why': train_why_feature,
                                  'how': train_how_feature,
#                                    'finance': train_finance_sim_feature, 
#                                   'internet': train_internet_sim_feature,
#                                   'entertainment': train_entertainment_sim_feature, 
#                                   'music':train_music_sim_feature, 
#                                   'relationship': train_relationship_sim_feature, 
#                                   'family': train_family_sim_feature,
#                                   'education': train_education_sim_feature, 
#                                   'health':train_health_sim_feature,
#                                   'science': train_science_sim_feature, 
#                                   'math':train_math_sim_feature,
                                 'numbers': train_numeric_feature,
                                 'posV': train_posV_feature,
                                #'posJJ': train_posJJ_feature,
                                 'length': train_length_feature})

In [520]:
dev_fd1_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd1.most_common(amt)])))
dev_fd2_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd2.most_common(amt)])))
dev_fd3_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd3.most_common(amt)])))
dev_fd4_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd4.most_common(amt)])))
dev_fd5_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd5.most_common(amt)])))
dev_fd6_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd6.most_common(amt)])))
dev_fd7_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd7.most_common(amt)])))
dev_what_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ("what")))
dev_who_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ("who")))
dev_why_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ("why")))
dev_how_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ("how")))
dev_numeric_feature = df_dev['Text'].apply(lambda x: numeric_feature(x))
dev_length_feature = df_dev['Text'].apply(lambda x: length_feature(x))
dev_finance_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, finance))
dev_internet_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, internet))
dev_entertainment_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, entertainment))
dev_music_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, music))
dev_relationship_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, relationship))
dev_family_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, family))
dev_education_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, education))
dev_health_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, health))
dev_science_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, science))
dev_math_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, math))
dev_posV_feature = df_dev['Text'].apply(lambda x: pos_feature(x, "VB"))
dev_posJJ_feature = df_dev['Text'].apply(lambda x: pos_feature(x, "JJ"))

In [568]:
df_dev_features = pd.DataFrame({'fd1': dev_fd1_feature, 
                                  'fd2': dev_fd2_feature, 
                                  'fd3': dev_fd3_feature,
                                  'fd4': dev_fd4_feature, 
                                  'fd5': dev_fd5_feature, 
                                  'fd6': dev_fd6_feature,
                                  'fd7': dev_fd7_feature,
                                  'what': dev_what_feature, 
                                  'who': dev_who_feature, 
                                  'why': dev_why_feature,
                                  'how': dev_how_feature,
#                                  'finance': dev_finance_sim_feature, 
#                                   'internet': dev_internet_sim_feature,
#                                   'entertainment': dev_entertainment_sim_feature, 
#                                   'music': dev_music_sim_feature, 
#                                   'relationship': dev_relationship_sim_feature, 
#                                   'family': dev_family_sim_feature,
#                                   'education': dev_education_sim_feature, 
#                                   'health': dev_health_sim_feature,
#                                   'science': dev_science_sim_feature, 
#                                   'math': dev_math_sim_feature,
                                'length': dev_length_feature,
                                 'posV': dev_posV_feature,
                                #'posJJ': dev_posJJ_feature,
                                  'numbers': dev_numeric_feature})

In [569]:
nb_model = nb.fit(df_train_features, df_train.Category)
nb_predictions = nb_model.predict(df_dev_features)

In [570]:
accuracy_score(df_dev.Category, nb_predictions)


Out[570]:
0.71481481481481479

In [571]:
class_labels = np.sort(df_train.Category.unique())
class_labels = [str(l) for l in class_labels]

print(classification_report(df_dev.Category, nb_predictions, target_names=class_labels))


             precision    recall  f1-score   support

          1       0.75      0.66      0.70       158
          2       0.76      0.74      0.75        76
          3       0.44      0.60      0.51        73
          4       0.69      0.71      0.70        70
          5       0.84      0.87      0.85        54
          6       0.84      0.84      0.84        57
          7       0.86      0.69      0.77        52

avg / total       0.73      0.71      0.72       540


In [572]:
test_set = pd.read_csv('../data/newtest.csv')

In [573]:
test_fd1_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd1.most_common(amt)])))
test_fd2_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd2.most_common(amt)])))
test_fd3_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd3.most_common(amt)])))
test_fd4_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd4.most_common(amt)])))
test_fd5_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd5.most_common(amt)])))
test_fd6_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd6.most_common(amt)])))
test_fd7_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ([w[0] for w in fd7.most_common(amt)])))
test_what_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ("what")))
test_who_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ("who")))
test_why_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ("why")))
test_how_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ("how")))
test_numeric_feature = test_set['Text'].apply(lambda x: numeric_feature(x))
test_pos_feature = test_set['Text'].apply(lambda x: pos_feature(x, "VB"))
test_length_feature = test_set['Text'].apply(lambda x: length_feature(x))

In [574]:
df_test_features = pd.DataFrame({'fd1': test_fd1_feature, 
                                  'fd2': test_fd2_feature, 
                                  'fd3': test_fd3_feature,
                                  'fd4': test_fd4_feature, 
                                  'fd5': test_fd5_feature, 
                                  'fd6': test_fd6_feature,
                                  'fd7': test_fd7_feature,
                                  'what': test_what_feature, 
                                  'who': test_who_feature, 
                                  'why': test_why_feature,
                                  'how': test_how_feature,
                                  'length': test_length_feature,
                                  'pos': test_pos_feature,
                                  'numbers': test_numeric_feature})

In [575]:
nb_predictions = nb_model.predict(df_test_features)
test_set["category"] = nb_predictions[test_set.index]
output = test_set[['Id', 'category']]
output.to_csv('../data/solution.csv', index=False)

In [ ]:


In [ ]:


In [ ]: