In [434]:
import re
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# NOTE(review): this rebinds the name `stopwords` from the imported corpus
# module to a plain Python list of English stopwords; the module itself is
# no longer reachable under that name after this line.
stopwords = stopwords.words('english')
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
# Brown-corpus information content for WordNet similarity measures.
# NOTE(review): loaded but never used in the visible code — confirm needed.
brown_ic = wordnet_ic.ic('ic-brown.dat')
# Multinomial Naive Bayes classifier, fit later on the feature frames.
nb = MultinomialNB()
In [435]:
# Training data: the code below expects at least 'Category' (label) and
# 'Text' (question string) columns.
df = pd.read_csv('../data/newtrain.csv')
In [436]:
# Quick sanity check of the raw frame.
df.head()
Out[436]:
In [437]:
# Shuffle the row labels. NOTE(review): no RNG seed is set anywhere visible,
# so the train/dev split is not reproducible across runs.
random_index = np.random.permutation(df.index)
In [438]:
# .ix was deprecated and then removed from pandas. The permuted values come
# from df.index, i.e. they are row labels, so label-based .loc selects the
# same rows.
df_shuffled = df.loc[random_index]
In [439]:
# Re-index the shuffled frame 0..n-1 and eyeball the first rows.
df_shuffled.reset_index(drop=True, inplace=True)
df_shuffled[:5]
Out[439]:
In [440]:
# 80/20 train/dev split sizes (rounded row counts).
rows, columns = df_shuffled.shape
train_size = round(rows*.8)
dev_size = round(rows*.2)
In [441]:
# Split the shuffled frame into train and dev partitions.
# Bug fix: .loc slicing is end-INclusive, so the original code placed row
# `train_size` in BOTH df_train and df_dev. .iloc is end-exclusive and gives
# a clean, disjoint 80/20 split.
df_train = df_shuffled.iloc[:train_size]
df_dev = df_shuffled.iloc[train_size:train_size + dev_size].reset_index(drop=True)
In [442]:
def flatten_words(list1d, get_unique=False):
    """Flatten an iterable of whitespace-separated strings into a word list.

    Parameters
    ----------
    list1d : iterable of str
        Strings to split on whitespace.
    get_unique : bool
        When True, return the sorted set of distinct words instead of the
        full (duplicated, in-order) token stream.
    """
    tokens = [word for sentence in list1d for word in sentence.split()]
    if get_unique:
        return sorted(set(tokens))
    return tokens
In [445]:
# Lowercase, replace runs of non-alphanumerics with a space, then collapse
# whitespace and strip. Raw strings fix the invalid escape sequence '\s'
# (a DeprecationWarning today, a SyntaxError in future Python versions).
df['text_clean'] = (df['Text']
                    .apply(lambda x: re.sub(r'[^A-Za-z0-9]+', ' ', x.lower()))
                    .apply(lambda x: re.sub(r'\s+', ' ', x).strip()))
# Full (non-unique) token stream over the whole corpus; used later for the
# corpus-frequency normalization in norm_unigram_fd.
words = flatten_words(df.text_clean.values)
In [482]:
def unigram_feature(x, unigrams):
    """Count how many tokens of `x` (lowercased, space-split) appear in `unigrams`.

    Generalization / bug guard: `unigrams` may now also be a single string.
    Previously a bare string was iterated character-by-character (so
    unigram_feature(x, ("what")) counted occurrences of the tokens 'w', 'h',
    'a', 't'); a string argument is now treated as one unigram.
    """
    if isinstance(unigrams, str):
        unigrams = [unigrams]
    word_list = x.lower().split(" ")
    return sum(word_list.count(unigram) for unigram in unigrams)
def numeric_feature(x):
    """Number of numeric characters in the raw text `x`.

    NOTE(review): this iterates the string itself, so it counts digit
    *characters*, not numeric tokens (e.g. "I have 12 cats" -> 2). Confirm
    that character-level counting is the intended feature.
    """
    return sum(1 for ch in x if ch.isnumeric())
def similarity_feature(x, word):
    """Maximum Wu-Palmer similarity between `word` (a WordNet synset) and any
    noun synset of any token in `x`.

    Returns 0 when no token has a noun synset.

    Bug fix: wup_similarity can return None (no common ancestor / no path);
    the original `max(similarity, word.wup_similarity(s))` raises TypeError
    on Python 3 when that happens. None results are now skipped.
    """
    similarity = 0
    for w in x.lower().split(" "):
        for s in wn.synsets(w, pos=wn.NOUN):
            sim = word.wup_similarity(s)
            if sim is not None and sim > similarity:
                similarity = sim
    return similarity
def length_feature(x):
    """Length of the raw question text, in characters."""
    text_length = len(x)
    return text_length
def pos_feature(x, pos):
    """Count tokens of `x` whose NLTK POS tag equals `pos` exactly (e.g. "VB").

    The text is lowercased and space-split before tagging, matching the other
    feature extractors in this notebook.
    """
    tagged = nltk.pos_tag(x.lower().split(" "))
    return sum(1 for _, tag in tagged if tag == pos)
In [447]:
# Working frame with just the label and the cleaned question text.
df_ql = df[['Category', 'text_clean']].copy()
In [448]:
# Attach to each row the list of ALL cleaned questions in its category.
# Perf fix: the original lambda re-ran df.groupby(...).get_group(...) for
# every single row (quadratic). Precompute the per-category lists once and
# map them in by label.
questions_by_category = df.groupby('Category')['text_clean'].apply(list)
df_ql['all_questions'] = df_ql['Category'].map(questions_by_category)
In [449]:
# Keep one row per category, sorted by category label, re-indexed 0..6.
df_ql.drop_duplicates(subset='Category', inplace=True)
# Bug fix: DataFrame.sort(columns=...) was removed from pandas;
# sort_values(by=...) is the replacement.
df_ql.sort_values(by='Category', inplace=True)
df_ql.reset_index(drop=True, inplace=True)
In [450]:
def get_norm_words(collection):
    """Tokenize one question string and return its lowercase content words.

    Keeps only tokens that are not in the module-level English `stopwords`
    list and are longer than 3 characters.
    """
    # Verbose regex: dotted abbreviations, word characters, currency/percent
    # numbers, and ellipses.
    pattern = r'''(?x)
    ([A-Z]\.)+
    | \w+
    | \$?\d+(\.\d+)?%?
    | \.\.\.
    '''
    tokens = nltk.regexp_tokenize(collection, pattern)
    return [t.lower() for t in tokens
            if t.lower() not in stopwords and len(t) > 3]
In [451]:
# One list of cleaned questions per category. df_ql was de-duplicated and
# sorted by Category above, so row i holds category i+1's questions.
# Cleanup: the original cell opened with a bare `df_ql.all_questions[0]`
# expression that displayed nothing (it was not the last statement) — dead
# code, removed. list(...) replaces the identity list comprehensions.
category1_questions = list(df_ql.all_questions[0])
category2_questions = list(df_ql.all_questions[1])
category3_questions = list(df_ql.all_questions[2])
category4_questions = list(df_ql.all_questions[3])
category5_questions = list(df_ql.all_questions[4])
category6_questions = list(df_ql.all_questions[5])
category7_questions = list(df_ql.all_questions[6])
In [452]:
def collect_category_words(questions):
    """Concatenate the normalized words of every question in `questions`."""
    collected = []
    for q in questions:
        collected += get_norm_words(q)
    return collected

category1_words = collect_category_words(category1_questions)
category2_words = collect_category_words(category2_questions)
# Bug fix: the original loop for category 3 iterated over the length of
# category3_questions but collected words from category1_questions, so
# category 3's vocabulary was a slice of category 1's.
category3_words = collect_category_words(category3_questions)
category4_words = collect_category_words(category4_questions)
category5_words = collect_category_words(category5_questions)
category6_words = collect_category_words(category6_questions)
category7_words = collect_category_words(category7_questions)
In [453]:
def norm_unigram_fd(collection_words):
    """Frequency distribution of `collection_words`, normalized by corpus frequency.

    Each word's in-category count is divided by (1 + its count in the global
    `words` token stream); the +1 smooths the denominator.
    """
    fd_collection = nltk.FreqDist(collection_words)
    fd_corpus = nltk.FreqDist(words)
    fd_norm = fd_collection.copy()
    for w, count in fd_collection.items():
        fd_norm[w] = float(count) / (1 + fd_corpus[w])
    return fd_norm
In [454]:
# One normalized frequency distribution per question category
# (df_ql rows are sorted by Category, so fdN corresponds to category N).
fd1 = norm_unigram_fd(category1_words)
fd2 = norm_unigram_fd(category2_words)
fd3 = norm_unigram_fd(category3_words)
fd4 = norm_unigram_fd(category4_words)
fd5 = norm_unigram_fd(category5_words)
fd6 = norm_unigram_fd(category6_words)
fd7 = norm_unigram_fd(category7_words)
In [455]:
# Topic synsets used as anchors for the WordNet similarity features.
finance = wn.synset('finance.n.01')
internet = wn.synset('internet.n.01')
entertainment = wn.synset('entertainment.n.01')
music = wn.synset('music.n.01')
relationship = wn.synset('relationship.n.01')
family = wn.synset('family.n.01')
education = wn.synset('education.n.01')
health = wn.synset('health.n.01')
science = wn.synset('science.n.01')
math = wn.synset('mathematics.n.01')
In [522]:
# Number of top category unigrams used for the fdN count features.
amt = 500

# Perf fix: the original lambdas recomputed fd.most_common(amt) inside
# apply, i.e. once per row; hoist the top-word lists out of the loops.
train_fd_top = [[w for w, _ in fd.most_common(amt)]
                for fd in (fd1, fd2, fd3, fd4, fd5, fd6, fd7)]

train_fd1_feature = df_train['Text'].apply(lambda x: unigram_feature(x, train_fd_top[0]))
train_fd2_feature = df_train['Text'].apply(lambda x: unigram_feature(x, train_fd_top[1]))
train_fd3_feature = df_train['Text'].apply(lambda x: unigram_feature(x, train_fd_top[2]))
train_fd4_feature = df_train['Text'].apply(lambda x: unigram_feature(x, train_fd_top[3]))
train_fd5_feature = df_train['Text'].apply(lambda x: unigram_feature(x, train_fd_top[4]))
train_fd6_feature = df_train['Text'].apply(lambda x: unigram_feature(x, train_fd_top[5]))
train_fd7_feature = df_train['Text'].apply(lambda x: unigram_feature(x, train_fd_top[6]))

# Bug fix: ("what") is not a tuple — it is the plain string "what" — so
# unigram_feature iterated its characters and the question-word features
# were effectively always zero. Pass one-element lists instead.
train_what_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ["what"]))
train_who_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ["who"]))
train_why_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ["why"]))
train_how_feature = df_train['Text'].apply(lambda x: unigram_feature(x, ["how"]))

# Wu-Palmer similarity of each question to the topic anchor synsets.
train_finance_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, finance))
train_internet_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, internet))
train_entertainment_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, entertainment))
train_music_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, music))
train_relationship_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, relationship))
train_family_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, family))
train_education_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, education))
train_health_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, health))
train_science_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, science))
train_math_sim_feature = df_train['Text'].apply(lambda x: similarity_feature(x, math))

# Surface features: POS counts, character length, digit-character count.
train_posV_feature = df_train['Text'].apply(lambda x: pos_feature(x, "VB"))
train_posJJ_feature = df_train['Text'].apply(lambda x: pos_feature(x, "JJ"))
train_length_feature = df_train['Text'].apply(lambda x: length_feature(x))
train_numeric_feature = df_train['Text'].apply(lambda x: numeric_feature(x))
In [567]:
# Assemble the training design matrix. The key set (and, under
# insertion-ordered pandas, the key order) here defines the column layout
# the model is fit on; the dev/test frames must match it.
# NOTE(review): the similarity and posJJ features are left commented out —
# presumably they hurt dev accuracy; confirm before re-enabling.
df_train_features = pd.DataFrame({'fd1': train_fd1_feature,
'fd2': train_fd2_feature,
'fd3': train_fd3_feature,
'fd4': train_fd4_feature,
'fd5': train_fd5_feature,
'fd6': train_fd6_feature,
'fd7': train_fd7_feature,
'what': train_what_feature,
'who': train_who_feature,
'why': train_why_feature,
'how': train_how_feature,
# 'finance': train_finance_sim_feature,
# 'internet': train_internet_sim_feature,
# 'entertainment': train_entertainment_sim_feature,
# 'music':train_music_sim_feature,
# 'relationship': train_relationship_sim_feature,
# 'family': train_family_sim_feature,
# 'education': train_education_sim_feature,
# 'health':train_health_sim_feature,
# 'science': train_science_sim_feature,
# 'math':train_math_sim_feature,
'numbers': train_numeric_feature,
'posV': train_posV_feature,
#'posJJ': train_posJJ_feature,
'length': train_length_feature})
In [520]:
# Dev-set features, mirroring the training feature cell.
# Perf fix: hoist fd.most_common(amt) out of the per-row lambdas.
dev_fd_top = [[w for w, _ in fd.most_common(amt)]
              for fd in (fd1, fd2, fd3, fd4, fd5, fd6, fd7)]

dev_fd1_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, dev_fd_top[0]))
dev_fd2_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, dev_fd_top[1]))
dev_fd3_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, dev_fd_top[2]))
dev_fd4_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, dev_fd_top[3]))
dev_fd5_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, dev_fd_top[4]))
dev_fd6_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, dev_fd_top[5]))
dev_fd7_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, dev_fd_top[6]))

# Bug fix: ("what") is the plain string "what", which unigram_feature
# iterated character-by-character; pass one-element lists instead.
dev_what_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ["what"]))
dev_who_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ["who"]))
dev_why_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ["why"]))
dev_how_feature = df_dev['Text'].apply(lambda x: unigram_feature(x, ["how"]))

dev_numeric_feature = df_dev['Text'].apply(lambda x: numeric_feature(x))
dev_length_feature = df_dev['Text'].apply(lambda x: length_feature(x))

# Wu-Palmer similarity of each question to the topic anchor synsets.
dev_finance_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, finance))
dev_internet_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, internet))
dev_entertainment_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, entertainment))
dev_music_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, music))
dev_relationship_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, relationship))
dev_family_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, family))
dev_education_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, education))
dev_health_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, health))
dev_science_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, science))
dev_math_sim_feature = df_dev['Text'].apply(lambda x: similarity_feature(x, math))

dev_posV_feature = df_dev['Text'].apply(lambda x: pos_feature(x, "VB"))
dev_posJJ_feature = df_dev['Text'].apply(lambda x: pos_feature(x, "JJ"))
In [568]:
# Dev design matrix. Consistency fix: keys are listed in the SAME order as
# df_train_features ('numbers', 'posV', 'length' last, in that order). The
# original cell listed 'length'/'posV'/'numbers' in a different order, which
# misaligns columns under insertion-order-preserving pandas when sklearn
# consumes the frames positionally.
df_dev_features = pd.DataFrame({'fd1': dev_fd1_feature,
                                'fd2': dev_fd2_feature,
                                'fd3': dev_fd3_feature,
                                'fd4': dev_fd4_feature,
                                'fd5': dev_fd5_feature,
                                'fd6': dev_fd6_feature,
                                'fd7': dev_fd7_feature,
                                'what': dev_what_feature,
                                'who': dev_who_feature,
                                'why': dev_why_feature,
                                'how': dev_how_feature,
                                # 'finance': dev_finance_sim_feature,
                                # 'internet': dev_internet_sim_feature,
                                # 'entertainment': dev_entertainment_sim_feature,
                                # 'music': dev_music_sim_feature,
                                # 'relationship': dev_relationship_sim_feature,
                                # 'family': dev_family_sim_feature,
                                # 'education': dev_education_sim_feature,
                                # 'health': dev_health_sim_feature,
                                # 'science': dev_science_sim_feature,
                                # 'math': dev_math_sim_feature,
                                'numbers': dev_numeric_feature,
                                'posV': dev_posV_feature,
                                # 'posJJ': dev_posJJ_feature,
                                'length': dev_length_feature})
In [569]:
# Fit Multinomial Naive Bayes on the training features, predict the dev set.
nb_model = nb.fit(df_train_features, df_train.Category)
nb_predictions = nb_model.predict(df_dev_features)
In [570]:
# Dev-set accuracy.
accuracy_score(df_dev.Category, nb_predictions)
Out[570]:
In [571]:
# Per-class precision/recall/F1 on the dev set. Labels are sorted so the
# names line up with classification_report's sorted label order.
class_labels = [str(label) for label in np.sort(df_train.Category.unique())]
print(classification_report(df_dev.Category, nb_predictions, target_names=class_labels))
In [572]:
# Unlabeled test set: the code below expects 'Id' and 'Text' columns.
test_set = pd.read_csv('../data/newtest.csv')
In [573]:
# Test-set features, mirroring the training feature cell.
# Perf fix: hoist fd.most_common(amt) out of the per-row lambdas.
test_fd_top = [[w for w, _ in fd.most_common(amt)]
               for fd in (fd1, fd2, fd3, fd4, fd5, fd6, fd7)]

test_fd1_feature = test_set['Text'].apply(lambda x: unigram_feature(x, test_fd_top[0]))
test_fd2_feature = test_set['Text'].apply(lambda x: unigram_feature(x, test_fd_top[1]))
test_fd3_feature = test_set['Text'].apply(lambda x: unigram_feature(x, test_fd_top[2]))
test_fd4_feature = test_set['Text'].apply(lambda x: unigram_feature(x, test_fd_top[3]))
test_fd5_feature = test_set['Text'].apply(lambda x: unigram_feature(x, test_fd_top[4]))
test_fd6_feature = test_set['Text'].apply(lambda x: unigram_feature(x, test_fd_top[5]))
test_fd7_feature = test_set['Text'].apply(lambda x: unigram_feature(x, test_fd_top[6]))

# Bug fix: ("what") is the plain string "what", which unigram_feature
# iterated character-by-character; pass one-element lists instead.
test_what_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ["what"]))
test_who_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ["who"]))
test_why_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ["why"]))
test_how_feature = test_set['Text'].apply(lambda x: unigram_feature(x, ["how"]))

test_numeric_feature = test_set['Text'].apply(lambda x: numeric_feature(x))
test_pos_feature = test_set['Text'].apply(lambda x: pos_feature(x, "VB"))
test_length_feature = test_set['Text'].apply(lambda x: length_feature(x))
In [574]:
# Test design matrix. Consistency fixes vs the original cell:
# 1) the verb-POS column is named 'posV' to match the column the model was
#    trained on (it was 'pos' here, 'posV' in df_train_features);
# 2) keys are listed in the same order as df_train_features so the column
#    layout matches positionally as well as by name.
df_test_features = pd.DataFrame({'fd1': test_fd1_feature,
                                 'fd2': test_fd2_feature,
                                 'fd3': test_fd3_feature,
                                 'fd4': test_fd4_feature,
                                 'fd5': test_fd5_feature,
                                 'fd6': test_fd6_feature,
                                 'fd7': test_fd7_feature,
                                 'what': test_what_feature,
                                 'who': test_who_feature,
                                 'why': test_why_feature,
                                 'how': test_how_feature,
                                 'numbers': test_numeric_feature,
                                 'posV': test_pos_feature,
                                 'length': test_length_feature})
In [575]:
# Predict the test set and write the submission file.
nb_predictions = nb_model.predict(df_test_features)
# Cleanup: predict returns a plain ndarray already aligned with test_set's
# rows; the original `nb_predictions[test_set.index]` was an identity lookup
# that would silently mis-select if the index were ever not the default
# RangeIndex.
test_set["category"] = nb_predictions
output = test_set[['Id', 'category']]
output.to_csv('../data/solution.csv', index=False)
In [ ]:
In [ ]:
In [ ]: