NLP Testing Notebook

NLTK Position Tags:

('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')

and is CC, a coordinating conjunction; now and completely are RB, or adverbs; for is IN, a preposition; something is NN, a noun; and different is JJ, an adjective.

('They', 'PRP'), ('refuse', 'VBP'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'), ('to', 'TO'), ('obtain', 'VB'), ('the', 'DT'), ('refuse', 'NN'), ('permit', 'NN')

refuse and permit both appear as a present tense verb (VBP) and a noun (NN). E.g. refUSE is a verb meaning "deny," while REFuse is a noun meaning "trash"


In [1]:
from pattern.web import Twitter
from pattern.en import tag
from pattern.vector import NB, count
import sys
import time

twitter, classifier = Twitter(language="en"), NB(baseline="UNDEFINED")

def train_model(n_pts, search_terms, category, category_count):
    print("Training " + str(n_pts*100) + " data points for " + str(category))
    for i in range(1, n_pts):
        for tweet in twitter.search(search_terms, start=i, count=100):
            s = tweet.text.lower()
            p = category
            category_count+=1
            v = tag(s)
            v = [word for word, pos in v if (pos == "NN" or pos == "VB")]
            v = count(v) # {'sweet': 1}
            if v:
                classifier.train(v, type=p)
                sys.stdout.write('\r')
                sys.stdout.write(str(int(category_count/(n_pts*100)*100)) + "% : " + str(v))
    sys.stdout.write('\r')
    print("Finished!")
    print("Number of data points: " + str(category_count) + "\n")

In [2]:
n = 30
happy = 0
sad = 0

train_model(n, "shiok OR swee OR perfect OR #happy", "HAPPY", happy)
time.sleep(1)
train_model(n, "sian OR shag OR suay OR fml OR #sad", "SAD", sad)


Training 3000 data points for HAPPY
Finished!uck': 1}, 'art': 1, 'demon': 1, 'rt': 1, 'diggity': 1}2, 'bb11': 1, 'biggboss11': 1, 'https://t.co/9dowprwtog': 1}friday': 1, 'package': 1, 'rt': 2}1, 'rt': 1}m!…': 1}1} 'eye': 1}ffix': 1, 'rt': 1}'@marionspekker': 1, '@marienassar': 1}
Number of data points: 2742

Training 3000 data points for SAD
Finished!sshole': 1, 'flooring': 1, 'wood': 1}: 1, 'love': 1, 'life': 1}ult': 1, 'https://t.co/xa5yhucnoj': 1, 'whitney': 2, 'houston': 1} 1}}ng': 1} 1}, 'dami': 1, 'worldwidemayor': 1}1, 'https://t.co/7di1muz8em': 1, '@salnpage': 1}
Number of data points: 2900


In [3]:
def evaluate(word):
    category = classifier.classify(word)
    return ("The word " + str(word) + " is " + str(category))
    
words = ("pangseh","nasi lemak","food","breakfast","lunch","dinner","MRT","school","trip","work","home","family","garden","play","train","bus","KFC","SAF","book out","camp","army","navy","air force",)
    
for word in words:
    print(evaluate(word))


The word pangseh is SAD
The word nasi lemak is SAD
The word food is SAD
The word breakfast is SAD
The word lunch is HAPPY
The word dinner is HAPPY
The word MRT is SAD
The word school is SAD
The word trip is HAPPY
The word work is SAD
The word home is SAD
The word family is SAD
The word garden is SAD
The word play is SAD
The word train is SAD
The word bus is SAD
The word KFC is SAD
The word SAF is SAD
The word book out is HAPPY
The word camp is SAD
The word army is HAPPY
The word navy is SAD
The word air force is HAPPY

In [ ]: