# In [47]:
from textblob.classifiers import NaiveBayesClassifier
from textblob import TextBlob
from textblob.np_extractors import ConllExtractor
import MySQLdb
import MySQLdb.cursors
from bs4 import BeautifulSoup
import time
# Wall-clock time at the start of the run, formatted as 12-hour HH:MM:SS.
start = time.strftime("%I:%M:%S")

# Individual characters regarded as noise (brackets, quotes, colon, digits).
# set(<str>) yields one element per character; a brace literal around the
# string would instead create a set holding the whole string as its single
# member, so per-character membership tests would never match.
# NOTE(review): not referenced in this part of the file - confirm any other
# users expect a set of single characters.
USELESS_CHARS = set("[]{}\"'`:()0123456789")

# Up to 500 of the most recently updated active articles whose source title
# matches '%ashdo%', with more than 200 comments.
trainPosSelect = (
    "SELECT * FROM gathered.gatherer_article "
    "where source_title like '%ashdo%' and comments > 200 and active = 1 "
    "order by updated desc limit 500;"
)
# Same selection, but restricted to articles with fewer than 100 comments.
trainNegSelect = (
    "SELECT * FROM gathered.gatherer_article "
    "where source_title like '%ashdo%' and comments < 100 and active = 1 "
    "order by updated desc limit 500;"
)

# Containers for labelled examples; each entry is a (text, label) tuple.
trainPos = []
trainNeg = []
train = []
test = []
def _fetch_labeled(cursor, query, label):
    """Run *query* on *cursor* and return one ``(text, label)`` tuple per row.

    The row's ``title`` and ``summary`` are joined with a single space and
    parsed by BeautifulSoup (lxml parser); only the resulting ``.text`` is
    kept. A ``None`` title or summary is treated as an empty string so the
    concatenation cannot raise ``TypeError``.
    """
    cursor.execute(query)
    return [
        (
            BeautifulSoup(
                (row['title'] or "") + " " + (row['summary'] or ""), 'lxml'
            ).text,
            label,
        )
        for row in cursor.fetchall()
    ]


# Connection settings; DictCursor makes each fetched row a dict keyed by
# column name, which the row['title'] / row['summary'] lookups rely on.
# NOTE(review): the password is hard-coded here - consider loading it from
# the environment or a config file kept out of version control.
dbConfig = {
    "user": "gatherer",
    "passwd": "gatherer@@",
    "host": "localhost",
    "db": "gathered",
    "cursorclass": MySQLdb.cursors.DictCursor,
}

# Only referenced by commented-out code in this part of the file; kept
# because it is a module-level name.
extractor = ConllExtractor()

connection = MySQLdb.connect(**dbConfig)
try:
    cursor = connection.cursor()
    try:
        trainPos.extend(_fetch_labeled(cursor, trainPosSelect, "pos"))
        trainNeg.extend(_fetch_labeled(cursor, trainNegSelect, "neg"))
    finally:
        # Release the cursor even if a query or the HTML parsing fails.
        cursor.close()
finally:
    # Nothing after this point reads from the database in the visible code.
    connection.close()
# Cut each labelled list at its midpoint (floor division, so for an odd
# length the second part gets the extra element).
midPosPoint, midNegPoint = len(trainPos) // 2, len(trainNeg) // 2

# First halves go to the training set, positives before negatives.
train += trainPos[:midPosPoint] + trainNeg[:midNegPoint]

# Remaining halves go to the test set, in the same order.
test += trainPos[midPosPoint:] + trainNeg[midNegPoint:]
# Fit a Naive Bayes classifier on the (text, label) pairs in `train`.
cl = NaiveBayesClassifier(train)

# Accuracy on `test`; "{:.>10}" right-aligns the value in a 10-character
# field padded with dots.
print("Test Accuracy: {:.>10}".format(cl.accuracy(test)))

# show_informative_features() prints its own table and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
cl.show_informative_features(15)

# Report start and finish wall-clock times (12-hour HH:MM:SS) side by side.
finish = time.strftime("%I:%M:%S")
print(start, finish)
# Disabled: per-article probability report that prints only predictions whose
# rounded probability exceeds .75.
# for article in test:
#     print("\nexpecting {}".format(article[1]))
#     prob_dist = cl.prob_classify(article[0])
#     #print(round(prob_dist.prob("neg"), 2), round(prob_dist.prob("pos"), 2))
#     if(round(prob_dist.prob("neg"), 2) > .75):
#         print('resulted: neg: {0}'.format(str(round(prob_dist.prob("neg"), 2))))
#     elif(round(prob_dist.prob("pos"), 2) > .75):
#         print('resulted: pos: {0}'.format(str(round(prob_dist.prob("pos"), 2))))