In [47]:
from textblob.classifiers import NaiveBayesClassifier
from textblob import TextBlob
from textblob.np_extractors import ConllExtractor
import MySQLdb
import MySQLdb.cursors
from bs4 import BeautifulSoup
import time

# Train and evaluate a Naive Bayes classifier that predicts, from an
# article's title + summary, whether it ends up heavily commented ("pos",
# comments > 200) or lightly commented ("neg", comments < 100).

# Wall-clock timestamp of the run start (%I is the 12-hour clock, no AM/PM).
start = time.strftime("%I:%M:%S")

# NOTE(review): this is a set holding ONE string, not a set of characters,
# and nothing in this cell references it. Left unchanged in case another
# cell depends on the current value -- confirm the intent.
USELESS_CHARS = {"[]{}\"'`:()00123456789"}

trainPosSelect = "SELECT * FROM gathered.gatherer_article where source_title like '%ashdo%' and comments > 200 and active = 1 order by updated desc limit 500;"
trainNegSelect = "SELECT * FROM gathered.gatherer_article where source_title like '%ashdo%' and comments < 100 and active = 1 order by updated desc limit 500;"

# NOTE(review): credentials are hard-coded; consider loading them from the
# environment or a config file that is kept out of version control.
dbConfig = {"user":"gatherer", "passwd":"gatherer@@", "host":"localhost", "db":"gathered", "cursorclass": MySQLdb.cursors.DictCursor}


def _fetch_labeled(cursor, query, label):
    """Run *query* and return a list of ``(plain_text, label)`` tuples.

    Each row's ``title`` and ``summary`` columns are joined with a space and
    stripped of HTML markup with BeautifulSoup. A column that comes back as
    ``None`` (SQL NULL) is treated as an empty string so the concatenation
    cannot raise ``TypeError``.
    """
    cursor.execute(query)
    samples = []
    for row in cursor.fetchall():
        markup = (row['title'] or "") + " " + (row['summary'] or "")
        samples.append((BeautifulSoup(markup, 'lxml').text, label))
    return samples


# Only used by the optional TextBlob-based experiments; kept so the name
# stays available to them.
extractor = ConllExtractor()

# Always release the cursor and the connection, even if a query fails.
# Both names remain bound afterwards, but refer to closed objects.
connection = MySQLdb.connect(**dbConfig)
try:
    cursor = connection.cursor()
    try:
        trainPos = _fetch_labeled(cursor, trainPosSelect, "pos")
        trainNeg = _fetch_labeled(cursor, trainNegSelect, "neg")
    finally:
        cursor.close()
finally:
    connection.close()

# Split each class in half: the first half trains the classifier, the second
# half is held out for testing. The queries order by `updated desc`, so the
# training half holds the more recently updated rows of each class.
midPosPoint = len(trainPos) // 2
midNegPoint = len(trainNeg) // 2
train = trainPos[:midPosPoint] + trainNeg[:midNegPoint]
test = trainPos[midPosPoint:] + trainNeg[midNegPoint:]

cl = NaiveBayesClassifier(train)

# `.>10` right-aligns the value in a 10-character field padded with dots.
print("Test Accuracy: {:.>10}".format(cl.accuracy(test)))

# show_informative_features() prints the table itself and returns None, so
# wrapping it in print() only added a stray "None" line to the output.
cl.show_informative_features(15)

finish = time.strftime("%I:%M:%S")
print(start, finish)
# for article in test:
#      print("\nexpecting {}".format(article[1]))
#      prob_dist = cl.prob_classify(article[0])   
#      #print(round(prob_dist.prob("neg"), 2), round(prob_dist.prob("pos"), 2))
#      if(round(prob_dist.prob("neg"), 2) > .75):
#         print('resulted: neg: {0}'.format(str(round(prob_dist.prob("neg"), 2))))
#      elif(round(prob_dist.prob("pos"), 2) > .75):
#         print('resulted: pos: {0}'.format(str(round(prob_dist.prob("pos"), 2))))


Test Accuracy: 0.754
Most Informative Features
            contains(Do) = True              pos : neg    =      9.0 : 1.0
        contains(effect) = True              pos : neg    =      8.3 : 1.0
      contains(response) = True              pos : neg    =      7.7 : 1.0
       contains(workers) = True              pos : neg    =      7.7 : 1.0
     contains(financial) = True              pos : neg    =      7.7 : 1.0
        contains(global) = True              pos : neg    =      7.7 : 1.0
     contains(countries) = True              pos : neg    =      7.0 : 1.0
        contains(school) = True              pos : neg    =      7.0 : 1.0
    contains(scientific) = True              pos : neg    =      7.0 : 1.0
contains(HughPickens.com) = True              pos : neg    =      6.6 : 1.0
         contains(offer) = True              neg : pos    =      6.3 : 1.0
            contains(Up) = True              pos : neg    =      6.3 : 1.0
        contains(nation) = True              pos : neg    =      6.3 : 1.0
         contains(young) = True              pos : neg    =      6.3 : 1.0
  contains(applications) = True              neg : pos    =      6.3 : 1.0
None
01:04:56 01:29:38