In [2]:
import nltk
import random
from nltk.corpus import movie_reviews
import pprint
from nltk.corpus import stopwords
stop_words = stopwords.words("english")
import pickle

There are a thousand movie reviews for both

  • positive and
  • negetive

reviews


In [3]:
movie_reviews.categories()


Out[3]:
['neg', 'pos']

Now I need to store it as

documents = [
    ('pos', ['good', 'awesome', ....]), 
    ('neg', ['ridiculous', 'horrible', ...])
]

OR

Storing it in a dictionary would also be a better idea, will try out with both

documents = {
    'pos': ['good', 'awesome', ....],
    'neg': ['ridiculous', 'horrible', ...]
}

In [4]:
documents = [(list(word for word in movie_reviews.words(fileid) if word not in stop_words), category)
            for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)
            ]
random.shuffle(documents)

Getting the list of all words to store the most frequently occuring ones


In [5]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

Making a frequency distribution of the words


In [6]:
all_words = nltk.FreqDist(all_words)
all_words.most_common(20)


Out[6]:
[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('"', 17612),
 ('it', 16107),
 ('that', 15924),
 ('-', 15595),
 (')', 11781),
 ('(', 11664),
 ('as', 11378),
 ('with', 10792),
 ('for', 9961)]

In [7]:
all_words["hate"]  ## counting the occurences of a single word


Out[7]:
134

will train only for the first 5000 top words in the list


In [8]:
feature_words = list(all_words.keys())[:5000]

Finding these feature words in documents, making our function would ease it out!


In [9]:
def find_features(document):
    words = set(document)
    feature = {}
    for w in feature_words:
        feature[w] = (w in words)
    return feature

What the below one does is, before hand we had only words and its category. But not we have the feature set (along with a boolean value of whether it is one of the most frequently used words or not)of the same word and then the category.


In [10]:
feature_sets = [(find_features(rev), category) for (rev, category) in documents]

In [11]:
feature_sets[:1]


Out[11]:
[({'murderers': False,
   'magoo': False,
   'autumn': False,
   'cheryl': False,
   'dalmatians': False,
   'inaction': False,
   'goateed': False,
   'entomologist': False,
   'filmed': False,
   '92s': False,
   'snipers': False,
   'unintentionally': False,
   'dragon': False,
   'wrinkles': False,
   'blasphemy': False,
   'forward': False,
   'butter': False,
   'interrelate': False,
   'tantor': False,
   'marx': False,
   'errs': False,
   'chews': False,
   'outlook': False,
   'keywords': False,
   'honkey': False,
   'replenishing': False,
   'shapes': False,
   'sommeliers': False,
   'grappling': False,
   'traps': False,
   'loathsome': False,
   'obscene': False,
   'korman': False,
   'hohh': False,
   'campaigns': False,
   'horizontally': False,
   'unearthing': False,
   'uhhm': False,
   'promoted': False,
   'discontented': False,
   'wallpaper': False,
   'lifespan': False,
   'excises': False,
   'bonnier': False,
   'lipnicki': False,
   'curious': False,
   'impressiveness': False,
   'were': False,
   'nouvelle': False,
   'overpraising': False,
   'orwell': False,
   'copywriters': False,
   'devoted': False,
   'unlikable': False,
   'butchery': False,
   'shadowed': False,
   'feces': False,
   'opposites': False,
   'prompter': False,
   'conceptualization': False,
   'ambience': False,
   'cinergi': False,
   'owing': False,
   'bujold': False,
   'harford': False,
   'ravenous': False,
   'stiffest': False,
   'conclude': False,
   'brand': False,
   'disarming': False,
   'joy': False,
   'parma': False,
   'lifeboats': False,
   'lurk': False,
   'drills': False,
   'winnings': False,
   'shade': False,
   'numbing': False,
   'harboring': False,
   'preconception': False,
   'exemplified': False,
   'delaware': False,
   'secures': False,
   'mistakingly': False,
   'cycle': False,
   'christabella': False,
   'koven': False,
   'fluids': False,
   'outstripped': False,
   'streetwise': False,
   'luis': False,
   'disguised': False,
   'stabilize': False,
   'cad': False,
   'motorhead': False,
   'emotion': False,
   'positronic': False,
   'blackmailed': False,
   'upn': False,
   'alright': False,
   'jointly': False,
   'disturbed': False,
   'friedkin': False,
   'frontals': False,
   'flaring': False,
   'cloaks': False,
   'gritty': False,
   'fay': False,
   'cosmopolitan': False,
   'dealer': False,
   'nieces': False,
   'goons': False,
   'soze': False,
   'waystation': False,
   'bateman': False,
   'posters': False,
   'montenegro': False,
   'passer': False,
   'sadist': False,
   'danielle': False,
   'eeriness': False,
   'chester': False,
   'appreciating': False,
   'rko': False,
   'picasso': False,
   'hamburger': False,
   'zone': False,
   'unworthy': False,
   'akroyd': False,
   'undergone': False,
   'convinced': False,
   'cinemtography': False,
   'confederation': False,
   'seperation': False,
   'castro': False,
   'southampton': False,
   'kutcher': False,
   'rienfenstal': False,
   'reprise': False,
   'decadent': False,
   'spoofing': False,
   'beaudoin': False,
   'cigarettes': False,
   'worries': False,
   'kwietniowski': False,
   'note': False,
   'gallo': False,
   'drip': False,
   '1400': False,
   'bugged': False,
   'kerrigans': False,
   'cohen': False,
   'sewing': False,
   'drainage': False,
   'singlehandedly': False,
   '84': False,
   'anime': False,
   'jeannie': False,
   'wells': False,
   'victim': False,
   'tolerant': False,
   'reviewed': False,
   'commission': False,
   'penguin': False,
   'photos': False,
   'penned': False,
   'pimple': False,
   'gladiator': False,
   'kindness': False,
   'jason': False,
   'hav': False,
   'wide': False,
   'safely': False,
   'claptrap': False,
   'grizzly': False,
   'indignantly': False,
   'accentuated': False,
   'pointlessly': False,
   'gown': False,
   'miscue': False,
   'mccabe': False,
   '1993': False,
   'glove': False,
   'boddy': False,
   'roomie': False,
   'elvira': False,
   'executed': False,
   'rhythms': False,
   'testicles': False,
   'clone': False,
   'prints': False,
   'trents': False,
   'outgunned': False,
   'spoliers': False,
   'squash': False,
   'strands': False,
   'vinci': False,
   'lebrock': False,
   'espionnage': False,
   'groveling': False,
   'weeps': False,
   'melvin': False,
   'likewise': False,
   'ball': False,
   'tuttle': False,
   'editing': False,
   'loneliness': False,
   'exhilirating': False,
   'dorff': False,
   'fearsomely': False,
   'todde': False,
   'clutching': False,
   'geezer': False,
   'unimpeachable': False,
   'eighteen': False,
   'hefnerism': False,
   'hoppe': False,
   'pretentiously': False,
   'upsurge': False,
   'permanetly': False,
   'cheesiness': False,
   'henry': False,
   'weirder': False,
   'doppelganger': False,
   'fundamental': False,
   'crunchem': False,
   'scouting': False,
   'activity': False,
   'badder': False,
   'tormey': False,
   'proudly': False,
   'undress': False,
   'humid': False,
   '18': False,
   'hors': False,
   'attackers': False,
   'sustain': False,
   'saturated': False,
   'sargeant': False,
   'nelken': False,
   'wanker': False,
   'traditional': False,
   'veronique': False,
   'slut': False,
   'apologize': False,
   'so': False,
   'stickell': False,
   'photojournalist': False,
   'converts': False,
   'rescuing': False,
   'amazing': False,
   'strickler': False,
   'sprinklers': False,
   'demonstration': False,
   'fetishizes': False,
   'prerecorded': False,
   'himself': False,
   'unsympathetic': False,
   'providence': False,
   'afficianados': False,
   '2023': False,
   'jettisoned': False,
   'anguishes': False,
   'overhyped': False,
   'barn': False,
   'minus': False,
   'steadicam': False,
   'uncharacteristic': False,
   'capping': False,
   'centerfold': False,
   'focker': False,
   'bruise': False,
   '2036': False,
   'obstructed': False,
   'marienbad_': False,
   'depth': False,
   'luckless': False,
   'roll': False,
   'survive': False,
   'exists': False,
   'loomed': False,
   'guitarist': False,
   'wiseacre': False,
   'reprieve': False,
   'aurally': False,
   'appendaged': False,
   'tempers': False,
   'insincerity': False,
   'shaped': False,
   'holiness': False,
   'bellows': False,
   'arming': False,
   'websites': False,
   'obsolete': False,
   'stinkiest': False,
   'tenny': False,
   'jerri': False,
   'balthazar': False,
   'decades': False,
   'lindberg': False,
   'swim': False,
   'supersecret': False,
   'clocks': False,
   'forgetable': False,
   'liu': False,
   'manor': False,
   'prob': False,
   'thewlis': False,
   'threateningly': False,
   'antagonists': False,
   'sidewalk': False,
   'unnerrving': False,
   'merian': False,
   'saber': False,
   'beeyatch': False,
   'jugs': False,
   'mcsorley': False,
   'boat': False,
   'rin': False,
   'dunois': False,
   'hippyish': False,
   'ronkonkoma': False,
   'irks': False,
   'sakamoto': False,
   'hue': False,
   'esposito': False,
   'fischer': False,
   'enchanting': False,
   'correct': False,
   'cobbles': False,
   'shouts': False,
   'spicebus': False,
   'synonymous': False,
   'living': False,
   'jeepers': False,
   'proclivity': False,
   'favors': False,
   'enrolls': False,
   'journal': False,
   'entrusting': False,
   'wojciech': False,
   'takei': False,
   '1600s': False,
   'impressionistic': False,
   'tempered': False,
   'infallibility': False,
   'pallid': False,
   'anakins': False,
   'quantity': False,
   'thyself': False,
   'counting': False,
   'adventurer': False,
   'bombastic': False,
   'unexpecting': False,
   'newell': False,
   'immediate': False,
   'islandnet': False,
   'disowned': False,
   'deserved': False,
   'wanted': False,
   'ranged': False,
   'mozart': False,
   'overrun': False,
   'sadistic': False,
   'saintliness': False,
   'duplication': False,
   'peaceful': False,
   'mirthless': False,
   'salting': False,
   'duplicate': False,
   'moviegoers': True,
   'irascible': False,
   'nowak': False,
   'mister': False,
   'impulsive': False,
   'summit': False,
   'unraveled': False,
   'dapper': False,
   'cowardice': False,
   'automation': False,
   'wire': False,
   'gon': False,
   'preminger': False,
   'changing': False,
   'thereby': False,
   'jesus': False,
   'stimulate': True,
   'domain': False,
   'tightest': False,
   'analogous': False,
   'driscoll': False,
   'shaft': False,
   'calculations': False,
   'gutsy': False,
   'brothels': False,
   'colours': False,
   'bossman': False,
   'exorbitantly': False,
   'outperform': False,
   'sweethearts': False,
   'similarly': False,
   'ridden': False,
   'excitable': False,
   'kramer': False,
   'gwar': False,
   'ramble': False,
   'preventing': False,
   'halloweentown': False,
   'rifle': False,
   'narrate': False,
   'hitherto': False,
   'expatriate': False,
   'hugh': False,
   'emperors': False,
   'mercedes': False,
   'flashdancer': False,
   'additive': False,
   'scissorshands': False,
   'sputtering': False,
   'mobster': False,
   'evan': False,
   'preserve': False,
   'searing': False,
   'psychedelic': False,
   'shorts': False,
   'enthused': False,
   'verdict': False,
   'leagues': False,
   'jittery': False,
   'draggy': False,
   'fretful': False,
   'anything': False,
   'unenergetically': False,
   'megalomaniac': False,
   'ship': False,
   'airbrushed': False,
   'fig': False,
   'parillaud': False,
   'garbled': False,
   'short': False,
   'swooping': False,
   'voyeuristically': False,
   'hustle': False,
   'copulation': False,
   'trent': False,
   'overcooked': False,
   'wahoo': False,
   'cartman': False,
   'occurances': False,
   'kitschy': False,
   'duels': False,
   'grilling': False,
   'glass': False,
   'symbiosis': False,
   'appropriate': False,
   'swanson': False,
   'macht': False,
   'overcasts': False,
   'overplotted': False,
   'harshly': False,
   'dated': False,
   'beetle': False,
   'borrows': False,
   'repairs': False,
   'kurgan': False,
   'barred': False,
   'gere': False,
   'atmosphere': False,
   'burbank': False,
   'reactionary': False,
   'spartacus': False,
   'invincible': False,
   'anton': False,
   'departure': False,
   'sherilyn': False,
   'covering': False,
   'decker': False,
   'pledge': False,
   'silverback': False,
   'lithuanian': False,
   'onward': False,
   'sniffs': False,
   'sob': False,
   'loftier': False,
   'events': False,
   'misdemeanors': False,
   'cramped': False,
   'stalker': False,
   'tummy': False,
   'less': False,
   'vengeful': False,
   'insignificant': False,
   'coco': False,
   'central': False,
   'cigar': False,
   'barren': False,
   'albany': False,
   'cosette': False,
   'tainsy': False,
   'wisely': False,
   'kettle': False,
   'lacked': False,
   'attorney': False,
   'fawning': False,
   'screentime': False,
   'conway': False,
   'raccoons': False,
   'potency': False,
   'silverware': False,
   'explosives': False,
   'pindar': False,
   'menno': False,
   'concluding': False,
   'bowler': False,
   'hala': False,
   'traveled': False,
   'poets': False,
   'medfield': False,
   'clues': False,
   'ax': False,
   'smartmouths': False,
   'lunching': False,
   'illnesses': False,
   'unlovable': False,
   'forum': False,
   'fussell': False,
   'bearing': False,
   'hanger': False,
   'constructed': False,
   'avenues': False,
   'wigs': False,
   'creamy': False,
   'could': False,
   'stubby': False,
   'knits': False,
   'fraction': False,
   'revoltingly': False,
   'totemic': False,
   'opar': False,
   'dee': False,
   'jasmine': False,
   'robbie': False,
   'matrix': False,
   'archery': False,
   'recouperating': False,
   'cymbals': False,
   'electrically': False,
   'thick': False,
   'kunz': False,
   'terra': False,
   'warship': False,
   'disaster': False,
   'weakling': False,
   '14th': False,
   'cousins': False,
   'mournfulness': False,
   'draw': False,
   'spacious': False,
   'abused': False,
   'reinventing': False,
   'zephyr': False,
   'letterman': False,
   'freaking': False,
   'husbands': False,
   'heart': False,
   'curtis': False,
   'sauna': False,
   'compilation': False,
   'disturbs': False,
   'juxtaposing': False,
   'gila': False,
   'outta': False,
   'shirtless': False,
   'bumcheeks': False,
   'raucously': False,
   'bottom': False,
   'cubicle': False,
   'craving': False,
   'creationism': False,
   'telegram': False,
   'pnly': False,
   'relegated': False,
   'style': True,
   'macho': False,
   'vaporize': False,
   'references': False,
   'inadvertenty': False,
   'documentarians': False,
   'ified': False,
   'offset': False,
   'bluff': False,
   'homies': False,
   'inconveniences': False,
   'penciller': False,
   'complimented': False,
   'wanderlusting': False,
   'tortured': False,
   'poolman': False,
   'buying': False,
   'rugrats': False,
   'kidney': False,
   'gaul': False,
   'executions': False,
   'yugoslavians': False,
   'menacingly': False,
   'disabled': False,
   'aid': False,
   'taut': False,
   'clarke': False,
   'utmost': False,
   'juice': False,
   'vacances': False,
   'raeeyain': False,
   'katharine': False,
   'hints': False,
   'vision': False,
   'entirely': False,
   'captors': False,
   'diverting': True,
   'raffin': False,
   'adequately': False,
   'pagniacci': False,
   'lackadasically': False,
   'siunin': False,
   'lumet': False,
   'paired': False,
   'martin': False,
   'perfects': False,
   'unsheathes': False,
   'occur': False,
   'yourself': False,
   'wrapped': False,
   'terrio': False,
   'forster': False,
   'episode': False,
   'biographical': False,
   'anticlimax': False,
   'staging': False,
   'homely': False,
   'simplicity': False,
   'hangout': False,
   'clawed': False,
   'maids': False,
   'seeing': False,
   'stoners': False,
   'unsalveageably': False,
   'bury': False,
   'uprorously': False,
   'dwell': False,
   'anthony': False,
   '_amadeus_': False,
   'assembled': False,
   'mugs': False,
   'emphatically': False,
   'whitney': False,
   'unwelcomed': False,
   'conservatism': False,
   'overlap': False,
   'quibble': False,
   'triple': False,
   'strongman': False,
   'swat': False,
   'bulletins': False,
   'squandered': False,
   'changer': False,
   'flashbacks': False,
   'freudian': False,
   'undo': False,
   'acquits': False,
   'talmudic': False,
   'breakout': False,
   'cloning': False,
   'guests': False,
   'carts': False,
   'toda': False,
   'tournaments': False,
   'mooch': False,
   'bowed': False,
   'fiesty': False,
   'authorial': False,
   'kellner': False,
   'fellowship': False,
   'bodies': False,
   'cummings': False,
   'sioux': False,
   'transformation': False,
   'subpar': False,
   'bruno': False,
   'related': False,
   'oooh': False,
   'posing': False,
   'inportant': False,
   'redistribution': False,
   'plummer': False,
   'knock': False,
   'tulip': False,
   'hayden': False,
   'evaluation': False,
   'colo': False,
   'feng': False,
   'dads': False,
   'norm': False,
   'kid': False,
   'righting': False,
   'recipe': False,
   'paranoia': False,
   'structured': False,
   'brunet': False,
   'evelyne': False,
   'unconscious': False,
   'gymnastics': False,
   'aggravating': False,
   'robbery': False,
   'vegetables': False,
   'horn': False,
   'blacks': False,
   'hour': False,
   'safecracker': False,
   'annoyed': False,
   'unsettlingly': False,
   '140': False,
   'unbelieveably': False,
   'retalliation': False,
   'miscarrying': False,
   'sprayed': False,
   'vibrant': False,
   'stoddard': False,
   'inherently': False,
   'smoggy': False,
   'reminscent': False,
   'gebrecht': False,
   'ulrich': False,
   'loveliest': False,
   'courtrooms': False,
   'epitaph': False,
   'dampens': False,
   'cloke': False,
   'summarized': False,
   'ying': False,
   'practice': False,
   'ethnicity': False,
   'romeo': True,
   'assortment': False,
   'commit': False,
   'spots': False,
   'marky': False,
   'laugh': False,
   'studying': False,
   'cont': False,
   'shivery': False,
   'dolph': False,
   'tenet': False,
   'animator': False,
   'concentrates': False,
   'arraki': False,
   'chu': False,
   'indoors': False,
   'kube': False,
   'arrives': True,
   'brunette': False,
   'detatched': False,
   'practical': False,
   'touts': False,
   'gracie': False,
   'malt': False,
   'reagan': False,
   'maunau': False,
   'halle': False,
   'astounded': False,
   'inuit': False,
   'imponderable': False,
   'measure': False,
   'exception': False,
   'sensory': False,
   'picnic': False,
   'altogether': False,
   'doubly': False,
   'prophecies': False,
   'strains': False,
   'troubling': False,
   'armageddon': False,
   'couplings': False,
   'advanced': False,
   'santanico': False,
   'motivated': False,
   'ambassador': False,
   'bop': False,
   'improvisation': False,
   'evans': False,
   'carpings': False,
   'regretted': False,
   'argo': False,
   'avenger': False,
   'analogue': False,
   'strangers': False,
   'watercolors': False,
   'brusque': False,
   'steadiocam': False,
   'regretfully': False,
   'nathanson': False,
   'straitjacket': False,
   'patches': False,
   'surprisinly': False,
   'inventively': False,
   'unconvincingly': False,
   'radiation': False,
   'cathedrals': False,
   'loosely': False,
   'gheorghe': False,
   'elie': False,
   'clothesline': False,
   'laborious': False,
   'execs': False,
   'sabian': False,
   'disdain': False,
   'sats': False,
   'biased': False,
   'southern': False,
   'retracing': False,
   'delaney': False,
   'perceptible': False,
   'bedhopping': False,
   'rivalry': False,
   'regurgitated': False,
   'reneges': False,
   'antagonism': False,
   'ravine': False,
   'improvements': False,
   'teasing': False,
   'residing': False,
   'waxing': False,
   'personalized': False,
   '747s': False,
   'blundered': False,
   'cazale': False,
   'bogged': False,
   'gangbangers': False,
   'airduct': False,
   'hollan': False,
   'trashing': False,
   'pervasiveness': False,
   'gleeson': False,
   'spetters': False,
   'beckwith': False,
   'jackhammer': False,
   '1839': False,
   'wine': False,
   'deer': False,
   'tick': False,
   'breasted': False,
   'sesame': False,
   'mambos': False,
   'consumes': False,
   'fashioned': False,
   'erasing': False,
   'heartening': False,
   'leviathan': False,
   'kickstarting': False,
   'complete': False,
   'innovation': False,
   'dauphin': False,
   'goo': False,
   'obeying': False,
   'kodak': False,
   'flustered': False,
   'pounder': False,
   'pacino': False,
   'cookies': False,
   'tennant': False,
   'wane': False,
   'serpent': False,
   'verdell': False,
   'executive': False,
   'jitterish': False,
   'schrieber': False,
   'inning': False,
   'seventy': False,
   'unforgivingly': False,
   'up': False,
   'pigeons': False,
   'lobs': False,
   'symbols': False,
   'possible': False,
   'nearly': True,
   'damning': False,
   'magazines': False,
   'outinen': False,
   'editorial': False,
   'floors': False,
   '26': False,
   'liveliness': False,
   'minors': False,
   'dodges': False,
   'cracking': False,
   'kimball': False,
   'zamm': False,
   'desmond': False,
   'scion': False,
   'interacting': False,
   'contestants': False,
   'deactivating': False,
   'endorsement': False,
   'visor': False,
   'runtime': False,
   'subordinate': False,
   'organized': False,
   'unarmed': False,
   'therapeutic': False,
   'calamitous': False,
   'bride': False,
   'hard': False,
   'panorama': False,
   'crispin': False,
   'accountability': False,
   'reject': False,
   'metoclorian': False,
   'headley': False,
   'nolte': False,
   'basquiat': False,
   'ron': False,
   'ruled': False,
   'lebowski': False,
   'accomplishes': False,
   'applicant': False,
   'giamatti': False,
   'corrects': False,
   'aforementioned': False,
   'toghether': False,
   'ichor': False,
   'kietal': False,
   'riboud': False,
   'retrieval': False,
   'detach': False,
   'rancor': False,
   'mine': False,
   'fantasizing': False,
   'bakula': False,
   'ignored': False,
   'gus': False,
   'janusz': False,
   'duet': False,
   'humiliation': False,
   'product': False,
   'height': False,
   'smothering': False,
   'impatience': False,
   'laforge': False,
   'cincinnati': False,
   'detrimentally': False,
   'finn': False,
   'garbageman': False,
   'screwy': False,
   'manslaughter': False,
   'internalize': False,
   'collapses': False,
   'raises': False,
   'chortling': False,
   'largely': False,
   'forges': False,
   'pitifully': False,
   'grip': False,
   'exercises': False,
   'inscribe': False,
   'walt': False,
   'morosely': False,
   'eliminating': False,
   'perilous': False,
   'refrigerates': False,
   'borgnine': False,
   'lacrosse': False,
   'screamers': False,
   'dumbfounded': False,
   'stockpiling': False,
   'unmenacing': False,
   'texture': False,
   'artistic': False,
   'moonshine': False,
   'fa': False,
   'best': False,
   'brilliant': False,
   'chopin': False,
   'rapist': False,
   'gosselaar': False,
   'tito': False,
   'accuses': False,
   'reynaldo': False,
   'aquarium': False,
   'korsmo': False,
   'alert': False,
   'enhancer': False,
   'saved': False,
   'topping': False,
   'leroi': False,
   'silenced': False,
   'wreaking': False,
   'targets': False,
   'kiddies': False,
   'parisian': False,
   'mei': False,
   'yeller': False,
   ...},
  'neg')]

Training the classifier


In [12]:
training_set = feature_sets[:1900]
testing_set = feature_sets[1900:]

We won't be telling the machine the category i.e. whether the document is a postive one or a negative one. We ask it to tell that to us. Then we compare it to the known category that we have and calculate how accurate it is.

Naive bayes algorithm

It states that

\begin{equation*} posterior = \frac{PriorOccurences \times likelihood}{CurrentEvidence} \end{equation*}

Here posterior is likelihood of occurence


In [20]:
## TO-DO: To build own naive bais algorithm
# classifier = nltk.NaiveBayesClassifier.train(training_set)

## saving the classifier
# save_classifier = open("naive_bayes.pickle", "wb")
# pickle.dump(classifier, save_classifier)
# save_classifier.close()

## Now that the picke is saved we will use that.

In [21]:
## Using the pickle file now 
pickle_classifier = open("naive_bayes.pickle", "rb")
classifier = pickle.load(pickle_classifier)
pickle_classifier.close()

## Testing it's accuracy
print("Naive bayes classifier accuracy percentage : ", (nltk.classify.accuracy(classifier, testing_set))*100)


Naive bayes classifier accuracy percentage :  71.0

In [19]:
classifier.show_most_informative_features(20)


Most Informative Features
                  hatred = True              pos : neg    =     10.1 : 1.0
                 symbols = True              pos : neg    =      7.5 : 1.0
               balancing = True              pos : neg    =      7.5 : 1.0
               laughably = True              neg : pos    =      7.1 : 1.0
                   pixar = True              pos : neg    =      6.9 : 1.0
             fulfillment = True              pos : neg    =      6.2 : 1.0
                 labeled = True              pos : neg    =      6.2 : 1.0
                    jude = True              pos : neg    =      6.2 : 1.0
                 outlook = True              pos : neg    =      6.2 : 1.0
                  symbol = True              pos : neg    =      6.1 : 1.0
               strongest = True              pos : neg    =      6.1 : 1.0
                   jolie = True              neg : pos    =      5.9 : 1.0
                 misfire = True              neg : pos    =      5.8 : 1.0
                  suvari = True              neg : pos    =      5.8 : 1.0
               diverting = True              neg : pos    =      5.8 : 1.0
                     liu = True              neg : pos    =      5.8 : 1.0
          reconciliation = True              neg : pos    =      5.8 : 1.0
                  purple = True              pos : neg    =      5.5 : 1.0
                lebowski = True              pos : neg    =      5.5 : 1.0
                tatooine = True              pos : neg    =      5.5 : 1.0

What the above feature set means is lets take abysmal,

neg : pos = 6.3 : 1.0

means that it appears 6.3 times more in neg reviews than in pos reviews

Saving the trained algorithm using Pickle

We will be saving python objects so that we can quickly load them again.

Importing pickle at the top

We will now use this classifier in the next file to classify documents