In [1]:
import pandas as pd
# Read the labelled training comments; the path is relative to the notebook's working directory.
data = pd.read_csv("data/train.csv")

In [2]:
# Number of rows in the training frame.
data.shape[0]


Out[2]:
3947

In [3]:
# Display the frame; pandas elides the middle rows in the rendered output.
data


Out[3]:
Insult Date Comment
0 1 20120618192155Z "You fuck your dad."
1 0 20120528192215Z "i really don't understand your point.\xa0 It ...
2 0 NaN "A\\xc2\\xa0majority of Canadians can and has ...
3 0 NaN "listen if you dont wanna get married to a man...
4 0 20120619094753Z "C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd...
5 0 20120620171226Z "@SDL OK, but I would hope they'd sign him to ...
6 0 20120503012628Z "Yeah and where are you now?"
7 1 NaN "shut the fuck up. you and the rest of your fa...
8 1 20120502173553Z "Either you are fake or extremely stupid...may...
9 1 20120620160512Z "That you are an idiot who understands neither...
10 0 20120620015140Z "@jdstorm dont wish him injury but it happened...
11 0 20120530044519Z "Be careful,Jimbo.OG has a fork with your name...
12 0 NaN "@tonnyb Or they just don't pay attention "
13 0 NaN "Hmmm. Perhaps some who are too pig-faced to g...
14 0 NaN "@barrettmarson Huh? Her income was $21,912. H...
15 1 20120611090207Z "FOR SOME REASON U SOUND RETARDED. LOL. DAMN. ...
16 1 20120320162532Z "You with the 'racist' screen name\n\nYou are ...
17 0 NaN "Oh! & cheating as well...1966 cup was a cheat...
18 1 20120320075347Z "your such a dickhead..."
19 1 20120320203947Z "Your a retard go post your head up your #%&*"
20 0 20120612051612Z "@EephusBlue\xa0Makes you want to say "Mike Ma...
21 0 20120611234235Z "http://www.youtube.com/watch?v=tLYLLPHKRU4"
22 0 20120502215304Z "And you know they've burned holes in all the ...
23 0 20120503031721Z "you are a land creature. You would drown...."
24 0 NaN "To all those doom-and-gloomers, the chicken-l...
25 0 20120612144331Z "Craig, we have been saying from the start tha...
26 0 20120529025329Z "Strange you are talking about that when there...
27 0 20120612052926Z "But how would you actually get the key out?"
28 0 20120528193950Z "oh cool, some more internet points for your c...
29 0 20120529230121Z "Even though I think there are better coaches ...
... ... ... ...
3917 0 20120619192544Z "No shit!"
3918 0 20120610042233Z "clark just needs to stop pushing and pulling ...
3919 1 20120610154957Z "faggot"
3920 1 20120530140143Z "You really should be like the other Lemmings ...
3921 0 20120609221726Z "by your logic also legalize Heroin."
3922 0 NaN "The G.O.P. can cry all they want but ther the...
3923 0 NaN "@mikieboy - In a nutshell :-)"
3924 1 20120530010840Z "@Crissa:disqus LaRaza (The Race), NBP. Nation...
3925 0 NaN "Hey, if the Republicans are determined to sta...
3926 0 20120527153826Z "Nah. Good folks there, good music POURS from ...
3927 1 20120619003831Z "Hey nutbag..youre a traitor....\n\nObama has ...
3928 1 20120610125954Z "You're avatar is fail. Loser."
3929 1 20120619022522Z "Why don't you shut your stupid mouth, pennyla...
3930 0 20120529160326Z "We didn't even know who the rebels really are...
3931 1 20120618222326Z "You goofy Repub thugs don't have a clue, not ...
3932 0 20120527154034Z "socaljim:<div><br></div><div>Nope, Kelly shou...
3933 0 NaN "@Ritual Reality No I am simply saying that ba...
3934 1 20120610083604Z "True, maybe he's deliberately being an assh*l...
3935 1 20120515160649Z "JoelWeltman, you look like a pedophile from y...
3936 0 20120529001033Z "your a little high on the points but I do thi...
3937 0 20120619145323Z "Your Yellowstone Fly Fishing Report:\n\n.. Th...
3938 0 20120612001129Z "MrO,\n\nProof is shown by liberals not wantin...
3939 0 20120619205630Z "The only ignorant person here is you, who thi...
3940 0 20120610114639Z "oh i had many cars like this before."
3941 0 NaN "@Sara Besleaga Griji, doruri sau dorin\\xc8\\...
3942 1 20120502172717Z "you are both morons and that is never happening"
3943 0 20120528164814Z "Many toolbars include spell check, like Yahoo...
3944 0 20120620142813Z "@LambeauOrWrigley\xa0\xa0@K.Moss\xa0\nSioux F...
3945 0 20120528205648Z "How about Felix? He is sure turning into one ...
3946 0 20120515200734Z "You're all upset, defending this hipster band...

3947 rows × 3 columns


In [4]:
import numpy as np
# Target vector: the Insult column as a NumPy array.
y_train = np.array(data["Insult"])

In [5]:
# Show the label array (NumPy abbreviates long arrays with "...").
y_train


Out[5]:
array([1, 0, 0, ..., 0, 0, 0])

In [6]:
# Raw comment strings as a plain Python list, in row order.
text_train = data["Comment"].tolist()

In [7]:
# Spot-check one raw comment; index 6 is reused further down to inspect its encoded row.
text_train[6]


Out[7]:
'"Yeah and where are you now?"'

In [8]:
# Read the test set; the next cell takes both Comment and Insult from it, so labels are included.
data_test = pd.read_csv("data/test_with_solutions.csv")

In [9]:
# Split the test frame into raw comment strings and the label array.
text_test = data_test["Comment"].tolist()
y_test = np.array(data_test["Insult"])

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# Learn the bag-of-words vocabulary from the training comments only, using default settings.
# NOTE(review): the vocabulary listed further down contains tokens such as u'xa0first' and
# u'nwhen', which look like remnants of escaped characters in the raw text -- consider
# unescaping the comments before vectorising. TODO confirm against the raw CSV.
cv = CountVectorizer()
cv.fit(text_train)


Out[11]:
CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [12]:
# Number of entries in the fitted vocabulary mapping.
len(cv.vocabulary_)


Out[12]:
16469

In [14]:
# List the vocabulary terms.
# NOTE(review): this produces a very long saved output; showing only a small sample
# would keep the notebook readable.
cv.vocabulary_.keys()


Out[14]:
[u'raining',
 u'hordes',
 u'foul',
 u'four',
 u'prices',
 u'woods',
 u'hanging',
 u'comically',
 u'disobeying',
 u'canes',
 u'hermano',
 u'immature',
 u'meadows',
 u'xa0first',
 u'callin',
 u'shearnut',
 u'doppeltrudy',
 u'u0303a',
 u'xe1ti',
 u'broward',
 u'63e5082f',
 u'screaming',
 u'jrpg',
 u'commenter',
 u'wednesday',
 u'jessup',
 u'asami',
 u'stereotypical',
 u'commented',
 u'specially',
 u'consenting',
 u'melvin',
 u'loathing',
 u'xf6r',
 u'errors',
 u'xa0still',
 u'thunder',
 u'cooking',
 u'allowscriptaccess',
 u'bonuses',
 u'xf6g',
 u'xchg',
 u'misandry',
 u'admiral',
 u'widget',
 u'espumita',
 u'sobs',
 u'brainwashed',
 u'lgbt',
 u'china',
 u'punks',
 u'cult',
 u'sociopaths',
 u'cyclical',
 u'kids',
 u'elaborate',
 u'eastenders',
 u'controversy',
 u'onranker',
 u'oppars',
 u'criticism',
 u'golden',
 u'nwhen',
 u'criticise',
 u'hamas',
 u'nwrongdoing',
 u'hahahahaa',
 u'cocaine',
 u'xa0funny',
 u'opponents',
 u'dna',
 u'dnc',
 u'dnt',
 u'xa3220',
 u'music',
 u'therefore',
 u'mirandized',
 u'strike',
 u'sermons',
 u'females',
 u'blabbermouth',
 u'yahoo',
 u'circumstances',
 u'morally',
 u'locked',
 u'pursue',
 u'plunged',
 u'rulings',
 u'locker',
 u'exhale',
 u'revenues',
 u'example',
 u'pints',
 u'unjust',
 u'disparagement',
 u'titanium',
 u'want',
 u'counseling',
 u'cocksucker',
 u'absolute',
 u'smlombardi',
 u'duniya',
 u'travel',
 u'feature',
 u'noynoy',
 u'fanaticism',
 u'adoptions',
 u'wrong',
 u'henrik',
 u'types',
 u'sickening',
 u'cigarettes',
 u'18th',
 u'ho',
 u'communistic',
 u'nonsensical',
 u'welcomed',
 u'rewarded',
 u'fit',
 u'bringing',
 u'fix',
 u'occupations',
 u'fuddruckers',
 u'fin',
 u'easier',
 u'thrones',
 u'schooll',
 u'nlong',
 u'slate',
 u'effects',
 u'schools',
 u'yourself',
 u'eisenhower',
 u'whitman',
 u'castigating',
 u'barton',
 u'timeout',
 u'zinya',
 u'series',
 u'allah',
 u'allan',
 u'parasites',
 u'dillusional',
 u'397',
 u'398',
 u'message',
 u'rt',
 u'rw',
 u'rr',
 u'rs',
 u'rx',
 u'ry',
 u'mason',
 u're',
 u'rf',
 u'ra',
 u'rb',
 u'unitas',
 u'rn',
 u'salmonfly',
 u'rk',
 u'foundation',
 u'stamping',
 u'schimidt',
 u'personal',
 u'threatened',
 u'u015bmy',
 u'estimate',
 u'obstructing',
 u'lesbo',
 u'xa0everytime',
 u'enormous',
 u'r1',
 u'chiara',
 u'diabetic',
 u'panetta',
 u'shipped',
 u'disturbed',
 u'speedy',
 u'repealed',
 u'ngame',
 u'speeds',
 u'purpose',
 u'xa0yea',
 u'xa0forcefully',
 u'tantos',
 u'loool',
 u'disfigured',
 u'channels',
 u'xa0yep',
 u'xa0yes',
 u'carson',
 u'olds',
 u'basketball',
 u'nigh',
 u'service',
 u'forrester',
 u'lawsuits',
 u'needed',
 u'master',
 u'hypothetical',
 u'critter',
 u'genesis',
 u'xa0made',
 u'expunged',
 u'nirvana',
 u'mutilated',
 u'crawl',
 u'positively',
 u'showed',
 u'tree',
 u'rusty',
 u'feelins',
 u'project',
 u'sheen',
 u'nprove',
 u'feeling',
 u'libtards',
 u'willingness',
 u'longs',
 u'hetrosexual',
 u'atvcar',
 u'urinate',
 u'dozen',
 u'affairs',
 u'wholesome',
 u'bleachers',
 u'beltway',
 u'doors',
 u'ufeff4',
 u'grips',
 u'taxation',
 u'concedes',
 u'thrilled',
 u'mouth',
 u'addict',
 u'eftersom',
 u'annars',
 u'singer',
 u'camp',
 u'nhmmmm',
 u'tech',
 u'detriment',
 u'mating',
 u'scream',
 u'came',
 u'saying',
 u'screennames',
 u'meetings',
 u'teresa',
 u'lethal',
 u'tempted',
 u'cheaply',
 u'lessons',
 u'orleans',
 u'pitful',
 u'ncountry',
 u'underpants',
 u'30337',
 u'rick',
 u'rich',
 u'wallabies',
 u'rice',
 u'badal',
 u'plate',
 u'salespeople',
 u'partisapating',
 u'foremost',
 u'spirit',
 u'altogether',
 u'beacoup',
 u'relish',
 u'societies',
 u'droning',
 u'snotzi',
 u'autistic',
 u'nicely',
 u'boarder',
 u'shape',
 u'comprar',
 u'openly',
 u'isreal',
 u'saul',
 u'binominal',
 u'xa0really',
 u'resigned',
 u'manson',
 u'lots',
 u'irs',
 u'freeze',
 u'solace',
 u'humankind',
 u'nturn',
 u'skimpy',
 u'letting',
 u'anology',
 u'nationwide',
 u'nature',
 u'smelled',
 u'extent',
 u'debt',
 u'veer',
 u'country',
 u'heating',
 u'planned',
 u'lookin',
 u'logic',
 u'login',
 u'argue',
 u'thurley',
 u'assus',
 u'nsome',
 u'gopher',
 u'smithereens',
 u'gleaming',
 u'sirasa',
 u'pregnancy',
 u'blonde',
 u'smackdown',
 u'union',
 u'fri',
 u'muck',
 u'much',
 u'favoratism',
 u'stadium',
 u'privilege',
 u'deportations',
 u'dots',
 u'obese',
 u'graphical',
 u'spit',
 u'sual',
 u'worker',
 u'sycophants',
 u'southerners',
 u'doubts',
 u'worked',
 u'haters',
 u'spin',
 u'cockneywideboy',
 u'droppin',
 u'ridiculously',
 u'075',
 u'tzu',
 u'juegan',
 u'jenkums',
 u'ditching',
 u'kohn',
 u'cersei',
 u'0ld',
 u'memorial',
 u'hona',
 u'doofus',
 u'spews',
 u'split',
 u'xxoo',
 u'xe5gar',
 u'inadvertently',
 u'workforce',
 u'consents',
 u'boiler',
 u'xe5gan',
 u'jacc',
 u'academic',
 u'uncoololga',
 u'xa0hurt',
 u'corporate',
 u'diarrhea',
 u'plaque',
 u'bellow',
 u'northolt',
 u'letterman',
 u'sleeps',
 u'xa0real',
 u'xa0read',
 u'previous',
 u'hai',
 u'ham',
 u'han',
 u'haa',
 u'had',
 u'advancement',
 u'hay',
 u'sulked',
 u'har',
 u'has',
 u'hat',
 u'xa0fildelning',
 u'elevation',
 u'elders',
 u'shadow',
 u'replace',
 u'chusi',
 u'desire',
 u'psychological',
 u'stooopid',
 u'deviance',
 u'alice',
 u'attorney',
 u'creek',
 u'crowd',
 u'ballooned',
 u'crown',
 u'xa0about',
 u'billboard',
 u'harassment',
 u'fuuuuck',
 u'creep',
 u'enemies',
 u'botton',
 u'bottom',
 u'polluted',
 u'contributing',
 u'unit',
 u'condit',
 u'shakes',
 u'defensive',
 u'losing',
 u'officiating',
 u'stokes',
 u'morteza',
 u'ssho',
 u'stoked',
 u'151505',
 u'lemmings',
 u'raised',
 u'sob',
 u'facility',
 u'soi',
 u'transexuality',
 u'som',
 u'soo',
 u'son',
 u'thankful',
 u'beings',
 u'raiser',
 u'raises',
 u'sow',
 u'soy',
 u'sox',
 u'shoots',
 u'despised',
 u'waits',
 u'support',
 u'constantly',
 u'raped',
 u'greatness',
 u'rapes',
 u'stebbers',
 u'lunduke',
 u'abacha',
 u'beech',
 u'congratulations',
 u'inside',
 u'devices',
 u'listenening',
 u'azz',
 u'smashed',
 u'dues',
 u'passenger',
 u'disgrace',
 u'159',
 u'textbook',
 u'chrisitans',
 u'mrs',
 u'scritto',
 u'annagillmodel',
 u'role',
 u'jenkins',
 u'transgender',
 u'xa0dominated',
 u'roll',
 u'intend',
 u'palms',
 u'teats',
 u'center',
 u'moyles',
 u'bient',
 u'intent',
 u'smelling',
 u'transporter',
 u'15k',
 u'shootout',
 u'98',
 u'overturned',
 u'chain',
 u'whoever',
 u'hangovers',
 u'figlia',
 u'corpo',
 u'muppet',
 u'ose',
 u'midst',
 u'nenvy',
 u'bribes',
 u'obsessively',
 u'oversight',
 u'bogus',
 u'sensiblemoms',
 u'downloading',
 u'ntheir',
 u'xa0gay',
 u'choice',
 u'stays',
 u'rucker',
 u'exact',
 u'minute',
 u'cooks',
 u'judaism',
 u'amoral',
 u'leave',
 u'coincidences',
 u'cooke',
 u'loads',
 u'joleon',
 u'unre',
 u'righties',
 u'trails',
 u'sign',
 u'wetshoes',
 u'xa0your',
 u'thadici',
 u'celebrated',
 u'melt',
 u'baggins',
 u'melo',
 u'crashing',
 u'boost',
 u'xe4ndringarna',
 u'drafted',
 u'jury',
 u'honour',
 u'understanding',
 u'jurk',
 u'address',
 u'passengers',
 u'plunges',
 u'redemption',
 u'brilliant',
 u'impacted',
 u'funded',
 u'accomplished',
 u'ineffective',
 u'u0103',
 u'myers',
 u'u0105',
 u'umps',
 u'logical',
 u'raking',
 u'fake',
 u'blck',
 u'opposes',
 u'u0301ch',
 u'working',
 u'angry',
 u'gulags',
 u'sisterly',
 u'opposed',
 u'organizaiton',
 u'snifters',
 u'halladay',
 u'compitition',
 u'prosecutor',
 u'wicked',
 u'wvu',
 u'consoles',
 u'ignorant',
 u'riders',
 u'sabes',
 u'originally',
 u'pretend',
 u'abortion',
 u'following',
 u'nthroughout',
 u'munroe',
 u'mirrors',
 u'awesome',
 u'pawan',
 u'incremental',
 u'lyoness',
 u'allowed',
 u'stole',
 u'listens',
 u'buttered',
 u'vjmichelle',
 u'npittsburgh',
 u'cbi',
 u'echo1',
 u'improving',
 u'revealed',
 u'egotistical',
 u'xe4rsta',
 u'obamaheads',
 u'natural',
 u'ss',
 u'dickwrinkles',
 u'sp',
 u'june',
 u'sv',
 u'su',
 u'st',
 u'hon',
 u'si',
 u'sh',
 u'so',
 u'nsecond',
 u'sl',
 u'sc',
 u'sb',
 u'sa',
 u'misplaced',
 u'sg',
 u'manga',
 u'se',
 u'nwhat',
 u'jungwirth',
 u'years',
 u'xa0this',
 u'episodes',
 u'razors',
 u'nsimplement',
 u'tendency',
 u'grammy',
 u'limbs',
 u'jib',
 u'toro',
 u'jim',
 u'louisville',
 u'aviv',
 u'blessedassurance',
 u'suspicion',
 u'troubled',
 u'tory',
 u'nshut',
 u'recipients',
 u'nation',
 u'secularized',
 u'nabby',
 u'delibratly',
 u'didn',
 u'dilfer',
 u'establishing',
 u'amnesty',
 u'quarter',
 u'ramdin',
 u'u201cdo',
 u'aldickweeds',
 u'crushing',
 u'jskop',
 u'sponsor',
 u'entering',
 u'beetle',
 u'pr1ck',
 u'disasters',
 u'troll',
 u'jsdavis13',
 u'commissar',
 u'abide',
 u'seriously',
 u'investigation',
 u'trauma',
 u'internet',
 u'karimazweena',
 u'ovonic',
 u'occupukes',
 u'sidneycatsby',
 u'settimana',
 u'samajh',
 u'disrespect',
 u'crazies',
 u'grandma',
 u'picturesenergy',
 u'pentru',
 u'modest',
 u'gordon',
 u'mudslimes',
 u'neglect',
 u'emotion',
 u'8500',
 u'njudging',
 u'saving',
 u'nduring',
 u'spoken',
 u'ona',
 u'one',
 u'xa0disturbed',
 u'mignon',
 u'tamara',
 u'open',
 u'city',
 u'bith',
 u'bite',
 u'l4c',
 u'stuffed',
 u'stephanopoulos',
 u'bits',
 u'cite',
 u'lingering',
 u'axlrodes',
 u'progs',
 u'kinkier',
 u'gratuities',
 u'forward',
 u'fooled',
 u'surged',
 u'bored',
 u'iceberg',
 u'nkenna',
 u'counters',
 u'russia',
 u'addressing',
 u'turned',
 u'argument',
 u'alley',
 u'allen',
 u'turner',
 u'dismissive',
 u'politicos',
 u'nappeal',
 u'besmirch',
 u'warriors',
 u'instructed',
 u'omfg',
 u'concern',
 u'pimple',
 u'xa0continuous',
 u'sanctimoniously',
 u'imperatives',
 u'opposite',
 u'u1ec9nh',
 u'spewing',
 u'suspects',
 u'buffet',
 u'nwhhhhhhhhaaaaaaaaaaaaaa',
 u'printed',
 u'inserted',
 u'touchy',
 u'average',
 u'phil',
 u'drive',
 u'usjobswidgetsterms',
 u'touche',
 u'redirected',
 u'laws',
 u'vart',
 u'merit',
 u'sdl',
 u'bright',
 u'inconsistent',
 u'scarce',
 u'aggressive',
 u'imagined',
 u'xb3n',
 u'actio',
 u'outrage',
 u'assistant',
 u'balconies',
 u'paizon',
 u'platte',
 u'pimp',
 u'worried',
 u'priest',
 u'worries',
 u'vision',
 u'notice',
 u'andignorant',
 u'anyways',
 u'heifers',
 u'impressions',
 u'adamomars',
 u'youself',
 u'831',
 u'sites',
 u'moldy',
 u'refreshed',
 u'impregnated',
 u'screen',
 u'concentrate',
 u'awards',
 u'nbtw',
 u'menacing',
 u'progessive',
 u'puerile',
 u'manu',
 u'many',
 u'emitowany',
 u'millionaire',
 u'workplace',
 u'mang',
 u'semitic',
 u'ialists',
 u'yearly',
 u'dreqe',
 u'3000',
 u'xa0showing',
 u'caring',
 u'west',
 u'deuteronomy',
 u'pixels',
 u'demokratisynpunkt',
 u'motives',
 u'prototype',
 u'wants',
 u'formed',
 u'shsh',
 u'photos',
 u'arrnott',
 u'former',
 u'32',
 u'slutbag',
 u'xa0spec',
 u'consultant',
 u'tribal',
 u'polls',
 u'straighten',
 u'hayes',
 u'trabert',
 u'policies',
 u'newspaper',
 u'situation',
 u'sikhs',
 u'ive',
 u'pinko',
 u'engaged',
 u'pinky',
 u'ikathyv',
 u'technology',
 u'verified',
 u'immense',
 u'denying',
 u'famousoriginallucas',
 u'otto',
 u'fuckkkkkkkkk',
 u'homophobic',
 u'npeggy',
 u'barbed',
 u'singapore',
 u'defy',
 u'wired',
 u'ntake',
 u'anelka',
 u'midpost',
 u'horsepiss',
 u'steamed',
 u'being',
 u'rest',
 u'ngos',
 u'mumbojumbo',
 u'ngod',
 u'xa0lmfao',
 u'grounded',
 u'excuses',
 u'nadmittedly',
 u'inconjunction',
 u'dicks',
 u'motres',
 u'starving',
 u'crusty',
 u'around',
 u'hve',
 u'sums',
 u'regis',
 u'dark',
 u'traffic',
 u'darn',
 u'vacuum',
 u'world',
 u'postal',
 u'dare',
 u'intel',
 u'xa0no',
 u'preference',
 u'gimme',
 u'avram',
 u'inter',
 u'what',
 u'lobster',
 u'divine',
 u'scholarships',
 u'accidental',
 u'haryana',
 u'cavity',
 u'memories',
 u'911',
 u'912',
 u'biased',
 u'chavman',
 u'leadership',
 u'biases',
 u'srqdawgs15',
 u'thailand',
 u'punishments',
 u'hinata',
 u'nmsnbc',
 u'mouthfull',
 u'vieme',
 u'cowhide',
 u'xa0decide',
 u'apologize',
 u'het',
 u'jerks',
 u'her',
 u'hes',
 u'missa',
 u'hey',
 u'hed',
 u'hee',
 u'mindless',
 u'sealed',
 u'verbatim',
 u'wits',
 u'kotor',
 u'institutions',
 u'foreheads',
 u'survived',
 u'nromney',
 u'with',
 u'handsome',
 u'abused',
 u'pull',
 u'rush',
 u'assh',
 u'rage',
 u'tripe',
 u'obviouse',
 u'dirty',
 u'abuses',
 u'politiska',
 u'asss',
 u'laird',
 u'u011bj',
 u'citizenship',
 u'johnny',
 u'watches',
 u'watched',
 u'admiring',
 u'cream',
 u'mistrzem',
 u'72913781',
 u'guildwars2',
 u'tight',
 u'tornata',
 u'friggin',
 u'puppy',
 u'arshavin',
 u'ignoramus',
 u'beaks',
 u'r_dale',
 u'midget',
 u'brotherhood',
 u'lilttle',
 u'2009',
 u'ncorporations',
 u'nastasha',
 u'tricks',
 u'mask',
 u'stroller',
 u'mast',
 u'mass',
 u'adam',
 u'dyer',
 u'legislatures',
 u'from1973',
 u'kekayaan',
 u'caused',
 u'xa0will',
 u'scc',
 u'acknowledging',
 u'welfare',
 u'reson',
 u'jason',
 u'u0303ng',
 u'causes',
 u'evicted',
 u'xa0responded',
 u'tx',
 u'tv',
 u'baddest',
 u'tu',
 u'tr',
 u'encourage',
 u'tp',
 u'n0thing',
 u'to',
 u'tail',
 u'chewing',
 u'th',
 u'ti',
 u'td',
 u'te',
 u'ta',
 u'hotest',
 u'returned',
 u'thefandangler',
 u'detention',
 u'elena',
 u'cable',
 u'pbr',
 u'hgis',
 u'joined',
 ...]

In [15]:
# Encode the training comments with the fitted vectorizer (result is a sparse count matrix).
X_train = cv.transform(text_train)

In [16]:
# Show the matrix summary: shape, dtype and number of stored entries.
X_train


Out[16]:
<3947x16469 sparse matrix of type '<type 'numpy.int64'>'
	with 100269 stored elements in Compressed Sparse Row format>

In [17]:
# Re-display comment 6 so it can be compared with its encoded row in the next cell.
text_train[6]


Out[17]:
'"Yeah and where are you now?"'

In [18]:
# Column indices of the non-zero entries in row 6, i.e. the vocabulary ids present in that comment.
X_train[6, :].nonzero()[1]


Out[18]:
array([  806,   983,  9576, 15388, 16367, 16397], dtype=int32)

In [19]:
# Encode the test comments with the vocabulary already fitted on the training data (no refit).
X_test = cv.transform(text_test)

In [20]:
from sklearn.svm import LinearSVC
# Linear SVM classifier; C=0.01 is the starting value here and C is tuned by grid search later on.
svm = LinearSVC(C=.01)

In [21]:
# Train the classifier on the encoded training matrix and its labels.
svm.fit(X_train, y_train)


Out[21]:
LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [22]:
# Score on the data the model was trained on (an optimistic reference point).
svm.score(X_train, y_train)


Out[22]:
0.88421586014694709

In [23]:
# Score on the held-out test matrix.
svm.score(X_test, y_test)


Out[23]:
0.83679637325273892

In [24]:
# Rank the learned weights once, then keep the indices of the 25 smallest and
# the 25 largest (negative block first, both in ascending weight order).
coef = svm.coef_.ravel()
weight_order = np.argsort(coef)
negative_coefficients = weight_order[:25]
positive_coefficients = weight_order[-25:]
interesting_coefficients = np.concatenate([negative_coefficients, positive_coefficients])

In [25]:
%matplotlib inline
import matplotlib.pyplot as plt

# Bar chart of the selected SVM weights, each labelled with its vocabulary term.
selected_weights = coef[interesting_coefficients]
# Derive the positions from the selection itself instead of hard-coding 50.
bar_positions = np.arange(len(interesting_coefficients))
bar_colors = ["red" if w < 0 else "blue" for w in selected_weights]

plt.figure(figsize=(15, 5))
# Pass align="center" explicitly: the default bar alignment is not the same in
# every matplotlib release, and the ticks below must sit on the bars themselves.
plt.bar(bar_positions, selected_weights, color=bar_colors, align="center")
feature_names = np.array(cv.get_feature_names())
# Ticks share the bar positions, so label i always belongs to bar i; the
# trailing semicolon suppresses the repr of the returned tick objects.
plt.xticks(bar_positions, feature_names[interesting_coefficients], rotation=60, ha="right");



In [27]:
from sklearn.pipeline import Pipeline

In [28]:
# Chain the vectorizer and the classifier so raw text can be passed straight through.
pipeline = Pipeline(steps=[
    ('vectorizer', cv),
    ('classifier', svm),
])

In [29]:
# Fit both steps end-to-end, starting from the raw comment strings.
pipeline.fit(text_train, y_train)


Out[29]:
Pipeline(steps=[('vectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
    ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [30]:
# Score the pipeline on raw test comments; the recorded value equals the manual two-step result.
pipeline.score(text_test, y_test)


Out[30]:
0.83679637325273892

In [31]:
from sklearn.grid_search import GridSearchCV

In [32]:
# Search six values of the classifier's C, one per decade from 1e-3 to 1e2.
param_grid = {
    'classifier__C': 10. ** np.arange(-3, 3),
}
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid)

In [33]:
# Run the cross-validated search over C on the raw training comments.
grid_search.fit(text_train, y_train)


Out[33]:
GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
    ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'classifier__C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [34]:
# Score the search object (refit=True in the repr above) on the held-out test comments.
grid_search.score(text_test, y_test)


Out[34]:
0.84132980732905172

In [ ]:
# Widen the search: the same C values crossed with several n-gram ranges for
# the vectorizer, run with three parallel jobs.
param_grid = {
    'classifier__C': 10. ** np.arange(-3, 3),
    "vectorizer__ngram_range": [(1, 1), (1, 2), (1, 3), (2, 3), (2, 2)],
}
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, n_jobs=3)

In [ ]:
# Run the wider search (C crossed with n-gram range) on the raw training comments.
grid_search.fit(text_train, y_train)

In [ ]:
# Parameter combination selected by the search.
grid_search.best_params_

In [ ]:
# Cross-validation score of the selected combination (not a test-set score).
grid_search.best_score_