In [16]:
import pandas as pd;
import numpy as np;
import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
master = pd.read_csv('all_tweets_df.csv')
unigram_features = pd.read_csv('top_1000_unigram_features.csv')

Working subset (due to RAM constraints)


In [18]:
subset = master[master['type']=='sarcastic'][:8000].append(master[master['type']=='genuine'][:8000])
# test_subset = master[master['type']=='sarcastic'][6000:8000].append(master[master['type']=='genuine'][6000:8000])

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction import DictVectorizer

In [114]:
count_vec = CountVectorizer(max_features=5000)
vector = count_vec.fit_transform(subset['0'])
# vector = DictVectorizer().fit_transform(master['0'])
varr = vector.toarray()

In [117]:
# Unigrams
count_vec.get_feature_names()


Out[117]:
['00',
 '000',
 '08',
 '09daytona',
 '10',
 '100',
 '100happydays',
 '10pm',
 '11',
 '110',
 '12',
 '13',
 '13th',
 '14',
 '15',
 '16',
 '17',
 '18',
 '1st',
 '20',
 '200',
 '2014',
 '2015',
 '2022',
 '21',
 '210',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '29',
 '2am',
 '2biblez',
 '2nd',
 '2turnt',
 '30',
 '30am',
 '35',
 '3500',
 '38',
 '39',
 '3dwhitestrips',
 '3rd',
 '40',
 '4am',
 '4rifgunawan',
 '4th',
 '50',
 '5am',
 '5th',
 '60',
 '65',
 '66',
 '77',
 '7am',
 '80',
 '80s',
 '81',
 '82',
 '90',
 '95',
 '99',
 '9th',
 '____',
 '_megancfc',
 '_ramishaa',
 'a5h0ka',
 'aaron_rs',
 'abc',
 'abiding',
 'ability',
 'able',
 'about',
 'aboutasmuchuseastitsonafish',
 'above',
 'absolute',
 'absolutely',
 'abt',
 'abu',
 'abusive',
 'acapella',
 'accepting',
 'access',
 'accident',
 'accommodating',
 'account',
 'accounting',
 'achieved',
 'act',
 'acting',
 'action',
 'activity',
 'actors',
 'actual',
 'actually',
 'ad',
 'add',
 'added',
 'addiction',
 'adding',
 'adrian',
 'adult',
 'advance',
 'advice',
 'aegonchampionships',
 'af',
 'afford',
 'afraid',
 'africa',
 'after',
 'afternoon',
 'again',
 'against',
 'age',
 'agent',
 'agian',
 'ago',
 'agreed',
 'ah',
 'aha',
 'ahead',
 'ahh',
 'aid',
 'ain',
 'aint',
 'air',
 'airlines',
 'airplanes',
 'airport',
 'airportboredom',
 'airtelng',
 'aka',
 'al',
 'alabama',
 'alan',
 'alarm',
 'album',
 'alcohol',
 'alert',
 'alexanderloete',
 'alexilalas',
 'alive',
 'all',
 'allies',
 'allocated',
 'allowed',
 'almost',
 'alone',
 'along',
 'already',
 'alright',
 'also',
 'alsohashtag',
 'although',
 'always',
 'am',
 'amazing',
 'amazon',
 'ameria',
 'america',
 'american',
 'americanair',
 'americans',
 'amerika_blog',
 'amirite',
 'amount',
 'amp',
 'amr',
 'an',
 'analysis',
 'anchor',
 'ancook26fans',
 'and',
 'andy',
 'angel',
 'angels',
 'animals',
 'ankle',
 'anniversary',
 'announced',
 'announcers',
 'annoy',
 'annoyed',
 'annoying',
 'anon',
 'anonymous',
 'another',
 'answer',
 'answers',
 'anthem',
 'anthony',
 'antlucas_',
 'antonio',
 'anxiety',
 'anxiousmuslimah',
 'any',
 'anybody',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'ap',
 'apart',
 'apartment',
 'app',
 'apparently',
 'appear',
 'appeared',
 'apple',
 'apply',
 'appointment',
 'appreciate',
 'appreciated',
 'april',
 'arab',
 'arabcrunch',
 'are',
 'aren',
 'arequipa',
 'arg',
 'argbih',
 'argentina',
 'argentinabosnia',
 'argentinian',
 'argentinians',
 'argue',
 'argument',
 'arguments',
 'argvsbih',
 'arizonar12',
 'arm',
 'arms',
 'around',
 'arrive',
 'arrogant',
 'arsenal',
 'art',
 'article',
 'articles',
 'artistic',
 'arts',
 'as',
 'asalways',
 'asap',
 'ascertain',
 'asian',
 'asianeyes',
 'ask',
 'asked',
 'asking',
 'asks',
 'asleep',
 'aspire',
 'ass',
 'assadcrimes',
 'asshole',
 'assholes',
 'assures',
 'aswel',
 'at',
 'ate',
 'atheist',
 'athletes',
 'athletic',
 'athletics',
 'atl',
 'atlantajay1',
 'atleast',
 'atm',
 'attack',
 'attempt',
 'attention',
 'attitude',
 'attracting',
 'attractive',
 'aunt',
 'auspol',
 'australia',
 'authorization',
 'authorized',
 'auto',
 'available',
 'ave',
 'average',
 'avoid',
 'avrillavigne',
 'aw',
 'awake',
 'award',
 'aware',
 'away',
 'awe',
 'awesome',
 'awesomeness',
 'awful',
 'awkward',
 'aye',
 'aye_alize',
 'b9ace',
 'babe',
 'babies',
 'baby',
 'bacardi',
 'back',
 'backatone000',
 'backedbyresearch',
 'bad',
 'badass',
 'badinfiuence',
 'badly',
 'badmlbcontract',
 'bae',
 'bag',
 'baker',
 'bakershorty1022',
 'bakr',
 'balanced',
 'ball',
 'balls',
 'ban',
 'band',
 'bands',
 'bandwagon',
 'bang',
 'banter',
 'bar',
 'barackobama',
 'barely',
 'barn',
 'barnesffc',
 'barrel',
 'barsandmelody',
 'base',
 'baseball',
 'based',
 'basedmichael',
 'bases',
 'basic',
 'basically',
 'basilfawlty',
 'basketball',
 'bass',
 'bat',
 'bathroom',
 'batteries',
 'battery',
 'batting',
 'battle',
 'bay',
 'bayareasoxfan',
 'bbbots',
 'bbc',
 'bbcsport',
 'bbcworldcup',
 'bbmark',
 'bbpowertrip',
 'bbq',
 'bbtamara',
 'bbuk',
 'bc',
 'bced',
 'bcgovnews',
 'bcwestmind',
 'bday',
 'be',
 'beach',
 'beagrie',
 'bear',
 'beardy',
 'beasley',
 'beast',
 'beasts',
 'beat',
 'beats',
 'beautician',
 'beautiful',
 'beautifulstory',
 'beauty',
 'because',
 'becauseitsthecup',
 'beckhams',
 'become',
 'bed',
 'bedard',
 'beef',
 'been',
 'beens',
 'beer',
 'beerisbetter',
 'before',
 'begin',
 'begun',
 'behind',
 'being',
 'beings',
 'believe',
 'believed',
 'belive',
 'bellissimo',
 'belong',
 'belongs',
 'belt',
 'belter',
 'bench',
 'beneficial',
 'berbasaunt10',
 'berry',
 'besides',
 'best',
 'besties',
 'bet',
 'betances',
 'betfair',
 'betfairhelpdesk',
 'better',
 'betty_ninja',
 'between',
 'beyonc',
 'beyond',
 'beyourself',
 'bf',
 'bff',
 'bhill4three',
 'biased',
 'bible',
 'bidenshairplugs',
 'big',
 'bigboobprobs',
 'biggar',
 'bigger',
 'biggest',
 'bigots',
 'bih',
 'bike',
 'bill',
 'bills',
 'billy',
 'binanggg',
 'bing',
 'bio',
 'birth',
 'birthday',
 'bit',
 'bit99little',
 'bitch',
 'bitch_im_boss1',
 'bitches',
 'bitcoin',
 'bite',
 'bizz_marky',
 'bjp',
 'bjupton2',
 'bk',
 'black',
 'blackdye',
 'blacklagoon',
 'blair',
 'blame',
 'blaming',
 'blasphemy',
 'blast',
 'blazeit',
 'blazing',
 'bleach',
 'bleeding',
 'bless',
 'blessed',
 'blew',
 'blind',
 'block',
 'blocked',
 'blocking',
 'blog',
 'blonde',
 'blondehails',
 'blood',
 'bloody',
 'blow',
 'blown',
 'blue',
 'bluejays',
 'blumo0n',
 'bmw',
 'board',
 'boarding',
 'boardwalk',
 'boat',
 'bob',
 'bochy',
 'body',
 'bomb',
 'bond',
 'bonding',
 'boneralltimelow',
 'bonjovi',
 'bonnaroo',
 'bonucci',
 'boo',
 'boobs',
 'boogbloss',
 'book',
 'bookbuzzr',
 'boom',
 'booty',
 'bootyhatt',
 'booze',
 'bordercontrol',
 'bored',
 'boredom',
 'boreoff',
 'boring',
 'boringmilner',
 'born',
 'bos',
 'bosh',
 'bosnia',
 'bosniaherzegovina',
 'bosnian',
 'boss',
 'bossicelny',
 'boston',
 'bostonglobe',
 'both',
 'bottle',
 'bought',
 'bound',
 'bourbon',
 'bout',
 'bow',
 'bowl',
 'bowling',
 'box',
 'boy',
 'boyfriend',
 'boys',
 'bpunion',
 'brady',
 'brain',
 'brakecheck',
 'brand',
 'bras',
 'brassmonkey1066',
 'brave',
 'braves',
 'brazil',
 'brazilian',
 'brazilians',
 'break',
 'breakfast',
 'breaking',
 'brett',
 'brewers',
 'brewery',
 'bridge',
 'brien',
 'bright',
 'brightest',
 'briii_zeee',
 'brilliance',
 'brilliant',
 'brilliantday',
 'bring',
 'bringbackourboys',
 'bringing',
 'brings',
 'britabroad',
 'british',
 'britni113',
 'britnimcdonald',
 'brits',
 'bro',
 'broadband',
 'broadcast',
 'broke',
 'brolly',
 'bronx',
 'brooklyn',
 'bros',
 'brother',
 'brothers',
 'brought',
 'brown',
 'brucefeldmancfb',
 'bruh',
 'bruhhh',
 'bruiseseverywhere',
 'brutally',
 'bryanjfischer',
 'bstrassburg',
 'btw',
 'bubble',
 'bucket',
 'bucks',
 'bud',
 'buddy',
 'buffalo',
 'buggered',
 'bugs',
 'build',
 'building',
 'built',
 'bull',
 'bullpen',
 'bulls',
 'bullshit',
 'bum',
 'bummed',
 'bump',
 'bunch',
 'bundle',
 'burdick',
 'burger',
 'burkes',
 'burn',
 'burning',
 'burns',
 'burrito',
 'burst',
 'bursting',
 'bus',
 'buses',
 'bush',
 'bushwhackers',
 'business',
 'busy',
 'but',
 'butreally',
 'butt',
 'butter',
 'button',
 'buttongod_ks',
 'buy',
 'buys',
 'buzzing',
 'by',
 'bye',
 'bysarapaulson',
 'ca',
 'cafe',
 'cage',
 'caitfurm',
 'cake',
 'cal',
 'calendar',
 'cali',
 'california',
 'call',
 'called',
 'calling',
 'calls',
 'calm',
 'calum5sos',
 'came',
 'camera',
 'camp',
 'campaign',
 'campbell',
 'camping',
 'can',
 'cancelled',
 'canditotraining',
 'candle',
 'candy',
 'cannot',
 'cant',
 'cantwait',
 'cap',
 'car',
 'card',
 'cards',
 'care',
 'career',
 'carefully',
 'cares',
 'carlosmarsden',
 'carnival',
 'caroline',
 'carras16',
 'carrick',
 'cars',
 'carson',
 'carta',
 'carter',
 'case',
 'cash',
 'cashier',
 'castle',
 'casualty',
 'cat',
 'catch',
 'cats',
 'caught',
 'cause',
 'causing',
 'cbc',
 'ccot',
 'celebrate',
 'celebrated',
 'celebrates',
 'celebrating',
 'celebration',
 'celebrity',
 'celeste_pim',
 'celsius',
 'celtics',
 'center',
 'central',
 'centralpark',
 'cents',
 'ceremony',
 'certain',
 'certainly',
 'cfl',
 'chadders_5',
 'chalmers',
 'champ',
 'champions',
 'championship',
 'championships',
 'chance',
 'chances',
 'change',
 'changed',
 'changes',
 'changing',
 'channel',
 'channeling',
 'characters',
 'charge',
 'charger',
 'charismatic',
 'charmer',
 'chart',
 'chartercom',
 'chasing',
 'chat',
 'chatting',
 'cheating',
 'cheats',
 'check',
 'checking',
 'cheer',
 'cheers',
 'cheery',
 'cheese',
 'cheflife',
 'chefs',
 'chelsea',
 'chelsiemountain',
 'chem',
 'cheney',
 'cherish',
 'chest',
 'chicago',
 'chick',
 'chicken',
 'chicks',
 'child',
 'childhood',
 'children',
 'chiles',
 'chill',
 'chille',
 'chillin',
 'chilling',
 'chinatown',
 'chinese',
 'chirpy',
 'chit',
 'chloefromvine',
 'chocolate',
 'choice',
 'choose',
 'chose',
 'chris',
 'chrisbrown',
 'chriscpc11',
 'chrismara85',
 'chrisnlomas',
 'chrispatsimpson',
 'christ',
 'christianleft',
 'christians',
 'christiec733',
 'christinablank',
 'christopher',
 'chuck',
 'church',
 'chvrches',
 'cia',
 'cinematography',
 'cineworld',
 'circulate',
 'citi',
 'citizens',
 'city',
 'cityofsaskatoon',
 'civ',
 'civilcynic',
 'cj',
 'claim',
 'clarity',
 'clarkherlin',
 'class',
 'classic',
 'classmates',
 'classy',
 'cld',
 'clean',
 'cleaning',
 'clear',
 'clearance',
 'cleared',
 'clearly',
 'clearlyitsme',
 'clevelandfrowns',
 'clever',
 'cliche',
 'click',
 'clinton',
 'clive',
 'cloak',
 'clock',
 'close',
 'closed',
 'closes',
 'closing',
 'clothes',
 'clothing',
 'cloud',
 'club',
 'clue',
 'clutch',
 'cnn',
 'co',
 'coach',
 'coaches',
 'coast',
 'cobaine',
 'cockers',
 'code',
 'cody',
 'coffee',
 'coffeegirlsmile',
 'cold',
 'cole',
 'collapsed',
 'collected',
 'collection',
 'college',
 'colmorrisdavis',
 'colombia',
 'color',
 'colors',
 'coloured',
 'colours',
 'columbia',
 'colvsgre',
 'com',
 'come',
 'comeback',
 'comedian',
 'comedyhackday',
 'comeonengland',
 'comes',
 'comin',
 'coming',
 'comment',
 'commentary',
 'commentating',
 'commentators',
 'comments',
 'common',
 'commutingproblems',
 'company',
 'competition',
 'competitions',
 'complain',
 'complete',
 'completed',
 'completely',
 'completing',
 'comprehend',
 'computer',
 'computers',
 'concern',
 'concerned',
 'concert',
 'concerts',
 'conclusion',
 'concussions',
 'conditional',
 'condoms',
 'coneyisland',
 'conference',
 'confident',
 'confirm',
 'confirmation',
 'confiscating',
 'confused',
 'confusion',
 'congrats',
 'congratulate',
 'connect',
 'connection',
 'connydftba',
 'consider',
 'considering',
 'constant',
 'constantly',
 'constitution',
 'construction',
 'contact',
 'contain',
 'content',
 'contentmarketing',
 'contentstrategy',
 'continues',
 'continuing',
 'control',
 'conventions',
 'conversation',
 'conversations',
 'convinced',
 'cook',
 'cool',
 'cooperation',
 'cope',
 'cops',
 'cormicanfitness',
 'corner',
 'corporations',
 'correct',
 'correctly',
 'cos',
 'cosmosis_j0nes',
 'costa',
 'cote',
 'couch',
 'could',
 'couldabeenworse',
 'couldn',
 'council',
 'counter',
 'country',
 'countrysayingss',
 'county',
 'couple',
 'couples',
 'course',
 'courtcordova',
 'cousin',
 'cousins',
 'cover',
 'coverage',
 'cow',
 'cowell',
 'coz',
 'crab',
 'crack',
 'cracking',
 'crafty',
 'cramp',
 'cramps',
 'cranky',
 'crap',
 'crappy',
 'crash',
 'crawford',
 'crawlspace',
 'cray',
 'crazy',
 'crc',
 'cream',
 'create',
 'created',
 'creative',
 'credit',
 'creepy',
 'crew',
 'crib',
 'cried',
 'crime',
 'criminal',
 'crispyconcords',
 'cristiano',
 'criticism',
 'crop',
 'cross',
 'crosses',
 'crossing',
 'crowd',
 'crowded',
 'croydon',
 'crucial',
 'crueltyfree',
 'crush',
 ...]

In [123]:
# count_vec.transform()

In [99]:
master


Out[99]:
Unnamed: 0 0 type English ToUser Hashtags AllCapsCount
0 0 Thanks sarcastic 1 0 0 0
1 1 Top tip. To illicit a \"thank you\" from some... sarcastic 1 0 1 0
2 2 Thanks to whoever just threw the bag of waterm... sarcastic 1 0 1 0
3 3 yes let's #EndFathersDay because the mother i... sarcastic 1 0 1 0
4 4 Well it's just gonna turn into a lovely day sarcastic 1 0 0 0
5 5 Nothing to see here, move along Lerner's L... sarcastic 1 0 0 1
6 6 So who does Campbell play for? sarcastic 1 0 0 0
7 7 @JamesBraginton @STEM08 @thinkprogress \nJames... sarcastic 1 1 0 1
8 8 Does this make me fancy? #imsofancyyoualreadyk... sarcastic 1 0 1 0
9 9 I love that Arequipa just shuts the water off ... sarcastic 1 0 0 1
10 10 Everyone's at Notre Dame and I'm just sitting ... sarcastic 1 0 1 0
11 11 Tweet of the day!!!! \ud83d\ude1c Holy shit... sarcastic 1 0 0 0
12 12 @LUTZLOVER43 sarcastic 1 1 0 1
13 13 #orgasm sarcastic 1 0 1 0
14 14 I hate to see Luis Suarez get injured, returni... sarcastic 1 0 0 1
15 15 @jaycutlersux right bc Bush\/Cheney were total... sarcastic 1 1 0 0
16 16 @TPoloking don't wrry sarcastic 1 1 0 0
17 17 wow today is going just absolutely SPECTACULAR sarcastic 1 0 0 1
18 18 Looking forward to playing Costa Rica what wit... sarcastic 1 0 0 0
19 19 After the last friendly, I can only be happy a... sarcastic 1 0 0 1
20 20 Tithes paid. Bills paid. Now to go clock anoth... sarcastic 1 0 0 1
21 21 Clive thought that was in, great commentary ... sarcastic 1 0 1 0
22 22 @New0rleans_Lady @Aaron_RS Lol Thats they fair... sarcastic 1 1 0 0
23 23 \"Nothing says 'come to me baby' like a sexy p... sarcastic 1 0 0 0
24 24 @ScottCubs36 why is white so positive, you rac... sarcastic 1 1 1 0
25 25 I just love that when a celebrity or a youtube... sarcastic 1 0 1 1
26 26 Loving the football tonight \ud83d\udc9c\ud83d... sarcastic 1 0 1 0
27 27 I just pulled off 3 ticks from my hip yay! #... sarcastic 1 0 1 1
28 28 @francescaacox apparently people have been men... sarcastic 1 1 0 0
29 29 The only downside is that it wouldn't destroy ... sarcastic 1 0 0 0
... ... ... ... ... ... ... ...
423593 293542 \u26bd\ufe0fOrgullosa \ud83d\ude0b\ud83c\udf34... genuine 1 1 0 0
423594 293543 @gabeliedman I'll watch anything with Jan Hooks genuine 1 1 0 0
423595 293544 .@BigCee302MVP R U sure about that? Many studi... genuine 1 1 0 2
423596 293545 We know how to celebrate our freedom in SF! #h... genuine 1 1 1 1
423597 293546 @jawnv6 I did not. Will read when I get a chan... genuine 1 1 0 2
423598 293547 Just Switched Up Outfits, Looking Extraaa FLY!... genuine 1 0 1 1
423599 293548 Happy 4th of July America!!the best country in... genuine 1 0 1 0
423600 293549 @KelvinNeves @10stolemygoal2 hahaha stop that ... genuine 1 1 0 0
423601 293550 Ready to be in Jersey by the pool alreadddy \u... genuine 1 0 0 0
423602 293551 It's cold outside. I prefer this over yesterda... genuine 1 0 0 1
423603 293552 I'm glad none of my friends know my female cou... genuine 1 0 0 0
423604 293553 Happy 4th of July!! #nyc #macys #4thJuly #usa ... genuine 1 1 1 0
423605 293554 @KevyKevv92 lmfao slut! \nHave fun at Vince's ... genuine 1 1 0 0
423606 293555 @ShaniKelly almost time to see you\ud83d\ude08 genuine 1 1 0 0
423607 293556 Happy 4th of July!\n\ud83c\uddfa\ud83c\uddf8\u... genuine 1 1 0 0
423608 293557 Happy 4Th Of July @ColinMrattt genuine 1 1 0 0
423609 293558 I may be stuck in #NYC but my heart will alway... genuine 1 0 1 2
423610 293559 @Kamryn_Jackson @LandriWilliams y'all are lowe... genuine 1 1 0 0
423611 293560 why THE FUCK am I not consuming obscene amount... genuine 1 0 0 3
423612 293561 Selfie with my crew #PartyForOne #best4thJulyE... genuine 1 0 1 0
423613 293562 At least now I can go for Brazil since neymars... genuine 1 0 0 1
423614 293563 @heyitsSamN better wake up for breakfast tonorrow genuine 1 1 0 0
423615 293564 I'm actually so mad. I don't even know where t... genuine 1 0 0 3
423616 293565 2 hours and 20 minutes until my birthday bitch... genuine 1 0 0 0
423617 293566 Says the Floridian! @Allie_Davison: @Tomas_Ve... genuine 1 1 0 0
423618 293567 It has been a fun filled day with the family a... genuine 1 0 0 0
423619 293568 Back at @TheStandNYC for a couple shows tonigh... genuine 1 1 1 0
423620 293569 I love vagina shorts \ud83d\ude0d\ud83d\ude02\... genuine 1 1 0 1
423621 293570 my little brother and his online minecraft bes... genuine 1 0 1 0
423622 293571 Watching the fireworks over the Brooklyn bridg... genuine 1 0 0 0

423623 rows × 7 columns


In [21]:
unigrams = pd.DataFrame(varr)
unigrams['ToUser']=list(subset['ToUser'])
unigrams['Hashtags']=list(subset['Hashtags'])
unigrams['AllCapsCount']=list(subset['AllCapsCount'])

Tweets vectorized by Top-5000 Unigram vocabulary


In [22]:
unigrams


Out[22]:
0 1 2 3 4 5 6 7 8 9 ... 4993 4994 4995 4996 4997 4998 4999 ToUser Hashtags AllCapsCount
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
6 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 1
8 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
9 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
10 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
11 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
12 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 1
13 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
14 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
15 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
16 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
17 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
18 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
19 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
20 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
21 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
22 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
23 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 1 0
25 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 1
26 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
27 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 1
28 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
29 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
15970 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
15971 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 1 3
15972 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
15973 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
15974 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 1
15975 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 1 0
15976 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 2
15977 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
15978 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
15979 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 1
15980 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
15981 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 1
15982 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
15983 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
15984 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 1
15985 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
15986 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
15987 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
15988 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
15989 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
15990 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
15991 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 2
15992 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 3
15993 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
15994 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
15995 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
15996 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
15997 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 4
15998 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
15999 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

16000 rows × 5003 columns

Classification & Evaluation


In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(unigrams, subset['type'], test_size=0.4, random_state=0)

In [78]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(X_train, y_train) 
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
# print(clf.predict([[-0.8, -1]]))


Out[78]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [79]:
clf.score(X_test,y_test)


Out[79]:
0.58296875000000004

In [80]:
svc_predictions = clf.predict(X_test)

In [83]:
svc_preds_numeric = [1 if x=='sarcastic' else 0 for x in svc_predictions]
y_test_numeric = [1 if x=='sarcastic' else 0 for x in y_test]
fpr_svc, tpr_svc, thresh_svc = roc_curve(y_test_numeric, svc_preds_numeric)

In [98]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, svc_predictions)


Out[98]:
0.58296875000000004

In [84]:
plt.figure()
lw = 2
plt.plot(fpr_svc, tpr_svc, color='blue',lw=lw, label='ROC curve',)
plt.title("SVM ROC")


Out[84]:
<matplotlib.text.Text at 0x114126eb8>

In [86]:
tpr_svc


Out[86]:
array([ 0.        ,  0.98584906,  1.        ])

In [87]:
fpr_svc


Out[87]:
array([ 0.        ,  0.81490683,  1.        ])

In [26]:
from sklearn.linear_model import LogisticRegression
lr_clf= LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)

In [27]:
lr_clf.fit(X_train,y_train)


Out[27]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [29]:
lr_clf.score(X_test,y_test)


Out[29]:
0.86921875000000004

In [30]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
scores = cross_val_score(lr_clf, unigrams, subset['type'], cv=cv)
scores


Out[30]:
array([ 0.88020833,  0.88145833,  0.87666667])

In [31]:
lr_clf.predict_proba(X_test)


Out[31]:
array([[ 0.34393949,  0.65606051],
       [ 0.98309165,  0.01690835],
       [ 0.0990328 ,  0.9009672 ],
       ..., 
       [ 0.58789103,  0.41210897],
       [ 0.59976438,  0.40023562],
       [ 0.19569877,  0.80430123]])

In [75]:
lr_predictions = lr_clf.predict(X_test)

from sklearn.metrics import roc_auc_score, roc_curve
from sklearn import metrics 

# roc_auc_score(y_test,lr_clf.predict_proba(X_test))
# roc_curve(y_test, lr_predictions, pos_label="sarcastic")

In [49]:
lr_preds_numeric = [1 if x=='sarcastic' else 0 for x in lr_predictions]
y_test_numeric = [1 if x=='sarcastic' else 0 for x in y_test]

In [69]:
fpr_lr, tpr_lr, thresh_lr = roc_curve(y_test_numeric, lr_preds_numeric)

In [51]:
pd.Series(lr_predictions).value_counts()


Out[51]:
sarcastic    3209
genuine      3191
dtype: int64

In [72]:
plt.figure()
lw = 2
plt.plot(fpr_lr, tpr_lr, color='red',lw=lw, label='ROC curve',)
plt.title('LogReg ROC')


Out[72]:
<matplotlib.text.Text at 0x11447edd8>

In [52]:
from sklearn import tree
dt_clf = tree.DecisionTreeClassifier()
dt_clf = dt_clf.fit(X_train, y_train)

In [53]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)


Out[53]:
GaussianNB(priors=None)

In [54]:
NB_results = clf.score(X_test, y_test)

In [97]:
NB_results


Out[97]:
0.78171875000000002

In [55]:
# cross_val_score(clf.predict_proba(X_test, y_test))
nb_predictions_positive = np.array([n[1] for n in clf.predict_proba(X_test)])

In [56]:
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, nb_predictions_positive, pos_label="sarcastic")

In [57]:
nb_predictions_positive


Out[57]:
array([ 1.,  0.,  1., ...,  0.,  0.,  0.])

In [58]:
y_test_1 = [1 if n=='sarcastic' else 0 for n in y_test]

In [59]:
y_test_1 = np.array(y_test_1)

In [61]:
fpr, tpr, thresholds = roc_curve(y_test, nb_predictions_positive, pos_label="sarcastic")

In [73]:
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',lw=lw, label='ROC curve',)
plt.title("NaiveBayes ROC")


Out[73]:
<matplotlib.text.Text at 0x119856320>

In [ ]:
# thresholds
(clf.sigma_[1]).argmax()
clf.sigma_[1]
# 1.4590845579489764
# len(clf.sigma_[1])
# clf.classes_

Last feature, "AllCapsCount", is most predictive

Testing on Election tweets


In [151]:
test_tweets = pd.read_csv('test_tweets_df.csv')
test_labels = test_tweets['label']

In [155]:
test_unigrams = pd.DataFrame(count_vec.transform(test_tweets['0']).toarray())
test_unigrams['ToUser'] = test_tweets['ToUser']
test_unigrams['Hashtags'] = test_tweets['Hashtags']
test_unigrams['AllCapsCount'] = test_tweets['AllCapsCount']

In [156]:
test_labels_numeric = [1 if x=='sarcastic' else 0 for x in test_labels]

In [163]:
# X_train, X_test, y_train, y_test = train_test_split(test_unigrams, test_labels_numeric, test_size=0.4, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(test_unigrams, test_labels, test_size=0.4, random_state=0)

In [171]:
lr_clf.score(X_test, y_test)
test_preds = lr_clf.predict(X_test)

In [168]:
clf.score(X_test, y_test)


Out[168]:
0.48749999999999999

In [170]:
dt_clf.score(X_test, y_test)


Out[170]:
0.5

In [175]:
test_preds_numeric = [1 if x=='sarcastic' else 0 for x in test_preds]
y_test_numeric = [1 if x=='sarcastic' else 0 for x in y_test]
fpr_test, tpr_test, thresh_test = roc_curve(y_test_numeric, test_preds_numeric)

In [179]:
plt.plot(fpr_test, tpr_test, color='green')
plt.title("LogReg on test data")


Out[179]:
<matplotlib.text.Text at 0x119c92860>

Results

SVM: 58% accuracy

NaiveBayes: 78% accuracy

Logistic Regression: 86% accuracy!