In [1]:
import IPython
import numpy as np
import pandas as pd
from tsa.science import numpy_ext as npx
from collections import Counter

import viz

from sklearn import metrics, cross_validation
from sklearn import linear_model

from tsa import stdout, stderr
from tsa.lib import tabular, datetime_extra
from tsa.lib.timer import Timer
from tsa.models import Source, Document, create_session
from tsa.science import features, models, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science.summarization import metrics_dict, metrics_summary

In [10]:
# One-character ASCII punctuation tokens, '!' through '`', handed to the n-gram
# extraction as stop words. The characters '{', '|', '}' and '~' are not in the list.
punctuation = list('!"#$%&\'()*+,-./:;<=>?@[\\]^_`')

In [3]:
session = create_session()

# All documents whose source is named 'sb5b'.
sb5b_documents = (
    session.query(Document)
    .join(Source)
    .filter(Source.name == 'sb5b')
    .all()
)

# All documents whose source is named 'twitter-sample'.
sample_documents = (
    session.query(Document)
    .join(Source)
    .filter(Source.name == 'twitter-sample')
    .all()
)

In [11]:
# Corpus over both document sets, labelled by source name, with an intercept
# feature plus n-gram features (ngram_max=2); only punctuation is stop-listed.
full_corpus = MulticlassCorpus(sb5b_documents + sample_documents)
full_corpus.apply_labelfunc(lambda document: document.source.name)
full_corpus.extract_features(lambda document: 1, features.intercept)
full_corpus.extract_features(
    lambda document: document.document,
    features.ngrams,
    ngram_max=2,
    min_df=2,
    stop_words=punctuation)

print(full_corpus)


<MulticlassCorpus X = (235110, 342782), y = (235110,)>

In [22]:
# Same construction as the full corpus, except that the query terms are added
# to the stop-word list alongside punctuation.
# NOTE: the feature dump in the following cell shows that many names containing
# 'sb5' (e.g. u'#sb5/', u'anti-sb5', u'#repealsb5') are still present afterwards.
query_terms = ['sb5', 'weareohio', 'issue2', '#sb5', '#weareohio', '#issue2']
naive_corpus = MulticlassCorpus(sb5b_documents + sample_documents)
naive_corpus.apply_labelfunc(lambda document: document.source.name)
naive_corpus.extract_features(lambda document: 1, features.intercept)
naive_corpus.extract_features(
    lambda document: document.document,
    features.ngrams,
    ngram_max=2,
    min_df=2,
    stop_words=query_terms + punctuation)

print(naive_corpus)


<MulticlassCorpus X = (235110, 337720), y = (235110,)>

In [25]:
# Every remaining feature name that contains the substring 'sb5'.
[feature_name
 for feature_name in naive_corpus.feature_names
 if 'sb5' in feature_name]


Out[25]:
[u'!! @elena_sb5',
 u'!!! www.sb5.com.br',
 u'##sb5',
 u'##sb5 #yeson2',
 u'#2012theme @ohiogop#sb5',
 u'#antisb5',
 u'#antisb5 #educatorsagainstsb5',
 u'#betterohio #sb5debates',
 u'#betterohiologic#sb5',
 u'#cmcforum #sb5\u201d',
 u'#edreform #sb5debates',
 u'#educatorsagainstsb5',
 u'#educatorsagainstsb5 --',
 u'#gotv #repealsb5',
 u'#happyinecle anti-#sb5',
 u'#issue 2/#sb5',
 u'#issue2#sb5',
 u'#issue2#sb5 http://t.co/3lajne6o',
 u'#issue2#sb5 http://t.co/sek3jula',
 u'#issue2/#sb5',
 u'#issue2/#sb5 --',
 u'#issue2/#sb5 vote',
 u'#issue2sb5',
 u'#issue2sb5 vote',
 u'#kasichcompromiseideas#sb5',
 u'#keepsb5',
 u'#mississippi#personhood #sb5#ohunion#p2',
 u'#nassau sb5/a4/c4/104/114',
 u'#non2 #repealsb5',
 u'#noon repeal#sb5',
 u'#noon2 #repealsb5',
 u'#noon2 #sb5#fkasich',
 u'#noon2 #sb5/#edshowissue2',
 u'#noon2 #sb5/edshowissue2',
 u'#noon2 #sb5daily',
 u'#noon2 #sb5doit',
 u'#noon2 #sb5http',
 u'#noon2 #sb5more',
 u'#noon2 #sb5\u201d',
 u'#noon2 repeal#sb5',
 u'#nosb5',
 u'#nosb5 #standupoh',
 u'#ohio #sb5\u201d',
 u'#ohiosb5',
 u'#ohunion #repealsb5',
 u'#ows #repealsb5',
 u'#preisvergleich http://inbrd.de/sb5',
 u'#repealsb5',
 u'#repealsb5 #kasichsucks',
 u'#repealsb5 #noon2',
 u'#repealsb5 #ohio',
 u'#repealsb5 #p2',
 u'#repealsb5 #standupoh',
 u'#repealsb5 #votenoh2',
 u'#repealsb5 #votenoon2',
 u'#repealsb5 have',
 u'#repealsb5 http://t.co/tkzquixf',
 u'#repealsb5 nov',
 u'#repealsb5 say',
 u'#repealsb5 this',
 u"#repealsb5 we're",
 u'#repealsb5 with',
 u'#sb5#',
 u'#sb5# http://k.ai/poi/19081365',
 u'#sb5#fkasich',
 u'#sb5#issue2',
 u'#sb5#issue2 no',
 u'#sb5#ohunion#p2',
 u'#sb5#p2',
 u'#sb5#weareohio',
 u'#sb5#weareohio #issue2#standupoh',
 u"#sb5'ers",
 u"#sb5'ers like",
 u"#sb5's.",
 u"#sb5's:",
 u"#sb5's: teachers",
 u'#sb5)',
 u'#sb5) http://t.co/1vtmc7wd',
 u'#sb5) http://t.co/ioeocxmr',
 u'#sb5-',
 u'#sb5- firefighters',
 u'#sb5- it',
 u'#sb5--#ohio',
 u"#sb5--#ohio 's",
 u'#sb5-now',
 u'#sb5-now that',
 u'#sb5-the',
 u'#sb5-the anti-worker',
 u'#sb5-vote',
 u'#sb5-vote no',
 u'#sb5/',
 u'#sb5/ #issue',
 u"#sb5/ 's",
 u'#sb5/ also',
 u'#sb5/ and',
 u'#sb5/ breakdown',
 u'#sb5/ is',
 u'#sb5/ it',
 u'#sb5/ kasich',
 u'#sb5/ ohio',
 u'#sb5/ reilly',
 u'#sb5/ we',
 u"#sb5/ won't",
 u'#sb5/#edshowissue2',
 u'#sb5/#issue2',
 u'#sb5/#issue2 --',
 u'#sb5/#issue2 debate',
 u'#sb5/#issue2 encourage',
 u'#sb5/#issue2 http://t.co/g3irwlyc',
 u'#sb5/#issue2 in',
 u'#sb5/#issue2 is',
 u'#sb5/#issue2 now',
 u'#sb5//',
 u'#sb5// such',
 u'#sb5/edshowissue2',
 u'#sb5/issue',
 u'#sb5/issue 2',
 u'#sb5/issue2',
 u'#sb5=',
 u'#sb5= #issue',
 u'#sb5=$11m',
 u'#sb5=$11m who',
 u'#sb5\\',
 u'#sb5also',
 u'#sb5also too',
 u'#sb5attacksallpublicworkers',
 u'#sb5chat',
 u'#sb5chat #noon2',
 u'#sb5creator',
 u'#sb5creator shannon',
 u'#sb5daily',
 u'#sb5debate',
 u'#sb5debate #cinstateaaup',
 u'#sb5debates',
 u'#sb5debates #betterohio',
 u'#sb5debates #noon2',
 u'#sb5doit',
 u'#sb5doit please',
 u'#sb5faithlocal',
 u'#sb5faithlocal controlrightssafetypersonalveteranshttp',
 u'#sb5http',
 u'#sb5http :/',
 u'#sb5if',
 u'#sb5if we',
 u'#sb5more',
 u'#sb5more deception',
 u'#sb5nfib',
 u'#sb5no',
 u'#sb5no on',
 u'#sb5standup',
 u'#sb5standup #fightforjobs',
 u'#sb5tea',
 u'#sb5tea party',
 u'#sb5video',
 u'#sb5video what',
 u'#sb5video-',
 u'#sb5video- http://t.co/71gkvogt',
 u'#sb5\u201d',
 u'#sb5\u201d #occupyohio',
 u'#sb5\u201d #teaparty',
 u'#sb5\u201d @thedanboyd',
 u'#sb5\u201d @theyoggs',
 u'#sb5\u201d awesome',
 u'#sb5\u201d easy',
 u'#sb5\u201d vote',
 u'#sb5\u201d/do',
 u'#sb5\u201d/do your',
 u'#slownheavy#sb5',
 u'#slownheavy#sb5 is',
 u'#standupoh #issue2#sb5',
 u'#standupoh #repealsb5',
 u'#standupoh repeal#sb5',
 u'#standupohio repeal#sb5',
 u'#stopsb5',
 u'#tcot ##sb5',
 u'#tcot#sb5#issue2',
 u'#tcot#sb5#issue2 http://t.co/vzisfqiw\u201d',
 u'#teaparty #tcot#sb5#issue2',
 u'#unions=$30m #sb5=$11m',
 u'#vetosb5',
 u'#vetosb5 and',
 u'#votenoh2 #repealsb5',
 u'#votenoon2 issue2/sb5',
 u'#votenoon2 sb5/issue2',
 u'#votenoonsb5',
 u'#weareohio#sb5',
 u'#wearethe99 #repealsb5',
 u'#yeson2 http://t.co/un8bh4tn#sb5',
 u'#yeson2#yeson2#sb5',
 u'#yeswecan #repealsb5',
 u'#yoursb5number',
 u'$sb5',
 u'$sb5 #standupoh',
 u"'s #sb5/",
 u"'s car#sb5",
 u"'s pro-sb5",
 u"'s sb5)",
 u'(#sb5',
 u'(#sb5 appears',
 u'(#sb5 arguments',
 u'(#sb5 http://t.co/xlmwpsn',
 u'(#sb5 is',
 u'(#sb5 kickoff',
 u'(#sb5 on',
 u'(#sb5 would',
 u'(#sb5)',
 u'(#sb5) http://t.co/8dpomyg',
 u'(#sb5) http://t.co/siwxvr1',
 u'(#sb5) saying',
 u'-#sb5',
 u'-#sb5 group',
 u'-- http://caltweet.com/sb5',
 u'-sb5',
 u'-sb5 sticker',
 u'... http://img.ly/94sq#sb5',
 u'... http://twiffo.com/sb5',
 u'... http://yamm.hu/l/sb5',
 u'... www.repealsb5ohio.com',
 u'/#sb5',
 u'/#sb5 @phyrefyter',
 u'//#sb5',
 u'//#sb5 #ohunion',
 u'/bit ly/fefd9v#sb5',
 u'/bit ly/loesxt#sb5why',
 u'/bit ly/mfaoug#sb5',
 u'/bit ly/mtfz4s#sb5',
 u'/bit ly/ofrzjf#sb5',
 u'/t co/0d8dvij#sb5',
 u'/t co/2ejuiovx#sb5',
 u'/t co/3ngwn3ip#sb5',
 u'/t co/6rzaklsp#sb5',
 u'/t co/cvw2e06x#sb5',
 u'/t co/fer5hrpc#sb5',
 u'/t co/fpgegcc#sb5',
 u'/t co/hf4knxc9#sb5',
 u'/t co/lahan4t#sb5',
 u'/t co/qaiktbb#sb5',
 u'/t co/t1klwke#sb5',
 u'/t co/tcyoeca#sb5',
 u'/t co/u4or2vu#sb5',
 u'/t co/x0hkt4d#sb5',
 u'/t co/xmrmdnr1#sb5',
 u'1/2]#sb5',
 u'1/2]#sb5 #grannygate',
 u'2 (#sb5',
 u'2 (#sb5)',
 u'2 http://t.co/fjogxtp0#sb5',
 u'2 sb5)',
 u'2(sb5',
 u'2(sb5 #vote',
 u'2/#sb5',
 u'2/#sb5 battle',
 u'2/#sb5 debate',
 u'2/2]#sb5',
 u'2/2]#sb5 #grannygatehttp',
 u'2/sb5',
 u'2/sb5 ...',
 u'2/sb5 68-32',
 u'2/sb5 and',
 u'2/sb5 campaign',
 u'2/sb5 coming',
 u'2/sb5 debate',
 u'2/sb5 fewer',
 u'2/sb5 from',
 u'2/sb5 http://t.co/3nqbqg5x',
 u'2/sb5 http://t.co/5gmsphvl',
 u'2/sb5 http://t.co/cxk9kzgw',
 u'2/sb5 http://t.co/mb4g2x2',
 u'2/sb5 http://t.co/pkdspq9i',
 u'2/sb5 http://t.co/ufy8t6np',
 u'2/sb5 http://t.co/zhiv2ov9',
 u'2/sb5 http://t.co/zsytvqu',
 u'2/sb5 i',
 u'2/sb5 in',
 u'2/sb5 is',
 u'2/sb5 make',
 u'2/sb5 on',
 u'2/sb5 schools',
 u'2/sb5 the',
 u'2/sb5 to',
 u'2/sb5 victory',
 u'2/sb5 with',
 u'23 http://t.co/efnyjcrq#sb5',
 u'3 #sb5also',
 u'317 sb5<7',
 u'366 sb5^2',
 u'5 sb5)',
 u'5#sb5',
 u'6 http://urele.com/sb5',
 u'680 sb5>3',
 u'920rtx/sb5',
 u'920rtx/sb5 1/2',
 u'?? @elena_sb5',
 u'@aescalante09 @boni_sb5',
 u'@alex_tx20 @boni_sb5',
 u'@amanda_sb5',
 u'@billhemmer issue2/sb5',
 u'@boni_sb5',
 u'@boni_sb5 @aescalante09',
 u'@boni_sb5 @alex_tx20',
 u'@boni_sb5 @pabliich',
 u'@boni_sb5 jaja',
 u'@boni_sb5 jajaja',
 u'@boni_sb5 no',
 u'@boni_sb5 que',
 u'@boni_sb5 y',
 u'@boni_sb5 ya',
 u'@cadfile pro-sb5',
 u'@carlos__pr @elena_sb5',
 u'@crissweet4 @elena_sb5',
 u'@dsb59',
 u'@dsb59 @missamericapie',
 u'@elena_sb5',
 u'@elena_sb5 !!',
 u'@elena_sb5 #verdadescomotemplos',
 u'@elena_sb5 @carlos__pr',
 u'@elena_sb5 @crissweet4',
 u'@elena_sb5 @gemita_gm',
 u'@elena_sb5 @lidiahnndz',
 u'@elena_sb5 @mariiisofy',
 u'@elena_sb5 @martafernande47',
 u'@elena_sb5 @mayraalexis',
 u'@elena_sb5 @mirita_17',
 u'@elena_sb5 @nientedimale',
 u'@elena_sb5 como',
 u'@elena_sb5 en',
 u'@elena_sb5 es',
 u'@elena_sb5 http://t.co/3njj4tjm',
 u'@elena_sb5 jaja',
 u'@elena_sb5 jajaa',
 u'@elena_sb5 jajaja',
 u'@elena_sb5 jajajaj',
 u'@elena_sb5 joooder',
 u'@elena_sb5 los',
 u'@elena_sb5 me',
 u'@elena_sb5 menos',
 u'@elena_sb5 mira',
 u'@elena_sb5 no',
 u'@elena_sb5 pero',
 u'@elena_sb5 por',
 u'@elena_sb5 pues',
 u'@elena_sb5 que',
 u'@elena_sb5 sabes',
 u'@elena_sb5 si',
 u'@elena_sb5 siii',
 u'@elena_sb5 soy',
 u'@elena_sb5 te',
 u'@elena_sb5 tia',
 u'@elena_sb5 tu',
 u'@elena_sb5 y',
 u'@eli_limbo_11 @boni_sb5',
 u'@gemita_gm @elena_sb5',
 u'@gohpblog #sb5http',
 u'@gohpblog @sb5facts',
 u'@gohpblog anti-#sb5',
 u'@gohpblog#sb5http',
 u'@gohpblog#sb5http :/',
 u'@gregmild @sb5facts',
 u'@gregmild pro-#sb5',
 u'@hallettjoe beat-sb5',
 u'@innovationohio)#sb5',
 u'@jhillohio (#sb5',
 u'@keith_stone_sb5',
 u'@keith_stone_sb5 i',
 u'@keith_stone_sb5 still',
 u'@keith_stone_sb5 what',
 u'@klsriley @repealsb5ohio',
 u'@libertybelle4 #sb5/',
 u'@lidiahnndz @elena_sb5',
 u'@mariiisofy @elena_sb5',
 u'@martafernande47 @elena_sb5',
 u'@natalia_sb5',
 u'@natalia_sb5 no',
 u'@natalia_sb5 obvio',
 u'@natalia_sb5 y',
 u'@nientedimale @elena_sb5',
 u'@notgvn sb5facts',
 u'@ohiogop#sb5',
 u'@plunderbund anti-sb5',
 u'@plunderbund pro-#sb5',
 u'@plunderbund pro-sb5',
 u'@portiaaboulger sb5-the',
 u'@progressohio @repealsb5ohio',
 u'@progressohio pro-sb5',
 u'@randyludlow anti-#sb5',
 u'@repealsb5ohio',
 u'@repealsb5ohio @gohpblog',
 u'@repealsb5ohio @jointhefutureoh',
 u'@repealsb5ohio @seiu1199wvkyoh',
 u'@repealsb5ohio hey',
 u'@repealsb5ohio how',
 u'@repealsb5ohio is',
 u'@repealsb5ohio jon',
 u'@repealsb5ohio liar',
 u'@repealsb5ohio must',
 u'@repealsb5ohio poorly',
 u'@repealsb5ohio to',
 u'@repealsb5ohio wish',
 u'@repealsb5ohio you',
 u'@rweingarten ohio-sb5-8%',
 u'@sandytheis pro-#sb5',
 u'@sb5',
 u'@sb5fact',
 u'@sb5fact speak',
 u'@sb5fact speak-under',
 u'@sb5facts',
 u'@sb5facts 25',
 u'@sb5facts does',
 u'@sb5facts if',
 u'@sb5facts in',
 u'@sb5facts it',
 u'@sb5facts line',
 u'@sb5facts under',
 u'@sb5facts where',
 u'@sb5facts you',
 u'@sb5oh',
 u"@simondemonfor w$\u04455!\u0437dmcn\u0447\u043ba1?\u044a\u0434\u043a?w\u043a\u0435ohs`gh\u0449\u044b\u0441\u043f<te\u0445.s\u0438(q\u043e\u0439+sb5'vr}\u0431rn\u0443]y\u043am\u0430&#x",
 u'@teamsternation anti-sb5',
 u'@tommy_sb5',
 u'@treyutacht #sb5no',
 u'@tweetyblurb @repealsb5ohio',
 u'@unkybunky#sb5',
 u'@unkybunky#sb5 and',
 u'a @elena_sb5',
 u'a doozy#sb5',
 u'abertas www.sb5.com.br',
 u'about #repealsb5',
 u'about #sb5/',
 u'about #sb5http',
 u'about (#sb5',
 u'about (#sb5)',
 u'about /#sb5',
 u'about sb5/issue',
 u'about sb5/issue2',
 u'about sb5=vote',
 u'accounts @repealsb5ohio',
 u'acesse www.sb5.com.br',
 u'act sb5-hdg6',
 u'act sb5-hdss',
 u'action #repealsb5',
 u'action http://t.co/xcy1wzdm#sb5',
 u'advertiser-tribune anti-sb5',
 u'affect #sb5http',
 u'against #sb5-',
 u'against #sb5/',
 u'against #sb5/#issue2',
 u'against http://bit.ly/reviewsb5',
 u'against sb5-36',
 u'against sb5-vote',
 u'against sb5/issue',
 u'against sb5=vote',
 u'all #sb5debates',
 u'all pro-sb5',
 u'alone.twitpic.com/68g157#sb5',
 u'an anti-#sb5',
 u'an anti-sb5',
 u'and #sb5/',
 u'and firefighters#sb5',
 u"anti- #sb5'ers",
 u"anti- #sb5's.",
 u"anti- #sb5's:",
 u'anti-#sb5',
 u"anti-#sb5 's",
 u'anti-#sb5 activists',
 u'anti-#sb5 buttons',
 u'anti-#sb5 campaign',
 u'anti-#sb5 comments',
 u'anti-#sb5 crowd',
 u'anti-#sb5 effort',
 u'anti-#sb5 efforts',
 u'anti-#sb5 event',
 u'anti-#sb5 folks',
 u'anti-#sb5 great-grandmother',
 u'anti-#sb5 group',
 u'anti-#sb5 material',
 u'anti-#sb5 organizers',
 u'anti-#sb5 petition',
 u'anti-#sb5 rally',
 u'anti-#sb5 t-shirt',
 u'anti-#sb5-ers',
 u'anti-#sb5-ers love',
 u'anti-sb5',
 u'anti-sb5 ad',
 u'anti-sb5 beer',
 u'anti-sb5 button',
 u'anti-sb5 campaign',
 u'anti-sb5 coalition',
 u'anti-sb5 commercial',
 u'anti-sb5 faction',
 u'anti-sb5 folks',
 u'anti-sb5 granny',
 u'anti-sb5 group',
 u'anti-sb5 old',
 u'anti-sb5 parade',
 u'anti-sb5 petition',
 u'anti-sb5 police',
 u'anti-sb5 proponents',
 u'anti-sb5 rally',
 u'anti-sb5 signatures',
 u'anti-sb5 sticker',
 u'anti-sb5 supporters',
 u'anti-sb5 television',
 u'anti-sb5 video',
 u'anti-sb5 yard',
 u"anti-sb5'ers",
 u"anti-sb5'ers spam",
 u'anti-sb5/pro',
 u'anti-sb5/pro repeal',
 u'anti-worker #sb5/#issue2',
 u'any trust#sb5',
 u'at @repealsb5ohio',
 u'at sb5/issue',
 u'at sb5facts',
 u'at www.ohioprosperity.com/sb5',
 u'ax sb5/brandywine',
 u'bankrolling pro-#sb5',
 u'barcelona sb5/15',
 u'barcelona sb5/23',
 u'barcelona sb5/7',
 u'base w/sb5',
 u'beat-sb5',
 u'beat-sb5 parade',
 u'berding pro-#sb5',
 u'beta 920rtx/sb5',
 u'betterohio pro-#sb5',
 u'bill -sb5',
 u'bill 5#sb5',
 u'bill http://t.co/qkkswvn#sb5',
 u'bill http://youtu.be/puvbty0ah9y#sb5',
 u'biology http://sb5.biobricks.org/',
 u'blog anti-sb5',
 u'boulger-sb5',
 u'boulger-sb5 success',
 u'budget sb5http',
 u'business#sb5',
 u'business#sb5 is',
 u'buy sb5/issue2',
 u'by @unkybunky#sb5',
 u'by anti-#sb5',
 u'by documents#sb5',
 u'by sb5)',
 u'bye sb5unions',
 u'campaign http://t.co/irvonp2b#sb5',
 u'car#sb5',
 u'car#sb5 http://t.co/yjq6uqyl',
 u'carry pro-#sb5',
 u'cents http://bit.ly/p2srkc#sb5',
 u'classes http://t.co/9tph8bgn#sb5',
 u'classrooms http://t.co/pubmylfo#sb5',
 u'co/0d8dvij#sb5',
 u'co/2ejuiovx#sb5',
 u'co/3ngwn3ip#sb5',
 u'co/6rzaklsp#sb5',
 u'co/cvw2e06x#sb5',
 u'co/fer5hrpc#sb5',
 u'co/fer5hrpc#sb5 #edreform',
 u'co/fpgegcc#sb5',
 u'co/hf4knxc9#sb5',
 u'co/hf4knxc9#sb5 is',
 u'co/lahan4t#sb5',
 u'co/qaiktbb#sb5',
 u'co/qaiktbb#sb5 #hb153',
 u'co/t1klwke#sb5',
 u'co/t1klwke#sb5 #hb153',
 u'co/tcyoeca#sb5',
 u'co/u4or2vu#sb5',
 u'co/x0hkt4d#sb5',
 u'co/xmrmdnr1#sb5',
 u'collect anti-sb5',
 u'committee http://wso.li/sb5',
 u'con @elena_sb5',
 u'contract http://t.co/tynicbd#sb5',
 u'contribution http://t.co/kgznvz33#sb5',
 u'cooked#sb5',
 u'crafted #sb5#issue2',
 u'currently #nosb5',
 u'de @elena_sb5',
 u'defeat pro-sb5',
 u'defeated sb5/issue',
 u'defeating sb5/issue',
 u'details #repealsb5',
 u'discuss issue2/sb5',
 u'dislike #sb5tea',
 u'district http://twitpic.com/525vlc#sb5',
 u'dit http://q-audio.net/i/sb5',
 u'dividing #sb5-now',
 u'documents#sb5',
 u'dollars http://t.co/ukh4eth#sb5',
 u'doozy#sb5',
 u'dull #kasichcompromiseideas#sb5',
 u'ecl-sb5(s)',
 u'ecl-sb5(s) \u300celectrolux',
 u'educate sb5=no',
 u'employee http://t.co/0mlch4ya#sb5',
 u'employees http://bit.ly/ndgeav#sb5',
 u'endorses #issue2sb5',
 u'factual 1/2]#sb5',
 u'fake repealsb5ohio',
 u'fat #betterohiologic#sb5',
 u'featuring anti-#sb5',
 u'fighter #sb5//',
 u'firefighters#sb5',
 u'first anti-#sb5',
 u'first anti-sb5',
 u'first pro-sb5',
 u'fop #sb5debates',
 u'for #issue2/#sb5',
 u'for anti-sb5',
 u'for anti-sb5/pro',
 u'for business#sb5',
 u'for pro-sb5',
 u'for pro-sb5/anti',
 u'for repealsb5ohio',
 u'for sb5/denali',
 u'for sb5/issue',
 u'from anti-#sb5',
 u'funding pro-sb5',
 u'fyi-#sb5',
 u'fyi-#sb5 referendum',
 u'hashtag #sb5debate',
 u'hates oh-sb5/issue',
 u'help #repealsb5',
 u'help oh#sb5',
 u'his anti-#sb5',
 u'how anti-#sb5',
 u'how pro-#sb5',
 u'http://ar.gy/sb5',
 u'http://avaloncc.tk/service/bulletin/sb5/9/t6.htm',
 u'http://b5yy.co.cc/sb5',
 u'http://bit.ly/giyded #sb5debates',
 u'http://bit.ly/j5gyko#sb5',
 u'http://bit.ly/j5gyko#sb5 in',
 u'http://bit.ly/kujquy #nosb5',
 u'http://bit.ly/lmwmrh#sb5',
 u'http://bit.ly/m1fcc5 #repealsb5',
 u'http://bit.ly/ndgeav#sb5',
 u'http://bit.ly/p2srkc#sb5',
 u'http://bit.ly/p2srkc#sb5 is',
 u'http://bit.ly/peb9tn#sb5',
 u'http://bit.ly/peb9tn#sb5 #businessgeniuses',
 u'http://bit.ly/qa1yny#sb5',
 u'http://bit.ly/qtsraw $sb5',
 u'http://bit.ly/reviewsb5',
 u'http://caltweet.com/sb5',
 u'http://caltweet.com/sb5 bedroom',
 u'http://img.ly/94sq#sb5',
 u'http://inbrd.de/sb5',
 u'http://inbrd.de/sb5 bewertung',
 u'http://li.ru/sb5',
 u'http://lnkd.in/sb5-nm',
 u'http://meurl.info/sb5',
 u'http://myde.al/sb5',
 u'http://nhan.lv/sb5',
 u'http://oppt.co.cc/sb5',
 u'http://otf.me/sb5',
 u'http://q-audio.net/i/sb5',
 u'http://q-audio.net/i/sb5 thanks',
 u'http://sb5.biobricks.org',
 u'http://sb5.biobricks.org/',
 u'http://sb5.biobricks.org/schedule/worlds-first-synthetic-biology-slam/',
 u'http://sb5ohio.com/signing-locations/',
 u'http://sb5ohio.com/signing-locations/ #standupoh',
 u'http://t.co/0ar7zuzq#sb5',
 u'http://t.co/0mlch4ya#sb5',
 u'http://t.co/16z4sxm #sb5#weareohio',
 u'http://t.co/2zwx9xje #sb5chat',
 u'http://t.co/37xlitdq #sb5\u201d',
 u'http://t.co/3xit8hs#sb5',
 u'http://t.co/4udel6jk#sb5',
 u'http://t.co/4zlrmo8t#sb5firefighters',
 u'http://t.co/4zlrmo8t#sb5firefighters police',
 u'http://t.co/6m8rqfa#sb5',
 u'http://t.co/6m8rqfa#sb5 #standupoh',
 u'http://t.co/8wbmzdtf #sb5debates',
 u'http://t.co/9axwxiim#sb5',
 u'http://t.co/9axwxiim#sb5 utterly',
 u'http://t.co/9tph8bgn#sb5',
 u'http://t.co/9wqecks3 //#sb5',
 u'http://t.co/apforr65#sb5',
 u'http://t.co/apforr65#sb5 #p2',
 u'http://t.co/axdlqh87 #sb5debates',
 u'http://t.co/btlk8q9 #sb5\u201d',
 u'http://t.co/cafdeif #sb5#issue2',
 u'http://t.co/chravi9q#sb5',
 u'http://t.co/chravi9q#sb5 #grannygate',
 u'http://t.co/cixiv3bj#sb5',
 u'http://t.co/ctsjipf4#sb5',
 u'http://t.co/cvp8ndif#sb5',
 u'http://t.co/dbotojlk #sb5debates',
 u'http://t.co/dtvw8ynp #sb5debates',
 u'http://t.co/dujcufcl #nosb5',
 u'http://t.co/e241l8d#sb5',
 u'http://t.co/efnyjcrq#sb5',
 u'http://t.co/efnyjcrq#sb5 #cinstateaaup',
 u'http://t.co/fjlvfmfd#sb5',
 u'http://t.co/fjlvfmfd#sb5 #votenoonissue2',
 u'http://t.co/fjogxtp0#sb5',
 u'http://t.co/frzdm1yz #sb5debates',
 u'http://t.co/fx4af4hh #sb5debates',
 u'http://t.co/gcsty4u#sb5',
 u'http://t.co/gcsty4u#sb5 #standupoh',
 u'http://t.co/gng74h9#sb5',
 u'http://t.co/gng74h9#sb5 still',
 u'http://t.co/irvonp2b#sb5',
 u'http://t.co/ixymbd3b #sb5debates',
 u'http://t.co/jc8dq6u#sb5',
 u'http://t.co/jc8dq6u#sb5 in',
 u'http://t.co/kay6kuuc#sb5',
 u'http://t.co/kfswuaec #repealsb5',
 u'http://t.co/kgznvz33#sb5',
 u'http://t.co/kgznvz33#sb5 ::',
 u'http://t.co/lagucvud #sb5\u201d',
 u'http://t.co/lrdhrlym #sb5debates',
 u'http://t.co/mvuqz2vu #sb5\u201d',
 u'http://t.co/nsbfr3pb #sb5\\',
 u'http://t.co/ntljht01 #sb5debates',
 u'http://t.co/ojbkjua #sb5standup',
 u'http://t.co/oqywifap #sb5debates',
 u'http://t.co/oucg4gp4 #sb5debates',
 u'http://t.co/pubmylfo#sb5',
 u'http://t.co/qkkswvn#sb5',
 u'http://t.co/qr7z8gld @sb5',
 u'http://t.co/qvs4084h #sb5debates',
 u'http://t.co/rzrzpnwr #sb5debates',
 u'http://t.co/sdcuhcz #repealsb5',
 u'http://t.co/semgutao #sb5\u201d',
 u'http://t.co/sjqebg77 #sb5debates',
 u'http://t.co/tnsxrvli #sb5debates',
 u'http://t.co/tynicbd#sb5',
 u'http://t.co/tynicbd#sb5 is',
 u'http://t.co/ukh4eth#sb5',
 u'http://t.co/ukh4eth#sb5 not',
 u'http://t.co/un8bh4tn#sb5',
 u'http://t.co/un8bh4tn#sb5 #noon2',
 u'http://t.co/utgmadhl #sb5debates',
 u'http://t.co/vaimxbrz #sb5debates',
 u'http://t.co/vferioem#sb5',
 u'http://t.co/vu2hujon #sb5debates',
 u'http://t.co/wf9lv66g #sb5debates',
 u'http://t.co/whjc28ba #sb5debates',
 u'http://t.co/xcy1wzdm#sb5',
 u'http://t.co/xubjuvt #yeson2#yeson2#sb5',
 u'http://t.co/xyd4c1jk #sb5\u201d',
 u'http://t.co/ya6ck8iz#sb5',
 u'http://t.co/ya6ck8iz#sb5 #issue2collective',
 u'http://t.co/yaddvqnw #sb5\u201d',
 u'http://t.co/ydogpjla #sb5debates',
 u'http://t.co/yh8kcaor #sb5debates',
 u'http://t.co/yrvqihmr #sb5debates',
 u'http://t.co/yvivxmga #repealsb5',
 u'http://t.co/zmrtmhjg #sb5debates',
 u'http://thinkprogress.org/politics/2011/06/29/257402/sb5-repeal-signatures-delivered/',
 u'http://tinyurl.com/3re6fts#sb5',
 u'http://tinyurl.com/4397kla#sb5',
 u'http://tlk.tc/sb5',
 u'http://twiffo.com/sb5',
 u'http://twitpic.com/525vlc#sb5',
 u'http://urele.com/sb5',
 u'http://woelig.net/sb5',
 u'http://woelig.net/sb5 via',
 u'http://wso.li/sb5',
 u'http://www.ohea.org/sb5-central',
 u'http://www.progressohio.org/blog/2011/05/primary-day-sb5-volunteer-opportunities.html',
 u'http://www.progressohio.org/blog/2011/05/primary-day-sb5-volunteer-opportunities.html @seiu',
 u'http://www.yalestep.org/sb5/z/4046-ivona%2bactivation%2bkey+torrent+downloads.html',
 u'http://www.yalestep.org/sb5/z/4046-ivona%2bactivation%2bkey+torrent+downloads.html ivona\\+activation\\+key',
 u'http://yamm.hu/l/sb5',
 u'http://yj.pn/to-sb5',
 u'http://youtu.be/puvbty0ah9y#sb5',
 u'huh http://bit.ly/j5gyko#sb5',
 u'idea that(#sb5',
 u'if issue2/sb5',
 u'in #sb5-vote',
 u'in #sb5/#issue2',
 u'in anti-#sb5',
 u'in anti-sb5',
 u'in jeopardyjointhefuture.org/blog/363-sb5-c\u2026#weareohio',
 u'in pro-sb5',
 u'independent sb5-supporters\u2019',
 u'insider http://woelig.net/sb5',
 u'institute #sb5#',
 u'is #sb5creator',
 u'is #slownheavy#sb5',
 u'is sb5-creator',
 u'issue 2(sb5',
 u'issue 2/#sb5',
 u'issue 2/sb5',
 u'issue2/#sb5',
 u'issue2/#sb5 school',
 u'issue2/sb5',
 u'issue2/sb5 !!!',
 u'issue2/sb5 #edshowissue2',
 u'issue2/sb5 and',
 u'issue2/sb5 feeds',
 u'issue2/sb5 is',
 u'issue2/sb5 loses',
 u'issue2/sb5 saves',
 u'issue2/sb5 twitter',
 u'it http://bit.ly/qa1yny#sb5',
 u'ize sb5-03-01',
 u'jeopardyjointhefuture.org/blog/363-sb5-c\u2026#weareohio',
 u'jointhefutureoh |sb5',
 u'kasich pro-sb5',
 u'kasich sb5/issue',
 u'kill #sb5=',
 u'kill sb5-vote',
 u'kill sb5=issue',
 u'kill sb5=vote',
 u'know @repealsb5ohio',
 u'lady http://t.co/4udel6jk#sb5',
 u'language http://t.co/jc8dq6u#sb5',
 u'laugh #sb5video',
 u'leading pro-#sb5',
 u'leads pro-sb5',
 u'like sb5/issue',
 u'live http://sb5.biobricks.org',
 u'local pro-sb5',
 u'locations http://sb5ohio.com/signing-locations/',
 u'ly/fefd9v#sb5',
 u'ly/loesxt#sb5why',
 u'ly/loesxt#sb5why would',
 u'ly/mfaoug#sb5',
 u'ly/mtfz4s#sb5',
 u'ly/ofrzjf#sb5',
 u'ly/ofrzjf#sb5 #edreform',
 u'malta http://q-audio.net/i/sb5',
 u'maybe sb5/issue2',
 u'me/olayju #repealsb5',
 u'media http://t.co/ctsjipf4#sb5',
 u'meet on#sb5',
 u'mirrors #kasichcompromiseideas#sb5',
 u'mlk #sb5#issue2',
 u'money http://bit.ly/peb9tn#sb5',
 u'much cooked#sb5',
 u'my anti-sb5',
 u'new pro-#sb5',
 u'new pro-sb5/issue',
 u'niehaus pro-#sb5',
 u'no#sb5',
 u'nonsense http://t.co/3xit8hs#sb5',
 u'noon#issue2 repeal#sb5',
 u'now #repealsb5',
 u'of #sb5)',
 u'of #sb5/',
 u'of #sb5http',
 u'of #sb5video-',
 u'of (#sb5',
 u'of anti-sb5',
 u'of oh-sb5',
 u'of pro-#sb5',
 u'of pro-sb5',
 u'of repealsb5ohio',
 u'of sb5/collective',
 u'of sb5/issue',
 u'of sb5/issue2',
 u'official anti-sb5',
 u'oh pro-#sb5',
 u'oh sb5-issue',
 u'oh#sb5',
 u'oh-sb5',
 u'oh-sb5 the',
 u'oh-sb5/issue',
 u'oh-sb5/issue 2',
 u'ohio anti-sb5',
 u'ohio http://www.progressohio.org/blog/2011/05/primary-day-sb5-volunteer-opportunities.html',
 u'ohio pro-#sb5',
 u'ohio sb5-rheejected',
 u'ohio sb5/issue',
 u'ohio#sb5',
 u'ohio#sb5 literature',
 u'ohio-kill sb5=vote',
 u'ohio-sb5-8%',
 u'ohio-sb5-8% reporting',
 u'olive http://nhan.lv/sb5',
 u'om-sb5',
 u'om-sb5 flash',
 u'omni-bounce om-sb5',
 u'on #issue2#sb5',
 u'on #sb5/',
 u'on #sb5/issue2',
 u'on #sb5faithlocal',
 u'on (#sb5',
 u'on http://t.co/kay6kuuc#sb5',
 u'on issue2/sb5',
 u'on pro-#sb5',
 u'on pro-sb5',
 u'on sb5)',
 u'on sb5-',
 u'on sb5-defense',
 u'on sb5/',
 u'on sb5/issue',
 u'on sb5/no',
 u'on sb5/state',
 u'on#sb5',
 u'on#sb5 this',
 u'one http://otf.me/sb5',
 u'only pro-#sb5',
 u'oppose #sb5/issue',
 u'oppose sb5/issue',
 u'or anti-#sb5',
 u'or no#sb5',
 u'our #sb5/',
 u'overturn #sb5-the',
 u'overturn sb5)',
 u'overturn sb5-the',
 u'parody repealsb5ohio',
 u'pay sb5-required',
 u'payers http://t.co/cvp8ndif#sb5',
 u'payers http://t.co/ya6ck8iz#sb5',
 u'performance #repealsb5',
 u'pileup sb5/14',
 u'poll #sb5/issue',
 u'portia boulger-sb5',
 u'post pro-sb5',
 u'post-sb5',
 u'post-sb5 fight',
 u'pre-sb5',
 u'pro -#sb5',
 u'pro ohio#sb5',
 u'pro#sb5',
 u'pro-#sb5',
 u'pro-#sb5 ad',
 u'pro-#sb5 ads',
 u'pro-#sb5 camp',
 u'pro-#sb5 campaign',
 u'pro-#sb5 crowd',
 u'pro-#sb5 effort',
 u'pro-#sb5 efforts',
 u'pro-#sb5 facebook',
 u'pro-#sb5 fans',
 u'pro-#sb5 farm',
 u'pro-#sb5 group',
 u'pro-#sb5 lawmaker',
 u'pro-#sb5 mike',
 u'pro-#sb5 trying',
 u'pro-#sb5 tweets',
 u'pro-#sb5 using',
 u'pro-#sb5 will',
 u'pro-sb5',
 u'pro-sb5 ad',
 u'pro-sb5 advert',
 u'pro-sb5 businesses',
 u'pro-sb5 camp',
 u'pro-sb5 campaign',
 u'pro-sb5 columbus',
 u'pro-sb5 commercia',
 u'pro-sb5 commercials',
 u'pro-sb5 effort',
 u'pro-sb5 forces',
 u'pro-sb5 front',
 u'pro-sb5 group',
 u'pro-sb5 groups',
 u'pro-sb5 http://t.co/dw6ejawe',
 u'pro-sb5 legislators',
 u'pro-sb5 mailers',
 u'pro-sb5 position',
 u'pro-sb5 side',
 u'pro-sb5 stealth',
 u'pro-sb5 team',
 u'pro-sb5 this',
 u'pro-sb5 types',
 u'pro-sb5/anti',
 u'pro-sb5/anti repeal',
 u'pro-sb5/issue',
 u'pro-sb5/issue 2',
 u'pro-union anti-sb5',
 u'promo\xe7\xe3o trial/sb5',
 u'prosb5',
 u'protest sb5/issue2',
 u'quinn sb5/issue',
 u'r) http://t.co/e241l8d#sb5',
 u'rabia @elena_sb5',
 u'raises http://t.co/gng74h9#sb5',
 u'rates pro-#sb5',
 u're sb5s',
 u'recommendation http://t.co/6m8rqfa#sb5',
 u'remedy @sb5facts',
 u'repeal #sb5-',
 u'repeal #sb5http',
 u'repeal sb5#iaff',
 u'repeal sb5jointhefuture.org/blog/359-sb5-r\u2026#weareohio',
 u'repeal#sb5',
 u'repeal#sb5 #standupoh',
 u'repeal#sb5 http://t.co/3dkyuwsm',
 u'repealing sb5/issue',
 u'repealsb5ohio',
 u'repealsb5ohio bullshit',
 u'repealsb5ohio feed',
 u'repealsb5ohio it',
 u'retweet http://caltweet.com/sb5',
 u'reveals sb5-repeal',
 u'reverse #sb5--#ohio',
 u'rewards pro-sb5',
 u'rivero(colazo) somoza(sb5)',
 u'rivero(pl) somoza(sb5)',
 u'rt @dsb59',
 u'rt @sb5facts',
 u'rt http://caltweet.com/sb5',
 u'rt@johnb2#sb5',
 u'rt@johnb2#sb5 is',
 ...]

In [ ]:
# 99% of sample tweets are shorter than 168 characters
# 100% of SB5 tweets are shorter than 162 characters
#
# sb5b_indices = corpus.y == corpus.class_lookup['sb5b']
# sample_indices = corpus.y == corpus.class_lookup['twitter-sample']

In [38]:
def test_accuracy(train_corpus, test_corpus, penalty):
    model = linear_model.LogisticRegression(fit_intercept=False, penalty=penalty)
    model.fit(train_corpus.X, train_corpus.y)
    pred_y = model.predict(test_corpus.X)
    return metrics.accuracy_score(test_corpus.y, pred_y)

def test_accuracy_folds(corpus, train_size, n_iter=20, penalty='l2'):
    folds = cross_validation.StratifiedShuffleSplit(corpus.y, train_size=train_size, n_iter=n_iter)
    for train_indices, test_indices in folds:
        train_corpus = corpus.subset(train_indices)
        test_corpus = corpus.subset(test_indices)
        yield test_accuracy(train_corpus, test_corpus, penalty)

def test_train_sizes(corpus, train_sizes, n_iter=20):
    for train_size in train_sizes:
        accuracies = list(test_accuracy_folds(corpus, train_size, n_iter=n_iter))
        print '{:} & {:.1%}'.format(train_size, np.mean(accuracies))

In [33]:
def baseline_eval(corpus):
    """Print a mean baseline accuracy for `corpus`, formatted as a percentage.

    NOTE(review): `test` is not defined in any visible cell of this notebook,
    so calling this function raises NameError unless `test` exists elsewhere
    in the kernel. It was presumably an earlier variant of
    `test_accuracy_folds` that accepted a `features` argument -- confirm, then
    restore or rewrite it.
    """
    # presumably 0.9 is the training fraction and features=[0] selects the
    # intercept column only -- TODO confirm against the missing `test` helper
    baseline_accuracies = list(test(corpus, 0.9, features=[0]))
    print 'baseline: {:.1%}'.format(np.mean(baseline_accuracies))

In [37]:
# Learning curve on the corpus that keeps the query terms as features:
# 20 shuffle splits for each of the listed training sizes.
print('Using all features')
test_train_sizes(full_corpus, train_sizes=[10, 100, 1000, 10000, 0.90], n_iter=20)


Using all features
10 & 89.4%
100 & 97.2%
1000 & 99.1%
10000 & 99.4%
0.9 & 99.4%

In [40]:
# Same learning curve, on the corpus built with the query terms stop-listed.
print('Using query-naive features')
# np.array(list(
#     set(naive_corpus.feature_names) - set(full_corpus.feature_names)))
test_train_sizes(naive_corpus, train_sizes=[10, 100, 1000, 10000, 0.90], n_iter=20)


Using query-naive features
10 & 67.1%
100 & 77.9%
1000 & 90.8%
10000 & 96.7%
0.9 & 98.5%

In [45]:
def extreme_features(corpus):
    """Fit an L2 logistic regression on `corpus` and print selected coefficients.

    Features are ordered by the absolute value of their coefficient; the
    positions taken from that ordering are whatever `npx.margins(25)` selects
    (presumably both ends of the ordering -- confirm against `npx.margins`).
    The result is printed as a list of (feature name, coefficient) pairs.
    """
    classifier = linear_model.LogisticRegression(penalty='l2', fit_intercept=False)
    classifier.fit(corpus.X, corpus.y)

    weights = classifier.coef_.ravel()

    by_magnitude = np.argsort(np.abs(weights))
    picked = by_magnitude[npx.margins(25)]
    pairs = zip(corpus.feature_names[picked], weights[picked])
    print(list(pairs))

In [78]:
def print_coefficients(corpus, model, features):
    """Print the selected features and their coefficients, sorted by coefficient.

    Args:
        corpus: object exposing `feature_names`, an array aligned with the
            model's coefficient vector.
        model: fitted model exposing `coef_`; it is flattened with `ravel()`.
        features: boolean mask or index array selecting which features to list.

    One `name coefficient` row is printed per selected feature, in ascending
    coefficient order, followed by a `Total` row with the sum of the listed
    coefficients.
    """
    coef_ = model.coef_.ravel()
    selected_coefs = coef_[features]
    # sort by the coefficient values
    ordering = selected_coefs.argsort()
    feature_names = corpus.feature_names[features][ordering]
    coef_values = selected_coefs[ordering]

    # Unicode template: under Python 2 a byte-string template forces the
    # formatted feature name through the ASCII codec and raises
    # UnicodeEncodeError for names such as u'#sb5\u201d'.
    row_template = u'{:16s} {:+.4f}'
    for feature_name, coef_value in zip(feature_names, coef_values):
        print(row_template.format(feature_name, coef_value))
    print(row_template.format(u'Total', coef_values.sum()))

In [79]:
def find_document(corpus, model, document):
    """Print the model's prediction for `document` and its active coefficients.

    Args:
        corpus: object exposing `data` (items with a `.document` text), `X`
            (feature rows aligned with `data`), `labels` and `feature_names`.
        model: fitted model exposing `predict` and `coef_`.
        document: exact text of the document to look up in `corpus.data`.

    Raises:
        ValueError: if no item in `corpus.data` has exactly this text.
    """
    documents = np.array([datum.document for datum in corpus.data])
    example_indices = documents == document
    if not np.any(example_indices):
        raise ValueError('document not found in corpus: {!r}'.format(document))

    document_X = corpus.X[example_indices]
    pred_y = model.predict(document_X)
    print('pred label: {}'.format(corpus.labels[pred_y]))

    # The same text may occur in several rows. Flattening all of them would
    # give a mask k times longer than the coefficient vector, so build the
    # mask from the first matching row only.
    features = document_X[0].toarray().ravel() != 0
    print_coefficients(corpus, model, features)

In [80]:
# NOTE: `model` is the classifier assigned in the In [41] cell, which appears
# further down in this file.
find_document(
    naive_corpus,
    model,
    'The effort to repeal sb5 is incredible I am so happy to be apart of history')


pred label: [u'sb5b']
repeal           -4.8570
effort           -1.0017
happy to         -0.9305
to repeal        -0.7962
history          -0.6546
of history       -0.4879
i am             -0.3018
to be            -0.2763
of               -0.2417
is               -0.1889
to               -0.1877
effort to        -0.1838
be apart         -0.1090
apart of         -0.1070
the effort       -0.0924
so happy         -0.0872
repeal is        -0.0443
is incredible    -0.0202
am so            +0.0047
the              +0.0124
am               +0.0495
so               +0.2938
apart            +0.4339
be               +0.4565
happy            +0.5381
i                +0.5890
incredible       +0.5946
#intercept#      +2.6839
Total            -4.9120

In [30]:
# NOTE(review): `intercept_features` is not assigned in any visible cell; on a
# fresh kernel this line raises NameError unless it is defined elsewhere.
intercept_features


Out[30]:
array([0])

In [41]:
# Classifier passed to the find_document calls: L2 logistic regression without
# a separately fitted intercept, trained on the query-naive corpus.
model = linear_model.LogisticRegression(penalty='l2', fit_intercept=False)
model.fit(naive_corpus.X, naive_corpus.y)


Out[41]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [44]:
# Inspect one example tweet with the classifier fit on naive_corpus.
find_document(
    naive_corpus,
    model,
    'The effort to repeal sb5 is incredible I am so happy to be apart of history')


pred label: [u'sb5b']
features [u'#intercept#' u'am' u'am so' u'apart' u'apart of' u'be' u'be apart'
 u'effort' u'effort to' u'happy' u'happy to' u'history' u'i' u'i am'
 u'incredible' u'is' u'is incredible' u'of' u'of history' u'repeal'
 u'repeal is' u'so' u'so happy' u'the' u'the effort' u'to' u'to be'
 u'to repeal']

In [ ]: