In [1]:
import cPickle
import os.path
api_key = cPickle.load( file( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'r' ) )
In [2]:
import mediacloud, json
mc = mediacloud.api.MediaCloud(api_key)
In [3]:
import csv
partisan_coding = csv.DictReader( file( 'partisan_coding_20140218.csv' ) )
media = [ row for row in partisan_coding]
print len(media)
1363
In [4]:
media = cPickle.load( file( os.path.expanduser( '~/Dropbox/mc/media.pickle' ), 'r' ) )
len(media)
Out[4]:
1363
In [12]:
def get_wc_query( media_id ):
return "+publish_date:[2012-01-01T00:00:00.000Z TO 2014-01-01T00:00:00.000Z] AND +media_id:{}".format( media_id )
In [13]:
wc_old = media[0]['word_counts']
query = "+publish_date:[2012-01-01T00:00:00.000Z TO 2014-01-01T00:00:00.000Z] AND +media_id:{}".format( media[0]['media_id'] )
print query
print get_wc_query( media[0]['media_id'] )
wc_new = mc.wordCount( query, '','none' )
wc_new
+publish_date:[2012-01-01T00:00:00.000Z TO 2014-01-01T00:00:00.000Z] AND +media_id:18175
+publish_date:[2012-01-01T00:00:00.000Z TO 2014-01-01T00:00:00.000Z] AND +media_id:18175
Out[13]:
[{u'count': 1083, u'stem': u'the', u'term': u'the'},
{u'count': 515, u'stem': u'and', u'term': u'and'},
{u'count': 220, u'stem': u'for', u'term': u'for'},
{u'count': 220, u'stem': u'said', u'term': u'said'},
{u'count': 197, u'stem': u'iowa', u'term': u'iowa'},
{u'count': 189, u'stem': u'that', u'term': u'that'},
{u'count': 132, u'stem': u'city', u'term': u'city'},
{u'count': 124, u'stem': u'with', u'term': u'with'},
{u'count': 120, u'stem': u'will', u'term': u'will'},
{u'count': 101, u'stem': u'was', u'term': u'was'},
{u'count': 99, u'stem': u'from', u'term': u'from'},
{u'count': 78, u'stem': u'have', u'term': u'have'},
{u'count': 68, u'stem': u'are', u'term': u'are'},
{u'count': 67, u'stem': u'about', u'term': u'about'},
{u'count': 61, u'stem': u'more', u'term': u'more'},
{u'count': 61, u'stem': u'his', u'term': u'his'},
{u'count': 60, u'stem': u'has', u'term': u'has'},
{u'count': 53, u'stem': u'she', u'term': u'she'},
{u'count': 50, u'stem': u'school', u'term': u'school'},
{u'count': 47, u'stem': u'her', u'term': u'her'},
{u'count': 47, u'stem': u'university', u'term': u'university'},
{u'count': 47, u'stem': u'their', u'term': u'their'},
{u'count': 46, u'stem': u'this', u'term': u'this'},
{u'count': 46, u'stem': u'police', u'term': u'police'},
{u'count': 45, u'stem': u'but', u'term': u'but'},
{u'count': 45, u'stem': u'year', u'term': u'year'},
{u'count': 43, u'stem': u'who', u'term': u'who'},
{u'count': 43, u'stem': u'one', u'term': u'one'},
{u'count': 41, u'stem': u'state', u'term': u'state'},
{u'count': 41, u'stem': u'they', u'term': u'they'},
{u'count': 41, u'stem': u'not', u'term': u'not'},
{u'count': 37, u'stem': u'all', u'term': u'all'},
{u'count': 37, u'stem': u'county', u'term': u'county'},
{u'count': 35, u'stem': u'johnson', u'term': u'johnson'},
{u'count': 35, u'stem': u'when', u'term': u'when'},
{u'count': 33, u'stem': u'after', u'term': u'after'},
{u'count': 33, u'stem': u'there', u'term': u'there'},
{u'count': 32, u'stem': u'which', u'term': u'which'},
{u'count': 31, u'stem': u'people', u'term': u'people'},
{u'count': 31, u'stem': u'information', u'term': u'information'},
{u'count': 31, u'stem': u'also', u'term': u'also'},
{u'count': 31, u'stem': u'been', u'term': u'been'},
{u'count': 30, u'stem': u'were', u'term': u'were'},
{u'count': 30, u'stem': u'new', u'term': u'new'},
{u'count': 30, u'stem': u'center', u'term': u'center'},
{u'count': 30, u'stem': u'two', u'term': u'two'},
{u'count': 30, u'stem': u'community', u'term': u'community'},
{u'count': 28, u'stem': u'would', u'term': u'would'},
{u'count': 28, u'stem': u'district', u'term': u'district'},
{u'count': 28, u'stem': u'other', u'term': u'other'},
{u'count': 27, u'stem': u'board', u'term': u'board'},
{u'count': 26, u'stem': u'into', u'term': u'into'},
{u'count': 26, u'stem': u'out', u'term': u'out'},
{u'count': 25, u'stem': u'think', u'term': u'think'},
{u'count': 25, u'stem': u'students', u'term': u'students'},
{u'count': 24, u'stem': u'had', u'term': u'had'},
{u'count': 24, u'stem': u'first', u'term': u'first'},
{u'count': 23, u'stem': u'what', u'term': u'what'},
{u'count': 23, u'stem': u'its', u'term': u'its'},
{u'count': 23, u'stem': u'public', u'term': u'public'},
{u'count': 22, u'stem': u'can', u'term': u'can'},
{u'count': 22, u'stem': u'some', u'term': u'some'},
{u'count': 22, u'stem': u'house', u'term': u'house'},
{u'count': 22, u'stem': u'years', u'term': u'years'},
{u'count': 22, u'stem': u'high', u'term': u'high'},
{u'count': 21, u'stem': u'than', u'term': u'than'},
{u'count': 21, u'stem': u'now', u'term': u'now'},
{u'count': 21, u'stem': u'program', u'term': u'program'},
{u'count': 21, u'stem': u'day', u'term': u'day'},
{u'count': 20, u'stem': u'going', u'term': u'going'},
{u'count': 20, u'stem': u'building', u'term': u'building'},
{u'count': 20, u'stem': u'our', u'term': u'our'},
{u'count': 20, u'stem': u'according', u'term': u'according'},
{u'count': 20, u'stem': u'them', u'term': u'them'},
{u'count': 19, u'stem': u'really', u'term': u'really'},
{u'count': 19, u'stem': u'him', u'term': u'him'},
{u'count': 19, u'stem': u'time', u'term': u'time'},
{u'count': 19, u'stem': u'before', u'term': u'before'},
{u'count': 18, u'stem': u'you', u'term': u'you'},
{u'count': 18, u'stem': u'just', u'term': u'just'},
{u'count': 18, u'stem': u'host', u'term': u'host'},
{u'count': 17, u'stem': u'get', u'term': u'get'},
{u'count': 17, u'stem': u'college', u'term': u'college'},
{u'count': 17, u'stem': u'members', u'term': u'members'},
{u'count': 17, u'stem': u'million', u'term': u'million'},
{u'count': 17, u'stem': u'three', u'term': u'three'},
{u'count': 16, u'stem': u'most', u'term': u'most'},
{u'count': 16, u'stem': u'only', u'term': u'only'},
{u'count': 16, u'stem': u'property', u'term': u'property'},
{u'count': 16, u'stem': u'well', u'term': u'well'},
{u'count': 16, u'stem': u'over', u'term': u'over'},
{u'count': 16, u'stem': u'like', u'term': u'like'},
{u'count': 16, u'stem': u'meeting', u'term': u'meeting'},
{u'count': 16, u'stem': u'call', u'term': u'call'},
{u'count': 16, u'stem': u'service', u'term': u'service'},
{u'count': 16, u'stem': u'very', u'term': u'very'},
{u'count': 16, u'stem': u'take', u'term': u'take'},
{u'count': 15, u'stem': u'local', u'term': u'local'},
{u'count': 15, u'stem': u'four', u'term': u'four'},
{u'count': 15, u'stem': u'being', u'term': u'being'},
{u'count': 15, u'stem': u'old', u'term': u'old'},
{u'count': 15, u'stem': u'president', u'term': u'president'},
{u'count': 15, u'stem': u'while', u'term': u'while'},
{u'count': 14, u'stem': u'funding', u'term': u'funding'},
{u'count': 14, u'stem': u'woman', u'term': u'woman'},
{u'count': 14, u'stem': u'during', u'term': u'during'},
{u'count': 14, u'stem': u'home', u'term': u'home'},
{u'count': 14, u'stem': u'second', u'term': u'second'},
{u'count': 14, u'stem': u'april', u'term': u'april'},
{u'count': 14, u'stem': u'percent', u'term': u'percent'},
{u'count': 14, u'stem': u'free', u'term': u'free'},
{u'count': 14, u'stem': u'because', u'term': u'because'},
{u'count': 13, u'stem': u'tax', u'term': u'tax'},
{u'count': 13, u'stem': u'committee', u'term': u'committee'},
{u'count': 13, u'stem': u'how', u'term': u'how'},
{u'count': 13, u'stem': u'made', u'term': u'made'},
{u'count': 13, u'stem': u'including', u'term': u'including'},
{u'count': 13, u'stem': u'art', u'term': u'art'},
{u'count': 13, u'stem': u'work', u'term': u'work'},
{u'count': 13, u'stem': u'see', u'term': u'see'},
{u'count': 12, u'stem': u'could', u'term': u'could'},
{u'count': 12, u'stem': u'area', u'term': u'area'},
{u'count': 12, u'stem': u'today', u'term': u'today'},
{u'count': 12, u'stem': u'fire', u'term': u'fire'},
{u'count': 12, u'stem': u'many', u'term': u'many'},
{u'count': 12, u'stem': u'way', u'term': u'way'},
{u'count': 12, u'stem': u'those', u'term': u'those'},
{u'count': 12, u'stem': u'say', u'term': u'say'},
{u'count': 12, u'stem': u'department', u'term': u'department'},
{u'count': 12, u'stem': u'much', u'term': u'much'},
{u'count': 12, u'stem': u'make', u'term': u'make'},
{u'count': 12, u'stem': u'against', u'term': u'against'},
{u'count': 12, u'stem': u'office', u'term': u'office'},
{u'count': 12, u'stem': u'man', u'term': u'man'},
{u'count': 12, u'stem': u'may', u'term': u'may'},
{u'count': 12, u'stem': u'degree', u'term': u'degree'},
{u'count': 12, u'stem': u'march', u'term': u'march'},
{u'count': 12, u'stem': u'place', u'term': u'place'},
{u'count': 12, u'stem': u'coralville', u'term': u'coralville'},
{u'count': 12, u'stem': u'schools', u'term': u'schools'},
{u'count': 12, u'stem': u'family', u'term': u'family'},
{u'count': 12, u'stem': u'council', u'term': u'council'},
{u'count': 12, u'stem': u'each', u'term': u'each'},
{u'count': 12, u'stem': u'food', u'term': u'food'},
{u'count': 12, u'stem': u'street', u'term': u'street'},
{u'count': 11, u'stem': u'project', u'term': u'project'},
{u'count': 11, u'stem': u'education', u'term': u'education'},
{u'count': 11, u'stem': u'both', u'term': u'both'},
{u'count': 11, u'stem': u'third', u'term': u'third'},
{u'count': 11, u'stem': u'monday', u'term': u'monday'},
{u'count': 11, u'stem': u'online', u'term': u'online'},
{u'count': 11, u'stem': u'law', u'term': u'law'},
{u'count': 11, u'stem': u'where', u'term': u'where'},
{u'count': 11, u'stem': u'know', u'term': u'know'},
{u'count': 11, u'stem': u'early', u'term': u'early'},
{u'count': 11, u'stem': u'number', u'term': u'number'},
{u'count': 11, u'stem': u'based', u'term': u'based'},
{u'count': 11, u'stem': u'lot', u'term': u'lot'},
{u'count': 11, u'stem': u'still', u'term': u'still'},
{u'count': 11, u'stem': u'teacher', u'term': u'teacher'},
{u'count': 11, u'stem': u'tuesday', u'term': u'tuesday'},
{u'count': 11, u'stem': u'www', u'term': u'www'},
{u'count': 11, u'stem': u'children', u'term': u'children'},
{u'count': 11, u'stem': u'former', u'term': u'former'},
{u'count': 11, u'stem': u'through', u'term': u'through'},
{u'count': 10, u'stem': u'show', u'term': u'show'},
{u'count': 10, u'stem': u'criminal', u'term': u'criminal'},
{u'count': 10, u'stem': u'church', u'term': u'church'},
{u'count': 10, u'stem': u'increase', u'term': u'increase'},
{u'count': 10, u'stem': u'group', u'term': u'group'},
{u'count': 10, u'stem': u'back', u'term': u'back'},
{u'count': 10, u'stem': u'org', u'term': u'org'},
{u'count': 10, u'stem': u'design', u'term': u'design'},
{u'count': 10, u'stem': u'another', u'term': u'another'},
{u'count': 10, u'stem': u'world', u'term': u'world'},
{u'count': 10, u'stem': u'such', u'term': u'such'},
{u'count': 10, u'stem': u'director', u'term': u'director'},
{u'count': 10, u'stem': u'development', u'term': u'development'},
{u'count': 10, u'stem': u'open', u'term': u'open'},
{u'count': 10, u'stem': u'staff', u'term': u'staff'},
{u'count': 10, u'stem': u'often', u'term': u'often'},
{u'count': 10, u'stem': u'here', u'term': u'here'},
{u'count': 10, u'stem': u'says', u'term': u'says'},
{u'count': 10, u'stem': u'press', u'term': u'press'},
{u'count': 10, u'stem': u'found', u'term': u'found'},
{u'count': 10, u'stem': u'officer', u'term': u'officer'},
{u'count': 10, u'stem': u'these', u'term': u'these'},
{u'count': 10, u'stem': u'five', u'term': u'five'},
{u'count': 9, u'stem': u'student', u'term': u'student'},
{u'count': 9, u'stem': u'located', u'term': u'located'},
{u'count': 9, u'stem': u'thursday', u'term': u'thursday'},
{u'count': 9, u'stem': u'alcohol', u'term': u'alcohol'},
{u'count': 9, u'stem': u'cost', u'term': u'cost'},
{u'count': 9, u'stem': u'officers', u'term': u'officers'},
{u'count': 9, u'stem': u'saturday', u'term': u'saturday'},
{u'count': 9, u'stem': u'friday', u'term': u'friday'},
{u'count': 9, u'stem': u'flood', u'term': u'flood'},
{u'count': 9, u'stem': u'wednesday', u'term': u'wednesday'},
{u'count': 9, u'stem': u'good', u'term': u'good'},
{u'count': 9, u'stem': u'west', u'term': u'west'},
{u'count': 9, u'stem': u'arts', u'term': u'arts'},
{u'count': 9, u'stem': u'iowans', u'term': u'iowans'},
{u'count': 9, u'stem': u'request', u'term': u'request'},
{u'count': 9, u'stem': u'liberty', u'term': u'liberty'},
{u'count': 9, u'stem': u'include', u'term': u'include'},
{u'count': 9, u'stem': u'north', u'term': u'north'},
{u'count': 9, u'stem': u'citizen', u'term': u'citizen'},
{u'count': 9, u'stem': u'facility', u'term': u'facility'},
{u'count': 9, u'stem': u'should', u'term': u'should'},
{u'count': 9, u'stem': u'business', u'term': u'business'},
{u'count': 9, u'stem': u'don', u'term': u'don'},
{u'count': 9, u'stem': u'wood', u'term': u'wood'},
{u'count': 9, u'stem': u'system', u'term': u'system'},
{u'count': 9, u'stem': u'national', u'term': u'national'},
{u'count': 9, u'stem': u'fall', u'term': u'fall'},
{u'count': 9, u'stem': u'then', u'term': u'then'},
{u'count': 9, u'stem': u'elementary', u'term': u'elementary'},
{u'count': 9, u'stem': u'six', u'term': u'six'},
{u'count': 9, u'stem': u'between', u'term': u'between'},
{u'count': 9, u'stem': u'change', u'term': u'change'},
{u'count': 9, u'stem': u'called', u'term': u'called'},
{u'count': 9, u'stem': u'across', u'term': u'across'},
{u'count': 9, u'stem': u'didn', u'term': u'didn'},
{u'count': 9, u'stem': u'general', u'term': u'general'},
{u'count': 9, u'stem': u'attorney', u'term': u'attorney'},
{u'count': 8, u'stem': u'issue', u'term': u'issue'},
{u'count': 8, u'stem': u'set', u'term': u'set'},
{u'count': 8, u'stem': u'white', u'term': u'white'},
{u'count': 8, u'stem': u'programs', u'term': u'programs'},
{u'count': 8, u'stem': u'something', u'term': u'something'},
{u'count': 8, u'stem': u'action', u'term': u'action'},
{u'count': 8, u'stem': u'states', u'term': u'states'},
{u'count': 8, u'stem': u'money', u'term': u'money'},
{u'count': 8, u'stem': u'any', u'term': u'any'},
{u'count': 8, u'stem': u'part', u'term': u'part'},
{u'count': 8, u'stem': u'senior', u'term': u'senior'},
{u'count': 8, u'stem': u'come', u'term': u'come'},
{u'count': 8, u'stem': u'less', u'term': u'less'},
{u'count': 8, u'stem': u'last', u'term': u'last'},
{u'count': 8, u'stem': u'music', u'term': u'music'},
{u'count': 8, u'stem': u'com', u'term': u'com'},
{u'count': 8, u'stem': u'makes', u'term': u'makes'},
{u'count': 8, u'stem': u'use', u'term': u'use'},
{u'count': 8, u'stem': u'told', u'term': u'told'},
{u'count': 8, u'stem': u'interest', u'term': u'interest'},
{u'count': 8, u'stem': u'obama', u'term': u'obama'},
{u'count': 8, u'stem': u'process', u'term': u'process'},
{u'count': 8, u'stem': u'budget', u'term': u'budget'},
{u'count': 8, u'stem': u'drive', u'term': u'drive'},
{u'count': 8, u'stem': u'run', u'term': u'run'},
{u'count': 8, u'stem': u'took', u'term': u'took'},
{u'count': 8, u'stem': u'downtown', u'term': u'downtown'},
{u'count': 8, u'stem': u'court', u'term': u'court'},
{u'count': 8, u'stem': u'half', u'term': u'half'},
{u'count': 8, u'stem': u'among', u'term': u'among'},
{u'count': 8, u'stem': u'facilities', u'term': u'facilities'},
{u'count': 8, u'stem': u'always', u'term': u'always'},
{u'count': 8, u'stem': u'within', u'term': u'within'},
{u'count': 8, u'stem': u'level', u'term': u'level'},
{u'count': 8, u'stem': u'spring', u'term': u'spring'},
{u'count': 8, u'stem': u'sunday', u'term': u'sunday'},
{u'count': 8, u'stem': u'present', u'term': u'present'},
{u'count': 8, u'stem': u'around', u'term': u'around'},
{u'count': 8, u'stem': u'services', u'term': u'services'},
{u'count': 7, u'stem': u'taken', u'term': u'taken'},
{u'count': 7, u'stem': u'better', u'term': u'better'},
{u'count': 7, u'stem': u'green', u'term': u'green'},
{u'count': 7, u'stem': u'country', u'term': u'country'},
{u'count': 7, u'stem': u'doing', u'term': u'doing'},
{u'count': 7, u'stem': u'plan', u'term': u'plan'},
{u'count': 7, u'stem': u'graduate', u'term': u'graduate'},
{u'count': 7, u'stem': u'expected', u'term': u'expected'},
{u'count': 7, u'stem': u'never', u'term': u'never'},
{u'count': 7, u'stem': u'little', u'term': u'little'},
{u'count': 7, u'stem': u'past', u'term': u'past'},
{u'count': 7, u'stem': u'under', u'term': u'under'},
{u'count': 7, u'stem': u'opportunity', u'term': u'opportunity'},
{u'count': 7, u'stem': u'came', u'term': u'came'},
{u'count': 7, u'stem': u'tom', u'term': u'tom'},
{u'count': 7, u'stem': u'did', u'term': u'did'},
{u'count': 7, u'stem': u'report', u'term': u'report'},
{u'count': 7, u'stem': u'campus', u'term': u'campus'},
{u'count': 7, u'stem': u'john', u'term': u'john'},
{u'count': 7, u'stem': u'received', u'term': u'received'},
{u'count': 7, u'stem': u'des', u'term': u'des'},
{u'count': 7, u'stem': u'kids', u'term': u'kids'},
{u'count': 7, u'stem': u'united', u'term': u'united'},
{u'count': 7, u'stem': u'event', u'term': u'event'},
{u'count': 7, u'stem': u'best', u'term': u'best'},
{u'count': 7, u'stem': u'fund', u'term': u'fund'},
{u'count': 7, u'stem': u'projects', u'term': u'projects'},
{u'count': 7, u'stem': u'economic', u'term': u'economic'},
{u'count': 7, u'stem': u'continue', u'term': u'continue'},
{u'count': 7, u'stem': u'library', u'term': u'library'},
{u'count': 7, u'stem': u'until', u'term': u'until'},
{u'count': 7, u'stem': u'apartment', u'term': u'apartment'},
{u'count': 7, u'stem': u'help', u'term': u'help'},
{u'count': 7, u'stem': u'top', u'term': u'top'},
{u'count': 7, u'stem': u'things', u'term': u'things'},
{u'count': 7, u'stem': u'government', u'term': u'government'},
{u'count': 7, u'stem': u'orchestra', u'term': u'orchestra'},
{u'count': 7, u'stem': u'recently', u'term': u'recently'},
{u'count': 7, u'stem': u'filed', u'term': u'filed'},
{u'count': 7, u'stem': u'care', u'term': u'care'},
{u'count': 7, u'stem': u'ago', u'term': u'ago'},
{u'count': 7, u'stem': u'own', u'term': u'own'},
{u'count': 7, u'stem': u'available', u'term': u'available'},
{u'count': 7, u'stem': u'others', u'term': u'others'},
{u'count': 7, u'stem': u'yet', u'term': u'yet'},
{u'count': 7, u'stem': u'put', u'term': u'put'},
{u'count': 7, u'stem': u'jail', u'term': u'jail'},
{u'count': 7, u'stem': u'since', u'term': u'since'},
{u'count': 7, u'stem': u'mother', u'term': u'mother'},
{u'count': 7, u'stem': u'moines', u'term': u'moines'},
{u'count': 7, u'stem': u'nearly', u'term': u'nearly'},
{u'count': 7, u'stem': u'safety', u'term': u'safety'},
{u'count': 7, u'stem': u'investigation', u'term': u'investigation'},
{u'count': 7, u'stem': u'residence', u'term': u'residence'},
{u'count': 7, u'stem': u'along', u'term': u'along'},
{u'count': 6, u'stem': u'already', u'term': u'already'},
{u'count': 6, u'stem': u'evening', u'term': u'evening'},
{u'count': 6, u'stem': u'real', u'term': u'real'},
{u'count': 6, u'stem': u'freeman', u'term': u'freeman'},
{u'count': 6, u'stem': u'planning', u'term': u'planning'},
{u'count': 6, u'stem': u'jones', u'term': u'jones'},
{u'count': 6, u'stem': u'results', u'term': u'results'},
{u'count': 6, u'stem': u'right', u'term': u'right'},
{u'count': 6, u'stem': u'construction', u'term': u'construction'},
{u'count': 6, u'stem': u'experience', u'term': u'experience'},
{u'count': 6, u'stem': u'provide', u'term': u'provide'},
{u'count': 6, u'stem': u'recycling', u'term': u'recycling'},
{u'count': 6, u'stem': u'questions', u'term': u'questions'},
{u'count': 6, u'stem': u'solon', u'term': u'solon'},
{u'count': 6, u'stem': u'months', u'term': u'months'},
{u'count': 6, u'stem': u'move', u'term': u'move'},
{u'count': 6, u'stem': u'great', u'term': u'great'},
{u'count': 6, u'stem': u'child', u'term': u'child'},
{u'count': 6, u'stem': u'your', u'term': u'your'},
{u'count': 6, u'stem': u'daughter', u'term': u'daughter'},
{u'count': 6, u'stem': u'seen', u'term': u'seen'},
{u'count': 6, u'stem': u'park', u'term': u'park'},
{u'count': 6, u'stem': u'strong', u'term': u'strong'},
{u'count': 6, u'stem': u'making', u'term': u'making'},
{u'count': 6, u'stem': u'next', u'term': u'next'},
{u'count': 6, u'stem': u'hopes', u'term': u'hopes'},
{u'count': 6, u'stem': u'lower', u'term': u'lower'},
{u'count': 6, u'stem': u'washington', u'term': u'washington'},
{u'count': 6, u'stem': u'news', u'term': u'news'},
{u'count': 6, u'stem': u'potential', u'term': u'potential'},
{u'count': 6, u'stem': u'annual', u'term': u'annual'},
{u'count': 6, u'stem': u'different', u'term': u'different'},
{u'count': 6, u'stem': u'communities', u'term': u'communities'},
{u'count': 6, u'stem': u'organization', u'term': u'organization'},
{u'count': 6, u'stem': u'having', u'term': u'having'},
{u'count': 6, u'stem': u'pay', u'term': u'pay'},
{u'count': 6, u'stem': u'ave', u'term': u'ave'},
{u'count': 6, u'stem': u'field', u'term': u'field'},
{u'count': 6, u'stem': u'tif', u'term': u'tif'},
{u'count': 6, u'stem': u'week', u'term': u'week'},
{u'count': 6, u'stem': u'events', u'term': u'events'},
{u'count': 6, u'stem': u'bill', u'term': u'bill'},
{u'count': 6, u'stem': u'growth', u'term': u'growth'},
{u'count': 6, u'stem': u'moved', u'term': u'moved'},
{u'count': 6, u'stem': u'email', u'term': u'email'},
{u'count': 6, u'stem': u'murdah', u'term': u'murdah'},
{u'count': 6, u'stem': u'assistant', u'term': u'assistant'},
{u'count': 6, u'stem': u'contact', u'term': u'contact'},
{u'count': 6, u'stem': u'men', u'term': u'men'},
{u'count': 6, u'stem': u'young', u'term': u'young'},
{u'count': 6, u'stem': u'reform', u'term': u'reform'},
{u'count': 6, u'stem': u'hours', u'term': u'hours'},
{u'count': 6, u'stem': u'morning', u'term': u'morning'},
{u'count': 6, u'stem': u'date', u'term': u'date'},
{u'count': 6, u'stem': u'every', u'term': u'every'},
{u'count': 6, u'stem': u'tang', u'term': u'tang'},
{u'count': 6, u'stem': u'case', u'term': u'case'},
{u'count': 6, u'stem': u'several', u'term': u'several'},
{u'count': 6, u'stem': u'love', u'term': u'love'},
{u'count': 6, u'stem': u'complaint', u'term': u'complaint'},
{u'count': 6, u'stem': u'lawsuit', u'term': u'lawsuit'},
{u'count': 6, u'stem': u'speech', u'term': u'speech'},
{u'count': 6, u'stem': u'traffic', u'term': u'traffic'},
{u'count': 6, u'stem': u'weeks', u'term': u'weeks'},
{u'count': 6, u'stem': u'quality', u'term': u'quality'},
{u'count': 6, u'stem': u'campaign', u'term': u'campaign'},
{u'count': 6, u'stem': u'space', u'term': u'space'},
{u'count': 6, u'stem': u'thought', u'term': u'thought'},
{u'count': 6, u'stem': u'women', u'term': u'women'},
{u'count': 6, u'stem': u'using', u'term': u'using'},
{u'count': 6, u'stem': u'residents', u'term': u'residents'},
{u'count': 6, u'stem': u'himself', u'term': u'himself'},
{u'count': 6, u'stem': u'branstad', u'term': u'branstad'},
{u'count': 6, u'stem': u'justice', u'term': u'justice'},
{u'count': 6, u'stem': u'store', u'term': u'store'},
{u'count': 6, u'stem': u'debate', u'term': u'debate'},
{u'count': 6, u'stem': u'point', u'term': u'point'},
{u'count': 6, u'stem': u'once', u'term': u'once'},
{u'count': 6, u'stem': u'health', u'term': u'health'},
{u'count': 6, u'stem': u'academy', u'term': u'academy'},
{u'count': 6, u'stem': u'brown', u'term': u'brown'},
{u'count': 6, u'stem': u'team', u'term': u'team'},
{u'count': 5, u'stem': u'jan', u'term': u'jan'},
{u'count': 5, u'stem': u'businesses', u'term': u'businesses'},
{u'count': 5, u'stem': u'eligible', u'term': u'eligible'},
{u'count': 5, u'stem': u'owner', u'term': u'owner'},
{u'count': 5, u'stem': u'job', u'term': u'job'},
{u'count': 5, u'stem': u'rates', u'term': u'rates'},
{u'count': 5, u'stem': u'learning', u'term': u'learning'},
{u'count': 5, u'stem': u'register', u'term': u'register'},
{u'count': 5, u'stem': u'sexual', u'term': u'sexual'},
{u'count': 5, u'stem': u'want', u'term': u'want'},
{u'count': 5, u'stem': u'released', u'term': u'released'},
{u'count': 5, u'stem': u'purchased', u'term': u'purchased'},
{u'count': 5, u'stem': u'officials', u'term': u'officials'},
{u'count': 5, u'stem': u'hill', u'term': u'hill'},
{u'count': 5, u'stem': u'awards', u'term': u'awards'},
{u'count': 5, u'stem': u'fema', u'term': u'fema'},
{u'count': 5, u'stem': u'medical', u'term': u'medical'},
{u'count': 5, u'stem': u'end', u'term': u'end'},
{u'count': 5, u'stem': u'order', u'term': u'order'},
{u'count': 5, u'stem': u'grants', u'term': u'grants'},
{u'count': 5, u'stem': u'plans', u'term': u'plans'},
{u'count': 5, u'stem': u'pretty', u'term': u'pretty'},
{u'count': 5, u'stem': u'went', u'term': u'went'},
{u'count': 5, u'stem': u'miller', u'term': u'miller'},
{u'count': 5, u'stem': u'start', u'term': u'start'},
{u'count': 5, u'stem': u'stores', u'term': u'stores'},
{u'count': 5, u'stem': u'special', u'term': u'special'},
{u'count': 5, u'stem': u'look', u'term': u'look'},
{u'count': 5, u'stem': u'again', u'term': u'again'},
{u'count': 5, u'stem': u'winter', u'term': u'winter'},
{u'count': 5, u'stem': u'left', u'term': u'left'},
{u'count': 5, u'stem': u'upon', u'term': u'upon'},
{u'count': 5, u'stem': u'master', u'term': u'master'},
{u'count': 5, u'stem': u'museum', u'term': u'museum'},
{u'count': 5, u'stem': u'behavior', u'term': u'behavior'},
{u'count': 5, u'stem': u'melrose', u'term': u'melrose'},
{u'count': 5, u'stem': u'senate', u'term': u'senate'},
{u'count': 5, u'stem': u'award', u'term': u'award'},
{u'count': 5, u'stem': u'involved', u'term': u'involved'},
{u'count': 5, u'stem': u'recent', u'term': u'recent'},
{u'count': 5, u'stem': u'hall', u'term': u'hall'},
{u'count': 5, u'stem': u'hard', u'term': u'hard'},
{u'count': 5, u'stem': u'test', u'term': u'test'},
{u'count': 5, u'stem': u'federal', u'term': u'federal'},
{u'count': 5, u'stem': u'greater', u'term': u'greater'},
{u'count': 5, u'stem': u'son', u'term': u'son'},
{u'count': 5, u'stem': u'closed', u'term': u'closed'},
{u'count': 5, u'stem': u'hancher', u'term': u'hancher'},
{u'count': 5, u'stem': u'star', u'term': u'star'},
{u'count': 5, u'stem': u'began', u'term': u'began'},
{u'count': 5, u'stem': u'major', u'term': u'major'},
{u'count': 5, u'stem': u'above', u'term': u'above'},
{u'count': 5, u'stem': u'placed', u'term': u'placed'},
{u'count': 5, u'stem': u'video', u'term': u'video'},
{u'count': 5, u'stem': u'price', u'term': u'price'},
{u'count': 5, u'stem': u'need', u'term': u'need'},
{u'count': 5, u'stem': u'lunch', u'term': u'lunch'},
{u'count': 5, u'stem': u'although', u'term': u'although'},
{u'count': 5, u'stem': u'important', u'term': u'important'},
{u'count': 5, u'stem': u'support', u'term': u'support'},
{u'count': 5, u'stem': u'add', u'term': u'add'},
{u'count': 5, u'stem': u'car', u'term': u'car'},
{u'count': 5, u'stem': u'black', u'term': u'black'},
{u'count': 5, u'stem': u'cases', u'term': u'cases'},
{u'count': 5, u'stem': u'life', u'term': u'life'},
{u'count': 5, u'stem': u'month', u'term': u'month'},
{u'count': 5, u'stem': u'down', u'term': u'down'},
{u'count': 5, u'stem': u'together', u'term': u'together'},
{u'count': 5, u'stem': u'east', u'term': u'east'},
{u'count': 5, u'stem': u'couldn', u'term': u'couldn'},
{u'count': 5, u'stem': u'includes', u'term': u'includes'},
{u'count': 5, u'stem': u'currently', u'term': u'currently'},
{u'count': 5, u'stem': u'moore', u'term': u'moore'},
{u'count': 5, u'stem': u'remains', u'term': u'remains'},
{u'count': 5, u'stem': u'started', u'term': u'started'},
{u'count': 5, u'stem': u'stories', u'term': u'stories'},
{u'count': 5, u'stem': u'excited', u'term': u'excited'},
{u'count': 5, u'stem': u'approved', u'term': u'approved'},
{u'count': 5, u'stem': u'however', u'term': u'however'},
{u'count': 5, u'stem': u'site', u'term': u'site'},
{u'count': 5, u'stem': u'considered', u'term': u'considered'},
{u'count': 5, u'stem': u'january', u'term': u'january'},
{u'count': 5, u'stem': u'replacement', u'term': u'replacement'},
{u'count': 5, u'stem': u'hold', u'term': u'hold'},
{u'count': 5, u'stem': u'same', u'term': u'same'},
{u'count': 5, u'stem': u'named', u'term': u'named'},
{u'count': 5, u'stem': u'sent', u'term': u'sent'},
{u'count': 5, u'stem': u'misdemeanor', u'term': u'misdemeanor'},
{u'count': 5, u'stem': u'given', u'term': u'given'},
{u'count': 5, u'stem': u'lives', u'term': u'lives'},
{u'count': 5, u'stem': u'must', u'term': u'must'},
{u'count': 5, u'stem': u'ready', u'term': u'ready'},
{u'count': 5, u'stem': u'legal', u'term': u'legal'},
{u'count': 5, u'stem': u'struggle', u'term': u'struggle'},
{u'count': 5, u'stem': u'used', u'term': u'used'},
{u'count': 5, u'stem': u'tickets', u'term': u'tickets'},
{u'count': 5, u'stem': u'small', u'term': u'small'},
{u'count': 5, u'stem': u'hospitals', u'term': u'hospitals'},
{u'count': 5, u'stem': u'floor', u'term': u'floor'}]
In [7]:
wc_old
Out[7]:
[{u'count': 179, u'stem': u'iowa', u'term': u'iowa'},
{u'count': 55, u'stem': u'student', u'term': u'students'},
{u'count': 31, u'stem': u'johnson', u'term': u'johnson'},
{u'count': 23, u'stem': u'coralvill', u'term': u'coralville'},
{u'count': 20, u'stem': u'colleg', u'term': u'college'},
{u'count': 19, u'stem': u'famili', u'term': u'family'},
{u'count': 18, u'stem': u'tax', u'term': u'tax'},
{u'count': 15, u'stem': u'children', u'term': u'children'},
{u'count': 12, u'stem': u'donat', u'term': u'donations'},
{u'count': 12, u'stem': u'teacher', u'term': u'teacher'},
{u'count': 12, u'stem': u'washington', u'term': u'washington'},
{u'count': 11, u'stem': u'alleg', u'term': u'allegedly'},
{u'count': 11, u'stem': u'flood', u'term': u'flood'},
{u'count': 11, u'stem': u'educ', u'term': u'education'},
{u'count': 10, u'stem': u'studi', u'term': u'studies'},
{u'count': 10, u'stem': u'republican', u'term': u'republican'},
{u'count': 10, u'stem': u'free', u'term': u'free'},
{u'count': 10, u'stem': u'elementari', u'term': u'elementary'},
{u'count': 9, u'stem': u'war', u'term': u'war'},
{u'count': 9, u'stem': u'space', u'term': u'space'},
{u'count': 8, u'stem': u'obama', u'term': u'obama'},
{u'count': 8, u'stem': u'assault', u'term': u'assault'},
{u'count': 8, u'stem': u'american', u'term': u'american'},
{u'count': 8, u'stem': u'dubuqu', u'term': u'dubuque'},
{u'count': 7, u'stem': u'rick', u'term': u'rick'},
{u'count': 7, u'stem': u'democrat', u'term': u'democratic'},
{u'count': 7, u'stem': u'professor', u'term': u'professor'},
{u'count': 7, u'stem': u'caucus', u'term': u'caucus'},
{u'count': 7, u'stem': u'thompson', u'term': u'thompson'},
{u'count': 6, u'stem': u'hawkey', u'term': u'hawkeye'},
{u'count': 6, u'stem': u'lane', u'term': u'lane'},
{u'count': 6, u'stem': u'hospit', u'term': u'hospital'},
{u'count': 6, u'stem': u'photo', u'term': u'photo'},
{u'count': 6, u'stem': u'love', u'term': u'love'},
{u'count': 6, u'stem': u'museum', u'term': u'museum'},
{u'count': 6, u'stem': u'branstad', u'term': u'branstad'},
{u'count': 6, u'stem': u'athlet', u'term': u'athletics'},
{u'count': 6, u'stem': u'mall', u'term': u'mall'},
{u'count': 6, u'stem': u'avenu', u'term': u'avenue'},
{u'count': 6, u'stem': u'regent', u'term': u'regents'},
{u'count': 5, u'stem': u'chicago', u'term': u'chicago'},
{u'count': 5, u'stem': u'renew', u'term': u'renewable'},
{u'count': 5, u'stem': u'teach', u'term': u'teaching'},
{u'count': 5, u'stem': u'ticket', u'term': u'ticket'},
{u'count': 5, u'stem': u'fundrais', u'term': u'fundraiser'},
{u'count': 5, u'stem': u'wrestl', u'term': u'wrestling'},
{u'count': 5, u'stem': u'alcohol', u'term': u'alcohol'},
{u'count': 5, u'stem': u'highway', u'term': u'highway'},
{u'count': 5, u'stem': u'gilbert', u'term': u'gilbert'},
{u'count': 5, u'stem': u'complaint', u'term': u'complaints'},
{u'count': 5, u'stem': u'miller', u'term': u'miller'},
{u'count': 5, u'stem': u'tom', u'term': u'tom'},
{u'count': 5, u'stem': u'robert', u'term': u'robert'},
{u'count': 5, u'stem': u'advoc', u'term': u'advocates'},
{u'count': 5, u'stem': u'non', u'term': u'non'},
{u'count': 5, u'stem': u'plaza', u'term': u'plaza'},
{u'count': 5, u'stem': u'quartet', u'term': u'quartet'},
{u'count': 5, u'stem': u'holiday', u'term': u'holiday'},
{u'count': 5, u'stem': u'sept', u'term': u'sept'},
{u'count': 5, u'stem': u'pedestrian', u'term': u'pedestrian'},
{u'count': 5, u'stem': u'plummer', u'term': u'plummer'},
{u'count': 5, u'stem': u'choke', u'term': u'choke'},
{u'count': 5, u'stem': u'burlington', u'term': u'burlington'},
{u'count': 4, u'stem': u'retail', u'term': u'retailer'},
{u'count': 4, u'stem': u'ridg', u'term': u'ridge'},
{u'count': 4, u'stem': u'physician', u'term': u'physicians'},
{u'count': 4, u'stem': u'conserv', u'term': u'conservation'},
{u'count': 4, u'stem': u'gut', u'term': u'gut'},
{u'count': 4, u'stem': u'crimin', u'term': u'criminal'},
{u'count': 4, u'stem': u'van', u'term': u'van'},
{u'count': 4, u'stem': u'marijuana', u'term': u'marijuana'},
{u'count': 4, u'stem': u'loebsack', u'term': u'loebsack'},
{u'count': 4, u'stem': u'moin', u'term': u'moines'},
{u'count': 4, u'stem': u'laura', u'term': u'laura'},
{u'count': 4, u'stem': u'coordin', u'term': u'coordinator'},
{u'count': 4, u'stem': u'supervisor', u'term': u'supervisors'},
{u'count': 4, u'stem': u'tiffin', u'term': u'tiffin'},
{u'count': 4, u'stem': u'trail', u'term': u'trail'},
{u'count': 4, u'stem': u'dan', u'term': u'dan'},
{u'count': 4, u'stem': u'bachelor', u'term': u'bachelor'},
{u'count': 4, u'stem': u'child', u'term': u'child'},
{u'count': 4, u'stem': u'econom', u'term': u'economic'},
{u'count': 4, u'stem': u'sycamor', u'term': u'sycamore'},
{u'count': 4, u'stem': u'cedar', u'term': u'cedar'},
{u'count': 4, u'stem': u'nathan', u'term': u'nathan'},
{u'count': 4, u'stem': u'campus', u'term': u'campus'},
{u'count': 4, u'stem': u'linn', u'term': u'linn'},
{u'count': 4, u'stem': u'dodg', u'term': u'dodge'},
{u'count': 4, u'stem': u'campaign', u'term': u'campaign'},
{u'count': 4, u'stem': u'steve', u'term': u'steve'},
{u'count': 4, u'stem': u'des', u'term': u'des'},
{u'count': 4, u'stem': u'william', u'term': u'williams'},
{u'count': 4, u'stem': u'reward', u'term': u'reward'},
{u'count': 4, u'stem': u'casino', u'term': u'casino'},
{u'count': 4, u'stem': u'roosevelt', u'term': u'roosevelt'},
{u'count': 4, u'stem': u'nurs', u'term': u'nursing'},
{u'count': 4, u'stem': u'winner', u'term': u'winners'},
{u'count': 4, u'stem': u'iowan', u'term': u'iowans'},
{u'count': 4, u'stem': u'krei', u'term': u'krei'},
{u'count': 4, u'stem': u'cancer', u'term': u'cancer'},
{u'count': 4, u'stem': u'commiss', u'term': u'commission'},
{u'count': 4, u'stem': u'agreement', u'term': u'agreement'},
{u'count': 4, u'stem': u'foundat', u'term': u'foundation'},
{u'count': 4, u'stem': u'christoph', u'term': u'christopher'},
{u'count': 4, u'stem': u'hart', u'term': u'hart'},
{u'count': 4, u'stem': u'attende', u'term': u'attendees'},
{u'count': 4, u'stem': u'increment', u'term': u'increment'},
{u'count': 4, u'stem': u'michael', u'term': u'michael'},
{u'count': 3, u'stem': u'cinema', u'term': u'cinema'},
{u'count': 3, u'stem': u'collabor', u'term': u'collaborated'},
{u'count': 3, u'stem': u'mural', u'term': u'mural'},
{u'count': 3, u'stem': u'smith', u'term': u'smith'},
{u'count': 3, u'stem': u'yoga', u'term': u'yoga'},
{u'count': 3, u'stem': u'interst', u'term': u'interstate'},
{u'count': 3, u'stem': u'mason', u'term': u'mason'},
{u'count': 3, u'stem': u'tap', u'term': u'tap'},
{u'count': 3, u'stem': u'celebr', u'term': u'celebrate'},
{u'count': 3, u'stem': u'ceremoni', u'term': u'ceremony'},
{u'count': 3, u'stem': u'discov', u'term': u'discovered'},
{u'count': 3, u'stem': u'david', u'term': u'david'},
{u'count': 3, u'stem': u'crimestopp', u'term': u'crimestoppers'},
{u'count': 3, u'stem': u'presidenti', u'term': u'presidential'},
{u'count': 3, u'stem': u'armi', u'term': u'army'},
{u'count': 3, u'stem': u'mom', u'term': u'mom'},
{u'count': 3, u'stem': u'prairi', u'term': u'prairie'},
{u'count': 3, u'stem': u'unfair', u'term': u'unfairly'},
{u'count': 3, u'stem': u'wrap', u'term': u'wrapping'},
{u'count': 3, u'stem': u'hawk', u'term': u'hawk'},
{u'count': 3, u'stem': u'johnston', u'term': u'johnston'},
{u'count': 3, u'stem': u'deputi', u'term': u'deputy'},
{u'count': 3, u'stem': u'strive', u'term': u'strive'},
{u'count': 3, u'stem': u'martin', u'term': u'martin'},
{u'count': 3, u'stem': u'prohibit', u'term': u'prohibition'},
{u'count': 3, u'stem': u'massachusett', u'term': u'massachusetts'},
{u'count': 3, u'stem': u'mari', u'term': u'mary'},
{u'count': 3, u'stem': u'crisi', u'term': u'crisis'},
{u'count': 3, u'stem': u'reconstruct', u'term': u'reconstruction'},
{u'count': 3, u'stem': u'classroom', u'term': u'classrooms'},
{u'count': 3, u'stem': u'shuttl', u'term': u'shuttle'},
{u'count': 3, u'stem': u'provost', u'term': u'provost'},
{u'count': 3, u'stem': u'solon', u'term': u'solon'},
{u'count': 3, u'stem': u'toy', u'term': u'toys'},
{u'count': 3, u'stem': u'couldn', u'term': u'couldn'},
{u'count': 3, u'stem': u'romney', u'term': u'romney'},
{u'count': 3, u'stem': u'implement', u'term': u'implement'},
{u'count': 3, u'stem': u'jeff', u'term': u'jeff'},
{u'count': 3, u'stem': u'nomin', u'term': u'nomination'},
{u'count': 3, u'stem': u'dolan', u'term': u'dolan'},
{u'count': 3, u'stem': u'brian', u'term': u'brian'},
{u'count': 3, u'stem': u'ensur', u'term': u'ensure'},
{u'count': 3, u'stem': u'harri', u'term': u'harry'},
{u'count': 3, u'stem': u'waterfront', u'term': u'waterfront'},
{u'count': 3, u'stem': u'gop', u'term': u'gop'},
{u'count': 3, u'stem': u'par', u'term': u'par'},
{u'count': 3, u'stem': u'org', u'term': u'org'},
{u'count': 3, u'stem': u'pro', u'term': u'pro'},
{u'count': 3, u'stem': u'funer', u'term': u'funeral'},
{u'count': 3, u'stem': u'unemploy', u'term': u'unemployment'},
{u'count': 3, u'stem': u'lawsuit', u'term': u'lawsuit'},
{u'count': 3, u'stem': u'vee', u'term': u'vee'},
{u'count': 3, u'stem': u'joe', u'term': u'joe'},
{u'count': 3, u'stem': u'exceed', u'term': u'exceeded'},
{u'count': 3, u'stem': u'refresh', u'term': u'refreshments'},
{u'count': 3, u'stem': u'jim', u'term': u'jim'},
{u'count': 3, u'stem': u'victim', u'term': u'victim'},
{u'count': 3, u'stem': u'jecc', u'term': u'jecc'},
{u'count': 3, u'stem': u'journal', u'term': u'journalism'},
{u'count': 3, u'stem': u'firearm', u'term': u'firearm'},
{u'count': 3, u'stem': u'john', u'term': u'john'},
{u'count': 3, u'stem': u'coral', u'term': u'coral'},
{u'count': 3, u'stem': u'meyer', u'term': u'meyer'},
{u'count': 3, u'stem': u'prioriti', u'term': u'priority'},
{u'count': 3, u'stem': u'mitt', u'term': u'mitt'},
{u'count': 3, u'stem': u'agenda', u'term': u'agenda'},
{u'count': 3, u'stem': u'tim', u'term': u'tim'},
{u'count': 3, u'stem': u'longfellow', u'term': u'longfellow'},
{u'count': 3, u'stem': u'slockett', u'term': u'slockett'}]
In [15]:
shuffle(media)
for media_source in media:
media_source['word_counts'] = mc.wordCount( get_wc_query( media_source['media_id'] ) )
mc.wordCount
In [16]:
for media_source in media:
source_word_counts = media_source[ 'word_counts']
counts = [ count[ 'count'] for count in source_word_counts ]
total_words = float( sum( counts ) )
#print total_words
scaled_counts = dict ( [ ( count['term'], count['count'] / total_words) for count in source_word_counts] )
#print scaled_counts
media_source['scaled_counts'] = scaled_counts
In [17]:
media[0].keys()
Out[17]:
['Partisan code', 'scaled_counts', 'word_counts', 'media_id', 'Media_url']
In [21]:
for media_source in media:
words_present = dict ( [ ( word, True ) for word in media_source['scaled_counts'].keys() ] )
#print scaled_counts
media_source['words_present'] = words_present
In [22]:
cPickle.dump( media, file( os.path.expanduser( '~/Dropbox/mc/media.pickle' ), 'wb' ) )
In [1]:
import cPickle
import os.path
media = cPickle.load( file( os.path.expanduser( '~/Dropbox/mc/media.pickle' ), 'r' ) )
In [2]:
def get_media_with_valid_partisan_codes( media ):
for media_source in media:
media_source['Partisan code'] = media_source['Partisan code'].strip().lower()
valid_partisan_codes = set ( ['liberal',
'not valid (broken link, not in english, etc.)',
'conservative',
'libertarian',
'none of the above'])
media = [ media_source for media_source in media if media_source['Partisan code'] in valid_partisan_codes ]
return media
In [3]:
print len (media)
media = get_media_with_valid_partisan_codes( media )
print len (media )
1359
1359
In [4]:
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
from sklearn import preprocessing
from collections import Counter
def get_X_Y( media ):
feature_field = 'words_present'
feature_dicts = [ media_source[ feature_field ] for media_source in media ]
trans = vec.fit_transform( feature_dicts )
X = trans.toarray()
partisan_codes = [ media_source[ 'Partisan code' ] for media_source in media]
c = Counter( partisan_codes )
#print set(c.keys())
le.fit( partisan_codes )
Y = le.transform( partisan_codes )
return X, Y
In [5]:
X, Y = get_X_Y( media )
print len(X)
print len(Y)
1359
1359
In [7]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=33)
In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score, KFold
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
import sklearn.metrics
from sklearn.metrics import confusion_matrix
clfs = [ MultinomialNB(), svm.SVC( probability=True, kernel = 'rbf'), LinearSVC(), SGDClassifier() ]
for clf in clfs:
print clf
clf.fit( X_train, y_train)
y_pred = clf.predict( X_train)
accuracy = sklearn.metrics.accuracy_score( y_train, y_pred )
print accuracy
print "calculating confusion matrix"
cm = confusion_matrix( y_train, y_pred)
print "confusion matrix"
print cm
#cv = KFold( len(X), 5, shuffle=True, random_state=0 )
#scores = cross_val_score( LinearSVC(), X, Y, cv=cv)
#print scores
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
0.857703631011
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-9-29251be541d4> in <module>()
10 for clf in clfs:
11 print clf
---> 12 clf.fit( X_train, y_train)
13 y_pred = clf.predict( X_train)
14 accuracy = sklearn.metrics.accuracy_score( y_train, y_pred )
/usr/local/lib/python2.7/dist-packages/sklearn/svm/base.pyc in fit(self, X, y, sample_weight)
174
175 seed = rnd.randint(np.iinfo('i').max)
--> 176 fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
177 # see comment on the other call to np.iinfo in this file
178
/usr/local/lib/python2.7/dist-packages/sklearn/svm/base.pyc in _dense_fit(self, X, y, sample_weight, solver_type, kernel, random_seed)
229 cache_size=self.cache_size, coef0=self.coef0,
230 gamma=self._gamma, epsilon=self.epsilon,
--> 231 max_iter=self.max_iter, random_seed=random_seed)
232
233 self._warn_from_fit_status()
KeyboardInterrupt:
[[193 0 0 32 1]
[ 1 213 0 28 0]
[ 3 3 5 13 0]
[ 6 0 1 439 2]
[ 8 8 0 39 24]]
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
kernel='rbf', max_iter=-1, probability=True, random_state=None,
shrinking=True, tol=0.001, verbose=False)
In [6]:
partisan_codes = [ media_source[ 'Partisan code' ] for media_source in media]
from collections import Counter
c = Counter( partisan_codes )
print set(c.keys())
c
set(['libertarian', 'liberal', 'not valid (broken link, not in english, etc.)', 'none of the above', 'conservative'])
Out[6]:
Counter({'none of the above': 591, 'liberal': 328, 'conservative': 301, 'not valid (broken link, not in english, etc.)': 104, 'libertarian': 35})
In [12]:
In [106]:
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf = svm.SVC( probability=True, kernel = 'rbf')
clf.fit( X[:10], Y[:10] )
Out[106]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
kernel='rbf', max_iter=-1, probability=True, random_state=None,
shrinking=True, tol=0.001, verbose=False)
In [107]:
#print clf.predict_proba( X[:10])
print clf.predict( X[:10])
print Y[:10]
[1 1 1 1 1 1 1 1 1 1]
[4 0 1 0 1 3 0 1 1 3]
In [97]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
clf.fit(X[:10], Y[:10])
clf.coef_
Out[97]:
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
In [77]:
Y
Y_pred = clf.predict( X )
In [80]:
import sklearn.metrics
sklearn.metrics.accuracy_score( Y, Y_pred )
Out[80]:
0.43487858719646799
In [69]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix( Y, Y_pred)
print cm
print Y_pred
print Y
print Counter( Y )
print Counter( Y_pred )
[[ 0 0 0 301 0]
[ 0 0 0 328 0]
[ 0 0 0 35 0]
[ 0 0 0 591 0]
[ 0 0 0 104 0]]
[3 3 3 ..., 3 3 3]
[4 0 1 ..., 3 3 3]
Counter({3: 591, 1: 328, 0: 301, 4: 104, 2: 35})
Counter({3: 1359})
In [76]:
le.inverse_transform( Y )
Out[76]:
array(['Conservative', 'None of the above', 'None of the above', ...,
'Not Valid (broken link, not in English, etc.)', 'Liberal',
'None of the above'],
dtype='<S53')
In [11]:
Content source: AchyuthIIIT/mediacloud
Similar notebooks: