In [1]:
import cPickle
import os.path

# Load the Media Cloud API key from a pickle in the user's home directory, so
# the key itself never appears in the notebook.
# NOTE(review): unpickling can execute arbitrary code; only load a pickle file
# that you created yourself.
# The file is opened in binary mode (what pickle expects) and inside a `with`
# block so the handle is closed as soon as the key has been read.
with open( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'rb' ) as api_key_file:
    api_key = cPickle.load( api_key_file )

In [2]:
import mediacloud, json
# Create the Media Cloud API client, constructed with the API key loaded in the
# first cell; later cells issue their requests through `mc`.
mc = mediacloud.api.MediaCloud(api_key)

In [3]:
import csv
# Read the partisan-coding spreadsheet into `media`: a list with one dict per
# CSV row, keyed by the column names taken from the file's first line.
# The `with` block closes the file once every row has been materialized (the
# original `file(...)` handle was never closed explicitly).
with open( 'partisan_coding_20140218.csv' ) as partisan_coding_file:
    partisan_coding = csv.DictReader( partisan_coding_file )
    media = list( partisan_coding )

# Sanity check: number of rows read from the spreadsheet.
print( len( media ) )


1363

In [4]:
# Load a previously pickled copy of the media list.
# NOTE(review): this rebinds `media`, discarding the rows parsed from the CSV in
# the previous cell. The pickled entries presumably carry extra fields (a later
# cell reads media[0]['word_counts']) -- confirm against whatever wrote the pickle.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
# Binary mode is what pickle expects, and the `with` block closes the handle.
with open( os.path.expanduser( '~/Dropbox/mc/media.pickle' ), 'rb' ) as media_file:
    media = cPickle.load( media_file )

# Display the number of entries as the cell's output.
len(media)


Out[4]:
1363

In [12]:
def get_wc_query( media_id, start_date='2012-01-01T00:00:00.000Z', end_date='2014-01-01T00:00:00.000Z' ):
    """Build the word-count query string for a single media source.

    Args:
        media_id: identifier of the media source; inserted into the query via
            str.format, so an int or a string both work.
        start_date: first bound of the publish_date range, as a timestamp
            string. Defaults to the value that used to be hard-coded here.
        end_date: second bound of the publish_date range, as a timestamp
            string. Defaults to the value that used to be hard-coded here.

    Returns:
        A string of the form
        "+publish_date:[<start_date> TO <end_date>] AND +media_id:<media_id>".
        With the default dates the result is identical to what the original
        one-argument version produced.
    """
    return "+publish_date:[{} TO {}] AND +media_id:{}".format( start_date, end_date, media_id )

In [13]:
# Compare the word counts stored with the first media entry against a fresh
# word-count request for the same media source.
first_source = media[0]
wc_old = first_source['word_counts']
first_media_id = first_source['media_id']

# The query is spelled out by hand and also produced by get_wc_query(); both
# are printed so the two strings can be compared by eye.
query = "+publish_date:[2012-01-01T00:00:00.000Z TO 2014-01-01T00:00:00.000Z] AND +media_id:{}".format( first_media_id )
print( query )
print( get_wc_query( first_media_id ) )

# Fetch the current word counts and show them as the cell's output.
wc_new = mc.wordCount( query, '', 'none' )
wc_new


+publish_date:[2012-01-01T00:00:00.000Z TO 2014-01-01T00:00:00.000Z] AND +media_id:18175
+publish_date:[2012-01-01T00:00:00.000Z TO 2014-01-01T00:00:00.000Z] AND +media_id:18175
Out[13]:
[{u'count': 1083, u'stem': u'the', u'term': u'the'},
 {u'count': 515, u'stem': u'and', u'term': u'and'},
 {u'count': 220, u'stem': u'for', u'term': u'for'},
 {u'count': 220, u'stem': u'said', u'term': u'said'},
 {u'count': 197, u'stem': u'iowa', u'term': u'iowa'},
 {u'count': 189, u'stem': u'that', u'term': u'that'},
 {u'count': 132, u'stem': u'city', u'term': u'city'},
 {u'count': 124, u'stem': u'with', u'term': u'with'},
 {u'count': 120, u'stem': u'will', u'term': u'will'},
 {u'count': 101, u'stem': u'was', u'term': u'was'},
 {u'count': 99, u'stem': u'from', u'term': u'from'},
 {u'count': 78, u'stem': u'have', u'term': u'have'},
 {u'count': 68, u'stem': u'are', u'term': u'are'},
 {u'count': 67, u'stem': u'about', u'term': u'about'},
 {u'count': 61, u'stem': u'more', u'term': u'more'},
 {u'count': 61, u'stem': u'his', u'term': u'his'},
 {u'count': 60, u'stem': u'has', u'term': u'has'},
 {u'count': 53, u'stem': u'she', u'term': u'she'},
 {u'count': 50, u'stem': u'school', u'term': u'school'},
 {u'count': 47, u'stem': u'her', u'term': u'her'},
 {u'count': 47, u'stem': u'university', u'term': u'university'},
 {u'count': 47, u'stem': u'their', u'term': u'their'},
 {u'count': 46, u'stem': u'this', u'term': u'this'},
 {u'count': 46, u'stem': u'police', u'term': u'police'},
 {u'count': 45, u'stem': u'but', u'term': u'but'},
 {u'count': 45, u'stem': u'year', u'term': u'year'},
 {u'count': 43, u'stem': u'who', u'term': u'who'},
 {u'count': 43, u'stem': u'one', u'term': u'one'},
 {u'count': 41, u'stem': u'state', u'term': u'state'},
 {u'count': 41, u'stem': u'they', u'term': u'they'},
 {u'count': 41, u'stem': u'not', u'term': u'not'},
 {u'count': 37, u'stem': u'all', u'term': u'all'},
 {u'count': 37, u'stem': u'county', u'term': u'county'},
 {u'count': 35, u'stem': u'johnson', u'term': u'johnson'},
 {u'count': 35, u'stem': u'when', u'term': u'when'},
 {u'count': 33, u'stem': u'after', u'term': u'after'},
 {u'count': 33, u'stem': u'there', u'term': u'there'},
 {u'count': 32, u'stem': u'which', u'term': u'which'},
 {u'count': 31, u'stem': u'people', u'term': u'people'},
 {u'count': 31, u'stem': u'information', u'term': u'information'},
 {u'count': 31, u'stem': u'also', u'term': u'also'},
 {u'count': 31, u'stem': u'been', u'term': u'been'},
 {u'count': 30, u'stem': u'were', u'term': u'were'},
 {u'count': 30, u'stem': u'new', u'term': u'new'},
 {u'count': 30, u'stem': u'center', u'term': u'center'},
 {u'count': 30, u'stem': u'two', u'term': u'two'},
 {u'count': 30, u'stem': u'community', u'term': u'community'},
 {u'count': 28, u'stem': u'would', u'term': u'would'},
 {u'count': 28, u'stem': u'district', u'term': u'district'},
 {u'count': 28, u'stem': u'other', u'term': u'other'},
 {u'count': 27, u'stem': u'board', u'term': u'board'},
 {u'count': 26, u'stem': u'into', u'term': u'into'},
 {u'count': 26, u'stem': u'out', u'term': u'out'},
 {u'count': 25, u'stem': u'think', u'term': u'think'},
 {u'count': 25, u'stem': u'students', u'term': u'students'},
 {u'count': 24, u'stem': u'had', u'term': u'had'},
 {u'count': 24, u'stem': u'first', u'term': u'first'},
 {u'count': 23, u'stem': u'what', u'term': u'what'},
 {u'count': 23, u'stem': u'its', u'term': u'its'},
 {u'count': 23, u'stem': u'public', u'term': u'public'},
 {u'count': 22, u'stem': u'can', u'term': u'can'},
 {u'count': 22, u'stem': u'some', u'term': u'some'},
 {u'count': 22, u'stem': u'house', u'term': u'house'},
 {u'count': 22, u'stem': u'years', u'term': u'years'},
 {u'count': 22, u'stem': u'high', u'term': u'high'},
 {u'count': 21, u'stem': u'than', u'term': u'than'},
 {u'count': 21, u'stem': u'now', u'term': u'now'},
 {u'count': 21, u'stem': u'program', u'term': u'program'},
 {u'count': 21, u'stem': u'day', u'term': u'day'},
 {u'count': 20, u'stem': u'going', u'term': u'going'},
 {u'count': 20, u'stem': u'building', u'term': u'building'},
 {u'count': 20, u'stem': u'our', u'term': u'our'},
 {u'count': 20, u'stem': u'according', u'term': u'according'},
 {u'count': 20, u'stem': u'them', u'term': u'them'},
 {u'count': 19, u'stem': u'really', u'term': u'really'},
 {u'count': 19, u'stem': u'him', u'term': u'him'},
 {u'count': 19, u'stem': u'time', u'term': u'time'},
 {u'count': 19, u'stem': u'before', u'term': u'before'},
 {u'count': 18, u'stem': u'you', u'term': u'you'},
 {u'count': 18, u'stem': u'just', u'term': u'just'},
 {u'count': 18, u'stem': u'host', u'term': u'host'},
 {u'count': 17, u'stem': u'get', u'term': u'get'},
 {u'count': 17, u'stem': u'college', u'term': u'college'},
 {u'count': 17, u'stem': u'members', u'term': u'members'},
 {u'count': 17, u'stem': u'million', u'term': u'million'},
 {u'count': 17, u'stem': u'three', u'term': u'three'},
 {u'count': 16, u'stem': u'most', u'term': u'most'},
 {u'count': 16, u'stem': u'only', u'term': u'only'},
 {u'count': 16, u'stem': u'property', u'term': u'property'},
 {u'count': 16, u'stem': u'well', u'term': u'well'},
 {u'count': 16, u'stem': u'over', u'term': u'over'},
 {u'count': 16, u'stem': u'like', u'term': u'like'},
 {u'count': 16, u'stem': u'meeting', u'term': u'meeting'},
 {u'count': 16, u'stem': u'call', u'term': u'call'},
 {u'count': 16, u'stem': u'service', u'term': u'service'},
 {u'count': 16, u'stem': u'very', u'term': u'very'},
 {u'count': 16, u'stem': u'take', u'term': u'take'},
 {u'count': 15, u'stem': u'local', u'term': u'local'},
 {u'count': 15, u'stem': u'four', u'term': u'four'},
 {u'count': 15, u'stem': u'being', u'term': u'being'},
 {u'count': 15, u'stem': u'old', u'term': u'old'},
 {u'count': 15, u'stem': u'president', u'term': u'president'},
 {u'count': 15, u'stem': u'while', u'term': u'while'},
 {u'count': 14, u'stem': u'funding', u'term': u'funding'},
 {u'count': 14, u'stem': u'woman', u'term': u'woman'},
 {u'count': 14, u'stem': u'during', u'term': u'during'},
 {u'count': 14, u'stem': u'home', u'term': u'home'},
 {u'count': 14, u'stem': u'second', u'term': u'second'},
 {u'count': 14, u'stem': u'april', u'term': u'april'},
 {u'count': 14, u'stem': u'percent', u'term': u'percent'},
 {u'count': 14, u'stem': u'free', u'term': u'free'},
 {u'count': 14, u'stem': u'because', u'term': u'because'},
 {u'count': 13, u'stem': u'tax', u'term': u'tax'},
 {u'count': 13, u'stem': u'committee', u'term': u'committee'},
 {u'count': 13, u'stem': u'how', u'term': u'how'},
 {u'count': 13, u'stem': u'made', u'term': u'made'},
 {u'count': 13, u'stem': u'including', u'term': u'including'},
 {u'count': 13, u'stem': u'art', u'term': u'art'},
 {u'count': 13, u'stem': u'work', u'term': u'work'},
 {u'count': 13, u'stem': u'see', u'term': u'see'},
 {u'count': 12, u'stem': u'could', u'term': u'could'},
 {u'count': 12, u'stem': u'area', u'term': u'area'},
 {u'count': 12, u'stem': u'today', u'term': u'today'},
 {u'count': 12, u'stem': u'fire', u'term': u'fire'},
 {u'count': 12, u'stem': u'many', u'term': u'many'},
 {u'count': 12, u'stem': u'way', u'term': u'way'},
 {u'count': 12, u'stem': u'those', u'term': u'those'},
 {u'count': 12, u'stem': u'say', u'term': u'say'},
 {u'count': 12, u'stem': u'department', u'term': u'department'},
 {u'count': 12, u'stem': u'much', u'term': u'much'},
 {u'count': 12, u'stem': u'make', u'term': u'make'},
 {u'count': 12, u'stem': u'against', u'term': u'against'},
 {u'count': 12, u'stem': u'office', u'term': u'office'},
 {u'count': 12, u'stem': u'man', u'term': u'man'},
 {u'count': 12, u'stem': u'may', u'term': u'may'},
 {u'count': 12, u'stem': u'degree', u'term': u'degree'},
 {u'count': 12, u'stem': u'march', u'term': u'march'},
 {u'count': 12, u'stem': u'place', u'term': u'place'},
 {u'count': 12, u'stem': u'coralville', u'term': u'coralville'},
 {u'count': 12, u'stem': u'schools', u'term': u'schools'},
 {u'count': 12, u'stem': u'family', u'term': u'family'},
 {u'count': 12, u'stem': u'council', u'term': u'council'},
 {u'count': 12, u'stem': u'each', u'term': u'each'},
 {u'count': 12, u'stem': u'food', u'term': u'food'},
 {u'count': 12, u'stem': u'street', u'term': u'street'},
 {u'count': 11, u'stem': u'project', u'term': u'project'},
 {u'count': 11, u'stem': u'education', u'term': u'education'},
 {u'count': 11, u'stem': u'both', u'term': u'both'},
 {u'count': 11, u'stem': u'third', u'term': u'third'},
 {u'count': 11, u'stem': u'monday', u'term': u'monday'},
 {u'count': 11, u'stem': u'online', u'term': u'online'},
 {u'count': 11, u'stem': u'law', u'term': u'law'},
 {u'count': 11, u'stem': u'where', u'term': u'where'},
 {u'count': 11, u'stem': u'know', u'term': u'know'},
 {u'count': 11, u'stem': u'early', u'term': u'early'},
 {u'count': 11, u'stem': u'number', u'term': u'number'},
 {u'count': 11, u'stem': u'based', u'term': u'based'},
 {u'count': 11, u'stem': u'lot', u'term': u'lot'},
 {u'count': 11, u'stem': u'still', u'term': u'still'},
 {u'count': 11, u'stem': u'teacher', u'term': u'teacher'},
 {u'count': 11, u'stem': u'tuesday', u'term': u'tuesday'},
 {u'count': 11, u'stem': u'www', u'term': u'www'},
 {u'count': 11, u'stem': u'children', u'term': u'children'},
 {u'count': 11, u'stem': u'former', u'term': u'former'},
 {u'count': 11, u'stem': u'through', u'term': u'through'},
 {u'count': 10, u'stem': u'show', u'term': u'show'},
 {u'count': 10, u'stem': u'criminal', u'term': u'criminal'},
 {u'count': 10, u'stem': u'church', u'term': u'church'},
 {u'count': 10, u'stem': u'increase', u'term': u'increase'},
 {u'count': 10, u'stem': u'group', u'term': u'group'},
 {u'count': 10, u'stem': u'back', u'term': u'back'},
 {u'count': 10, u'stem': u'org', u'term': u'org'},
 {u'count': 10, u'stem': u'design', u'term': u'design'},
 {u'count': 10, u'stem': u'another', u'term': u'another'},
 {u'count': 10, u'stem': u'world', u'term': u'world'},
 {u'count': 10, u'stem': u'such', u'term': u'such'},
 {u'count': 10, u'stem': u'director', u'term': u'director'},
 {u'count': 10, u'stem': u'development', u'term': u'development'},
 {u'count': 10, u'stem': u'open', u'term': u'open'},
 {u'count': 10, u'stem': u'staff', u'term': u'staff'},
 {u'count': 10, u'stem': u'often', u'term': u'often'},
 {u'count': 10, u'stem': u'here', u'term': u'here'},
 {u'count': 10, u'stem': u'says', u'term': u'says'},
 {u'count': 10, u'stem': u'press', u'term': u'press'},
 {u'count': 10, u'stem': u'found', u'term': u'found'},
 {u'count': 10, u'stem': u'officer', u'term': u'officer'},
 {u'count': 10, u'stem': u'these', u'term': u'these'},
 {u'count': 10, u'stem': u'five', u'term': u'five'},
 {u'count': 9, u'stem': u'student', u'term': u'student'},
 {u'count': 9, u'stem': u'located', u'term': u'located'},
 {u'count': 9, u'stem': u'thursday', u'term': u'thursday'},
 {u'count': 9, u'stem': u'alcohol', u'term': u'alcohol'},
 {u'count': 9, u'stem': u'cost', u'term': u'cost'},
 {u'count': 9, u'stem': u'officers', u'term': u'officers'},
 {u'count': 9, u'stem': u'saturday', u'term': u'saturday'},
 {u'count': 9, u'stem': u'friday', u'term': u'friday'},
 {u'count': 9, u'stem': u'flood', u'term': u'flood'},
 {u'count': 9, u'stem': u'wednesday', u'term': u'wednesday'},
 {u'count': 9, u'stem': u'good', u'term': u'good'},
 {u'count': 9, u'stem': u'west', u'term': u'west'},
 {u'count': 9, u'stem': u'arts', u'term': u'arts'},
 {u'count': 9, u'stem': u'iowans', u'term': u'iowans'},
 {u'count': 9, u'stem': u'request', u'term': u'request'},
 {u'count': 9, u'stem': u'liberty', u'term': u'liberty'},
 {u'count': 9, u'stem': u'include', u'term': u'include'},
 {u'count': 9, u'stem': u'north', u'term': u'north'},
 {u'count': 9, u'stem': u'citizen', u'term': u'citizen'},
 {u'count': 9, u'stem': u'facility', u'term': u'facility'},
 {u'count': 9, u'stem': u'should', u'term': u'should'},
 {u'count': 9, u'stem': u'business', u'term': u'business'},
 {u'count': 9, u'stem': u'don', u'term': u'don'},
 {u'count': 9, u'stem': u'wood', u'term': u'wood'},
 {u'count': 9, u'stem': u'system', u'term': u'system'},
 {u'count': 9, u'stem': u'national', u'term': u'national'},
 {u'count': 9, u'stem': u'fall', u'term': u'fall'},
 {u'count': 9, u'stem': u'then', u'term': u'then'},
 {u'count': 9, u'stem': u'elementary', u'term': u'elementary'},
 {u'count': 9, u'stem': u'six', u'term': u'six'},
 {u'count': 9, u'stem': u'between', u'term': u'between'},
 {u'count': 9, u'stem': u'change', u'term': u'change'},
 {u'count': 9, u'stem': u'called', u'term': u'called'},
 {u'count': 9, u'stem': u'across', u'term': u'across'},
 {u'count': 9, u'stem': u'didn', u'term': u'didn'},
 {u'count': 9, u'stem': u'general', u'term': u'general'},
 {u'count': 9, u'stem': u'attorney', u'term': u'attorney'},
 {u'count': 8, u'stem': u'issue', u'term': u'issue'},
 {u'count': 8, u'stem': u'set', u'term': u'set'},
 {u'count': 8, u'stem': u'white', u'term': u'white'},
 {u'count': 8, u'stem': u'programs', u'term': u'programs'},
 {u'count': 8, u'stem': u'something', u'term': u'something'},
 {u'count': 8, u'stem': u'action', u'term': u'action'},
 {u'count': 8, u'stem': u'states', u'term': u'states'},
 {u'count': 8, u'stem': u'money', u'term': u'money'},
 {u'count': 8, u'stem': u'any', u'term': u'any'},
 {u'count': 8, u'stem': u'part', u'term': u'part'},
 {u'count': 8, u'stem': u'senior', u'term': u'senior'},
 {u'count': 8, u'stem': u'come', u'term': u'come'},
 {u'count': 8, u'stem': u'less', u'term': u'less'},
 {u'count': 8, u'stem': u'last', u'term': u'last'},
 {u'count': 8, u'stem': u'music', u'term': u'music'},
 {u'count': 8, u'stem': u'com', u'term': u'com'},
 {u'count': 8, u'stem': u'makes', u'term': u'makes'},
 {u'count': 8, u'stem': u'use', u'term': u'use'},
 {u'count': 8, u'stem': u'told', u'term': u'told'},
 {u'count': 8, u'stem': u'interest', u'term': u'interest'},
 {u'count': 8, u'stem': u'obama', u'term': u'obama'},
 {u'count': 8, u'stem': u'process', u'term': u'process'},
 {u'count': 8, u'stem': u'budget', u'term': u'budget'},
 {u'count': 8, u'stem': u'drive', u'term': u'drive'},
 {u'count': 8, u'stem': u'run', u'term': u'run'},
 {u'count': 8, u'stem': u'took', u'term': u'took'},
 {u'count': 8, u'stem': u'downtown', u'term': u'downtown'},
 {u'count': 8, u'stem': u'court', u'term': u'court'},
 {u'count': 8, u'stem': u'half', u'term': u'half'},
 {u'count': 8, u'stem': u'among', u'term': u'among'},
 {u'count': 8, u'stem': u'facilities', u'term': u'facilities'},
 {u'count': 8, u'stem': u'always', u'term': u'always'},
 {u'count': 8, u'stem': u'within', u'term': u'within'},
 {u'count': 8, u'stem': u'level', u'term': u'level'},
 {u'count': 8, u'stem': u'spring', u'term': u'spring'},
 {u'count': 8, u'stem': u'sunday', u'term': u'sunday'},
 {u'count': 8, u'stem': u'present', u'term': u'present'},
 {u'count': 8, u'stem': u'around', u'term': u'around'},
 {u'count': 8, u'stem': u'services', u'term': u'services'},
 {u'count': 7, u'stem': u'taken', u'term': u'taken'},
 {u'count': 7, u'stem': u'better', u'term': u'better'},
 {u'count': 7, u'stem': u'green', u'term': u'green'},
 {u'count': 7, u'stem': u'country', u'term': u'country'},
 {u'count': 7, u'stem': u'doing', u'term': u'doing'},
 {u'count': 7, u'stem': u'plan', u'term': u'plan'},
 {u'count': 7, u'stem': u'graduate', u'term': u'graduate'},
 {u'count': 7, u'stem': u'expected', u'term': u'expected'},
 {u'count': 7, u'stem': u'never', u'term': u'never'},
 {u'count': 7, u'stem': u'little', u'term': u'little'},
 {u'count': 7, u'stem': u'past', u'term': u'past'},
 {u'count': 7, u'stem': u'under', u'term': u'under'},
 {u'count': 7, u'stem': u'opportunity', u'term': u'opportunity'},
 {u'count': 7, u'stem': u'came', u'term': u'came'},
 {u'count': 7, u'stem': u'tom', u'term': u'tom'},
 {u'count': 7, u'stem': u'did', u'term': u'did'},
 {u'count': 7, u'stem': u'report', u'term': u'report'},
 {u'count': 7, u'stem': u'campus', u'term': u'campus'},
 {u'count': 7, u'stem': u'john', u'term': u'john'},
 {u'count': 7, u'stem': u'received', u'term': u'received'},
 {u'count': 7, u'stem': u'des', u'term': u'des'},
 {u'count': 7, u'stem': u'kids', u'term': u'kids'},
 {u'count': 7, u'stem': u'united', u'term': u'united'},
 {u'count': 7, u'stem': u'event', u'term': u'event'},
 {u'count': 7, u'stem': u'best', u'term': u'best'},
 {u'count': 7, u'stem': u'fund', u'term': u'fund'},
 {u'count': 7, u'stem': u'projects', u'term': u'projects'},
 {u'count': 7, u'stem': u'economic', u'term': u'economic'},
 {u'count': 7, u'stem': u'continue', u'term': u'continue'},
 {u'count': 7, u'stem': u'library', u'term': u'library'},
 {u'count': 7, u'stem': u'until', u'term': u'until'},
 {u'count': 7, u'stem': u'apartment', u'term': u'apartment'},
 {u'count': 7, u'stem': u'help', u'term': u'help'},
 {u'count': 7, u'stem': u'top', u'term': u'top'},
 {u'count': 7, u'stem': u'things', u'term': u'things'},
 {u'count': 7, u'stem': u'government', u'term': u'government'},
 {u'count': 7, u'stem': u'orchestra', u'term': u'orchestra'},
 {u'count': 7, u'stem': u'recently', u'term': u'recently'},
 {u'count': 7, u'stem': u'filed', u'term': u'filed'},
 {u'count': 7, u'stem': u'care', u'term': u'care'},
 {u'count': 7, u'stem': u'ago', u'term': u'ago'},
 {u'count': 7, u'stem': u'own', u'term': u'own'},
 {u'count': 7, u'stem': u'available', u'term': u'available'},
 {u'count': 7, u'stem': u'others', u'term': u'others'},
 {u'count': 7, u'stem': u'yet', u'term': u'yet'},
 {u'count': 7, u'stem': u'put', u'term': u'put'},
 {u'count': 7, u'stem': u'jail', u'term': u'jail'},
 {u'count': 7, u'stem': u'since', u'term': u'since'},
 {u'count': 7, u'stem': u'mother', u'term': u'mother'},
 {u'count': 7, u'stem': u'moines', u'term': u'moines'},
 {u'count': 7, u'stem': u'nearly', u'term': u'nearly'},
 {u'count': 7, u'stem': u'safety', u'term': u'safety'},
 {u'count': 7, u'stem': u'investigation', u'term': u'investigation'},
 {u'count': 7, u'stem': u'residence', u'term': u'residence'},
 {u'count': 7, u'stem': u'along', u'term': u'along'},
 {u'count': 6, u'stem': u'already', u'term': u'already'},
 {u'count': 6, u'stem': u'evening', u'term': u'evening'},
 {u'count': 6, u'stem': u'real', u'term': u'real'},
 {u'count': 6, u'stem': u'freeman', u'term': u'freeman'},
 {u'count': 6, u'stem': u'planning', u'term': u'planning'},
 {u'count': 6, u'stem': u'jones', u'term': u'jones'},
 {u'count': 6, u'stem': u'results', u'term': u'results'},
 {u'count': 6, u'stem': u'right', u'term': u'right'},
 {u'count': 6, u'stem': u'construction', u'term': u'construction'},
 {u'count': 6, u'stem': u'experience', u'term': u'experience'},
 {u'count': 6, u'stem': u'provide', u'term': u'provide'},
 {u'count': 6, u'stem': u'recycling', u'term': u'recycling'},
 {u'count': 6, u'stem': u'questions', u'term': u'questions'},
 {u'count': 6, u'stem': u'solon', u'term': u'solon'},
 {u'count': 6, u'stem': u'months', u'term': u'months'},
 {u'count': 6, u'stem': u'move', u'term': u'move'},
 {u'count': 6, u'stem': u'great', u'term': u'great'},
 {u'count': 6, u'stem': u'child', u'term': u'child'},
 {u'count': 6, u'stem': u'your', u'term': u'your'},
 {u'count': 6, u'stem': u'daughter', u'term': u'daughter'},
 {u'count': 6, u'stem': u'seen', u'term': u'seen'},
 {u'count': 6, u'stem': u'park', u'term': u'park'},
 {u'count': 6, u'stem': u'strong', u'term': u'strong'},
 {u'count': 6, u'stem': u'making', u'term': u'making'},
 {u'count': 6, u'stem': u'next', u'term': u'next'},
 {u'count': 6, u'stem': u'hopes', u'term': u'hopes'},
 {u'count': 6, u'stem': u'lower', u'term': u'lower'},
 {u'count': 6, u'stem': u'washington', u'term': u'washington'},
 {u'count': 6, u'stem': u'news', u'term': u'news'},
 {u'count': 6, u'stem': u'potential', u'term': u'potential'},
 {u'count': 6, u'stem': u'annual', u'term': u'annual'},
 {u'count': 6, u'stem': u'different', u'term': u'different'},
 {u'count': 6, u'stem': u'communities', u'term': u'communities'},
 {u'count': 6, u'stem': u'organization', u'term': u'organization'},
 {u'count': 6, u'stem': u'having', u'term': u'having'},
 {u'count': 6, u'stem': u'pay', u'term': u'pay'},
 {u'count': 6, u'stem': u'ave', u'term': u'ave'},
 {u'count': 6, u'stem': u'field', u'term': u'field'},
 {u'count': 6, u'stem': u'tif', u'term': u'tif'},
 {u'count': 6, u'stem': u'week', u'term': u'week'},
 {u'count': 6, u'stem': u'events', u'term': u'events'},
 {u'count': 6, u'stem': u'bill', u'term': u'bill'},
 {u'count': 6, u'stem': u'growth', u'term': u'growth'},
 {u'count': 6, u'stem': u'moved', u'term': u'moved'},
 {u'count': 6, u'stem': u'email', u'term': u'email'},
 {u'count': 6, u'stem': u'murdah', u'term': u'murdah'},
 {u'count': 6, u'stem': u'assistant', u'term': u'assistant'},
 {u'count': 6, u'stem': u'contact', u'term': u'contact'},
 {u'count': 6, u'stem': u'men', u'term': u'men'},
 {u'count': 6, u'stem': u'young', u'term': u'young'},
 {u'count': 6, u'stem': u'reform', u'term': u'reform'},
 {u'count': 6, u'stem': u'hours', u'term': u'hours'},
 {u'count': 6, u'stem': u'morning', u'term': u'morning'},
 {u'count': 6, u'stem': u'date', u'term': u'date'},
 {u'count': 6, u'stem': u'every', u'term': u'every'},
 {u'count': 6, u'stem': u'tang', u'term': u'tang'},
 {u'count': 6, u'stem': u'case', u'term': u'case'},
 {u'count': 6, u'stem': u'several', u'term': u'several'},
 {u'count': 6, u'stem': u'love', u'term': u'love'},
 {u'count': 6, u'stem': u'complaint', u'term': u'complaint'},
 {u'count': 6, u'stem': u'lawsuit', u'term': u'lawsuit'},
 {u'count': 6, u'stem': u'speech', u'term': u'speech'},
 {u'count': 6, u'stem': u'traffic', u'term': u'traffic'},
 {u'count': 6, u'stem': u'weeks', u'term': u'weeks'},
 {u'count': 6, u'stem': u'quality', u'term': u'quality'},
 {u'count': 6, u'stem': u'campaign', u'term': u'campaign'},
 {u'count': 6, u'stem': u'space', u'term': u'space'},
 {u'count': 6, u'stem': u'thought', u'term': u'thought'},
 {u'count': 6, u'stem': u'women', u'term': u'women'},
 {u'count': 6, u'stem': u'using', u'term': u'using'},
 {u'count': 6, u'stem': u'residents', u'term': u'residents'},
 {u'count': 6, u'stem': u'himself', u'term': u'himself'},
 {u'count': 6, u'stem': u'branstad', u'term': u'branstad'},
 {u'count': 6, u'stem': u'justice', u'term': u'justice'},
 {u'count': 6, u'stem': u'store', u'term': u'store'},
 {u'count': 6, u'stem': u'debate', u'term': u'debate'},
 {u'count': 6, u'stem': u'point', u'term': u'point'},
 {u'count': 6, u'stem': u'once', u'term': u'once'},
 {u'count': 6, u'stem': u'health', u'term': u'health'},
 {u'count': 6, u'stem': u'academy', u'term': u'academy'},
 {u'count': 6, u'stem': u'brown', u'term': u'brown'},
 {u'count': 6, u'stem': u'team', u'term': u'team'},
 {u'count': 5, u'stem': u'jan', u'term': u'jan'},
 {u'count': 5, u'stem': u'businesses', u'term': u'businesses'},
 {u'count': 5, u'stem': u'eligible', u'term': u'eligible'},
 {u'count': 5, u'stem': u'owner', u'term': u'owner'},
 {u'count': 5, u'stem': u'job', u'term': u'job'},
 {u'count': 5, u'stem': u'rates', u'term': u'rates'},
 {u'count': 5, u'stem': u'learning', u'term': u'learning'},
 {u'count': 5, u'stem': u'register', u'term': u'register'},
 {u'count': 5, u'stem': u'sexual', u'term': u'sexual'},
 {u'count': 5, u'stem': u'want', u'term': u'want'},
 {u'count': 5, u'stem': u'released', u'term': u'released'},
 {u'count': 5, u'stem': u'purchased', u'term': u'purchased'},
 {u'count': 5, u'stem': u'officials', u'term': u'officials'},
 {u'count': 5, u'stem': u'hill', u'term': u'hill'},
 {u'count': 5, u'stem': u'awards', u'term': u'awards'},
 {u'count': 5, u'stem': u'fema', u'term': u'fema'},
 {u'count': 5, u'stem': u'medical', u'term': u'medical'},
 {u'count': 5, u'stem': u'end', u'term': u'end'},
 {u'count': 5, u'stem': u'order', u'term': u'order'},
 {u'count': 5, u'stem': u'grants', u'term': u'grants'},
 {u'count': 5, u'stem': u'plans', u'term': u'plans'},
 {u'count': 5, u'stem': u'pretty', u'term': u'pretty'},
 {u'count': 5, u'stem': u'went', u'term': u'went'},
 {u'count': 5, u'stem': u'miller', u'term': u'miller'},
 {u'count': 5, u'stem': u'start', u'term': u'start'},
 {u'count': 5, u'stem': u'stores', u'term': u'stores'},
 {u'count': 5, u'stem': u'special', u'term': u'special'},
 {u'count': 5, u'stem': u'look', u'term': u'look'},
 {u'count': 5, u'stem': u'again', u'term': u'again'},
 {u'count': 5, u'stem': u'winter', u'term': u'winter'},
 {u'count': 5, u'stem': u'left', u'term': u'left'},
 {u'count': 5, u'stem': u'upon', u'term': u'upon'},
 {u'count': 5, u'stem': u'master', u'term': u'master'},
 {u'count': 5, u'stem': u'museum', u'term': u'museum'},
 {u'count': 5, u'stem': u'behavior', u'term': u'behavior'},
 {u'count': 5, u'stem': u'melrose', u'term': u'melrose'},
 {u'count': 5, u'stem': u'senate', u'term': u'senate'},
 {u'count': 5, u'stem': u'award', u'term': u'award'},
 {u'count': 5, u'stem': u'involved', u'term': u'involved'},
 {u'count': 5, u'stem': u'recent', u'term': u'recent'},
 {u'count': 5, u'stem': u'hall', u'term': u'hall'},
 {u'count': 5, u'stem': u'hard', u'term': u'hard'},
 {u'count': 5, u'stem': u'test', u'term': u'test'},
 {u'count': 5, u'stem': u'federal', u'term': u'federal'},
 {u'count': 5, u'stem': u'greater', u'term': u'greater'},
 {u'count': 5, u'stem': u'son', u'term': u'son'},
 {u'count': 5, u'stem': u'closed', u'term': u'closed'},
 {u'count': 5, u'stem': u'hancher', u'term': u'hancher'},
 {u'count': 5, u'stem': u'star', u'term': u'star'},
 {u'count': 5, u'stem': u'began', u'term': u'began'},
 {u'count': 5, u'stem': u'major', u'term': u'major'},
 {u'count': 5, u'stem': u'above', u'term': u'above'},
 {u'count': 5, u'stem': u'placed', u'term': u'placed'},
 {u'count': 5, u'stem': u'video', u'term': u'video'},
 {u'count': 5, u'stem': u'price', u'term': u'price'},
 {u'count': 5, u'stem': u'need', u'term': u'need'},
 {u'count': 5, u'stem': u'lunch', u'term': u'lunch'},
 {u'count': 5, u'stem': u'although', u'term': u'although'},
 {u'count': 5, u'stem': u'important', u'term': u'important'},
 {u'count': 5, u'stem': u'support', u'term': u'support'},
 {u'count': 5, u'stem': u'add', u'term': u'add'},
 {u'count': 5, u'stem': u'car', u'term': u'car'},
 {u'count': 5, u'stem': u'black', u'term': u'black'},
 {u'count': 5, u'stem': u'cases', u'term': u'cases'},
 {u'count': 5, u'stem': u'life', u'term': u'life'},
 {u'count': 5, u'stem': u'month', u'term': u'month'},
 {u'count': 5, u'stem': u'down', u'term': u'down'},
 {u'count': 5, u'stem': u'together', u'term': u'together'},
 {u'count': 5, u'stem': u'east', u'term': u'east'},
 {u'count': 5, u'stem': u'couldn', u'term': u'couldn'},
 {u'count': 5, u'stem': u'includes', u'term': u'includes'},
 {u'count': 5, u'stem': u'currently', u'term': u'currently'},
 {u'count': 5, u'stem': u'moore', u'term': u'moore'},
 {u'count': 5, u'stem': u'remains', u'term': u'remains'},
 {u'count': 5, u'stem': u'started', u'term': u'started'},
 {u'count': 5, u'stem': u'stories', u'term': u'stories'},
 {u'count': 5, u'stem': u'excited', u'term': u'excited'},
 {u'count': 5, u'stem': u'approved', u'term': u'approved'},
 {u'count': 5, u'stem': u'however', u'term': u'however'},
 {u'count': 5, u'stem': u'site', u'term': u'site'},
 {u'count': 5, u'stem': u'considered', u'term': u'considered'},
 {u'count': 5, u'stem': u'january', u'term': u'january'},
 {u'count': 5, u'stem': u'replacement', u'term': u'replacement'},
 {u'count': 5, u'stem': u'hold', u'term': u'hold'},
 {u'count': 5, u'stem': u'same', u'term': u'same'},
 {u'count': 5, u'stem': u'named', u'term': u'named'},
 {u'count': 5, u'stem': u'sent', u'term': u'sent'},
 {u'count': 5, u'stem': u'misdemeanor', u'term': u'misdemeanor'},
 {u'count': 5, u'stem': u'given', u'term': u'given'},
 {u'count': 5, u'stem': u'lives', u'term': u'lives'},
 {u'count': 5, u'stem': u'must', u'term': u'must'},
 {u'count': 5, u'stem': u'ready', u'term': u'ready'},
 {u'count': 5, u'stem': u'legal', u'term': u'legal'},
 {u'count': 5, u'stem': u'struggle', u'term': u'struggle'},
 {u'count': 5, u'stem': u'used', u'term': u'used'},
 {u'count': 5, u'stem': u'tickets', u'term': u'tickets'},
 {u'count': 5, u'stem': u'small', u'term': u'small'},
 {u'count': 5, u'stem': u'hospitals', u'term': u'hospitals'},
 {u'count': 5, u'stem': u'floor', u'term': u'floor'}]

In [7]:
# Display the stored word counts (wc_old is bound from media[0]['word_counts']
# in the cell above) for comparison with the freshly fetched wc_new.
# NOTE(review): this cell's execution count (7) is lower than that of the cell
# that defines wc_old (13), so the notebook was run out of order -- confirm it
# still works with Restart Kernel -> Run All.
wc_old


Out[7]:
[{u'count': 179, u'stem': u'iowa', u'term': u'iowa'},
 {u'count': 55, u'stem': u'student', u'term': u'students'},
 {u'count': 31, u'stem': u'johnson', u'term': u'johnson'},
 {u'count': 23, u'stem': u'coralvill', u'term': u'coralville'},
 {u'count': 20, u'stem': u'colleg', u'term': u'college'},
 {u'count': 19, u'stem': u'famili', u'term': u'family'},
 {u'count': 18, u'stem': u'tax', u'term': u'tax'},
 {u'count': 15, u'stem': u'children', u'term': u'children'},
 {u'count': 12, u'stem': u'donat', u'term': u'donations'},
 {u'count': 12, u'stem': u'teacher', u'term': u'teacher'},
 {u'count': 12, u'stem': u'washington', u'term': u'washington'},
 {u'count': 11, u'stem': u'alleg', u'term': u'allegedly'},
 {u'count': 11, u'stem': u'flood', u'term': u'flood'},
 {u'count': 11, u'stem': u'educ', u'term': u'education'},
 {u'count': 10, u'stem': u'studi', u'term': u'studies'},
 {u'count': 10, u'stem': u'republican', u'term': u'republican'},
 {u'count': 10, u'stem': u'free', u'term': u'free'},
 {u'count': 10, u'stem': u'elementari', u'term': u'elementary'},
 {u'count': 9, u'stem': u'war', u'term': u'war'},
 {u'count': 9, u'stem': u'space', u'term': u'space'},
 {u'count': 8, u'stem': u'obama', u'term': u'obama'},
 {u'count': 8, u'stem': u'assault', u'term': u'assault'},
 {u'count': 8, u'stem': u'american', u'term': u'american'},
 {u'count': 8, u'stem': u'dubuqu', u'term': u'dubuque'},
 {u'count': 7, u'stem': u'rick', u'term': u'rick'},
 {u'count': 7, u'stem': u'democrat', u'term': u'democratic'},
 {u'count': 7, u'stem': u'professor', u'term': u'professor'},
 {u'count': 7, u'stem': u'caucus', u'term': u'caucus'},
 {u'count': 7, u'stem': u'thompson', u'term': u'thompson'},
 {u'count': 6, u'stem': u'hawkey', u'term': u'hawkeye'},
 {u'count': 6, u'stem': u'lane', u'term': u'lane'},
 {u'count': 6, u'stem': u'hospit', u'term': u'hospital'},
 {u'count': 6, u'stem': u'photo', u'term': u'photo'},
 {u'count': 6, u'stem': u'love', u'term': u'love'},
 {u'count': 6, u'stem': u'museum', u'term': u'museum'},
 {u'count': 6, u'stem': u'branstad', u'term': u'branstad'},
 {u'count': 6, u'stem': u'athlet', u'term': u'athletics'},
 {u'count': 6, u'stem': u'mall', u'term': u'mall'},
 {u'count': 6, u'stem': u'avenu', u'term': u'avenue'},
 {u'count': 6, u'stem': u'regent', u'term': u'regents'},
 {u'count': 5, u'stem': u'chicago', u'term': u'chicago'},
 {u'count': 5, u'stem': u'renew', u'term': u'renewable'},
 {u'count': 5, u'stem': u'teach', u'term': u'teaching'},
 {u'count': 5, u'stem': u'ticket', u'term': u'ticket'},
 {u'count': 5, u'stem': u'fundrais', u'term': u'fundraiser'},
 {u'count': 5, u'stem': u'wrestl', u'term': u'wrestling'},
 {u'count': 5, u'stem': u'alcohol', u'term': u'alcohol'},
 {u'count': 5, u'stem': u'highway', u'term': u'highway'},
 {u'count': 5, u'stem': u'gilbert', u'term': u'gilbert'},
 {u'count': 5, u'stem': u'complaint', u'term': u'complaints'},
 {u'count': 5, u'stem': u'miller', u'term': u'miller'},
 {u'count': 5, u'stem': u'tom', u'term': u'tom'},
 {u'count': 5, u'stem': u'robert', u'term': u'robert'},
 {u'count': 5, u'stem': u'advoc', u'term': u'advocates'},
 {u'count': 5, u'stem': u'non', u'term': u'non'},
 {u'count': 5, u'stem': u'plaza', u'term': u'plaza'},
 {u'count': 5, u'stem': u'quartet', u'term': u'quartet'},
 {u'count': 5, u'stem': u'holiday', u'term': u'holiday'},
 {u'count': 5, u'stem': u'sept', u'term': u'sept'},
 {u'count': 5, u'stem': u'pedestrian', u'term': u'pedestrian'},
 {u'count': 5, u'stem': u'plummer', u'term': u'plummer'},
 {u'count': 5, u'stem': u'choke', u'term': u'choke'},
 {u'count': 5, u'stem': u'burlington', u'term': u'burlington'},
 {u'count': 4, u'stem': u'retail', u'term': u'retailer'},
 {u'count': 4, u'stem': u'ridg', u'term': u'ridge'},
 {u'count': 4, u'stem': u'physician', u'term': u'physicians'},
 {u'count': 4, u'stem': u'conserv', u'term': u'conservation'},
 {u'count': 4, u'stem': u'gut', u'term': u'gut'},
 {u'count': 4, u'stem': u'crimin', u'term': u'criminal'},
 {u'count': 4, u'stem': u'van', u'term': u'van'},
 {u'count': 4, u'stem': u'marijuana', u'term': u'marijuana'},
 {u'count': 4, u'stem': u'loebsack', u'term': u'loebsack'},
 {u'count': 4, u'stem': u'moin', u'term': u'moines'},
 {u'count': 4, u'stem': u'laura', u'term': u'laura'},
 {u'count': 4, u'stem': u'coordin', u'term': u'coordinator'},
 {u'count': 4, u'stem': u'supervisor', u'term': u'supervisors'},
 {u'count': 4, u'stem': u'tiffin', u'term': u'tiffin'},
 {u'count': 4, u'stem': u'trail', u'term': u'trail'},
 {u'count': 4, u'stem': u'dan', u'term': u'dan'},
 {u'count': 4, u'stem': u'bachelor', u'term': u'bachelor'},
 {u'count': 4, u'stem': u'child', u'term': u'child'},
 {u'count': 4, u'stem': u'econom', u'term': u'economic'},
 {u'count': 4, u'stem': u'sycamor', u'term': u'sycamore'},
 {u'count': 4, u'stem': u'cedar', u'term': u'cedar'},
 {u'count': 4, u'stem': u'nathan', u'term': u'nathan'},
 {u'count': 4, u'stem': u'campus', u'term': u'campus'},
 {u'count': 4, u'stem': u'linn', u'term': u'linn'},
 {u'count': 4, u'stem': u'dodg', u'term': u'dodge'},
 {u'count': 4, u'stem': u'campaign', u'term': u'campaign'},
 {u'count': 4, u'stem': u'steve', u'term': u'steve'},
 {u'count': 4, u'stem': u'des', u'term': u'des'},
 {u'count': 4, u'stem': u'william', u'term': u'williams'},
 {u'count': 4, u'stem': u'reward', u'term': u'reward'},
 {u'count': 4, u'stem': u'casino', u'term': u'casino'},
 {u'count': 4, u'stem': u'roosevelt', u'term': u'roosevelt'},
 {u'count': 4, u'stem': u'nurs', u'term': u'nursing'},
 {u'count': 4, u'stem': u'winner', u'term': u'winners'},
 {u'count': 4, u'stem': u'iowan', u'term': u'iowans'},
 {u'count': 4, u'stem': u'krei', u'term': u'krei'},
 {u'count': 4, u'stem': u'cancer', u'term': u'cancer'},
 {u'count': 4, u'stem': u'commiss', u'term': u'commission'},
 {u'count': 4, u'stem': u'agreement', u'term': u'agreement'},
 {u'count': 4, u'stem': u'foundat', u'term': u'foundation'},
 {u'count': 4, u'stem': u'christoph', u'term': u'christopher'},
 {u'count': 4, u'stem': u'hart', u'term': u'hart'},
 {u'count': 4, u'stem': u'attende', u'term': u'attendees'},
 {u'count': 4, u'stem': u'increment', u'term': u'increment'},
 {u'count': 4, u'stem': u'michael', u'term': u'michael'},
 {u'count': 3, u'stem': u'cinema', u'term': u'cinema'},
 {u'count': 3, u'stem': u'collabor', u'term': u'collaborated'},
 {u'count': 3, u'stem': u'mural', u'term': u'mural'},
 {u'count': 3, u'stem': u'smith', u'term': u'smith'},
 {u'count': 3, u'stem': u'yoga', u'term': u'yoga'},
 {u'count': 3, u'stem': u'interst', u'term': u'interstate'},
 {u'count': 3, u'stem': u'mason', u'term': u'mason'},
 {u'count': 3, u'stem': u'tap', u'term': u'tap'},
 {u'count': 3, u'stem': u'celebr', u'term': u'celebrate'},
 {u'count': 3, u'stem': u'ceremoni', u'term': u'ceremony'},
 {u'count': 3, u'stem': u'discov', u'term': u'discovered'},
 {u'count': 3, u'stem': u'david', u'term': u'david'},
 {u'count': 3, u'stem': u'crimestopp', u'term': u'crimestoppers'},
 {u'count': 3, u'stem': u'presidenti', u'term': u'presidential'},
 {u'count': 3, u'stem': u'armi', u'term': u'army'},
 {u'count': 3, u'stem': u'mom', u'term': u'mom'},
 {u'count': 3, u'stem': u'prairi', u'term': u'prairie'},
 {u'count': 3, u'stem': u'unfair', u'term': u'unfairly'},
 {u'count': 3, u'stem': u'wrap', u'term': u'wrapping'},
 {u'count': 3, u'stem': u'hawk', u'term': u'hawk'},
 {u'count': 3, u'stem': u'johnston', u'term': u'johnston'},
 {u'count': 3, u'stem': u'deputi', u'term': u'deputy'},
 {u'count': 3, u'stem': u'strive', u'term': u'strive'},
 {u'count': 3, u'stem': u'martin', u'term': u'martin'},
 {u'count': 3, u'stem': u'prohibit', u'term': u'prohibition'},
 {u'count': 3, u'stem': u'massachusett', u'term': u'massachusetts'},
 {u'count': 3, u'stem': u'mari', u'term': u'mary'},
 {u'count': 3, u'stem': u'crisi', u'term': u'crisis'},
 {u'count': 3, u'stem': u'reconstruct', u'term': u'reconstruction'},
 {u'count': 3, u'stem': u'classroom', u'term': u'classrooms'},
 {u'count': 3, u'stem': u'shuttl', u'term': u'shuttle'},
 {u'count': 3, u'stem': u'provost', u'term': u'provost'},
 {u'count': 3, u'stem': u'solon', u'term': u'solon'},
 {u'count': 3, u'stem': u'toy', u'term': u'toys'},
 {u'count': 3, u'stem': u'couldn', u'term': u'couldn'},
 {u'count': 3, u'stem': u'romney', u'term': u'romney'},
 {u'count': 3, u'stem': u'implement', u'term': u'implement'},
 {u'count': 3, u'stem': u'jeff', u'term': u'jeff'},
 {u'count': 3, u'stem': u'nomin', u'term': u'nomination'},
 {u'count': 3, u'stem': u'dolan', u'term': u'dolan'},
 {u'count': 3, u'stem': u'brian', u'term': u'brian'},
 {u'count': 3, u'stem': u'ensur', u'term': u'ensure'},
 {u'count': 3, u'stem': u'harri', u'term': u'harry'},
 {u'count': 3, u'stem': u'waterfront', u'term': u'waterfront'},
 {u'count': 3, u'stem': u'gop', u'term': u'gop'},
 {u'count': 3, u'stem': u'par', u'term': u'par'},
 {u'count': 3, u'stem': u'org', u'term': u'org'},
 {u'count': 3, u'stem': u'pro', u'term': u'pro'},
 {u'count': 3, u'stem': u'funer', u'term': u'funeral'},
 {u'count': 3, u'stem': u'unemploy', u'term': u'unemployment'},
 {u'count': 3, u'stem': u'lawsuit', u'term': u'lawsuit'},
 {u'count': 3, u'stem': u'vee', u'term': u'vee'},
 {u'count': 3, u'stem': u'joe', u'term': u'joe'},
 {u'count': 3, u'stem': u'exceed', u'term': u'exceeded'},
 {u'count': 3, u'stem': u'refresh', u'term': u'refreshments'},
 {u'count': 3, u'stem': u'jim', u'term': u'jim'},
 {u'count': 3, u'stem': u'victim', u'term': u'victim'},
 {u'count': 3, u'stem': u'jecc', u'term': u'jecc'},
 {u'count': 3, u'stem': u'journal', u'term': u'journalism'},
 {u'count': 3, u'stem': u'firearm', u'term': u'firearm'},
 {u'count': 3, u'stem': u'john', u'term': u'john'},
 {u'count': 3, u'stem': u'coral', u'term': u'coral'},
 {u'count': 3, u'stem': u'meyer', u'term': u'meyer'},
 {u'count': 3, u'stem': u'prioriti', u'term': u'priority'},
 {u'count': 3, u'stem': u'mitt', u'term': u'mitt'},
 {u'count': 3, u'stem': u'agenda', u'term': u'agenda'},
 {u'count': 3, u'stem': u'tim', u'term': u'tim'},
 {u'count': 3, u'stem': u'longfellow', u'term': u'longfellow'},
 {u'count': 3, u'stem': u'slockett', u'term': u'slockett'}]

In [15]:
# Re-query Media Cloud for every source's word counts (one API call per source).
# `shuffle` was not imported anywhere in the notebook, so this cell failed on a fresh kernel.
# NOTE(review): assumes the in-place stdlib random.shuffle was intended -- confirm.
from random import shuffle

shuffle(media)
for media_source in media:
    # Attach the raw API result (a list of {count, stem, term} dicts) to the record.
    media_source['word_counts'] = mc.wordCount( get_wc_query( media_source['media_id'] ) )

In [16]:
# Convert each source's raw word counts into relative frequencies keyed by term.
for media_source in media:
    source_word_counts = media_source['word_counts']
    # Float total so the division below is true division under Python 2.
    total_words = float( sum( entry['count'] for entry in source_word_counts ) )

    scaled_counts = {}
    for entry in source_word_counts:
        # If a term repeats, the last occurrence wins.
        scaled_counts[ entry['term'] ] = entry['count'] / total_words
    media_source['scaled_counts'] = scaled_counts

In [17]:
# Inspect which fields the first media record carries.
media[0].keys()


Out[17]:
['Partisan code', 'scaled_counts', 'word_counts', 'media_id', 'Media_url']

In [21]:
# Build a presence-only feature dict: every term seen for a source maps to True.
for media_source in media:
    words_present = dict.fromkeys( media_source['scaled_counts'], True )
    media_source['words_present'] = words_present

In [22]:
# Cache the enriched records on disk so later sessions can skip the API queries.
# The `with` block guarantees the file is flushed and closed once the dump finishes.
with open( os.path.expanduser( '~/Dropbox/mc/media.pickle' ), 'wb' ) as media_pickle:
    cPickle.dump( media, media_pickle )

Start here (for example, after restarting the kernel) to reload the pickled data instead of re-querying the API.


In [1]:
import cPickle
import os.path

# Reload the cached media records. The file is opened in binary mode to match the 'wb'
# mode it is written with; text mode can corrupt pickles on platforms that translate newlines.
# NOTE(review): unpickling can execute arbitrary code -- only load files you created yourself.
with open( os.path.expanduser( '~/Dropbox/mc/media.pickle' ), 'rb' ) as media_pickle:
    media = cPickle.load( media_pickle )

In [2]:
def get_media_with_valid_partisan_codes( media ):
    """Normalise partisan codes in place and keep only the recognised ones.

    Each record's ``'Partisan code'`` is stripped of surrounding whitespace and
    lower-cased; the record dicts themselves are modified. The returned list
    holds those same dict objects, in their original order, whose normalised
    code is one of the five accepted labels.
    """
    accepted_codes = frozenset( (
        'liberal',
        'conservative',
        'libertarian',
        'none of the above',
        'not valid (broken link, not in english, etc.)',
    ) )

    # First pass: rewrite every record's code, including records dropped below.
    for record in media:
        raw_code = record['Partisan code']
        record['Partisan code'] = raw_code.strip().lower()

    # Second pass: retain only the records whose code is recognised.
    kept = []
    for record in media:
        if record['Partisan code'] in accepted_codes:
            kept.append( record )
    return kept

In [3]:
# Print the record count before and after filtering on the partisan code.
print len (media)
media = get_media_with_valid_partisan_codes( media )
print len (media )


1359
1359

Get featurized X, Y


In [4]:
from sklearn.feature_extraction import DictVectorizer
# Module-level vectorizer: get_X_Y() fits it, so it stays fitted after that call.
vec = DictVectorizer()
from sklearn import preprocessing
# Module-level label encoder: get_X_Y() fits it; a later cell calls le.inverse_transform().
le = preprocessing.LabelEncoder()
# NOTE(review): duplicate of the import two lines up -- harmless but redundant.
from sklearn import preprocessing
from collections import Counter

def get_X_Y( media ):
    """Turn media records into a dense feature matrix and encoded labels.

    Fits the module-level ``vec`` on each record's ``'words_present'`` dict and
    the module-level ``le`` on each record's ``'Partisan code'``, so both
    objects remain fitted after the call.

    Returns a tuple ``(X, Y)``: ``X`` is the dense array produced by
    ``vec.fit_transform(...).toarray()`` with one row per record, and ``Y`` is
    the array of encoded labels produced by ``le``, in the same record order.
    """
    feature_field = 'words_present'

    feature_dicts = [ media_source[ feature_field ] for media_source in media ]
    # NOTE(review): the matrix is densified here; kept because callers use len(X),
    # but this may be memory-hungry if the vocabulary is large.
    X = vec.fit_transform( feature_dicts ).toarray()

    partisan_codes = [ media_source[ 'Partisan code' ] for media_source in media ]
    Y = le.fit_transform( partisan_codes )

    return X, Y

In [5]:
# Build the feature matrix and label vector, then print the length of each.
X, Y = get_X_Y( media )
print len(X)
print len(Y)


1359
1359

In [7]:
from sklearn.cross_validation import train_test_split
# Hold out 25% of the records for testing; random_state is fixed at 33.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=33)

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score, KFold
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
import sklearn.metrics
from sklearn.metrics import confusion_matrix
    
clfs = [ MultinomialNB(), svm.SVC( probability=True, kernel = 'rbf'), LinearSVC(), SGDClassifier()  ] 
for clf in clfs:
    print clf
    clf.fit( X_train, y_train)
    y_pred = clf.predict( X_train)
    accuracy = sklearn.metrics.accuracy_score( y_train, y_pred )
    print accuracy
    
    print "calculating confusion matrix"
    cm = confusion_matrix( y_train, y_pred)
    print "confusion matrix"
    print cm
    

    #cv = KFold( len(X), 5, shuffle=True, random_state=0 )
    #scores = cross_val_score( LinearSVC(), X, Y, cv=cv)
    #print scores


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
0.857703631011
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-9-29251be541d4> in <module>()
     10 for clf in clfs:
     11     print clf
---> 12     clf.fit( X_train, y_train)
     13     y_pred = clf.predict( X_train)
     14     accuracy = sklearn.metrics.accuracy_score( y_train, y_pred )

/usr/local/lib/python2.7/dist-packages/sklearn/svm/base.pyc in fit(self, X, y, sample_weight)
    174 
    175         seed = rnd.randint(np.iinfo('i').max)
--> 176         fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
    177         # see comment on the other call to np.iinfo in this file
    178 

/usr/local/lib/python2.7/dist-packages/sklearn/svm/base.pyc in _dense_fit(self, X, y, sample_weight, solver_type, kernel, random_seed)
    229                 cache_size=self.cache_size, coef0=self.coef0,
    230                 gamma=self._gamma, epsilon=self.epsilon,
--> 231                 max_iter=self.max_iter, random_seed=random_seed)
    232 
    233         self._warn_from_fit_status()

KeyboardInterrupt: 
[[193   0   0  32   1]
 [  1 213   0  28   0]
 [  3   3   5  13   0]
 [  6   0   1 439   2]
 [  8   8   0  39  24]]
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [6]:
partisan_codes = [ media_source[ 'Partisan code' ] for media_source in media]
from collections import Counter
c = Counter( partisan_codes )
print set(c.keys())
c


set(['libertarian', 'liberal', 'not valid (broken link, not in english, etc.)', 'none of the above', 'conservative'])
Out[6]:
Counter({'none of the above': 591, 'liberal': 328, 'conservative': 301, 'not valid (broken link, not in english, etc.)': 104, 'libertarian': 35})

In [12]:


In [106]:
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
# Fit an RBF-kernel SVC on the first ten rows only.
# The earlier `clf = LinearSVC()` assignment was dropped: it was overwritten before use.
clf = svm.SVC( probability=True, kernel = 'rbf')
clf.fit( X[:10], Y[:10] )


Out[106]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [107]:
#print clf.predict_proba( X[:10])
# Print the predictions for the first ten rows, then their true labels for comparison.
print clf.predict( X[:10])
print Y[:10]


[1 1 1 1 1 1 1 1 1 1]
[4 0 1 0 1 3 0 1 1 3]

In [97]:
from sklearn.linear_model import SGDClassifier
# Fit a default SGDClassifier on the first ten rows and display its coefficient matrix.
clf = SGDClassifier()
clf.fit(X[:10], Y[:10]) 
clf.coef_


Out[97]:
array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [77]:
# Predict a label for every record with whichever classifier `clf` currently holds.
# The bare `Y` expression was removed: it was not the cell's last line, so it displayed nothing.
Y_pred = clf.predict( X )

In [80]:
import sklearn.metrics
# Accuracy of Y_pred against Y over all records.
sklearn.metrics.accuracy_score( Y, Y_pred )


Out[80]:
0.43487858719646799

In [69]:
from sklearn.metrics import confusion_matrix
# Confusion matrix over all records: rows are true labels, columns are predicted labels.
cm = confusion_matrix( Y, Y_pred)
print cm
print Y_pred
print Y
# Compare the true label distribution with the predicted one.
# NOTE(review): Counter is not imported in this cell; it relies on an earlier cell's import.
print Counter( Y )
print Counter( Y_pred )


[[  0   0   0 301   0]
 [  0   0   0 328   0]
 [  0   0   0  35   0]
 [  0   0   0 591   0]
 [  0   0   0 104   0]]
[3 3 3 ..., 3 3 3]
[4 0 1 ..., 3 3 3]
Counter({3: 591, 1: 328, 0: 301, 4: 104, 2: 35})
Counter({3: 1359})

In [76]:
# Map the encoded labels in Y back to their partisan-code strings.
le.inverse_transform( Y )


Out[76]:
array(['Conservative', 'None of the above', 'None of the above', ...,
       'Not Valid (broken link, not in English, etc.)', 'Liberal',
       'None of the above'], 
      dtype='<S53')

In [11]: