mediacloud_heuristic_readability_comparison



In [1]:
import cPickle
import os.path

# Load the Media Cloud API key from a pickle stored in the user's home directory.
# The file is opened in binary mode, matching the 'wb' mode it is written with, and
# through a context manager so the handle is closed as soon as the key is read.
# NOTE(review): unpickling can execute arbitrary code -- only load a file you created.
with open( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'rb' ) as key_file:
    api_key = cPickle.load( key_file )

In [2]:
import cPickle
import os.path

# Persist the API key held in `api_key` to a pickle in the user's home directory.
# The context manager guarantees the file is flushed and closed once the dump
# finishes, instead of relying on garbage collection of an anonymous handle.
with open( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'wb' ) as key_file:
    cPickle.dump( api_key, key_file )

In [3]:
import mediacloud, json
import os

# Key used for the two local API servers. It can be supplied through the
# MEDIACLOUD_LOCAL_API_KEY environment variable; the literal is kept only as a
# fallback so existing runs keep working.
# NOTE(review): a key committed inside a notebook should be treated as exposed --
# rotate it and drop the fallback once the environment variable is in use.
localkey = os.environ.get( 'MEDIACLOUD_LOCAL_API_KEY',
                           'f66a50230d54afaf18822808aed649f1d6ca72b08fb06d5efb6247afe9fbae52' )

# Two clients sharing the local key; each is pointed at its own server in the next cell.
mc_heuristic = mediacloud.api.MediaCloud( localkey )
mc_readability = mediacloud.api.MediaCloud( localkey )

# Client authenticated with the key loaded from ~/mediacloud_api_key.pickle.
mc_chloe = mediacloud.api.MediaCloud( api_key )

In [4]:
# Point each client at its own server by overriding the API base URL on the instance.
# The two local-key clients talk to servers on localhost (ports 7000 and 8000);
# mc_chloe talks to the public api.mediacloud.org endpoint.
mc_heuristic.V2_API_URL = 'http://localhost:7000/api/v2/'
mc_readability.V2_API_URL = 'http://localhost:8000/api/v2/'
mc_chloe.V2_API_URL = "https://api.mediacloud.org/api/v2/"

In [9]:
# Quick calls against the configured clients. Only the value of the last expression
# is rendered as the cell output; the results of the first two calls are discarded.
mc_chloe.mediaList()
mc_chloe.wordCount     ( solr_query='tags_id_media:2453107' )
mc_heuristic.wordCount( solr_query='tags_id_media:125' )


Out[9]:
[{u'count': 43, u'stem': u'wilson', u'term': u'wilson'},
 {u'count': 33, u'stem': u'democrat', u'term': u'democrats'},
 {u'count': 25, u'stem': u'juri', u'term': u'jury'},
 {u'count': 22, u'stem': u'american', u'term': u'american'},
 {u'count': 20, u'stem': u'obama', u'term': u'obama'},
 {u'count': 18, u'stem': u'ferguson', u'term': u'ferguson'},
 {u'count': 16, u'stem': u'republican', u'term': u'republicans'},
 {u'count': 15, u'stem': u'prosecutor', u'term': u'prosecutor'},
 {u'count': 14, u'stem': u'tax', u'term': u'tax'},
 {u'count': 13, u'stem': u'america', u'term': u'america'},
 {u'count': 13, u'stem': u'famili', u'term': u'family'},
 {u'count': 10, u'stem': u'immigr', u'term': u'immigration'},
 {u'count': 10, u'stem': u'war', u'term': u'war'},
 {u'count': 10, u'stem': u'campaign', u'term': u'campaign'},
 {u'count': 10, u'stem': u'wasn', u'term': u'wasn'},
 {u'count': 10, u'stem': u'thanksgiv', u'term': u'thanksgiving'},
 {u'count': 10, u'stem': u'michael', u'term': u'michael'},
 {u'count': 9, u'stem': u'student', u'term': u'students'},
 {u'count': 9, u'stem': u'darren', u'term': u'darren'},
 {u'count': 9, u'stem': u'cop', u'term': u'cops'},
 {u'count': 9, u'stem': u'indict', u'term': u'indict'},
 {u'count': 9, u'stem': u'holiday', u'term': u'holiday'},
 {u'count': 9, u'stem': u'true', u'term': u'true'},
 {u'count': 9, u'stem': u'mcculloch', u'term': u'mcculloch'},
 {u'count': 8, u'stem': u'texa', u'term': u'texas'},
 {u'count': 8, u'stem': u'econom', u'term': u'economic'},
 {u'count': 8, u'stem': u'mike', u'term': u'mike'},
 {u'count': 7, u'stem': u'jay', u'term': u'jay'},
 {u'count': 7, u'stem': u'global', u'term': u'global'},
 {u'count': 7, u'stem': u'economi', u'term': u'economy'},
 {u'count': 7, u'stem': u'sanction', u'term': u'sanctions'},
 {u'count': 7, u'stem': u'free', u'term': u'free'},
 {u'count': 7, u'stem': u'loui', u'term': u'louis'},
 {u'count': 6, u'stem': u'frustrat', u'term': u'frustration'},
 {u'count': 6, u'stem': u'riot', u'term': u'rioting'},
 {u'count': 6, u'stem': u'media', u'term': u'media'},
 {u'count': 6, u'stem': u'ohio', u'term': u'ohio'},
 {u'count': 6, u'stem': u'washington', u'term': u'washington'},
 {u'count': 6, u'stem': u'crime', u'term': u'crime'},
 {u'count': 5, u'stem': u'african', u'term': u'african'},
 {u'count': 5, u'stem': u'child', u'term': u'child'},
 {u'count': 5, u'stem': u'ban', u'term': u'ban'},
 {u'count': 5, u'stem': u'gotta', u'term': u'gotta'},
 {u'count': 5, u'stem': u'valu', u'term': u'value'},
 {u'count': 5, u'stem': u'gruber', u'term': u'gruber'},
 {u'count': 5, u'stem': u'missouri', u'term': u'missouri'},
 {u'count': 5, u'stem': u'hagel', u'term': u'hagel'},
 {u'count': 5, u'stem': u'reagan', u'term': u'reagan'},
 {u'count': 4, u'stem': u'celebr', u'term': u'celebrating'},
 {u'count': 4, u'stem': u'gov', u'term': u'gov'},
 {u'count': 4, u'stem': u'european', u'term': u'european'},
 {u'count': 4, u'stem': u'gdp', u'term': u'gdp'},
 {u'count': 4, u'stem': u'conserv', u'term': u'conservative'},
 {u'count': 4, u'stem': u'crimin', u'term': u'criminal'},
 {u'count': 4, u'stem': u'trayvon', u'term': u'trayvon'},
 {u'count': 4, u'stem': u'arm', u'term': u'armed'},
 {u'count': 4, u'stem': u'alcohol', u'term': u'alcohol'},
 {u'count': 4, u'stem': u'violent', u'term': u'violent'},
 {u'count': 4, u'stem': u'intellectu', u'term': u'intellectually'},
 {u'count': 4, u'stem': u'gop', u'term': u'gop'},
 {u'count': 4, u'stem': u'twitter', u'term': u'twitter'},
 {u'count': 4, u'stem': u'georg', u'term': u'george'},
 {u'count': 4, u'stem': u'fact', u'term': u'fact'},
 {u'count': 4, u'stem': u'victim', u'term': u'victims'},
 {u'count': 4, u'stem': u'john', u'term': u'john'},
 {u'count': 4, u'stem': u'victori', u'term': u'victory'},
 {u'count': 4, u'stem': u'infrastructur', u'term': u'infrastructure'},
 {u'count': 4, u'stem': u'fals', u'term': u'false'},
 {u'count': 4, u'stem': u'labor', u'term': u'labor'},
 {u'count': 4, u'stem': u'non', u'term': u'non'},
 {u'count': 4, u'stem': u'corpor', u'term': u'corporate'},
 {u'count': 4, u'stem': u'diari', u'term': u'diaries'},
 {u'count': 4, u'stem': u'resign', u'term': u'resigned'},
 {u'count': 4, u'stem': u'bias', u'term': u'bias'},
 {u'count': 4, u'stem': u'walmart', u'term': u'walmart'},
 {u'count': 4, u'stem': u'injustic', u'term': u'injustice'},
 {u'count': 4, u'stem': u'educ', u'term': u'education'},
 {u'count': 4, u'stem': u'lee', u'term': u'lee'},
 {u'count': 4, u'stem': u'potato', u'term': u'potato'},
 {u'count': 3, u'stem': u'apolog', u'term': u'apologize'},
 {u'count': 3, u'stem': u'fuck', u'term': u'fuck'},
 {u'count': 3, u'stem': u'deserv', u'term': u'deserved'},
 {u'count': 3, u'stem': u'joseph', u'term': u'joseph'},
 {u'count': 3, u'stem': u'manipul', u'term': u'manipulated'},
 {u'count': 3, u'stem': u'disgust', u'term': u'disgusting'},
 {u'count': 3, u'stem': u'turkey', u'term': u'turkey'},
 {u'count': 3, u'stem': u'poison', u'term': u'poisoning'},
 {u'count': 3, u'stem': u'inequ', u'term': u'inequality'},
 {u'count': 3, u'stem': u'mom', u'term': u'mom'},
 {u'count': 3, u'stem': u'carolina', u'term': u'carolina'},
 {u'count': 3, u'stem': u'christian', u'term': u'christians'},
 {u'count': 3, u'stem': u'donat', u'term': u'donations'},
 {u'count': 3, u'stem': u'crisi', u'term': u'crisis'},
 {u'count': 3, u'stem': u'rice', u'term': u'rice'},
 {u'count': 3, u'stem': u'nixon', u'term': u'nixon'},
 {u'count': 3, u'stem': u'illeg', u'term': u'illegally'},
 {u'count': 3, u'stem': u'solar', u'term': u'solar'},
 {u'count': 3, u'stem': u'solut', u'term': u'solution'},
 {u'count': 3, u'stem': u'congress', u'term': u'congress'},
 {u'count': 3, u'stem': u'vietnam', u'term': u'vietnam'},
 {u'count': 3, u'stem': u'unemploy', u'term': u'unemployment'},
 {u'count': 3, u'stem': u'joe', u'term': u'joe'},
 {u'count': 3, u'stem': u'franc', u'term': u'france'},
 {u'count': 3, u'stem': u'wealth', u'term': u'wealth'},
 {u'count': 3, u'stem': u'outrag', u'term': u'outrage'},
 {u'count': 3, u'stem': u'violenc', u'term': u'violence'},
 {u'count': 3, u'stem': u'hillari', u'term': u'hillary'},
 {u'count': 3, u'stem': u'imax', u'term': u'imax'},
 {u'count': 3, u'stem': u'coward', u'term': u'cowardly'},
 {u'count': 3, u'stem': u'cnn', u'term': u'cnn'},
 {u'count': 3, u'stem': u'russel', u'term': u'russell'},
 {u'count': 3, u'stem': u'inject', u'term': u'injection'},
 {u'count': 3, u'stem': u'walt', u'term': u'walt'},
 {u'count': 3, u'stem': u'gap', u'term': u'gap'},
 {u'count': 3, u'stem': u'journalist', u'term': u'journalists'},
 {u'count': 3, u'stem': u'anymor', u'term': u'anymore'},
 {u'count': 3, u'stem': u'weaken', u'term': u'weakened'},
 {u'count': 3, u'stem': u'revel', u'term': u'revelation'},
 {u'count': 3, u'stem': u'georgia', u'term': u'georgia'},
 {u'count': 3, u'stem': u'seattl', u'term': u'seattle'},
 {u'count': 3, u'stem': u'commiss', u'term': u'commission'},
 {u'count': 3, u'stem': u'mortgag', u'term': u'mortgage'},
 {u'count': 3, u'stem': u'workplac', u'term': u'workplaces'},
 {u'count': 3, u'stem': u'jon', u'term': u'jon'},
 {u'count': 3, u'stem': u'latino', u'term': u'latino'},
 {u'count': 3, u'stem': u'internet', u'term': u'internet'},
 {u'count': 3, u'stem': u'everyday', u'term': u'everyday'},
 {u'count': 3, u'stem': u'medicar', u'term': u'medicare'},
 {u'count': 3, u'stem': u'loot', u'term': u'looting'},
 {u'count': 3, u'stem': u'cbo', u'term': u'cbo'},
 {u'count': 3, u'stem': u'elev', u'term': u'elevate'},
 {u'count': 3, u'stem': u'johnson', u'term': u'johnson'},
 {u'count': 3, u'stem': u'europ', u'term': u'europe'},
 {u'count': 3, u'stem': u'rep', u'term': u'rep'},
 {u'count': 3, u'stem': u'arbitr', u'term': u'arbitrate'},
 {u'count': 3, u'stem': u'groceri', u'term': u'grocery'},
 {u'count': 2, u'stem': u'colleg', u'term': u'college'},
 {u'count': 2, u'stem': u'devast', u'term': u'devastating'},
 {u'count': 2, u'stem': u'bless', u'term': u'bless'},
 {u'count': 2, u'stem': u'retail', u'term': u'retailers'},
 {u'count': 2, u'stem': u'trigger', u'term': u'trigger'},
 {u'count': 2, u'stem': u'ron', u'term': u'ron'},
 {u'count': 2, u'stem': u'candl', u'term': u'candles'},
 {u'count': 2, u'stem': u'invas', u'term': u'invasion'},
 {u'count': 2, u'stem': u'smith', u'term': u'smith'},
 {u'count': 2, u'stem': u'option', u'term': u'options'},
 {u'count': 2, u'stem': u'virginia', u'term': u'virginia'},
 {u'count': 2, u'stem': u'unpopular', u'term': u'unpopular'},
 {u'count': 2, u'stem': u'houston', u'term': u'houston'},
 {u'count': 2, u'stem': u'manslaught', u'term': u'manslaughter'},
 {u'count': 2, u'stem': u'legislatur', u'term': u'legislature'},
 {u'count': 2, u'stem': u'klein', u'term': u'klein'},
 {u'count': 2, u'stem': u'china', u'term': u'china'},
 {u'count': 2, u'stem': u'chicago', u'term': u'chicago'},
 {u'count': 2, u'stem': u'reven', u'term': u'revenant'},
 {u'count': 2, u'stem': u'cartoon', u'term': u'cartoon'},
 {u'count': 2, u'stem': u'etc', u'term': u'etc'},
 {u'count': 2, u'stem': u'rage', u'term': u'rage'},
 {u'count': 2, u'stem': u'lien', u'term': u'lien'},
 {u'count': 2, u'stem': u'salad', u'term': u'salad'},
 {u'count': 2, u'stem': u'teach', u'term': u'teaching'},
 {u'count': 2, u'stem': u'david', u'term': u'david'},
 {u'count': 2, u'stem': u'amnesti', u'term': u'amnesty'},
 {u'count': 2, u'stem': u'phoenix', u'term': u'phoenix'},
 {u'count': 2, u'stem': u'essay', u'term': u'essays'},
 {u'count': 2, u'stem': u'halloween', u'term': u'halloween'},
 {u'count': 2, u'stem': u'fundament', u'term': u'fundamental'},
 {u'count': 2, u'stem': u'peak', u'term': u'peaked'},
 {u'count': 2, u'stem': u'aerosol', u'term': u'aerosol'},
 {u'count': 2, u'stem': u'misguid', u'term': u'misguided'},
 {u'count': 2, u'stem': u'kick', u'term': u'kick'},
 {u'count': 2, u'stem': u'kos', u'term': u'kos'},
 {u'count': 2, u'stem': u'dad', u'term': u'dad'},
 {u'count': 2, u'stem': u'marijuana', u'term': u'marijuana'},
 {u'count': 2, u'stem': u'agricultur', u'term': u'agricultural'},
 {u'count': 2, u'stem': u'studi', u'term': u'studies'},
 {u'count': 2, u'stem': u'eric', u'term': u'eric'},
 {u'count': 2, u'stem': u'disagr', u'term': u'disagreements'},
 {u'count': 2, u'stem': u'plainfield', u'term': u'plainfield'},
 {u'count': 2, u'stem': u'independ', u'term': u'independent'},
 {u'count': 2, u'stem': u'ronald', u'term': u'ronald'},
 {u'count': 2, u'stem': u'manchest', u'term': u'manchester'},
 {u'count': 2, u'stem': u'legitim', u'term': u'legitimate'},
 {u'count': 2, u'stem': u'rape', u'term': u'rape'},
 {u'count': 2, u'stem': u'toledo', u'term': u'toledo'},
 {u'count': 2, u'stem': u'bruis', u'term': u'bruising'},
 {u'count': 2, u'stem': u'undeni', u'term': u'undeniable'},
 {u'count': 2, u'stem': u'stolen', u'term': u'stolen'},
 {u'count': 2, u'stem': u'punch', u'term': u'punched'},
 {u'count': 2, u'stem': u'irrelev', u'term': u'irrelevant'},
 {u'count': 2, u'stem': u'dem', u'term': u'dems'},
 {u'count': 2, u'stem': u'san', u'term': u'san'},
 {u'count': 2, u'stem': u'protestor', u'term': u'protestors'},
 {u'count': 2, u'stem': u'suspens', u'term': u'suspense'},
 {u'count': 2, u'stem': u'flame', u'term': u'flames'},
 {u'count': 2, u'stem': u'california', u'term': u'california'},
 {u'count': 2, u'stem': u'hospit', u'term': u'hospitals'},
 {u'count': 2, u'stem': u'flip', u'term': u'flip'},
 {u'count': 2, u'stem': u'indign', u'term': u'indignity'},
 {u'count': 2, u'stem': u'neoliber', u'term': u'neoliberalism'},
 {u'count': 2, u'stem': u'taylor', u'term': u'taylor'},
 {u'count': 2, u'stem': u'bob', u'term': u'bob'},
 {u'count': 2, u'stem': u'cosbi', u'term': u'cosby'},
 {u'count': 2, u'stem': u'couldn', u'term': u'couldn'},
 {u'count': 2, u'stem': u'split', u'term': u'split'},
 {u'count': 2, u'stem': u'columbus', u'term': u'columbus'},
 {u'count': 2, u'stem': u'taho', u'term': u'tahoe'},
 {u'count': 2, u'stem': u'deport', u'term': u'deportation'},
 {u'count': 2, u'stem': u'rocket', u'term': u'rocket'},
 {u'count': 2, u'stem': u'erad', u'term': u'eradicated'},
 {u'count': 2, u'stem': u'activist', u'term': u'activists'},
 {u'count': 2, u'stem': u'romney', u'term': u'romney'},
 {u'count': 2, u'stem': u'climat', u'term': u'climate'},
 {u'count': 2, u'stem': u'lend', u'term': u'lend'},
 {u'count': 2, u'stem': u'digest', u'term': u'digestive'},
 {u'count': 2, u'stem': u'narrowli', u'term': u'narrowly'},
 {u'count': 2, u'stem': u'dorian', u'term': u'dorian'},
 {u'count': 2, u'stem': u'barack', u'term': u'barack'},
 {u'count': 2, u'stem': u'cinnamon', u'term': u'cinnamon'},
 {u'count': 2, u'stem': u'repeal', u'term': u'repeal'},
 {u'count': 2, u'stem': u'love', u'term': u'love'},
 {u'count': 2, u'stem': u'birthday', u'term': u'birthday'},
 {u'count': 2, u'stem': u'systemat', u'term': u'systematic'},
 {u'count': 2, u'stem': u'ballot', u'term': u'ballots'},
 {u'count': 2, u'stem': u'professor', u'term': u'professor'},
 {u'count': 2, u'stem': u'inflat', u'term': u'inflation'},
 {u'count': 2, u'stem': u'everywher', u'term': u'everywhere'},
 {u'count': 2, u'stem': u'dumb', u'term': u'dumb'},
 {u'count': 2, u'stem': u'dye', u'term': u'dye'},
 {u'count': 2, u'stem': u'flood', u'term': u'flooding'},
 {u'count': 2, u'stem': u'piss', u'term': u'pissed'},
 {u'count': 2, u'stem': u'tom', u'term': u'tom'},
 {u'count': 2, u'stem': u'assault', u'term': u'assault'},
 {u'count': 2, u'stem': u'fingerprint', u'term': u'fingerprints'},
 {u'count': 2, u'stem': u'robert', u'term': u'robert'},
 {u'count': 2, u'stem': u'opt', u'term': u'opted'},
 {u'count': 2, u'stem': u'bush', u'term': u'bush'},
 {u'count': 2, u'stem': u'laptop', u'term': u'laptops'},
 {u'count': 2, u'stem': u'foul', u'term': u'foul'},
 {u'count': 2, u'stem': u'lethal', u'term': u'lethal'},
 {u'count': 2, u'stem': u'meanwhil', u'term': u'meanwhile'},
 {u'count': 2, u'stem': u'fatal', u'term': u'fatal'},
 {u'count': 2, u'stem': u'grim', u'term': u'grim'},
 {u'count': 2, u'stem': u'philjd', u'term': u'philjd'},
 {u'count': 2, u'stem': u'kasich', u'term': u'kasich'},
 {u'count': 2, u'stem': u'ichibon', u'term': u'ichibon'},
 {u'count': 2, u'stem': u'bureaucrat', u'term': u'bureaucrats'},
 {u'count': 2, u'stem': u'hypocrisi', u'term': u'hypocrisy'},
 {u'count': 2, u'stem': u'jeopardi', u'term': u'jeopardy'},
 {u'count': 2, u'stem': u'vandal', u'term': u'vandalism'},
 {u'count': 2, u'stem': u'walton', u'term': u'walton'},
 {u'count': 2, u'stem': u'brook', u'term': u'brooks'},
 {u'count': 2, u'stem': u'advoc', u'term': u'advocates'},
 {u'count': 2, u'stem': u'forens', u'term': u'forensic'},
 {u'count': 2, u'stem': u'tower', u'term': u'tower'},
 {u'count': 2, u'stem': u'trubek', u'term': u'trubek'},
 {u'count': 2, u'stem': u'rescu', u'term': u'rescue'},
 {u'count': 2, u'stem': u'fbi', u'term': u'fbi'},
 {u'count': 2, u'stem': u'antonio', u'term': u'antonio'},
 {u'count': 2, u'stem': u'anti', u'term': u'anti'},
 {u'count': 2, u'stem': u'rioter', u'term': u'rioters'},
 {u'count': 2, u'stem': u'mitt', u'term': u'mitt'},
 {u'count': 2, u'stem': u'sen', u'term': u'sen'},
 {u'count': 2, u'stem': u'nativ', u'term': u'native'},
 {u'count': 2, u'stem': u'suprem', u'term': u'supreme'},
 {u'count': 2, u'stem': u'jame', u'term': u'james'},
 {u'count': 2, u'stem': u'heir', u'term': u'heir'},
 {u'count': 2, u'stem': u'rubl', u'term': u'ruble'},
 {u'count': 2, u'stem': u'teenag', u'term': u'teenager'},
 {u'count': 2, u'stem': u'deliver', u'term': u'deliverances'},
 {u'count': 2, u'stem': u'dig', u'term': u'digging'},
 {u'count': 2, u'stem': u'carter', u'term': u'carter'},
 {u'count': 2, u'stem': u'slaveri', u'term': u'slavery'},
 {u'count': 2, u'stem': u'pope', u'term': u'pope'},
 {u'count': 2, u'stem': u'pregnant', u'term': u'pregnant'},
 {u'count': 2, u'stem': u'children', u'term': u'children'},
 {u'count': 2, u'stem': u'scientist', u'term': u'scientists'},
 {u'count': 2, u'stem': u'dire', u'term': u'dire'},
 {u'count': 2, u'stem': u'asad', u'term': u'asad'},
 {u'count': 2, u'stem': u'disagre', u'term': u'disagreed'},
 {u'count': 2, u'stem': u'opinion', u'term': u'opinion'},
 {u'count': 2, u'stem': u'defam', u'term': u'defamation'},
 {u'count': 2, u'stem': u'fade', u'term': u'fading'},
 {u'count': 2, u'stem': u'credibl', u'term': u'credible'},
 {u'count': 2, u'stem': u'spin', u'term': u'spin'},
 {u'count': 2, u'stem': u'hassan', u'term': u'hassan'},
 {u'count': 2, u'stem': u'entail', u'term': u'entails'},
 {u'count': 2, u'stem': u'murki', u'term': u'murky'},
 {u'count': 2, u'stem': u'testifi', u'term': u'testify'},
 {u'count': 2, u'stem': u'luca', u'term': u'lucas'},
 {u'count': 2, u'stem': u'sharpton', u'term': u'sharpton'},
 {u'count': 2, u'stem': u'truebluemajor', u'term': u'truebluemajority'},
 {u'count': 2, u'stem': u'recount', u'term': u'recounted'},
 {u'count': 2, u'stem': u'balloon', u'term': u'balloon'},
 {u'count': 2, u'stem': u'undermin', u'term': u'undermine'},
 {u'count': 2, u'stem': u'radic', u'term': u'radical'},
 {u'count': 2, u'stem': u'webb', u'term': u'webb'},
 {u'count': 2, u'stem': u'atlant', u'term': u'atlantic'},
 {u'count': 2, u'stem': u'privaci', u'term': u'privacy'},
 {u'count': 2, u'stem': u'scrutini', u'term': u'scrutiny'},
 {u'count': 2, u'stem': u'french', u'term': u'french'},
 {u'count': 2, u'stem': u'alterc', u'term': u'altercation'},
 {u'count': 2, u'stem': u'homicid', u'term': u'homicide'},
 {u'count': 2, u'stem': u'cheer', u'term': u'cheer'},
 {u'count': 2, u'stem': u'obamacar', u'term': u'obamacare'},
 {u'count': 2, u'stem': u'villain', u'term': u'villain'},
 {u'count': 2, u'stem': u'congratul', u'term': u'congratulated'},
 {u'count': 2, u'stem': u'micheal', u'term': u'micheal'},
 {u'count': 2, u'stem': u'mrs', u'term': u'mrs'},
 {u'count': 2, u'stem': u'fairi', u'term': u'fairy'},
 {u'count': 2, u'stem': u'amid', u'term': u'amid'},
 {u'count': 2, u'stem': u'excerpt', u'term': u'excerpt'},
 {u'count': 2, u'stem': u'pray', u'term': u'pray'},
 {u'count': 2, u'stem': u'petti', u'term': u'pettiness'},
 {u'count': 2, u'stem': u'russian', u'term': u'russians'},
 {u'count': 2, u'stem': u'disabl', u'term': u'disabled'},
 {u'count': 2, u'stem': u'grandfath', u'term': u'grandfather'},
 {u'count': 2, u'stem': u'shopper', u'term': u'shoppers'},
 {u'count': 2, u'stem': u'frack', u'term': u'fracking'},
 {u'count': 2, u'stem': u'erupt', u'term': u'erupted'},
 {u'count': 2, u'stem': u'cleveland', u'term': u'cleveland'},
 {u'count': 2, u'stem': u'offens', u'term': u'offense'},
 {u'count': 2, u'stem': u'tackl', u'term': u'tackle'},
 {u'count': 2, u'stem': u'medicaid', u'term': u'medicaid'},
 {u'count': 2, u'stem': u'leftov', u'term': u'leftovers'},
 {u'count': 2, u'stem': u'barbara', u'term': u'barbara'},
 {u'count': 2, u'stem': u'ital', u'term': u'ital'},
 {u'count': 2, u'stem': u'fur', u'term': u'fur'},
 {u'count': 2, u'stem': u'racism', u'term': u'racism'},
 {u'count': 2, u'stem': u'oppress', u'term': u'oppress'},
 {u'count': 2, u'stem': u'undocu', u'term': u'undocumented'},
 {u'count': 2, u'stem': u'deaf', u'term': u'deaf'},
 {u'count': 2, u'stem': u'chuck', u'term': u'chuck'},
 {u'count': 1, u'stem': u'inept', u'term': u'inept'},
 {u'count': 1, u'stem': u'backlash', u'term': u'backlash'},
 {u'count': 1, u'stem': u'atraumat', u'term': u'atraumatic'},
 {u'count': 1, u'stem': u'london', u'term': u'london'},
 {u'count': 1, u'stem': u'grift', u'term': u'grift'},
 {u'count': 1, u'stem': u'kishik', u'term': u'kishik'},
 {u'count': 1, u'stem': u'conven', u'term': u'convening'},
 {u'count': 1, u'stem': u'cock', u'term': u'cock'},
 {u'count': 1, u'stem': u'cylind', u'term': u'cylinder'},
 {u'count': 1, u'stem': u'confess', u'term': u'confessed'},
 {u'count': 1, u'stem': u'misl', u'term': u'misled'},
 {u'count': 1, u'stem': u'neurosci', u'term': u'neuroscience'},
 {u'count': 1, u'stem': u'pinpoint', u'term': u'pinpoint'},
 {u'count': 1, u'stem': u'minotaur', u'term': u'minotaur'},
 {u'count': 1, u'stem': u'nfl', u'term': u'nfl'},
 {u'count': 1, u'stem': u'shorebird', u'term': u'shorebirds'},
 {u'count': 1, u'stem': u'photograph', u'term': u'photographs'},
 {u'count': 1, u'stem': u'bystand', u'term': u'bystanders'},
 {u'count': 1, u'stem': u'appetit', u'term': u'appetite'},
 {u'count': 1, u'stem': u'jurisdict', u'term': u'jurisdiction'},
 {u'count': 1, u'stem': u'genocid', u'term': u'genocide'},
 {u'count': 1, u'stem': u'praxxus', u'term': u'praxxus'},
 {u'count': 1, u'stem': u'curbelo', u'term': u'curbelos'},
 {u'count': 1, u'stem': u'interstellar', u'term': u'interstellar'},
 {u'count': 1, u'stem': u'chip', u'term': u'chips'},
 {u'count': 1, u'stem': u'gazett', u'term': u'gazette'},
 {u'count': 1, u'stem': u'parker', u'term': u'parker'},
 {u'count': 1, u'stem': u'youtub', u'term': u'youtube'},
 {u'count': 1, u'stem': u'oath', u'term': u'oath'},
 {u'count': 1, u'stem': u'beatdown', u'term': u'beatdown'},
 {u'count': 1, u'stem': u'willi', u'term': u'willy'},
 {u'count': 1, u'stem': u'defer', u'term': u'deference'},
 {u'count': 1, u'stem': u'crescent', u'term': u'crescent'},
 {u'count': 1, u'stem': u'malleabl', u'term': u'malleable'},
 {u'count': 1, u'stem': u'regardless', u'term': u'regardless'},
 {u'count': 1, u'stem': u'carnegi', u'term': u'carnegie'},
 {u'count': 1, u'stem': u'flail', u'term': u'flail'},
 {u'count': 1, u'stem': u'harrel', u'term': u'harrell'},
 {u'count': 1,
  u'stem': u'middleagedhousewif',
  u'term': u'middleagedhousewife'},
 {u'count': 1, u'stem': u'perceiv', u'term': u'perceived'},
 {u'count': 1, u'stem': u'humanitarian', u'term': u'humanitarian'},
 {u'count': 1, u'stem': u'cord', u'term': u'cord'},
 {u'count': 1, u'stem': u'olsen', u'term': u'olsen'},
 {u'count': 1, u'stem': u'cyanid', u'term': u'cyanide'},
 {u'count': 1, u'stem': u'beagl', u'term': u'beagles'},
 {u'count': 1, u'stem': u'horribl', u'term': u'horrible'},
 {u'count': 1, u'stem': u'ideologu', u'term': u'ideologues'},
 {u'count': 1, u'stem': u'repast', u'term': u'repast'},
 {u'count': 1, u'stem': u'jacki', u'term': u'jackie'},
 {u'count': 1, u'stem': u'wyden', u'term': u'wyden'},
 {u'count': 1, u'stem': u'milawn', u'term': u'milawn'},
 {u'count': 1, u'stem': u'shaharazad', u'term': u'shaharazade'},
 {u'count': 1, u'stem': u'setback', u'term': u'setback'},
 {u'count': 1, u'stem': u'bunch', u'term': u'bunch'},
 {u'count': 1, u'stem': u'kotkin', u'term': u'kotkin'},
 {u'count': 1, u'stem': u'roughhous', u'term': u'roughhousing'},
 {u'count': 1, u'stem': u'warner', u'term': u'warner'},
 {u'count': 1, u'stem': u'alaska', u'term': u'alaska'},
 {u'count': 1, u'stem': u'reagon', u'term': u'reagon'},
 {u'count': 1, u'stem': u'indispens', u'term': u'indispensable'},
 {u'count': 1, u'stem': u'ploughshar', u'term': u'ploughshares'},
 {u'count': 1, u'stem': u'awl', u'term': u'awl'},
 {u'count': 1, u'stem': u'skew', u'term': u'skew'},
 {u'count': 1, u'stem': u'greg', u'term': u'greg'},
 {u'count': 1, u'stem': u'episod', u'term': u'episodes'},
 {u'count': 1, u'stem': u'disclosur', u'term': u'disclosure'},
 {u'count': 1, u'stem': u'dutch', u'term': u'dutch'},
 {u'count': 1, u'stem': u'unilater', u'term': u'unilateral'},
 {u'count': 1, u'stem': u'interst', u'term': u'intersted'},
 {u'count': 1, u'stem': u'godzilla', u'term': u'godzilla'},
 {u'count': 1, u'stem': u'wrestler', u'term': u'wrestlers'},
 {u'count': 1, u'stem': u'los', u'term': u'los'},
 {u'count': 1, u'stem': u'unjust', u'term': u'unjust'},
 {u'count': 1, u'stem': u'deduct', u'term': u'deduction'},
 {u'count': 1, u'stem': u'elector', u'term': u'electorate'},
 {u'count': 1, u'stem': u'overus', u'term': u'overuse'},
 {u'count': 1, u'stem': u'publican', u'term': u'publican'},
 {u'count': 1, u'stem': u'snarki', u'term': u'snarky'},
 {u'count': 1, u'stem': u'keef', u'term': u'keefe'},
 {u'count': 1, u'stem': u'malfeas', u'term': u'malfeasance'},
 {u'count': 1, u'stem': u'boot', u'term': u'boots'},
 {u'count': 1, u'stem': u'tea', u'term': u'tea'},
 {u'count': 1, u'stem': u'download', u'term': u'downloads'},
 {u'count': 1, u'stem': u'batshit', u'term': u'batshit'},
 {u'count': 1, u'stem': u'dismiss', u'term': u'dismissed'},
 {u'count': 1, u'stem': u'cabl', u'term': u'cable'},
 {u'count': 1, u'stem': u'unguard', u'term': u'unguarded'},
 {u'count': 1, u'stem': u'fracker', u'term': u'frackers'},
 {u'count': 1, u'stem': u'harvest', u'term': u'harvests'},
 {u'count': 1, u'stem': u'mater', u'term': u'mater'},
 {u'count': 1, u'stem': u'relaps', u'term': u'relapse'},
 {u'count': 1, u'stem': u'deterr', u'term': u'deterrent'},
 {u'count': 1, u'stem': u'unnecessarili', u'term': u'unnecessarily'},
 {u'count': 1, u'stem': u'cohn', u'term': u'cohn'},
 {u'count': 1, u'stem': u'kindergarten', u'term': u'kindergarten'},
 {u'count': 1, u'stem': u'breakup', u'term': u'breakup'},
 {u'count': 1, u'stem': u'crackdown', u'term': u'crackdown'},
 {u'count': 1, u'stem': u'nutshel', u'term': u'nutshell'},
 {u'count': 1, u'stem': u'miseri', u'term': u'misery'},
 {u'count': 1, u'stem': u'crumbl', u'term': u'crumbling'},
 {u'count': 1, u'stem': u'treadmil', u'term': u'treadmill'},
 {u'count': 1, u'stem': u'backyard', u'term': u'backyard'},
 {u'count': 1, u'stem': u'misogynist', u'term': u'misogynist'},
 {u'count': 1, u'stem': u'righteous', u'term': u'righteous'},
 {u'count': 1, u'stem': u'atop', u'term': u'atop'},
 {u'count': 1, u'stem': u'overturn', u'term': u'overturned'},
 {u'count': 1, u'stem': u'fest', u'term': u'fest'},
 {u'count': 1, u'stem': u'southerland', u'term': u'southerland'},
 {u'count': 1, u'stem': u'conced', u'term': u'conceded'},
 {u'count': 1, u'stem': u'gergel', u'term': u'gergel'},
 {u'count': 1, u'stem': u'parabol', u'term': u'parabolic'},
 {u'count': 1, u'stem': u'famin', u'term': u'famine'},
 {u'count': 1, u'stem': u'impregn', u'term': u'impregnable'},
 {u'count': 1, u'stem': u'scientif', u'term': u'scientific'},
 {u'count': 1, u'stem': u'gangster', u'term': u'gangster'},
 {u'count': 1, u'stem': u'interact', u'term': u'interactive'},
 {u'count': 1, u'stem': u'slain', u'term': u'slain'},
 {u'count': 1, u'stem': u'harvey', u'term': u'harvey'},
 {u'count': 1, u'stem': u'jih\u0101d', u'term': u'jih\u0101d'},
 {u'count': 1, u'stem': u'polygami', u'term': u'polygamy'},
 {u'count': 1, u'stem': u'nagasaki', u'term': u'nagasaki'},
 {u'count': 1, u'stem': u'assail', u'term': u'assailant'},
 {u'count': 1, u'stem': u'dietari', u'term': u'dietary'},
 {u'count': 1, u'stem': u'innov', u'term': u'innovation'},
 {u'count': 1, u'stem': u'anonym', u'term': u'anonymous'},
 {u'count': 1, u'stem': u'folio', u'term': u'folios'},
 {u'count': 1, u'stem': u'neomg', u'term': u'neomg'},
 {u'count': 1, u'stem': u'reflectionsv37', u'term': u'reflectionsv37'},
 {u'count': 1, u'stem': u'keyword', u'term': u'keywords'},
 {u'count': 1, u'stem': u'discov', u'term': u'discovered'},
 {u'count': 1, u'stem': u'starter', u'term': u'starter'},
 {u'count': 1, u'stem': u'lgbt', u'term': u'lgbt'},
 {u'count': 1, u'stem': u'tial', u'term': u'tial'},
 {u'count': 1, u'stem': u'iraq', u'term': u'iraq'},
 {u'count': 1, u'stem': u'islam', u'term': u'islamic'},
 {u'count': 1, u'stem': u'cirincion', u'term': u'cirincione'},
 {u'count': 1, u'stem': u'carnag', u'term': u'carnage'},
 {u'count': 1, u'stem': u'empathi', u'term': u'empathy'},
 {u'count': 1, u'stem': u'civilli', u'term': u'civilly'},
 {u'count': 1, u'stem': u'stichey', u'term': u'stichey'},
 {u'count': 1, u'stem': u'denial', u'term': u'denial'},
 {u'count': 1, u'stem': u'graphic', u'term': u'graphic'},
 {u'count': 1, u'stem': u'hillier', u'term': u'hillier'},
 {u'count': 1, u'stem': u'longish', u'term': u'longish'},
 {u'count': 1, u'stem': u'presidenti', u'term': u'presidential'},
 {u'count': 1, u'stem': u'marketplac', u'term': u'marketplace'},
 {u'count': 1, u'stem': u'unarm', u'term': u'unarmed'},
 {u'count': 1, u'stem': u'deem', u'term': u'deemed'},
 {u'count': 1, u'stem': u'alexandria', u'term': u'alexandria'},
 {u'count': 1, u'stem': u'senseless', u'term': u'senseless'},
 {u'count': 1, u'stem': u'doll', u'term': u'dolled'},
 {u'count': 1, u'stem': u'wyom', u'term': u'wyoming'},
 {u'count': 1, u'stem': u'tonedevil', u'term': u'tonedevil'},
 {u'count': 1, u'stem': u'unavoid', u'term': u'unavoidable'},
 {u'count': 1, u'stem': u'armi', u'term': u'army'},
 {u'count': 1, u'stem': u'mussel', u'term': u'mussel'},
 {u'count': 1, u'stem': u'countless', u'term': u'countless'},
 {u'count': 1, u'stem': u'formul', u'term': u'formulation'},
 {u'count': 1, u'stem': u'gerrydal', u'term': u'gerrydales'},
 {u'count': 1, u'stem': u'unconstitut', u'term': u'unconstitutional'},
 {u'count': 1, u'stem': u'nutmeg', u'term': u'nutmeg'},
 {u'count': 1, u'stem': u'humbl', u'term': u'humble'},
 {u'count': 1, u'stem': u'indigen', u'term': u'indigenous'},
 {u'count': 1, u'stem': u'feather', u'term': u'feathering'},
 {u'count': 1, u'stem': u'bleed', u'term': u'bleeding'},
 {u'count': 1, u'stem': u'wildcatt', u'term': u'wildcatters'},
 {u'count': 1, u'stem': u'lafayett', u'term': u'lafayette'}]

In [6]:
# Request a single media record from each local server. Only the last expression
# (the mc_readability call) is rendered as the cell output; the first result is discarded.
mc_heuristic.mediaList( rows=1)
mc_readability.mediaList( rows= 1 )


Out[6]:
[{u'media_id': 1,
  u'media_sets': [{u'description': u'Top 25 mainstream media sources by monthly unique users from the U.S. according to the Google AdPlanner service.',
    u'media_sets_id': 1,
    u'name': u'Top 25 Mainstream Media'},
   {u'description': u'top 25 mainstream media according to Google AdPlanner',
    u'media_sets_id': 16959,
    u'name': u'Top 25 Mainstream Medi'},
   {u'description': u'OpEd Project',
    u'media_sets_id': 20797,
    u'name': u'OpEd Project'}],
  u'media_source_tags': [{u'description': None,
    u'label': None,
    u'show_on_media': None,
    u'show_on_stories': None,
    u'tag': u'3',
    u'tag_set': u'usnewspapercirculation',
    u'tag_sets_id': 3,
    u'tags_id': 109},
   {u'description': None,
    u'label': None,
    u'show_on_media': None,
    u'show_on_stories': None,
    u'tag': u'include',
    u'tag_set': u'word_cloud',
    u'tag_sets_id': 17,
    u'tags_id': 6071565},
   {u'description': None,
    u'label': None,
    u'show_on_media': None,
    u'show_on_stories': None,
    u'tag': u'default',
    u'tag_set': u'word_cloud',
    u'tag_sets_id': 17,
    u'tags_id': 6729599},
   {u'description': u"Top U.S. mainstream media according Google Ad Planner's measure of unique monthly users.",
    u'label': u'U.S. Mainstream Media',
    u'show_on_media': None,
    u'show_on_stories': 1,
    u'tag': u'ap_english_us_top25_20100110',
    u'tag_set': u'collection',
    u'tag_sets_id': 5,
    u'tags_id': 8875027},
   {u'description': None,
    u'label': None,
    u'show_on_media': None,
    u'show_on_stories': None,
    u'tag': u'oped_project',
    u'tag_set': u'collection',
    u'tag_sets_id': 5,
    u'tags_id': 8878390},
   {u'description': u'A site that is a mainstream media outlet, such as The New York Times and The Washington Post; an online-only news outlet, such as Slate, Salon, or the Huffington Post; or a citizen journalism or non-profit news outlet, such as Global Voices or ProPublica',
    u'label': u'General News',
    u'show_on_media': None,
    u'show_on_stories': 1,
    u'tag': u'General Online News Media',
    u'tag_set': u'media_type',
    u'tag_sets_id': 1099,
    u'tags_id': 8878416}],
  u'name': u'New York Times',
  u'url': u'http://nytimes.com'}]

In [6]:


In [11]:
from collections import Counter
import scipy.spatial.distance
from sklearn.feature_extraction import DictVectorizer

def compare_words_counts_stories_id ( stories_id ):
    """Compare the two extractors' word counts for a single story.

    Args:
        stories_id: id of the story, as an int or a string. It is formatted
            into a ``stories_id:<id>`` Solr query, so integer ids (the form
            in which API responses report them) no longer raise a TypeError
            from string concatenation.

    Returns:
        Whatever compare_word_counts returns for that query with an empty
        comparison name and no filter query.
    """
    solr_query = 'stories_id:{}'.format( stories_id )
    return compare_word_counts( '', solr_query=solr_query, filter_query='' )

    
def compare_word_counts( comparison_name, solr_query, filter_query):
    """Compare word counts from the heuristic and readability extractors.

    Runs the same Solr query and filter against the module-level clients
    ``mc_heuristic`` and ``mc_readability``, keeps the ``comp_size`` most
    frequent stems from each side, and measures how far apart the two
    frequency vectors are.

    Args:
        comparison_name: label used as the prefix of the result's 'name'.
        solr_query: Solr query selecting what to count.
        filter_query: Solr filter query ('' for none). Any 'sentence:' in it
            is rewritten to ' - ' when the result's 'name' is built.

    Returns:
        None when both clients report zero matching sentences. Otherwise a
        dict with the keys 'name', 'distance' (cosine distance between the
        two top-stem count vectors), 'non_dup_words' (number of stems that
        appear in only one of the two top lists), 'heuristic sent',
        'readabilty sent', 'heur top 10' and 'read top 10'.
    """
    sample_size = 100000
    num_words = 2000
    comp_size = 100

    # Ask for the sentence counts first: when neither side matched anything
    # there is no point in issuing the two larger wordCount requests.
    sentences_heuristic = mc_heuristic.sentenceCount( solr_query=solr_query, solr_filter=filter_query )['count']
    sentences_readabilty = mc_readability.sentenceCount( solr_query=solr_query, solr_filter=filter_query )['count']

    if sentences_heuristic == 0 and sentences_readabilty == 0:
        return None

    counts_heuristic = mc_heuristic.wordCount( solr_query=solr_query, solr_filter=filter_query,
                                               sample_size=sample_size, num_words=num_words )
    counts_readability = mc_readability.wordCount( solr_query=solr_query, solr_filter=filter_query,
                                                   sample_size=sample_size, num_words=num_words )

    # stem -> count for each extractor
    counter_heuristic = Counter( { c['stem']: c['count'] for c in counts_heuristic } )
    counter_readability = Counter( { c['stem']: c['count'] for c in counts_readability } )

    top_heuristic = dict( counter_heuristic.most_common( comp_size ) )
    top_readability = dict( counter_readability.most_common( comp_size ) )

    # Project both top lists onto one shared vocabulary. Each row of the
    # result is the 1-D count vector of one extractor, which is the shape
    # scipy's cosine() is documented to take.
    vectorizer = DictVectorizer( sparse=False )
    core_heuristic, core_readability = vectorizer.fit_transform( [ top_heuristic, top_readability ] )

    non_duplicate_words = len( set( top_heuristic ).symmetric_difference( top_readability ) )

    # NOTE(review): if only one side returned words its vector is all zeros,
    # so the distance is presumably NaN; the plotting cell filters NaNs out.
    cosine_distance = scipy.spatial.distance.cosine( core_heuristic, core_readability )

    result = {'name': comparison_name + filter_query.replace('sentence:', ' - ' ),
              'distance': cosine_distance,
              'non_dup_words': non_duplicate_words,
              'heuristic sent': sentences_heuristic, 'readabilty sent': sentences_readabilty,
              'heur top 10': counter_heuristic.most_common( 10 ),
              'read top 10': counter_readability.most_common( 10 ),
              }

    return result

In [8]:
compare_word_counts( '', solr_query='stories_id:13794', filter_query='')
#mc_readability.story( stories_id=21 )
#mc_readability.media( 58017 )

In [13]:
import pandas as pd


comparisons = [               
    { 'solr_query': 'tags_id_media:2453107', 'name': "region/pewknight", 'topic_queries': True },
    { 'solr_query': 'media_id:1', 'name': "new york times", 'topic_queries': True},
    { 'solr_query': 'tags_id_media:8875027', 'name': "ap_english_us_top25", 'topic_queries': True},
    { 'solr_query': 'tags_id_media:125', 'name': "political blogs", 'topic_queries': True},
    { 'solr_query': 'tags_id_media:7796878',  'name': 'russian', 'topic_queries': False},
    { 'solr_query': 'tags_id_media:8878255',  'name': 'arabic', 'topic_queries': False},
    { 'solr_query': '*:*', 'name': '*:* all ', 'topic_queries': True },
    ]             

filter_queries = [ '', 'sentence:obama', 'sentence:gaza', 'sentence:fcc', 'sentence:facebook' ]

results = []

for comparison in comparisons:
    
    
    
    for filter_query in filter_queries:
    

        
        if filter_query != '' and not comparison['topic_queries']:
            continue

        print 'counting for ', comparison['name'], filter_query
        
        solr_query = comparison['solr_query']
        comparison_name = comparison['name']
        
        result = compare_word_counts( comparison_name, solr_query, filter_query )
        
        if result is not None:
            results.append( result )


counting for  region/pewknight 
counting for  region/pewknight sentence:obama
counting for  region/pewknight sentence:gaza
counting for  region/pewknight sentence:fcc
counting for  region/pewknight sentence:facebook
counting for  new york times 
counting for  new york times sentence:obama
counting for  new york times sentence:gaza
counting for  new york times sentence:fcc
counting for  new york times sentence:facebook
counting for  ap_english_us_top25 
counting for  ap_english_us_top25 sentence:obama
counting for  ap_english_us_top25 sentence:gaza
counting for  ap_english_us_top25 sentence:fcc
counting for  ap_english_us_top25 sentence:facebook
counting for  political blogs 
counting for  political blogs sentence:obama
counting for  political blogs sentence:gaza
counting for  political blogs sentence:fcc
counting for  political blogs sentence:facebook
counting for  russian 
counting for  arabic 
counting for  *:* all  
counting for  *:* all  sentence:obama
counting for  *:* all  sentence:gaza
counting for  *:* all  sentence:fcc
counting for  *:* all  sentence:facebook

In [ ]:
df = pd.DataFrame( results )
df
#print scipy.spatial.distance.cosine( X[1], core_readability )

In [25]:
result = results[0]
for result in results:
    print result['name']
    print 'read', repr( result['read top 10'] )
    print 'heur', repr( result['heur top 10'] )


region/pewknight
read [(u'photo', 1165), (u'michigan', 1034), (u'dalla', 947), (u'famili', 935), (u'thanksgiv', 874), (u'texa', 823), (u'holiday', 729), (u'student', 727), (u'touchdown', 669), (u'offens', 620)]
heur [(u'photo', 1397), (u'dalla', 1084), (u'michigan', 1043), (u'famili', 1009), (u'texa', 989), (u'thanksgiv', 880), (u'student', 801), (u'wyli', 778), (u'holiday', 745), (u'touchdown', 643)]
region/pewknight - obama
read [(u'obama', 331), (u'barack', 70), (u'immigr', 50), (u'republican', 35), (u'congress', 25), (u'illeg', 17), (u'washington', 16), (u'democrat', 14), (u'ferguson', 13), (u'deport', 13)]
heur [(u'obama', 336), (u'barack', 78), (u'immigr', 51), (u'republican', 35), (u'washington', 25), (u'congress', 24), (u'illeg', 17), (u'democrat', 15), (u'campaign', 13), (u'ferguson', 13)]
region/pewknight - gaza
read [(u'gaza', 11), (u'israel', 3), (u'war', 3), (u'isra', 3), (u'milit', 3), (u'hama', 2), (u'palestinian', 2), (u'sieg', 2), (u'ferguson', 2), (u'blockad', 1)]
heur [(u'gaza', 11), (u'israel', 3), (u'war', 3), (u'isra', 3), (u'milit', 3), (u'hama', 2), (u'palestinian', 2), (u'sieg', 2), (u'ferguson', 2), (u'blockad', 1)]
region/pewknight - fcc
read [(u'fcc', 7), (u'telehealth', 2), (u'uva', 2), (u'profici', 1), (u'wvir', 1), (u'tti', 1), (u'mignon', 1), (u'nbc29', 1), (u'onlin', 1), (u'sport', 1)]
heur [(u'clybum', 1), (u'fcc', 1), (u'mignon', 1), (u'profici', 1), (u'barrier', 1)]
region/pewknight - facebook
read [(u'facebook', 655), (u'twitter', 391), (u'cbsmiami', 259), (u'cbsdfw', 220), (u'dalla', 100), (u'miami', 85), (u'mlive', 47), (u'florida', 38), (u'texa', 36), (u'detroit', 30)]
heur [(u'facebook', 622), (u'twitter', 380), (u'cbsmiami', 265), (u'cbsdfw', 220), (u'dalla', 100), (u'miami', 88), (u'florida', 39), (u'mlive', 37), (u'texa', 36), (u'detroit', 23)]
new york times
read [(u'american', 569), (u'headlin', 422), (u'famili', 404), (u'leagu', 336), (u'free', 300), (u'love', 297), (u'studi', 279), (u'thanksgiv', 267), (u'org', 266), (u'colleg', 260)]
heur [(u'american', 572), (u'famili', 432), (u'leagu', 332), (u'thanksgiv', 317), (u'love', 298), (u'free', 296), (u'studi', 270), (u'children', 255), (u'org', 255), (u'colleg', 249)]
new york times - obama
read [(u'obama', 135), (u'hagel', 17), (u'republican', 10), (u'immigr', 10), (u'china', 9), (u'american', 9), (u'washington', 7), (u'ambiti', 5), (u'pacif', 5), (u'barack', 5)]
heur [(u'obama', 120), (u'hagel', 16), (u'republican', 11), (u'china', 10), (u'american', 9), (u'washington', 7), (u'immigr', 7), (u'barack', 6), (u'pacif', 5), (u'ambiti', 5)]
new york times - gaza
read [(u'gaza', 8), (u'palestinian', 5), (u'israel', 4), (u'isra', 4), (u'egypt', 2), (u'european', 2), (u'territori', 1), (u'violenc', 1), (u'doha', 1), (u'malaysia', 1)]
heur [(u'gaza', 7), (u'palestinian', 5), (u'israel', 4), (u'isra', 4), (u'egypt', 2), (u'european', 2), (u'violenc', 1), (u'doha', 1), (u'malaysia', 1), (u'blockad', 1)]
new york times - facebook
read [(u'facebook', 51), (u'twitter', 10), (u'eloni', 4), (u'instagram', 3), (u'zuckerberg', 3), (u'tara', 3), (u'essay', 2), (u'children', 2), (u'obama', 2), (u'screenshot', 2)]
heur [(u'facebook', 58), (u'twitter', 7), (u'cranberri', 5), (u'campaign', 4), (u'eloni', 3), (u'cornbread', 3), (u'zuckerberg', 3), (u'tara', 3), (u'essay', 2), (u'children', 2)]
ap_english_us_top25
read [(u'famili', 1352), (u'love', 1035), (u'children', 818), (u'american', 804), (u'free', 775), (u'photo', 730), (u'thanksgiv', 702), (u'england', 701), (u'leagu', 681), (u'holiday', 680)]
heur [(u'famili', 1359), (u'love', 979), (u'thanksgiv', 857), (u'children', 855), (u'american', 772), (u'free', 761), (u'holiday', 732), (u'leagu', 680), (u'photo', 634), (u'england', 613)]
ap_english_us_top25 - obama
read [(u'obama', 2050), (u'barack', 293), (u'immigr', 186), (u'republican', 117), (u'democrat', 114), (u'hagel', 86), (u'american', 81), (u'washington', 77), (u'turkey', 73), (u'congress', 71)]
heur [(u'obama', 1523), (u'barack', 243), (u'immigr', 174), (u'republican', 87), (u'democrat', 84), (u'hagel', 75), (u'washington', 71), (u'american', 69), (u'turkey', 64), (u'pardon', 55)]
ap_english_us_top25 - gaza
read [(u'gaza', 94), (u'isra', 36), (u'israel', 35), (u'hama', 26), (u'palestinian', 25), (u'war', 13), (u'milit', 9), (u'jewish', 8), (u'netanyahu', 6), (u'southern', 5)]
heur [(u'gaza', 70), (u'isra', 29), (u'israel', 28), (u'palestinian', 22), (u'hama', 19), (u'war', 9), (u'jewish', 8), (u'milit', 8), (u'netanyahu', 5), (u'southern', 5)]
ap_english_us_top25 - fcc
read [(u'fcc', 31), (u'internet', 9), (u'broadband', 5), (u'wheeler', 5), (u'carlo', 3), (u'obama', 3), (u'congress', 3), (u'tom', 3), (u'reclassifi', 3), (u'euro', 3)]
heur [(u'fcc', 22), (u'carlo', 3), (u'broadband', 3), (u'obama', 3), (u'wheeler', 3), (u'congress', 3), (u'internet', 3), (u'euro', 3), (u'mexican', 2), (u'madrid', 2)]
ap_english_us_top25 - facebook
read [(u'facebook', 1564), (u'twitter', 245), (u'media', 67), (u'photo', 62), (u'googl', 46), (u'instagram', 45), (u'websit', 40), (u'app', 38), (u'linkedin', 29), (u'onlin', 28)]
heur [(u'facebook', 1225), (u'twitter', 176), (u'media', 53), (u'photo', 42), (u'googl', 35), (u'websit', 33), (u'instagram', 32), (u'app', 28), (u'onlin', 23), (u'linkedin', 21)]
political blogs
read [(u'obama', 2304), (u'democrat', 1787), (u'american', 1430), (u'wilson', 1346), (u'republican', 1314), (u'ferguson', 1104), (u'juri', 1006), (u'immigr', 870), (u'fact', 819), (u'cop', 760)]
heur [(u'obama', 2450), (u'wilson', 2146), (u'democrat', 2090), (u'republican', 1572), (u'american', 1539), (u'juri', 1422), (u'ferguson', 1212), (u'fact', 954), (u'famili', 940), (u'thanksgiv', 899)]
political blogs - obama
read [(u'obama', 1798), (u'democrat', 168), (u'barack', 144), (u'immigr', 142), (u'republican', 117), (u'american', 105), (u'congress', 97), (u'hagel', 64), (u'gop', 57), (u'bush', 55)]
heur [(u'obama', 1894), (u'barack', 179), (u'democrat', 177), (u'immigr', 140), (u'republican', 138), (u'american', 95), (u'congress', 92), (u'hagel', 86), (u'gop', 57), (u'campaign', 40)]
political blogs - gaza
read [(u'gaza', 36), (u'israel', 19), (u'war', 10), (u'hama', 10), (u'flood', 7), (u'palestinian', 6), (u'isra', 6), (u'crime', 4), (u'obama', 4), (u'blockad', 3)]
heur [(u'gaza', 38), (u'israel', 19), (u'war', 10), (u'hama', 10), (u'palestinian', 7), (u'flood', 7), (u'isra', 6), (u'crime', 4), (u'obama', 4), (u'unrwa', 3)]
political blogs - fcc
read [(u'fcc', 2), (u'fec', 1), (u'tom', 1), (u'corpor', 1), (u'wheeler', 1), (u'internet', 1)]
heur [(u'veto', 1), (u'corpor', 1), (u'wheeler', 1), (u'internet', 1), (u'fcc', 1), (u'inst', 1), (u'mideast', 1), (u'tom', 1), (u'suck', 1), (u'war', 1)]
political blogs - facebook
read [(u'facebook', 149), (u'twitter', 22), (u'media', 11), (u'googl', 10), (u'onlin', 7), (u'racist', 7), (u'websit', 6), (u'obama', 6), (u'wilson', 6), (u'photo', 5)]
heur [(u'facebook', 166), (u'twitter', 24), (u'media', 10), (u'googl', 10), (u'racist', 9), (u'ferguson', 8), (u'obama', 8), (u'wilson', 8), (u'darren', 6), (u'onlin', 6)]
russian
read [(u'\u0447\u0442\u043e', 17085), (u'\u044d\u0442\u043e', 8889), (u'\u043a\u0430\u043a', 6544), (u'\u0434\u043b\u044f', 4206), (u'\u0432\u0441\u0435', 3953), (u'\u0435\u0433\u043e', 3197), (u'\u0432\u043e\u0442', 3040), (u'\u0442\u0430\u043a', 2981), (u'\u043e\u043d\u0438', 2863), (u'\u0435\u0441\u0442\u044c', 2857)]
heur [(u'\u0447\u0442\u043e', 20400), (u'\u044d\u0442\u043e', 10642), (u'\u043a\u0430\u043a', 8229), (u'\u0434\u043b\u044f', 5010), (u'\u0432\u0441\u0435', 4836), (u'\u0442\u0430\u043a', 3898), (u'\u0435\u0433\u043e', 3869), (u'\u0432\u043e\u0442', 3522), (u'\u043e\u043d\u0438', 3493), (u'\u0440\u043e\u0441\u0441\u0438\u0438', 3421)]
arabic
read [(u'\u0639\u0644\u0649', 18418), (u'\u0625\u0644\u0649', 7965), (u'\u0627\u0644\u064a\u0648\u0645', 4581), (u'\u0627\u0644\u062a\u064a', 4570), (u'\u0645\u0635\u0631', 3569), (u'\u0628\u0639\u062f', 3412), (u'\u0645\u0628\u0627\u0631\u0643', 3153), (u'\u062e\u0644\u0627\u0644', 2922), (u'\u0627\u0644\u0630\u064a', 2850), (u'\u0647\u0630\u0627', 2616)]
heur [(u'\u0639\u0644\u0649', 18666), (u'\u0625\u0644\u0649', 7545), (u'\u0645\u0644\u0641', 5172), (u'\u0627\u0644\u064a\u0648\u0645', 4753), (u'\u0627\u0644\u062a\u064a', 4274), (u'\u0628\u064a\u0646', 3601), (u'\u0645\u0635\u0631', 3501), (u'\u0645\u0628\u0627\u0631\u0643', 3331), (u'\u0628\u0639\u062f', 3244), (u'\u062e\u0644\u0627\u0644', 2879)]
*:* all 
read [(u'para', 2409), (u'der', 2302), (u'los', 1911), (u'por', 1853), (u'les', 1802), (u'\u0447\u0442\u043e', 1797), (u'des', 1783), (u'con', 1319), (u'und', 1309), (u'las', 1169)]
heur [(u'para', 2532), (u'der', 2363), (u'los', 2095), (u'les', 2059), (u'por', 2014), (u'des', 2008), (u'\u0447\u0442\u043e', 1999), (u'und', 1421), (u'con', 1372), (u'las', 1250)]
*:* all  - obama
read [(u'obama', 32413), (u'barack', 5846), (u'immigr', 2360), (u'republican', 1819), (u'democrat', 1537), (u'american', 1535), (u'congress', 1123), (u'president', 1005), (u'washington', 969), (u'hagel', 916)]
heur [(u'obama', 30569), (u'barack', 5949), (u'immigr', 2327), (u'republican', 1775), (u'democrat', 1408), (u'american', 1398), (u'president', 1073), (u'congress', 1050), (u'washington', 949), (u'hagel', 935)]
*:* all  - gaza
read [(u'gaza', 5420), (u'israel', 1381), (u'palestinian', 974), (u'hama', 959), (u'isra', 722), (u'war', 395), (u'egypt', 300), (u'des', 243), (u'les', 243), (u'dan', 211)]
heur [(u'gaza', 5505), (u'israel', 1368), (u'hama', 970), (u'palestinian', 968), (u'isra', 654), (u'war', 396), (u'egypt', 301), (u'des', 275), (u'les', 268), (u'dan', 215)]
*:* all  - fcc
read [(u'fcc', 1048), (u'con', 172), (u'por', 137), (u'koplowitz', 126), (u'carlo', 108), (u'los', 97), (u'internet', 94), (u'euro', 91), (u'una', 80), (u'esther', 77)]
heur [(u'fcc', 1029), (u'con', 171), (u'por', 139), (u'koplowitz', 126), (u'carlo', 112), (u'euro', 95), (u'los', 95), (u'internet', 92), (u'para', 83), (u'las', 79)]
*:* all  - facebook
read [(u'facebook', 25282), (u'twitter', 3761), (u'para', 960), (u'googl', 954), (u'instagram', 862), (u'por', 846), (u'media', 731), (u'sur', 728), (u'foto', 614), (u'con', 607)]
heur [(u'facebook', 25448), (u'twitter', 3622), (u'para', 1048), (u'googl', 980), (u'por', 876), (u'instagram', 830), (u'sur', 769), (u'media', 733), (u'logger', 707), (u'los', 634)]

In [ ]:
print df.to_string(columns=['name', 'distance', 'non_dup_words', 'heuristic sent', 'readabilty sent'])

In [ ]:
stories_ids = []

last_processed_stories_id = 0

story_sample_size = 600

while ( len ( stories_ids ) <= story_sample_size ):
    
    print 'last_processed_stories_id', last_processed_stories_id
    
    stories = mc_heuristic.storyList( solr_query='tags_id_media:8875027', 
                                     last_processed_stories_id=last_processed_stories_id, rows=100 )

    if len( stories ) == 0:
        break;
        
    stories_ids.extend( [ s['stories_id'] for s in stories ] )
    
    print len(stories_ids)
    
    last_processed_stories_id = stories[-1]['processed_stories_id']

distances = []

stories_distances = {}
for stories_id in stories_ids:
    if len( distances ) % 10 == 0:
        print 'obtained cosine distances for ', len( distances), 'stories'
        
    print stories_id
    
    result = compare_words_counts_stories_id( stories_id )

    if result is None:
        print 'skipping empty story', stories_id
    else:
        distances.append( result['distance'] )
        stories_distances[ stories_id ] = result['distance']

In [526]:
len( distances )
#len( stories_ids )


Out[526]:
700

In [ ]:
import math
import matplotlib

# Ascending list of the per-story distances, ready for plotting. NaN values
# are dropped and each remaining value is clamped at 0 from below.
sorted_distances = sorted( [ max( float(d), 0 ) for d in distances if not math.isnan(d)] )
sorted_distances

In [ ]:
print len(sorted_distances )
plt.plot(  np.arange( len( sorted_distances ) ), sorted_distances )
plt.xlim( [0, len( sorted_distances ) ] )
xgridlines = getp(gca(), 'xgridlines')
ygridlines = getp(gca(), 'ygridlines')
plt.setp(xgridlines, 'linestyle', '-')
plt.setp(ygridlines, 'linestyle', '-')
plt.grid()
#matplotlib.pyplot.hist( distances, bins=1000, cumulative=True, normed=True  )

In [ ]:
# Save the per-story distances so they can be reloaded without re-querying.
# The with-block closes (and therefore flushes) the file once the dump is done.
with open( os.path.expanduser( '~/Dropbox/mc/extractor_test/stories_distances.pickle' ), "wb" ) as pickle_file:
    cPickle.dump( stories_distances, pickle_file )

In [ ]:
# Reload the per-story distances written by the dump cell. The path is given
# relative to the home directory, exactly as in the dump, so the notebook is
# not tied to one user's account.
# NOTE: unpickling executes code from the file; only load pickles you created.
with open( os.path.expanduser( '~/Dropbox/mc/extractor_test/stories_distances.pickle' ), "rb" ) as pickle_file:
    stories_distances = cPickle.load( pickle_file )

In [ ]:
print len( stories_distances )
print len ( [ stories_id for stories_id in stories_distances.keys() if stories_distances[ stories_id]  > 0.4  ] )

high_diff_stories = [ stories_id for stories_id in stories_distances.keys() if stories_distances[ stories_id]  > 0.4  ]
high_diff_stories

In [527]:
len( high_diff_stories )


Out[527]:
60

In [ ]:
# One summary row per high-difference story: its url, its distance and the
# number of sentences each extractor kept.
to_print = []
for stories_id in high_diff_stories:
    # sentences=True is passed the same way as in the single-story cells
    # below; presumably it is what makes 'story_sentences' part of the
    # response - TODO confirm against the API client.
    story = mc_heuristic.story( stories_id, sentences=True )
    story_readability  = mc_readability.story( stories_id, sentences=True )

    to_print.append( { 'stories_id': stories_id,'url': story['url'],
                      'story_distance': stories_distances[ stories_id ],
                      'sen heur': len( story['story_sentences'] ),
                      'sen read': len( story_readability[ 'story_sentences' ] ),
                      } )

In [ ]:
import pandas

# Uses its own name instead of rebinding `df`, which holds the comparison
# results that another cell prints by column name.
high_diff_df = pandas.DataFrame( to_print )
# Copy the table to the system clipboard.
high_diff_df.to_clipboard()

In [517]:
stories_id = u'234506'

compare_words_counts_stories_id( stories_id )

solr_query = 'stories_id:' + stories_id
filter_query = ''
sample_size = 100000
counts_heuristic = mc_heuristic.wordCount(solr_query=solr_query,  solr_filter=filter_query,
                                                  sample_size=sample_size, num_words = 1000 )
counts_readability = mc_readability.wordCount(solr_query=solr_query, solr_filter=filter_query,
                                                      sample_size=sample_size, num_words = 1000 )
    
print repr( counts_heuristic )
print '--'
print repr( counts_readability )
counts_readability
story = mc_heuristic.story( stories_id , sentences=True)


[{u'count': 2, u'term': u'discover', u'stem': u'discov'}, {u'count': 2, u'term': u'facts', u'stem': u'fact'}, {u'count': 2, u'term': u'hunger', u'stem': u'hunger'}, {u'count': 1, u'term': u'translation', u'stem': u'translat'}, {u'count': 1, u'term': u'blockbuster', u'stem': u'blockbust'}, {u'count': 1, u'term': u'ugandan', u'stem': u'ugandan'}, {u'count': 1, u'term': u'hobbit', u'stem': u'hobbit'}, {u'count': 1, u'term': u'pinpoints', u'stem': u'pinpoint'}, {u'count': 1, u'term': u'potter', u'stem': u'potter'}, {u'count': 1, u'term': u'arena', u'stem': u'arena'}, {u'count': 1, u'term': u'filtering', u'stem': u'filter'}, {u'count': 1, u'term': u'royale', u'stem': u'royal'}, {u'count': 1, u'term': u'harry', u'stem': u'harri'}, {u'count': 1, u'term': u'wanderlust', u'stem': u'wanderlust'}, {u'count': 1, u'term': u'oscar', u'stem': u'oscar'}, {u'count': 1, u'term': u'buckinghamshire', u'stem': u'buckinghamshir'}, {u'count': 1, u'term': u'gmt', u'stem': u'gmt'}, {u'count': 1, u'term': u'trivia', u'stem': u'trivia'}, {u'count': 1, u'term': u'genre', u'stem': u'genr'}, {u'count': 1, u'term': u'tokyo', u'stem': u'tokyo'}, {u'count': 1, u'term': u'adventurous', u'stem': u'adventur'}, {u'count': 1, u'term': u'zoom', u'stem': u'zoom'}, {u'count': 1, u'term': u'casino', u'stem': u'casino'}, {u'count': 1, u'term': u'interactive', u'stem': u'interact'}, {u'count': 1, u'term': u'zealand', u'stem': u'zealand'}, {u'count': 1, u'term': u'enchanted', u'stem': u'enchant'}, {u'count': 1, u'term': u'mockingjay', u'stem': u'mockingjay'}]
--
[{u'count': 1, u'term': u'blockbusters', u'stem': u'blockbust'}, {u'count': 1, u'term': u'hunger', u'stem': u'hunger'}, {u'count': 1, u'term': u'jurassic', u'stem': u'jurass'}, {u'count': 1, u'term': u'interactive', u'stem': u'interact'}]

In [518]:
story['url']


Out[518]:
u'http://telegraph.feedsportal.com/c/32726/f/568348/s/40cf0436/sc/38/l/0L0Stelegraph0O0Cculture0Cfilm0C11250A10A10CAround0Ethe0Eworld0Ein0E30A0A0Efilms0Bhtml/story01.htm'

In [519]:
[ s['sentence'] for s in story['story_sentences'] ]
#mc_heuristic.media( story['media_id'] )


Out[519]:
[u'Discover the Hunger Games arena and more famous locations in film with a new interactive map',
 u'7:00AM GMT 25 Nov 2014',
 u'Inspired by some of the most famous settings in film \u2013 what New Zealand is to The Hobbit, or what Tokyo is to Lost in Translation \u2013 the map pinpoints the locations behind 300 of the most successful films in history.',
 u'Hunger Games: Mockingjay Part 1 , allowing you to brush up on film trivia just in time for Oscar season.',
 u'Get the most out of the map by filtering according to genre or year.',
 u'Or, if you\u2019re feeling adventurous, simply zoom in on a location to discover which blockbuster was filmed there.',
 u'Your wanderlust could reveal some surprising facts: who knew that the Ugandan jungle scenes in Casino Royale were in fact filmed in Buckinghamshire?',
 u'Or that the location was also used for the enchanted forest in Harry Potter?',
 u'Latest and breaking stories from the United States',
 u'Catch up on all the latest football news']

In [520]:
story_readability = mc_readability.story( stories_id , sentences=True)
[ s['sentence'] for s in story_readability['story_sentences'] ]


Out[520]:
[u'Around the world in 300 films.',
 u'Interactive: See where the biggest blockbusters were made, from Hunger Games to Jurassic Park']

In [521]:
response = mc_heuristic.sentenceList( solr_query )
response = response['response']
response


Out[521]:
{u'docs': [{u'_version_': 1491657164351078407,
   u'media_id': 1750,
   u'medium_name': u'Daily Telegraph',
   u'processed_stories_id': 44401,
   u'publish_date': u'2014-11-25 07:00:52',
   u'sentence': u'Discover the Hunger Games arena and more famous locations in film with a new interactive map',
   u'sentence_number': 0,
   u'solr_id': u'234506!1049417',
   u'stories_id': 234506,
   u'story_sentences_id': u'1049417',
   u'url': u'http://telegraph.feedsportal.com/c/32726/f/568348/s/40cf0436/sc/38/l/0L0Stelegraph0O0Cculture0Cfilm0C11250A10A10CAround0Ethe0Eworld0Ein0E30A0A0Efilms0Bhtml/story01.htm'},
  {u'_version_': 1491657164351078408,
   u'media_id': 1750,
   u'medium_name': u'Daily Telegraph',
   u'processed_stories_id': 44401,
   u'publish_date': u'2014-11-25 07:00:52',
   u'sentence': u'7:00AM GMT 25 Nov 2014',
   u'sentence_number': 1,
   u'solr_id': u'234506!1049418',
   u'stories_id': 234506,
   u'story_sentences_id': u'1049418',
   u'url': u'http://telegraph.feedsportal.com/c/32726/f/568348/s/40cf0436/sc/38/l/0L0Stelegraph0O0Cculture0Cfilm0C11250A10A10CAround0Ethe0Eworld0Ein0E30A0A0Efilms0Bhtml/story01.htm'},
  {u'_version_': 1491657164351078409,
   u'media_id': 1750,
   u'medium_name': u'Daily Telegraph',
   u'processed_stories_id': 44401,
   u'publish_date': u'2014-11-25 07:00:52',
   u'sentence': u'Inspired by some of the most famous settings in film \u2013 what New Zealand is to The Hobbit, or what Tokyo is to Lost in Translation \u2013 the map pinpoints the locations behind 300 of the most successful films in history.',
   u'sentence_number': 2,
   u'solr_id': u'234506!1049419',
   u'stories_id': 234506,
   u'story_sentences_id': u'1049419',
   u'url': u'http://telegraph.feedsportal.com/c/32726/f/568348/s/40cf0436/sc/38/l/0L0Stelegraph0O0Cculture0Cfilm0C11250A10A10CAround0Ethe0Eworld0Ein0E30A0A0Efilms0Bhtml/story01.htm'},
  {u'_version_': 1491657164352126976,
   u'media_id': 1750,
   u'medium_name': u'Daily Telegraph',
   u'processed_stories_id': 44401,
   u'publish_date': u'2014-11-25 07:00:52',
   u'sentence': u'Hunger Games: Mockingjay Part 1 , allowing you to brush up on film trivia just in time for Oscar season.',
   u'sentence_number': 3,
   u'solr_id': u'234506!1049420',
   u'stories_id': 234506,
   u'story_sentences_id': u'1049420',
   u'url': u'http://telegraph.feedsportal.com/c/32726/f/568348/s/40cf0436/sc/38/l/0L0Stelegraph0O0Cculture0Cfilm0C11250A10A10CAround0Ethe0Eworld0Ein0E30A0A0Efilms0Bhtml/story01.htm'},
  {u'_version_': 1491657164352126977,
   u'media_id': 1750,
   u'medium_name': u'Daily Telegraph',
   u'processed_stories_id': 44401,
   u'publish_date': u'2014-11-25 07:00:52',
   u'sentence': u'Get the most out of the map by filtering according to genre or year.',
   u'sentence_number': 4,
   u'solr_id': u'234506!1049421',
   u'stories_id': 234506,
   u'story_sentences_id': u'1049421',
   u'url': u'http://telegraph.feedsportal.com/c/32726/f/568348/s/40cf0436/sc/38/l/0L0Stelegraph0O0Cculture0Cfilm0C11250A10A10CAround0Ethe0Eworld0Ein0E30A0A0Efilms0Bhtml/story01.htm'},
  {u'_version_': 1491657164352126978,
   u'media_id': 1750,
   u'medium_name': u'Daily Telegraph',
   u'processed_stories_id': 44401,
   u'publish_date': u'2014-11-25 07:00:52',
   u'sentence': u'Or, if you\u2019re feeling adventurous, simply zoom in on a location to discover which blockbuster was filmed there.',
   u'sentence_number': 5,
   u'solr_id': u'234506!1049422',
   u'stories_id': 234506,
   u'story_sentences_id': u'1049422',
   u'url': u'http://telegraph.feedsportal.com/c/32726/f/568348/s/40cf0436/sc/38/l/0L0Stelegraph0O0Cculture0Cfilm0C11250A10A10CAround0Ethe0Eworld0Ein0E30A0A0Efilms0Bhtml/story01.htm'},
  {u'_version_': 1491657164352126979,
   u'media_id': 1750,
   u'medium_name': u'Daily Telegraph',
   u'processed_stories_id': 44401,
   u'publish_date': u'2014-11-25 07:00:52',
   u'sentence': u'Your wanderlust could reveal some surprising facts: who knew that the Ugandan jungle scenes in Casino Royale were in fact filmed in Buckinghamshire?',
   u'sentence_number': 6,
   u'solr_id': u'234506!1049423',
   u'stories_id': 234506,
   u'story_sentences_id': u'1049423',
   u'url': u'http://telegraph.feedsportal.com/c/32726/f/568348/s/40cf0436/sc/38/l/0L0Stelegraph0O0Cculture0Cfilm0C11250A10A10CAround0Ethe0Eworld0Ein0E30A0A0Efilms0Bhtml/story01.htm'},
  {u'_version_': 1491657164352126980,
   u'media_id': 1750,
   u'medium_name': u'Daily Telegraph',
   u'processed_stories_id': 44401,
   u'publish_date': u'2014-11-25 07:00:52',
   u'sentence': u'Or that the location was also used for the enchanted forest in Harry Potter?',
   u'sentence_number': 7,
   u'solr_id': u'234506!1049424',
   u'stories_id': 234506,
   u'story_sentences_id': u'1049424',
   u'url': u'http://telegraph.feedsportal.com/c/32726/f/568348/s/40cf0436/sc/38/l/0L0Stelegraph0O0Cculture0Cfilm0C11250A10A10CAround0Ethe0Eworld0Ein0E30A0A0Efilms0Bhtml/story01.htm'},
  {u'_version_': 1491657164352126981,
   u'media_id': 1750,
   u'medium_name': u'Daily Telegraph',
   u'processed_stories_id': 44401,
   u'publish_date': u'2014-11-25 07:00:52',
   u'sentence': u'Latest and breaking stories from the United States',
   u'sentence_number': 8,
   u'solr_id': u'234506!1049425',
   u'stories_id': 234506,
   u'story_sentences_id': u'1049425',
   u'url': u'http://telegraph.feedsportal.com/c/32726/f/568348/s/40cf0436/sc/38/l/0L0Stelegraph0O0Cculture0Cfilm0C11250A10A10CAround0Ethe0Eworld0Ein0E30A0A0Efilms0Bhtml/story01.htm'},
  {u'_version_': 1491657160733491203,
   u'media_id': 1750,
   u'medium_name': u'Daily Telegraph',
   u'processed_stories_id': 44401,
   u'publish_date': u'2014-11-25 07:00:52',
   u'sentence': u'Catch up on all the latest football news',
   u'sentence_number': 9,
   u'solr_id': u'234506!1049426',
   u'stories_id': 234506,
   u'story_sentences_id': u'1049426',
   u'url': u'http://telegraph.feedsportal.com/c/32726/f/568348/s/40cf0436/sc/38/l/0L0Stelegraph0O0Cculture0Cfilm0C11250A10A10CAround0Ethe0Eworld0Ein0E30A0A0Efilms0Bhtml/story01.htm'},
  {u'_version_': 1491658958446788610,
   u'media_id': None,
   u'medium_name': None,
   u'processed_stories_id': 44401,
   u'publish_date': None,
   u'sentence_number': None,
   u'solr_id': u'234506!0',
   u'stories_id': 234506,
   u'story_sentences_id': u'0',
   u'title': u'Around the world in 300 films',
   u'url': None}],
 u'numFound': 11,
 u'start': 0}

In [522]:
stories_distances[stories_id]


Out[522]:
0.66666666666666674

In [523]:
solr_query='stories_id:' + stories_id

In [524]:
a = array( [1,1,1,1,1,1,1] )
b = array( [1,1,1,1,1,0,0] )
scipy.spatial.distance.cosine( a, b )


Out[524]:
0.15484574527148354

In [525]:
all_media = []

last_media_id = 0

while True:
    media = mc.mediaList( last_media_id=last_media_id, rows=1000)
    print last_media_id, len( media ), len( all_media )

    if len(media) == 0:
        break
        
    last_media_id = media[-1]['media_id']
    last_media_id
    

        
    all_media.extend(media)
    
    if len( all_media ) > 200:
        break

len(all_media)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-525-8376a35c28e6> in <module>()
      4 
      5 while True:
----> 6     media = mc.mediaList( last_media_id=last_media_id, rows=1000)
      7     print last_media_id, len( media ), len( all_media )
      8 

NameError: name 'mc' is not defined

In [ ]:
def fetch_1000_stories_id_from_query( solr_query, client=None ):
    """Return the story ids of up to 1000 randomly sorted matching sentences.

    The query is restricted to ``sentence_number:0``, presumably so that each
    story is represented by a single row - TODO confirm.

    Args:
        solr_query: Solr query selecting the stories of interest.
        client: Media Cloud API client to query. When omitted, the
            module-level ``mc_chloe`` client is used.
            NOTE(review): the original body referenced a global ``mc`` that
            is not defined in the visible notebook; mc_chloe is an assumed
            replacement - confirm.

    Returns:
        List with the 'stories_id' of every returned sentence document.
    """
    if client is None:
        client = mc_chloe

    q = "+sentence_number:0 AND {} ".format( solr_query )
    d = client.sentenceList( solr_query=q,
                             sort=client.SORT_RANDOM, rows=1000 )
    return [ s['stories_id'] for s in d['response']['docs'] ]

fetch_1000_stories_id_from_query( '+publish_date:[2014-05-10T00:00:00Z TO 2014-05-21T00:00:00Z} AND media_id:1 ' )

In [ ]:
# Number of (stories_id, publish_date) pairs in a sentenceList response `d`.
# NOTE(review): in the visible cells `d` is only assigned inside
# fetch_1000_stories_id_from_query, so it may not exist at notebook level - confirm.
len([ (s['stories_id'], s['publish_date']) for s in d['response']['docs'] ])

In [ ]:
mc.story( 243384113 )