Candidate Generator

Jiarui Xu - jxu57@illinois.edu


In [1]:
import json
import pyprind
import sys
import pickle

In [11]:
# Root directory of the wikifier data. Every path below is derived from it, so
# relocating the data set only requires editing this single line.
data_folder = "/Volumes/backup/ccg_tweet_wikifier_data/"
# Wikidata JSON dump (the file name suggests the 2016-04-04 snapshot).
wikidata_file = data_folder+"wikidata/wikidata-20160404-all.json"
# Path of the entity/alias text file.
entity_alias_output_file = data_folder+"wikidata/entity_alias.txt"
# Path of the pickle for the lower-cased alias -> entity mapping.
alias2entity_lower = data_folder+"wikidata/alias2entity_lower.pickle"
# Tab-separated alias -> entities text file; this is the file passed to alias_mapping().
entity_alias_output_txt_file = data_folder+"wikidata/alias2entity.txt"

In [4]:
# Deserialize the tweet corpus from its pickle file.
# NOTE(review): pickle.load executes arbitrary code from the file; only use a trusted file.
with open(data_folder+"Tweet/NEEL_tweets(with_grams).pickle", "rb") as corpus_file:
    tweet_corpus = pickle.load(corpus_file)

In [19]:
def alias_mapping(src_file, line_count=12081825):
    """
    Build a mapping from alias to the set of entities it can refer to.

    Each line of ``src_file`` is split on tabs: the first field is the alias
    and every remaining field is an entity for that alias.

    Args:
        src_file: path of the tab-separated alias file.
        line_count: expected number of lines in ``src_file``; only used to
            size the progress bar. Defaults to the previously hard-coded
            value, so existing callers behave as before.

    Returns:
        dict mapping each alias to a set of its entities.
    """
    # for progress bar
    bar = pyprind.ProgBar(line_count, width=70, monitor = True)

    a2e = {}

    with open(src_file, "rb") as f:
        for line in f:
            bar.update()
            segments = line.strip().split("\t")
            alias = segments[0]
            if not alias:
                # blank line: would otherwise create a bogus "" alias
                continue
            # merge rather than overwrite, so an alias that appears on several
            # lines keeps the entities from all of them
            a2e.setdefault(alias, set()).update(segments[1:])
    return a2e

In [20]:
# Load the alias -> entity-set mapping from the alias2entity text file
# (the recorded run of this cell took just under two minutes).
alias_to_entity = alias_mapping(entity_alias_output_txt_file)


0%                                                                  100%
[######################################################################] | ETA: 00:00:00
Total time elapsed: 00:01:51

In [21]:
# Case-insensitive index: lower-cased alias -> union of the entity sets of
# every alias that lower-cases to it.
alias_to_entity_lower = {}
for als in alias_to_entity:
    als_lower = als.lower()
    if als_lower in alias_to_entity_lower:
        alias_to_entity_lower[als_lower] |= alias_to_entity[als]
    else:
        # Store a copy: sharing the set object would let the in-place |= above
        # silently add entities to alias_to_entity's own sets.
        alias_to_entity_lower[als_lower] = set(alias_to_entity[als])

In [24]:
# Report every gold mention whose lower-cased form has no entry in the
# case-insensitive alias index, together with its gold wiki title.
for tweet in tweet_corpus.values():
    for gold in tweet['goldens']:
        mention = gold['mention']
        if mention.lower() in alias_to_entity_lower:
            continue  # alias is known; nothing to report
        print("========")
        print(mention)
        print(gold['wiki_title'])


========
chrisbharrison
Chris_Harrison
========
TheBachelor
The_Bachelor_(U.S._TV_series)
========
dangeroussummer
The_Dangerous_Summer_(band)
========
Twentieth Century Fox Animation
Fox_Animation_Studios
========
CCBL championship series
Cape_Cod_Baseball_League
========
Shuhardi Mat Isa
Nasharudin_Mat_Isa
========
Boehner
John_Boehner
========
DE Jarvis Moss
Jarvis_Moss
========
Judge Mark Ciavarella, Jr
Mark_Ciavarella
========
Joe B
Joe_B._Hall
========
SLAMonline.com
SLAM_Magazine
========
SLAMnewswire
SLAM_Magazine
========
Roger Cheng
Roger_Chang
========
T-Mobile
T-Mobile
========
PiersMorgan
Piers_Morgan
========
AJELive
Al_Jazeera
========
Co Mayo
County_Mayo
========
bbcwm
BBC_WM
========
brmb
Free_Radio_Birmingham
========
Harry Potter and the Deathly Hallows Part II
Harry_Potter_and_the_Deathly_Hallows_%e2%80%93_Part_2
========
broadcaster NRK
NRK
========
southern U.S.
Southern_United_States
========
justinbieber
Justin_Bieber
========
JohnCena
John_Cena
========
Long Island, NY
Long_Island
========
washingtonpost
The_Washington_Post
========
LAGalaxy
Los_Angeles_Galaxy
========
Lord Sugar
Alan_Sugar
========
ebertchicago
Roger_Ebert
========
chelseafc
Chelsea_F.C.
========
Miami Beach Police
Miami_Beach_Police_Department
========
Miami Beach Police Departm
Miami_Beach_Police_Department
========
WhiteSox
Chicago_White_Sox
========
Congressman Bill Johnson
Bill_Johnson_(Ohio_politician)
========
Good Weekend
The_Sydney_Morning_Herald
========
Pep Guardiola
Pep_Guardiola
========
Utøya shooter
Anders_Behring_Breivik
========
Joey7Barton
Joey_Barton
========
House Rules Committee Chairman Dreier
David_Dreier
========
US House Republicans
Republican_Party_(United_States)
========
Houston Business Journal
American_City_Business_Journals
========
RevRunWisdom
Joseph_Simmons
========
TNWapple
The_Next_Web
========
Speaker Boehner
John_Boehner
========
www.dailymail.co.uk
Daily_Mail
========
Swish bank UBS
Union_Bank_of_Switzerland
========
plaxico
Plaxico_Burress
========
Adli the former minister of Interior
Habib_el-Adly
========
Venice festival
Venice_Film_Festival
========
north Belfast
Belfast_North_(UK_Parliament_constituency)
========
Faleomavaega
Eni_Faleomavaega
========
BarackObama
Barack_Obama
========
ICONic Boyz
America%27s_Best_Dance_Crew_(season_6)
========
MTVNews
MTV_News
========
ussoccer_wnt
United_States_women%27s_national_soccer_team
========
USWNT
United_States_women%27s_national_soccer_team
========
Man City
Manchester_City_F.C.
========
enews
E!_News
========
Blue Bombers
Winnipeg_Blue_Bombers
========
Umno Youth
United_Malays_National_Organisation
========
Selangor government
Selangor
========
S'gor MB
Abdul_Khalid_Ibrahim
========
South of Border
Mexico
========
The Godfather: Part II
The_Godfather_Part_II
========
ScottyMcCreery
Scotty_McCreery
========
USMNT
United_States_men%27s_national_soccer_team
========
Jurgen Klinsmann
J%c3%bcrgen_Klinsmann
========
alexmorgan13
Alex_Morgan
========
Vermaelen
Thomas_Vermaelen
========
Szczesny
Wojciech_Szcz%c4%99sny
========
President Obama
Barack_Obama
========
Jaguars
Jacksonville_Jaguars
========
The Duchess of Cornwall
Duchess_of_Cornwall
========
iPods
IPod
========
ddlovato
Demi_Lovato
========
Amy Whinehouse
Amy_Winehouse
========
AlanCarr
Alan_Carr
========
Sainsburys
Sainsbury%27s
========
SkyNews
Sky_News
========
owlcity
Owl_City
========
Harry Potter film
Harry_Potter_(film_series)
========
OpPaypal
PayPal
========
BigSean
Big_Sean
========
Harry Potter and the Deathly Hallows part 2
Harry_Potter_and_the_Deathly_Hallows_%e2%80%93_Part_2
========
apprenticefinal
The_Apprentice_(UK_TV_series)
========
DavidShuster
David_Shuster
========
justinbieber
Justin_Bieber
========
TheNextWebME
The_Next_Web
========
Dovizioso
Andrea_Dovizioso
========
Bieber Fever
Justin_Bieber
========
UK Housing Minister
Ministry_of_Housing_and_Local_Government
========
Gadhafi's Son
Saif_al-Arab_Gaddafi
========
The Prisoner of the Chamber
Harry_Potter_and_the_Chamber_of_Secrets
========
thenextwebuk
The_Next_Web
========
JayZ
Jay-Z
========
How to Succeed in Business
How_to_Succeed_in_Business_Without_Really_Trying
========
Hemelhempstead
Hemel_Hempstead
========
Womens World Cup
Women%27s_National_Basketball_Association
========
106andpark
106_%26_Park
========
Facebook.
Facebook
========
Facebook.
Facebook
========
Deitrick
Campus_of_Virginia_Tech
========
AngelaSimmons
Run%27s_House
========
LAGalaxy
Los_Angeles_Galaxy
========
CMPunk
CM_Punk
========
twilight movie
The_Twilight_Saga_(film_series)
========
ArcelorMi
ArcelorMittal
========
newtgingrich
Newt_Gingrich
========
L.A
Los_Angeles
========
PondersEnd
Ponders_End
========
Wathamstow
Walthamstow
========
PalmersGreen
Palmers_Green
========
AllisonIraheta
Allison_Iraheta
========
adamlambert
Adam_Lambert
========
President Obama
Barack_Obama
========
Petr Cech
Petr_%c4%8cech
========
TheNextWebME
The_Next_Web
========
Holland, MI
Holland%2c_Michigan
========
President Obama
Barack_Obama
========
arusbridger
Alan_Rusbridger
========
St. John Fisher
John_Fisher
========
Luka Modric
Luka_Modri%c4%87
========
TNWmicrosoft
The_Next_Web
========
Met police commissioner Sir Paul Stephenson
Paul_Stephenson_(police_officer)
========
E.A. Poe
Edgar_Allan_Poe
========
Matt_HayesSN
Matt_Hayes
========
Dietrick
Campus_of_Virginia_Tech
========
Former Ravens S Dawan Landry
Dawan_Landry
========
Jaguars
Jacksonville_Jaguars
========
viatumblr
Tumblr
========
Packers
Green_Bay_Packers
========
KeithClark_
Keith_Clark
========
Boehner
John_Boehner
========
jamesmaslow
James_Maslow
========
justinbieber
Justin_Bieber
========
N.Korea
North_Korea
========
ICameron
David_Cameron
========
Oxlade-Chamberlain
Alex_Oxlade-Chamberlain
========
Sergio Aguero
Sergio_Ag%c3%bcero
========
Salford/city
City_of_Salford
========
http://CNNMoney.com
CNNMoney.com
========
DR Cromatrie
Dominique_Rodgers-Cromartie
========
US gov
Federal_government_of_the_United_States
========
Smithdown Road area of Liverpool
Smithdown_Road%2c_Liverpool
========
Boehner
John_Boehner
========
Conan O’Brien
Conan_O%27Brien
========
USWNT
United_States_women%27s_national_soccer_team
========
MeaganGood
Meagan_Good
========
Syrian capital, Damascus
Damascus
========
Donahoe
John_Donahoe
========
austin360
Austin_American-Statesman
========
Man City
Manchester_City_F.C.
========
Tevez
Carlos_T%c3%a9vez
========
Ho Chi Minh City, Vietnam
Ho_Chi_Minh_City
========
ChuckNorriz
Chuck_Norris
========
virginiatech
Virginia_Tech
========
Britain's Capital city
London
========
Senator Reid
Harry_Reid
========
Canon Rebel series
Canon_EOS
========
Canon Rebel EOS
Canon_EOS
========
Sen. Sanders
Bernie_Sanders
========
Sheehey
Will_Sheehey
========
Hulls
Jordan_Hulls
========
RB Reggie Bush
Reggie_Bush
========
J.K Rowling
J._K._Rowling
========
John C Maxwell
John_C._Maxwell
========
WR Michael Crabtree
Michael_Crabtree
========
Aden, Yemen
Aden
========
johncena
John_Cena
========
DavidCornDC
David_Corn
========
Nick Boles
Nicholas_Boles
========
Mateu Lahoz
Antonio_Mateu_Lahoz
========
mumford and sons
Mumford_%26_Sons
========
jasonterry31
Jason_Terry
========
swish41
Dirk_Nowitzki
========
ChadMMurray
Chad_Michael_Murray
========
prayfor
Norway
========
Jay_Glazer
Jay_Glazer
========
DE Jason Babin
Jason_Babin
========
coach Jim Washburn
Jim_Washburn
========
al-Assad
Bashar_al-Assad
========
GlobeandMail
The_Globe_and_Mail
========
Steve Gostkowksi
Stephen_Gostkowski
========
HarryPotter and the Deathly Hallows part 2
Harry_Potter_and_the_Deathly_Hallows_%e2%80%93_Part_2
========
chelseafc
Chelsea_F.C.
========
Chelsea squad
Chelsea_F.C.
========
Udinese
Udinese_Calcio
========
HP7.2
Harry_Potter_and_the_Deathly_Hallows_%e2%80%93_Part_2
========
SkySportsNews
Sky_Sports_News
========
LB Michael Boley
Michael_Boley
========
North Wazirirstan
North_Waziristan
========
His Holiness the Dalai Lama
14th_Dalai_Lama
========
President Barack Obama
Barack_Obama
========
Deathly Hallows Part 2
Harry_Potter_and_the_Deathly_Hallows_%e2%80%93_Part_2
========
Harry P
Harry_Potter_(film_series)
========
el periodico
El_Peri%c3%b3dico_de_Catalunya
========
Cesc Fabregas
Cesc_F%c3%a0bregas
========
Canucks
Vancouver_Canucks
========
N.O.W
News_of_the_World
========
Jaguars
Jacksonville_Jaguars
========
Bruno Mars
Bruno_Mars
========
JohnKingUSA
John_King_(journalist)
========
president al-Assad
Bashar_al-Assad
========
Jankovic
Jelena_Jankovi%c4%87
========
FrankieTheSats
Frankie_Sandford
========
WWEmagazine
WWE_Magazine
========
Kun Aguero
Sergio_Ag%c3%bcero
========
CYBEX
Cybex_International
========
Juwanna Man
Juwanna_Mann
========
lil Kim
Lil%27_Kim
========
Villas-Boas
Andr%c3%a9_Villas-Boas
========
Chelsea boss Andre Villas-Boas
Andr%c3%a9_Villas-Boas
========
Tim Ferriss
Timothy_Ferriss
========
President Obama
Barack_Obama
========
J.K. Rowling
J._K._Rowling
========
Jose Enrique
Jos%c3%a9_Enrique_S%c3%a1nchez
========
NBC Miami
WTVJ
========
Bismarck/Mandan
Bismarck%e2%80%93Mandan
========
Apple Retail Stores
Apple_Store
========
ParisHilton
Paris_Hilton
========
Riteishd
Riteish_Deshmukh
========
kurt coban
Kurt_Cobain
========
jimmi hendrix
Jimi_Hendrix
========
US news
U.S._News_%26_World_Report
========
HadleyFreeman
Hadley_Freeman
========
R Broyles
Ryan_Broyles
========
US Secretary of State Clinton
Hillary_Rodham_Clinton
========
Assad
Bashar_al-Assad
========
anonymouSabu
Hector_Xavier_Monsegur
========
Mirror.co.uk
Daily_Mirror
========
TNWinsider
The_Next_Web
========
TNW Insider
The_Next_Web
========
GovernorPerry
Rick_Perry
========
FoxNews
Fox_News_Channel
========
RickPerry
Rick_Perry
========
Smaalenene
Smaalenenes_Avis
========
thewantedmusic
The_Wanted
========
MetPolice
Metropolitan_Police_Service
========
JoelOsteen
Joel_Osteen
========
NFLN
NFL_Network
========
michaelombardi
Michael_Lombardi_(American_football)
========
Met Police
Metropolitan_Police_Service
========
TheChew
The_Chew
========
EWErickson
Erick_Erickson
========
Charleston, SC.
Charleston%2c_South_Carolina
========
MardyFish
Mardy_Fish
========
DjokerNole
Novak_Djokovic
========
Clydesdale Bank 40
ECB_40
========
Twilight Breaking dawn
The_Twilight_Saga%3a_Breaking_Dawn_%e2%80%93_Part_1
========
Glee 3D
Glee%3a_The_3D_Concert_Movie
========
ESPNSoccernet
ESPN_FC
========
Stoke City
Stoke_City_F.C.
========
Europa League
UEFA_Europa_League
========
Hajduk Split
HNK_Hajduk_Split
========
Indianapolis Department of Public Safety
Indianapolis_Metropolitan_Police_Department
========
realmadrid
Real_Madrid_C.F.
========
D. McNabb
Donovan_McNabb
========
CSLEWIS
C._S._Lewis
========
US national team
United_States_men%27s_national_soccer_team
========
Jurgen Klinsmann
J%c3%bcrgen_Klinsmann
========
justinbieber
Justin_Bieber
========
Dana Ave in Hyde Park
Hyde_Park%2c_Boston
========
Stoke City
Stoke_City_F.C.
========
chelseafc
Chelsea_F.C.
========
www.chelseafc.com
Chelsea_F.C.
========
President Obama
Barack_Obama
========
Man City
Manchester_City_F.C.
========
BBCWorld
BBC_World_News
========
President Barack Obama
Barack_Obama
========
World of Warc
World_of_Warcraft
========
President Obama
Barack_Obama
========
U.S. military
United_States_Armed_Forces
========
Luis Suarez
Luis_Su%c3%a1rez
========
mchammer
MC_Hammer
========
amy whinehouse
Amy_Winehouse
========
Junhyung
Yong_Jun-hyung
========
Erik  Bedard
%c3%89rik_B%c3%a9dard
========
pee wee Herman
Pee-wee_Herman
========
Ohi plant
%c5%8ci_Nuclear_Power_Plant
========
Ohi nuclear plant
%c5%8ci_Nuclear_Power_Plant
========
NFLPA
National_Football_League_Players_Association
========
Carmageddon Race
Carmageddon
========
Health Shuler
Heath_Shuler
========
Ferb
Ferb_Fletcher
========
EmWatson
Emma_Watson
========
Dan Radcliffe
Daniel_Radcliffe
========
TomFelton
Tom_Felton
========
HarryPotter
Harry_Potter_(film_series)
========
Alvaro Pereira
%c3%81lvaro_Pereira
========
UPSHelp
United_Parcel_Service
========
Apple Stores
Apple_Store
========
Murdochs
Rupert_Murdoch
========
bynickdavies
Nick_Davies
========
newsbrooke
Heather_Brooke
========
chairman Ed Snider
Ed_Snider
========
Comcast-Spectacor
Comcast_Spectacor
========
Maruti Suzuki India
Maruti_Suzuki
========
NFLPA
National_Football_League_Players_Association
========
AndersBBreivik
Anders_Behring_Breivik
========
Murdochs
Rupert_Murdoch
========
Skysports.com
Sky_Sports
========
Sept 20
September_20
========
JoePa
Joe_Paterno
========
President Obama
Barack_Obama
========
Mankins
Logan_Mankins
========
RB Willis McGahee
Willis_McGahee
========
Milwall
Millwall_F.C.
========
106andpark
106_%26_Park
========
AustinMahone
Austin_Mahone
========
TNWmobile
The_Next_Web
========
US Senior Open
United_States_Senior_Open
========
USMNT
United_States_men%27s_national_soccer_team
========
St.Clair
St._Clair-Superior
========
JLSOfficial
JLS
========
Met Police
Metropolitan_Police_Service
========
BeingSalmanKhan
Salman_Khan
========
Big 12 Media Days
Big_12_Conference
========
tnwtwit
The_Next_Web
========
BBCNews
BBC_News
========
777s
Boeing_777
========
Deut
Book_of_Deuteronomy
========
Fabregas
Cesc_F%c3%a0bregas
========
SkySportsNews
Sky_Sports_News
========
QB Kevin Kolb
Kevin_Kolb
========
Scottish and Southern
SSE_plc
========
boogiecousins
DeMarcus_Cousins
========
ebled24
Eric_Bledsoe
========
jimmywa11
John_Wall_(basketball)
========
Boehner
John_Boehner
========
borowitzreport
Andy_Borowitz
========
ALBUQUERQUE, NM
Albuquerque%2c_New_Mexico
========
British soldiers
British_Army
========
AW Tozer
A._W._Tozer
========
McIlroyRory
Rory_McIlroy
========
Graeme_McDowell
Graeme_McDowell
========
Asomugha
Nnamdi_Asomugha
========
Cesc Fabregas
Cesc_F%c3%a0bregas
========
Met Police
Metropolitan_Police_Service
========
BofI Holding, Inc.
BofI_Holding_Inc
========
RyanSeacrest
Ryan_Seacrest
========
AmyWinehouse
Amy_Winehouse
========
Cesc Fabregas
Cesc_F%c3%a0bregas
========
taylorswift13
Taylor_Swift
========
khap movie
Khap_(film)
========
ddlovato
Demi_Lovato
========
officialjaden
Jaden_Smith
========
justinbieber
Justin_Bieber
========
NY Archdiocese
Roman_Catholic_Archdiocese_of_New_York
========
justinbieber
Justin_Bieber
========
Boeing CEO McNerney
James_McNerney
========
Windows Phone 7.5
Windows_Phone_7
========
Wall St
Wall_Street
========
lion king
The_Lion_King_(musical)
========
SFGiants
San_Francisco_Giants
========
Kim Jaejoong
Jaejoong
========
Colin Kaepernick
Colin_Kaepernick
========
JohnCena
John_Cena
========
JohnCena
John_Cena
========
NHLJets
Winnipeg_Jets
========
www.winnipegjets.com
Winnipeg_Jets
========
Boehner
John_Boehner
========
former Raiders guard Robert Gallery
Robert_Gallery
========
Rice Krispy
Rice_Krispies
========
Cesc Fabregas
Cesc_F%c3%a0bregas
========
Assad
Bashar_al-Assad
========
MileyCyrus
Miley_Cyrus
========
www.teenchoiceawards.com
Teen_Choice_Awards
========
Liverpool Uni
University_of_Liverpool
========
GoAztecs
San_Diego_State_Aztecs
========
PGATour
PGA_Tour
========
Reno_Tahoe_Open
Reno%e2%80%93Tahoe_Open
========
Times-Picayune, NOLA.com
The_Times-Picayune
========
San Francisco Business Times
American_City_Business_Journals
========
Met Police
Metropolitan_Police_Service
========
SkyNewsBreak
Sky_News
========
Dwayne Wade
Dwyane_Wade
========
Dwayne Wade
Dwyane_Wade
========
Wazin
Wazzin
========
Dhiba
Dehiba
========
Nafousa mountain
Nafusa_Mountains
========
Kicker Ryan Longwell
Ryan_Longwell
========
Rebecca Brooks
Rebekah_Brooks
========
News1130
CKWX
========
1 John
First_Epistle_of_John
========
US News and World Report
U.S._News_%26_World_Report
========
brianstelter
Brian_Stelter
========
Rev. Al Sharpton
Al_Sharpton
========
Atletico Madrid
Atl%c3%a9tico_Madrid
========
SkySportsNews
Sky_Sports_News
========
Man City
Manchester_City_F.C.
========
Sergio Aguero
Sergio_Ag%c3%bcero
========
Mrs. Murdoch
Wendi_Deng_Murdoch
========
White Sox GM Kenny Williams
Kenny_Williams_(baseball)
========
Arsene Wenger
Ars%c3%a8ne_Wenger
========
Jon cena
John_Cena
========
Jurgen Klinsmann
J%c3%bcrgen_Klinsmann
========
Koscielny
Laurent_Koscielny
========
NewsCorp
News_Corporation
========
Met Police
Metropolitan_Police_Service
========
Texas Gov. Rick Perry
Rick_Perry
========
Houstons
Houston
========
DEVisHot
Dev_(singer)
========
JLSOfficial
JLS
========
The San Francisco Chronicle
San_Francisco_Chronicle
========
Boehner
John_Boehner
========
series 2 of #Sherlock
Sherlock_(TV_series)
========
series 1
Sherlock_(TV_series)
========
chairman James Murdoch
James_Murdoch
========
ICC Under-19 World Cup
ICC_Under-19_Cricket_World_Cup
========
NBA Guard Sasha Vujacic
Sasha_Vuja%c4%8di%c4%87
========
Turkish team
Anadolu_Efes_S.K.
========
USWNT
United_States_women%27s_national_soccer_team
========
USWNT
United_States_women%27s_national_soccer_team
========
MetPolice
Metropolitan_Police_Service
========
Louise boat
LulzSec
========
thenextwebuk
The_Next_Web
========
Yoichi Yokozawa
Yorihiko_Kojima
========
Mitsubishi CEO
Yorihiko_Kojima
========
LA Noire
L.A._Noire
========
Miss Selfridges
Miss_Selfridge
========
CSLewis
C._S._Lewis
========
Luis Suarez
Luis_Su%c3%a1rez
========
Victoria Avenue, Southend
Southend_Victoria_railway_station
========
Stoke City
Stoke_City_F.C.
========
Kidrauhl
Justin_Bieber
========
hottubtimemachine
Hot_Tub_Time_Machine
========
Sergio Aguero
Sergio_Ag%c3%bcero
========
ajaymaken
Ajay_Maken
========
Kalmadi
Suresh_Kalmadi
========
Sir Paul Stephenson
Paul_Stephenson_(police_officer)
========
Department of Enviromental Quality
Oklahoma_Department_of_Environmental_Quality
========
Audio Production Schools
SAE_Institute
========
emmy nomination
Emmy_Award
========
CBSNews
CBS_News
========
bbcworldservice
BBC_World_Service
========
n government
Council_of_Ministers_(Syria)
========
USWNT
United_States_women%27s_national_soccer_team
========
UN court
International_Court_of_Justice
========
Indian Air Force (IAF) chief P V Naik
Pradeep_Vasant_Naik
========
YahooSports
Yahoo!_Sports
========
uslpro
USL_Pro
========
Charles 'Bubba' Smith
Bubba_Smith
========
BubbaSmith
Bubba_Smith
========
AshrafGhori
Ashraf_Ghori
========
Azkals
Philippines_national_football_team
========
DC.
Washington%2c_D.C.
========
Rebecca brooks
Rebekah_Brooks
========
Boehner
John_Boehner
========
Marley Station mall
Marley_Station
========
WR Santonio Holmes
Santonio_Holmes
========
RiverMarket
River_Market%2c_Kansas_City
========
ICONic Boyz
America%27s_Best_Dance_Crew_(season_6)
========
Miami Int. Airport
Miami_International_Airport
========
ladygaga
Lady_Gaga
========
JLSOfficial
JLS
========
i-tunes
ITunes
========
Krabby Patty
SpongeBob_SquarePants
========
Dominique Rodgers-Crom
Dominique_Rodgers-Cromartie
========
Asomugha
Nnamdi_Asomugha
========
Sen. John Cornyn
John_Cornyn
========
ladygaga
Lady_Gaga
========
amywinehouse
Amy_Winehouse
========
Utøya Island
Ut%c3%b8ya
========
Exxon Mobil
ExxonMobil
========
starboynathan
Starboy_Nathan
========
Jason LaCanfora
Jason_La_Canfora
========
Broderick Bunkley
Brodrick_Bunkley
========
Cesc Fabregas
Cesc_F%c3%a0bregas
========
Met Police
Metropolitan_Police_Service
========
SFGiants
San_Francisco_Giants
========
HuffDaddy
Aubrey_Huff
========
NateTheGreat
Nate_Schierholtz
========
Copa América
Copa_Am%c3%a9rica
========
seventeenmag
Seventeen_(magazine)
========
sharm el sheikh
Sharm_el-Sheikh
========
JanelleMonae
Janelle_Mon%c3%a1e
========
Giffords
Gabrielle_Giffords
========
Ryeo Wook
Kim_Ryeowook
========
ryeong9
Kim_Ryeowook
========
Dietrick dining hall
Campus_of_Virginia_Tech
========
WR Denarius Moore
Denarius_Moore
========
Israeli intelligence
Mossad
========
andersoncooper
Anderson_Cooper
========
mileycyrus
Miley_Cyrus
========
PM Raila Odinga
Raila_Odinga
========
shahidkapoor
Shahid_Kapoor
========
www.thesun.co.uk
The_Sun_(United_Kingdom)
========
tripleh
Triple_H
========
JKRowling
J._K._Rowling
========
jarpad
Jared_Padalecki
========
trentsevern
Trent%e2%80%93Severn_Waterway
========
Android Market
Google_Play
========
Hagrid
Rubeus_Hagrid
========
Gillispie
Billy_Gillispie
========
No 10
10_Downing_Street
========
Furcal
Rafael_Furcal
========
Serigo Aguero
Sergio_Ag%c3%bcero
========
harimaumalaya
Malaysia_national_football_team
========
Jln Besar stadium
Jalan_Besar_Stadium
========
Hillside/Shelbourne
Hillside_Shopping_Centre
========
denisleary
Denis_Leary
========
Plaxico
Plaxico_Burress
========
Met Police
Metropolitan_Police_Service
========
SouljaBoy
Soulja_Boy
========
chriscolfer
Chris_Colfer
========
ladygaga
Lady_Gaga
========
Lord Jesus
Jesus
========
USWNT
United_States_women%27s_national_soccer_team
========
Oslo, Norway
Oslo
========
Cesc Fabregas
Cesc_F%c3%a0bregas
========
www.skysports.com
Sky_Sports
========
Pep Guardiola
Pep_Guardiola
========
gunners
Arsenal_F.C.

In [14]:
# Inspect the full record of one tweet whose gold mention
# ('Judge Mark Ciavarella, Jr') was reported as unmatched above.
tweet_corpus['102065579078332417']


Out[14]:
{'cashtag_mapping': {},
 'goldens': [{'end_idx': '45',
   'freebase_title': '/m/05q656y',
   'mention': 'Judge Mark Ciavarella, Jr',
   'mention_orig': 'Judge Mark Ciavarella, Jr',
   'start_idx': '20',
   'tid': '102065579078332417',
   'wiki_title': 'Mark_Ciavarella'}],
 'hashtag_mapping': {u'Prison': {u'indices': [69, 76], u'text': u'Prison'},
  u'detention': {u'indices': [124, 134], u'text': u'detention'}},
 'ngrams': {1: ['Former',
   'Pennsylvania',
   'Judge',
   'Mark',
   'Ciavarella',
   ',',
   'Jr',
   'Sentenced',
   'to',
   '28',
   'Yrs',
   'in',
   '#Prison',
   'for',
   '"',
   'Kids-for-Cash',
   '"',
   'Scheme',
   'http://t.co/IXI8m5q',
   '#detention',
   'http://t',
   'co/IXI8m5q',
   'co',
   'IXI8m5q'],
  2: ['Former Pennsylvania',
   'Pennsylvania Judge',
   'Judge Mark',
   'Mark Ciavarella',
   'Ciavarella,',
   ', Jr',
   'Jr Sentenced',
   'Sentenced to',
   'to 28',
   '28 Yrs',
   'Yrs in',
   'in #Prison',
   '#Prison for',
   'for "',
   '"Kids-for-Cash',
   'Kids-for-Cash"',
   '" Scheme',
   'Scheme http://t.co/IXI8m5q',
   'http://t.co/IXI8m5q #detention',
   'Scheme http://t',
   'co/IXI8m5q',
   'http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  3: ['Former Pennsylvania Judge',
   'Pennsylvania Judge Mark',
   'Judge Mark Ciavarella',
   'Mark Ciavarella,',
   'Ciavarella, Jr',
   ', Jr Sentenced',
   'Jr Sentenced to',
   'Sentenced to 28',
   'to 28 Yrs',
   '28 Yrs in',
   'Yrs in #Prison',
   'in #Prison for',
   '#Prison for "',
   'for "Kids-for-Cash',
   '"Kids-for-Cash"',
   'Kids-for-Cash" Scheme',
   '" Scheme http://t.co/IXI8m5q',
   'Scheme http://t.co/IXI8m5q #detention',
   '" Scheme http://t',
   'co/IXI8m5q',
   'Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  4: ['Former Pennsylvania Judge Mark',
   'Pennsylvania Judge Mark Ciavarella',
   'Judge Mark Ciavarella,',
   'Mark Ciavarella, Jr',
   'Ciavarella, Jr Sentenced',
   ', Jr Sentenced to',
   'Jr Sentenced to 28',
   'Sentenced to 28 Yrs',
   'to 28 Yrs in',
   '28 Yrs in #Prison',
   'Yrs in #Prison for',
   'in #Prison for "',
   '#Prison for "Kids-for-Cash',
   'for "Kids-for-Cash"',
   '"Kids-for-Cash" Scheme',
   'Kids-for-Cash" Scheme http://t.co/IXI8m5q',
   '" Scheme http://t.co/IXI8m5q #detention',
   'Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q',
   '" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  5: ['Former Pennsylvania Judge Mark Ciavarella',
   'Pennsylvania Judge Mark Ciavarella,',
   'Judge Mark Ciavarella, Jr',
   'Mark Ciavarella, Jr Sentenced',
   'Ciavarella, Jr Sentenced to',
   ', Jr Sentenced to 28',
   'Jr Sentenced to 28 Yrs',
   'Sentenced to 28 Yrs in',
   'to 28 Yrs in #Prison',
   '28 Yrs in #Prison for',
   'Yrs in #Prison for "',
   'in #Prison for "Kids-for-Cash',
   '#Prison for "Kids-for-Cash"',
   'for "Kids-for-Cash" Scheme',
   '"Kids-for-Cash" Scheme http://t.co/IXI8m5q',
   'Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
   '"Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q',
   'Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  6: ['Former Pennsylvania Judge Mark Ciavarella,',
   'Pennsylvania Judge Mark Ciavarella, Jr',
   'Judge Mark Ciavarella, Jr Sentenced',
   'Mark Ciavarella, Jr Sentenced to',
   'Ciavarella, Jr Sentenced to 28',
   ', Jr Sentenced to 28 Yrs',
   'Jr Sentenced to 28 Yrs in',
   'Sentenced to 28 Yrs in #Prison',
   'to 28 Yrs in #Prison for',
   '28 Yrs in #Prison for "',
   'Yrs in #Prison for "Kids-for-Cash',
   'in #Prison for "Kids-for-Cash"',
   '#Prison for "Kids-for-Cash" Scheme',
   'for "Kids-for-Cash" Scheme http://t.co/IXI8m5q',
   '"Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
   'for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q',
   '"Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  7: ['Former Pennsylvania Judge Mark Ciavarella, Jr',
   'Pennsylvania Judge Mark Ciavarella, Jr Sentenced',
   'Judge Mark Ciavarella, Jr Sentenced to',
   'Mark Ciavarella, Jr Sentenced to 28',
   'Ciavarella, Jr Sentenced to 28 Yrs',
   ', Jr Sentenced to 28 Yrs in',
   'Jr Sentenced to 28 Yrs in #Prison',
   'Sentenced to 28 Yrs in #Prison for',
   'to 28 Yrs in #Prison for "',
   '28 Yrs in #Prison for "Kids-for-Cash',
   'Yrs in #Prison for "Kids-for-Cash"',
   'in #Prison for "Kids-for-Cash" Scheme',
   '#Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q',
   'for "Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
   '#Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q',
   'for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  8: ['Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced',
   'Pennsylvania Judge Mark Ciavarella, Jr Sentenced to',
   'Judge Mark Ciavarella, Jr Sentenced to 28',
   'Mark Ciavarella, Jr Sentenced to 28 Yrs',
   'Ciavarella, Jr Sentenced to 28 Yrs in',
   ', Jr Sentenced to 28 Yrs in #Prison',
   'Jr Sentenced to 28 Yrs in #Prison for',
   'Sentenced to 28 Yrs in #Prison for "',
   'to 28 Yrs in #Prison for "Kids-for-Cash',
   '28 Yrs in #Prison for "Kids-for-Cash"',
   'Yrs in #Prison for "Kids-for-Cash" Scheme',
   'in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q',
   '#Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
   'in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q',
   '#Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  9: ['Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced to',
   'Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28',
   'Judge Mark Ciavarella, Jr Sentenced to 28 Yrs',
   'Mark Ciavarella, Jr Sentenced to 28 Yrs in',
   'Ciavarella, Jr Sentenced to 28 Yrs in #Prison',
   ', Jr Sentenced to 28 Yrs in #Prison for',
   'Jr Sentenced to 28 Yrs in #Prison for "',
   'Sentenced to 28 Yrs in #Prison for "Kids-for-Cash',
   'to 28 Yrs in #Prison for "Kids-for-Cash"',
   '28 Yrs in #Prison for "Kids-for-Cash" Scheme',
   'Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q',
   'in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
   'Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q',
   'in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  10: ['Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28',
   'Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs',
   'Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in',
   'Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison',
   'Ciavarella, Jr Sentenced to 28 Yrs in #Prison for',
   ', Jr Sentenced to 28 Yrs in #Prison for "',
   'Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash',
   'Sentenced to 28 Yrs in #Prison for "Kids-for-Cash"',
   'to 28 Yrs in #Prison for "Kids-for-Cash" Scheme',
   '28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q',
   'Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
   '28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q',
   'Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  11: ['Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs',
   'Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in',
   'Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison',
   'Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for',
   'Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "',
   ', Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash',
   'Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash"',
   'Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme',
   'to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q',
   '28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
   'to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q',
   '28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  12: ['Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in',
   'Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison',
   'Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for',
   'Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "',
   'Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash',
   ', Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash"',
   'Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme',
   'Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q',
   'to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
   'Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q',
   'to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  13: ['Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison',
   'Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for',
   'Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "',
   'Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash',
   'Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash"',
   ', Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme',
   'Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q',
   'Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
   'Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q',
   'Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  14: ['Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for',
   'Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "',
   'Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash',
   'Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash"',
   'Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme',
   ', Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q',
   'Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
   ', Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q',
   'Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  15: ['Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "',
   'Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash',
   'Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash"',
   'Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme',
   'Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q',
   ', Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
   'Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q',
   ', Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  16: ['Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash',
   'Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash"',
   'Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme',
   'Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q',
   'Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
   'Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q',
   'Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  17: ['Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash"',
   'Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme',
   'Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q',
   'Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
   'Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q',
   'Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  18: ['Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme',
   'Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q',
   'Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
   'Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q',
   'Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  19: ['Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q',
   'Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
   'Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q',
   'Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q',
   'co',
   'IXI8m5q #detention'],
  20: ['Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
   'Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t',
   'co/IXI8m5q #detention',
   'co',
   'IXI8m5q #detention'],
  21: [],
  22: [],
  23: [],
  24: [],
  25: [],
  26: [],
  27: [],
  28: [],
  29: [],
  30: [],
  31: [],
  32: [],
  33: [],
  34: [],
  35: [],
  36: [],
  37: [],
  38: [],
  39: [],
  40: [],
  41: [],
  42: [],
  43: [],
  44: [],
  45: [],
  46: [],
  47: [],
  48: [],
  49: [],
  50: [],
  51: [],
  52: [],
  53: [],
  54: [],
  55: [],
  56: [],
  57: [],
  58: [],
  59: [],
  60: [],
  61: [],
  62: [],
  63: [],
  64: [],
  65: [],
  66: [],
  67: [],
  68: [],
  69: [],
  70: [],
  71: [],
  72: [],
  73: [],
  74: [],
  75: [],
  76: [],
  77: [],
  78: [],
  79: [],
  80: [],
  81: [],
  82: [],
  83: [],
  84: [],
  85: [],
  86: [],
  87: [],
  88: [],
  89: [],
  90: [],
  91: [],
  92: [],
  93: [],
  94: [],
  95: [],
  96: [],
  97: [],
  98: [],
  99: []},
 'stanford_parsed': {u'sentences': [{u'dependencies': [[u'root',
      u'ROOT',
      u'Ciavarella'],
     [u'amod', u'Ciavarella', u'Former'],
     [u'nn', u'Ciavarella', u'Pennsylvania'],
     [u'nn', u'Ciavarella', u'Judge'],
     [u'nn', u'Ciavarella', u'Mark'],
     [u'nn', u'Sentenced', u'Jr'],
     [u'appos', u'Ciavarella', u'Sentenced'],
     [u'num', u'Yrs', u'28'],
     [u'prep_to', u'Sentenced', u'Yrs'],
     [u'prep_in', u'Yrs', u'#Prison'],
     [u'amod', u'Scheme', u'Kids-for-Cash'],
     [u'prep_for', u'#Prison', u'Scheme'],
     [u'nn', u'#detention', u'http://t.co/IXI8m5q'],
     [u'dep', u'Ciavarella', u'#detention']],
    u'parsetree': u"(ROOT (NP (NP (NP (JJ Former) (NNP Pennsylvania) (NNP Judge) (NNP Mark) (NNP Ciavarella)) (, ,) (NP (NP (NNP Jr) (NNP Sentenced)) (PP (TO to) (NP (NP (CD 28) (NNS Yrs)) (PP (IN in) (NP (NP (NN #Prison)) (PP (IN for) (NP (`` ``) (JJ Kids-for-Cash) ('' '') (NN Scheme))))))))) (NP (NN http://t.co/IXI8m5q) (NN #detention))))",
    u'text': u'Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
    u'words': [[u'Former',
      {u'CharacterOffsetBegin': u'0',
       u'CharacterOffsetEnd': u'6',
       u'Lemma': u'former',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'JJ'}],
     [u'Pennsylvania',
      {u'CharacterOffsetBegin': u'7',
       u'CharacterOffsetEnd': u'19',
       u'Lemma': u'Pennsylvania',
       u'NamedEntityTag': u'LOCATION',
       u'PartOfSpeech': u'NNP'}],
     [u'Judge',
      {u'CharacterOffsetBegin': u'20',
       u'CharacterOffsetEnd': u'25',
       u'Lemma': u'Judge',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'NNP'}],
     [u'Mark',
      {u'CharacterOffsetBegin': u'26',
       u'CharacterOffsetEnd': u'30',
       u'Lemma': u'Mark',
       u'NamedEntityTag': u'PERSON',
       u'PartOfSpeech': u'NNP'}],
     [u'Ciavarella',
      {u'CharacterOffsetBegin': u'31',
       u'CharacterOffsetEnd': u'41',
       u'Lemma': u'Ciavarella',
       u'NamedEntityTag': u'PERSON',
       u'PartOfSpeech': u'NNP'}],
     [u',',
      {u'CharacterOffsetBegin': u'41',
       u'CharacterOffsetEnd': u'42',
       u'Lemma': u',',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u','}],
     [u'Jr',
      {u'CharacterOffsetBegin': u'43',
       u'CharacterOffsetEnd': u'45',
       u'Lemma': u'Jr',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'NNP'}],
     [u'Sentenced',
      {u'CharacterOffsetBegin': u'46',
       u'CharacterOffsetEnd': u'55',
       u'Lemma': u'Sentenced',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'NNP'}],
     [u'to',
      {u'CharacterOffsetBegin': u'56',
       u'CharacterOffsetEnd': u'58',
       u'Lemma': u'to',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'TO'}],
     [u'28',
      {u'CharacterOffsetBegin': u'59',
       u'CharacterOffsetEnd': u'61',
       u'Lemma': u'28',
       u'NamedEntityTag': u'DURATION',
       u'NormalizedNamedEntityTag': u'P28Y',
       u'PartOfSpeech': u'CD',
       u'Timex': u'<TIMEX3 tid="t1" type="DURATION" value="P28Y">28 Yrs</TIMEX3>'}],
     [u'Yrs',
      {u'CharacterOffsetBegin': u'62',
       u'CharacterOffsetEnd': u'65',
       u'Lemma': u'yr',
       u'NamedEntityTag': u'DURATION',
       u'NormalizedNamedEntityTag': u'P28Y',
       u'PartOfSpeech': u'NNS',
       u'Timex': u'<TIMEX3 tid="t1" type="DURATION" value="P28Y">28 Yrs</TIMEX3>'}],
     [u'in',
      {u'CharacterOffsetBegin': u'66',
       u'CharacterOffsetEnd': u'68',
       u'Lemma': u'in',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'IN'}],
     [u'#Prison',
      {u'CharacterOffsetBegin': u'69',
       u'CharacterOffsetEnd': u'76',
       u'Lemma': u'#prison',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'NN'}],
     [u'for',
      {u'CharacterOffsetBegin': u'77',
       u'CharacterOffsetEnd': u'80',
       u'Lemma': u'for',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'IN'}],
     [u'``',
      {u'CharacterOffsetBegin': u'81',
       u'CharacterOffsetEnd': u'82',
       u'Lemma': u'``',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'``'}],
     [u'Kids-for-Cash',
      {u'CharacterOffsetBegin': u'82',
       u'CharacterOffsetEnd': u'95',
       u'Lemma': u'kids-for-cash',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'JJ'}],
     [u"''",
      {u'CharacterOffsetBegin': u'95',
       u'CharacterOffsetEnd': u'96',
       u'Lemma': u"''",
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u"''"}],
     [u'Scheme',
      {u'CharacterOffsetBegin': u'97',
       u'CharacterOffsetEnd': u'103',
       u'Lemma': u'scheme',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'NN'}],
     [u'http://t.co/IXI8m5q',
      {u'CharacterOffsetBegin': u'104',
       u'CharacterOffsetEnd': u'123',
       u'Lemma': u'http://t.co/ixi8m5q',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'NN'}],
     [u'#detention',
      {u'CharacterOffsetBegin': u'124',
       u'CharacterOffsetEnd': u'134',
       u'Lemma': u'#detention',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'NN'}]]}]},
 'tweet_info': {u'contributors': None,
  u'coordinates': None,
  u'created_at': u'Fri Aug 12 17:15:05 +0000 2011',
  u'entities': {u'hashtags': [{u'indices': [69, 76], u'text': u'Prison'},
    {u'indices': [124, 134], u'text': u'detention'}],
   u'symbols': [],
   u'urls': [{u'display_url': u'owl.li/61QPx',
     u'expanded_url': u'http://owl.li/61QPx',
     u'indices': [104, 123],
     u'url': u'http://t.co/IXI8m5q'}],
   u'user_mentions': []},
  u'favorite_count': 2,
  u'favorited': False,
  u'geo': None,
  u'id': 102065579078332417,
  u'id_str': u'102065579078332417',
  u'in_reply_to_screen_name': None,
  u'in_reply_to_status_id': None,
  u'in_reply_to_status_id_str': None,
  u'in_reply_to_user_id': None,
  u'in_reply_to_user_id_str': None,
  u'is_quote_status': False,
  u'lang': u'en',
  u'place': None,
  u'possibly_sensitive': False,
  u'possibly_sensitive_appealable': False,
  u'retweet_count': 28,
  u'retweeted': False,
  u'source': u'<a href="http://www.hootsuite.com" rel="nofollow">Hootsuite</a>',
  u'text': u'Former Pennsylvania Judge Mark Ciavarella, Jr Sentenced to 28 Yrs in #Prison for "Kids-for-Cash" Scheme http://t.co/IXI8m5q #detention',
  u'truncated': False,
  u'user': {u'contributors_enabled': False,
   u'created_at': u'Thu Oct 23 20:47:35 +0000 2008',
   u'default_profile': False,
   u'default_profile_image': False,
   u'description': u'Independent, daily global news hour anchored by Amy Goodman & Juan Gonz\xe1lez. Stream M-F 8AM ET https://t.co/qj9euWzTVH. Support independent media\u2014Donate today!',
   u'entities': {u'description': {u'urls': [{u'display_url': u'democracynow.org',
       u'expanded_url': u'http://democracynow.org',
       u'indices': [95, 118],
       u'url': u'https://t.co/qj9euWzTVH'}]},
    u'url': {u'urls': [{u'display_url': u'democracynow.org',
       u'expanded_url': u'http://www.democracynow.org',
       u'indices': [0, 22],
       u'url': u'http://t.co/XZYcOgrqqm'}]}},
   u'favourites_count': 2024,
   u'follow_request_sent': False,
   u'followers_count': 451077,
   u'following': False,
   u'friends_count': 5831,
   u'geo_enabled': True,
   u'has_extended_profile': False,
   u'id': 16935292,
   u'id_str': u'16935292',
   u'is_translation_enabled': False,
   u'is_translator': False,
   u'lang': u'en',
   u'listed_count': 15120,
   u'location': u'New York, NY',
   u'name': u'Democracy Now!',
   u'notifications': False,
   u'profile_background_color': u'FFFFFF',
   u'profile_background_image_url': u'http://pbs.twimg.com/profile_background_images/813049285/98d3a7252de1caa47aadf2e7d8898ac5.png',
   u'profile_background_image_url_https': u'https://pbs.twimg.com/profile_background_images/813049285/98d3a7252de1caa47aadf2e7d8898ac5.png',
   u'profile_background_tile': False,
   u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/16935292/1455653179',
   u'profile_image_url': u'http://pbs.twimg.com/profile_images/420934536969461760/-ZK_2p2-_normal.png',
   u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/420934536969461760/-ZK_2p2-_normal.png',
   u'profile_link_color': u'4E8DAD',
   u'profile_sidebar_border_color': u'000000',
   u'profile_sidebar_fill_color': u'F3F2F7',
   u'profile_text_color': u'242324',
   u'profile_use_background_image': False,
   u'protected': False,
   u'screen_name': u'democracynow',
   u'statuses_count': 57456,
   u'time_zone': u'Eastern Time (US & Canada)',
   u'url': u'http://t.co/XZYcOgrqqm',
   u'utc_offset': -14400,
   u'verified': True}},
 'url_mapping': {u'http://t.co/IXI8m5q': {u'display_url': u'owl.li/61QPx',
   u'expanded_url': u'http://owl.li/61QPx',
   u'indices': [104, 123],
   u'url': u'http://t.co/IXI8m5q'}},
 'usermention_mapping': {}}

In [26]:
t =  "http://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=DESCRIBE%20%3Chttp%3A%2F%2Fdbpedia.org%2Fresource%2FJeb_Bush%3E&format=application%2Fmicrodata%2Bjson"

In [29]:
# Python 2: decode the byte string into a `unicode` object. The URL is plain
# ASCII, so only the type changes (note the u'' prefix in the output); the
# percent-escapes are left untouched.
t.decode("utf8")


Out[29]:
u'http://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=DESCRIBE%20%3Chttp%3A%2F%2Fdbpedia.org%2Fresource%2FJeb_Bush%3E&format=application%2Fmicrodata%2Bjson'

In [31]:
import urllib
# Undo the percent-encoding of the whole URL so the embedded SPARQL query
# becomes human-readable (Python 2 `urllib` API).
m= urllib.unquote(t)

In [32]:
# Inspect the decoded URL (bare expression -> rendered as the cell output).
m


Out[32]:
'http://dbpedia.org/sparql?default-graph-uri=http://dbpedia.org&query=DESCRIBE <http://dbpedia.org/resource/Jeb_Bush>&format=application/microdata+json'

In [33]:
# Re-encoding does NOT give back `t`: as the output shows, quote() leaves "/"
# alone but escapes ":", "?", "=" and "&", so applying it to a complete URL
# mangles the scheme and query-string delimiters. Quote individual components
# (e.g. just the query value) rather than the whole URL.
urllib.quote(m)


Out[33]:
'http%3A//dbpedia.org/sparql%3Fdefault-graph-uri%3Dhttp%3A//dbpedia.org%26query%3DDESCRIBE%20%3Chttp%3A//dbpedia.org/resource/Jeb_Bush%3E%26format%3Dapplication/microdata%2Bjson'

In [34]:
from sqlitedict import SqliteDict

In [ ]:
from sqlitedict import SqliteDict

# Scratch check of SqliteDict as an on-disk, dict-like store.
# The snippet was pasted from documentation with its `>>>` REPL prompts, which
# is a SyntaxError inside a notebook cell, and it referenced an undefined
# placeholder name; both are fixed here so the cell actually runs.
any_picklable_object = {"example": [1, 2, 3]}  # stand-in value; any picklable object works

# autocommit=True persists each assignment immediately instead of requiring
# an explicit commit() call.
mydict = SqliteDict('./my_db.sqlite', autocommit=True)
mydict['some_key'] = any_picklable_object
print(mydict['some_key'])  # prints the new value

# NOTE: the handle is left open for further interactive use; call
# mydict.close() once finished with the database.