In [1]:
import pickle
import random
import sys
import pyprind
In [ ]:
# get number of relation types
def num_type_relations(unit):
    return len(unit['claims'])
In [ ]:
# get total number of relations (claims) across all relation types
def num_relations(unit):
    counts = [len(v) for v in unit["claims"].values()]
    return sum(counts)
In [ ]:
# get related entities (the Q-ids this entity points to through its claims)
def make_Q(src):
    return "Q" + str(src)

def related_entities(unit):
    ret = []
    for claim in unit['claims'].values():
        for single in claim:
            try:
                ret.append(make_Q(single[u'mainsnak'][u'datavalue'][u'value'][u'numeric-id']))
            except (KeyError, TypeError):
                # snak has no datavalue, or its value is not an entity reference
                pass
    return ret
In [ ]:
# get Freebase ID (property P646), if present
def get_freebase(unit):
    try:
        ret = unit['claims'][u'P646'][0][u'mainsnak'][u'datavalue']['value']
    except (KeyError, IndexError):
        ret = "NONE_FREEBASE"
    return ret
In [ ]:
# get English Wikipedia page title, if the entity has an enwiki sitelink
def get_wikititle(unit):
    try:
        ret = unit['sitelinks']['enwiki']['title']
    except KeyError:
        ret = "NONE_WIKITITLE"
    return ret
In [ ]:
# get aliases: the English label first, then any English aliases
def get_aliases(unit):
    try:
        ret = [unit[u'labels'][u'en'][u'value']]
    except KeyError:
        ret = ["NONE_EN_LABEL"]
    try:
        ret.extend([ele['value'] for ele in unit[u'aliases'][u'en']])
    except KeyError:
        pass
    return ret
In [ ]:
# bundle everything we keep per entity into one small dict
def make_info(unit):
    ret = {}
    ret['ntr'] = num_type_relations(unit)   # number of relation types
    ret['nr'] = num_relations(unit)         # total number of relations
    ret['re'] = related_entities(unit)      # related entity Q-ids
    ret['fb'] = get_freebase(unit)          # Freebase MID
    ret['wk'] = get_wikititle(unit)         # enwiki page title
    ret['aliases'] = get_aliases(unit)      # English label + aliases
    return ret
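To make the field layout concrete, here is a tiny hand-made record in the Wikidata dump layout (an illustrative sketch, not a real dump entry) and what `make_info` pulls out of it:
In [ ]:
# toy entity record in the Wikidata JSON dump layout (illustrative only)
toy = {
    u'id': u'Q42',
    u'labels': {u'en': {u'language': u'en', u'value': u'Douglas Adams'}},
    u'aliases': {u'en': [{u'language': u'en', u'value': u'Douglas Noel Adams'}]},
    u'sitelinks': {u'enwiki': {u'site': u'enwiki', u'title': u'Douglas Adams'}},
    u'claims': {
        u'P31': [{u'mainsnak': {u'datavalue': {u'value': {u'entity-type': u'item', u'numeric-id': 5}}}}],
        u'P646': [{u'mainsnak': {u'datavalue': {u'value': u'/m/0282x'}}}],
    },
}
make_info(toy)
# roughly: {'ntr': 2, 'nr': 2, 're': ['Q5'], 'fb': u'/m/0282x',
#           'wk': u'Douglas Adams', 'aliases': [u'Douglas Adams', u'Douglas Noel Adams']}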
In [ ]:
import json

bar = pyprind.ProgBar(20951710, width=70)
with open("/Volumes/backup/ccg_tweet_wikifier_data/wikidata/entity_info.txt", "wb") as g:
    with open("/Volumes/backup/ccg_tweet_wikifier_data/wikidata/wikidata-20160404-all.json", "rb") as f:
        for line in f:
            # skip the opening "[" and closing "]" of the dump's outer JSON array
            if len(line.strip()) < 2:
                continue
            bar.update()
            wiki_info = {}
            # each entity sits on its own line, followed by a trailing comma
            j_content = json.loads(line.strip().rstrip(','))
            ida = j_content[u'id']
            wiki_info[ida] = make_info(j_content)
            g.write(str(wiki_info))
            g.write("\n")
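Each line written to entity_info.txt is the `str()` of a one-entry dict mapping an entity id to its info dict, so it can be parsed back without `eval`; a quick round-trip check on the toy record from above (purely illustrative):
In [ ]:
import ast
# one output line round-trips through ast.literal_eval (illustrative; reuses `toy` from above)
line_out = str({u'Q42': make_info(toy)})
ast.literal_eval(line_out) == {u'Q42': make_info(toy)}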
In [20]:
entity_info_file = "/Volumes/backup/ccg_tweet_wikifier_data/wikidata/entity_info.txt"
In [21]:
import ast

bar = pyprind.ProgBar(20951710, width=70)
entity_info = {}
with open(entity_info_file, "rb") as f:
    for line in f:
        bar.update()
        # each line is the str() of a one-entry dict; literal_eval parses it more safely than eval
        info = ast.literal_eval(line.strip())
        entity_info.update(info)
In [22]:
len(entity_info)
Out[22]:
In [23]:
from sqlitedict import SqliteDict
mydict = SqliteDict('./wikidata_small.sqlite', autocommit=True)
In [ ]:
bar = pyprind.ProgBar(20951710, width=70)
for key in entity_info.keys():
    bar.update()
    # store each entity's own info dict (not the whole entity_info map)
    mydict[key] = entity_info[key]
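A quick spot-check of the on-disk store (the key below is just an example and may not exist in a partial run):
In [ ]:
# look up one entity in the SqliteDict store ('Q42' is an example key, not guaranteed to exist)
probe = 'Q42'
if probe in mydict:
    print mydict[probe]['wk'], mydict[probe]['fb']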
In [ ]:
entity_info_json_output = "/Volumes/backup/ccg_tweet_wikifier_data/wikidata/entity_info.json"
In [ ]:
with open(entity_info_json_output, "wb") as f:
    json.dump(entity_info, f)
In [ ]:
import requests
from bs4 import BeautifulSoup
import json

def get_dbpedia_info(uri, linkWikipedia):
    # DESCRIBE the given DBpedia resource via the public SPARQL endpoint
    print uri + ', ' + linkWikipedia
    session = requests.Session()
    graph = 'http://dbpedia.org'
    query = 'DESCRIBE <' + uri + '>'
    fmt = 'application/json'
    response = session.get('http://dbpedia.org/sparql',
                           params={'default-graph-uri': graph, 'query': query, 'format': fmt})
    print json.loads(response.text)
    return response
    # get_info(line, response[line]['http://xmlns.com/foaf/0.1/isPrimaryTopicOf'][0]['value'])
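As a quick smoke test, the helper can be pointed at a real DBpedia resource (the URIs below are illustrative placeholders); keeping the returned response around lets the next few cells inspect it:
In [ ]:
# example call; the resource URI and Wikipedia URL are placeholders
response = get_dbpedia_info('http://dbpedia.org/resource/Jeb_Bush',
                            'https://en.wikipedia.org/wiki/Jeb_Bush')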
In [ ]:
response.content
In [ ]:
p = response.json()
In [ ]:
len(p)
In [2]:
sys.path.append("/Users/erichsu/Documents/research/WikiNLP/wikinlp/")
In [ ]:
from WikiNLP import WikiNLP
In [3]:
import DBpedia
In [5]:
op = DBpedia.DBpedia_BOT()
In [6]:
m = op.query_entity("Jeb_Bush")
In [16]:
import time
start = time.time()
op.query_entity("Jebdsad_Bush")   # misspelled name, presumably to time a lookup that misses
print time.time() - start
In [17]:
# time(m) is not callable -- `time` is the module imported above; use time.time() for timestamps
In [19]:
# back-of-envelope: ~0.1 s per DBpedia query x ~20M Wikidata entities, converted to days (~23 days)
0.1 * 20000000 / 60 / 60 / 24
Out[19]:
In [ ]: