In [ ]:
import pickle
import random
import sys
import pyprind

Read entity records from the WikiData JSON dump and summarize each one.

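The helper functions below assume each line of the dump is one entity record shaped roughly as follows. This is a minimal hand-written sketch for illustration, not a record copied from the dump; only the fields the helpers actually touch are shown.

In [ ]:
# Hand-written sample of the entity layout the helpers below expect (illustrative values).
sample_unit = {
    u'id': u'Q42',
    u'labels': {u'en': {u'value': u'Douglas Adams'}},
    u'aliases': {u'en': [{u'value': u'Douglas Noel Adams'}]},
    u'sitelinks': {u'enwiki': {u'title': u'Douglas Adams'}},
    u'claims': {
        # entity-valued claim: the target item id sits under 'numeric-id'
        u'P31': [{u'mainsnak': {u'datavalue': {u'value': {u'numeric-id': 5}}}}],
        # P646 (Freebase ID) is a plain string value
        u'P646': [{u'mainsnak': {u'datavalue': {u'value': u'/m/0282x'}}}],
    },
}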

In [ ]:
# get number of relation types

def num_type_relations(unit):
    return len(unit['claims'])

In [ ]:
# get number of relations

def num_relations(unit):
    # sum claim counts over the claim lists, not over the property-id keys
    counts = [len(claims) for claims in unit["claims"].values()]
    return sum(counts)

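A toy check of the two counters, since they differ in what they iterate over: num_type_relations counts distinct properties, while num_relations counts individual claims, so it must loop over claims.values() rather than the property-id keys.

In [ ]:
# Toy check: two properties, three claims in total.
toy = {u'claims': {u'P31': [{}, {}], u'P646': [{}]}}
print num_type_relations(toy)   # 2 distinct properties
print num_relations(toy)        # 3 claims
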
In [ ]:
# get related entities

def make_Q(src):
    # turn a numeric id into a Q-id string, e.g. 5 -> "Q5"
    return "Q" + str(src)

def related_entities(unit):
    # collect the Q-ids of all entities this entity's claims point to
    ret = []
    for claim in unit['claims'].values():
        for single in claim:
            try:
                ret.append(make_Q(single[u'mainsnak'][u'datavalue'][u'value'][u'numeric-id']))
            except (KeyError, TypeError):
                # claim value is not an entity reference (e.g. a string or quantity)
                pass
    return ret

In [ ]:
# get freebase ID

def get_freebase(unit):
    # P646 is the "Freebase ID" property
    try:
        ret = unit['claims'][u'P646'][0][u'mainsnak'][u'datavalue']['value']
    except (KeyError, IndexError, TypeError):
        ret = "NONE_FREEBASE"
    return ret

In [ ]:
# get wikipedia page

def get_wikititle(unit):
    # English Wikipedia page title, if the entity has one
    try:
        ret = unit['sitelinks']['enwiki']['title']
    except (KeyError, TypeError):
        ret = "NONE_WIKITITLE"
    return ret

In [ ]:
# get aliases

def get_aliases(unit):
    # English label first, then any English aliases
    try:
        ret = [unit[u'labels'][u'en'][u'value']]
    except (KeyError, TypeError):
        ret = ["NONE_EN_LABEL"]

    try:
        ret.extend([ele['value'] for ele in unit[u'aliases'][u'en']])
    except (KeyError, TypeError):
        pass
    return ret

In [ ]:
def make_info(unit):
    # compact per-entity summary keyed by short field names
    ret = {}
    ret['ntr'] = num_type_relations(unit)   # number of relation types
    ret['nr'] = num_relations(unit)         # number of relations
    ret['re'] = related_entities(unit)      # related entity Q-ids
    ret['fb'] = get_freebase(unit)          # Freebase ID
    ret['wk'] = get_wikititle(unit)         # English Wikipedia title
    ret['aliases'] = get_aliases(unit)      # English label and aliases
    return ret

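Applied to the hand-written sample_unit from the start of the notebook, make_info should return roughly the dict shown in the comment (the Freebase claim is skipped by related_entities because its value is a string, not an entity reference).

In [ ]:
# Expected result for sample_unit (key order may differ):
# {'ntr': 2, 'nr': 2, 're': ['Q5'], 'fb': u'/m/0282x',
#  'wk': u'Douglas Adams', 'aliases': [u'Douglas Adams', u'Douglas Noel Adams']}
make_info(sample_unit)
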
In [ ]:
import json

bar = pyprind.ProgBar(20951710, width=70)

with open("/Volumes/backup/ccg_tweet_wikifier_data/wikidata/entity_info.txt", "wb") as g:
    with open("/Volumes/backup/ccg_tweet_wikifier_data/wikidata/wikidata-20160404-all.json", "rb") as f:
        for line in f:
            line = line.strip().rstrip(",")   # each entity line ends with a comma, except the last
            if len(line) < 2:                 # skip the opening "[" and closing "]" of the dump array
                continue
            bar.update()
            wiki_info = {}
            j_content = json.loads(line)
            ida = j_content[u'id']
            wiki_info[ida] = make_info(j_content)
            g.write(str(wiki_info))           # one Python dict literal per line
            g.write("\n")

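If only the compressed dump is on disk, the same per-line loop can stream it without decompressing first. A sketch assuming a bz2-compressed copy of the dump sits next to the uncompressed one (the .json.bz2 path is an assumption):

In [ ]:
# Sketch: stream the bz2-compressed dump directly (the .bz2 path is hypothetical).
import bz2
import json

with bz2.BZ2File("/Volumes/backup/ccg_tweet_wikifier_data/wikidata/wikidata-20160404-all.json.bz2") as f:
    for line in f:
        line = line.strip().rstrip(",")
        if len(line) < 2:      # skip the surrounding "[" and "]" of the dump array
            continue
        j_content = json.loads(line)
        print j_content[u'id']
        break                  # stop after one record; drop this to process the whole dump
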
In [ ]:
entity_info_file = "/Volumes/backup/ccg_tweet_wikifier_data/wikidata/entity_info.txt"

In [ ]:
import ast

bar = pyprind.ProgBar(20951710, width=70)

entity_info = {}

with open(entity_info_file, "rb") as f:
    for line in f:
        pline = line.strip()
        bar.update()

        # each line is a Python dict literal written by the pass above;
        # literal_eval parses it without executing arbitrary code
        info = ast.literal_eval(pline)
        entity_info.update(info)

In [ ]:
len(entity_info)

In [ ]:
from sqlitedict import SqliteDict
mydict = SqliteDict('./wikidata_small.sqlite', autocommit=True)

In [ ]:
bar = pyprind.ProgBar(20951710, width = 70)

for key, info in entity_info.iteritems():
    bar.update()
    mydict[key] = info

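The sqlite-backed dict can then be reopened in a later session and queried by Q-id without loading everything into memory; a small usage sketch (Q42 is used as an example key and may or may not be present in this table):

In [ ]:
# Usage sketch: reopen the sqlite-backed dict read-only and look up one entity.
from sqlitedict import SqliteDict

lookup = SqliteDict('./wikidata_small.sqlite', flag='r')
print 'Q42' in lookup        # membership check against the sqlite table
print lookup['Q42']          # raises KeyError if the id is absent
lookup.close()
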
In [ ]:
entity_info_json_output = "/Volumes/backup/ccg_tweet_wikifier_data/wikidata/entity_info.json"

In [ ]:
with open(entity_info_json_output, "wb") as f:
    json.dump(entity_info, f)

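A quick round-trip check that the JSON file reads back cleanly (this reloads the whole file, so it is slow for the full dump):

In [ ]:
# Sanity check: reload the dumped JSON and compare entity counts.
with open(entity_info_json_output, "rb") as f:
    reloaded = json.load(f)
print len(reloaded) == len(entity_info)
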
In [ ]:
import requests
from bs4 import BeautifulSoup
import json


def get_dbpedia_info(uri, linkWikipedia):
    print uri + ', ' + linkWikipedia

session = requests.Session()

graph = 'http://dbpedia.org'
query = 'DESCRIBE <http://dbpedia.org/resource/Jdddeb_Bush,_Jr.>'
fmt = 'application/json'    # avoid shadowing the built-in format()

response = session.get('http://dbpedia.org/sparql',
                       params={'default-graph-uri': graph, 'query': query, 'format': fmt})
print json.loads(response.text)
#get_info(line, response[line]['http://xmlns.com/foaf/0.1/isPrimaryTopicOf'][0]['value'])

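The commented-out call above suggests the layout of the DESCRIBE result: a dict keyed by subject URI, each mapping predicate URIs to lists of {'value': ...} objects. A sketch along those lines that prints each subject's Wikipedia page, if the response describes anything (with the misspelled resource above it is likely empty):

In [ ]:
# Sketch: walk the DESCRIBE result and print each subject's Wikipedia page link.
described = json.loads(response.text)
for uri, predicates in described.items():
    for link in predicates.get('http://xmlns.com/foaf/0.1/isPrimaryTopicOf', []):
        get_dbpedia_info(uri, link['value'])
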
In [ ]:
response.content

In [ ]:
p = response.json()

In [ ]:
len(p)

In [ ]:
sys.path.append("/Users/erichsu/Documents/research/WikiNLP/wikinlp/")

In [ ]:
from WikiNLP import WikiNLP

In [ ]:
import DBpedia

In [ ]:
op = DBpedia.DBpedia_BOT()

In [ ]:
m = op.query_entity("Jeb_Bush")

In [ ]:
import time

# rough wall-clock latency of a single DBpedia query
print time.time()
op.query_entity("Jebdsad_Bush")
print time.time()

In [ ]:
# `time` is a module, so time(m) would raise a TypeError; inspect the query result instead
m

In [ ]:
# back-of-envelope: ~0.1 s per query x ~20M entities, expressed in days (~23 days)
0.1 * 20000000 / 60 / 60 / 24

In [ ]: