In [2]:
from time import time, sleep
from glob import glob
import json
import urllib
import string
import cPickle as pickle
from os import path
In [3]:
API_KEY = "AIzaSyAnupT8pNVHf2WFPidvMcmdrfXgt6RoM0w"
DEVELOPER_KEY = API_KEY
SERVICE_URL = 'https://www.googleapis.com/freebase/v1/mqlread'
dir_ = '/data/csc/fb_persons/'  # output directory for the pickled batches
In [14]:
query = [{
    "*": None,                # MQL wildcard: return every property of each match
    "type": "/people/person"
}]
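The "*": None wildcard asks MQL to return all properties of each matching /people/person topic. A quick single-request sanity check of the query might look like the sketch below (the test_* names are hypothetical, and the Freebase API has since been retired, so the endpoint would no longer respond):
In [ ]:
# Hypothetical one-off request: fetch the first page and inspect it.
test_params = {'query': json.dumps(query), 'key': API_KEY, 'cursor': ''}
test_url = SERVICE_URL + '?' + urllib.urlencode(test_params)
test_response = json.loads(urllib.urlopen(test_url).read())
print len(test_response['result']), test_response['cursor']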
In [36]:
def get_iterator(q, cursor=''):
    params = {
        'query': json.dumps(q),
        'key': API_KEY,
        'cursor': cursor
    }
    progress = True
    while progress:
        url = SERVICE_URL + '?' + urllib.urlencode(params)
        try:
            response = json.loads(urllib.urlopen(url).read())
        except (IOError, ValueError):
            # Network failure or malformed JSON: back off and retry.
            sleep(30)
            continue
        if 'cursor' not in response:
            # No cursor usually means an error payload (e.g. quota): retry.
            sleep(30)
            continue
        params['cursor'] = response['cursor']
        if response['cursor'] is False:
            # MQL returns cursor == False once the last page is reached.
            progress = False
        yield response['cursor'], response['result']
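Each iteration yields one page of results together with the cursor needed to resume after that page; the generator stops after yielding the final page. A minimal consumption sketch, assuming the endpoint responds:
In [ ]:
# Pull just the first three pages to verify paging works.
for i, (cur, page) in enumerate(get_iterator(query)):
    print i, len(page)
    if i >= 2:
        break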
In [39]:
cursor, j = '', 0
cursors = []
cursors_file = path.join(dir_, "cursors.pkl")
if path.exists(cursors_file):  # resume from the last checkpoint, if present
    cursors = pickle.load(open(cursors_file))
    j, cursor = cursors[-1]
In [40]:
print j
In [41]:
j += 1
persons = []
filesize = 10000  # person records per pickle file
for cursor, people in get_iterator(query, cursor):
    persons.extend(people)
    if len(persons) >= filesize:
        # Checkpoint: save the batch plus the cursor needed to resume after it.
        cursors.append((j, cursor))
        persons_file = path.join(dir_, "person_{}.pkl".format(j))
        cursors_file = path.join(dir_, "cursors.pkl")
        with open(persons_file, 'wb') as f:
            pickle.dump(persons, f)
        with open(cursors_file, 'wb') as f:
            pickle.dump(cursors, f)
        j += 1
        persons = []
        print "\r%i" % ((j + 1) * filesize),
# Save any final partial batch left over when the iterator is exhausted.
if persons:
    with open(path.join(dir_, "person_{}.pkl".format(j)), 'wb') as f:
        pickle.dump(persons, f)
In [42]:
j
Out[42]: