In [2]:
from time import time,sleep
from glob import glob
import json
import urllib
import string
import cPickle as pickle
from os import path

In [3]:
API_KEY = "AIzaSyAnupT8pNVHf2WFPidvMcmdrfXgt6RoM0w"
DEVELOPER_KEY = API_KEY
SERVICE_URL = 'https://www.googleapis.com/freebase/v1/mqlread'
dir_ = '/data/csc/fb_persons/'

In [14]:
query = [{
  "*": None,
  "type": "/people/person"
}]
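The "*": None wildcard asks MQL for every property of each matching topic, and the enclosing list requests multiple results. If only a few fields are needed, a narrower query is cheaper to fetch and parse; a minimal sketch with hypothetical field choices:

query_names = [{
  "id": None,
  "name": None,
  "type": "/people/person"
}]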

In [36]:
def get_iterator(q, cursor=''):
  """Page through MQL results, yielding (cursor, result) per response."""
  params = {
            'query': json.dumps(q),
            'key': API_KEY,
            'cursor': cursor
           }
  progress = True
  while progress:
    url = SERVICE_URL + '?' + urllib.urlencode(params)
    try:
      response = json.loads(urllib.urlopen(url).read())
    except (IOError, ValueError):
      sleep(30)  # network error or malformed JSON; back off and retry
      continue
    if 'cursor' not in response:
      sleep(30)  # no cursor in response (e.g. quota exceeded); retry
      continue
    params['cursor'] = response['cursor']
    if response['cursor'] is False:  # False marks the last page
      progress = False
    yield response['cursor'], response['result']
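get_iterator is a generator: it yields one (cursor, result-list) pair per HTTP response, carrying the cursor forward between requests until the API returns cursor == False on the final page. A minimal smoke test, assuming the query and key above (this issues a live request):

pages = get_iterator(query)
cursor, people = next(pages)
print len(people)  # rows in the first page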

In [39]:
cursor = ''
j = 0
# Resume from the last checkpoint; cursors.pkl holds (file_index, cursor) pairs.
j, cursor = pickle.load(open(path.join(dir_, "cursors.pkl"), 'rb'))[-1]

In [40]:
print j


337

In [41]:
j += 1
cursors = []
persons = []
filesize = 10000  # records per pickle file

for cursor, people in get_iterator(query, cursor):
  persons.extend(people)
  if len(persons) >= filesize:
    cursors.append((j, cursor))
    persons_file = path.join(dir_, "person_{}.pkl".format(j))
    cursors_file = path.join(dir_, "cursors.pkl")
    pickle.dump(persons, open(persons_file, 'wb'))
    pickle.dump(cursors, open(cursors_file, "wb"))
    j += 1
    persons = []
    print "\r%i" % (j+1)*filesize,

In [42]:
j


Out[42]:
338
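To read the dumps back, the person_*.pkl files can be globbed and unpickled; a sketch assuming the naming scheme used above (glob is already imported):

all_persons = []
for fname in sorted(glob(path.join(dir_, "person_*.pkl"))):  # lexicographic order
  all_persons.extend(pickle.load(open(fname, 'rb')))
print len(all_persons)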