In [2]:
from time import time, sleep
from glob import glob
import json
import urllib
import string
import cPickle as pickle
from os import path
from collections import defaultdict

In [3]:
API_KEY = "AIzaSyAnupT8pNVHf2WFPidvMcmdrfXgt6RoM0w"
SERVICE_URL = 'https://www.googleapis.com/freebase/v1/mqlread'
cursor = ""

In [4]:
def get_iterator(q):
  # Page through MQL results; yields (cursor, result) for each batch.
  params = {
            'query': json.dumps(q),
            'key': API_KEY,
            'cursor': cursor
           }
  progress = True
  while progress:
    url = SERVICE_URL + '?' + urllib.urlencode(params)
    try:
      response = json.loads(urllib.urlopen(url).read())
    except Exception:
      # Transient network or JSON error: back off, then retry the same cursor.
      sleep(30)
      continue
    if 'cursor' not in response:
      # Throttled or malformed response: back off, then retry.
      sleep(30)
      continue
    params['cursor'] = response['cursor']
    if response['cursor'] is False:
      # Freebase sets the cursor to False once the last page has been served.
      progress = False
    yield response['cursor'], response['result']
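
The generator above follows Freebase's cursor convention: every response carries a cursor to send back with the next request, and the service returns cursor == False once the last page has been delivered; failed or throttled requests are simply retried after a 30-second pause. A minimal sketch of draining a query into one list (the same pattern the cells below use; the variable names here are illustrative only):

rows = []
for cur, page in get_iterator(query):
  rows.extend(page)   # each page is a list of result dicts
print len(rows)       # total number of matching topics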

In [5]:
null = None  # lets the MQL query below be written exactly as it appears in JSON
query = [{
  "id": null,
  "name": null,
  "type": "/people/ethnicity",
  "/people/ethnicity/included_in_group": []
}]
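
In the query, id and name are left null so Freebase fills them in, and the empty list asks for every larger group each ethnicity is included in. Each result item should come back as a dict mirroring the query, roughly like the sketch below (the id value is a made-up placeholder; the name and group echo the lookup shown further down):

{
  "id": "/m/xxxxxxx",   # placeholder mid, not a real Freebase id
  "name": "Arab American",
  "type": "/people/ethnicity",
  "/people/ethnicity/included_in_group": ["Asian American"]
}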

In [6]:
response = []
for cursor, partial_results in get_iterator(query):
  response.extend(partial_results)

In [7]:
graph = defaultdict(list)
# Map each ethnicity name to the larger group(s) it is included in.
for link in response:
  graph[link["name"]].extend(link["/people/ethnicity/included_in_group"])
graph = dict(graph)

In [8]:
print len(graph)


5686

In [9]:
parents_cache = graph
all_parents_cache = {}
depth_cache = {}
grandparent_cache = {}

def get_parents(k):
  # Immediate parent groups of ethnicity k (empty list if k is unknown).
  return parents_cache.get(k, [])

def get_depth(k, deep=0):
  # Height of k in the hierarchy; recursion is capped at 6 levels.
  if deep >= 6:
    return 0
  if k not in depth_cache:
    depths = [get_depth(x, deep+1) for x in get_parents(k)]
    depths.append(0)
    depth_cache[k] = max(depths) + 1
  return depth_cache[k]

def get_all_parents(k, deep=0):
  # All ancestors of k (transitive closure), capped at 6 levels.
  if deep >= 6: return []
  if k not in all_parents_cache:
    tmp = list(get_parents(k))
    all_parents = list(tmp)
    for parent in tmp:
      all_parents.extend(get_all_parents(parent, deep+1))
    all_parents_cache[k] = list(set(all_parents).difference([k]))
  return all_parents_cache[k]

def get_grandparent(k, deep=0):
  # Top-level ancestor of k, following the first usable parent at each step.
  if deep >= 6: return k
  if not get_parents(k): return k
  if k not in grandparent_cache:
    grandparents = [get_grandparent(x, deep+1) for x in get_parents(k)]
    grandparents = [x for x in grandparents if x]
    grandparent_cache[k] = grandparents[0] if grandparents else k
  return grandparent_cache[k]
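
To make the traversal helpers concrete, here is a small self-contained check on a toy hierarchy (the two labels below are made up purely for illustration; the real Freebase graph is restored, and the caches cleared, at the end):

parents_cache = {"Toy Subgroup": ["Toy Group"], "Toy Group": []}
depth_cache, all_parents_cache, grandparent_cache = {}, {}, {}

print get_depth("Toy Subgroup")        # 2: the label itself plus one ancestor level
print get_all_parents("Toy Subgroup")  # ['Toy Group']
print get_grandparent("Toy Subgroup")  # 'Toy Group', the top of the chain

# Point the helpers back at the real graph and reset the caches.
parents_cache = graph
depth_cache, all_parents_cache, grandparent_cache = {}, {}, {}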

In [10]:
for k in graph:
  grandparent_cache[k] = get_grandparent(k)

In [12]:
graph["Arab American"]


Out[12]:
[u'Asian American']

Calculate the frequency of each ethnicity label


In [13]:
import cPickle as pickle
from collections import Counter

In [14]:
people_db = pickle.load(open("/data/csc/fb_persons/100percentpeople.pkl", "rb"))
freqs = []
# Flatten the ethnicity column: a row may hold a single label or a tuple of labels.
for x in people_db[["ethnicity"]].dropna().values.flatten():
  if isinstance(x, tuple):
    freqs.extend(x)
  else:
    freqs.append(x)

In [15]:
people_db = None  # release the full people table; only the flattened label list is needed

In [17]:
eth_freq = Counter(freqs)

In [18]:
total = float(sum(eth_freq.values()))
# Empirical probability of each ethnicity label among all labelled records.
eth_prob = {k: c/total for k, c in eth_freq.iteritems()}

In [34]:



Out[34]:
0.011462612982744454

Strategy #1

Merge only the infrequent labels
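
Concretely: a label whose empirical probability clears the threshold is kept as-is; a rare label is replaced by its top-level ancestor when that ancestor is frequent enough on its own, and collapsed into "Other" otherwise. The sketch below restates that decision as a standalone function (the dict arguments mirror eth_prob and grandparent_cache from the cells above; the two cells that follow implement the same rule inline):

def merge_label(f, probs, ancestors, threshold):
  # Frequent enough on its own: keep the original label.
  if probs[f] >= threshold:
    return f
  top = ancestors.get(f)
  # No usable ancestor, or the ancestor is itself too rare: lump into "Other".
  if top is None or top == f or probs.get(top, 0.0) < threshold:
    return "Other"
  return top

# e.g. new_freqs = [merge_label(f, eth_prob, grandparent_cache, threshold) for f in freqs]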


In [37]:
new_freqs = []
# Cutoff: the empirical probability of the label at index 200 of most_common().
threshold = eth_freq.most_common()[200][1]/total
for f in freqs:
  if eth_prob[f] < threshold:
    # Rare label: fall back to its top-level ancestor if that ancestor is
    # frequent enough on its own, otherwise collapse it into "Other".
    if (f not in grandparent_cache or
        f == grandparent_cache[f] or
        grandparent_cache[f] not in eth_prob or
        eth_prob[grandparent_cache[f]] < threshold):
      new_freqs.append("Other")
    else:
      new_freqs.append(grandparent_cache[f])
  else:
    new_freqs.append(f)

In [61]:
mapping = defaultdict(lambda: set([]))
threshold = eth_freq.most_common()[200][1]/total
for f in freqs:
  if not f: continue
  if eth_prob[f] < threshold:
    if (f not in grandparent_cache or
    f == grandparent_cache[f] or 
    grandparent_cache[f] not in eth_prob or 
    eth_prob[grandparent_cache[f]] < threshold):
      mapping["Other"].update([f])
      continue
    else:
      mapping[grandparent_cache[f]].update([f])
  else:
    mapping[f].update([f])
mapping = dict(mapping)

In [62]:
tmp = Counter(new_freqs)
selected_ethnicities = tmp.keys()
print len(selected_ethnicities)
# One row per merged label: name, count, and the original labels folded into it.
lines = u"\n".join([u"{},{},{}".format(k, v, "|".join(mapping[k])) for k, v in tmp.most_common()])
fh = open("ethnicities.csv", "w")
fh.write(lines.encode("utf8"))
fh.close()


202
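
Each row of ethnicities.csv therefore holds a merged label, its count in new_freqs, and the pipe-separated original labels that were folded into it:

<merged label>,<count>,<original label>|<original label>|...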