In [2]:
from time import time, sleep
from glob import glob
import json
import urllib
import string
import cPickle as pickle
from os import path
from collections import defaultdict
In [3]:
API_KEY = "AIzaSyAnupT8pNVHf2WFPidvMcmdrfXgt6RoM0w"
SERVICE_URL = 'https://www.googleapis.com/freebase/v1/mqlread'
cursor = ""
In [4]:
def get_iterator(q):
    params = {
        'query': json.dumps(q),
        'key': API_KEY,
        'cursor': cursor
    }
    progress = True
    while progress:
        url = SERVICE_URL + '?' + urllib.urlencode(params)
        try:
            response = json.loads(urllib.urlopen(url).read())
        except (IOError, ValueError):
            # Network error or truncated/malformed JSON: back off and retry.
            sleep(30)
            continue
        if 'cursor' not in response:
            # No cursor usually means a quota or error payload: retry.
            sleep(30)
            continue
        params['cursor'] = response['cursor']
        if response['cursor'] == False:
            # mqlread signals the final page with cursor=false.
            progress = False
        yield response['cursor'], response['result']
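For reference, mqlread paginates with an opaque cursor: an empty cursor starts the read, each response carries the cursor for the next page, and the API sends cursor: false once the results are exhausted, which is what flips progress above. A response body looks roughly like this (the concrete values here are made up for illustration):

# Illustrative only -- the shape of one mqlread page, not real data.
example_response = {
    "result": [{"id": "/m/0x67", "name": "African American",
                "type": "/people/ethnicity",
                "/people/ethnicity/included_in_group": []}],
    "cursor": "eNp8kD1..."  # opaque token; false on the last page
}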
In [5]:
# MQL uses JSON null for "fill this in"; alias it so the query reads
# exactly like it does in the Freebase query editor.
null = None
query = [{
    "id": null,
    "name": null,
    "type": "/people/ethnicity",
    "/people/ethnicity/included_in_group": []
}]
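MQL is query-by-example: a null value asks Freebase to fill the property in, and an empty list asks for all values of that property. To spot-check a single group, the same template can be constrained by name (a sketch, not run here):

# Sketch: constrain the same template to one ethnicity.
probe = [{
    "id": null,
    "name": "Arab American",
    "type": "/people/ethnicity",
    "/people/ethnicity/included_in_group": []
}]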
In [6]:
response = []
for cursor, partial_results in get_iterator(query):
    response.extend(partial_results)
In [7]:
graph = defaultdict(list)
for link in response:
    graph[link["name"]].extend(link["/people/ethnicity/included_in_group"])
graph = dict(graph)
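graph now maps each ethnicity name to the names of the broader groups it belongs to, so edges point child-to-parent. A quick way to inspect one entry and the overall fan-out (any name present in the data works):

# Inspect one adjacency list and the average number of parents per node.
print graph.get("Arab American")
print sum(len(v) for v in graph.itervalues()) / float(len(graph))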
In [8]:
print len(graph)
In [9]:
parents_cache = graph
all_parents_cache = {}
depth_cache = {}
grandparent_cache = {}

def get_parents(k):
    # Direct parents (broader groups) of k; unknown names have none.
    if k in parents_cache:
        return parents_cache[k]
    else:
        return []

def get_depth(k, deep=0):
    # Longest child-to-parent chain above k, capped at 6 hops to
    # survive cycles in the included_in_group data.
    if deep >= 6:
        return 0
    if k not in depth_cache:
        depths = [get_depth(x, deep + 1) for x in get_parents(k)]
        depths.append(0)
        depth_cache[k] = max(depths) + 1
    return depth_cache[k]

def get_all_parents(k, deep=0):
    # Transitive closure of get_parents, again capped at 6 hops.
    if deep >= 6:
        return []
    if k not in all_parents_cache:
        tmp = list(get_parents(k))
        all_parents = list(tmp)
        for parent in tmp:
            all_parents.extend(get_all_parents(parent, deep + 1))
        # A cycle can make k its own ancestor; drop it.
        all_parents_cache[k] = list(set(all_parents).difference([k]))
    return all_parents_cache[k]

def get_grandparent(k, deep=0):
    # Top-level ancestor of k, following the first parent at each level.
    if deep >= 6:
        return k
    if not get_parents(k):
        return k
    if k not in grandparent_cache:
        grandparents = [get_grandparent(x, deep + 1) for x in get_parents(k)]
        grandparents = [x for x in grandparents if x]
        # Fall back to k itself if every ancestor came back falsy.
        grandparent_cache[k] = grandparents[0] if grandparents else k
    return grandparent_cache[k]
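A quick sanity check of the three helpers; note that get_grandparent follows the first parent at each level, so ties between multiple parents are resolved by whatever order Freebase returned. Any key present in graph can be probed:

# Probe the helpers on one node (any key in graph works).
k = "Arab American"
print get_depth(k)            # longest chain of broader groups, capped at 6
print get_all_parents(k)      # transitive closure of included_in_group
print get_grandparent(k)      # top-level label used for the roll-up below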
In [10]:
for k in graph:
    grandparent_cache[k] = get_grandparent(k)
In [12]:
graph["Arab American"]
Out[12]:
In [13]:
import cPickle as pickle
from collections import Counter
In [14]:
people_db = pickle.load(open("/data/csc/fb_persons/100percentpeople.pkl", "rb"))
freqs = []
for x in people_db[["ethnicity"]].dropna().values.flatten():
    if isinstance(x, tuple):
        freqs.extend(x)
    else:
        freqs.append(x)
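The ethnicity column mixes scalars (one recorded ethnicity) with tuples (several per person), so freqs ends up with one entry per person-ethnicity pair. A quick look at the scale of the raw label space:

# Raw mentions vs. distinct ethnicity labels.
print len(freqs), len(set(freqs))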
In [15]:
people_db = None  # release the DataFrame; only freqs is needed from here on
In [17]:
eth_freq = Counter(freqs)
In [18]:
total = float(sum(eth_freq.values()))
eth_prob = {k: c / total for k, c in eth_freq.iteritems()}
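eth_prob is just the relative frequency of each label. The head of the distribution is heavily skewed, which is what motivates the roll-up below; to see it:

# Top of the distribution: label, raw count, relative frequency.
for k, c in eth_freq.most_common(5):
    print k, c, eth_prob[k]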
In [37]:
new_freqs = []
# Frequency of the 201st most common label: anything rarer gets rolled up.
threshold = eth_freq.most_common()[200][1] / total
for f in freqs:
    if eth_prob[f] < threshold:
        if (f not in grandparent_cache or
                f == grandparent_cache[f] or
                grandparent_cache[f] not in eth_prob or
                eth_prob[grandparent_cache[f]] < threshold):
            # Rare label with no sufficiently common ancestor.
            new_freqs.append("Other")
        else:
            new_freqs.append(grandparent_cache[f])
    else:
        new_freqs.append(f)
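The cutoff is the relative frequency of the 201st most common label (most_common() is sorted descending, so index 200 is the 201st entry). Everything rarer is replaced by its top-level ancestor when that ancestor clears the same bar, and by "Other" otherwise. A rough check that the label space collapsed as intended:

# After the roll-up there should be roughly the 200 most common labels,
# plus the ancestors that absorbed rare ones, plus "Other".
rolled = Counter(new_freqs)
print len(rolled), rolled["Other"]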
In [61]:
# Same roll-up as above, but record which raw labels fold into each kept one.
mapping = defaultdict(set)
threshold = eth_freq.most_common()[200][1] / total
for f in freqs:
    if not f:
        continue
    if eth_prob[f] < threshold:
        if (f not in grandparent_cache or
                f == grandparent_cache[f] or
                grandparent_cache[f] not in eth_prob or
                eth_prob[grandparent_cache[f]] < threshold):
            mapping["Other"].add(f)
        else:
            mapping[grandparent_cache[f]].add(f)
    else:
        mapping[f].add(f)
mapping = dict(mapping)
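mapping records the inverse of the roll-up: for each surviving label, the set of raw labels that were folded into it (each frequent label maps at least to itself). For example:

# How many labels survived, and how many raw labels fell into "Other".
print len(mapping), len(mapping.get("Other", set()))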
In [62]:
tmp = Counter(new_freqs)
selected_eths = tmp.keys()
print len(selected_eths)
# One row per kept label: name, count, pipe-joined raw labels.
lines = u"\n".join(u"{},{},{}".format(k, v, u"|".join(mapping[k]))
                   for k, v in tmp.most_common())
fh = open("ethnicities.csv", "w")
fh.write(lines.encode("utf8"))
fh.close()
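Each row of ethnicities.csv is label,count,src1|src2|... with the source labels pipe-joined; note that the naive comma separator would break for any label that itself contains a comma. Reading it back (a sketch):

# Sketch: parse the file written above; split on the first two commas only.
for line in open("ethnicities.csv"):
    label, count, sources = line.decode("utf8").rstrip("\n").split(",", 2)
    # sources.split("|") recovers the raw labels folded into label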