In [1]:
from time import time,sleep
from glob import glob
import json
import urllib
import string
import cPickle as pickle
from os import path
from collections import defaultdict
In [2]:
# NOTE(review): hardcoded API credential — keys belong in the environment
# (os.environ / getpass) or a secrets manager, never in source control.
# This key is now public and should be revoked/rotated.
API_KEY = "AIzaSyAnupT8pNVHf2WFPidvMcmdrfXgt6RoM0w"
SERVICE_URL = 'https://www.googleapis.com/freebase/v1/mqlread'
# Initial MQL-read paging cursor; "" requests the first page.
# get_iterator() reads this module-level value for its first request.
cursor = ""
In [3]:
def get_iterator(q, start_cursor=None):
    """Page through Freebase MQL-read results for query ``q``.

    Generator yielding one ``(cursor, result)`` pair per response page
    until the API reports ``cursor: false`` (last page).  Transient
    failures — network errors or responses that carry no cursor (quota,
    bad key, ...) — are retried indefinitely after a 30-second sleep.

    Parameters
    ----------
    q : list or dict
        JSON-serializable MQL query.
    start_cursor : str, optional
        Cursor to resume paging from.  Defaults to the module-level
        ``cursor`` for backward compatibility with the original code.
    """
    params = {
        'query': json.dumps(q),
        'key': API_KEY,
        'cursor': cursor if start_cursor is None else start_cursor,
    }
    progress = True
    while progress:
        url = SERVICE_URL + '?' + urllib.urlencode(params)
        try:
            response = json.loads(urllib.urlopen(url).read())
        except Exception:
            # Was a bare ``except:`` — that also swallowed SystemExit and
            # KeyboardInterrupt, making the retry loop uninterruptible.
            sleep(30)
            continue
        if 'cursor' not in response:
            # Error payload (rate limit, invalid key, ...): back off, retry.
            sleep(30)
            continue
        params['cursor'] = response['cursor']
        if response['cursor'] is False:  # API signals the last page with false
            progress = False
        yield response['cursor'], response['result']
In [4]:
# ``null`` aliases None so the MQL template can be copy-pasted to and
# from raw JSON without edits.
null = None

# First query: every /people/profession topic together with the broader
# professions it is a specialization of.
profession_fields = {
    "id": null,
    "name": null,
    "type": "/people/profession",
    "/people/profession/specialization_of": [],
}
query = [profession_fields]
In [5]:
# Accumulate every result page into one flat list of profession records.
# Side effect: rebinds the module-level ``cursor`` on each page.
response = []
for cursor, partial_results in get_iterator(query):
    response.extend(partial_results)
In [6]:
len(response)
Out[6]:
In [22]:
# Build the upward hierarchy: profession name -> list of the broader
# professions it specializes.
graph = defaultdict(list)
for record in response:
    graph[record["name"]].extend(record["/people/profession/specialization_of"])
graph = dict(graph)
In [23]:
graph["Manager"]
Out[23]:
In [252]:
# JSON-compatibility alias, as for the first query.
null = None

# Second query: for each profession, the *narrower* professions that
# specialize it (inverse direction of ``query``).
specialization_fields = {
    "id": null,
    "name": null,
    "type": "/people/profession",
    "/people/profession/specializations": [],
}
query2 = [specialization_fields]
In [255]:
# Fetch every result page for the specializations query into one list.
specializations = []
for cursor, partial_results in get_iterator(query2):
    specializations.extend(partial_results)
In [ ]:
len(specializations)
In [251]:
# Build the downward hierarchy: profession name -> professions that
# specialize it.
# BUG FIX: the original iterated ``response`` (records from the first
# query, which only carry ".../specialization_of"); the records that
# carry the ".../specializations" key are in ``specializations``.  The
# original cell only appeared to work because of stale kernel state
# (note the out-of-order execution counts In[251] vs In[255]).
special_graph = defaultdict(list)
for link in specializations:
    special_graph[link["name"]].extend(link["/people/profession/specializations"])
special_graph = dict(special_graph)
In [ ]:
# Deduplicate each parent list: profession -> set of direct parents.
graph = {profession: set(parents) for profession, parents in graph.items()}
In [187]:
# Memoization caches for the hierarchy traversals below.
parents_cache = graph  # direct parents (alias of ``graph``)
all_parents_cache = {}  # profession -> all transitive ancestors
depth_cache = {}  # profession -> height in the hierarchy
grandparent_cache = {}  # profession -> root ancestor
In [101]:
def get_parents(k):
    """Return the direct parent professions of ``k`` (empty list if unknown)."""
    return parents_cache.get(k, [])
In [102]:
def get_depth(k, deep=0):
    """Height of profession ``k`` in the hierarchy (a parentless node is 1).

    ``deep`` caps recursion at 6 levels to survive cycles in the data.
    NOTE(review): values computed while truncated by the cap are still
    memoized in ``depth_cache`` and reused by shallower call sites —
    confirm this approximation is acceptable.
    """
    if deep >= 6:
        return 0
    if k not in depth_cache:
        depths = [get_depth(x, deep+1) for x in get_parents(k)]
        depths.append(0)  # floor so leaf nodes get depth 1
        depth_cache[k] = max(depths) + 1
    return depth_cache[k]
In [104]:
def get_all_parents(k, deep=0):
    """All transitive ancestors of ``k``, excluding ``k`` itself.

    Recursion is capped at 6 levels to survive cycles; results are
    memoized in ``all_parents_cache``.
    NOTE(review): as with get_depth, cap-truncated results get cached
    and may be reused by shallower calls.
    """
    if deep >= 6: return []
    if k not in all_parents_cache:
        tmp = list(get_parents(k))
        all_parents = list(tmp)
        for parent in tmp:
            all_parents.extend(get_all_parents(parent, deep+1))
        # Deduplicate and drop self-references introduced by cycles.
        all_parents_cache[k] = list(set(all_parents).difference([k]))
    return all_parents_cache[k]
In [169]:
def get_grandparent(k, deep=0):
    """Root ancestor of ``k``, following the first parent chain upward.

    Returns ``k`` itself when it has no parents or the 6-level cycle
    cap is reached.  Memoized in ``grandparent_cache``.
    NOTE(review): ``grandparents[0]`` assumes at least one recursive
    result is truthy; a parent list that yields only falsy names would
    raise IndexError here.
    """
    if deep >= 6: return k
    if not get_parents(k): return k
    if k not in grandparent_cache:
        grandparents = [get_grandparent(x, deep+1) for x in get_parents(k)]
        grandparents = [x for x in grandparents if x]  # drop empty/None names
        grandparent_cache[k] = grandparents[0]
    return grandparent_cache[k]
In [107]:
# Warm the depth cache for every known profession.
for k in graph:
    get_depth(k)
In [108]:
# Warm the transitive-ancestor cache for every known profession.
for k in graph:
    get_all_parents(k)
In [193]:
# Force a cache entry for every profession: get_grandparent's early
# returns (cycle cap, parentless nodes) bypass its own memoization.
for k in graph:
    grandparent_cache[k] = get_grandparent(k)
In [124]:
import cPickle as pickle
from collections import Counter
In [126]:
people_db = pickle.load(open("/data/csc/fb_persons/100percentpeople.pkl", "rb"))
In [127]:
# Flatten the "profession" column into one list of profession names;
# people with several professions are stored as tuples.
freqs = []
for entry in people_db[["profession"]].dropna().values.flatten():
    freqs.extend(entry if isinstance(entry, tuple) else (entry,))
In [130]:
people_db = None
In [131]:
prof_freq = Counter(freqs)
In [135]:
# Convert raw counts to relative frequencies (total is float so the
# division is true division under Python 2).
total = float(sum(prof_freq.values()))
prof_prob = {name: count / total for name, count in prof_freq.items()}
In [250]:
grandparent_cache["Lawyer"]
Out[250]:
In [259]:
# Remap rare professions (rarer than the 201st most common) to their
# root ancestor when that root is itself common enough; otherwise to
# the catch-all label "Other".  Common professions pass through as-is.
new_freqs = []
threshold = prof_freq.most_common()[200][1]/total
for f in freqs:
    if prof_prob[f] >= threshold:
        new_freqs.append(f)
        continue
    if (f not in grandparent_cache or
            grandparent_cache[f] == f or
            grandparent_cache[f] not in prof_prob or
            prof_prob[grandparent_cache[f]] < threshold):
        new_freqs.append("Other")
    else:
        new_freqs.append(grandparent_cache[f])
In [260]:
# Record which source professions fold into each final label
# (label -> set of original names), mirroring the remap logic above
# but skipping falsy names.
mapping = defaultdict(set)
threshold = prof_freq.most_common()[200][1]/total
for f in freqs:
    if not f:
        continue
    if prof_prob[f] >= threshold:
        mapping[f].add(f)
        continue
    if (f not in grandparent_cache or
            grandparent_cache[f] == f or
            grandparent_cache[f] not in prof_prob or
            prof_prob[grandparent_cache[f]] < threshold):
        mapping["Other"].add(f)
    else:
        mapping[grandparent_cache[f]].add(f)
mapping = dict(mapping)
In [261]:
tmp = Counter(new_freqs)
selected_profs = tmp.keys()
print len(selected_profs)
lines = u"\n".join([u"{},{},{}".format(k,v, "|".join(mapping.get(k, []))) for k, v in tmp.most_common()])
fh = open("professions.csv", "w")
fh.write(lines.encode("utf8"))
fh.close()
In [247]:
# Alternative remap: always jump straight to the root ancestor, using a
# fixed absolute-frequency threshold instead of a rank-based one.
# Professions without a cached root are silently dropped.
new_freqs2 = []
threshold = 2e-5
for f in freqs:
    if f not in grandparent_cache:
        continue
    g = grandparent_cache[f]
    if g in prof_prob and prof_prob[g] >= threshold:
        new_freqs2.append(g)
    else:
        new_freqs2.append("Other")
In [248]:
# Show the resulting label distribution as "label,count" lines.
tmp = Counter(new_freqs2)
selected_profs = tmp.keys()
print len(selected_profs)
print "\n".join(["{},{}".format(k,v) for k, v in tmp.most_common()])