In [1]:
from time import time,sleep
from glob import glob
import json
import urllib
import string
import cPickle as pickle
from os import path
from collections import defaultdict

Querying Freebase


In [2]:
API_KEY = "AIzaSyAnupT8pNVHf2WFPidvMcmdrfXgt6RoM0w"
SERVICE_URL = 'https://www.googleapis.com/freebase/v1/mqlread'
cursor = ""

In [3]:
def get_iterator(q):
  """Stream MQL results page by page, following the Freebase cursor."""
  params = {
            'query': json.dumps(q),
            'key': API_KEY,
            'cursor': cursor
           }
  progress = True
  while progress:
    url = SERVICE_URL + '?' + urllib.urlencode(params)
    try:
      response = json.loads(urllib.urlopen(url).read())
    except:
      # Network hiccup or malformed JSON: back off, then retry the same page.
      sleep(30)
      continue
    if 'cursor' not in response:
      # Usually an error/quota payload: back off and retry as well.
      sleep(30)
      continue
      #raise BadResponse("Response does not contain cursor.")
    params['cursor'] = response['cursor']
    if response['cursor'] == False:
      # A cursor of False marks the last page.
      progress = False
    yield response['cursor'], response['result']
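
get_iterator streams the result list page by page, backing off for 30 seconds on errors and following the returned cursor until the service sends cursor == False. A minimal consumption sketch (the query here is a placeholder, and islice merely caps the number of pages, which is handy for previews given how long the full crawls below can run):


In [ ]:
from itertools import islice

preview_query = [{"id": None, "name": None, "type": "/people/profession"}]  # placeholder query
preview = []
for cur, page in islice(get_iterator(preview_query), 3):  # fetch at most 3 pages
  preview.extend(page)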

Specialization-of attribute


In [4]:
null = None  # lets the MQL query be written with JSON-style nulls
query = [{
  "id": null,
  "name": null,
  "type": "/people/profession",
  "/people/profession/specialization_of": []   # empty list: return every parent profession
}]
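
For orientation, each element of the result list should look roughly like the record below; the empty list in the query asks MQL to fill in all linked professions, which come back as plain name strings (compare the graph["Manager"] output further down). The id shown is a placeholder, not a real Freebase mid.


In [ ]:
# Hypothetical single record returned by the query above (values are placeholders).
{
  "id": "/m/0xxxxx",
  "name": "Manager",
  "type": "/people/profession",
  "/people/profession/specialization_of": ["Coach", "Business executive"]
}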

In [5]:
response = []
for cursor, partial_results in get_iterator(query):
  response.extend(partial_results)

In [6]:
len(response)


Out[6]:
4152

In [22]:
# Map each profession name to the professions it is a specialization of (its parents).
graph = defaultdict(lambda: [])
for link in response:
  graph[link["name"]].extend(link["/people/profession/specialization_of"])
graph = dict(graph)

In [23]:
graph["Manager"]


Out[23]:
[u'Baseball Coach', u'Ironmaster', u'Coach', u'Business executive']

Specializations attribute


In [252]:
null = None
query2 = [{
  "id": null,
  "name": null,
  "type": "/people/profession",
  "/people/profession/specializations": []
}]

In [255]:
specializations = []
for cursor, partial_results in get_iterator(query2):
  specializations.extend(partial_results)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-255-cc059087b587> in <module>()
      1 specializations = []
----> 2 for cursor, partial_results in get_iterator(query2):
      3   specializations.extend(partial_results)

<ipython-input-3-b6e262e691f9> in get_iterator(q)
     14       continue
     15     if not 'cursor' in response:
---> 16       sleep(30)
     17       continue
     18       #raise BadResponse("Response does not contain cursor.")

KeyboardInterrupt: 

In [ ]:
len(specializations)

In [251]:
special_graph = defaultdict(lambda: [])
for link in response:
  special_graph[link["name"]].extend(link["/people/profession/specializations"])
special_graph = dict(special_graph)


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-251-0ac2466f2bbd> in <module>()
      1 special_graph = defaultdict(lambda: [])
      2 for link in response:
----> 3   special_graph[link["name"]].extend(link["/people/profession/specializations"])
      4 special_graph = dict(graph)

KeyError: '/people/profession/specializations'

Eliminating double edges


In [ ]:
for k in graph:
  graph[k] = set(graph[k])

Calculating Depth and All Parents


In [187]:
parents_cache = graph   # an alias of graph, not a copy
all_parents_cache = {}
depth_cache = {}
grandparent_cache = {}

In [101]:
def get_parents(k):
  # Direct specialization_of parents of k; professions not in the graph have none.
  return parents_cache.get(k, [])

In [102]:
def get_depth(k, deep=0):
  # Depth of k in the hierarchy: 1 for a root profession, otherwise the
  # maximum parent depth plus one; recursion is capped at six levels.
  if deep >= 6:
    return 0
  if k not in depth_cache:
    depths = [get_depth(x, deep+1) for x in get_parents(k)]
    depths.append(0)
    depth_cache[k] = max(depths) + 1
  return depth_cache[k]

In [104]:
def get_all_parents(k, deep=0):
  # Every ancestor reachable from k via specialization_of, within the depth cap.
  if deep >= 6: return []
  if k not in all_parents_cache:
    tmp = list(get_parents(k))
    all_parents = list(tmp)
    for parent in tmp:
      all_parents.extend(get_all_parents(parent, deep+1))
    all_parents_cache[k] = list(set(all_parents).difference([k]))
  return all_parents_cache[k]

In [169]:
def get_grandparent(k, deep=0):
  # Despite the name, this returns a root ancestor: it follows the first listed
  # parent at each level until a profession with no parents (or the depth cap)
  # is reached.
  if deep >= 6: return k
  if not get_parents(k): return k
  if k not in grandparent_cache:
    grandparents = [get_grandparent(x, deep+1) for x in get_parents(k)]
    grandparents = [x for x in grandparents if x]
    grandparent_cache[k] = grandparents[0]
  return grandparent_cache[k]
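
The three helpers walk the specialization_of edges upward. A throwaway check on a tiny invented hierarchy illustrates their behaviour; it temporarily swaps in toy caches so the real ones are not touched:


In [ ]:
# Throwaway sanity check on an invented three-level chain.
_saved = parents_cache, depth_cache, all_parents_cache, grandparent_cache
parents_cache = {"Criminal defense lawyer": ["Lawyer"], "Lawyer": ["Jurist"], "Jurist": []}
depth_cache, all_parents_cache, grandparent_cache = {}, {}, {}

print get_depth("Criminal defense lawyer")        # 3: the chain has three levels
print get_all_parents("Criminal defense lawyer")  # ['Jurist', 'Lawyer'] (order may vary)
print get_grandparent("Criminal defense lawyer")  # 'Jurist', the root of the chain

parents_cache, depth_cache, all_parents_cache, grandparent_cache = _saved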

In [107]:
for k in graph:
  get_depth(k)

In [108]:
for k in graph:
  get_all_parents(k)

In [193]:
for k in graph:
  grandparent_cache[k] = get_grandparent(k)

Calculating the Frequency


In [124]:
import cPickle as pickle
from collections import Counter

In [126]:
people_db = pickle.load(open("/data/csc/fb_persons/100percentpeople.pkl", "rb"))

In [127]:
freqs = []
for x in people_db[["profession"]].dropna().values.flatten():
  if isinstance(x, tuple):
    freqs.extend(x)
  else:
    freqs.append(x)
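
The isinstance(x, tuple) branch exists because a profession cell in people_db holds either a single label or a tuple of labels (the exact schema is only inferred from this code; the toy frame below is an assumption used purely for illustration):


In [ ]:
import pandas as pd

# Toy frame with the two value shapes the loop above has to handle.
toy = pd.DataFrame({"profession": [u"Actor", (u"Actor", u"Writer"), None]})
toy_freqs = []
for x in toy[["profession"]].dropna().values.flatten():
  if isinstance(x, tuple):
    toy_freqs.extend(x)   # multi-profession rows contribute one entry per label
  else:
    toy_freqs.append(x)
# toy_freqs == [u'Actor', u'Actor', u'Writer']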

In [130]:
people_db = None  # release the large table now that freqs has been built

In [131]:
prof_freq = Counter(freqs)

In [135]:
total = float(sum(prof_freq.values()))
prof_prob = {k:c/total for k, c in prof_freq.iteritems()}

In [250]:
grandparent_cache["Lawyer"]


Out[250]:
u'Criminal defense lawyer'

Strategy #1

Merge only the infrequent labels


In [259]:
new_freqs = []
# Cut-off: the relative frequency of the 201st most common profession.
threshold = prof_freq.most_common()[200][1]/total
for f in freqs:
  if prof_prob[f] < threshold:
    # Infrequent label: lift it to its root ancestor if that ancestor is itself
    # frequent enough, otherwise collapse it into "Other".
    if (f not in grandparent_cache or
        f == grandparent_cache[f] or
        grandparent_cache[f] not in prof_prob or
        prof_prob[grandparent_cache[f]] < threshold):
      new_freqs.append("Other")
      continue
    else:
      new_freqs.append(grandparent_cache[f])
  else:
    new_freqs.append(f)

In [260]:
# Same rule as above, but also record which original labels were merged into
# each surviving label.
mapping = defaultdict(lambda: set([]))
threshold = prof_freq.most_common()[200][1]/total
for f in freqs:
  if not f: continue
  if prof_prob[f] < threshold:
    if (f not in grandparent_cache or
        f == grandparent_cache[f] or
        grandparent_cache[f] not in prof_prob or
        prof_prob[grandparent_cache[f]] < threshold):
      mapping["Other"].update([f])
      continue
    else:
      mapping[grandparent_cache[f]].update([f])
  else:
    mapping[f].update([f])
mapping = dict(mapping)
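
The merge rule is written out twice above (once for new_freqs, once for mapping). A small helper, introduced here purely for illustration (merge_label is not used elsewhere in the notebook), states it once: keep a frequent label, lift an infrequent one to its root ancestor when that ancestor is itself frequent enough, and collapse it into "Other" otherwise.


In [ ]:
def merge_label(f, threshold):
  # Equivalent to the inline condition used in the two cells above.
  if prof_prob[f] >= threshold:
    return f
  g = grandparent_cache.get(f)
  if g is None or g == f or prof_prob.get(g, 0.0) < threshold:
    return "Other"
  return g

# e.g. new_freqs = [merge_label(f, threshold) for f in freqs if f]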

In [261]:
tmp = Counter(new_freqs)
selected_profs = tmp.keys()
print len(selected_profs)
# CSV columns: surviving label, count, |-separated original labels merged into it.
lines = u"\n".join([u"{},{},{}".format(k, v, "|".join(mapping.get(k, []))) for k, v in tmp.most_common()])
fh = open("professions.csv", "w")
fh.write(lines.encode("utf8"))
fh.close()


203

Strategy #2

Merge All


In [247]:
new_freqs2 = []
threshold = 2e-5
for f in freqs:
  if f in grandparent_cache:
    g = grandparent_cache[f]
    if (g not in prof_prob or 
       prof_prob[g] < threshold):
      new_freqs2.append("Other")
    else:
      new_freqs2.append(g)
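
Unlike Strategy #1, a label with no entry in grandparent_cache is simply skipped here (there is no else branch on the outer condition), so new_freqs2 can be shorter than freqs. A quick check of how much gets dropped (illustrative only):


In [ ]:
dropped = sum(1 for f in freqs if f not in grandparent_cache)
print dropped, "of", len(freqs), "profession labels have no grandparent_cache entry and are skipped"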

In [248]:
tmp = Counter(new_freqs2)
selected_profs = tmp.keys()
print len(selected_profs)
print "\n".join(["{},{}".format(k,v) for k, v in tmp.most_common()])


99
Actor,453366
Artist,298794
Writer,202413
Athlete,134553
Other,106602
Film Producer,98519
Film director,76402
Politician,51540
Editor,49424
Television producer,32785
Businessperson,28053
Scientist,21400
Educator,20931
Baseball Coach,19645
Mathematician,19309
Performer,17952
Engineer,15234
Broadcaster,13957
Television Director,13518
Referee,10993
Entertainer,10526
Business executive,10173
Set Decorator,7914
Entrepreneur,5331
Casting Director,5307
Historian,4319
Soldier,4128
Navigator,4094
Philosopher,3946
Economist,3437
Producer,2719
Critic,1981
Activist,1898
Military Officer,1875
Peace officer,1700
Technician,1473
Publisher,908
Scholar,872
Political Scientist,807
Anthropologist,795
Accountant,705
Television presenter,645
Filmmaker,631
Salesperson,561
Office Worker,380
Philanthropist,363
Investor,348
Social Worker,336
Civil servant,321
Special effects supervisor,296
Stunt Coordinator,287
Boom Operator,272
Tarento,265
Sex worker,261
Explorer,259
Animation Director,232
Warlord,213
Foley Artist,198
Business magnate,171
Set Dresser,151
Manufacturer,125
Socialite,122
Sound Mixer,121
Public Servant,116
Lighting Director,108
Truck driver,108
ADR Recordist,106
Art collector,100
TV Art Director,93
ADR Editor,86
Sound Department,79
Cinematography,76
ADR Mixer,75
Key Makeup Artist,70
Foley Editor,70
Bodyguard,70
Veteran,70
Visual Effects Coordinator,67
Foley Mixer,65
Advertising Executive,65
Nobleman,63
Visual Effects,58
Script supervisor,55
Taxi driver,50
Media proprietor,46
Mail carrier,46
Rector,45
ADR Director,45
Humanitarian,45
Bus driver,44
Postal worker,43
Special Effects Foreman,43
CG Supervisor,43
Sound Supervisor,43
Bookkeeper,42
On-set Dresser,42
Foley Recordist,39
Ophthalmology,39
Special Effects,38