In [1]:
from io import open
import cPickle as pickle
import pandas as pd
import numpy as np

In [2]:
# Load the four category-definition tables. Each CSV is read with its header
# row as column names and a default integer index (index_col=None keeps every
# column as data). pd.read_csv replaces DataFrame.from_csv, which pandas
# deprecated in 0.21 and removed in 1.0.
# NOTE(review): machine-specific absolute path; consider making it configurable.
COMPSOCIAL_DIR = "/home/rmyeid/notebooks/compsocial/"
religions_df = pd.read_csv(COMPSOCIAL_DIR + "religions.csv", index_col=None)
professions_df = pd.read_csv(COMPSOCIAL_DIR + "professions.csv", index_col=None)
ethnicities_df = pd.read_csv(COMPSOCIAL_DIR + "ethnicities.csv", index_col=None)
countries_df = pd.read_csv(COMPSOCIAL_DIR + "countries.csv", index_col=None)

In [3]:
# Build, per attribute, a lookup from a subcategory label to its normalized
# category name: maps[attr][subcategory] -> category.
maps = {}
# The loop variable is deliberately not called `df`, so it cannot be confused
# with the people frame that a later cell binds to that name.
for attr, category_df in [("religion", religions_df),
                          ("ethnicities", ethnicities_df),
                          ("professions", professions_df),
                          ("countries", countries_df)]:
  # Progress marker; the single-argument parenthesised form prints the same
  # text under Python 2 and Python 3.
  print(attr)
  maps[attr] = {}
  for row in category_df.values:
    # First column is the category, last column a "|"-separated subcategory list.
    cat, subcategories = row[0], row[-1]
    try:
      # Normalize once per row: lower-case, trim, and replace "/" with "_".
      category = cat.lower().strip().replace("/", "_")
      for subcat in subcategories.strip().split("|"):
        maps[attr][subcat] = category
    except AttributeError:
      # A non-string cell (presumably NaN from an empty CSV field) has no
      # string methods: report the row and skip it. Only AttributeError is
      # caught so that KeyboardInterrupt and unrelated bugs still surface.
      print(row)


religion
ethnicities
professions
countries

In [4]:
# Load the pickled people table into `df` (later cells index it by column
# name, so it is presumably a pandas DataFrame). The `with` block closes the
# file handle as soon as loading finishes instead of leaving it open.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("/data/csc/fb_persons/100percentpeople.pkl", "rb") as fh:
  df = pickle.load(fh)

In [5]:
def generalize_label(prof, label):
  """Translate a raw attribute value into the categories of ``maps[label]``.

  prof  -- the raw value: a float, a tuple of labels, or a unicode string.
  label -- key into the module-level ``maps`` dict selecting the lookup table.

  Returns:
    * None when ``prof`` is a float (presumably NaN for a missing value --
      TODO confirm) or is of any other unhandled type.
    * For a tuple: the distinct categories of those members found in the
      table (members not in the table are dropped). Exactly one distinct
      category is returned bare; otherwise a tuple (possibly empty) whose
      order follows set iteration order.
    * For a unicode string: its category, or "other" when it is not in the
      table.
  """
  if isinstance(prof, float):
    return None
  elif isinstance(prof, tuple):
    distinct = tuple(set(maps[label][item] for item in prof if item in maps[label]))
    return distinct[0] if len(distinct) == 1 else distinct
  elif isinstance(prof, unicode):
    return maps[label].get(prof, "other")
  return None

In [6]:
# Overwrite each raw attribute column with its generalized labels.
# Pairs are (column name in df, key in maps); the two spellings differ.
for column, map_key in [("profession", "professions"),
                        ("ethnicity", "ethnicities"),
                        ("religion", "religion")]:
  raw_values = df[[column]].values.flatten()
  df[column] = [generalize_label(x, map_key) for x in raw_values]

In [7]:
# Overwrite the nationality column with labels generalized through the
# "countries" lookup table.
raw_nationalities = df[["nationality"]].values.flatten()
df["nationality"] = [generalize_label(x, "countries") for x in raw_nationalities]

In [8]:
# Persist the relabeled frame. The `with` block guarantees the output file is
# flushed and closed even if pickling raises part-way through.
with open("/data/csc/fb_persons/unified_100percentpeople.pkl", "wb") as out_fh:
  pickle.dump(df, out_fh)