In [1]:
import glob
import pickle
import sys
from datetime import date

import numpy as np
import pandas as pd
from dateutil import parser

In [2]:
# Directory holding the per-chunk person pickles.
pickle_dir = "/data/csc/fb_persons/"
# glob returns matches in arbitrary order; sort them so the record order of the
# resulting DataFrame is the same on every run.
files = sorted(glob.glob(pickle_dir + "person*.pkl"))

In [3]:
# Keys read from each person entry and copied into the flattened record.
fields = [
    "name",
    "profession",
    "date_of_birth",
    "nationality",
    "religion",
    "place_of_birth",
    "gender",
    "ethnicity",
]

In [4]:
# Flatten every person in every pickle file into one dict per person, keyed by
# the names in `fields`:
#   * falsy values (None, "", empty list, ...) become NaN,
#   * one-element lists are unwrapped, longer lists become tuples,
#   * date_of_birth is parsed with dateutil and reduced to its date part.
# A person whose processing raises ValueError is counted in `errors` and skipped.
records = []
errors = 0
total = len(files)
for i, path in enumerate(files):
  # Uncomment to stop after the first 34 files (presumably how the 10% sample
  # was produced -- confirm before relying on it).
  #if i > 33: break

  # Pickle data must be read in binary mode, and the with-block closes each
  # handle instead of leaking one descriptor per input file.
  # NOTE(review): pickle.load can execute arbitrary code; only run this on
  # files from a trusted source.
  with open(path, "rb") as fh:
    data = pickle.load(fh)

  for person in data:
    try:
      record = {}
      for field in fields:
        value = person[field] if person[field] else np.nan
        if isinstance(value, list):
          value = value[0] if len(value) == 1 else tuple(value)
        record[field] = value

      # np.nan is the very object assigned above for missing values, so an
      # identity check is enough to detect "no date of birth".
      if record["date_of_birth"] is not np.nan:
        record["date_of_birth"] = parser.parse(record["date_of_birth"]).date()
    except ValueError:
      errors += 1
      continue
    records.append(record)

  # "\r" returns to the start of the line so the counter updates in place.
  # sys.stdout.write behaves the same on Python 2 and 3, unlike the print
  # statement with a trailing comma.
  sys.stdout.write("\r%i/%i " % (i+1, total))
  sys.stdout.flush()
print("We could not process {} records".format(errors))


338/338 We could not process 4 records

In [5]:
# Build the table from the list of per-person dicts collected in `records`.
df = pd.DataFrame.from_records(data=records)

In [6]:
# Display the number of persons skipped because processing raised ValueError.
errors


Out[6]:
4

In [7]:
# Row count of the assembled DataFrame (one row per successfully processed person).
len(df.index)


Out[7]:
3379996

In [8]:
# Persist the DataFrame. The with-block guarantees the output file is flushed
# and closed as soon as the dump finishes, instead of relying on garbage
# collection of an anonymous file handle.
# Alternative target, presumably for the truncated ~10% run (confirm):
#   "/data/csc/fb_persons/10percentpeople.pkl"
with open("/data/csc/fb_persons/100percentpeople.pkl", "wb") as out_fh:
  pickle.dump(df, out_fh)