In [1]:
import glob
import pickle
import sys
from datetime import date

import numpy as np
import pandas as pd
from dateutil import parser
In [2]:
# Directory holding the pickled person batches.
pickle_dir = "/data/csc/fb_persons/"
# glob returns matches in arbitrary (filesystem-dependent) order; sort them so
# the order of the loaded records is reproducible from run to run.
files = sorted(glob.glob(pickle_dir + "person*.pkl"))
In [3]:
# Keys read from every person entry; each one becomes a key of the output record.
fields = [
    "name",
    "profession",
    "date_of_birth",
    "nationality",
    "religion",
    "place_of_birth",
    "gender",
    "ethnicity",
]
In [4]:
# Build one flat dict per person from every pickle file listed in `files`.
#
# Results:
#   records -- list of dicts keyed by the names in `fields`
#   errors  -- number of person entries skipped because they could not be processed
records = []
errors = 0
total = len(files)
for i, path in enumerate(files):
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only run this on pickles from a trusted source.
    # Pickles are binary data, so open in "rb"; the `with` block closes the
    # handle as soon as the batch is loaded instead of leaking one per file.
    with open(path, "rb") as fh:
        data = pickle.load(fh)
    for person in data:
        try:
            record = {}
            for field in fields:
                # Any falsy value (None, "", empty list, 0, ...) is stored as NaN.
                value = person[field] or np.nan
                if isinstance(value, list):
                    # Unwrap single-element lists; keep multi-valued fields as tuples.
                    value = value[0] if len(value) == 1 else tuple(value)
                record[field] = value
            # Identity comparison is intentional: missing values were set to the
            # np.nan object itself just above.
            if record["date_of_birth"] is not np.nan:
                record["date_of_birth"] = parser.parse(record["date_of_birth"]).date()
        except (ValueError, OverflowError, TypeError):
            # dateutil documents ValueError for unparseable strings, OverflowError
            # for out-of-range values and TypeError for non-string input (e.g. a
            # multi-valued date_of_birth turned into a tuple above). Count the
            # entry as unprocessable and move on rather than aborting the run.
            errors += 1
            continue
        records.append(record)
    # Single-line progress counter; "\r" rewinds so each update overwrites the last.
    sys.stdout.write("\r%i/%i" % (i + 1, total))
    sys.stdout.flush()
if total:
    # Finish the progress line so the summary starts on a fresh line.
    sys.stdout.write("\n")
print("We could not process {} records".format(errors))
In [5]:
# Assemble the records into a DataFrame. Passing `columns=fields` fixes the
# column order to the order of `fields` and keeps every column present even
# when `records` is empty.
df = pd.DataFrame.from_records(records, columns=fields)
In [6]:
# Display the count of person entries that were skipped while loading.
errors
Out[6]:
In [7]:
# Display the number of rows in the assembled DataFrame.
len(df)
Out[7]:
In [8]:
# Persist the assembled DataFrame as a pickle.
# Alternative output path kept from the original notebook (presumably used for
# a partial run -- confirm before relying on it):
#pickle.dump(df, open("/data/csc/fb_persons/10percentpeople.pkl", "wb"))
# The `with` block guarantees the file is flushed and closed once the dump
# completes, instead of relying on garbage collection of an anonymous handle.
with open("/data/csc/fb_persons/100percentpeople.pkl", "wb") as out_fh:
    pickle.dump(df, out_fh)