In [30]:
from os.path import join, dirname
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import json
from sys import path
from os import getcwd
# Sort out paths so that this can use functions from the main codebase
path.insert(0, dirname(getcwd()))
import environment
from scan3 import settings
In [35]:
# Load up some data to work with
# Source: the v3 "all babies" enriched extract from the staging area.
# NOTE(review): presumably regenerated by the main pipeline — confirm this is the current version.
data_fname = join(settings.DATA_IN_ROOT, "data_staging", "all_by_baby_enriched_v3.csv")
df = pd.read_csv(data_fname)
print("Loaded {} rows of data".format(len(df)))
Define some functions that tidy up and normalise the choices for each categorical field.
Generate some lookup tables that can be incorporated in the main pipeline.
From the initial discussion with Basky the ethnicity fields need some additional attention:
We have four target categories, which are simple enough to derive from the group2 values, so we first need to look at whether that field is always filled out or whether we sometimes have to use the first field.
A quick glance suggests that the first field is more specific, but is filled out less than the second (may be due to different centers).
In [39]:
# This helps us to figure out where we need to look at both fields to help figure out the ethnic group
# Cleaned-up spellings that all mean "no value recorded" for an ethnicity field.
MISSING_ETHNIC_VALUES = ("missing",
                         "not specified", "not stated", "not given",
                         "patient unwilling to disclose")
# Cleaned-up spellings that all mean an unspecific / mixed group.
OTHER_ETHNIC_VALUES = ("other", "other mixed", "mixed", "mixed other", "other mixed race",
                       "any other group", "other ethnic group", "mixed ethnic group")
def make_mapper(target_name, synonyms):
    """
    Build a cleaning function for one canonical category.

    Returns a function (fname, val) -> str that lower-cases and de-noises the
    raw value, then collapses anything in `synonyms` (or a NaN cell) down to
    `target_name`; any other value is returned in its cleaned form. `fname`
    is accepted for interface compatibility with the column mappers but is
    not used.
    """
    def scan_synonyms(fname, val):
        # Missing cells arrive from pandas as float NaN — map straight to the target.
        if isinstance(val, float) and np.isnan(val):
            return target_name
        # Turn punctuation into spaces, expand the "wb" shorthand, strip filler
        # words, then collapse runs of spaces left behind by the removals.
        # NOTE(review): the filler regex strips "and" anywhere in the string
        # (e.g. "ireland" -> "irel ") — confirm this is intended.
        cleaned = re.sub("[_/()-]", " ", val.strip().lower())
        cleaned = re.sub("wb", "white british", cleaned)
        cleaned = re.sub("and|backgrou|back gro|back ground| und| unspecif", "", cleaned)
        cleaned = re.sub(" {2,10}", " ", cleaned)
        return target_name if cleaned in synonyms else cleaned
    return scan_synonyms
# The two raw ethnicity columns this cleanup applies to.
ETHNIC_FIELDS = ["dem_ethnic_group", "dem_ethnic_group2"]

def tidy_missing_other(df):
    """
    Collapse the many raw spellings of "missing" and "other" in the ethnicity
    columns down to single canonical tokens so the data is easier to process
    and analyse. Mutates `df` in place and returns it.
    """
    mappers = [
        make_mapper("missing", MISSING_ETHNIC_VALUES),
        make_mapper("other", OTHER_ETHNIC_VALUES),
    ]
    for fname in ETHNIC_FIELDS:
        # Apply "missing" first, then "other", matching the original pass order.
        for mapper in mappers:
            df[fname] = df[fname].map(lambda x, f=fname, m=mapper: m(f, x))
    return df
def generate_report(df, fields=None):
    """
    Build a per-column frequency report for categorical fields.

    df: frame holding the (already tidied) categorical columns.
    fields: iterable of column names to report on; defaults to ETHNIC_FIELDS
        so existing single-argument callers behave exactly as before.
    Returns {column name -> DataFrame indexed by distinct value, with "count"
    and "pct" columns, sorted by count descending}.
    """
    if fields is None:
        fields = ETHNIC_FIELDS
    cdfs = {}
    n_rows = float(len(df))
    for fname in fields:
        vals = list(set(df[fname]))
        # Count matching rows directly; the original `len(df[...] == True)`
        # wrapped a no-op elementwise comparison around the same row count.
        counts = [int((df[fname] == val).sum()) for val in vals]
        count_df = pd.DataFrame({"count": counts, "pct": np.array(counts) / n_rows},
                                index=vals)
        count_df = count_df.sort_values(by="count", ascending=False)
        cdfs[fname] = count_df
    return cdfs
# Apply the missing/other normalisation, then list the distinct values that remain.
df = tidy_missing_other(df)
report = generate_report(df)
# This shows the distinct values for this field, so can explicitly map typos etc, and see if the missing/other
# synonyms need updating
print("\n".join(sorted(report["dem_ethnic_group"].index)))
#print(df[df.dem_ethnic_group2 == "other"]["dem_ethnic_group"].head())
In [21]:
#print(df[df.dem_ethnic_group2 == "other"][["dem_ethnic_group", "dem_ethnic_group2"]])
In [12]:
# NOTE(review): mid-notebook imports (and `json` duplicates the top cell) —
# fine for exploration, but should move to the first cell for a clean re-run.
from collections import OrderedDict
import json
# Check whether we have anything in field1 when field2 is missing
g2_missing_g1 = set(df["dem_ethnic_group"][(df["dem_ethnic_group2"] == "missing") & (df["dem_ethnic_group"] != "missing")])
# TODO Need to do the same thing for other
# Yes, so we need to map these too
# print "\n".join(sorted(g2_missing_g1))   (py2-style debug line, kept for reference)
# So the values we need to map are the g2 values, plus these ones.
# Actually, the key should probably be both columns?
values_to_map = set(df.dem_ethnic_group2).union(g2_missing_g1)
keys = sorted(values_to_map)
# Emit a skeleton lookup (value -> "") to be filled in by hand and saved as JSON.
dummy_lookup = OrderedDict((k, "") for k in keys)
print(json.dumps(dummy_lookup, indent=4))
# print "\n".join(map(lambda x: "\"{}\": ".format(x), sorted(values_to_map)))
In [14]:
# Hand-built lookup from cleaned raw ethnicity strings to the four target
# analysis groups (caucasian / asian / afro / other), plus the "missing"
# passthrough. Values absent from this map fall through ethnic_mapper unchanged.
ETHNIC_GROUP_MAP = {
    "arab": "caucasian",
    "asian": "asian",
    "black": "afro",
    "black - african": "afro",
    "black - caribbean": "afro",
    "black - other": "afro",
    "black british": "afro",
    "black-east asian": "other",
    "black-south asian": "other",
    "british": "caucasian",
    "caribbean": "afro",
    "caucasian": "caucasian",
    "east asian": "asian",
    "east asian (oriental)": "asian",
    "missing": "missing",
    "other": "other",
    "other white unspecif": "caucasian",
    "pakistani/british pa": "asian",
    "polish": "caucasian",
    "south asian": "asian",
    "south asian-east asian": "asian",
    "white": "caucasian",
    "white-black": "other",
    "white-east asian": "other",
    "white-south asian": "other"
}
# Free-text rationale for the less obvious mapping choices above.
ETHNIC_GROUP_MAP_COMMENTS = {
    "british": "Assume caucasian as the person would likely have been more specific otherwise"
}
def ethnic_mapper(val):
    """Translate one cleaned ethnicity string to its target group; values not
    present in ETHNIC_GROUP_MAP pass through unchanged.

    NOTE(review): a later cell re-defines ethnic_mapper with "group | group2"
    key semantics; on a full re-run that definition shadows this one.
    """
    if val in ETHNIC_GROUP_MAP:
        return ETHNIC_GROUP_MAP[val]
    return val
# To map these, we try all the g2 fields, then deal with missing and other afterwards
df["dem_ethnic_group_norm"] = df.dem_ethnic_group2.map(ethnic_mapper)
# Where group2 carried no information ("missing"/"other"), fall back to the
# first, more specific field and map that instead.
for val in ("missing", "other", ):
    subset = (df.dem_ethnic_group2 == val)
    df.loc[subset, "dem_ethnic_group_norm"] = df.dem_ethnic_group.loc[subset].map(ethnic_mapper)
# Need to run some checks
def print_checks():
    """
    Eyeball check for rows normalised to "missing"/"other": prints the raw
    values seen in each source column. Reads the notebook-level `df`.
    """
    for target in ("missing", "other"):
        rows = df[df.dem_ethnic_group_norm == target]
        for column in ("dem_ethnic_group2", "dem_ethnic_group"):
            print("normed is {}".format(target))
            print("{} is".format(column), sorted(set(rows[column])))
            print()
# print df[(df.dem_ethnic_group_norm == "other") & (df.dem_ethnic_group== "nigerian")][["dem_ethnic_group", "dem_ethnic_group2", "dem_ethnic_group_norm"]]
# More specific tests, to make sure my mapping makes sense, need to encapsulate these in some actual tests for
# when we get new data or change mappings
# groups = sorted(set(df[df.dem_ethnic_group_normed == "other"].dem_ethnic_group))
print("Normed is other")
# For every raw group value that normalised to "other", list which group2
# values it co-occurred with — a sanity check on the fallback mapping.
for k, g in df[df.dem_ethnic_group_norm == "other"].groupby("dem_ethnic_group"):
    print("group is {}".format(k))
    print("group2 is ")
    print("\n".join(sorted(set(g.dem_ethnic_group2))))
    print()
In [42]:
# Distinct "group | group2" raw pairs — the candidate keys for a hand-built
# two-column lookup (ethnicity_map.json).
# NOTE(review): assumes both columns are strings already (post-tidy); a raw
# NaN would propagate NaN through the concatenation — confirm tidy ran first.
set(df.dem_ethnic_group + " | " + df.dem_ethnic_group2)
Out[42]:
In [54]:
df["dem_ethnic_key"] = df.dem_ethnic_group + " | " + df.dem_ethnic_group2
map_file = join(settings.DATA_IN_ROOT, "ethnicity_map.json")
with open(map_file, "rb") as f:
ethnic_map = json.load(f)
def ethnic_mapper(k):
return ethnic_map.get(k, "unknown")
df["dem_ethnic_norm"] = df.dem_ethnic_key.map(ethnic_mapper)
# Share of rows per normalised ethnicity group, formatted for quick reading.
# NOTE(review): re-uses the name `report` from the earlier DataFrame report.
total = len(df)
report = {
    group: "{:,}, {:.0%}".format(len(group_df), float(len(group_df)) / total)
    for group, group_df in df.groupby("dem_ethnic_norm")
}
print(report)