In [1]:
from os.path import join, basename, dirname
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sys import path
from os import getcwd

# Sort out paths so that this can use functions from the main codebase
path.insert(0, dirname(getcwd()))

import environment
from scan3 import settings

In [2]:
# Load up some data to work with
data_fname = join(settings.DATA_IN_ROOT, "data_staging", "all_by_baby_enriched_v3.csv")
df = pd.read_csv(data_fname)
print("Loaded {} rows of data".format(len(df)))


Loaded 63788 rows of data
/Users/lukelatimer/.conda/envs/scan3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3020: DtypeWarning: Columns (50) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [11]:
# Test that we can generate some reports, though this should be an actual test, this is really just so that I
# can easily run some code to generate tables to send to the guys

from scan3.server.data_import import binary_norm

df_n = binary_norm.apply_binary_norm(df)
report = binary_norm.generate_report(df_n)

for k, sub_report in dict(report).items():
    print(k)
    for k2, v in sub_report.items():
        print("\t{}".format(v))


dem_alcohol_norm
	nan 46% 29602
	0.0 51% 32831
	1.0 2% 1355
dem_cigarettes_norm
	nan 22% 14010
	0.0 74% 47210
	1.0 4% 2568

Overview

Define some functions that tidy up and normalise the values for each binary field.

Generate some lookup tables that can be incorporated in the main pipeline


In [6]:
BINARY_FIELD_MAPS = dict(
    dem_alcohol={
        "n/k": None,
        "no alcohol": 0,
        "alcohol": 1
    },
    dem_cigarettes={
        "no": 0,
        "smoker": 1
    }
)

def binary_norm(fname, fval):
    mapper = BINARY_FIELD_MAPS[fname]
    if isinstance(fval, float) and np.isnan(fval):
        return None
    try:
        return mapper[fval.lower()]
    except KeyError:
        raise KeyError("No {} mapping for {}".format(fname, fval))

def get_binary_counts(df, fname):
    vals = ("nan", 0., 1.)
    counts = []
    pcts = []
    for val in vals:
        if val == "nan":
            counts.append(len(df[df[fname].map(np.isnan) == True]))
        else:
            counts.append(len(df[(df[fname] == val) == True]))
        pcts.append(counts[-1] / float(len(df)))
    return zip(vals, pcts, counts)
    
cat_field_test = ["dem_alcohol", "dem_cigarettes"]

for fname in cat_field_test:
    normed_name = "{}_norm".format(fname)
    df[normed_name] = df[fname].map(lambda x: binary_norm(fname, x))
    
    print(normed_name)
    
    for v in get_binary_counts(df, normed_name):
        print("\t{} {:.0%} {:.0f}".format(*v))


dem_alcohol_norm
	nan 46% 29602
	0.0 51% 32831
	1.0 2% 1355
dem_cigarettes_norm
	nan 22% 14010
	0.0 74% 47210
	1.0 4% 2568

In [7]:
print(set(df["dem_cigarettes"]))


{nan, 'No', 'Smoker'}