In [1]:
from os.path import join, basename, dirname
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sys import path
from os import getcwd
# Sort out paths so that this can use functions from the main codebase
path.insert(0, dirname(getcwd()))
import environment
from scan3 import settings
In [2]:
# Load up some data to work with
data_fname = join(settings.DATA_IN_ROOT, "data_staging", "all_by_baby_enriched_v3.csv")
df = pd.read_csv(data_fname)
print("Loaded {} rows of data".format(len(df)))
In [10]:
# Check that we can generate some reports. This should really be a proper test;
# for now it is just a convenient way to produce tables to send to the team.
from scan3.server.data_import import binary_norm
df_n = binary_norm.apply_binary_norm(df)
report = binary_norm.generate_report(df_n)
for k, sub_report in dict(report).items():
    print(k)
    for v in sub_report.values():
        print("\t{}".format(v))
In [6]:
BINARY_FIELD_MAPS = dict(
    dem_alcohol={
        "n/k": None,
        "no alcohol": 0,
        "alcohol": 1
    },
    dem_cigarettes={
        "no": 0,
        "smoker": 1
    }
)

def binary_norm(fname, fval):
    """Map a raw field value to 0/1 (or None for missing) using BINARY_FIELD_MAPS."""
    mapper = BINARY_FIELD_MAPS[fname]
    if isinstance(fval, float) and np.isnan(fval):
        return None
    try:
        return mapper[fval.lower()]
    except KeyError:
        raise KeyError("No {} mapping for {}".format(fname, fval))

def get_binary_counts(df, fname):
    """Return (value, fraction, count) triples for NaN, 0 and 1 in a normalised column."""
    vals = ("nan", 0., 1.)
    counts = []
    pcts = []
    for val in vals:
        if val == "nan":
            counts.append(df[fname].isnull().sum())
        else:
            counts.append((df[fname] == val).sum())
        pcts.append(counts[-1] / float(len(df)))
    return zip(vals, pcts, counts)

cat_field_test = ["dem_alcohol", "dem_cigarettes"]
for fname in cat_field_test:
    normed_name = "{}_norm".format(fname)
    df[normed_name] = df[fname].map(lambda x: binary_norm(fname, x))
    print(normed_name)
    for v in get_binary_counts(df, normed_name):
        print("\t{} {:.0%} {:.0f}".format(*v))
In [7]:
print(set(df["dem_cigarettes"]))