In [1]:
from os.path import join
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data_fname = r"../data_staging/all_by_baby_enriched_v3.csv"
df = pd.read_csv(data_fname)
First, have a look at the distribution of gestational ages, as this might determine which buckets we use.
In [2]:
all = pd.concat([df.t1_ga_weeks, df.t2_ga_weeks, df.t3_ga_weeks])
all.dropna(inplace=True)
print all.describe()
print "Less than 0: {}".format(len(all[all < 0]))
all = all[all > 0]
all.hist().plot()
plt.show()
In [3]:
# Look at the trimester scans one by one
t1 = df[df.t1_ga_weeks > 0]
t2 = df[(df.t2_ga_weeks > 0) & (df.t2_ga_weeks < 30)]
t3 = df[(df.t3_ga_weeks >= 30)]
print "T1\n", t1.t1_ga_weeks.describe()
print "T2\n", t2.t2_ga_weeks.describe()
print "T3\n", t3.t3_ga_weeks.describe()
t1.t1_ga_weeks.hist(color="blue", label="scan1", bins=20).plot()
t2.t2_ga_weeks.hist(color="green", label="scan2", bins=20).plot()
t3.t3_ga_weeks.hist(color="purple", label="scan3", bins=20).plot()
plt.title("Trimester scan count by week")
plt.legend()
plt.show()
In [4]:
t2 = df[(df.t2_ga_weeks > 0) & (df.t2_ga_weeks < 30)]
print t2.t2_ga_weeks.describe()
t2.t2_ga_weeks.hist().plot()
plt.show()
In [5]:
# Check whether the records we're dropping are actually ok and that we haven't made a mistake somewhere
# print df[df.t2_ga_weeks > 30].iloc[0]
In [6]:
t3 = df[(df.t3_ga_weeks >= 30)]
print t3.t3_ga_weeks.describe()
t3_dist = np.histogram(t3.t3_ga_weeks)
print t3_dist
t3.t3_ga_weeks.hist().plot()
plt.show()
Need to be sure what readings can be ignored and which ones are generally in an expected range. Do some analysis and ask Basky for advice.
It actually looks reasonable if we use a log scale instead, then there aren't so many outliers.
In [7]:
t1.loc[:, "t1_pappa_log"] = np.log(t1.t1_pappa)
print t1.t1_pappa.describe()
print t1.t1_pappa_log.describe()
t1_pappa_threshold = 99.95
pappatv = np.percentile(t1.t1_pappa.dropna().sort_values(), t1_pappa_threshold)
t1.t1_pappa_log.hist(bins=50).plot()
plt.title("Histogram of log(Trimester1 pappa)")
plt.show()
plt.scatter(t1.t1_ga_weeks, t1.t1_pappa_log, alpha=0.5, label="log(t1_pappa)")
plt.plot([t1.t1_ga_weeks.min(), t1.t1_ga_weeks.max()], np.log([pappatv, pappatv]),
linestyle="--", label="{}% < {}".format(t1_pappa_threshold, pappatv))
plt.title("log(Trimester1 pappa) by GA")
plt.legend(bbox_to_anchor=(1.6, 1.))
plt.show()
Now look at some basic plots of hormone readings, to see if there are simple looking relationships before we express as multiples of the mean
In [9]:
hfields = ["pappa", "b_hcg"]
print len(t1)
t11 = t1[(t1.t1_pappa.map(np.isnan) == False) & (t1.t1_pappa > 0) & (t1.t1_pappa <= 10)]
t12 = t1[t1.t1_pappa > 10]
t12_hist = np.histogram(t12.t1_pappa)
print "Hist of pappa>10"
print t12_hist
print pd.DataFrame(t12_hist[0], index=t12_hist[1][0:-1])
print len(t11)
print t11.t1_pappa.describe()
t11.t1_pappa.hist(bins=50, color="blue").plot()
plt.title("Trimester 1 pappa [0, 10]")
plt.show()
In [10]:
plt.scatter(t11.t1_ga_weeks, t11.t1_pappa, alpha=0.5)
plt.title("Trimester 1 GA weeks by Pappa [0, 10]")
plt.show()
I think the right way to interpret this is that each vertical slice (which is probably a single day) is a distribution, and so (assuming enough data in that slice), we determine the mean of that, then express every other reading in that slice as a multiple of that mean.
That means that regardless of the actual ga, we now have a pappa MoM which can all be compared, as they are expressed relative to the "normal" reading at that ga.
We do need to think about the number of obs in each slice, and widen if there aren't enough, for example under 11.5 weeks and over 14.
This is where we could also throw in some colours to see whether age and race etc have any effect (which is how the published means are calculated).
It might be interesting to view this as a 3d surface, or a heatmap with the colour denoting density. Just for interst really, to see how the distribution changes with ga.
In [11]:
data_rows = []
# Bucket the gestational ages
ga_buckets = np.histogram(t11.t1_ga_weeks, bins=50)[1]
# Use the same buckets for all ga bins, so bucket the total pappa
h_buckets = np.histogram(t11.t1_pappa, bins=50)[1]
# For each ga bucket, get the associated pappa readings
for l, r in zip(ga_buckets[0:-1], ga_buckets[1:]):
ga_bucket = t11[(t11.t1_ga_weeks >= l) & (t11.t1_ga_weeks < r)]
if len(ga_bucket) == 0:
#TODO Want to interpolate ideally, but just use the last row for now
data_rows.append(data_rows[-1])
else:
data_rows.append(ga_bucket.t1_pappa)
data_rows.append(t11[(t11.t1_ga_weeks >= r)].t1_pappa)
# Now calculate histograms of the pappa by ga bucket, so that we have a density for each slice
rows = []
for drow in data_rows:
rows.append(np.histogram(drow, bins=h_buckets)[0])
# Calc extents for the axes
ga_extents = [t11.t1_ga_weeks.min(), t11.t1_ga_weeks.max()]
pappa_extents = [t11.t1_pappa.min(), t11.t1_pappa.max()]
# Plot the whole lot, sort of a 3d heatmap?
# Transpose the data so we have rows as pappa levels and columns as GA
rows = np.transpose(rows)
fig, ax = plt.subplots(figsize=(5, 6))
im = ax.imshow(rows, extent=ga_extents + pappa_extents, cmap="hot", origin="lower", interpolation="bilinear")
plt.colorbar(im, orientation='vertical')
plt.title("Trimester 1 GA by Pappa [0, 10] by density")
plt.show()
In [12]:
week = 12
d1 = 4
s = t11[(t11.t1_ga_weeks >= (week + d1 / 7.)) & (t11.t1_ga_weeks < (week + (d1 + 1) / 7.))]
print s.t1_pappa.describe()
s.t1_pappa.hist(bins=50).plot()
plt.title("Week {}, day {} Pappa [0, 10] distribution".format(week, d1))
plt.show()
The main observation here is that the mean is not representative of where most of the data lies, because of the skewed nature of the distribution.
Does it matter? I think so, because a lot of readings will show that they're not near the usual level, which isn't actually the case.
In [13]:
log_s = np.log(s.t1_pappa)
print log_s.describe()
print np.exp(log_s.mean())
# log_hist = np.histogram(log_s, bins=50)
log_s.hist(bins=50).plot()
plt.title("Week {}, day {} log(Pappa [0, 10]) distribution".format(week, d1))
plt.show()
This is showing that if we look at the log of pappa instead, we get something that looks a lot more guassian, and in fact the mean of this distribution translates back into something that visually looks a lot closer to the middle of the original distribution.
In [14]:
plt.scatter(s.dem_mat_age, log_s, alpha=0.5)
plt.title("Maternal age by log(pappa)")
plt.show()
Doesn't look like it.
For each of the hormone levels, need to generate a function that normalises a value for a ga, for which we'll need to: