Age, Weight and Height of PGP Participants


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import sqlite3
import pandas as pd
import re
import numpy as np
from datetime import datetime
from datetime import date

In [2]:
# Use the 'ggplot' style sheet for every figure in this notebook
plt.style.use('ggplot')

# Default figure size (width, height) in inches
plt.rc('figure', figsize=(12, 6))

In [3]:
# Load the `demographics` table from the untap database (using sqlite3), found here:
# https://workbench.su92l.arvadosapi.com/projects/su92l-j7d0g-k6xjjk9g0l5pmqh#Description

conn = sqlite3.connect('Database/untap.db')
try:
    c = conn.cursor()
    c.execute('SELECT * FROM demographics')
    rows = c.fetchall()

    # Converting to a dataframe. The column name is the first field of each
    # entry in cursor.description; a comprehension is used because indexing
    # zip(...) directly fails on Python 3, where zip returns an iterator.
    demographics = pd.DataFrame(rows, columns=[col[0] for col in c.description])
finally:
    # Always release the database handle, even if the query raised
    conn.close()

In [4]:
# Display the raw demographics table (pandas elides the middle rows in the rendered output)
demographics


Out[4]:
id human_id date_of_birth gender weight height blood_type race
0 1 hu43860C 1954-08-28 (61 years old) Male 246lbs (112kg) 6ft 5in (195cm) O+ White
1 2 huC30901 1962-05-23 (53 years old) Male 175lbs (79kg) 6ft 2in (187cm) O- White
2 3 hu9385BA 1955-07-05 (60 years old) Male 155lbs (70kg) A+
3 4 hu04FD18 1954-09-18 (61 years old) Male O+ White
4 5 huAE6220 1949-04-24 (66 years old) Male B+ White
5 6 hu604D39 1958-01-19 (58 years old) Male 245lbs (111kg) 5ft 8in (172cm) AB+ Black or African American
6 7 hu6E4515 1921-07-09 (94 years old) Male 180lbs (82kg) 6ft (182cm) O+ White
7 8 huA90CE6 1958-09-25 (57 years old) Male White
8 9 hu34D5B9 1973-01-20 (43 years old) 170lbs (77kg) 5ft 11in (180cm)
9 10 hu2FEC01 1985-10-06 (30 years old) Male 143lbs (65kg) 5ft 9in (175cm) A+ White
10 11 huAA16BD 1927-11-17 (88 years old)
11 12 hu2DBF2D 1982-12-07 (33 years old) Male 190lbs (86kg) 5ft 10in (177cm) O- White
12 13 hu342A08 1943-09-18 (72 years old) Female 160lbs (73kg) 5ft 9in (175cm) White
13 14 hu4339C0
14 15 hu72A81D 1968-11-01 (47 years old) Female 145lbs (66kg) 5ft 10in (177cm) A+ White
15 16 hu0E64A1 1985-02-20 (30 years old) Male 120lbs (54kg) White
16 17 hu3CAB43 1967-11-30 (48 years old) Male 115lbs (52kg) 5ft 6in (167cm) White
17 18 hu2D6140 1942-01-05 (74 years old) Female 142lbs (64kg) 5ft 6in (167cm) O+ White
18 19 huA0E089 1942-11-22 (73 years old) Female 139lbs (63kg) 5ft 4in (162cm) A+ White
19 20 hu8229AE 1969-09-11 (46 years old) Female 153lbs (69kg) 5ft 2in (157cm) O+ White
20 21 huCA017E 1949-03-19 (66 years old) Male 142lbs (64kg) 5ft 6in (167cm) A+ Asian
21 22 hu92C40A 1944-01-13 (72 years old) Female 132lbs (60kg) 5ft 7in (170cm) American Indian or Alaska Native
22 23 hu38168C 1949-07-08 (66 years old) Female 105lbs (48kg) 5ft 3in (160cm) O+ Asian
23 24 huB1FD55 1958-09-19 (57 years old) Male White
24 25 huD81F3D 1969-04-24 (46 years old) Male 210lbs (95kg) 6ft 3in (190cm) A+ White
25 26 huD37D14 1970-02-20 (45 years old) Female 147lbs (67kg) 5ft 6in (167cm) O+ White
26 27 huBAAC98 1980-07-30 (35 years old) Male 240lbs (109kg) 6ft 2in (187cm) O+ White
27 28 huB4940E
28 29 huEC6EEC 1964-07-10 (51 years old) Female 180lbs (82kg) 5ft 4in (162cm) O+ Hispanic or Latino
29 30 huF5E0B6 1971-12-12 (44 years old) Male 185lbs (84kg) 6ft (182cm) A+ White
... ... ... ... ... ... ... ... ...
1303 1304 hu4A6650 1943-10-26 (72 years old) Male 227lbs (103kg) 5ft 11in (180cm) A+ White
1304 1305 hu594129 1962-02-20 (53 years old) Male 163lbs (74kg) 5ft 11in (180cm) B+ White
1305 1306 huB256A2 1982-04-14 (33 years old) 115lbs (52kg)
1306 1307 hu7D6AB1 1948-07-30 (67 years old) Female 93lbs (42kg) 5ft (152cm) O+ Asian
1307 1308 huFEC65A 1969-11-25 (46 years old) Male White
1308 1309 huF9A043 1957-01-08 (59 years old) 162lbs (73kg)
1309 1310 huF9C3EE 1985-11-20 (30 years old) Male 170lbs (77kg) 5ft 10in (177cm) A+ Asian
1310 1311 huD0127A 1964-05-22 (51 years old) 212lbs (96kg)
1311 1312 hu5C4EBA 1966-10-27 (49 years old) 270lbs (122kg)
1312 1313 huD66092 1948-01-02 (68 years old) 116lbs (53kg) 5ft 4in (162cm)
1313 1314 hu039674 1981-04-18 (34 years old) Male 165lbs (75kg) 5ft 6in (167cm) White
1314 1315 hu941A10 1946-03-01 (69 years old) Female 140lbs (64kg) 5ft 3in (160cm) White
1315 1316 hu1D278B 1976-11-26 (39 years old) Male 190lbs (86kg) 5ft 11in (180cm) O+ Hispanic/Latino
1316 1317 hu045168 1980-06-04 (35 years old) 143lbs (65kg) 5ft 9in (175cm)
1317 1318 hu2561FE 1955-01-08 (61 years old) 5ft 9in (175cm)
1318 1319 huD8A5B6 1956-01-28 (60 years old) 184lbs (83kg)
1319 1320 hu2CA680 1964-12-24 (51 years old) 140lbs (64kg)
1320 1321 hu631D32 1937-10-19 (78 years old) Male 156lbs (71kg) 5ft 7in (170cm) AB+ White
1321 1322 hu934217 1962-10-01 (53 years old) Male 191lbs (87kg) 5ft 6in (167cm) O+ White
1322 1323 hu524B5B 1961-05-26 (54 years old) Female 147lbs (67kg) 5ft 3in (160cm) O+ White
1323 1324 hu78E6A9 1963-05-20 (52 years old) Female 142lbs (64kg) 5ft 5in (165cm) O+ White
1324 1325 huDF8970 1975-07-16 (40 years old)
1325 1326 hu313A20 1982-05-29 (33 years old) Male 165lbs (75kg) 6ft (182cm) O+ White
1326 1327 huFBD57F 1962-03-27 (53 years old) Male 201lbs (91kg) 5ft 9in (175cm) A+ White
1327 1328 huC5203C 1978-12-28 (37 years old) Female 98lbs (44kg) 5ft (152cm) White
1328 1329 huABB4CD 1975-01-08 (41 years old) 174lbs (79kg)
1329 1330 huDD1522 1986-02-06 (29 years old) 153lbs (69kg) 5ft 4in (162cm)
1330 1331 hu25C433 1951-08-17 (64 years old)
1331 1332 hu419539 1974-07-22 (41 years old) 161lbs (73kg) 6ft (182cm)
1332 1333 hu7FC773 108lbs (49kg)

1333 rows × 8 columns


In [5]:
# Doing some basic data munging to clean up birth date data

# Strip leading/trailing whitespace from the raw birth date strings
demographics.date_of_birth = demographics.date_of_birth.str.strip()

# Remove the parenthesised suffix (e.g. " (61 years old)") so only the date remains.
# The vectorised .str accessor passes missing values through instead of raising.
birthdate = demographics.date_of_birth.str.replace(r'\s\([^)]*\)', '', regex=True)

# Replace empties with NaNs and convert to datetimes
demographics['only_birth'] = pd.to_datetime(
    birthdate.replace(r'\s+|^$', np.nan, regex=True),
    format='%Y-%m-%d',
)

# Calculate current age (in years) based on birth date.
# date.today() is wrapped in a Timestamp because subtracting a datetime64
# Series from a plain datetime.date is not reliably supported by pandas.
ageSec = pd.Timestamp(date.today()) - demographics.only_birth

# Despite its name, ageSec is a timedelta Series; convert whole days to years
# using the mean calendar year length (365.25 days) so leap days do not
# inflate the ages. Rows without a birth date stay NaN.
demographics['only_age'] = ageSec.dt.days / 365.25

In [6]:
# Plotting a normalised histogram of ages

# Boolean mask selecting the rows that have a known age
idx = demographics.only_age.notnull()

# Create the histogram (excluding null values).
# `density=True` replaces the `normed` keyword, which newer Matplotlib
# releases no longer accept.
plt.hist(demographics.only_age[idx], 25, density=True)

plt.title("Histogram of PGP Ages")
plt.xlabel("age in yrs")
plt.ylabel("frequency")

# Saved under its own file name: the stacked male/female histogram in the
# next cell writes to 'Images/histogramAges.png' and would otherwise
# overwrite this figure.
plt.savefig('Images/histogramAgesDensity.png', format='png', dpi=300)



In [7]:
# Stacked age histogram, split into male and female participants

# Restrict each gender mask to the rows that have a known age (`idx`)
idxM = idx & (demographics.gender == 'Male')
idxF = idx & (demographics.gender == 'Female')

ages_by_gender = [
    demographics.loc[idxM, 'only_age'],
    demographics.loc[idxF, 'only_age'],
]
plt.hist(ages_by_gender, bins=25, stacked=True)

# Labels and legend (legend order follows the order of the series above)
plt.legend(['Male','Female'])
plt.ylabel("number of participants")
plt.xlabel("age in yrs")
plt.title("Histogram of PGP Ages")

plt.savefig('Images/histogramAges.png', dpi=300, format='png')



In [8]:
# Data munging for weight data

# Strip off any spaces/blanks at beginning or end
demographics.weight = demographics.weight.str.strip()

# Extract weight (in kg) from strings such as "246lbs (112kg)".
# A regex capture is used instead of slicing between "(" and "k": entries
# that are blank, missing, or do not follow the pattern become NaN rather
# than an arbitrary substring that would break the float conversion.
only_weight = demographics.weight.str.extract(r'\((\d+(?:\.\d+)?)\s*kg\)', expand=False)

# Convert from strings to floats (unmatched entries remain NaN)
demographics['only_weight'] = only_weight.astype(float)

In [9]:
# Side-by-side weight histogram for male and female participants

# Keep only rows with a parsed weight, then split by gender
idx = demographics.only_weight.notnull()
idxM = idx & (demographics.gender == 'Male')
idxF = idx & (demographics.gender == 'Female')

weights_by_gender = [
    demographics.loc[idxM, 'only_weight'],
    demographics.loc[idxF, 'only_weight'],
]
plt.hist(weights_by_gender, bins=25)

# Labels and legend (legend order follows the order of the series above)
plt.legend(['Male','Female'])
plt.ylabel("number of participants")
plt.xlabel("weight (kg)")
plt.title("Histogram of PGP Weights")

plt.savefig('Images/histogramWeights.png', dpi=300, format='png')



In [10]:
# Data munging for height data

# Strip off any spaces/blanks at beginning or end
demographics.height = demographics.height.str.strip()

# Extract height (in cm) from strings such as "6ft 5in (195cm)".
# This must read the `height` column and capture the value before "cm";
# parsing the `weight` column here would silently store kilograms in
# `only_height`. Blank, missing, or non-matching entries become NaN.
only_height = demographics.height.str.extract(r'\((\d+(?:\.\d+)?)\s*cm\)', expand=False)

# Convert from strings to floats (unmatched entries remain NaN)
demographics['only_height'] = only_height.astype(float)

In [11]:
# Side-by-side height histogram for male and female participants

# Keep only rows with a parsed height, then split by gender
idx = demographics.only_height.notnull()
idxM = idx & (demographics.gender == 'Male')
idxF = idx & (demographics.gender == 'Female')

heights_by_gender = [
    demographics.loc[idxM, 'only_height'],
    demographics.loc[idxF, 'only_height'],
]
plt.hist(heights_by_gender, bins=25)

# Labels and legend (legend order follows the order of the series above)
plt.legend(['Male','Female'])
plt.ylabel("number of participants")
plt.xlabel("height (cm)")
plt.title("Histogram of PGP heights")

plt.savefig('Images/histogramHeights.png', dpi=300, format='png')