In [1]:
%matplotlib inline
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
births_url = "https://goo.gl/pFAL23"

births = pd.read_table(births_url)

In [4]:
births.head(n=10)


Out[4]:
fAge mAge weeks premature visits gained weight sexBaby smoke
0 31 30 39 full term 13 1 6.88 male smoker
1 34 36 39 full term 5 35 7.69 male nonsmoker
2 36 35 40 full term 12 29 8.88 male nonsmoker
3 41 40 40 full term 13 30 9.00 female nonsmoker
4 42 37 40 full term NaN 10 7.94 male nonsmoker
5 37 28 40 full term 12 35 8.25 male smoker
6 35 35 28 premie 6 29 1.63 female nonsmoker
7 28 21 35 premie 9 15 5.50 female smoker
8 22 20 32 premie 5 40 2.69 male smoker
9 36 25 40 full term 13 34 8.75 female nonsmoker

In [6]:
births.visits.isnull().sum()


Out[6]:
1

In [7]:
births.isnull().sum()


Out[7]:
fAge         31
mAge          0
weeks         0
premature     0
visits        1
gained        2
weight        0
sexBaby       0
smoke         0
dtype: int64

In [10]:
births.smoke.unique()


Out[10]:
array(['smoker', 'nonsmoker'], dtype=object)

In [11]:
nfullterm = births.premature == "full term"
nfullterm.sum()


Out[11]:
129

In [12]:
npremie = births.premature == "premie"
npremie.sum()


Out[12]:
21

In [13]:
births[npremie]


Out[13]:
fAge mAge weeks premature visits gained weight sexBaby smoke
6 35 35 28 premie 6 29 1.63 female nonsmoker
7 28 21 35 premie 9 15 5.50 female smoker
8 22 20 32 premie 5 40 2.69 male smoker
10 27 19 32 premie 5 32 6.50 female nonsmoker
16 33 40 36 premie 13 23 7.81 female nonsmoker
18 28 27 33 premie 6 18 4.75 male smoker
19 25 22 34 premie 10 20 3.75 male nonsmoker
21 NaN 38 32 premie 10 16 2.19 female smoker
31 NaN 21 36 premie 15 10 6.81 male nonsmoker
43 27 22 35 premie 12 30 4.69 male nonsmoker
44 33 25 35 premie 15 18 6.75 male nonsmoker
46 30 25 35 premie 15 40 4.50 male smoker
66 29 31 36 premie 8 42 5.94 male smoker
67 19 20 34 premie 13 6 4.50 male nonsmoker
82 NaN 32 33 premie 10 60 5.06 male nonsmoker
91 NaN 41 33 premie 13 0 5.69 female nonsmoker
125 NaN 18 33 premie 7 40 1.69 male smoker
127 37 33 36 premie 11 15 6.31 male smoker
130 17 17 29 premie 4 10 2.63 female nonsmoker
140 NaN 23 36 premie 2 27 5.88 female nonsmoker
149 38 37 26 premie 5 25 3.63 male nonsmoker

In [20]:
#babyGirls = births.query('sexBaby == "female"')
#babyGirls = births[births.sexBaby == "female"]
isgirl = births.sexBaby == "female"
births[isgirl].shape


Out[20]:
(68, 9)

In [16]:
babyGirls.shape


Out[16]:
(68, 9)

In [21]:
isboy = births.sexBaby == "male"
births[isboy].shape


Out[21]:
(82, 9)

In [22]:
ispremie = births.premature == "premie"

In [23]:
premieGirls = births[isgirl & ispremie]

In [24]:
premieGirls


Out[24]:
fAge mAge weeks premature visits gained weight sexBaby smoke
6 35 35 28 premie 6 29 1.63 female nonsmoker
7 28 21 35 premie 9 15 5.50 female smoker
10 27 19 32 premie 5 32 6.50 female nonsmoker
16 33 40 36 premie 13 23 7.81 female nonsmoker
21 NaN 38 32 premie 10 16 2.19 female smoker
91 NaN 41 33 premie 13 0 5.69 female nonsmoker
130 17 17 29 premie 4 10 2.63 female nonsmoker
140 NaN 23 36 premie 2 27 5.88 female nonsmoker

In [26]:
isbig = births.weight > 9
isbig.sum()


Out[26]:
7

In [32]:
boys = births.sexBaby != "female"

In [30]:
births[nofAge]


  File "<ipython-input-30-3871e93f3198>", line 1
    births[!nofAge]
           ^
SyntaxError: invalid syntax

In [39]:
births.groupby(["smoke","sexBaby"])


Out[39]:
<pandas.core.groupby.DataFrameGroupBy object at 0x7f056cdadeb8>

In [42]:
g = births.groupby('smoke').describe()
g


Out[42]:
fAge gained mAge visits weeks weight
smoke
nonsmoker count 85.000000 99.000000 100.000000 99.000000 100.000000 100.000000
mean 29.811765 32.545455 26.900000 11.858586 38.550000 7.179500
std 6.182722 15.231059 6.342895 3.490541 2.875796 1.434152
min 17.000000 0.000000 15.000000 2.000000 26.000000 1.630000
25% 25.000000 23.000000 22.000000 10.000000 38.000000 6.702500
50% 30.000000 31.000000 25.000000 12.000000 39.000000 7.440000
75% 35.000000 40.000000 32.000000 14.500000 40.000000 8.060000
max 47.000000 85.000000 41.000000 19.000000 44.000000 10.130000
smoker count 34.000000 49.000000 50.000000 50.000000 50.000000 50.000000
mean 29.705882 32.265306 26.000000 10.800000 38.540000 6.779000
std 6.147330 16.646791 5.993193 3.843893 2.500694 1.597415
min 20.000000 0.000000 16.000000 3.000000 32.000000 1.690000
25% 25.000000 22.000000 21.250000 8.000000 38.000000 6.220000
50% 29.000000 30.000000 25.500000 11.500000 39.000000 6.970000
75% 33.000000 40.000000 30.000000 14.000000 40.000000 7.810000
max 46.000000 75.000000 39.000000 17.000000 44.000000 9.130000

In [43]:
## Matplotlib

In [46]:
plt.scatter(births.mAge, births.fAge)
plt.xlabel("Mother's Age")
plt.ylabel("Father's Age")
plt.title("My useful title")


Out[46]:
<matplotlib.text.Text at 0x7f056a451438>

In [47]:
fig = plt.figure()
axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])



In [49]:
fig = plt.figure()
axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])

axes.scatter(births.mAge, births.fAge)
axes.set_xlabel("Age of Mother")
axes.set_ylabel("Age of Father")

pass



In [57]:
fig = plt.figure(figsize=(6,6))
axes = fig.add_axes([0.1, 0.1, 0.5, 0.5])

axes.scatter(births.mAge, births.fAge)
axes.set_xlabel("Age of Mother")
axes.set_ylabel("Age of Father")

rightax = fig.add_axes([0.7, 0.1, 0.25, 0.5])
rightax.hist(births.mAge, normed=True,orientation="horizontal" )


above = fig.add_axes([0.1, 0.7, 0.5, 0.25])
above.hist(births.mAge, normed=True)
above.set_xlim(10,50)

pass



In [ ]: