Exercise from Think Stats, 2nd Edition (thinkstats2.com)
Allen Downey
In [7]:
#!/usr/bin/python
#-*- encoding: utf-8 -*-
import sys
sys.path.append('./code')
import pandas as pd
import nsfg
import chap01ex as ex1
# Build the notebook-level DataFrame with the nsfg helper module imported above;
# the cells and ex1pr* functions below read from this `df`.
df = nsfg.ReadFemPreg()
#print(df.describe())
# Show All Rows
def printf(rows):
    """Print a pandas object without row truncation.

    Args:
        rows: The Series or DataFrame (or a slice of one) to print in full.

    The ``display.max_rows`` override is scoped with ``pd.option_context``,
    so whatever value the caller had configured is restored afterwards --
    even if printing raises -- instead of being reset to the pandas default.
    """
    # None means "no limit", so every row is rendered.
    with pd.option_context('display.max_rows', None):
        print(rows)
# Usage of printf:
#   printf(df.birthord)          # print all rows
#   printf(df.birthord[0:400])   # slices work too
# A bare last expression makes the notebook render the frame as this cell's output.
df
Out[7]:
Print value counts for birthord and compare to results published in the codebook
In [2]:
# sort_index() orders the counts by birth-order value instead of by frequency.
df.birthord.value_counts().sort_index()
Out[2]:
Print value counts for prglngth and compare to results published in the codebook
In [8]:
# Ex1 "Print value counts for prglngth and compare to results published in the codebook"
# Data http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611931
def ex1pr1():
    """Summarise pregnancy length (``prglngth``) from the notebook-level ``df``.

    Prints the complete, index-sorted value counts, the type of the column
    object, the totals for the three week ranges used for the codebook
    comparison, and the overall number of rows.
    """
    weeks = df.prglngth
    printf(weeks.value_counts().sort_index())
    print(type(weeks), "\n")

    # Each entry pairs an output template with the boolean mask selecting its range.
    buckets = [
        ("13 WEEKS OR LESS: {0}", weeks <= 13),
        ("14-26 WEEKS: {0}", (13 < weeks) & (weeks <= 26)),
        ("27 WEEKS OR LONGER: {0}", weeks > 26),
    ]
    for template, mask in buckets:
        print(template.format(weeks[mask].count()))

    print("\nTOTAL: {0}".format(len(weeks)))
ex1pr1()
Print value counts for agepreg and compare to results published in the codebook.
Looking at this data, please remember my comments in the book about the obligation to approach data with consideration for the context and respect for the respondents.
In [11]:
# Ex1 "Print value counts for agepreg and compare to results published in the codebook."
# Data http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611935
def ex1pr2():
    """Summarise age at pregnancy (``agepreg``) from the notebook-level ``df``.

    Prints the complete, index-sorted value counts, the type of the column
    object, and the per-range totals used for the codebook comparison. The
    ranges are deliberately counted with different pandas/Python techniques.
    """
    ages = df.agepreg
    printf(ages.value_counts().sort_index())
    print(type(ages), "\n")

    n_missing = ages.isnull().sum()
    # Boolean-mask selection on the frame, then count() on the column.
    n_under_20 = df[df.agepreg < 20].agepreg.count()
    # With several conditions, parenthesise each one and combine them with
    # the bitwise operators.
    in_20_24 = (20 <= ages) & (ages < 25)
    n_20_24 = len(ages[in_20_24])
    # DataFrame.query can express the same kind of filter.
    n_25_29 = len(df.query("agepreg >= 25 & agepreg < 30"))
    # A Series can also be iterated like an ordinary list.
    n_30_plus = sum(1 for age in ages if age >= 30)

    print("INAPPLICABLE: {0}".format(n_missing))
    print("UNDER 20 YEARS: {0}".format(n_under_20))
    print("20-24 YEARS: {0}".format(n_20_24))
    print("25-29 YEARS: {0}".format(n_25_29))
    print("30-44 YEARS: {0}".format(n_30_plus))
    # len() counts every element; count() leaves out NaN/null entries.
    print("\nTOTAL: {0}".format(len(ages)))
    print("TOTAL - INAPPLICABLE: {0}".format(ages.count()))
ex1pr2()
Compute the mean birthweight.
In [6]:
# Series.mean() skips NaN entries by default, so missing weights do not affect the average.
df.totalwgt_lb.mean()
Out[6]:
Create a new column named totalwgt_kg that contains birth weight in kilograms. Compute its mean. Remember that when you create a new column, you have to use dictionary syntax, not dot notation.
In [17]:
# Ex1 "Create a new column named totalwgt_kg that contains birth weight in kilograms. Compute its mean."
def ex1pr3(frame=None):
    """Add a ``totalwgt_kg`` column and print the mean weight in kg and pounds.

    Args:
        frame: DataFrame with a ``totalwgt_lb`` column. Defaults to the
            notebook-level ``df`` so the original zero-argument call still works.

    The frame is modified in place: ``totalwgt_kg`` is created (or overwritten)
    using dictionary syntax, as the exercise requires.
    """
    if frame is None:
        frame = df  # fall back to the notebook-level pregnancy DataFrame

    # Exact definition of the avoirdupois pound; dividing by 2.2 is off by ~0.2%.
    kg_per_lb = 0.45359237
    frame['totalwgt_kg'] = frame.totalwgt_lb * kg_per_lb

    print(frame[['totalwgt_kg', 'totalwgt_lb']])
    print("Mean(kg): {0}\nMean(pounds): {1}".format(frame.totalwgt_kg.mean(), frame.totalwgt_lb.mean()))
ex1pr3()
Look through the codebook and find a variable, other than the ones mentioned in the book, that you find interesting. Compute value counts, means, or other statistics.
In [18]:
# Ex1 "Look through the codebook and find a variable, other than the ones mentioned in the book, that you find interesting.
# Compute values counts, means, or other statistics."
# Explanation of variables: http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=pregResp
# data(BABYSEX): http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=&subSec=8014&srtLabel=611801
def ex1pr4(df):
    """Summarise the ``babysex`` column of ``df``.

    Prints, in order: ``describe()`` statistics, the column with codes 1/2
    relabelled as "Male"/"Female", the raw counts (total rows, code 1, code 2,
    missing), and the share of codes 1 and 2 among the non-missing values.

    Args:
        df: DataFrame with a ``babysex`` column.
    """
    babysex = df['babysex']
    print(babysex.describe(), "\n")
    print(babysex.replace({1: "Male", 2: "Female"}))

    # Count each group once and reuse the numbers in both reports.
    n_male = int((babysex == 1).sum())
    n_female = int((babysex == 2).sum())
    n_valid = int(babysex.count())  # non-missing entries only

    print("\nTotal: {0}\nMale: {1}\nFemale: {2}\nNaN: {3}\n".format(
        len(babysex), n_male, n_female, babysex.isnull().sum()))

    def percent(part):
        # 100.0 forces float division; an all-missing column reports NaN
        # instead of dividing by zero.
        return part * 100.0 / n_valid if n_valid else float("nan")

    print("Percentage of Male: {0:.3f}%\nPercentage of Female: {1:.3f}%".format(
        percent(n_male), percent(n_female)))
ex1pr4(df)
Create a boolean Series.
In [4]:
# Element-wise comparison: yields a True/False Series with one entry per row of df.
df.outcome == 1
Out[4]:
Use a boolean Series to select the records for the pregnancies that ended in live birth.
In [5]:
# Index df with the boolean Series to keep only the rows where outcome == 1.
live = df[df.outcome == 1]
len(live)
Out[5]:
Count the number of live births with birthwgt_lb between 0 and 5 pounds (including both). The result should be 1125.
In [6]:
# Both bounds are inclusive; each comparison is parenthesised because & binds tighter than >= and <=.
len(live[(live.birthwgt_lb >= 0) & (live.birthwgt_lb <= 5)])
Out[6]:
Count the number of live births with birthwgt_lb between 9 and 95 pounds (including both). The result should be 798
In [19]:
# Ex1 "Count the number of live births with birthwgt_lb between 9 and 95 pounds (including both). The result should be 798"
def ex1pr5(df):
    """Print how many live births have ``birthwgt_lb`` from 9 to 95 pounds, inclusive.

    Args:
        df: DataFrame with ``outcome`` and ``birthwgt_lb`` columns.
    """
    live = df[df.outcome == 1]  # outcome code 1 selects the live births
    # query() resolves birthwgt_lb against the frame's own columns, so no
    # local alias of the column is needed.
    in_range = live.query("birthwgt_lb >= 9 & birthwgt_lb <= 95")
    print("From 9 to 95 pounds: {0}".format(len(in_range)))
ex1pr5(df)
Use birthord to select the records for first babies and others. How many are there of each?
In [7]:
# Rows whose birthord is NaN satisfy neither comparison, so they land in neither group.
firsts = df[df.birthord==1]
others = df[df.birthord>1]
len(firsts), len(others)
Out[7]:
Compute the mean weight for first babies and others.
In [8]:
# Mean of totalwgt_lb over the first-baby rows; NaN weights are skipped by mean().
firsts.totalwgt_lb.mean()
Out[8]:
In [9]:
# Mean of totalwgt_lb over the later-baby rows; NaN weights are skipped by mean().
others.totalwgt_lb.mean()
Out[9]:
Compute the mean prglngth for first babies and others. Compute the difference in means, expressed in hours.
In [20]:
# Ex1 "Compute the mean prglngth for first babies and others. Compute the difference in means, expressed in hours."
def ex1pr6(df):
    """Compare mean pregnancy length of first babies against later babies.

    Prints the mean ``prglngth`` (in weeks) for rows with ``birthord == 1`` and
    for rows with ``birthord > 1``, followed by the gap between the two means
    converted to hours.

    Args:
        df: DataFrame with ``birthord`` and ``prglngth`` columns.
    """
    is_first = df.birthord == 1
    is_later = df.birthord > 1
    mean_first = df.loc[is_first, 'prglngth'].mean()
    mean_later = df.loc[is_later, 'prglngth'].mean()
    # weeks -> days -> hours
    gap_hours = (mean_first - mean_later) * 7 * 24

    report = [
        ("1st babies prglngth(mean): {0:.3} WEEKS", mean_first),
        ("Other babies prglngth(mean): {0:.3} WEEKS", mean_later),
        ("Difference(firsts - others): {0:.3} hours", gap_hours),
    ]
    for template, value in report:
        print(template.format(value))
ex1pr6(df)
In [ ]: