ThinkStat2 Chapter1 Exerciseのサンプルコード 実行例

モジュール類のインポート


In [17]:
#!/usr/bin/python
#-*- encoding: utf-8 -*-
"""
Sample Codes for ThinkStats2 - Chapter1

Copyright 2015 @myuuuuun
URL: https://github.com/myuuuuun/ThinkStats2-Notebook
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

import sys
sys.path.append('./code')
import pandas as pd
import nsfg
import custom_functions as cf

Exercise1 Problem1

Print value counts for prglngth and compare to results published in the codebook.
DataBook: http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611931


In [18]:
def ex1pr1():
    # 2002年のPrefnancyに関するNSFGのデータフレームを読み込む
    df = nsfg.ReadFemPreg()

    # データフレームから妊娠期間の列を代入
    prglngth = df.prglngth

    # 妊娠週ごとに集計して表示
    print(prglngth.value_counts().sort_index())

    # 上記DataBookの表記に合うように集計
    print("\n+-----------------------------+")
    print("13 WEEKS OR LESS: {0}"  .format(prglngth[prglngth <= 13].count()))
    print("14-26 WEEKS: {0}"         .format(prglngth[(13 < prglngth) & (prglngth <= 26)].count()))
    print("27 WEEKS OR LONGER: {0}"  .format(prglngth[prglngth > 26].count()))
    print("\nTOTAL: {0}"             .format(len(prglngth)))
    print("+-----------------------------+")


ex1pr1()


0       15
1        9
2       78
3      151
4      412
5      181
6      543
7      175
8      409
9      594
10     137
11     202
12     170
13     446
14      29
15      39
16      44
17     253
18      17
19      34
20      18
21      37
22     147
23      12
24      31
25      15
26     117
27       8
28      38
29      23
30     198
31      29
32     122
33      50
34      60
35     357
36     329
37     457
38     609
39    4744
40    1120
41     591
42     328
43     148
44      46
45      10
46       1
47       1
48       7
50       2
dtype: int64

+-----------------------------+
13 WEEKS OR LESS: 3522
14-26 WEEKS: 793
27 WEEKS OR LONGER: 9278

TOTAL: 13593
+-----------------------------+

Exercise1 Problem2

Print value counts for agepreg and compare to results published in the codebook.
DataBook: http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611935


In [19]:
def ex1pr2():
    # 2002年のPrefnancyに関するNSFGのデータフレームを読み込む
    df = nsfg.ReadFemPreg()

    # 妊娠した年齢の列を代入
    agepreg = df.agepreg

    # 妊娠した年齢を集計して表示
    print(agepreg.value_counts().sort_index())

    # 上記DataBookの表記に合うように集計
    print("\n+-----------------------------+")
    print("INAPPLICABLE: {0}"  .format(agepreg.isnull().sum()))
    print("UNDER 20 YEARS: {0}".format(df[df.agepreg < 20].agepreg.count()))
    print("20-24 YEARS: {0}"   .format(len(agepreg[(20 <= agepreg) & (agepreg < 25)])))  #条件が複数ある時は()でくくる。演算子はビット演算子を使う。
    print("25-29 YEARS: {0}"   .format(len(df.query("agepreg >= 25 & agepreg < 30"))))   #queryも利用できる
    print("30-44 YEARS: {0}"   .format(len([x for x in agepreg if x >= 30])))            #普通のリストと同様にリスト内包表記も使える

    print("\nTOTAL: {0}"  .format(len(agepreg)))  #len()は全要素数を、count()はNaNやNullを除いた要素数を返す
    print("TOTAL - INAPPLICABLE: {0}"  .format(agepreg.count()))
    print("+-----------------------------+")


ex1pr2()


10.33    1
10.50    1
10.83    1
10.91    1
11.08    1
11.41    1
11.50    1
11.75    2
12.08    1
12.25    1
12.41    2
12.50    1
12.66    2
12.75    1
12.83    2
...
41.91    4
42.08    2
42.16    1
42.25    1
42.33    1
42.41    1
42.50    2
42.58    2
42.75    2
43.00    3
43.16    1
43.25    4
43.58    3
43.91    1
44.08    1
Length: 373, dtype: int64

+-----------------------------+
INAPPLICABLE: 352
UNDER 20 YEARS: 3182
20-24 YEARS: 4246
25-29 YEARS: 3178
30-44 YEARS: 2635

TOTAL: 13593
TOTAL - INAPPLICABLE: 13241
+-----------------------------+

Exercise1 Problem3

Create a new column named totalwgt_kg that contains birth weight in kilograms. Compute its mean.


In [20]:
def ex1pr3():
    df = nsfg.ReadFemPreg()

    # 妊婦 +(お腹の)赤ちゃんの重さはポンドで記録されているので、データフレームに新しい列を追加し、そこにkgの重さを代入
    # pandas.DataFrameに新しい列をつくる時は dataframe['hoge'] のようにする。(dataframe.hoge = ... はダメ)
    df['totalwgt_kg'] = df.totalwgt_lb / 2.2  #1ポンド = 0.45kg

    # ポンドの列と、新しく作ったkgの列を比較(ちゃんと2.2倍くらいになっているか?)
    print(df[['totalwgt_kg', 'totalwgt_lb']])

    # ポンドの列の平均と、kgの列の平均を比較
    print("Mean(kg): {0}\nMean(pounds): {1}".format(df.totalwgt_kg.mean(), df.totalwgt_lb.mean()))


ex1pr3()


       totalwgt_kg  totalwgt_lb
0         4.005682       8.8125
1         3.579545       7.8750
2         4.147727       9.1250
3         3.181818       7.0000
4         2.812500       6.1875
5         3.892045       8.5625
6         4.346591       9.5625
7         3.806818       8.3750
8         3.437500       7.5625
9         3.011364       6.6250
10        3.551136       7.8125
11        3.181818       7.0000
12        1.818182       4.0000
13             NaN          NaN
14             NaN          NaN
15        3.494318       7.6875
16        3.409091       7.5000
17        2.869318       6.3125
18             NaN          NaN
19        3.977273       8.7500
20        3.721591       8.1875
21        2.528409       5.5625
22             NaN          NaN
23        3.068182       6.7500
24        3.352273       7.3750
25        3.096591       6.8125
26        3.693182       8.1250
27        3.238636       7.1250
28        2.755682       6.0625
29        3.380682       7.4375
...            ...          ...
13563     3.494318       7.6875
13564     3.465909       7.6250
13565     3.693182       8.1250
13566     3.409091       7.5000
13567          NaN          NaN
13568          NaN          NaN
13569     2.642045       5.8125
13570     3.039773       6.6875
13571     2.727273       6.0000
13572     2.642045       5.8125
13573     2.982955       6.5625
13574     2.784091       6.1250
13575          NaN          NaN
13576     2.926136       6.4375
13577          NaN          NaN
13578     2.727273       6.0000
13579     3.181818       7.0000
13580          NaN          NaN
13581     2.897727       6.3750
13582          NaN          NaN
13583          NaN          NaN
13584     2.897727       6.3750
13585          NaN          NaN
13586          NaN          NaN
13587          NaN          NaN
13588     2.812500       6.1875
13589          NaN          NaN
13590          NaN          NaN
13591     3.409091       7.5000
13592     3.409091       7.5000

[13593 rows x 2 columns]
Mean(kg): 3.30255838983
Mean(pounds): 7.26562845762

Exercise1 Problem4

Look through the codebook and find a variable, other than the ones mentioned in the book, that you find interesting. Compute values counts, means, or other statistics.
Explanation of variables: http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=pregResp
DataBook (BABYSEX): http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=&subSec=8014&srtLabel=611801


In [21]:
def ex1pr4():
    df = nsfg.ReadFemPreg()

    # データフレームにある全ての列と、その基本統計量を表示する
    #cf.show_info_df(df)

    # ここでは、赤ちゃんの性別がどの位の割合で分布しているか確かめてみる
    # 一般には、男105 : 女100 くらいだと言われている
    babysex = df['babysex']

    # babysexの列の基本統計量を表示
    print(babysex.describe(), "\n")

    # 「1」は男、「2」は女の意味(DataBookを参照)なので、置き換えて何行か表示してみる
    print(babysex.replace({1: "Male", 2:"Female"}))

    # 男女比がどのくらいか調べてみる
    print("\nTotal: {0}\nMale: {1}\nFemale: {2}\nNaN: {3}\n" \
        .format(len(babysex), len(df[df.babysex == 1]), len(df[df.babysex == 2]), df.babysex.isnull().sum()))
    print("Percentage of Male: {0:.3f}%\nPercentage of Feale: {1:.3f}%" \
        .format((len(df[df.babysex == 1]) * 100.0 / df.babysex.count()), (len(df[df.babysex == 2]) * 100.0 / df.babysex.count())))


ex1pr4()


(count    9141.000000
mean        1.492287
std         0.499968
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         2.000000
dtype: float64, '\n')
0       Male
1     Female
2       Male
3     Female
4     Female
5       Male
6     Female
7     Female
8       Male
9     Female
10      Male
11    Female
12      Male
13       NaN
14       NaN
...
13578    Female
13579      Male
13580       NaN
13581      Male
13582       NaN
13583       NaN
13584    Female
13585       NaN
13586       NaN
13587       NaN
13588      Male
13589       NaN
13590       NaN
13591      Male
13592      Male
Name: babysex, Length: 13593, dtype: object

Total: 13593
Male: 4641
Female: 4500
NaN: 4452

Percentage of Male: 50.771%
Percentage of Feale: 49.229%

Exercise1 Problem5

Count the number of live births with birthwgt_lb between 9 and 95 pounds (including both). The result should be 798


In [22]:
def ex1pr5():
    df = nsfg.ReadFemPreg()

    # outcome列を使い、生きて産まれた赤ちゃんだけを取り出す。
    live = df[df.outcome == 1]

    # 取り出した赤ちゃんの重さを表示する
    birthwgt_lb = df.birthwgt_lb
    print("From 9 to 95 pounds: {0}".format(len(live.query("birthwgt_lb >= 9 & birthwgt_lb <= 95"))))


ex1pr5()


From 9 to 95 pounds: 798

Exercise1 Problem6

Compute the mean prglngth for first babies and others. Compute the difference in means, expressed in hours.


In [23]:
def ex1pr6():
    df = nsfg.ReadFemPreg()

    # 1人目の赤ちゃんと、2人目以降の赤ちゃんをとり出す
    firsts = df[df.birthord == 1]
    others = df[df.birthord > 1]

    # それぞれ妊娠期間の平均を計算する
    weeks_f = firsts['prglngth'].mean()
    weeks_o = others['prglngth'].mean()

    # その差をhour単位に換算
    difference = (weeks_f - weeks_o) * 7 * 24

    print("1st babies prglngth(mean): {0:.3} WEEKS".format(weeks_f))
    print("Other babies prglngth(mean): {0:.3} WEEKS".format(weeks_o))
    print("Difference(firsts - others): {0:.3} hours".format(difference))


ex1pr6()


1st babies prglngth(mean): 38.6 WEEKS
Other babies prglngth(mean): 38.5 WEEKS
Difference(firsts - others): 13.1 hours

Exercise2-1

Print the value counts for this variable(pregnum: how many times each respondent has been pregnant) and compare them to the published results.
DataBook: http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=FEM&section=R&subSec=7869&srtLabel=606835


In [24]:
def ex2pr1():
    df = cf.ReadFemResp()

    # データが何回目の妊娠かを出力
    pregnum = df['pregnum']
    #printf(pregnum)
    print(pregnum.value_counts().sort_index(), "\n") #19回は信頼性のある数字か?
    print("NONE: {0}".format(len(df.query("pregnum == 0"))))
    print("1 PREGNANCY: {0}".format(len(df.query("pregnum == 1"))))
    print("2 PREGNANCIES: {0}".format(len(df.query("pregnum == 2"))))
    print("3 PREGNANCIES: {0}".format(len(df.query("pregnum == 3"))))
    print("4 PREGNANCIES: {0}".format(len(df.query("pregnum == 4"))))
    print("5 PREGNANCIES: {0}".format(len(df.query("pregnum == 5"))))
    print("6 PREGNANCIES: {0}".format(len(df.query("pregnum == 6"))))
    print("7 OR MORE PREGNANCIES: {0}".format(len(df.query("pregnum > 6"))))
    print("\nTOTAL: {0}".format(pregnum.count()))
    

ex2pr1()


(0     2610
1     1267
2     1432
3     1110
4      611
5      305
6      150
7       80
8       40
9       21
10       9
11       3
12       2
14       2
19       1
dtype: int64, '\n')
NONE: 2610
1 PREGNANCY: 1267
2 PREGNANCIES: 1432
3 PREGNANCIES: 1110
4 PREGNANCIES: 611
5 PREGNANCIES: 305
6 PREGNANCIES: 150
7 OR MORE PREGNANCIES: 158

TOTAL: 7643

Exercise2-2


In [25]:
# Ex2 Preg辞書とResp辞書のそれぞれで、同じcaseidを持つ回答者が同じpregnumの値を答えているか調べる
def ex2pr2():
    df = nsfg.ReadFemPreg()
    df2 = cf.ReadFemResp()
    dict_preg = nsfg.MakePregMap(df)

    for caseid in sorted(dict_preg):
        pregnum_of_df = len(dict_preg[caseid])
        pregnum_of_df2 = df2[df2.caseid == caseid].pregnum.values[0]

        # 一致していたらtrue
        is_valid = pregnum_of_df == pregnum_of_df2

        # 全てのcaseidについて、2つの辞書で妊娠回数が一致しているかprint
        """
        print("\ncaseid: {0}さんは、".format(caseid))
        message = "正しいデータです!" if is_valid else "不正なデータです!"
        print(message)
        print("Preg: {0}人、 Resp: {1}人\n".format(pregnum_of_df, pregnum_of_df2))
        print("+" + "-" * 20 + "+")
        """

        if is_valid == False:
            break

    # 全て一致していたらTrue, 1つでも違ったらFalse
    print(is_valid)


ex2pr2()


True

In [25]: