ThinkStats2 Chapter2 Exerciseのサンプルコード 実行例

モジュール類のインポート


In [1]:
#!/usr/bin/python
#-*- encoding: utf-8 -*-
"""
Sample Codes for ThinkStats2 - Chapter2

Copyright 2015 @myuuuuun
URL: https://github.com/myuuuuun/ThinkStats2-Notebook
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

%matplotlib inline
import sys
sys.path.append('./code')
import pandas as pd
import nsfg
import custom_functions as cf
import sys
import math
import thinkstats2
import thinkplot


# Compute Cohen's d (effect size) between two groups
def CohenEffectSize(g1, g2):
    """Return Cohen's d for the difference in means between g1 and g2.

    Both arguments must provide .mean() and .var() and support len()
    (presumably pandas Series -- confirm against callers).

    The pooled variance weights each group's variance by its size:
    (n1 * var1 + n2 * var2) / (n1 + n2).  The sign of the result follows
    g1.mean() - g2.mean().
    """
    size1, size2 = len(g1), len(g2)
    mean_gap = g1.mean() - g2.mean()
    weighted_var_sum = size1 * g1.var() + size2 * g2.var()
    pooled = weighted_var_sum / (size1 + size2)
    return mean_gap / math.sqrt(pooled)

Exercise2 Problem1

回答者の年齢をヒストグラムで表示する


In [2]:
def ex2pr1():
    """Plot a histogram of the respondents' ages (the age_r column)."""
    # To inspect the raw counts instead of plotting:
    #   respondents.age_r.value_counts().sort_index()  -> ordered by age
    #   respondents.age_r.value_counts()               -> ordered by frequency
    respondents = cf.ReadFemResp()
    age_hist = thinkstats2.Hist(respondents.age_r, label='age_r')
    thinkplot.Hist(age_hist)
    thinkplot.Show()


ex2pr1()


<matplotlib.figure.Figure at 0x111bd8cd0>

Exercise2 Problem2

回答者の家族人数をヒストグラムで表示する


In [3]:
def ex2pr2():
    """Plot a histogram of the respondents' household sizes (numfmhh)."""
    # To inspect the raw counts instead of plotting:
    #   df.numfmhh.value_counts().sort_index()  -> ordered by household size
    #   df.numfmhh.value_counts()               -> ordered by frequency
    df = cf.ReadFemResp()
    # The label now matches the plotted column; it used to read 'anumfmhh'.
    hist = thinkstats2.Hist(df.numfmhh, label='numfmhh')
    thinkplot.Hist(hist)
    thinkplot.Show()


ex2pr2()


<matplotlib.figure.Figure at 0x11188dd50>

Exercise2 Problem3

出産した子供の人数をヒストグラムで表示する


In [4]:
def ex2pr3():
    """Plot a histogram of the number of children borne (the parity column)."""
    # To inspect the raw counts instead of plotting:
    #   respondents.parity.value_counts().sort_index()  -> ordered by parity
    #   respondents.parity.value_counts()               -> ordered by frequency
    # Values such as 22 or 16 may be data-entry errors.
    respondents = cf.ReadFemResp()
    parity_hist = thinkstats2.Hist(respondents.parity, label='parity')
    thinkplot.Hist(parity_hist)
    thinkplot.Show()


ex2pr3()


<matplotlib.figure.Figure at 0x111b2ec10>

Exercise2 Problem4

出産人数上位n件を表示する


In [5]:
def ex2pr4(n=5):
    """Print the n largest parity values together with their frequencies.

    Args:
        n: how many of the largest values to show. Defaults to 5, which
           reproduces the original hard-coded behaviour.
    """
    # To inspect all counts:
    #   df.parity.value_counts().sort_index()  -> ordered by parity
    #   df.parity.value_counts()               -> ordered by frequency
    # Values such as 22 or 16 may be data-entry errors.
    df = cf.ReadFemResp()
    hist = thinkstats2.Hist(df.parity)
    print(hist.Largest(n))


ex2pr4()


[(22, 1), (16, 1), (10, 3), (9, 2), (8, 8)]

Exercise2 Problem5

富裕層の出産人数をヒストグラムで表示する

totincrの詳細: http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=MALE&section=R&subSec=7958&srtLabel=609776
収入75000$以上の人(totincr = 14の人)をrich peopleとかんがえる。


In [6]:
def ex2pr5():
    """Plot a histogram of parity for high-income respondents.

    "High income" means totincr == 14, the code this notebook treats as an
    income of $75,000 or more.
    """
    # To inspect the raw counts instead of plotting:
    #   wealthy.parity.value_counts().sort_index()  -> ordered by parity
    #   wealthy.parity.value_counts()               -> ordered by frequency
    respondents = cf.ReadFemResp()
    is_wealthy = respondents.totincr == 14
    wealthy = respondents[is_wealthy]
    wealthy_hist = thinkstats2.Hist(wealthy.parity, label='r_parity')
    thinkplot.Hist(wealthy_hist)
    thinkplot.Show()


ex2pr5()


<matplotlib.figure.Figure at 0x11616d110>

Exercise2 Problem6

富裕層の出産人数上位n件を表示する


In [7]:
def ex2pr6(n=5):
    """Print the n largest parity values among high-income respondents.

    "High income" means totincr == 14, the code this notebook treats as an
    income of $75,000 or more.

    Args:
        n: how many of the largest values to show. Defaults to 5, which
           reproduces the original hard-coded behaviour.
    """
    # To inspect all counts:
    #   rich.parity.value_counts().sort_index()  -> ordered by parity
    #   rich.parity.value_counts()               -> ordered by frequency
    df = cf.ReadFemResp()
    rich = df[df.totincr == 14]
    hist = thinkstats2.Hist(rich.parity)
    print(hist.Largest(n))


ex2pr6()


[(8, 1), (7, 1), (5, 5), (4, 19), (3, 123)]

Exercise2 Problem7

富裕層とそれ以外の出産人数の平均を比較する


In [10]:
def ex2pr7():
    """Print the mean parity of high-income respondents and of everyone else.

    Respondents with totincr == 14 form the "Rich" group; all rows whose
    totincr differs from 14 form the "Others" group.
    """
    respondents = cf.ReadFemResp()
    is_wealthy = respondents.totincr == 14
    wealthy_mean = respondents[is_wealthy].parity.mean()
    others_mean = respondents[respondents.totincr != 14].parity.mean()
    print("Rich: " + str(wealthy_mean))
    print("Others: " + str(others_mean))


ex2pr7()


Rich: 1.07586206897
Others: 1.24957581367

Exercise3

"The mode of a distribution is the most frequent value; see http://wikipedia.org/wiki/Mode_(statistics).
Write a function called Mode that takes a Hist and returns the most frequent value.
As a more challenging exercise, write a function called AllModes that returns a list of value-frequency pairs in descending order of frequency."

parity(出産人数)で適当に実験してみる


In [12]:
# Take a Hist object and return its most frequent value
def Mode(hist):
    """Return the most frequent value in hist.

    Args:
        hist: an object whose Items() yields (value, frequency) pairs.

    Returns:
        The value with the highest frequency, or None when hist is empty.
        When several values share the top frequency, the one that Items()
        yields first is returned.
    """
    pairs = list(hist.Items())
    if not pairs:
        # Explicit equivalent of the implicit None the old loop fell through to.
        return None
    # max() returns the first maximal item, which matches what a stable
    # descending sort would place first, without sorting everything.
    value, _count = max(pairs, key=lambda pair: pair[1])
    return value


# Take a Hist object and return [value, frequency] pairs, most frequent first
def AllModes(hist):
    """Return every [value, frequency] pair of hist in descending frequency.

    Args:
        hist: an object whose Items() yields (value, frequency) pairs.

    Returns:
        A list of two-element lists. Pairs with equal frequency keep the
        relative order in which Items() produced them.
    """
    ranked = list(hist.Items())
    # list.sort is stable, also with reverse=True, so ties keep their order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return [[value, count] for value, count in ranked]


def ex3():
    """Try Mode and AllModes on the parity column and print both results."""
    respondents = cf.ReadFemResp()
    parity_hist = thinkstats2.Hist(respondents.parity)

    most_frequent = Mode(parity_hist)
    print("Mode()")
    print(most_frequent)

    ranked_pairs = AllModes(parity_hist)
    print("\nAllModes")
    print(ranked_pairs)


ex3()


Mode()
0

AllModes
[[0, 3230], [2, 1603], [1, 1519], [3, 828], [4, 309], [5, 95], [6, 29], [7, 15], [8, 8], [10, 3], [9, 2], [16, 1], [22, 1]]

Exercise4

一番目の赤ちゃんは他の赤ちゃんよりも軽いか重いかを調べる


In [15]:
def ex4():
    """Compare the birth weight of first babies with that of later babies.

    Prints the mean totalwgt_lb of each group and Cohen's d, computed as
    (others, first) so a positive d means later babies are heavier on average.
    """
    df = nsfg.ReadFemPreg()
    first = df[df.birthord == 1]
    others = df[df.birthord > 1]
    # The Hist objects built here before were never used, so they are gone.

    print("First babies average weights: " + str(first.totalwgt_lb.mean()) + " pounds")
    print("Other babies average weights: " + str(others.totalwgt_lb.mean()) + " pounds")
    print("Cohen's d: " + str(CohenEffectSize(others.totalwgt_lb, first.totalwgt_lb)))


ex4()


First babies average weights: 7.20109443044 pounds
Other babies average weights: 7.32585561497 pounds
Cohen's d: 0.0886729270726

In [ ]: