In [1]:
import savReaderWriter as s
import pandas as pd
import numpy as np

%matplotlib inline

In [2]:
dem = pd.read_pickle('dem.pkl')
dem.shape


Out[2]:
(1008802, 14)

In [3]:
len(set(dem.index))


Out[3]:
549496

In [3]:


In [4]:
hs = pd.read_pickle('hs.pkl')
hs.shape


Out[4]:
(1008802, 41)

In [5]:
len(set(dem.index))


Out[5]:
549496

In [5]:


In [6]:
dems = dem.groupby('YEAR')
hss = hs.groupby('YEAR')

for name, group in dems:
    print(name)


1996.0
1997.0
1998.0
1999.0

In [7]:
for name, group in dems:
    n = len(group)
    m = len(set(group.index))
    print(name, n, m, n-m, min(group.index), max(group.index))


(1996.0, 240565, 240565, 0, 100001.0, 679935.0)
(1997.0, 243040, 243040, 0, 1.0, 544792.0)
(1998.0, 270234, 270234, 0, 4301.0, 770452.0)
(1999.0, 254963, 254963, 0, 18.0, 588301.0)

In [8]:
for name, group in hss:
    n = len(group)
    m = len(set(group.index))
    print(name, n, m, n-m, min(group.index), max(group.index) )


(1996.0, 240565, 240565, 0, 100001.0, 679935.0)
(1997.0, 243040, 243040, 0, 1.0, 544792.0)
(1998.0, 270234, 270234, 0, 4301.0, 770452.0)
(1999.0, 254963, 254963, 0, 18.0, 588301.0)

In [9]:
joined = {}
for name, group1 in dems:
    group2 = hss.get_group(name)
    n = len(group1)
    m = len(group2)
    print(name, n, m, n-m)
    try:
        joined[name] = pd.concat([group1, group2], axis=1)
    except pd.core.index.InvalidIndexError:
        # TODO: deal with the years that have duplicate SUBJID
        pass


(1996.0, 240565, 240565, 0)
(1997.0, 243040, 243040, 0)
(1998.0, 270234, 270234, 0)
(1999.0, 254963, 254963, 0)

In [10]:
df = joined[1999.0]

In [10]:


In [11]:
df.describe()


Out[11]:
YEAR SUBJID SEX AGE1 AGE2 RRACE RACEGROUP INCOME FATHEDUC MOTHEDUC ... ACT29 ACT30 ACT31 ACT32 ACT33 ACT17_T ACT21_T ACT23_T ACT24_T ACT26_T
count 254963 254963.000000 254963.000000 253031.000000 253031.000000 254963.000000 250951.000000 228109.000000 254963.000000 254963.000000 ... 252007.000000 251421.000000 250717.000000 251380.000000 251961.000000 254963.000000 254963.000000 254963.000000 254963.000000 254963.000000
mean 1999 262637.832568 1.561823 3.284198 3.278575 1.984264 4.763484 15.278652 5.423563 5.235995 ... 2.542675 1.706791 2.043148 1.361266 2.373486 2.193146 1.725341 1.408138 1.959688 1.767551
std 0 169974.832774 0.496164 0.569794 0.532832 0.124451 1.019940 5.722904 2.170062 1.989437 ... 0.601739 0.612357 0.671691 0.573339 0.552231 0.770246 0.782734 0.619980 0.729157 0.712127
min 1999 18.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1999 102510.500000 1.000000 3.000000 3.000000 2.000000 5.000000 13.000000 3.000000 3.000000 ... 2.000000 1.000000 2.000000 1.000000 2.000000 2.000000 1.000000 1.000000 1.000000 1.000000
50% 1999 255331.000000 2.000000 3.000000 3.000000 2.000000 5.000000 17.000000 6.000000 6.000000 ... 3.000000 2.000000 2.000000 1.000000 2.000000 2.000000 2.000000 1.000000 2.000000 2.000000
75% 1999 408788.500000 2.000000 4.000000 4.000000 2.000000 5.000000 20.000000 8.000000 6.000000 ... 3.000000 2.000000 2.000000 2.000000 3.000000 3.000000 2.000000 2.000000 2.000000 2.000000
max 1999 588301.000000 2.000000 11.000000 6.000000 2.000000 7.000000 23.000000 8.000000 8.000000 ... 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000

8 rows × 55 columns


In [12]:
df.AGE1.value_counts().sort_index()


Out[12]:
1        138
2       4697
3     177078
4      67814
5       2265
7        804
8        126
9         72
10        34
11         3
dtype: int64

In [13]:
df.AGE2.value_counts().sort_index()


Out[13]:
1       138
2      4697
3    177078
4     67814
5      2265
6      1039
dtype: int64

In [13]:


In [13]:


In [14]:
# select people 24 and younger
df = df.loc[df.AGE1 <= 6]
len(df)


Out[14]:
251992

In [15]:
df.HSGPA.value_counts().sort_index()


Out[15]:
1      112
2     3170
3     7773
4    16078
5    46173
6    51719
7    61159
8    64066
dtype: int64

In [16]:
df.SATV.value_counts().sort_index()


Out[16]:
200     11
210      9
212      1
220      6
230     11
240     18
250     17
260     10
270     25
275      1
280     34
289      1
290     31
300    112
305      1
...
756       1
760    1340
764       1
768       3
770    1353
777       1
778       1
780     973
782       1
785       2
790     628
796       1
798       1
799       1
800    2544
Length: 297, dtype: int64

In [17]:
df.SATM.value_counts().sort_index()


Out[17]:
200    13
210    11
220     6
223     1
225     2
226     1
230     9
236     1
238     1
240    16
245     1
250    15
260     8
263     1
264     1
...
738       1
740    1494
742       1
745       1
746       1
750    1971
751       2
760    1790
766       1
770     682
780    1450
782       1
790    1027
791       1
800    2583
Length: 298, dtype: int64

In [18]:
df.ACTCOMP.value_counts().sort_index()


Out[18]:
1        3
2       26
3        7
6        1
7        2
8        1
9        1
10       4
11      25
12      60
13      93
14     233
15     434
16     832
17    2075
18    3579
19    5255
20    5139
21    8516
22    7271
23    8464
24    9345
25    7864
26    8688
27    8815
28    8035
29    7021
30    5249
31    4358
32    3098
33    1565
34     862
35     328
36      77
dtype: int64

In [64]:
# TODO: check that 0 really means NA for these vars
df.loc[:, 'ACT17_T'] = df.ACT17_T.replace(0, np.nan)
df.loc[:, 'ACT21_T'] = df.ACT21_T.replace(0, np.nan)
df.loc[:, 'ACT23_T'] = df.ACT23_T.replace(0, np.nan)
df.loc[:, 'ACT24_T'] = df.ACT24_T.replace(0, np.nan)
df.loc[:, 'ACT26_T'] = df.ACT26_T.replace(0, np.nan)

In [65]:
df.isnull().mean()


Out[65]:
YEAR          0.000000
SUBJID        0.000000
SEX           0.000000
AGE1          0.000000
AGE2          0.000000
RRACE         0.000000
RACEGROUP     0.014862
INCOME        0.104031
FATHEDUC      0.000000
MOTHEDUC      0.000000
FIRSTGEN      0.013782
FRELIGIONA    0.115309
MRELIGIONA    0.093729
SRELIGIONA    0.034009
YEAR          0.000000
SUBJID        0.000000
HSGPA         0.006913
SATV          0.414751
SATM          0.414569
SATW          1.000000
ACTCOMP       0.574090
ACT01         0.010881
ACT02         0.011659
ACT03         0.011917
ACT04         0.013695
ACT05         0.011770
ACT06         0.019524
ACT08         0.013231
ACT09         0.016092
ACT10         0.013056
ACT11         0.014127
ACT12         1.000000
ACT14         0.012199
ACT15         0.010893
ACT17         0.010965
ACT18         0.017171
ACT19         0.013715
ACT20         0.011600
ACT21         0.012822
ACT23         1.000000
ACT24         0.011691
ACT25         0.010996
ACT26         0.010699
ACT27         0.012056
ACT28         0.012528
ACT29         0.010639
ACT30         0.012929
ACT31         0.015679
ACT32         0.013024
ACT33         0.010711
ACT17_T       0.016350
ACT21_T       0.011992
ACT23_T       0.013437
ACT24_T       0.011782
ACT26_T       0.011953
HASRELIG      0.000000
PAREDUC       0.000000
MNONE         0.000000
FNONE         0.000000
MIXED         0.000000
Length: 60, dtype: float64

In [21]:
df.ACT08.value_counts().sort_index()


Out[21]:
1     64709
2    137078
3     46871
dtype: int64

In [22]:
df.ACT09.value_counts().sort_index()


Out[22]:
1     38878
2    131278
3     77781
dtype: int64

In [23]:
df.ACT21.value_counts().sort_index()


Out[23]:
1    143096
2     47024
3     58641
dtype: int64

In [24]:
df.ACT25.value_counts().sort_index()


Out[24]:
1      6859
2     73679
3    168683
dtype: int64

In [25]:
df.ACT28.value_counts().sort_index()


Out[25]:
1     17039
2     52052
3    179744
dtype: int64

In [26]:
df.ACT29.value_counts().sort_index()


Out[26]:
1     13917
2     85641
3    149753
dtype: int64

In [66]:
df.ACT17_T.value_counts().sort_index()


Out[66]:
1     41905
2    106353
3     99614
dtype: int64

In [67]:
df.ACT21_T.value_counts().sort_index()


Out[67]:
1    111557
2     88551
3     48862
dtype: int64

In [68]:
df.ACT24_T.value_counts().sort_index()


Out[68]:
1     62620
2    127436
3     58967
dtype: int64

In [30]:
df.INCOME.value_counts().sort_index()


Out[30]:
1      3389
2      3055
4      5454
5      5617
6      8103
7      8673
9     17880
13    21210
15    23689
17    31157
18    34013
20    32405
22    12976
23    18156
dtype: int64

In [31]:
df.FATHEDUC.value_counts().sort_index()


Out[31]:
0     6485
1     5817
2     8288
3    43890
4     9250
5    34708
6    68388
7     6741
8    68425
dtype: int64

In [32]:
df.MOTHEDUC.value_counts().sort_index()


Out[32]:
0     4240
1     5336
2     6261
3    50823
4    12192
5    40307
6    76152
7     9005
8    47676
dtype: int64

In [33]:
df.FIRSTGEN.value_counts().sort_index()


Out[33]:
1    203994
2     44525
dtype: int64

In [34]:
# I'm assuming that the code is PCJON
df.SRELIGIONA.value_counts().sort_index()


Out[34]:
1    107459
2     76732
3      8173
4     13114
5     37944
dtype: int64

In [35]:
df.FRELIGIONA.value_counts().sort_index()


Out[35]:
1    100512
2     73465
3      9319
4     12251
5     27388
dtype: int64

In [36]:
df.MRELIGIONA.value_counts().sort_index()


Out[36]:
1    109813
2     79434
3      8865
4     12479
5     17782
dtype: int64

In [39]:
df.loc[:, 'HASRELIG'] = (df.SRELIGIONA != 5).astype(int)
df.HASRELIG.mean()


Out[39]:
0.84942379123146772

In [39]:


In [40]:
df.groupby('INCOME').mean().HASRELIG.plot()


Out[40]:
<matplotlib.axes.AxesSubplot at 0x7fed29452350>

In [41]:
df.groupby('FATHEDUC').mean().HASRELIG.plot()


Out[41]:
<matplotlib.axes.AxesSubplot at 0x7fed2940ca10>

In [42]:
df.groupby('MOTHEDUC').mean().HASRELIG.plot()


Out[42]:
<matplotlib.axes.AxesSubplot at 0x7fed2922fe90>

In [43]:


In [68]:


In [45]:
df.groupby('RACEGROUP').mean().HASRELIG.plot()


Out[45]:
<matplotlib.axes.AxesSubplot at 0x7fed29061b10>

In [46]:
df.groupby('FIRSTGEN').mean().HASRELIG.plot()


Out[46]:
<matplotlib.axes.AxesSubplot at 0x7fed28fd0110>

In [47]:
df.groupby('HSGPA').mean().HASRELIG.plot()


Out[47]:
<matplotlib.axes.AxesSubplot at 0x7fed28e7f850>

In [48]:
df.groupby('MRELIGIONA').mean().HASRELIG.plot()


Out[48]:
<matplotlib.axes.AxesSubplot at 0x7fed28e5c7d0>

In [49]:
df.groupby('FRELIGIONA').mean().HASRELIG.plot()


Out[49]:
<matplotlib.axes.AxesSubplot at 0x7fed28d2ad50>

In [51]:
df.loc[:, 'MNONE'] = (df.MRELIGIONA == 5).astype(int)
df.loc[:, 'FNONE'] = (df.FRELIGIONA == 5).astype(int)
df.loc[:, 'MIXED'] = (df.FRELIGIONA != df.MRELIGIONA).astype(int)

In [52]:
df.groupby('FNONE').mean().HASRELIG.plot()


Out[52]:
<matplotlib.axes.AxesSubplot at 0x7fed28c5e2d0>

In [53]:
df.groupby('MNONE').mean().HASRELIG.plot()


Out[53]:
<matplotlib.axes.AxesSubplot at 0x7fed28b39990>

In [54]:
# parents have different religions
df.groupby('MIXED').mean().HASRELIG.plot()


Out[54]:
<matplotlib.axes.AxesSubplot at 0x7fed28a79d10>

In [55]:
# discussed politics 
df.groupby('ACT08').mean().HASRELIG.plot()


Out[55]:
<matplotlib.axes.AxesSubplot at 0x7fed289c60d0>

In [56]:
# discussed religion
#df.groupby('ACT09').mean().HASRELIG.plot()

In [57]:
# played a musical instrument
df.groupby('ACT21').mean().HASRELIG.plot()


Out[57]:
<matplotlib.axes.AxesSubplot at 0x7fed28904110>

In [58]:
# used a personal computer
df.groupby('ACT28').mean().HASRELIG.plot()


Out[58]:
<matplotlib.axes.AxesSubplot at 0x7fed28841150>

In [59]:
# used the Internet for research or homework
df.groupby('ACT29').mean().HASRELIG.plot()


Out[59]:
<matplotlib.axes.AxesSubplot at 0x7fed28778b10>

In [69]:
# other Internet use
df.groupby('ACT17_T').mean().HASRELIG.plot()


Out[69]:
<matplotlib.axes.AxesSubplot at 0x7fed27d5e550>

In [70]:
# Participated in Internet chat rooms
df.groupby('ACT21_T').mean().HASRELIG.plot()


Out[70]:
<matplotlib.axes.AxesSubplot at 0x7fed27ca2790>

In [76]:
# played chess
df.groupby('ACT23_T').mean().HASRELIG.plot()


Out[76]:
<matplotlib.axes.AxesSubplot at 0x7fed27a0b110>

In [73]:
# played computer games
df.groupby('ACT24_T').mean().HASRELIG.plot()


Out[73]:
<matplotlib.axes.AxesSubplot at 0x7fed27ab9dd0>

In [77]:
# read the editorial page
df.groupby('ACT26_T').mean().HASRELIG.plot()


Out[77]:
<matplotlib.axes.AxesSubplot at 0x7fed27925410>

In [75]:
df.loc[:, 'ASIAN'] = (df.RACEGROUP == 2).astype(int)
df.loc[:, 'BLACK'] = (df.RACEGROUP == 3).astype(int)
df.loc[:, 'HISP'] = (df.RACEGROUP == 4).astype(int)
df.loc[:, 'INC'] = df.INCOME - df.INCOME.mean()
df.loc[:, 'INC2'] = df.INC**2

In [63]:


In [78]:
import statsmodels.formula.api as smf

formula = ('HASRELIG ~ MNONE + FNONE + MIXED + '
                  'ASIAN + BLACK + HISP + INC + INC2 + '
                  'ACT08 + ACT09 + ACT21 + ACT28 + ACT29 + '
                  'ACT17_T + ACT21_T + ACT24_T')
            
formula = ('HASRELIG ~ MNONE + FNONE + MIXED + '
                  'ASIAN + BLACK + HISP + INC + INC2 + '
                  'ACT17_T')
            
model = smf.logit(formula, data=df)
results = model.fit()
results.summary()


Optimization terminated successfully.
         Current function value: 0.321774
         Iterations 7
Out[78]:
Logit Regression Results
Dep. Variable: HASRELIG No. Observations: 223570
Model: Logit Df Residuals: 223560
Method: MLE Df Model: 9
Date: Thu, 23 Jul 2015 Pseudo R-squ.: 0.2527
Time: 11:49:01 Log-Likelihood: -71939.
converged: True LL-Null: -96271.
LLR p-value: 0.000
coef std err z P>|z| [95.0% Conf. Int.]
Intercept 2.9338 0.026 113.334 0.000 2.883 2.984
MNONE -2.8543 0.025 -112.414 0.000 -2.904 -2.804
FNONE -1.3263 0.020 -66.305 0.000 -1.365 -1.287
MIXED -0.6334 0.015 -41.886 0.000 -0.663 -0.604
ASIAN -0.4040 0.026 -15.566 0.000 -0.455 -0.353
BLACK 0.6087 0.037 16.372 0.000 0.536 0.682
HISP 0.1321 0.040 3.290 0.001 0.053 0.211
INC -0.0189 0.001 -12.623 0.000 -0.022 -0.016
INC2 -0.0017 0.000 -8.492 0.000 -0.002 -0.001
ACT17_T -0.1526 0.010 -15.379 0.000 -0.172 -0.133

In [64]:


In [64]: