notebook.community

Edit and run



In [1]:

    
import savReaderWriter as s
import pandas as pd
import numpy as np

%matplotlib inline



In [2]:

    
dem = pd.read_pickle('dem.pkl')
dem.shape









    Out[2]:





(1008802, 14)



In [3]:

    
len(set(dem.index))









    Out[3]:





549496



In [3]:



In [4]:

    
hs = pd.read_pickle('hs.pkl')
hs.shape









    Out[4]:





(1008802, 41)



In [5]:

    
len(set(dem.index))









    Out[5]:





549496



In [5]:



In [6]:

    
dems = dem.groupby('YEAR')
hss = hs.groupby('YEAR')

for name, group in dems:
    print(name)



In [7]:

    
for name, group in dems:
    n = len(group)
    m = len(set(group.index))
    print(name, n, m, n-m, min(group.index), max(group.index))









    



(1996.0, 240565, 240565, 0, 100001.0, 679935.0)
(1997.0, 243040, 243040, 0, 1.0, 544792.0)
(1998.0, 270234, 270234, 0, 4301.0, 770452.0)
(1999.0, 254963, 254963, 0, 18.0, 588301.0)



In [8]:

    
for name, group in hss:
    n = len(group)
    m = len(set(group.index))
    print(name, n, m, n-m, min(group.index), max(group.index) )









    



(1996.0, 240565, 240565, 0, 100001.0, 679935.0)
(1997.0, 243040, 243040, 0, 1.0, 544792.0)
(1998.0, 270234, 270234, 0, 4301.0, 770452.0)
(1999.0, 254963, 254963, 0, 18.0, 588301.0)



In [9]:

    
joined = {}
for name, group1 in dems:
    group2 = hss.get_group(name)
    n = len(group1)
    m = len(group2)
    print(name, n, m, n-m)
    try:
        joined[name] = pd.concat([group1, group2], axis=1)
    except pd.core.index.InvalidIndexError:
        # TODO: deal with the years that have duplicate SUBJID
        pass









    



(1996.0, 240565, 240565, 0)
(1997.0, 243040, 243040, 0)
(1998.0, 270234, 270234, 0)
(1999.0, 254963, 254963, 0)



In [10]:

    
df = joined[1999.0]



In [10]:



In [11]:

    
df.describe()









    Out[11]:






  
    
      
      YEAR
      SUBJID
      SEX
      AGE1
      AGE2
      RRACE
      RACEGROUP
      INCOME
      FATHEDUC
      MOTHEDUC
      ...
      ACT29
      ACT30
      ACT31
      ACT32
      ACT33
      ACT17_T
      ACT21_T
      ACT23_T
      ACT24_T
      ACT26_T
    
  
  
    
      count
       254963
       254963.000000
       254963.000000
       253031.000000
       253031.000000
       254963.000000
       250951.000000
       228109.000000
       254963.000000
       254963.000000
      ...
       252007.000000
       251421.000000
       250717.000000
       251380.000000
       251961.000000
       254963.000000
       254963.000000
       254963.000000
       254963.000000
       254963.000000
    
    
      mean
         1999
       262637.832568
            1.561823
            3.284198
            3.278575
            1.984264
            4.763484
           15.278652
            5.423563
            5.235995
      ...
            2.542675
            1.706791
            2.043148
            1.361266
            2.373486
            2.193146
            1.725341
            1.408138
            1.959688
            1.767551
    
    
      std
            0
       169974.832774
            0.496164
            0.569794
            0.532832
            0.124451
            1.019940
            5.722904
            2.170062
            1.989437
      ...
            0.601739
            0.612357
            0.671691
            0.573339
            0.552231
            0.770246
            0.782734
            0.619980
            0.729157
            0.712127
    
    
      min
         1999
           18.000000
            1.000000
            1.000000
            1.000000
            1.000000
            1.000000
            1.000000
            0.000000
            0.000000
      ...
            1.000000
            1.000000
            1.000000
            1.000000
            1.000000
            0.000000
            0.000000
            0.000000
            0.000000
            0.000000
    
    
      25%
         1999
       102510.500000
            1.000000
            3.000000
            3.000000
            2.000000
            5.000000
           13.000000
            3.000000
            3.000000
      ...
            2.000000
            1.000000
            2.000000
            1.000000
            2.000000
            2.000000
            1.000000
            1.000000
            1.000000
            1.000000
    
    
      50%
         1999
       255331.000000
            2.000000
            3.000000
            3.000000
            2.000000
            5.000000
           17.000000
            6.000000
            6.000000
      ...
            3.000000
            2.000000
            2.000000
            1.000000
            2.000000
            2.000000
            2.000000
            1.000000
            2.000000
            2.000000
    
    
      75%
         1999
       408788.500000
            2.000000
            4.000000
            4.000000
            2.000000
            5.000000
           20.000000
            8.000000
            6.000000
      ...
            3.000000
            2.000000
            2.000000
            2.000000
            3.000000
            3.000000
            2.000000
            2.000000
            2.000000
            2.000000
    
    
      max
         1999
       588301.000000
            2.000000
           11.000000
            6.000000
            2.000000
            7.000000
           23.000000
            8.000000
            8.000000
      ...
            3.000000
            3.000000
            3.000000
            3.000000
            3.000000
            3.000000
            3.000000
            3.000000
            3.000000
            3.000000
    
  

8 rows × 55 columns



In [12]:

    
df.AGE1.value_counts().sort_index()









    Out[12]:





1        138
2       4697
3     177078
4      67814
5       2265
7        804
8        126
9         72
10        34
11         3
dtype: int64



In [13]:

    
df.AGE2.value_counts().sort_index()









    Out[13]:





1       138
2      4697
3    177078
4     67814
5      2265
6      1039
dtype: int64



In [13]:



In [13]:



In [14]:

    
# select people 24 and younger
df = df.loc[df.AGE1 <= 6]
len(df)









    Out[14]:





251992



In [15]:

    
df.HSGPA.value_counts().sort_index()









    Out[15]:





1      112
2     3170
3     7773
4    16078
5    46173
6    51719
7    61159
8    64066
dtype: int64



In [16]:

    
df.SATV.value_counts().sort_index()









    Out[16]:





200     11
210      9
212      1
220      6
230     11
240     18
250     17
260     10
270     25
275      1
280     34
289      1
290     31
300    112
305      1
...
756       1
760    1340
764       1
768       3
770    1353
777       1
778       1
780     973
782       1
785       2
790     628
796       1
798       1
799       1
800    2544
Length: 297, dtype: int64



In [17]:

    
df.SATM.value_counts().sort_index()









    Out[17]:





200    13
210    11
220     6
223     1
225     2
226     1
230     9
236     1
238     1
240    16
245     1
250    15
260     8
263     1
264     1
...
738       1
740    1494
742       1
745       1
746       1
750    1971
751       2
760    1790
766       1
770     682
780    1450
782       1
790    1027
791       1
800    2583
Length: 298, dtype: int64



In [18]:

    
df.ACTCOMP.value_counts().sort_index()









    Out[18]:





1        3
2       26
3        7
6        1
7        2
8        1
9        1
10       4
11      25
12      60
13      93
14     233
15     434
16     832
17    2075
18    3579
19    5255
20    5139
21    8516
22    7271
23    8464
24    9345
25    7864
26    8688
27    8815
28    8035
29    7021
30    5249
31    4358
32    3098
33    1565
34     862
35     328
36      77
dtype: int64



In [64]:

    
# TODO: check that 0 really means NA for these vars
df.loc[:, 'ACT17_T'] = df.ACT17_T.replace(0, np.nan)
df.loc[:, 'ACT21_T'] = df.ACT21_T.replace(0, np.nan)
df.loc[:, 'ACT23_T'] = df.ACT23_T.replace(0, np.nan)
df.loc[:, 'ACT24_T'] = df.ACT24_T.replace(0, np.nan)
df.loc[:, 'ACT26_T'] = df.ACT26_T.replace(0, np.nan)



In [65]:

    
df.isnull().mean()









    Out[65]:





YEAR          0.000000
SUBJID        0.000000
SEX           0.000000
AGE1          0.000000
AGE2          0.000000
RRACE         0.000000
RACEGROUP     0.014862
INCOME        0.104031
FATHEDUC      0.000000
MOTHEDUC      0.000000
FIRSTGEN      0.013782
FRELIGIONA    0.115309
MRELIGIONA    0.093729
SRELIGIONA    0.034009
YEAR          0.000000
SUBJID        0.000000
HSGPA         0.006913
SATV          0.414751
SATM          0.414569
SATW          1.000000
ACTCOMP       0.574090
ACT01         0.010881
ACT02         0.011659
ACT03         0.011917
ACT04         0.013695
ACT05         0.011770
ACT06         0.019524
ACT08         0.013231
ACT09         0.016092
ACT10         0.013056
ACT11         0.014127
ACT12         1.000000
ACT14         0.012199
ACT15         0.010893
ACT17         0.010965
ACT18         0.017171
ACT19         0.013715
ACT20         0.011600
ACT21         0.012822
ACT23         1.000000
ACT24         0.011691
ACT25         0.010996
ACT26         0.010699
ACT27         0.012056
ACT28         0.012528
ACT29         0.010639
ACT30         0.012929
ACT31         0.015679
ACT32         0.013024
ACT33         0.010711
ACT17_T       0.016350
ACT21_T       0.011992
ACT23_T       0.013437
ACT24_T       0.011782
ACT26_T       0.011953
HASRELIG      0.000000
PAREDUC       0.000000
MNONE         0.000000
FNONE         0.000000
MIXED         0.000000
Length: 60, dtype: float64



In [21]:

    
df.ACT08.value_counts().sort_index()









    Out[21]:





1     64709
2    137078
3     46871
dtype: int64



In [22]:

    
df.ACT09.value_counts().sort_index()









    Out[22]:





1     38878
2    131278
3     77781
dtype: int64



In [23]:

    
df.ACT21.value_counts().sort_index()









    Out[23]:





1    143096
2     47024
3     58641
dtype: int64



In [24]:

    
df.ACT25.value_counts().sort_index()









    Out[24]:





1      6859
2     73679
3    168683
dtype: int64



In [25]:

    
df.ACT28.value_counts().sort_index()









    Out[25]:





1     17039
2     52052
3    179744
dtype: int64



In [26]:

    
df.ACT29.value_counts().sort_index()









    Out[26]:





1     13917
2     85641
3    149753
dtype: int64



In [66]:

    
df.ACT17_T.value_counts().sort_index()









    Out[66]:





1     41905
2    106353
3     99614
dtype: int64



In [67]:

    
df.ACT21_T.value_counts().sort_index()









    Out[67]:





1    111557
2     88551
3     48862
dtype: int64



In [68]:

    
df.ACT24_T.value_counts().sort_index()









    Out[68]:





1     62620
2    127436
3     58967
dtype: int64



In [30]:

    
df.INCOME.value_counts().sort_index()









    Out[30]:





1      3389
2      3055
4      5454
5      5617
6      8103
7      8673
9     17880
13    21210
15    23689
17    31157
18    34013
20    32405
22    12976
23    18156
dtype: int64



In [31]:

    
df.FATHEDUC.value_counts().sort_index()









    Out[31]:





0     6485
1     5817
2     8288
3    43890
4     9250
5    34708
6    68388
7     6741
8    68425
dtype: int64



In [32]:

    
df.MOTHEDUC.value_counts().sort_index()









    Out[32]:





0     4240
1     5336
2     6261
3    50823
4    12192
5    40307
6    76152
7     9005
8    47676
dtype: int64



In [33]:

    
df.FIRSTGEN.value_counts().sort_index()









    Out[33]:





1    203994
2     44525
dtype: int64



In [34]:

    
# I'm assuming that the code is PCJON
df.SRELIGIONA.value_counts().sort_index()









    Out[34]:





1    107459
2     76732
3      8173
4     13114
5     37944
dtype: int64



In [35]:

    
df.FRELIGIONA.value_counts().sort_index()









    Out[35]:





1    100512
2     73465
3      9319
4     12251
5     27388
dtype: int64



In [36]:

    
df.MRELIGIONA.value_counts().sort_index()









    Out[36]:





1    109813
2     79434
3      8865
4     12479
5     17782
dtype: int64



In [39]:

    
df.loc[:, 'HASRELIG'] = (df.SRELIGIONA != 5).astype(int)
df.HASRELIG.mean()









    Out[39]:





0.84942379123146772



In [39]:



In [40]:

    
df.groupby('INCOME').mean().HASRELIG.plot()









    Out[40]:





<matplotlib.axes.AxesSubplot at 0x7fed29452350>



In [41]:

    
df.groupby('FATHEDUC').mean().HASRELIG.plot()









    Out[41]:





<matplotlib.axes.AxesSubplot at 0x7fed2940ca10>



In [42]:

    
df.groupby('MOTHEDUC').mean().HASRELIG.plot()









    Out[42]:





<matplotlib.axes.AxesSubplot at 0x7fed2922fe90>



In [43]:



In [68]:



In [45]:

    
df.groupby('RACEGROUP').mean().HASRELIG.plot()









    Out[45]:





<matplotlib.axes.AxesSubplot at 0x7fed29061b10>



In [46]:

    
df.groupby('FIRSTGEN').mean().HASRELIG.plot()









    Out[46]:





<matplotlib.axes.AxesSubplot at 0x7fed28fd0110>



In [47]:

    
df.groupby('HSGPA').mean().HASRELIG.plot()









    Out[47]:





<matplotlib.axes.AxesSubplot at 0x7fed28e7f850>



In [48]:

    
df.groupby('MRELIGIONA').mean().HASRELIG.plot()









    Out[48]:





<matplotlib.axes.AxesSubplot at 0x7fed28e5c7d0>



In [49]:

    
df.groupby('FRELIGIONA').mean().HASRELIG.plot()









    Out[49]:





<matplotlib.axes.AxesSubplot at 0x7fed28d2ad50>



In [51]:

    
df.loc[:, 'MNONE'] = (df.MRELIGIONA == 5).astype(int)
df.loc[:, 'FNONE'] = (df.FRELIGIONA == 5).astype(int)
df.loc[:, 'MIXED'] = (df.FRELIGIONA != df.MRELIGIONA).astype(int)



In [52]:

    
df.groupby('FNONE').mean().HASRELIG.plot()









    Out[52]:





<matplotlib.axes.AxesSubplot at 0x7fed28c5e2d0>



In [53]:

    
df.groupby('MNONE').mean().HASRELIG.plot()









    Out[53]:





<matplotlib.axes.AxesSubplot at 0x7fed28b39990>



In [54]:

    
# parents have different religions
df.groupby('MIXED').mean().HASRELIG.plot()









    Out[54]:





<matplotlib.axes.AxesSubplot at 0x7fed28a79d10>



In [55]:

    
# discussed politics 
df.groupby('ACT08').mean().HASRELIG.plot()









    Out[55]:





<matplotlib.axes.AxesSubplot at 0x7fed289c60d0>



In [56]:

    
# discussed religion
#df.groupby('ACT09').mean().HASRELIG.plot()



In [57]:

    
# played a musical instrument
df.groupby('ACT21').mean().HASRELIG.plot()









    Out[57]:





<matplotlib.axes.AxesSubplot at 0x7fed28904110>



In [58]:

    
# used a personal computer
df.groupby('ACT28').mean().HASRELIG.plot()









    Out[58]:





<matplotlib.axes.AxesSubplot at 0x7fed28841150>



In [59]:

    
# used the Internet for research or homework
df.groupby('ACT29').mean().HASRELIG.plot()









    Out[59]:





<matplotlib.axes.AxesSubplot at 0x7fed28778b10>



In [69]:

    
# other Internet use
df.groupby('ACT17_T').mean().HASRELIG.plot()









    Out[69]:





<matplotlib.axes.AxesSubplot at 0x7fed27d5e550>



In [70]:

    
# Participated in Internet chat rooms
df.groupby('ACT21_T').mean().HASRELIG.plot()









    Out[70]:





<matplotlib.axes.AxesSubplot at 0x7fed27ca2790>



In [76]:

    
# played chess
df.groupby('ACT23_T').mean().HASRELIG.plot()









    Out[76]:





<matplotlib.axes.AxesSubplot at 0x7fed27a0b110>



In [73]:

    
# played computer games
df.groupby('ACT24_T').mean().HASRELIG.plot()









    Out[73]:





<matplotlib.axes.AxesSubplot at 0x7fed27ab9dd0>



In [77]:

    
# read the editorial page
df.groupby('ACT26_T').mean().HASRELIG.plot()









    Out[77]:





<matplotlib.axes.AxesSubplot at 0x7fed27925410>



In [75]:

    
df.loc[:, 'ASIAN'] = (df.RACEGROUP == 2).astype(int)
df.loc[:, 'BLACK'] = (df.RACEGROUP == 3).astype(int)
df.loc[:, 'HISP'] = (df.RACEGROUP == 4).astype(int)
df.loc[:, 'INC'] = df.INCOME - df.INCOME.mean()
df.loc[:, 'INC2'] = df.INC**2



In [63]:



In [78]:

    
import statsmodels.formula.api as smf

formula = ('HASRELIG ~ MNONE + FNONE + MIXED + '
                  'ASIAN + BLACK + HISP + INC + INC2 + '
                  'ACT08 + ACT09 + ACT21 + ACT28 + ACT29 + '
                  'ACT17_T + ACT21_T + ACT24_T')
            
formula = ('HASRELIG ~ MNONE + FNONE + MIXED + '
                  'ASIAN + BLACK + HISP + INC + INC2 + '
                  'ACT17_T')
            
model = smf.logit(formula, data=df)
results = model.fit()
results.summary()









    



Optimization terminated successfully.
         Current function value: 0.321774
         Iterations 7






    Out[78]:





Logit Regression Results

  Dep. Variable:      HASRELIG        No. Observations:    223570 


  Model:                Logit         Df Residuals:        223560 


  Method:                MLE          Df Model:                 9 


  Date:           Thu, 23 Jul 2015    Pseudo R-squ.:       0.2527 


  Time:               11:49:01        Log-Likelihood:      -71939.


  converged:            True          LL-Null:             -96271.


                                    LLR p-value:          0.000 




               coef      std err       z       P>|z|  [95.0% Conf. Int.] 


  Intercept      2.9338      0.026    113.334   0.000      2.883     2.984


  MNONE         -2.8543      0.025   -112.414   0.000     -2.904    -2.804


  FNONE         -1.3263      0.020    -66.305   0.000     -1.365    -1.287


  MIXED         -0.6334      0.015    -41.886   0.000     -0.663    -0.604


  ASIAN         -0.4040      0.026    -15.566   0.000     -0.455    -0.353


  BLACK          0.6087      0.037     16.372   0.000      0.536     0.682


  HISP           0.1321      0.040      3.290   0.001      0.053     0.211


  INC           -0.0189      0.001    -12.623   0.000     -0.022    -0.016


  INC2          -0.0017      0.000     -8.492   0.000     -0.002    -0.001


  ACT17_T       -0.1526      0.010    -15.379   0.000     -0.172    -0.133



In [64]:



In [64]:

	YEAR	SUBJID	SEX	AGE1	AGE2	RRACE	RACEGROUP	INCOME	FATHEDUC	MOTHEDUC	...	ACT29	ACT30	ACT31	ACT32	ACT33	ACT17_T	ACT21_T	ACT23_T	ACT24_T	ACT26_T
count	254963	254963.000000	254963.000000	253031.000000	253031.000000	254963.000000	250951.000000	228109.000000	254963.000000	254963.000000	...	252007.000000	251421.000000	250717.000000	251380.000000	251961.000000	254963.000000	254963.000000	254963.000000	254963.000000	254963.000000
mean	1999	262637.832568	1.561823	3.284198	3.278575	1.984264	4.763484	15.278652	5.423563	5.235995	...	2.542675	1.706791	2.043148	1.361266	2.373486	2.193146	1.725341	1.408138	1.959688	1.767551
std	0	169974.832774	0.496164	0.569794	0.532832	0.124451	1.019940	5.722904	2.170062	1.989437	...	0.601739	0.612357	0.671691	0.573339	0.552231	0.770246	0.782734	0.619980	0.729157	0.712127
min	1999	18.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	0.000000	0.000000	...	1.000000	1.000000	1.000000	1.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	1999	102510.500000	1.000000	3.000000	3.000000	2.000000	5.000000	13.000000	3.000000	3.000000	...	2.000000	1.000000	2.000000	1.000000	2.000000	2.000000	1.000000	1.000000	1.000000	1.000000
50%	1999	255331.000000	2.000000	3.000000	3.000000	2.000000	5.000000	17.000000	6.000000	6.000000	...	3.000000	2.000000	2.000000	1.000000	2.000000	2.000000	2.000000	1.000000	2.000000	2.000000
75%	1999	408788.500000	2.000000	4.000000	4.000000	2.000000	5.000000	20.000000	8.000000	6.000000	...	3.000000	2.000000	2.000000	2.000000	3.000000	3.000000	2.000000	2.000000	2.000000	2.000000
max	1999	588301.000000	2.000000	11.000000	6.000000	2.000000	7.000000	23.000000	8.000000	8.000000	...	3.000000	3.000000	3.000000	3.000000	3.000000	3.000000	3.000000	3.000000	3.000000	3.000000

Dep. Variable:	HASRELIG	No. Observations:	223570
Model:	Logit	Df Residuals:	223560
Method:	MLE	Df Model:	9
Date:	Thu, 23 Jul 2015	Pseudo R-squ.:	0.2527
Time:	11:49:01	Log-Likelihood:	-71939.
converged:	True	LL-Null:	-96271.
		LLR p-value:	0.000

	coef	std err	z	P>\|z\|	[95.0% Conf. Int.]
Intercept	2.9338	0.026	113.334	0.000	2.883 2.984
MNONE	-2.8543	0.025	-112.414	0.000	-2.904 -2.804
FNONE	-1.3263	0.020	-66.305	0.000	-1.365 -1.287
MIXED	-0.6334	0.015	-41.886	0.000	-0.663 -0.604
ASIAN	-0.4040	0.026	-15.566	0.000	-0.455 -0.353
BLACK	0.6087	0.037	16.372	0.000	0.536 0.682
HISP	0.1321	0.040	3.290	0.001	0.053 0.211
INC	-0.0189	0.001	-12.623	0.000	-0.022 -0.016
INC2	-0.0017	0.000	-8.492	0.000	-0.002 -0.001
ACT17_T	-0.1526	0.010	-15.379	0.000	-0.172 -0.133