In [1]:
import savReaderWriter as s
import pandas as pd
import numpy as np
%matplotlib inline
In [2]:
# Load the pickled `dem` table from the working directory and display its shape.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
dem = pd.read_pickle('dem.pkl')
dem.shape
Out[2]:
In [3]:
# Number of distinct index values in dem; a count below the number of rows
# means some index values are repeated.
dem_index_values = set(dem.index)
len(dem_index_values)
Out[3]:
In [3]:
In [4]:
# Load the pickled `hs` table from the working directory and display its shape.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
hs = pd.read_pickle('hs.pkl')
hs.shape
Out[4]:
In [5]:
# Number of distinct index values in hs (the table loaded in the previous cell);
# a count below the number of rows means some index values are repeated.
len(set(hs.index))
Out[5]:
In [5]:
In [6]:
# Split both tables on the YEAR column; the grouped objects are reused below.
dems, hss = dem.groupby('YEAR'), hs.groupby('YEAR')

# List the YEAR keys present in dem.
for year_key, _rows in dems:
    print(year_key)
In [7]:
# For each YEAR group of dem print: key, row count, distinct index values,
# the surplus of rows over distinct values, and the smallest/largest index value.
for name, group in dems:
    row_count = len(group)
    distinct_count = len(set(group.index))
    surplus = row_count - distinct_count
    print(name, row_count, distinct_count, surplus,
          min(group.index), max(group.index))
In [8]:
# For each YEAR group of hs print: key, row count, distinct index values,
# the surplus of rows over distinct values, and the smallest/largest index value.
for name, group in hss:
    row_count = len(group)
    distinct_count = len(set(group.index))
    surplus = row_count - distinct_count
    print(name, row_count, distinct_count, surplus,
          min(group.index), max(group.index))
In [9]:
# Join the dem and hs rows of each YEAR side by side on their shared index.
# `joined` maps a YEAR key to the combined frame; years whose index cannot be
# aligned are collected in `skipped_years` and reported instead of being
# dropped silently.
joined = {}
skipped_years = []
for name, group1 in dems:
    group2 = hss.get_group(name)
    n = len(group1)
    m = len(group2)
    print(name, n, m, n-m)
    try:
        joined[name] = pd.concat([group1, group2], axis=1)
    except pd.errors.InvalidIndexError:
        # pd.errors is the public home of this exception; the old
        # pd.core.index path no longer resolves on current pandas.
        # TODO: deal with the years that have duplicate SUBJID
        skipped_years.append(name)

if skipped_years:
    print('skipped years (index could not be aligned):', skipped_years)
In [10]:
# Continue with the joined frame stored under the key 1999.0 only.
# Raises KeyError if that key was never added to `joined`.
df = joined[1999.0]
In [10]:
In [11]:
# Summary statistics for the columns of the selected frame.
df.describe()
Out[11]:
In [12]:
# Frequency of each AGE1 value, ordered by value.
df['AGE1'].value_counts().sort_index()
Out[12]:
In [13]:
# Frequency of each AGE2 value, ordered by value.
df['AGE2'].value_counts().sort_index()
Out[13]:
In [13]:
In [13]:
In [14]:
# select people 24 and younger
# (implemented as AGE1 <= 6 — TODO confirm that code 6 is the right cutoff).
# .copy() makes the subset an independent frame, so the column assignments in
# later cells do not write into a slice of the per-year frame
# (SettingWithCopyWarning).
df = df.loc[df.AGE1 <= 6].copy()
len(df)
Out[14]:
In [15]:
# Frequency of each HSGPA value, ordered by value.
df['HSGPA'].value_counts().sort_index()
Out[15]:
In [16]:
# Frequency of each SATV value, ordered by value.
df['SATV'].value_counts().sort_index()
Out[16]:
In [17]:
# Frequency of each SATM value, ordered by value.
df['SATM'].value_counts().sort_index()
Out[17]:
In [18]:
# Frequency of each ACTCOMP value, ordered by value.
df['ACTCOMP'].value_counts().sort_index()
Out[18]:
In [64]:
# Recode 0 as missing (NaN) in the five *_T activity columns.
# TODO: check that 0 really means NA for these vars
for act_col in ('ACT17_T', 'ACT21_T', 'ACT23_T', 'ACT24_T', 'ACT26_T'):
    df.loc[:, act_col] = df[act_col].replace(0, np.nan)
In [65]:
# Fraction of missing values in each column.
df.isna().mean()
Out[65]:
In [21]:
# Frequency of each ACT08 value, ordered by value.
df['ACT08'].value_counts().sort_index()
Out[21]:
In [22]:
# Frequency of each ACT09 value, ordered by value.
df['ACT09'].value_counts().sort_index()
Out[22]:
In [23]:
# Frequency of each ACT21 value, ordered by value.
df['ACT21'].value_counts().sort_index()
Out[23]:
In [24]:
# Frequency of each ACT25 value, ordered by value.
df['ACT25'].value_counts().sort_index()
Out[24]:
In [25]:
# Frequency of each ACT28 value, ordered by value.
df['ACT28'].value_counts().sort_index()
Out[25]:
In [26]:
# Frequency of each ACT29 value, ordered by value.
df['ACT29'].value_counts().sort_index()
Out[26]:
In [66]:
# Frequency of each ACT17_T value, ordered by value.
df['ACT17_T'].value_counts().sort_index()
Out[66]:
In [67]:
# Frequency of each ACT21_T value, ordered by value.
df['ACT21_T'].value_counts().sort_index()
Out[67]:
In [68]:
# Frequency of each ACT24_T value, ordered by value.
df['ACT24_T'].value_counts().sort_index()
Out[68]:
In [30]:
# Frequency of each INCOME value, ordered by value.
df['INCOME'].value_counts().sort_index()
Out[30]:
In [31]:
# Frequency of each FATHEDUC value, ordered by value.
df['FATHEDUC'].value_counts().sort_index()
Out[31]:
In [32]:
# Frequency of each MOTHEDUC value, ordered by value.
df['MOTHEDUC'].value_counts().sort_index()
Out[32]:
In [33]:
# Frequency of each FIRSTGEN value, ordered by value.
df['FIRSTGEN'].value_counts().sort_index()
Out[33]:
In [34]:
# Frequency of each SRELIGIONA value, ordered by value.
# Assumption (unverified): the codes follow the order "PCJON", which would
# make code 5 the "N" category — TODO confirm against the codebook.
df['SRELIGIONA'].value_counts().sort_index()
Out[34]:
In [35]:
# Frequency of each FRELIGIONA value, ordered by value.
df['FRELIGIONA'].value_counts().sort_index()
Out[35]:
In [36]:
# Frequency of each MRELIGIONA value, ordered by value.
df['MRELIGIONA'].value_counts().sort_index()
Out[36]:
In [39]:
# HASRELIG is 1 when SRELIGIONA is anything other than code 5, else 0
# (code 5 is assumed to mean "no religion" — TODO confirm against the codebook).
# Rows with a missing SRELIGIONA stay missing: a plain `!= 5` comparison is
# True for NaN and would count them as having a religion.
religion_known = df.SRELIGIONA.notnull()
df.loc[:, 'HASRELIG'] = (df.SRELIGIONA != 5).astype(float).where(religion_known)
# Share with a religion among rows where SRELIGIONA is known.
df.HASRELIG.mean()
Out[39]:
In [39]:
In [40]:
# Mean of HASRELIG within each INCOME value, drawn as a line plot.
# HASRELIG is selected before aggregating so only that column is averaged.
df.groupby('INCOME')['HASRELIG'].mean().plot()
Out[40]:
In [41]:
# Mean of HASRELIG within each FATHEDUC value, drawn as a line plot.
# HASRELIG is selected before aggregating so only that column is averaged.
df.groupby('FATHEDUC')['HASRELIG'].mean().plot()
Out[41]:
In [42]:
# Mean of HASRELIG within each MOTHEDUC value, drawn as a line plot.
# HASRELIG is selected before aggregating so only that column is averaged.
df.groupby('MOTHEDUC')['HASRELIG'].mean().plot()
Out[42]:
In [43]:
In [68]:
In [45]:
# Mean of HASRELIG within each RACEGROUP value, drawn as a line plot.
# HASRELIG is selected before aggregating so only that column is averaged.
df.groupby('RACEGROUP')['HASRELIG'].mean().plot()
Out[45]:
In [46]:
# Mean of HASRELIG within each FIRSTGEN value, drawn as a line plot.
# HASRELIG is selected before aggregating so only that column is averaged.
df.groupby('FIRSTGEN')['HASRELIG'].mean().plot()
Out[46]:
In [47]:
# Mean of HASRELIG within each HSGPA value, drawn as a line plot.
# HASRELIG is selected before aggregating so only that column is averaged.
df.groupby('HSGPA')['HASRELIG'].mean().plot()
Out[47]:
In [48]:
# Mean of HASRELIG within each MRELIGIONA value, drawn as a line plot.
# HASRELIG is selected before aggregating so only that column is averaged.
df.groupby('MRELIGIONA')['HASRELIG'].mean().plot()
Out[48]:
In [49]:
# Mean of HASRELIG within each FRELIGIONA value, drawn as a line plot.
# HASRELIG is selected before aggregating so only that column is averaged.
df.groupby('FRELIGIONA')['HASRELIG'].mean().plot()
Out[49]:
In [51]:
# Parental-religion indicators (code 5 is assumed to mean "no religion" —
# TODO confirm against the codebook):
#   MNONE: MRELIGIONA is code 5
#   FNONE: FRELIGIONA is code 5
#   MIXED: FRELIGIONA and MRELIGIONA differ
# Rows with a missing input stay missing. Plain comparisons would code a
# missing parent as 0 for MNONE/FNONE and as 1 for MIXED (NaN != NaN is True).
mother_known = df.MRELIGIONA.notnull()
father_known = df.FRELIGIONA.notnull()
df.loc[:, 'MNONE'] = (df.MRELIGIONA == 5).astype(float).where(mother_known)
df.loc[:, 'FNONE'] = (df.FRELIGIONA == 5).astype(float).where(father_known)
df.loc[:, 'MIXED'] = (
    (df.FRELIGIONA != df.MRELIGIONA).astype(float).where(mother_known & father_known)
)
In [52]:
# Mean of HASRELIG within each FNONE value, drawn as a line plot.
# HASRELIG is selected before aggregating so only that column is averaged.
df.groupby('FNONE')['HASRELIG'].mean().plot()
Out[52]:
In [53]:
# Mean of HASRELIG within each MNONE value, drawn as a line plot.
# HASRELIG is selected before aggregating so only that column is averaged.
df.groupby('MNONE')['HASRELIG'].mean().plot()
Out[53]:
In [54]:
# parents have different religions
# Mean of HASRELIG within each MIXED value; HASRELIG is selected before
# aggregating so only that column is averaged.
df.groupby('MIXED')['HASRELIG'].mean().plot()
Out[54]:
In [55]:
# discussed politics
# Mean of HASRELIG within each ACT08 value; HASRELIG is selected before
# aggregating so only that column is averaged.
df.groupby('ACT08')['HASRELIG'].mean().plot()
Out[55]:
In [56]:
# discussed religion
#df.groupby('ACT09').mean().HASRELIG.plot()
In [57]:
# played a musical instrument
# Mean of HASRELIG within each ACT21 value; HASRELIG is selected before
# aggregating so only that column is averaged.
df.groupby('ACT21')['HASRELIG'].mean().plot()
Out[57]:
In [58]:
# used a personal computer
# Mean of HASRELIG within each ACT28 value; HASRELIG is selected before
# aggregating so only that column is averaged.
df.groupby('ACT28')['HASRELIG'].mean().plot()
Out[58]:
In [59]:
# used the Internet for research or homework
# Mean of HASRELIG within each ACT29 value; HASRELIG is selected before
# aggregating so only that column is averaged.
df.groupby('ACT29')['HASRELIG'].mean().plot()
Out[59]:
In [69]:
# other Internet use
# Mean of HASRELIG within each ACT17_T value; HASRELIG is selected before
# aggregating so only that column is averaged.
df.groupby('ACT17_T')['HASRELIG'].mean().plot()
Out[69]:
In [70]:
# Participated in Internet chat rooms
# Mean of HASRELIG within each ACT21_T value; HASRELIG is selected before
# aggregating so only that column is averaged.
df.groupby('ACT21_T')['HASRELIG'].mean().plot()
Out[70]:
In [76]:
# played chess
# Mean of HASRELIG within each ACT23_T value; HASRELIG is selected before
# aggregating so only that column is averaged.
df.groupby('ACT23_T')['HASRELIG'].mean().plot()
Out[76]:
In [73]:
# played computer games
# Mean of HASRELIG within each ACT24_T value; HASRELIG is selected before
# aggregating so only that column is averaged.
df.groupby('ACT24_T')['HASRELIG'].mean().plot()
Out[73]:
In [77]:
# read the editorial page
# Mean of HASRELIG within each ACT26_T value; HASRELIG is selected before
# aggregating so only that column is averaged.
df.groupby('ACT26_T')['HASRELIG'].mean().plot()
Out[77]:
In [75]:
# 0/1 indicator columns for three RACEGROUP codes (2, 3 and 4).
# NOTE(review): a row with a missing RACEGROUP gets 0 in all three columns.
for dummy_name, race_code in (('ASIAN', 2), ('BLACK', 3), ('HISP', 4)):
    df.loc[:, dummy_name] = (df.RACEGROUP == race_code).astype(int)

# INCOME centred on its mean, plus the square of the centred value.
df.loc[:, 'INC'] = df.INCOME - df.INCOME.mean()
df.loc[:, 'INC2'] = df.INC**2
In [63]:
In [78]:
import statsmodels.formula.api as smf

# Logistic regression of HASRELIG on the parental-religion indicators, the
# race dummies, centred income (linear and squared) and ACT17_T.
# A wider specification that also listed ACT08, ACT09, ACT21, ACT28, ACT29,
# ACT21_T and ACT24_T used to be assigned to `formula` first, but it was
# overwritten before use and never fitted, so that dead assignment is removed.
formula = ('HASRELIG ~ MNONE + FNONE + MIXED + '
           'ASIAN + BLACK + HISP + INC + INC2 + '
           'ACT17_T')
model = smf.logit(formula, data=df)
results = model.fit()
# Display the fitted coefficient table.
results.summary()
Out[78]:
In [64]:
In [64]: