Wave 1, the main survey, was fielded between February 21 and April 2, 2009. Wave 2 was fielded March 12, 2010 to June 8, 2010. Wave 3 was fielded March 22, 2011 to August 29, 2011. Wave 4 was fielded between March and November of 2013. Wave 5 was fielded between November 2014 and March 2015.
In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Let wide frames render all of their columns (up to 1000) instead of being truncated.
pd.set_option('display.max_columns', 1000)
In [28]:
# Read the first HCMST Stata file into `df`.
# NOTE(review): a merge of 2.dta and 3.dta onto this frame via 'caseid_new'
# was sketched here earlier but is disabled; that sketch referenced a `df1`
# that is never defined, so it cannot be re-enabled as written.
df = pd.read_stata(filepath_or_buffer='/gh/data/hcmst/1.dta')
# Preview the first two rows.
df.head(n=2)
Out[28]:
In [29]:
# Raw survey variable name -> readable column name. Only the variables listed
# here are kept. Identity entries (e.g. 'partner_race', 'age_difference') exist
# so that those columns survive the selection below.
rename_cols_dict = {
    'ppage': 'age', 'ppeducat': 'education',
    'ppethm': 'race', 'ppgender': 'sex',
    'pphouseholdsize': 'household_size', 'pphouse': 'house_type',
    'hhinc': 'income', 'ppmarit': 'marital_status',
    'ppmsacat': 'in_metro', 'ppreg4': 'usa_region',
    'pprent': 'house_payment', 'children_in_hh': 'N_child',
    'ppwork': 'work', 'ppnet': 'has_internet',
    'papglb_friend': 'has_gay_friendsfam', 'pppartyid3': 'politics',
    'papreligion': 'religion', 'qflag': 'in_relationship',
    'q9': 'partner_age', 'duration': 'N_minutes_survey',
    'glbstatus': 'is_lgb', 's1': 'is_married',
    'partner_race': 'partner_race', 'q7b': 'partner_religion',
    'q10': 'partner_education', 'US_raised': 'USA_raised',
    'q17a': 'N_marriages', 'q17b': 'N_marriages2', 'coresident': 'cohabit',
    'q21a': 'age_first_met', 'q21b': 'age_relationship_begin',
    'q21d': 'age_married', 'q23': 'relative_income',
    'q25': 'same_high_school', 'q26': 'same_college',
    'q27': 'same_hometown', 'age_difference': 'age_difference',
    'q34': 'relationship_quality',
    'q24_met_online': 'met_online', 'met_through_friends': 'met_friends',
    'met_through_family': 'met_family', 'met_through_as_coworkers': 'met_work',
}

# Select and rename in a single chained expression. `.rename()` returns a new
# frame, so the column assignments below operate on an independent object
# instead of a slice of the raw data (which triggers SettingWithCopyWarning).
df = df[list(rename_cols_dict)].rename(columns=rename_cols_dict)

# Process number of marriages.
# The answer is split over two source columns; each is turned into text with
# missing values blanked, then the two are concatenated.
# NOTE(review): this presumes at most one of the two columns is answered per
# respondent - confirm against the codebook.
marriages_a = df['N_marriages'].astype(str).replace({'nan': ''})
marriages_b = df['N_marriages2'].astype(str).replace({'nan': ''})
n_marriages = (marriages_a + marriages_b).replace({
    '': np.nan,
    'once (this is my first marriage)': 'once',
    'refused': np.nan,
})

df = df.drop('N_marriages2', axis=1)
df['N_marriages'] = n_marriages.astype('category')
In [30]:
# Clean entries to make simpler.
#
# Every recode is assigned back to its column. Calling
# `df[col].replace(..., inplace=True)` / `.fillna(..., inplace=True)` acts on
# an intermediate Series and does not update `df` under pandas Copy-on-Write.
# Statement order is preserved so that new columns are appended in the same
# order as before.

# Denominations counted as Christian for respondent and partner alike.
christian_denominations = [
    'protestant (e.g., methodist, lutheran, presbyterian, episcopal)',
    'catholic', 'baptist-any denomination', 'other christian',
    'pentecostal', 'mormon', 'eastern orthodox',
]

df['in_metro'] = df['in_metro'] == 'metro'
df['relationship_excellent'] = df['relationship_quality'] == 'excellent'
df['house_payment'] = df['house_payment'].replace({
    'owned or being bought by you or someone in your household': 'owned',
    'rented for cash': 'rent',
    'occupied without payment of cash rent': 'free',
})
df['race'] = df['race'].replace({
    'white, non-hispanic': 'white',
    '2+ races, non-hispanic': 'other, non-hispanic',
    'black, non-hispanic': 'black',
})
df['house_type'] = df['house_type'].replace({
    'a one-family house detached from any other house': 'house',
    'a building with 2 or more apartments': 'apartment',
    'a one-family house attached to one or more houses': 'house',
    'a mobile home': 'mobile',
    'boat, rv, van, etc.': 'mobile',
})
df['is_not_working'] = df['work'].str.contains('not working')
df['has_internet'] = df['has_internet'] == 'yes'
# 'yes, both' counts towards both the friends flag and the family flag.
df['has_gay_friends'] = df['has_gay_friendsfam'].isin(['yes, friends', 'yes, both'])
df['has_gay_family'] = df['has_gay_friendsfam'].isin(['yes, relatives', 'yes, both'])
df['religion_is_christian'] = df['religion'].isin(christian_denominations)
df['religion_is_none'] = df['religion'].isin(['none'])
df['in_relationship'] = df['in_relationship'] == 'partnered'
df['is_lgb'] = df['is_lgb'] == 'glb'
df['is_married'] = df['is_married'] == 'yes, i am married'
# NOTE(review): every key except 'NH white' carries a leading space; the keys
# are kept exactly as written - confirm they match the raw labels.
df['partner_race'] = df['partner_race'].replace({
    'NH white': 'white', ' NH black': 'black',
    ' NH Asian Pac Islander': 'other', ' NH Other': 'other',
    ' NH Amer Indian': 'other',
})
df['partner_religion_is_christian'] = df['partner_religion'].isin(christian_denominations)
df['partner_religion_is_none'] = df['partner_religion'].isin(['none'])
# Values absent from the mapping (including missing answers) become NaN in
# `.map` and are then all labelled 'less than high school' by `.fillna`.
df['partner_education'] = df['partner_education'].map({
    'hs graduate or ged': 'high school',
    'some college, no degree': 'some college',
    "associate degree": "some college",
    "bachelor's degree": "bachelor's degree or higher",
    "master's degree": "bachelor's degree or higher",
    "professional or doctorate degree": "bachelor's degree or higher",
}).fillna('less than high school')
df['USA_raised'] = df['USA_raised'] == 'raised in US'
df['N_marriages'] = df['N_marriages'].map({
    'never married': '0', 'once': '1', 'twice': '2',
    'three times': '3+', 'four or more times': '3+',
})
# Expressed from the respondent's side: 'partner earned more' -> 'less'.
df['relative_income'] = df['relative_income'].replace({
    'i earned more': 'more', 'partner earned more': 'less',
    'we earned about the same amount': 'same', 'refused': np.nan,
})
df['same_high_school'] = df['same_high_school'] == 'same high school'
df['same_college'] = df['same_college'] == 'attended same college or university'
df['same_hometown'] = df['same_hometown'] == 'yes'
df['cohabit'] = df['cohabit'] == 'yes'
df['met_online'] = df['met_online'] == 'met online'
df['met_friends'] = df['met_friends'] == 'meet through friends'
df['met_family'] = df['met_family'] == 'met through family'
# Fixed copy-paste bug: this flag was previously derived from `met_family`,
# making it a duplicate of that column.
# NOTE(review): the comparison with 1 presumes the raw coworker indicator is
# coded 0/1 - confirm against the codebook.
df['met_work'] = df['met_work'] == 1
df['age'] = df['age'].astype(int)

# Convert remaining string (object-dtype) columns to categoricals. The previous
# check compared str(type(series)) with 'object', which is never true, so no
# column was ever converted.
for c in df.columns:
    if df[c].dtype == object:
        df[c] = df[c].astype('category')
In [53]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)
Out[53]:
In [58]:
# Persist the cleaned frame as CSV (the row index is written as the first column).
df.to_csv(path_or_buf='/gh/data/hcmst/1_cleaned.csv')
In [54]:
# Print the frequency table of every column as a quick sanity check of the recoding.
for col_name in df:
    counts = df[col_name].value_counts()
    print(counts)
In [55]:
# One panel per column: histogram if the column is numeric, countplot otherwise.
from pandas.api.types import is_numeric_dtype

# Size the square grid from the number of columns. A hard-coded 7x7 grid holds
# exactly 49 panels and raises as soon as one more column is added.
grid_side = max(1, int(np.ceil(np.sqrt(len(df.columns)))))

fig = plt.figure(figsize=(40, 40))
for i, c in enumerate(df.columns):
    ax = fig.add_subplot(grid_side, grid_side, i + 1)
    if is_numeric_dtype(df[c]):
        # Missing values are dropped before the histogram is drawn.
        sns.distplot(df[c].dropna(), kde=False, ax=ax)
    else:
        sns.countplot(y=c, data=df, ax=ax)
fig.savefig('temp.png')
In [57]:
# Bar plot with income on the x-axis, one bar per race category.
sns.barplot(data=df, x='income', y='race')
Out[57]: