In [198]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
%matplotlib inline
In [199]:
def nummap(x):
    """Translate a birth-order answer to its numeric code.

    x -- a single cell value.

    Returns txttonum[x] when x is one of the keys of the notebook-level
    txttonum dict; any other value is returned unchanged.
    """
    try:
        # Direct hash lookup instead of copying the keys into a list and
        # scanning it for every cell of the frame.
        return txttonum.get(x, x)
    except TypeError:
        # Unhashable values can never be dict keys, so pass them through.
        return x
def agemap(x):
    """Translate an age-bracket answer to its numeric code.

    x -- a single cell value.

    Returns agetonum[x] when x is one of the keys of the notebook-level
    agetonum dict; any other value is returned unchanged.
    """
    try:
        # Direct hash lookup instead of copying the keys into a list and
        # scanning it for every cell of the frame.
        return agetonum.get(x, x)
    except TypeError:
        # Unhashable values can never be dict keys, so pass them through.
        return x
def sexmap(x):
    """Translate a gender answer to its numeric code.

    x -- a single cell value.

    Returns sextonum[x] when x is one of the keys of the notebook-level
    sextonum dict; any other value is returned unchanged.
    """
    try:
        # Direct hash lookup instead of copying the keys into a list and
        # scanning it for every cell of the frame.
        return sextonum.get(x, x)
    except TypeError:
        # Unhashable values can never be dict keys, so pass them through.
        return x
In [253]:
# Read the workbook Birth_order_survey_Responses.xlsx (path relative to the
# working directory) into the DataFrame `bo`.
bo = pd.read_excel('Birth_order_survey_Responses.xlsx')
In [237]:
# Lookup from the verbatim survey question (the spreadsheet header) to a short column name.
coldict = {
    'Comments?': 'comments',
    'How many years older than you is your next oldest sibling (if applicable)?': 'sib_yrsolder',
    'How many years younger than you is your next youngest sibling (if applicable)?': 'sib_yrsynger',
    "Where do you fall in your family's birth order?": 'your_bo',
    'How old are you?': 'age',
    'Timestamp': 'timestamp',
    'What is your gender?': 'gender',
}
# The friend and romance questions repeat for slots 1 through 7 -> fr1..fr7, rom1..rom7.
coldict.update(
    ('What position in the birth order are your closest friends? [Friend %d]' % slot, 'fr%d' % slot)
    for slot in range(1, 8)
)
coldict.update(
    ('What position in the birth order are your significant others? [Romance %d]' % slot, 'rom%d' % slot)
    for slot in range(1, 8)
)
In [238]:
# Numeric codes for the birth-order answers.
# 'Second or higher' and 'Second' deliberately share code 2.
txttonum = {
    'Only child': 0,
    'First': 1,
    'Second or higher': 2,
    'Second': 2,
    'Third or higher': 3,
    'Other': 4,
    "Don't know": 5,
}
In [239]:
# Age brackets are coded 0 through 6 in the order listed.
agetonum = dict(zip(
    ['16-24', '25-34', '35-44', '45-54', '55-64', '65-74', '75+'],
    range(7),
))
In [240]:
# Numeric codes for the gender answers; the three non-binary spellings share code 2.
sextonum = {
    'Male': 0,
    'Female': 1,
    'genderqueer': 2,
    'Genderqueer': 2,
    'gender non-conforming': 2,
}
In [254]:
# Shorten the column names, then pass every cell of the frame through each
# recoding helper in turn (birth order, age bracket, gender).
bo = (
    bo.rename(columns=coldict)
    .applymap(nummap)
    .applymap(agemap)
    .applymap(sexmap)
)
In [257]:
# List duplicate survey responses. Rows are only displayed here; nothing is
# removed from bo. The timestamp column is not part of the comparison.
print(len(bo))
# The friend/romance column names are generated in place so this cell does not
# depend on frcols/romcols, which are defined further down the notebook.
subset = (
    ['sib_yrsynger', 'sib_yrsolder', 'your_bo', 'gender', 'age', 'comments']
    + list(map('fr{}'.format, range(1, 8)))
    + list(map('rom{}'.format, range(1, 8)))
)
# keep='last' is the replacement for the removed take_last=True argument:
# every occurrence except the final one is marked as a duplicate.
dups = bo.duplicated(subset=subset, keep='last')
bo[dups]
Out[257]:
In [246]:
# Column names for the seven friend slots and the seven romance slots.
frcols = list(map('fr{}'.format, range(1, 8)))
romcols = list(map('rom{}'.format, range(1, 8)))
In [208]:
# check that there are no strings in numeric columns
def nostring(collist, frame=None):
    """Assert that none of the given columns contains a string value.

    collist -- iterable of column names to inspect.
    frame   -- DataFrame to check; defaults to the notebook-level ``bo``.

    Raises AssertionError naming the first column that holds a string.
    """
    if frame is None:
        frame = bo
    for col in collist:
        # Collect the offending values themselves. The previous comprehension
        # yielded an undefined name, so any string triggered a NameError
        # instead of the intended assertion message.
        strings = [val for val in frame[col].tolist() if isinstance(val, str)]
        assert len(strings) < 1, 'String present in %s column' %col
# Run the string check over the respondent's own coded answers plus every
# friend and romance column.
collist = ['age', 'gender', 'your_bo']
collist += frcols
collist += romcols
nostring(collist)
In [209]:
# Start an empty frame for derived columns that shares bo's row index.
bo_calc = pd.DataFrame(index=bo.index)
In [210]:
# Carry these four answer columns over from bo to the working frame unchanged.
copycols = [
    'age',
    'your_bo',
    'sib_yrsolder',
    'sib_yrsynger',
]
bo_calc[copycols] = bo.loc[:, copycols]
In [211]:
# Per respondent, how many friend / romance slots hold a non-null answer.
bo_calc['fr_count'] = bo[frcols].notnull().sum(axis=1)
bo_calc['rom_count'] = bo[romcols].notnull().sum(axis=1)
In [213]:
# create error columns
In [214]:
# Column reserved for the "first born but reports an older sibling" flag.
# This cell only creates it: every row starts out as NaN.
bo_calc['err_yrsolder'] = np.nan
In [215]:
# Flag (1) respondents whose own birth order is coded 4 ('Other') or
# 5 ("Don't know"); everyone else gets NaN.
bo_calc['err_uncertainbo'] = bo_calc['your_bo'].apply(
    lambda order: 1 if order in (4, 5) else float('NaN')
)
In [280]:
# impute the minimum number of siblings
bo_calc['numsibs'] = float('NaN')
# Rows whose own birth order is a usable ordinal (codes 1, 2 or 3).
has_ordinal_bo = bo_calc['your_bo'].isin([1, 2, 3])
# NOTE(review): the original condition tested for a float in sib_yrsynger;
# this is read as "a numeric gap to a younger sibling was reported" - confirm.
younger_gap = pd.to_numeric(bo_calc['sib_yrsynger'], errors='coerce')
impute_rows = has_ordinal_bo & younger_gap.notnull()
# Boolean-mask assignment: only the selected rows are filled, the rest stay
# NaN. (A Series cannot be used as an `if` condition, and .loc must be
# indexed with [] rather than called.)
bo_calc.loc[impute_rows, 'numsibs'] = bo_calc.loc[impute_rows, 'your_bo'].astype(float) + 1
In [ ]:
# make column designating twins
# One entry per row so the list lines up with bo_calc: 1 when the
# older-sibling answer mentions "twin", 0 otherwise. (Collecting only the
# matches gave a list shorter than the frame, which cannot be assigned as a
# column.) Matching ignores case so a capitalised "Twin" is caught as well.
twins = [
    1 if re.search('twin', str(val), re.IGNORECASE) else 0
    for val in bo_calc.sib_yrsolder.tolist()
]
bo_calc['twin'] = twins
In [ ]:
# Scratch check of the 'twin' pattern against a sample answer.
# twtest2 is defined but not searched in this cell.
twtest = 'twin 1 for me'
twtest2 = 'tweeen 1'
result = re.compile('twin').search(twtest).group()
result
In [281]:
# Display the first rows of bo_calc as a quick sanity check.
bo_calc.head()
Out[281]:
In [ ]:
# Flag respondents who say they are first born (code 1) yet also report an
# older sibling. The test is built as an element-wise boolean mask because a
# whole Series cannot be used in `if ... and ...`. The flag is written to
# bo_calc, next to the other derived columns, and only on the matching rows.
firstborn_with_older = (bo_calc['your_bo'] == 1) & bo_calc['sib_yrsolder'].notnull()
bo_calc.loc[firstborn_with_older, 'err_yrsolder'] = 1
In [ ]:
# List the distinct values present in the sib_yrsynger column.
bo_calc.sib_yrsynger.unique()
In [ ]:
In [ ]: