In [198]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
%matplotlib inline

In [199]:
def nummap(x):
    ttnkeys = list(txttonum.keys())
    if x in ttnkeys:
        return txttonum[x]
    else:
        return x
    
def agemap(x):
    atnkeys = list(agetonum.keys())
    if x in atnkeys:
        return agetonum[x]
    else:
        return x
    
def sexmap(x):
    sexkeys = list(sextonum.keys())
    if x in sexkeys:
        return sextonum[x]
    else:
        return x

In [253]:
bo = pd.read_excel('Birth_order_survey_Responses.xlsx')

In [237]:
# translate column names
coldict = {}
coldict['Comments?'] = 'comments'
coldict['How many years older than you is your next oldest sibling (if applicable)?'] = 'sib_yrsolder'
coldict['How many years younger than you is your next youngest sibling (if applicable)?'] = 'sib_yrsynger'
coldict['Where do you fall in your family\'s birth order?'] = 'your_bo'
coldict['How old are you?'] = 'age'
coldict['Timestamp'] = 'timestamp'
coldict['What is your gender?'] = 'gender'
coldict['What position in the birth order are your closest friends? [Friend 1]'] = 'fr1'
coldict['What position in the birth order are your closest friends? [Friend 2]'] = 'fr2'
coldict['What position in the birth order are your closest friends? [Friend 3]'] = 'fr3'
coldict['What position in the birth order are your closest friends? [Friend 4]'] = 'fr4'
coldict['What position in the birth order are your closest friends? [Friend 5]'] = 'fr5'
coldict['What position in the birth order are your closest friends? [Friend 6]'] = 'fr6'
coldict['What position in the birth order are your closest friends? [Friend 7]'] = 'fr7'
coldict['What position in the birth order are your significant others? [Romance 1]'] = 'rom1'
coldict['What position in the birth order are your significant others? [Romance 2]'] = 'rom2'
coldict['What position in the birth order are your significant others? [Romance 3]'] = 'rom3'
coldict['What position in the birth order are your significant others? [Romance 4]'] = 'rom4'
coldict['What position in the birth order are your significant others? [Romance 5]'] = 'rom5'
coldict['What position in the birth order are your significant others? [Romance 6]'] = 'rom6'
coldict['What position in the birth order are your significant others? [Romance 7]'] = 'rom7'

In [238]:
# translate numeric entries 
txttonum = {}
txttonum['Only child'] = 0
txttonum['First'] = 1
txttonum['Second or higher'] = 2
txttonum['Second'] = 2
txttonum['Third or higher'] = 3
txttonum['Other'] = 4
txttonum['Don\'t know'] = 5

In [239]:
agetonum = {}
agetonum['16-24'] = 0
agetonum['25-34'] = 1
agetonum['35-44'] = 2
agetonum['45-54'] = 3
agetonum['55-64'] = 4
agetonum['65-74'] = 5
agetonum['75+'] = 6

In [240]:
sextonum = {}
sextonum['Male'] = 0
sextonum['Female'] = 1
sextonum['genderqueer'] = 2
sextonum['Genderqueer'] = 2
sextonum['gender non-conforming'] = 2

In [254]:
# apply column and numeric mappings
bo = bo.rename(columns=coldict)
bo = bo.applymap(nummap)
bo = bo.applymap(agemap)
bo = bo.applymap(sexmap)

In [257]:
# drop duplicate rows
print(len(bo))
subset = ['sib_yrsynger','sib_yrsolder','your_bo','gender','age','comments'] + frcols + romcols
# bo.drop_duplicates(subset = subset, inplace=True)
dups = bo.duplicated(subset=subset, take_last=True)
bo[dups]


476
Out[257]:
timestamp age gender your_bo sib_yrsolder sib_yrsynger fr1 fr2 fr3 fr4 ... fr6 fr7 rom1 rom2 rom3 rom4 rom5 rom6 rom7 comments
216 2015-06-16 00:55:52.800 4 1 1 NaN 3 2 2 2 NaN ... NaN NaN 2 0 1 NaN NaN NaN NaN NaN
251 2015-06-16 09:51:36.793 1 1 1 NaN 2 years 2 2 2 2 ... 0 2 2 1 1 NaN NaN NaN NaN NaN
306 2015-06-17 11:33:19.620 2 1 1 NaN 18 monthd 2 2 2 1 ... 2 2 2 NaN NaN NaN NaN NaN NaN NaN

3 rows × 21 columns


In [246]:
# designate friend and romance columns
frcols = ['fr1','fr2','fr3','fr4','fr5','fr6','fr7']
romcols = ['rom1','rom2','rom3','rom4','rom5','rom6','rom7']

In [208]:
# check that there are no strings in numeric columns
def nostring(collist):
    for col in collist:
        nonstr = [num for a in bo[col].tolist() if type(a) == str]
        assert len(nonstr) < 1, 'String present in %s column' %col

collist = ['age','gender','your_bo'] + frcols + romcols
nostring(collist)

In [209]:
bo_calc = pd.DataFrame(index=bo.index)

In [210]:
copycols = ['age','your_bo','sib_yrsolder','sib_yrsynger']
bo_calc[copycols] = bo[copycols]

In [211]:
bo_calc['fr_count'] = bo[frcols].count(axis=1)
bo_calc['rom_count'] = bo[romcols].count(axis=1)

In [213]:
# create error columns

In [214]:
# raise error if someone is first born but has an older sibling
bo_calc['err_yrsolder'] = float('NaN')

In [215]:
# raise error if someone's birth order is 'other' or 'don't know'
bo_calc['err_uncertainbo'] = bo_calc['your_bo'].apply(lambda x: 1 if x == 4 or x == 5 else float('NaN'))

In [280]:
# impute the minimum number of siblings
bo_calc['numsibs'] = float('NaN')
if bo_calc.loc((bo_calc.your_bo.isin([1, 2, 3])) & (type(bo_calc.loc(bo_calc.sib_yrsynger)) == float)):
    bo_calc['numsibs'] = bo_calc.your_bo + 1

In [ ]:
# make column designating twins
twins = [1 for val in bo_calc.sib_yrsolder.tolist() if re.search('twin',str(val))]
# bo_calc['twin'] = twins

In [ ]:
twtest = 'twin 1 for me'
twtest2 = 'tweeen 1'
result = re.search('twin',twtest).group()
result

In [281]:
bo_calc.head()


Out[281]:
age your_bo sib_yrsolder sib_yrsynger fr_count rom_count err_yrsolder err_uncertainbo numsibs
0 1 1 NaN 2 7 5 NaN NaN 2
1 2 2 3 NaN 2 1 NaN NaN 3
2 1 2 5 na 7 4 NaN NaN 3
3 1 3 3 NaN 7 4 NaN NaN 4
4 1 2 6 NaN 7 7 NaN NaN 3

In [ ]:
if bo_calc.your_bo == 1 and bo_calc.sib_yrsolder.notnull() == True:
    bo['err_yrsolder'] = 1

In [ ]:
bo_calc.sib_yrsynger.unique()

In [ ]:


In [ ]: