notebook.community

Edit and run



In [198]:

    
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
%matplotlib inline



In [199]:

    
def nummap(x):
    ttnkeys = list(txttonum.keys())
    if x in ttnkeys:
        return txttonum[x]
    else:
        return x
    
def agemap(x):
    atnkeys = list(agetonum.keys())
    if x in atnkeys:
        return agetonum[x]
    else:
        return x
    
def sexmap(x):
    sexkeys = list(sextonum.keys())
    if x in sexkeys:
        return sextonum[x]
    else:
        return x



In [253]:

    
bo = pd.read_excel('Birth_order_survey_Responses.xlsx')



In [237]:

    
# translate column names
coldict = {}
coldict['Comments?'] = 'comments'
coldict['How many years older than you is your next oldest sibling (if applicable)?'] = 'sib_yrsolder'
coldict['How many years younger than you is your next youngest sibling (if applicable)?'] = 'sib_yrsynger'
coldict['Where do you fall in your family\'s birth order?'] = 'your_bo'
coldict['How old are you?'] = 'age'
coldict['Timestamp'] = 'timestamp'
coldict['What is your gender?'] = 'gender'
coldict['What position in the birth order are your closest friends? [Friend 1]'] = 'fr1'
coldict['What position in the birth order are your closest friends? [Friend 2]'] = 'fr2'
coldict['What position in the birth order are your closest friends? [Friend 3]'] = 'fr3'
coldict['What position in the birth order are your closest friends? [Friend 4]'] = 'fr4'
coldict['What position in the birth order are your closest friends? [Friend 5]'] = 'fr5'
coldict['What position in the birth order are your closest friends? [Friend 6]'] = 'fr6'
coldict['What position in the birth order are your closest friends? [Friend 7]'] = 'fr7'
coldict['What position in the birth order are your significant others? [Romance 1]'] = 'rom1'
coldict['What position in the birth order are your significant others? [Romance 2]'] = 'rom2'
coldict['What position in the birth order are your significant others? [Romance 3]'] = 'rom3'
coldict['What position in the birth order are your significant others? [Romance 4]'] = 'rom4'
coldict['What position in the birth order are your significant others? [Romance 5]'] = 'rom5'
coldict['What position in the birth order are your significant others? [Romance 6]'] = 'rom6'
coldict['What position in the birth order are your significant others? [Romance 7]'] = 'rom7'



In [238]:

    
# translate numeric entries 
txttonum = {}
txttonum['Only child'] = 0
txttonum['First'] = 1
txttonum['Second or higher'] = 2
txttonum['Second'] = 2
txttonum['Third or higher'] = 3
txttonum['Other'] = 4
txttonum['Don\'t know'] = 5



In [239]:

    
agetonum = {}
agetonum['16-24'] = 0
agetonum['25-34'] = 1
agetonum['35-44'] = 2
agetonum['45-54'] = 3
agetonum['55-64'] = 4
agetonum['65-74'] = 5
agetonum['75+'] = 6



In [240]:

    
sextonum = {}
sextonum['Male'] = 0
sextonum['Female'] = 1
sextonum['genderqueer'] = 2
sextonum['Genderqueer'] = 2
sextonum['gender non-conforming'] = 2



In [254]:

    
# apply column and numeric mappings
bo = bo.rename(columns=coldict)
bo = bo.applymap(nummap)
bo = bo.applymap(agemap)
bo = bo.applymap(sexmap)



In [257]:

    
# drop duplicate rows
print(len(bo))
subset = ['sib_yrsynger','sib_yrsolder','your_bo','gender','age','comments'] + frcols + romcols
# bo.drop_duplicates(subset = subset, inplace=True)
dups = bo.duplicated(subset=subset, take_last=True)
bo[dups]









    



476






    Out[257]:






  
    
      
      timestamp
      age
      gender
      your_bo
      sib_yrsolder
      sib_yrsynger
      fr1
      fr2
      fr3
      fr4
      ...
      fr6
      fr7
      rom1
      rom2
      rom3
      rom4
      rom5
      rom6
      rom7
      comments
    
  
  
    
      216
      2015-06-16 00:55:52.800
      4
      1
      1
      NaN
      3
      2
      2
      2
      NaN
      ...
      NaN
      NaN
      2
      0
      1
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      251
      2015-06-16 09:51:36.793
      1
      1
      1
      NaN
      2 years
      2
      2
      2
      2
      ...
      0
      2
      2
      1
      1
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      306
      2015-06-17 11:33:19.620
      2
      1
      1
      NaN
      18 monthd
      2
      2
      2
      1
      ...
      2
      2
      2
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
  

3 rows × 21 columns



In [246]:

    
# designate friend and romance columns
frcols = ['fr1','fr2','fr3','fr4','fr5','fr6','fr7']
romcols = ['rom1','rom2','rom3','rom4','rom5','rom6','rom7']



In [208]:

    
# check that there are no strings in numeric columns
def nostring(collist):
    for col in collist:
        nonstr = [num for a in bo[col].tolist() if type(a) == str]
        assert len(nonstr) < 1, 'String present in %s column' %col

collist = ['age','gender','your_bo'] + frcols + romcols
nostring(collist)



In [209]:

    
bo_calc = pd.DataFrame(index=bo.index)



In [210]:

    
copycols = ['age','your_bo','sib_yrsolder','sib_yrsynger']
bo_calc[copycols] = bo[copycols]



In [211]:

    
bo_calc['fr_count'] = bo[frcols].count(axis=1)
bo_calc['rom_count'] = bo[romcols].count(axis=1)



In [213]:

    
# create error columns



In [214]:

    
# raise error if someone is first born but has an older sibling
bo_calc['err_yrsolder'] = float('NaN')



In [215]:

    
# raise error if someone's birth order is 'other' or 'don't know'
bo_calc['err_uncertainbo'] = bo_calc['your_bo'].apply(lambda x: 1 if x == 4 or x == 5 else float('NaN'))



In [280]:

    
# impute the minimum number of siblings
bo_calc['numsibs'] = float('NaN')
if bo_calc.loc((bo_calc.your_bo.isin([1, 2, 3])) & (type(bo_calc.loc(bo_calc.sib_yrsynger)) == float)):
    bo_calc['numsibs'] = bo_calc.your_bo + 1



In [ ]:

    
# make column designating twins
twins = [1 for val in bo_calc.sib_yrsolder.tolist() if re.search('twin',str(val))]
# bo_calc['twin'] = twins



In [ ]:

    
twtest = 'twin 1 for me'
twtest2 = 'tweeen 1'
result = re.search('twin',twtest).group()
result



In [281]:

    
bo_calc.head()









    Out[281]:






  
    
      
      age
      your_bo
      sib_yrsolder
      sib_yrsynger
      fr_count
      rom_count
      err_yrsolder
      err_uncertainbo
      numsibs
    
  
  
    
      0
      1
      1
      NaN
      2
      7
      5
      NaN
      NaN
      2
    
    
      1
      2
      2
      3
      NaN
      2
      1
      NaN
      NaN
      3
    
    
      2
      1
      2
      5
      na
      7
      4
      NaN
      NaN
      3
    
    
      3
      1
      3
      3
      NaN
      7
      4
      NaN
      NaN
      4
    
    
      4
      1
      2
      6
      NaN
      7
      7
      NaN
      NaN
      3



In [ ]:

    
if bo_calc.your_bo == 1 and bo_calc.sib_yrsolder.notnull() == True:
    bo['err_yrsolder'] = 1



In [ ]:

    
bo_calc.sib_yrsynger.unique()



In [ ]:



In [ ]:

	timestamp	age	gender	your_bo	sib_yrsolder	sib_yrsynger	fr1	fr2	fr3	fr4	...	fr6	fr7	rom1	rom2	rom3	rom4	rom5	rom6	rom7	comments
216	2015-06-16 00:55:52.800	4	1	1	NaN	3	2	2	2	NaN	...	NaN	NaN	2	0	1	NaN	NaN	NaN	NaN	NaN
251	2015-06-16 09:51:36.793	1	1	1	NaN	2 years	2	2	2	2	...	0	2	2	1	1	NaN	NaN	NaN	NaN	NaN
306	2015-06-17 11:33:19.620	2	1	1	NaN	18 monthd	2	2	2	1	...	2	2	2	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	age	your_bo	sib_yrsolder	sib_yrsynger	fr_count	rom_count	err_yrsolder	err_uncertainbo	numsibs
0	1	1	NaN	2	7	5	NaN	NaN	2
1	2	2	3	NaN	2	1	NaN	NaN	3
2	1	2	5	na	7	4	NaN	NaN	3
3	1	3	3	NaN	7	4	NaN	NaN	4
4	1	2	6	NaN	7	7	NaN	NaN	3

	age	your_bo	sib_yrsolder	sib_yrsynger	fr_count	rom_count	err_yrsolder	err_uncertainbo	numsibs
0	1	1	NaN	2	7	5	NaN	NaN	2
1	2	2	3	NaN	2	1	NaN	NaN	3
2	1	2	5	na	7	4	NaN	NaN	3
3	1	3	3	NaN	7	4	NaN	NaN	4
4	1	2	6	NaN	7	7	NaN	NaN	3

	age	your_bo	sib_yrsolder	sib_yrsynger	fr_count	rom_count	err_yrsolder	err_uncertainbo	numsibs
0	1	1	NaN	2	7	5	NaN	NaN	2
1	2	2	3	NaN	2	1	NaN	NaN	3
2	1	2	5	na	7	4	NaN	NaN	3
3	1	3	3	NaN	7	4	NaN	NaN	4
4	1	2	6	NaN	7	7	NaN	NaN	3