Written by CC on 20161228
In [1]:
import pandas
import numpy
import os
import ijson
In [2]:
#os.chdir() returns None, so the directory is stored in `path` first and the
#working directory is switched in a separate step; the cells below open
#'acs2015_1yr_B01001.csv' and 'metadata.json' relative to this directory
path = '/Users/superuser/Documents/projects/SDRegionalDataLib/age friendly community/acs2015_1yr_B01001/'
os.chdir(path)
In [3]:
#read the csv export (looked up relative to the current working directory)
csvFile = 'acs2015_1yr_B01001.csv'
ageData = pandas.read_csv(csvFile)
#preview the first rows of the table
ageData.head()
Out[3]:
In [4]:
#collect the column names of ageData as a plain python list
colNames = list(ageData.columns.values)
#show the first ten coded column names
#(the end of a slice is exclusive, so ten entries need [0:10], not [0:9])
colNames[0:10]
Out[4]:
In [5]:
#open the json file
jsonFile = 'metadata.json'
#ijson parses byte streams, so the file is opened in binary mode
with open(jsonFile, 'rb') as f:
    #ijson.items() yields lazily while reading from f, so the results have to
    #be collected into a list before the file is closed
    objects = ijson.items(f, 'tables.B01001.columns')
    columnAttr = list(objects)
#show the values of columnAttr
columnAttr[:]
Out[5]:
In [6]:
def getRecodingKeys(element):
    """Filter predicate selecting the column names that need recoding.

    Parameters
    ----------
    element : str
        A column name.

    Returns
    -------
    str or bool
        `element` itself when it needs recoding; False when it contains
        'Error' or is exactly 'name' or 'geoid'.
    """
    #error columns are skipped
    if 'Error' in element:
        return False
    #so are the 'name' and 'geoid' columns
    if element == 'name' or element == 'geoid':
        return False
    return element
#keep only the original column names that getRecodingKeys accepts for recoding
recodingKeys = [colName for colName in colNames if getRecodingKeys(colName)]
codingDF = pandas.DataFrame({'origColNames': recodingKeys})
#add an empty placeholder column that will hold the recoded column names
codingDF['recodeColName'] = ''
codingDF is intended to be used as a lookup table for recoding.
codingDF contains 2 columns:
origColNames: contains the original coded column names from the ageData dataframe
recodeColName: contains the actual column names, which are much more descriptive than the coded column names in the ageData dataframe
In [7]:
#fill codingDF.recodeColName with the descriptive name of every coded column
#columnAttr[0] is looked up by coded column name; each entry has a 'name' field
columnMeta = columnAttr[0]
#the column is assigned in one step: element-wise writes through
#codingDF.recodeColName[idx] are chained assignments, which pandas may apply
#to a temporary copy instead of codingDF itself
codingDF['recodeColName'] = [columnMeta[origColName]['name'] for origColName in codingDF.origColNames]
codingDF.head()
Out[7]:
The 2 cells below are my attempts to recode the ageData column names.
Some things to deal with:
1) Each coded column name in the ageData dataframe, 'B01001xxx' for instance, has a corresponding column containing the error of each measurement, with the column name 'B01001xxx, Error'.
2) Finding the index values of a python/pandas dataframe where the rows are equal to a certain value. In CRAN-R you could execute the following code: 'index = which(dataFrame$column=='B01001xxx')'. A python/pandas equivalent would be great if there is one.
In [ ]:
#recode ageData with the actual column names
#lookup table: coded column name -> descriptive column name
recodeLookup = dict(zip(codingDF.origColNames, codingDF.recodeColName))
recodedColNames = []
for col in ageData.columns:
    isErrorCol = 'Error' in col
    #an error column carries the code of the estimate column it belongs to
    #(assumed format '<code>, Error' -- TODO confirm against the csv header)
    baseCode = col.split(',')[0].strip() if isErrorCol else col
    if baseCode in recodeLookup:
        tempColName = recodeLookup[baseCode]
        if isErrorCol:
            tempColName = tempColName + '_Error'
    else:
        #no lookup entry for this column: keep the name it already has
        tempColName = col
    recodedColNames.append(tempColName)
#NOTE(review): if two codes share one descriptive name the recoded labels
#will contain duplicates -- check before selecting columns by name
#column labels cannot be replaced one at a time through ageData.columns[idx],
#so the complete list of labels is assigned in a single step
ageData.columns = recodedColNames
In [ ]:
#recode ageData with the actual column names
#lookup table: coded column name -> descriptive column name
codeToName = dict(zip(codingDF.origColNames, codingDF.recodeColName))
renameMap = {}
for col in ageData.columns:
    isErrorCol = 'Error' in col
    #an error column carries the code of the estimate column it belongs to
    #(assumed format '<code>, Error' -- TODO confirm against the csv header)
    baseCode = col.split(',')[0].strip() if isErrorCol else col
    if baseCode not in codeToName:
        #columns without a lookup entry are left untouched
        continue
    tempColName = codeToName[baseCode]
    if isErrorCol:
        tempColName = tempColName + '_Error'
    renameMap[col] = tempColName
#NOTE(review): if two codes share one descriptive name the recoded labels
#will contain duplicates -- check before selecting columns by name
#apply every rename in one call instead of renaming inside the loop
ageData.rename(columns=renameMap, inplace=True)
#show the recoded table
ageData.head()