In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()
# load up the entries as environment variables
# (keys defined in .env become visible via os.environ in the cells below)
load_dotenv(dotenv_path)
Out[1]:
In [2]:
# Resolve the project directory layout from the .env entries loaded above.
# Use os.environ[...] rather than .get(...): a missing variable then fails
# fast with a KeyError naming the variable, instead of the cryptic
# "can only concatenate str and NoneType" TypeError that `str + None` raises.
PROJECT_DIR = os.path.dirname(dotenv_path)
RAW_DATA_DIR = PROJECT_DIR + os.environ["RAW_DATA_DIR"]
INTERIM_DATA_DIR = PROJECT_DIR + os.environ["INTERIM_DATA_DIR"]
# FILES is a whitespace-separated list of data file names.
files = os.environ["FILES"].split()
print("Project directory is : {0}".format(PROJECT_DIR))
print("Raw data directory is : {0}".format(RAW_DATA_DIR))
print("Interim directory is : {0}".format(INTERIM_DATA_DIR))
In [3]:
# The following jupyter notebook magic makes the plots appear in the notebook.
# If you run in batch mode, you have to save your plots as images.
# The following jupyter notebook magic makes the plots appear in the notebook.
# If you run in batch mode, you have to save your plots as images.
%matplotlib inline
# matplotlib.pyplot is traditionally imported as plt
import matplotlib.pyplot as plt
# numpy is imported as np
import numpy as np
# Pandas is traditionally imported as pd.
import pandas as pd
from pylab import rcParams
# some display options to size the figures. feel free to experiment
pd.set_option('display.max_columns', 25)
rcParams['figure.figsize'] = (17, 7)
Reading a CSV file is really easy in Pandas. There are several formats that Pandas can deal with.
Format Type | Data Description | Reader | Writer |
---|---|---|---|
text | CSV | read_csv | to_csv |
text | JSON | read_json | to_json |
text | HTML | read_html | to_html |
text | Local clipboard | read_clipboard | to_clipboard |
binary | MS Excel | read_excel | to_excel |
binary | HDF5 Format | read_hdf | to_hdf |
binary | Feather Format | read_feather | to_feather |
binary | Msgpack | read_msgpack | to_msgpack |
binary | Stata | read_stata | to_stata |
binary | SAS | read_sas | |
binary | Python Pickle Format | read_pickle | to_pickle |
SQL | SQL | read_sql | to_sql |
SQL | Google Big Query | read_gbq | to_gbq |
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5436776/pdf/oarrr-9-101.pdf
This article suggests a relationship between arthritis and serious psychological distress (SPD).
First we will look at the article to recreate the data set from the NIHS data we got in session 2.
We will use pd.read_csv(). As you will see, the Jupyter notebook prints out a very nice rendition of the resulting DataFrame object.
In [4]:
# Load the two NHIS tables: family-level and sample-adult-level records.
family = pd.read_csv('{0}/familyxx.csv'.format(RAW_DATA_DIR))
samadult = pd.read_csv('{0}/samadult.csv'.format(RAW_DATA_DIR))
In [5]:
# Start with a data frame to collect all the data in;
# each cell below derives one study variable and adds it as a column.
df = pd.DataFrame()
Individuals were determined to have SPD using the validated Kessler 6 (K6) scale.31,32 K6 scores are derived from responses to six questions asking how often in the past 30 days the individual felt “nervous”, “restless”, “hopeless”, “worthless”, “everything feels like an effort”, and “so sad that nothing cheers them up”, with responses ranging from 0 (none of the time) to 4 (all of the time). The responses for these six variables are summed to obtain the K6 score (maximum possible score of 24), and individuals with a score of ≥13 are considered to have SPD.
Corresponding columns:
ASINERV, ASIRSTLS, ASIHOPLS, ASIWTHLS, ASIEFFRT, ASISAD
In [6]:
# Calculate Kessler 6
# How often did you feel:
# nervous, restless, hopeless, worthless, everything is an effort, so sad nothing mattered.
# ASINERV, ASIRSTLS, ASIHOPLS, ASIWTHLS, ASIEFFRT, ASISAD
# Column names in samadult holding the six K6 items, in the article's order:
kessler_6_questions=['ASINERV', 'ASIRSTLS', 'ASIHOPLS', 'ASIWTHLS', 'ASIEFFRT', 'ASISAD']
# Raw NHIS coding of each item:
# 1 ALL of the time
# 2 MOST of the time
# 3 SOME of the time
# 4 A LITTLE of the time
# 5 NONE of the time
# 7 Refused
# 8 Not ascertained
# 9 Don't know
# These have to be encoded as (so higher = more frequent distress):
# 7, 8, 9 -> NaN
# 5 -> 0
# 4 -> 1
# 3 -> 2
# 2 -> 3
# 1 -> 4
In [7]:
# Recode each K6 item so that higher values mean more frequent distress;
# codes 7/8/9 (refused / not ascertained / don't know) fall through to None.
kessler_6_map = {1: 4, 2: 3, 3: 2, 4: 1, 5: 0}
kessler_6 = pd.DataFrame(
    {question: [kessler_6_map.get(answer) for answer in samadult[question]]
     for question in kessler_6_questions})
# K6 score is the row sum; a score of 13 or more flags serious
# psychological distress (SPD) per the article.
df['SPD'] = kessler_6.sum(axis=1) >= 13
df['SPD'] = np.where(df['SPD'], 'Yes', 'No')
del kessler_6
df.head(5)
Out[7]:
In [8]:
# Arthritis Status
# Arthritis status: 1 -> Yes, 2 -> No; any other code (unknown/refused) -> None.
arth_map = {1: 'Yes', 2: 'No'}
df['ARTH1'] = [arth_map.get(code) for code in samadult['ARTH1']]
From the article:
We created a chronic condition count based on the following eight nonarthritis chronic conditions: cancer (except nonmelanoma skin); heart condition (including coronary heart disease, angina, myocardial infarction, or any other heart condition); diabetes; hepatitis or liver condition; hypertension (on at least two different visits); respiratory conditions (current asthma, emphysema, or chronic bronchitis); stroke; and weak or failing kidneys, defined similar to the recommendations of Goodman et al.
From the NIHS file:
In [9]:
# Variables feeding the chronic condition count.
straight_chronic_condition_questions = ['CHDEV', 'DIBEV', 'HYPDIFV', 'KIDWKYR']
cancer_nonmelanoma_skin = ['CANEV', 'CNKIND22']  # CANEV minus CNKIND22
hep_liver = ['AHEP', 'LIVEV']
respiratory = ['AASMEV', 'EPHEV', 'CBRCHYR']
stroke = ['STREV', 'ALCHRC8']
# Temporary frame of per-condition boolean indicators.
# Treating every non-1 code (incl. unknown/refused) as False may be
# a bit too liberal.
chronic_ind = samadult[straight_chronic_condition_questions].eq(1)
In [10]:
# Cancer counts when diagnosed (CANEV == 1) unless it was nonmelanoma
# skin cancer (CNKIND22 == 1) — same result as the set-then-override
# approach, expressed as a single boolean expression.
chronic_ind['CANCER'] = (samadult['CANEV'] == 1) & (samadult['CNKIND22'] != 1)
In [11]:
# Hepatitis/liver, respiratory and stroke indicators: True when any of the
# contributing flags is reported (code 1), False otherwise.
chronic_ind['HEPLIVER'] = (samadult['AHEP'] == 1) | (samadult['LIVEV'] == 1)
chronic_ind['RESPIRATORY'] = ((samadult['AASMEV'] == 1)
                              | (samadult['EPHEV'] == 1)
                              | (samadult['CBRCHYR'] == 1))
chronic_ind['STROKE'] = (samadult['STREV'] == 1) | (samadult['ALCHRC8'] == 1)
chronic_ind.head()
Out[11]:
Now count the TRUE values over this dataframe.
Keep the values for 0, 1 and 2 and call everything else >=3
In [12]:
# Row-wise count of True condition flags gives the number of chronic
# conditions per person.
chronic_ind['CHRONIC_CT'] = chronic_ind.sum(axis=1)
# Keep counts 0, 1 and 2 as-is; collapse everything larger into '>=3'.
chron_map = {0: '0', 1: '1', 2: '2'}
df['CHRONIC_CT'] = [chron_map.get(count, '>=3')
                    for count in chronic_ind['CHRONIC_CT']]
del chronic_ind
df.head(10)
Out[12]:
In [13]:
# General health status: the study's Very Good/Excellent, Good, Poor/Fair
# grouping does not exist in this file. AHSTATYR only indicates whether
# health is better / worse / the same; we use it as a proxy.
status_map = {1: "Very Good", 2: "Poor", 3: "Good"}
df['GENERAL_HEALTH_STATUS'] = [status_map.get(code)
                               for code in samadult['AHSTATYR']]
In [14]:
# BMI bands. The cut points (2500 / 3000 / 9999) suggest BMI is stored with
# two implied decimals (2500 == 25.00) and 9999 means unknown — TODO confirm
# against the NHIS codebook. Values >= 9999 stay NaN (uncategorized).
bmi_values = samadult['BMI']
bmi_category = pd.Series(index=samadult.index, dtype=object)
bmi_category.loc[bmi_values < 2500] = '<25'
bmi_category.loc[(bmi_values >= 2500) & (bmi_values < 3000)] = '25<30'
bmi_category.loc[(bmi_values >= 3000) & (bmi_values < 9999)] = '>30'
df['BMI_C'] = bmi_category
del bmi_values, bmi_category
At least 150 Moderate or 75 Vigorous minutes per week. The questions are answered in units per day, per week, per month, or per year, and the file also provides them recoded to units per week; those units are either minutes or hours.
So we have to do some math to figure out if we get more than 150 moderate equivalent minutes.
Another interesting way to manipulate data, this time using 'apply' and a user defined function.
In [15]:
def determine_activity(row):
    """Classify one person's weekly physical activity.

    Converts the reported vigorous and moderate activity durations to
    moderate-equivalent minutes (vigorous minutes count double; a unit
    code of 2 means hours, so hours are also multiplied by 60) and
    compares the total to the 150-minute guideline.

    row: mapping with keys VIGLNGTP, VIGLNGNO, MODLNGTP, MODLNGNO.
    Returns 'Meets' or 'Does not meet'. Unit codes other than 1 or 2
    contribute nothing.
    """
    total_minutes = 0
    vig_unit = row['VIGLNGTP']
    if vig_unit == 1:            # minutes, doubled for moderate-equivalence
        total_minutes += row['VIGLNGNO'] * 2
    elif vig_unit == 2:          # hours: 60 minutes, doubled
        total_minutes += row['VIGLNGNO'] * 120
    mod_unit = row['MODLNGTP']
    if mod_unit == 1:            # minutes
        total_minutes += row['MODLNGNO']
    elif mod_unit == 2:          # hours
        total_minutes += row['MODLNGNO'] * 60
    if total_minutes >= 150:
        return 'Meets'
    return 'Does not meet'
# Copy just the four activity columns we need. (The original code first
# created an empty DataFrame here, which the next line immediately
# overwrote — that dead statement is removed.)
physical_activity = samadult[['VIGLNGNO', 'VIGLNGTP', 'MODLNGNO', 'MODLNGTP']].copy()
# Row-wise classification against the 150 moderate-equivalent-minute guideline.
physical_activity['ACTIVITY'] = physical_activity.apply(determine_activity, axis=1)
df['ACTIVITY'] = physical_activity['ACTIVITY']
del physical_activity
In [16]:
df.head(20)
Out[16]:
In [17]:
# Age groups used by the study. The sample adult file covers adults only,
# so the under-45 band is labeled 18-44.
age_values = samadult['AGE_P']
age_category = pd.Series(index=samadult.index, dtype=object)
age_category.loc[age_values < 45] = '18-44'
age_category.loc[(age_values >= 45) & (age_values < 65)] = '45-64'
age_category.loc[age_values >= 65] = '65-'
df['AGE_C'] = age_category
del age_values, age_category
In [18]:
# Sex: code 1 is Male; every other code is treated as Female.
df['SEX'] = np.where(samadult['SEX'] == 1, 'Male', 'Female')
In [19]:
# Race (RACERPI2). Not exactly a match with the study's categories;
# unknown codes map to None.
race_map = {1: 'White',
            2: 'Black/African American',
            3: 'AIAN',
            4: 'Asian',
            5: 'not releasable',
            6: 'Multiple'}
df['RACE'] = [race_map.get(code) for code in samadult['RACERPI2']]
Education Level and Employment status are not encoded as expected. Education level can't be found at all and employment status is a mix between workstatus and why did you not work last week. Which is an odd way to determine if someone is retired, or a student.
In [20]:
# Educational level:
# Less than high school
# High school diploma
# Some college or Associates degree
# College or greater
# Can't find it in data?
In [21]:
# Employment status: complex between workstatus and why not worked last week, logic is not described
# Maybe at least get "Out of Work", "Retired", "Other"?
In [22]:
# Marital status (R_MARITL), collapsed to the study's categories:
#   0 Under 14 years        -> Never Married
#   1-3 Married (any household arrangement) -> Married
#   4 Widowed               -> Widowed
#   5 Divorced, 6 Separated -> Divorced/Separated
#   7 Never married         -> Never Married
#   8 Living with partner   -> Living with Partner
#   9 Unknown               -> Never Married
marital_map = {}
for code in (0, 7, 9):
    marital_map[code] = "Never Married"
for code in (1, 2, 3):
    marital_map[code] = "Married"
marital_map[4] = "Widowed"
for code in (5, 6):
    marital_map[code] = "Divorced/Separated"
marital_map[8] = "Living with Partner"
df['MARITAL_STATUS'] = [marital_map.get(code, "Never Married")
                        for code in samadult['R_MARITL']]
In [23]:
# Functional limitation score: average the nine difficulty ratings and
# bucket the mean into None / Low / Medium / High.
fl_columns = ['FLWALK', 'FLCLIMB', 'FLSTAND', 'FLSIT', 'FLSTOOP',
              'FLREACH', 'FLGRASP', 'FLCARRY', 'FLPUSH']
fl_cols = samadult[fl_columns].copy()
for col in fl_columns:
    # BUG FIX: the original `fl_cols.loc[fl_cols[col] >= 6] = 0` omitted the
    # column label, so one unknown answer (code >= 6) zeroed the person's
    # ENTIRE row, wiping their valid answers. Only the unknown answer itself
    # should be recoded to 0.
    fl_cols.loc[fl_cols[col] >= 6, col] = 0
fl_cols['FL_AVG'] = fl_cols.mean(axis=1)
# Bucket the average into the study's four limitation levels.
fl_cols.loc[fl_cols['FL_AVG'] == 0, 'FUNC_LIMIT'] = 'None'
fl_cols.loc[(fl_cols['FL_AVG'] > 0) & (fl_cols['FL_AVG'] <= 1), 'FUNC_LIMIT'] = 'Low'
fl_cols.loc[(fl_cols['FL_AVG'] > 1) & (fl_cols['FL_AVG'] <= 2), 'FUNC_LIMIT'] = 'Medium'
fl_cols.loc[fl_cols['FL_AVG'] > 2, 'FUNC_LIMIT'] = 'High'
df['FUNC_LIMIT'] = fl_cols['FUNC_LIMIT']
del fl_cols
In [24]:
# Social participation restriction: difficulty or inability to shop, go to
# events, or participate in social activities without special equipment,
# per previously published analyses. Based on FLSHOP and FLSOCL, where
# codes 1-4 indicate some degree of difficulty.
restr_map = {1: "Yes", 2: "Yes", 3: "Yes", 4: "Yes"}
social_cols = pd.DataFrame()
social_cols['FLSHOP'] = [restr_map.get(x, 'No') for x in samadult['FLSHOP']]
social_cols['FLSOCL'] = [restr_map.get(x, 'No') for x in samadult['FLSOCL']]
# Restricted if either activity is limited; 'No' only when both are 'No'.
social_cols.loc[(social_cols['FLSHOP'] == 'Yes') | (social_cols['FLSOCL'] == 'Yes'), 'SOC_RESTR'] = 'Yes'
social_cols.loc[(social_cols['FLSHOP'] == 'No') & (social_cols['FLSOCL'] == 'No'), 'SOC_RESTR'] = 'No'
df['SOC_RESTR'] = social_cols['SOC_RESTR']
# Consistency fix: every other temporary frame in this notebook is deleted
# after use (kessler_6, chronic_ind, fl_cols, ...); this one was leaked.
del social_cols
In [25]:
# Could not afford mental health care, past 12 months (AHCAFYR2: 1=Yes, 2=No).
# Any code other than 1 (including unknown/refused) is treated as 'No'.
df['NOT_AFFORD'] = np.where(samadult['AHCAFYR2'] == 1, 'Yes', 'No')
# Seen a mental health professional, past 12 months (AHCSYR1: 1=Yes, 2=No).
df['SEEN_MENTAL_DR'] = np.where(samadult['AHCSYR1'] == 1, 'Yes', 'No')
In [26]:
df.head(34)
Out[26]:
In [27]:
# From familyxx get the poverty ratio (RAT_CAT4), collapsed to three bands.
fam_df = pd.DataFrame()
ratio_map = {}
for code in (1, 2, 3, 15):             # 0.00 - 0.99, incl. undetailed '<1.00'
    ratio_map[code] = '<1'
for code in (4, 5, 6, 7, 16, 96, 99):  # 1.00 - 1.99, plus undefinable/unknown
    ratio_map[code] = '1 to <2'
for code in (8, 9, 10, 11, 12, 13, 14, 17):  # 2.00 and over
    ratio_map[code] = '>=2'
fam_df['POV_RATIO'] = [ratio_map.get(code) for code in family['RAT_CAT4']]
In [28]:
# Health insurance coverage (FHICOVYN). The source distinguishes any
# private / public only / not covered; here we just use Yes (code 1)
# vs No, folding unknown/refused codes into 'No'.
fam_df['INSURANCE'] = np.where(family['FHICOVYN'] == 1, 'Yes', 'No')
In [29]:
# Household (HHX) and family (FMX) identifiers are the join keys between
# the sample-adult and family files.
for key in ('HHX', 'FMX'):
    df[key] = samadult[key]
    fam_df[key] = family[key]
Then you do a merge on the columns given in the on= parameter and — very important — specify a left join, so that you don't lose any people whose family record cannot be found.
In [30]:
# Left join keeps every sample adult even when no family record matches;
# the join keys are dropped once the merge is done.
joined_df = (df.merge(fam_df, on=['HHX', 'FMX'], how='left', sort=False)
               .drop(['HHX', 'FMX'], axis=1))
joined_df.head()
Out[30]:
In [31]:
# Persist the assembled study data set for the next stage of the pipeline.
# NOTE(review): the DataFrame index is written as the first CSV column;
# consider index=False if downstream readers don't expect it — confirm.
df = joined_df
df.to_csv('{0}/arthritis_study.csv'.format(INTERIM_DATA_DIR))