In [14]:
import numpy
import pandas
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn
import statistics
# Show floats with two decimal places. (Original note: this setting works
# around display-format run-time errors.)
def _format_two_decimals(value):
    """Format a number with exactly two digits after the decimal point."""
    return '%.2f' % value


pandas.set_option('display.float_format', _format_two_decimals)

# Load the data set from CSV.
data = pandas.read_csv('~/dev/coursera/separatedData.csv')
In [15]:
# Convert the analysis columns to numeric dtype; values that cannot be parsed
# become NaN (errors='coerce').
for column in ("breastCancer100th", "meanSugarPerson", "meanFoodPerson", "meanCholesterol"):
    data[column] = pandas.to_numeric(data[column], errors='coerce')
In [16]:
# Listwise deletion: keep only the four analysis columns and drop every row
# that has a missing value in any of them.
# .copy() makes sub1 an independent DataFrame, so the derived columns assigned
# to it afterwards are written to sub1 itself and do not raise pandas'
# SettingWithCopyWarning.
sub1 = data[['breastCancer100th', 'meanSugarPerson', 'meanFoodPerson', 'meanCholesterol']].dropna().copy()
# Threshold used by incidence_cancer: the mean of breastCancer100th over the
# rows retained in sub1.
meanIncidence = statistics.mean(sub1['breastCancer100th'])
def incidence_cancer(row):
    """Classify one row's breast-cancer incidence against the overall mean.

    Parameters
    ----------
    row : mapping-like
        Must provide the key 'breastCancer100th' (a DataFrame row when the
        function is used with ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int or None
        0 when the value is at or below the module-level ``meanIncidence``,
        1 when it is above it, and None when neither comparison holds
        (e.g. the value is NaN).
    """
    incidence = row['breastCancer100th']
    if incidence > meanIncidence:
        return 1  # above the average incidence
    if incidence <= meanIncidence:
        return 0  # at or below the average incidence
    return None
# Add the binary response variable incidence_cancer to sub1 (one value per row).
sub1['incidence_cancer'] = sub1.apply(incidence_cancer, axis=1)
# Categorize the meanSugarPerson values into the ordinal variable sugar_consumption.
def sugar_consumption(row):
    """Return the ordinal sugar-consumption band for one row.

    Bands (grams, per the thresholds below):
        0 -- desirable:       0 to 30 (inclusive)
        1 -- raised:          above 30, up to 60
        2 -- borderline high: above 60, up to 90
        3 -- high:            above 90, up to 120
        4 -- very high:       above 120

    Parameters
    ----------
    row : mapping-like
        Must provide the key 'meanSugarPerson'.

    Returns
    -------
    int or None
        The band number, or None when the value is negative or NaN
        (no band applies).
    """
    grams = row['meanSugarPerson']
    if grams < 0:
        return None  # negative consumption is not a valid measurement
    # A value of exactly 0 belongs to the lowest band; previously it matched
    # no branch and silently turned into a missing value.
    if grams <= 30:
        return 0
    if grams <= 60:
        return 1
    if grams <= 90:
        return 2
    if grams <= 120:
        return 3
    if grams > 120:
        return 4
    return None  # reached only when every comparison is False (NaN)
# Add the ordinal variable sugar_consumption to sub1 (one value per row).
sub1['sugar_consumption'] = sub1.apply(sugar_consumption, axis=1)
In [17]:
# Threshold for food_consumption: the mean of meanFoodPerson over the rows of sub1.
meanFood = statistics.mean(sub1['meanFoodPerson'])


def food_consumption(row):
    """Classify one row's food consumption against the overall mean.

    Parameters
    ----------
    row : mapping-like
        Must provide the key 'meanFoodPerson'.

    Returns
    -------
    int or None
        0 when the value is at or below the module-level ``meanFood``,
        1 when it is above it, and None when neither comparison holds
        (e.g. the value is NaN).
    """
    amount = row['meanFoodPerson']
    if amount > meanFood:
        return 1  # above the average food consumption
    if amount <= meanFood:
        return 0  # at or below the average food consumption
    return None
In [18]:
# Add the binary variable food_consumption to sub1 (one value per row).
sub1['food_consumption'] = sub1.apply(food_consumption, axis=1)
# Categorize the meanCholesterol values into the ordinal variable cholesterol_blood.
def cholesterol_blood(row):
    """Return the ordinal blood-cholesterol band for one row.

    Bands (mmol/L, per the thresholds below):
        0 -- desirable:       5.2 or below
        1 -- borderline high: above 5.2, up to 6.2
        2 -- high:            above 6.2

    Parameters
    ----------
    row : mapping-like
        Must provide the key 'meanCholesterol'.

    Returns
    -------
    int or None
        The band number, or None when no comparison holds (NaN).
    """
    level = row['meanCholesterol']
    if level > 6.2:
        return 2
    if level > 5.2:
        return 1
    if level <= 5.2:
        return 0
    return None  # reached only when every comparison is False (NaN)
# Add the ordinal variable cholesterol_blood to sub1 (one value per row).
sub1['cholesterol_blood'] = sub1.apply(cholesterol_blood, axis=1)
In [19]:
# Logistic regression: model incidence_cancer from the three categorized
# explanatory variables, then print the fitted model's summary table.
logit_formula = 'incidence_cancer ~ sugar_consumption + food_consumption + cholesterol_blood'
lreg1 = smf.logit(formula=logit_formula, data=sub1).fit()
print(lreg1.summary())
In [20]:
# Odds ratios with their confidence intervals: exponentiate the fitted
# coefficients and the bounds returned by conf_int().
params = lreg1.params
conf = lreg1.conf_int()
conf.columns = ['Lower CI', 'Upper CI']
conf['OR'] = params
odds_ratio_table = numpy.exp(conf)
print(odds_ratio_table)