In [1]:
""" This file contains code used to analyze the
UK Food Nutrient Databank, the information for which can be found
at the above links.
The following packages are required to run this code.
"""
from __future__ import print_function, division
import pandas as pd
import sys
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
%matplotlib inline
import seaborn as sns
from collections import defaultdict, Counter
import statsmodels.formula.api as smf
In [2]:
def ReadProximates(path='dietary.xls', sheet_name='Proximates'):
    """ Reads the proximates sheet from the Excel Spreadsheet downloaded from the databank.
    Adds one cleaned numeric column per macronutrient ('Water', 'Protein', 'Fat',
    'Carbohydrate', 'Sugars'); entries that cannot be parsed as numbers become 0.
    The raw '... (g)' columns are left untouched.
    Arguments:
    path = location of the Excel workbook (defaults to 'dietary.xls')
    sheet_name = name of the sheet to read (defaults to 'Proximates')
    Returns: cleaned DataFrame
    """
    # (raw spreadsheet column, cleaned numeric column), in the order the
    # cleaned columns are appended to the frame.
    column_pairs = [
        ('Water (g)', 'Water'),
        ('Protein (g)', 'Protein'),
        ('Fat (g)', 'Fat'),
        ('Carbohydrate (g)', 'Carbohydrate'),
        ('Total sugars (g)', 'Sugars'),
    ]
    # The keyword is `sheet_name`; the older `sheetname` spelling is rejected
    # by current pandas releases.
    df = pd.read_excel(path, sheet_name=sheet_name)
    for raw_name, clean_name in column_pairs:
        # errors='coerce' turns unparseable entries into NaN, which is then zeroed.
        df[clean_name] = pd.to_numeric(df[raw_name], errors='coerce').fillna(0)
    return df
In [3]:
# Load the cleaned proximates table once; later cells slice `tester` by food group.
tester = ReadProximates()
In [4]:
# First-glance settings for plotting the food groups on log-scaled axes.
# The groups do not appear to be particularly differentiated, and log
# coordinates are better for visualization in this instance.
x_vals, y_vals, z_vals = 'Protein', 'Carbohydrate', 'Fat'

# Group-code prefix -> [display name, matplotlib color name].
food_group_dict = {
    'A': ['Cereals', 'peru'],
    'B': ['Dairy', 'beige'],
    'C': ['Egg', 'paleturquoise'],
    'D': ['Vegetable', 'darkolivegreen'],
    'F': ['Fruit', 'firebrick'],
    'G': ['Nuts', 'saddlebrown'],
    'J': ['Fish', 'slategray'],
    'M': ['Meat', 'indianred'],
    'O': ['Fat', 'khaki'],
}
In [5]:
# 2D scatter of every food group on log-log axes.
ax = plt.subplot(111)
for prefix, (group_name, group_color) in food_group_dict.items():
    group_rows = tester[tester.Group.str.startswith(prefix, na=False)]
    # The third positional argument of `scatter` is the marker size, so the
    # z_vals column sets the size of each marker here.
    # NOTE(review): confirm this is intended and not a leftover from a 3D call.
    ax.scatter(
        group_rows[x_vals],
        group_rows[y_vals],
        s=group_rows[z_vals],
        color=group_color,
        label=group_name,
    )
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel(x_vals + ' (g)')
ax.set_ylabel(y_vals + ' (g)')
ax.legend()
Out[5]:
In [6]:
from mpl_toolkits import mplot3d
# 3D view of the same food groups. The logarithm is applied to the data itself
# rather than to the axis scale, so the axis labels read "log g".
ax = plt.subplot(projection='3d')
for prefix, (group_name, group_color) in food_group_dict.items():
    group_rows = tester[tester.Group.str.startswith(prefix, na=False)]
    log_x = np.log(group_rows[x_vals])
    log_y = np.log(group_rows[y_vals])
    log_z = np.log(group_rows[z_vals])
    ax.scatter3D(log_x, log_y, log_z, color=group_color, label=group_name)
ax.set_xlabel(x_vals + ' (log g)')
ax.set_ylabel(y_vals + ' (log g)')
ax.set_zlabel(z_vals + ' (log g)')
# Anchor the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
Out[6]:
In [7]:
def ThreeDPlot(pred_cat, actual_cat, ax, actual_label, colors = ('firebrick', 'peru')):
    """ Creates a 3D log log plot on the requested subplot.
    Arguments:
    pred_cat = predicted dataframe for a category (needs Protein, Carbs and Fat columns)
    actual_cat = dataframe of the real category (needs Protein, Carbohydrate and Fat columns)
    ax = plt axis instance (3D projection)
    actual_label = string with label for the actual category
    colors = sequence with two entries of strings for color names:
             first for the predicted group, second for the actual category
    Returns: None; the plot is drawn on `ax`.
    """
    pred_color, actual_color = colors[0], colors[1]
    # np.log maps entries equal to 0 to -inf.
    # NOTE(review): confirm how such points should be treated in the plot.
    ax.scatter3D(np.log(pred_cat.Protein), np.log(pred_cat.Carbs), np.log(pred_cat.Fat),
                 c=pred_color, label='Predicted Group')
    ax.scatter3D(np.log(actual_cat.Protein), np.log(actual_cat.Carbohydrate), np.log(actual_cat.Fat),
                 c=actual_color, label=actual_label, alpha=.5)
    ax.view_init(elev=10, azim=45)
    ax.set_xlabel('Protein (log g)')
    ax.set_ylabel('Carbohydrate (log g)')
    ax.set_zlabel('Fat (log g)')
    # Attach the legend to the axis that was passed in, not to whichever
    # axis pyplot currently considers active.
    ax.legend()
In [8]:
# Create the categories for the different food pyramid groups.
# Labels were assigned by attempting to find the best match
# with the generated groups.
# `.assign` returns a new frame, so the Label column is added to an independent
# copy instead of being written into a slice of `tester`.
# Labels are kept as floats (3.0 rather than 3).
cereals = tester[tester.Group.str.startswith('A', na=False)].assign(Label=3.0)
fruits = tester[tester.Group.str.startswith('F', na=False)].assign(Label=4.0)
veggies = tester[tester.Group.str.startswith('D', na=False)].assign(Label=0.0)
dairy = tester[tester.Group.str.startswith('B', na=False)].assign(Label=2.0)
oils = tester[tester.Group.str.startswith('O', na=False)].assign(Label=5.0)
# Fish ('J') and meat ('M') are merged into a single meats category.
m1 = tester[tester.Group.str.startswith('J', na=False)]
m2 = tester[tester.Group.str.startswith('M', na=False)]
meats = pd.concat([m1, m2]).assign(Label=1.0)
# Create a dataframe using only these categories (drops nuts, eggs, soups/misc, etc)
all_these = pd.concat([cereals, fruits, veggies, dairy, oils, meats])
In [9]:
from sklearn.cluster import KMeans
# Selects the appropriate macronutrient columns to feed to the kmeans algorithm
water = pd.Series(all_these.Water, name='Water')
protein = pd.Series(all_these.Protein, name='Protein')
fat = pd.Series(all_these.Fat, name='Fat')
# Note the rename: the Carbohydrate column is called 'Carbs' inside X.
carbs = pd.Series(all_these.Carbohydrate, name='Carbs')
sugars = pd.Series(all_these['Sugars'], name='Sugars')
# Create a new DataFrame using only the macronutrient columns
X = pd.concat([water, protein, fat, carbs, sugars], axis=1)
# fillna returns a new frame, so the result must be kept for the fill to apply.
X = X.fillna(0)
# Fixed random_state so the cluster ids are the same on every run.
kmeans = KMeans(n_clusters=6, random_state=0)
# Fit and predict on the same rows so every row of X gets a cluster id.
kmeans.fit(X)
y_kmeans = kmeans.predict(X)
In [10]:
# Scatter all foods in log space, colored by the cluster id kmeans assigned.
ax = plt.subplot(projection='3d')
log_protein = np.log(X.Protein)
log_carbs = np.log(X.Carbs)
log_fat = np.log(X.Fat)
ax.scatter3D(log_protein, log_carbs, log_fat, c=y_kmeans)
ax.view_init(elev=10, azim=45)
ax.set_xlabel('Protein (log g)')
ax.set_ylabel('Carbohydrate (log g)')
ax.set_zlabel('Fat (log g)')
Out[10]:
In [11]:
# Create a way to select the categories
# Wrap the kmeans output in a one-column frame (the column is named 0) that
# shares X's index, converted to float.
predicted_labels = pd.DataFrame(y_kmeans, index=X.index).astype(float)
# Store the cluster id on X so rows can be filtered by cluster.
X['predictions'] = predicted_labels
In [12]:
# Split X into one frame per predicted cluster id (0 through 5) for individual analysis.
labeled0, labeled1, labeled2, labeled3, labeled4, labeled5 = [
    X[X.predictions == cluster_id] for cluster_id in range(6)
]
The following are two examples, plotted on a 3D log scale. For the rest of the comparisons, see the bottom of the notebook.
In [13]:
# Example: predicted cluster 1 against the actual meats category.
ax = plt.subplot(projection='3d')
ThreeDPlot(
    pred_cat=labeled1,
    actual_cat=meats,
    ax=ax,
    actual_label='Meats',
    colors=['firebrick', 'slategray'],
)
In [14]:
# Example: predicted cluster 4 against the actual fruits category.
ax = plt.subplot(projection='3d')
ThreeDPlot(
    pred_cat=labeled4,
    actual_cat=fruits,
    ax=ax,
    actual_label='Fruits',
    colors=['firebrick', 'purple'],
)
In [15]:
# Fraction of foods whose cluster id equals their assigned group label.
# Use this to generate an accuracy score. Ours was 53%
from sklearn.metrics import accuracy_score
accuracy_score(y_true=all_these.Label, y_pred=predicted_labels)
Out[15]:
In [16]:
# Look at confusion matrix for some idea of accuracy. Meats has the highest rate of matching.
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(y_true=all_these.Label, y_pred=predicted_labels)
# Transposed so that true labels run along x and predicted labels along y.
sns.heatmap(mat.transpose(), square=True)
plt.xlabel('true label')
plt.ylabel('predicted label')
# Label codes:
#   0 = Veggies
#   1 = Meats
#   2 = Dairy
#   3 = Cereals
#   4 = Fruits
#   5 = Oils
Out[16]:
In [17]:
# Record the predicted cluster id next to each food's assigned label, and
# flag the rows where the two agree.
all_these['guess'] = predicted_labels[0]
all_these['correct_guess'] = all_these['Label'] == all_these['guess']
In [45]:
def HowMatched3D(df, label_int, actual_label, ax=None):
    """ Shows on a 3D log log plot how one predicted group lines up with one actual category.
    Prints the number of foods in each of three sets and plots each set in its own color:
    in the group and in the category (purple), in the group but not in the
    category (crimson), in the category but not in the group (darkgreen).
    Arguments:
    df = dataframe with Protein, Carbohydrate, Fat, Label, guess and correct_guess columns
    label_int = number used both as the category's Label and as the matching group's guess
    actual_label = string with label for the actual category
    ax = optional plt axis instance (3D projection); a new one is created when omitted
    Returns: None; the plot is drawn on the axis.
    """
    if ax is None:
        ax = plt.subplot(projection='3d')
    TP = df[(df.Label == label_int) & (df.correct_guess == True)]
    FP = df[(df.guess == label_int) & (df.correct_guess == False)]
    FN = df[(df.Label == label_int) & (df.correct_guess == False)]
    print('Matches:',len(TP), 'In Group, is not '+actual_label+':',len(FP), 'Not in Group, is '+actual_label+':',len(FN))
    # (rows, color, legend text) for each of the three sets, in plotting order.
    layers = [
        (TP, 'purple', 'In Group, is '+actual_label),
        (FP, 'crimson', 'In Group, is not '+actual_label),
        (FN, 'darkgreen', 'Not in Group, is '+actual_label),
    ]
    for rows, color, legend_text in layers:
        # np.log maps entries equal to 0 to -inf.
        # NOTE(review): confirm how such points should be treated in the plot.
        ax.scatter3D(np.log(rows.Protein), np.log(rows.Carbohydrate), np.log(rows.Fat),
                     c=color, label=legend_text)
    ax.view_init(elev=10, azim=45)
    ax.set_xlabel('Protein (log g)')
    ax.set_ylabel('Carbohydrate (log g)')
    ax.set_zlabel('Fat (log g)')
    # Attach the legend to this axis rather than to pyplot's current axis.
    ax.legend()
In [46]:
# One match diagnostic per food group: (label code, display name).
for label_int, actual_label in [(0, 'Vegetable'), (1, 'Meat'), (2, 'Dairy'),
                                (3, 'Cereal'), (4, 'Fruit'), (5, 'Oil')]:
    # Start a fresh figure so every group gets its own plot within this one cell.
    plt.figure()
    HowMatched3D(all_these, label_int, actual_label)
    # Show now so each figure appears directly after its printed counts.
    plt.show()
In [53]:
# Average every numeric column per predicted cluster. numeric_only=True skips
# text columns such as the food name, which cannot be averaged.
all_these.groupby('guess').mean(numeric_only=True)
Out[53]:
In [25]:
# Foods whose cluster id matched their label.
# 1518 correct guesses, some examples of which follow
guessed_correctly = all_these.loc[all_these['correct_guess'] == True]
print(guessed_correctly['Food Name'])
In [26]:
# Foods whose cluster id did not match their label.
# 1334 incorrect guesses. Some examples follow.
guessed_wrong = all_these.loc[all_these['correct_guess'] == False]
print(guessed_wrong['Food Name'])
Beyond this point are pairs of 3D cluster plots, to compare each generated group with the food pyramid groups.
In [27]:
# Compare every predicted cluster with every food pyramid group, two groups
# side by side per figure.
# Each entry: (actual-group frame, legend label, color for the actual group).
comparison_groups = [
    (meats, 'Meats', 'slategray'),
    (cereals, 'Cereals', 'peru'),
    (fruits, 'Fruits', 'purple'),
    (veggies, 'Vegetables', 'darkolivegreen'),
    (dairy, 'Dairy', 'mintcream'),
    (oils, 'Oils', 'paleturquoise'),
]
for labeled in (labeled0, labeled1, labeled2, labeled3, labeled4, labeled5):
    for start in range(0, len(comparison_groups), 2):
        # A fresh figure per pair, so later pairs do not draw over earlier ones.
        plt.figure()
        pair = comparison_groups[start:start + 2]
        for position, (actual_cat, actual_label, actual_color) in enumerate(pair, 1):
            panel_ax = plt.subplot(1, 2, position, projection='3d')
            # ThreeDPlot adds the legend itself, so no extra legend call is needed.
            ThreeDPlot(labeled, actual_cat, panel_ax, actual_label, ['firebrick', actual_color])
        plt.show()
Out[44]:
In [ ]: