In [1]:
from jyquickhelper import add_notebook_menu
add_notebook_menu()
Out[1]:
In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
In [3]:
def remove_na_rows(df, cols=None):
"""
remove row with NaN in any column
"""
if cols is None:
cols = df.columns
return df[np.logical_not(np.any(df[cols].isnull().values, axis=1))]
def trans_country_name(x):
"""
translate country name to code (2-char)
"""
try:
country_name = x.split(',')[0]
if country_name in dictCountryName2Code:
return dictCountryName2Code[country_name]
except:
return None
def parse_additives(x):
"""
parse additives column values into a list
"""
try:
dict = {}
for item in x.split(']'):
token = item.split('->')[0].replace("[", "").strip()
if token: dict[token] = 1
return [len(dict.keys()), sorted(dict.keys())]
except:
return None
def trans_serving_size(x):
"""
pick up gram value from serving_size column
"""
try:
serving_g = float((x.split('(')[0]).replace("g", "").strip())
return serving_g
except:
return 0.0
def distplot2x2(cols):
"""
make dist. plot on 2x2 grid for up to 4 features
"""
sb.set(style="white", palette="muted")
f, axes = plt.subplots(2, 2, figsize=(7, 7), sharex=False)
b, g, r, p = sb.color_palette("muted", 4)
colors = [b, g, r, p]
axis = [axes[0,0],axes[0,1],axes[1,0],axes[1,1]]
for n,col in enumerate(cols):
sb.distplot(food[col].dropna(), hist=True, rug=False, color=colors[n], ax=axis[n])
https://www.kaggle.com/openfoodfacts/world-food-facts
This dataset contains Food Nutrition Fact for 100 000+ food products from 150 countries.
In [4]:
food = pd.read_excel("data/openfoodfacts_5k.xlsx")
In [5]:
food.shape
Out[5]:
In [6]:
food.columns
Out[6]:
In [7]:
food.head()
Out[7]:
In [8]:
# columns_to_keep = ['code','product_name','created_datetime','brands','categories','origins','manufacturing_places','energy_100g','fat_100g','saturated-fat_100g','trans-fat_100g','cholesterol_100g','carbohydrates_100g','sugars_100g','omega-3-fat_100g','omega-6-fat_100g','fiber_100g','proteins_100g','salt_100g','sodium_100g','alcohol_100g','vitamin-a_100g','vitamin-c_100g','potassium_100g','chloride_100g','calcium_100g','phosphorus_100g','iron_100g','magnesium_100g','zinc_100g','copper_100g','manganese_100g','fluoride_100g','ingredients_text','countries','countries_en','serving_size','additives','nutrition_grade_fr','nutrition_grade_uk','nutrition-score-fr_100g','nutrition-score-uk_100g','url','image_url','image_small_url']
columns_to_keep = ['code','product_name','created_datetime','brands','energy_100g','fat_100g','saturated-fat_100g','trans-fat_100g','cholesterol_100g','carbohydrates_100g','sugars_100g','fiber_100g','proteins_100g','salt_100g','sodium_100g','vitamin-a_100g','vitamin-c_100g','calcium_100g','iron_100g','ingredients_text','countries','countries_en','serving_size','additives','nutrition_grade_fr','nutrition-score-fr_100g','url']
In [9]:
food = food[columns_to_keep]
In [10]:
columns_numeric_all = ['energy_100g','fat_100g','saturated-fat_100g','trans-fat_100g','cholesterol_100g','carbohydrates_100g','sugars_100g','omega-3-fat_100g','omega-6-fat_100g','fiber_100g','proteins_100g','salt_100g','sodium_100g','alcohol_100g','vitamin-a_100g','vitamin-c_100g','potassium_100g','chloride_100g','calcium_100g','phosphorus_100g','iron_100g','magnesium_100g','zinc_100g','copper_100g','manganese_100g','fluoride_100g','nutrition-score-fr_100g','nutrition-score-uk_100g']
columns_numeric = set(columns_numeric_all) & set(columns_to_keep)
columns_categoric = set(columns_to_keep) - set(columns_numeric)
# turn off
if False:
for col in columns_numeric:
if not col in ['nutrition-score-fr_100g', 'nutrition-score-uk_100g']:
food[col] = food[col].fillna(0)
for col in columns_categoric:
if col in ['nutrition_grade_fr', 'nutrition_grade_uk']:
food[col] = food[col].fillna('-')
else:
food[col] = food[col].fillna('')
In [11]:
# list column names: categoric vs numeric
columns_categoric, columns_numeric
Out[11]:
In [12]:
food.head(3)
Out[12]:
In [13]:
# standardize country
country_lov = pd.read_excel("../../0.0-Datasets/country_cd.xlsx")
# country_lov.shape
# country_lov.head()
# country_lov[country_lov['GEOGRAPHY_NAME'].str.startswith('United')].head()
# country_lov['GEOGRAPHY_CODE'].tolist()
# country_lov.ix[0,'GEOGRAPHY_CODE'], country_lov.ix[0,'GEOGRAPHY_NAME']
# create 2 dictionaries
dictCountryCode2Name = {}
dictCountryName2Code = {}
for i in country_lov.index:
dictCountryCode2Name[country_lov.ix[i,'GEOGRAPHY_CODE']] = country_lov.ix[i,'GEOGRAPHY_NAME']
dictCountryName2Code[country_lov.ix[i,'GEOGRAPHY_NAME']] = country_lov.ix[i,'GEOGRAPHY_CODE']
In [14]:
# add Country_Code column - pick 1st country from list
food['countries_en'] = food['countries_en'].fillna('')
food['country_code'] = food['countries_en'].apply(str).apply(lambda x: trans_country_name(x))
# add country_code to columns_categoric set
columns_categoric.add('country_code')
In [15]:
# verify bad country
food[food['country_code'] != food['countries']][['country_code', 'countries']].head(20)
Out[15]:
In [16]:
food['ingredients_text'].head() # leave as is
Out[16]:
In [17]:
# add serving_size in gram column
food['serving_size'].head(10)
Out[17]:
In [18]:
food['serving_size'] = food['serving_size'].fillna('')
food['serving_size_gram'] = food['serving_size'].apply(lambda x: trans_serving_size(x))
# add serving_size_gram
columns_numeric.add('serving_size_gram')
In [19]:
food[['serving_size_gram', 'serving_size']].head()
Out[19]:
In [20]:
food['additives'].head(10)
Out[20]:
In [21]:
food['additives'] = food['additives'].fillna('')
food['additive_list'] = food['additives'].apply(lambda x: parse_additives(x))
# add additive_list
columns_categoric.add('additive_list')
In [22]:
food[['additive_list', 'additives']].head()
Out[22]:
In [23]:
food["creation_date"] = food["created_datetime"].apply(str).apply(lambda x: x[:x.find("T")])
food["year_added"] = food["created_datetime"].dropna().apply(str).apply(lambda x: int(x[:x.find("-")]))
# add creation_date
columns_categoric.add('creation_date')
columns_numeric.add('year_added')
In [24]:
food[['created_datetime', 'creation_date', 'year_added']].head()
Out[24]:
In [25]:
# food['product_name']
food.head(3)
Out[25]:
In [26]:
columns_numeric
Out[26]:
In [27]:
year_added = food['year_added'].value_counts().sort_index()
#year_added
year_i = [int(x) for x in year_added.index]
x_pos = np.arange(len(year_i))
year_added.plot.bar()
plt.xticks(x_pos, year_i)
plt.title("Food labels added per year")
Out[27]:
In [28]:
TOP_N = 10
dist_country = food['country_code'].value_counts()
top_country = dist_country[:TOP_N][::-1]
country_s = [dictCountryCode2Name[x] for x in top_country.index]
y_pos = np.arange(len(country_s))
top_country.plot.barh()
plt.yticks(y_pos, country_s)
plt.title("Top {} Country Distribution".format(TOP_N))
Out[28]:
In [29]:
# dist_nutri_grade = food['nutrition_grade_uk'].value_counts()
# no value
dist_nutri_grade = food['nutrition_grade_fr'].value_counts()
dist_nutri_grade.sort_index(ascending=False).plot.barh()
plt.title("Nutrition Grade Dist")
Out[29]:
In [30]:
food['nutrition-score-fr_100g'].dropna().plot.hist()
plt.title("{} Dist.".format("Nutri-Score"))
Out[30]:
In [31]:
food['serving_size_gram'].dropna().plot.hist()
plt.title("{} Dist.".format("Serving Size (g)"))
Out[31]:
In [32]:
distplot2x2([ 'energy_100g','fat_100g','saturated-fat_100g','trans-fat_100g'])
In [33]:
distplot2x2(['carbohydrates_100g', 'cholesterol_100g', 'proteins_100g', 'fiber_100g'])
In [34]:
distplot2x2([ 'sugars_100g', 'salt_100g', 'vitamin-a_100g', 'vitamin-c_100g'])
In [35]:
distplot2x2(['calcium_100g', 'iron_100g', 'sodium_100g'])
In [36]:
df = food[food["country_code"].isin(['US','FR'])][['energy_100g', 'carbohydrates_100g', 'sugars_100g','country_code']]
df = remove_na_rows(df)
df.head()
Out[36]:
In [37]:
sb.pairplot(df, hue="country_code", size=2.5)
Out[37]:
In [42]:
# prepare a small dataframe for ['US', 'FR']
df2 = food[food["country_code"].isin(['US','FR'])][['energy_100g', 'sugars_100g','country_code','nutrition_grade_fr']]
df2 = df2[df2["nutrition_grade_fr"].isin(['a','b','c','d','e'])]
df2 = df2.sort_values(by="nutrition_grade_fr")
# df2.head()
# create a grid of scatter plot
g = sb.FacetGrid(df2, row="nutrition_grade_fr", col="country_code", margin_titles=True)
g.map(plt.scatter, "sugars_100g", "energy_100g", color="steelblue")
g.set(xlim=(0, 100), ylim=(0, 3000))
Out[42]:
In [ ]: