In [1]:
import numpy as np
import pandas as pd
import utilities as utils
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv("data/food.csv")


/Users/Jared/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (0,3,5,27,36) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [3]:
print data.shape
data.head()


(65503, 159)
Out[3]:
code url creator created_t created_datetime last_modified_t last_modified_datetime product_name generic_name quantity ... caffeine_100g taurine_100g ph_100g fruits_vegetables_nuts_100g collagen_meat_protein_ratio_100g cocoa_100g chlorophyl_100g carbon_footprint_100g nutrition_score_fr_100g nutrition_score_uk_100g
0 000000000000012866 http://world-en.openfoodfacts.org/product/0000... date-limite-app 1447004364 2015-11-08T17:39:24Z 1447004364 2015-11-08T17:39:24Z Poêlée à la sarladaise NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 0000000024600 http://world-en.openfoodfacts.org/product/0000... date-limite-app 1434530704 2015-06-17T08:45:04Z 1434535914 2015-06-17T10:11:54Z Filet de bœuf NaN 2.46 kg ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 0000000036252 http://world-en.openfoodfacts.org/product/0000... tacinte 1422221701 2015-01-25T21:35:01Z 1422221855 2015-01-25T21:37:35Z Lion Peanut x2 NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 0000000039259 http://world-en.openfoodfacts.org/product/0000... tacinte 1422221773 2015-01-25T21:36:13Z 1422221926 2015-01-25T21:38:46Z Twix x2 NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 0000000039529 http://world-en.openfoodfacts.org/product/0000... teolemon 1420147051 2015-01-01T21:17:31Z 1439141740 2015-08-09T17:35:40Z Pack de 2 Twix NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 159 columns


In [4]:
usa_data = utils.getByCountry(data, 'United States')
print usa_data.shape


(2508, 159)

In [5]:
usa_data.describe()


Out[5]:
no_nutriments additives_n ingredients_from_palm_oil_n ingredients_from_palm_oil ingredients_that_may_be_from_palm_oil_n ingredients_that_may_be_from_palm_oil nutrition_grade_uk energy_100g energy_from_fat_100g fat_100g ... caffeine_100g taurine_100g ph_100g fruits_vegetables_nuts_100g collagen_meat_protein_ratio_100g cocoa_100g chlorophyl_100g carbon_footprint_100g nutrition_score_fr_100g nutrition_score_uk_100g
count 0.0 1052.000000 1052.000000 0.0 1052.000000 0.0 0.0 1189.000000 727.000000 1202.000000 ... 4.000000 0.0 0.0 3.000000 0.0 1.0 0.0 2.000000 1026.000000 1026.000000
mean NaN 2.180608 0.000951 NaN 0.021863 NaN NaN 1176.289537 579.689216 13.638052 ... 0.013477 NaN NaN 33.333333 NaN 60.0 NaN 68.500000 9.060429 8.935673
std NaN 2.779168 0.030831 NaN 0.146306 NaN NaN 834.188753 696.134198 18.230727 ... 0.005561 NaN NaN 31.754265 NaN NaN NaN 79.903066 9.135097 9.348419
min NaN 0.000000 0.000000 NaN 0.000000 NaN NaN 0.000000 0.000000 0.000000 ... 0.007830 NaN NaN 15.000000 NaN 60.0 NaN 12.000000 -10.000000 -10.000000
25% NaN 0.000000 0.000000 NaN 0.000000 NaN NaN 328.000000 43.800000 0.000000 ... 0.009143 NaN NaN 15.000000 NaN 60.0 NaN 40.250000 0.000000 0.000000
50% NaN 1.000000 0.000000 NaN 0.000000 NaN NaN 1290.000000 300.000000 6.250000 ... 0.013790 NaN NaN 15.000000 NaN 60.0 NaN 68.500000 10.000000 9.000000
75% NaN 3.000000 0.000000 NaN 0.000000 NaN NaN 1790.000000 917.000000 21.400000 ... 0.018125 NaN NaN 42.500000 NaN 60.0 NaN 96.750000 16.000000 17.000000
max NaN 17.000000 1.000000 NaN 1.000000 NaN NaN 3766.000000 3740.000000 100.000000 ... 0.018500 NaN NaN 70.000000 NaN 60.0 NaN 125.000000 29.000000 29.000000

8 rows × 103 columns

What is the most sugary food in the USA?


In [6]:
sugar = usa_data.sugars_100g[usa_data.sugars_100g.notnull()]
most_sugar = sugar.sort_values(ascending=False)
usa_data[usa_data.sugars_100g >= most_sugar.iloc[0]]


Out[6]:
code url creator created_t created_datetime last_modified_t last_modified_datetime product_name generic_name quantity ... caffeine_100g taurine_100g ph_100g fruits_vegetables_nuts_100g collagen_meat_protein_ratio_100g cocoa_100g chlorophyl_100g carbon_footprint_100g nutrition_score_fr_100g nutrition_score_uk_100g
47379 4902124680235 http://world-en.openfoodfacts.org/product/4902... stephane 1438876475 2015-08-06T15:54:35Z 1439069028 2015-08-08T21:23:48Z Super Lemon Lemon candy 2.92 oz (83 g) ... NaN NaN NaN NaN NaN NaN NaN NaN 14.0 14.0

1 rows × 159 columns

Compare countries by average of different nutriments


In [7]:
unique_countries = utils.getUniqueCountries(data)

In [8]:
sugar_avg = utils.compareCountriesByNutrimentAverage(data, unique_countries, 'sugars_100g')
p = sugar_avg.plot(kind='bar', legend=False, title='Average Sugars_100g By Country (w/ Sample Size > 30)')
p.set_xlabel("Country")
p.set_ylabel("Grams")
p


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x102081610>

In [14]:
energy_avg = utils.compareCountriesByNutrimentAverage(data, unique_countries, 'energy_100g')
p = energy_avg.plot(kind='bar', legend=False, title='Average Energy_100g By Country (w/ Sample Size > 30)')
p.set_xlabel("Country")
p.set_ylabel("Grams")
p


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x129e0a0d0>

In [15]:
sodium_avg = utils.compareCountriesByNutrimentAverage(data, unique_countries, 'sodium_100g')
p = sodium_avg.plot(kind='bar', legend=False, title='Average Sodium_100g By Country (w/ Sample Size > 30)')
p.set_xlabel("Country")
p.set_ylabel("Grams")
p


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x12a825c90>

In [16]:
fat_100g = utils.compareCountriesByNutrimentAverage(data, unique_countries, 'fat_100g')
p = fat_100g.plot(kind='bar', legend=False, title='Average Fat_100g By Country (w/ Sample Size > 30)')
p.set_xlabel("Country")
p.set_ylabel("Grams")
p


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x12964f650>

In [17]:
proteins_100g = utils.compareCountriesByNutrimentAverage(data, unique_countries, 'proteins_100g')
p = proteins_100g.plot(kind='bar', legend=False, title='Average Proteins_100g By Country (w/ Sample Size > 30)')
p.set_xlabel("Country")
p.set_ylabel("Grams")
p


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x12ad55450>

In [18]:
carbohydrates_100g = utils.compareCountriesByNutrimentAverage(data, unique_countries, 'carbohydrates_100g')
p = carbohydrates_100g.plot(kind='bar', legend=False, title='Average Carbohydrates_100g By Country (w/ Sample Size > 30)')
p.set_xlabel("Country")
p.set_ylabel("Grams")
p


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x12b29c390>

In [ ]: