Cleaning


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Manufacturing places for products sold in France


In [2]:
# read data
data = pd.read_table('../en.openfoodfacts.org.products.csv', sep='\t')
print('Initial number of products: {}'.format(len(data)))
# removing rows with unknown countries and unknown manufacturing places
data.dropna(subset=['countries_en', 'manufacturing_places'], inplace=True)
print('Number of products with a known country of sale and manufacturing place: {}'.format(len(data)))
# keep only products sold in France
# (including products sold in France and other countries)
data = data[data.countries_en.str.contains('france', case=False)]
print('Number of products sold in France (and potentially other countries too): {}'.format(len(data)))


/home/guillaume/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (0,3,5,27,36) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Initial number of products: 80501
Number of products with a known country of sale and manufacturing place: 25215
Number of products sold in France (and potentially other countries too): 17683
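
The DtypeWarning above is pandas guessing column types chunk by chunk. If memory allows, the initial read could be done with low_memory=False (or explicit dtypes for the offending columns) to avoid it; a sketch of that alternative read, not executed here:

# alternative initial read that avoids the mixed-type warning (uses more memory)
raw = pd.read_table('../en.openfoodfacts.org.products.csv', sep='\t', low_memory=False)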

In [3]:
# number of distinct manufacturing places
len(data.manufacturing_places.unique())


Out[3]:
3119

In [4]:
# number of distinct manufacturing place tags
len(data.manufacturing_places_tags.unique())


Out[4]:
2985

In [5]:
data['manufacturing_country'] = data.manufacturing_places.str.lower()
special_char = {
    'é': 'e',
    'è': 'e',
    'ë': 'e',
    'ñ': 'n',
    'ï': 'i',
    'ô': 'o',
}
for sp_char, char in special_char.items():
    data['manufacturing_country'] = data.manufacturing_country.str.replace(sp_char, char)
    
to_replace = [# regex -> country; kept as a list rather than a dict because the order of application matters
    (r'france|francia|frankreich', 'france'),
    (r'allemagne|deutschland|germany', 'allemagne'),
    (r'portugal', 'portugal'),
    (r'belgique', 'belgique'),
    (r'suisse', 'suisse'),
    (r'ital[iy]', 'italie'),
    (r'ecosse|angleterre|england|royaume[ -]uni|united[ -]kingdom|grande[ -]bretagne', 'royaume-uni'),
    (r'irland', 'irlande'),
    (r'chin[ea]', 'chine'),
    (r'jap[ao]n', 'japon'),
    (r'tcheque', 'tcheque'),
    (r'espagne|espana', 'espagne'),
    (r'switzerland', 'suisse'),
    (r'coree', 'coree'),
    (r'grece', 'grece'),
    (r'sloveni', 'slovenie'),
    (r'pays[ -]bas', 'pays-bas'),
    (r'thailand', 'thailande'),
    (r'viet ?nam', 'viet nam'),
    (r'mexic', 'mexique'),    
    (r'bretagne', 'france'), # needs to be AFTER "grande[ -]bretagne" !!
    (r'corse|vendee|alsace|vezelay|provence|ardeche|savoie|auvergne|lozere|strasbourg', 'france'),
    (r'pyrenee|centre|grenade|vosges|normandie|villeneuve|clohars|marseille', 'france'),
    (r'.*u[\.]?s[\.]?a.*', 'etats-unis'),
    (r'.*la reunion.*', 'reunion'),
    (r'.*maurice.*', 'maurice'),
    (r'.*europe.*', 'union europeenne'), # keep at the end
    (r'.*u\.e[\.]?.*', 'union europeenne'), # keep at the end
]

for regex, country in to_replace:
    data.ix[data.manufacturing_country.str.contains(regex, case=False), 'manufacturing_country'] = country
    
# removing "union europeenne" since we do not know the precise country
data = data.ix[data.manufacturing_country != "union europeenne"]

# drop manufacturing countries with fewer than 3 products
mc_counts = pd.DataFrame(data.manufacturing_country.value_counts() >= 3)
data_merge = data.merge(mc_counts, how='left', left_on='manufacturing_country', right_index=True, suffixes=('', '_to_keep'))
data = data[data_merge.manufacturing_country_to_keep]
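
Because each rule overwrites manufacturing_country in place, later regexes see the already-replaced values; this is why the generic 'bretagne' rule must come after 'grande[ -]bretagne'. A toy illustration of the ordering (made-up values, not from the dataset):

toy = pd.Series(['grande bretagne', 'bretagne sud'])
for regex, country in [(r'grande[ -]bretagne', 'royaume-uni'), (r'bretagne', 'france')]:
    toy[toy.str.contains(regex, case=False)] = country
print(toy.tolist())  # ['royaume-uni', 'france'] -- reversing the rules would map both to 'france'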

In [6]:
data.manufacturing_country.value_counts()


Out[6]:
france            12957
italie             1009
allemagne           596
belgique            583
espagne             385
pays-bas            262
royaume-uni         198
suisse              150
thailande           135
chine               104
portugal             65
maroc                56
pologne              49
grece                48
suede                29
japon                29
danemark             27
viet nam             26
turquie              23
etats-unis           21
tunisie              19
autriche             17
taiwan               15
luxembourg           15
malaisie             14
coree                13
irlande              13
reunion              12
israel               12
tcheque              12
mexique              11
norvege              10
canada                9
cote d'ivoire         9
islande               9
maurice               9
hongrie               8
afrique du sud        8
lituanie              7
inde                  6
sri lanka             5
algerie               5
ghana                 5
perou                 4
philippines           4
equateur              4
bulgarie              3
roumanie              3
bresil                3
slovenie              3
liban                 3
liechtenstein         3
slovaquie             3
kenya                 3
madagascar            3
Name: manufacturing_country, dtype: int64
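
A quick bar chart of these counts shows how strongly the data is dominated by products manufactured in France (a sketch using the matplotlib import from the first cell):

counts = data.manufacturing_country.value_counts().head(15)
counts.plot(kind='barh', figsize=(8, 6), title='Top 15 manufacturing countries')
plt.xlabel('number of products')
plt.gca().invert_yaxis()  # put the largest count at the top
plt.show()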

In [7]:
country_code = pd.read_table('code_original.tsv', sep='\t', usecols=['alpha-3 ', 'Nom français '])
country_code = country_code.rename(columns={'alpha-3 ': 'code', 'Nom français ': 'country_fr'})
country_code.dropna(inplace=True)
country_code.country_fr = country_code.country_fr.str.strip()
country_code.code = country_code.code.str.strip()
country_code.country_fr = country_code.country_fr.str.lower()
country_code.country_fr = country_code.country_fr.str.split(',').map(lambda x: x[0]) # remove "republic of" and others
country_code = country_code.ix[country_code.code != "PRK"] # drop North Korea
special_char = {
    'é': 'e',
    'è': 'e',
    'ë': 'e',
    'ñ': 'n',
    'ï': 'i',
    'ô': 'o',
    'å': 'a',
    '’': '\''
}
for sp_char, char in special_char.items():
    country_code.country_fr = country_code.country_fr.str.replace(sp_char, char)

country_code.head()


Out[7]:
code country_fr
0 AFG afghanistan
1 ZAF afrique du sud
2 ALA aland
3 ALB albanie
4 DZA algerie
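
Since country_fr will be used as a join key below, it is worth checking that no French name appears twice, because a duplicate would multiply rows in the left join (a quick sanity-check sketch):

# expect an empty frame; any duplicated names would need manual de-duplication
print(country_code[country_code.country_fr.duplicated(keep=False)])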

In [8]:
set_country_fr = set(country_code.country_fr)
set_manufacturing_country = set(data.manufacturing_country)
print('Manufacturing countries not in the standard list:', set_manufacturing_country.difference(set_country_fr))


Manufacturing countries not in the standard list: set()

In [9]:
# join
data = data.merge(country_code, how='left', left_on='manufacturing_country', right_on='country_fr', suffixes=('', '_country'))
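
Since the set difference above was empty, every manufacturing_country should have matched a row of country_code; a quick check that no ISO code is missing after the join (a sketch):

print(data.code_country.isnull().sum())  # expected to be 0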

Categories


In [11]:
# normalize inconsistent category names
data.ix[data.pnns_groups_1 == 'fruits-and-vegetables', 'pnns_groups_1'] = 'Fruits and vegetables'
data.ix[data.pnns_groups_2 == 'fruits', 'pnns_groups_2'] = 'Fruits'
data.ix[data.pnns_groups_2 == 'vegetables', 'pnns_groups_2'] = 'Vegetables'
data.ix[data.pnns_groups_1 == 'sugary-snacks', 'pnns_groups_1'] = 'Sugary snacks'
data.ix[data.pnns_groups_2 == 'pastries', 'pnns_groups_2'] = 'Pastries'
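
The same normalization can be written more compactly with Series.replace, one dictionary per column (a sketch with the same mappings):

data.pnns_groups_1 = data.pnns_groups_1.replace(
    {'fruits-and-vegetables': 'Fruits and vegetables', 'sugary-snacks': 'Sugary snacks'})
data.pnns_groups_2 = data.pnns_groups_2.replace(
    {'fruits': 'Fruits', 'vegetables': 'Vegetables', 'pastries': 'Pastries'})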

In [12]:
data[['pnns_groups_1', 'pnns_groups_2', 'code']].groupby(['pnns_groups_1', 'pnns_groups_2']).count()


Out[12]:
code
pnns_groups_1 pnns_groups_2
Beverages Artificially sweetened beverages 26
Fruit juices 283
Fruit nectars 61
Non-sugared beverages 440
Sweetened beverages 268
Cereals and potatoes Bread 296
Breakfast cereals 178
Cereals 944
Legumes 99
Potatoes 30
Composite foods One-dish meals 1844
Pizza pies and quiche 90
Sandwich 241
Fat and sauces Dressings and sauces 612
Fats 307
Fish Meat Eggs Eggs 113
Fish and seafood 818
Meat 410
Processed meat 1290
Fruits and vegetables Dried fruits 43
Fruits 278
Soups 94
Vegetables 609
Milk and dairy products Cheese 1437
Dairy desserts 330
Ice cream 110
Milk and yogurt 886
Salty snacks Appetizers 388
Nuts 44
Salty and fatty products 1
Sugary snacks Biscuits and cakes 909
Chocolate products 540
Pastries 104
Sweets 623
unknown unknown 1713

Nutrients


In [13]:
subset = ['nutrition-score-uk_100g', 
         'sodium_100g',
         'salt_100g',
         'proteins_100g',
         'energy_100g',
         'saturated-fat_100g',
         'sugars_100g',
         'fat_100g',
         'carbohydrates_100g']
data2 = data.dropna(subset=subset)

In [14]:
n = len(data2)
nutritionals_null = pd.Series()
nutritionals_null_pct = pd.Series()
for col in data2:
    if "_100g" in col or 'additiv' in col:
        p = np.sum(data2[col].isnull())
        #print('{:40} : {:5} ({}%)'.format(col, p, p/n*100))
        nutritionals_null[col] = p
        nutritionals_null_pct[col] = p/n*100
nutritionals = pd.DataFrame({'Null': nutritionals_null, 'NullPct': nutritionals_null_pct})
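
The same per-column null counts can be computed without an explicit loop (a vectorized sketch over the same columns):

cols = [c for c in data2.columns if '_100g' in c or 'additiv' in c]
nulls = data2[cols].isnull().sum()
nutritionals = pd.DataFrame({'Null': nulls, 'NullPct': nulls / len(data2) * 100})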

In [15]:
n


Out[15]:
12185

In [16]:
nutritionals.sort_values('Null').head(15)


Out[16]:
Null NullPct
nutrition-score-uk_100g 0 0.000000
sodium_100g 0 0.000000
salt_100g 0 0.000000
proteins_100g 0 0.000000
energy_100g 0 0.000000
nutrition-score-fr_100g 0 0.000000
fat_100g 0 0.000000
saturated-fat_100g 0 0.000000
sugars_100g 0 0.000000
carbohydrates_100g 0 0.000000
additives_n 268 2.199426
additives 284 2.330735
additives_en 4807 39.450144
additives_tags 4807 39.450144
fiber_100g 4849 39.794830

Reduced dataset


In [17]:
subset = [
    'product_name',
    'code_country',
    'pnns_groups_1',
    'pnns_groups_2',
    'nutrition-score-uk_100g',
    'nutrition-score-fr_100g',
    'sodium_100g',
    'salt_100g',
    'proteins_100g',
    'energy_100g',
    'saturated-fat_100g',
    'sugars_100g',
    'fat_100g',
    'carbohydrates_100g']

In [18]:
data.dropna(subset=subset, inplace=True)
data = data[subset]
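
Since dropna was applied on exactly the columns that were kept, the reduced table should contain no missing values; a one-line check (a sketch):

assert data.notnull().all().all(), 'unexpected missing values in the reduced dataset'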

In [19]:
data.head()


Out[19]:
product_name code_country pnns_groups_1 pnns_groups_2 nutrition-score-uk_100g nutrition-score-fr_100g sodium_100g salt_100g proteins_100g energy_100g saturated-fat_100g sugars_100g fat_100g carbohydrates_100g
2 30 Panach' Fruits FRA Sugary snacks Biscuits and cakes 13.0 13.0 0.248031 0.63 5.0 1724.0 2.0 31.0 18.0 57.0
3 Marks & Spencer Curiously Strong Mints GBR Beverages Sweetened beverages 14.0 20.0 0.000000 0.00 0.9 1665.0 0.5 96.0 0.5 96.0
4 All Buter Belgian White Chocolate Chunk Cookies GBR Sugary snacks Biscuits and cakes 26.0 26.0 0.393701 1.00 5.5 2172.0 17.0 33.5 29.1 58.4
5 Honey roast ham GBR Fish Meat Eggs Processed meat 4.0 4.0 0.787402 2.00 24.5 564.0 0.7 2.9 2.0 4.3
6 Pur Soup' Velouté de légumes FRA Fruits and vegetables Soups 2.0 2.0 0.295276 0.75 0.8 188.0 0.7 2.4 2.1 5.3

In [22]:
data.to_csv('final/static/data/data.csv', index=False)

In [25]:
len(data.pnns_groups_2.unique())


Out[25]:
35

In [21]:
for c in data:
    print('{:25} : {}'.format(c, data.ix[2, c]))


product_name              : 30 Panach' Fruits
code_country              : FRA
pnns_groups_1             : Sugary snacks
pnns_groups_2             : Biscuits and cakes
nutrition-score-uk_100g   : 13.0
nutrition-score-fr_100g   : 13.0
sodium_100g               : 0.248031496062992
salt_100g                 : 0.63
proteins_100g             : 5.0
energy_100g               : 1724.0
saturated-fat_100g        : 2.0
sugars_100g               : 31.0
fat_100g                  : 18.0
carbohydrates_100g        : 57.0

In [238]:
data.shape


Out[238]:
(12124, 14)