Cleaning


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Manufacturing places for products sold in France


In [2]:
# read data
data = pd.read_table('../en.openfoodfacts.org.products.csv', sep='\t')
print('Initial number of products: {}'.format(len(data)))
# removing rows with unknown countries and unknown manufacturing places
data.dropna(subset=['countries_en', 'manufacturing_places'], inplace=True)
print('Number of products with a known country of sale and manufacturing place: {}'.format(len(data)))
# keep only products sold in France
# (including products sold in France and other countries)
data = data[data.countries_en.str.contains('france', case=False)]
print('Number of products sold in France (and potentially other countries too): {}'.format(len(data)))


/home/guillaume/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (0,3,5,27,36) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Initial number of products: 80501
Number of products with a known country of sale and manufacturing place: 25215
Number of products sold in France (and potentially other countries too): 17683
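
The DtypeWarning above is pandas guessing column types chunk by chunk. If memory allows, the initial read could be done with low_memory=False (or explicit dtypes for the offending columns) to avoid it; a sketch of that alternative read, not executed here:

# alternative initial read that avoids the mixed-type warning (uses more memory)
raw = pd.read_table('../en.openfoodfacts.org.products.csv', sep='\t', low_memory=False)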

In [3]:
# number of distinct manufacturing places
len(data.manufacturing_places.unique())


Out[3]:
3119

In [4]:
# number of distinct manufacturing place tags
len(data.manufacturing_places_tags.unique())


Out[4]:
2985

In [5]:
data['manufacturing_country'] = data.manufacturing_places.str.lower()
special_char = {
    'é': 'e',
    'è': 'e',
    'ë': 'e',
    'ñ': 'n',
    'ï': 'i',
    'ô': 'o',
}
for sp_char, char in special_char.items():
    data['manufacturing_country'] = data.manufacturing_country.str.replace(sp_char, char)
    
to_replace = [# regex -> country; kept as a list rather than a dict because the order of application matters
    (r'france|francia|frankreich', 'france'),
    (r'allemagne|deutschland|germany', 'allemagne'),
    (r'portugal', 'portugal'),
    (r'belgique', 'belgique'),
    (r'suisse', 'suisse'),
    (r'ital[iy]', 'italie'),
    (r'ecosse|angleterre|england|royaume[ -]uni|united[ -]kingdom|grande[ -]bretagne', 'royaume-uni'),
    (r'irland', 'irlande'),
    (r'chin[ea]', 'chine'),
    (r'jap[ao]n', 'japon'),
    (r'tcheque', 'tcheque'),
    (r'espagne|espana', 'espagne'),
    (r'switzerland', 'suisse'),
    (r'coree', 'coree'),
    (r'grece', 'grece'),
    (r'sloveni', 'slovenie'),
    (r'pays[ -]bas', 'pays-bas'),
    (r'thailand', 'thailande'),
    (r'viet ?nam', 'viet nam'),
    (r'mexic', 'mexique'),    
    (r'bretagne', 'france'), # needs to be AFTER "grande[ -]bretagne" !!
    (r'corse|vendee|alsace|vezelay|provence|ardeche|savoie|auvergne|lozere|strasbourg', 'france'),
    (r'pyrenee|centre|grenade|vosges|normandie|villeneuve|clohars|marseille', 'france'),
    (r'.*u[\.]?s[\.]?a.*', 'etats-unis'),
    (r'.*la reunion.*', 'reunion'),
    (r'.*maurice.*', 'maurice'),
    (r'.*europe.*', 'union europeenne'), # keep at the end
    (r'.*u\.e[\.]?.*', 'union europeenne'), # keep at the end
]

for regex, country in to_replace:
    data.ix[data.manufacturing_country.str.contains(regex, case=False), 'manufacturing_country'] = country
    
# removing "union europeenne" since we do not know the precise country
data = data.ix[data.manufacturing_country != "union europeenne"]

# drop manufacturing countries with fewer than 3 products
mc_counts = pd.DataFrame(data.manufacturing_country.value_counts() >= 3)
data_merge = data.merge(mc_counts, how='left', left_on='manufacturing_country', right_index=True, suffixes=('', '_to_keep'))
data = data[data_merge.manufacturing_country_to_keep]
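
Because each rule overwrites manufacturing_country in place, later regexes see the already-replaced values; this is why the generic 'bretagne' rule must come after 'grande[ -]bretagne'. A toy illustration of the ordering (made-up values, not from the dataset):

toy = pd.Series(['grande bretagne', 'bretagne sud'])
for regex, country in [(r'grande[ -]bretagne', 'royaume-uni'), (r'bretagne', 'france')]:
    toy[toy.str.contains(regex, case=False)] = country
print(toy.tolist())  # ['royaume-uni', 'france'] -- reversing the rules would map both to 'france'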

In [6]:
data.manufacturing_country.value_counts()


Out[6]:
france            12957
italie             1009
allemagne           596
belgique            583
espagne             385
pays-bas            262
royaume-uni         198
suisse              150
thailande           135
chine               104
portugal             65
maroc                56
pologne              49
grece                48
suede                29
japon                29
danemark             27
viet nam             26
turquie              23
etats-unis           21
tunisie              19
autriche             17
taiwan               15
luxembourg           15
malaisie             14
coree                13
irlande              13
reunion              12
israel               12
tcheque              12
mexique              11
norvege              10
canada                9
cote d'ivoire         9
islande               9
maurice               9
hongrie               8
afrique du sud        8
lituanie              7
inde                  6
sri lanka             5
algerie               5
ghana                 5
perou                 4
philippines           4
equateur              4
bulgarie              3
roumanie              3
bresil                3
slovenie              3
liban                 3
liechtenstein         3
slovaquie             3
kenya                 3
madagascar            3
Name: manufacturing_country, dtype: int64
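
A quick bar chart of these counts shows how strongly the data is dominated by products manufactured in France (a sketch using the matplotlib import from the first cell):

counts = data.manufacturing_country.value_counts().head(15)
counts.plot(kind='barh', figsize=(8, 6), title='Top 15 manufacturing countries')
plt.xlabel('number of products')
plt.gca().invert_yaxis()  # put the largest count at the top
plt.show()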

In [7]:
country_code = pd.read_table('code_original.tsv', sep='\t', usecols=['alpha-3 ', 'Nom français '])
country_code = country_code.rename(columns={'alpha-3 ': 'code', 'Nom français ': 'country_fr'})
country_code.dropna(inplace=True)
country_code.country_fr = country_code.country_fr.str.strip()
country_code.code = country_code.code.str.strip()
country_code.country_fr = country_code.country_fr.str.lower()
country_code.country_fr = country_code.country_fr.str.split(',').map(lambda x: x[0]) # remove "republic of" and others
country_code = country_code.ix[country_code.code != "PRK"] # drop North Korea
special_char = {
    'é': 'e',
    'è': 'e',
    'ë': 'e',
    'ñ': 'n',
    'ï': 'i',
    'ô': 'o',
    'å': 'a',
    '’': '\''
}
for sp_char, char in special_char.items():
    country_code.country_fr = country_code.country_fr.str.replace(sp_char, char)

country_code.head()


Out[7]:
code country_fr
0 AFG afghanistan
1 ZAF afrique du sud
2 ALA aland
3 ALB albanie
4 DZA algerie
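
Since country_fr will be used as a join key below, it is worth checking that no French name appears twice, because a duplicate would multiply rows in the left join (a quick sanity-check sketch):

# expect an empty frame; any duplicated names would need manual de-duplication
print(country_code[country_code.country_fr.duplicated(keep=False)])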

In [8]:
set_country_fr = set(country_code.country_fr)
set_manufacturing_country = set(data.manufacturing_country)
print('Manufacturing countries not in the standard list:', set_manufacturing_country.difference(set_country_fr))


Manufacturing countries not in the standard list: set()

In [9]:
# join
data = data.merge(country_code, how='left', left_on='manufacturing_country', right_on='country_fr', suffixes=('', '_country'))
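
Since the set difference above was empty, every manufacturing_country should have matched a row of country_code; a quick check that no ISO code is missing after the join (a sketch):

print(data.code_country.isnull().sum())  # expected to be 0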

Categories


In [11]:
# normalize inconsistent category names
data.ix[data.pnns_groups_1 == 'fruits-and-vegetables', 'pnns_groups_1'] = 'Fruits and vegetables'
data.ix[data.pnns_groups_2 == 'fruits', 'pnns_groups_2'] = 'Fruits'
data.ix[data.pnns_groups_2 == 'vegetables', 'pnns_groups_2'] = 'Vegetables'
data.ix[data.pnns_groups_1 == 'sugary-snacks', 'pnns_groups_1'] = 'Sugary snacks'
data.ix[data.pnns_groups_2 == 'pastries', 'pnns_groups_2'] = 'Pastries'
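
The same normalization can be written more compactly with Series.replace, one dictionary per column (a sketch with the same mappings):

data.pnns_groups_1 = data.pnns_groups_1.replace(
    {'fruits-and-vegetables': 'Fruits and vegetables', 'sugary-snacks': 'Sugary snacks'})
data.pnns_groups_2 = data.pnns_groups_2.replace(
    {'fruits': 'Fruits', 'vegetables': 'Vegetables', 'pastries': 'Pastries'})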

In [12]:
data[['pnns_groups_1', 'pnns_groups_2', 'code']].groupby(['pnns_groups_1', 'pnns_groups_2']).count()


Out[12]:
code
pnns_groups_1 pnns_groups_2
Beverages Artificially sweetened beverages 26
Fruit juices 283
Fruit nectars 61
Non-sugared beverages 440
Sweetened beverages 268
Cereals and potatoes Bread 296
Breakfast cereals 178
Cereals 944
Legumes 99
Potatoes 30
Composite foods One-dish meals 1844
Pizza pies and quiche 90
Sandwich 241
Fat and sauces Dressings and sauces 612
Fats 307
Fish Meat Eggs Eggs 113
Fish and seafood 818
Meat 410
Processed meat 1290
Fruits and vegetables Dried fruits 43
Fruits 278
Soups 94
Vegetables 609
Milk and dairy products Cheese 1437
Dairy desserts 330
Ice cream 110
Milk and yogurt 886
Salty snacks Appetizers 388
Nuts 44
Salty and fatty products 1
Sugary snacks Biscuits and cakes 909
Chocolate products 540
Pastries 104
Sweets 623
unknown unknown 1713

Nutrients


In [13]:
subset = ['nutrition-score-uk_100g', 
         'sodium_100g',
         'salt_100g',
         'proteins_100g',
         'energy_100g',
         'saturated-fat_100g',
         'sugars_100g',
         'fat_100g',
         'carbohydrates_100g']
data2 = data.dropna(subset=subset)

In [14]:
n = len(data2)
nutritionals_null = pd.Series()
nutritionals_null_pct = pd.Series()
for col in data2:
    if "_100g" in col or 'additiv' in col:
        p = np.sum(data2[col].isnull())
        #print('{:40} : {:5} ({}%)'.format(col, p, p/n*100))
        nutritionals_null[col] = p
        nutritionals_null_pct[col] = p/n*100
nutritionals = pd.DataFrame({'Null': nutritionals_null, 'NullPct': nutritionals_null_pct})
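
The same per-column null counts can be computed without an explicit loop (a vectorized sketch over the same columns):

cols = [c for c in data2.columns if '_100g' in c or 'additiv' in c]
nulls = data2[cols].isnull().sum()
nutritionals = pd.DataFrame({'Null': nulls, 'NullPct': nulls / len(data2) * 100})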

In [15]:
n


Out[15]:
12185

In [16]:
nutritionals.sort_values('Null').head(15)


Out[16]:
Null NullPct
nutrition-score-uk_100g 0 0.000000
sodium_100g 0 0.000000
salt_100g 0 0.000000
proteins_100g 0 0.000000
energy_100g 0 0.000000
nutrition-score-fr_100g 0 0.000000
fat_100g 0 0.000000
saturated-fat_100g 0 0.000000
sugars_100g 0 0.000000
carbohydrates_100g 0 0.000000
additives_n 268 2.199426
additives 284 2.330735
additives_en 4807 39.450144
additives_tags 4807 39.450144
fiber_100g 4849 39.794830

Reduced dataset


In [17]:
subset = [
    'product_name',
    'code_country',
    'pnns_groups_1',
    'pnns_groups_2',
    'nutrition-score-uk_100g',
    'nutrition-score-fr_100g',
    'sodium_100g',
    'salt_100g',
    'proteins_100g',
    'energy_100g',
    'saturated-fat_100g',
    'sugars_100g',
    'fat_100g',
    'carbohydrates_100g']

In [18]:
data.dropna(subset=subset, inplace=True)
data = data[subset]
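
Since dropna was applied on exactly the columns that were kept, the reduced table should contain no missing values; a one-line check (a sketch):

assert data.notnull().all().all(), 'unexpected missing values in the reduced dataset'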

In [19]:
data.head()


Out[19]:
product_name code_country pnns_groups_1 pnns_groups_2 nutrition-score-uk_100g nutrition-score-fr_100g sodium_100g salt_100g proteins_100g energy_100g saturated-fat_100g sugars_100g fat_100g carbohydrates_100g
2 30 Panach' Fruits FRA Sugary snacks Biscuits and cakes 13.0 13.0 0.248031 0.63 5.0 1724.0 2.0 31.0 18.0 57.0
3 Marks & Spencer Curiously Strong Mints GBR Beverages Sweetened beverages 14.0 20.0 0.000000 0.00 0.9 1665.0 0.5 96.0 0.5 96.0
4 All Buter Belgian White Chocolate Chunk Cookies GBR Sugary snacks Biscuits and cakes 26.0 26.0 0.393701 1.00 5.5 2172.0 17.0 33.5 29.1 58.4
5 Honey roast ham GBR Fish Meat Eggs Processed meat 4.0 4.0 0.787402 2.00 24.5 564.0 0.7 2.9 2.0 4.3
6 Pur Soup' Velouté de légumes FRA Fruits and vegetables Soups 2.0 2.0 0.295276 0.75 0.8 188.0 0.7 2.4 2.1 5.3

In [22]:
data.to_csv('final/static/data/data.csv', index=False)

In [25]:
len(data.pnns_groups_2.unique())


Out[25]:
35

In [21]:
for c in data:
    print('{:25} : {}'.format(c, data.ix[2, c]))


product_name              : 30 Panach' Fruits
code_country              : FRA
pnns_groups_1             : Sugary snacks
pnns_groups_2             : Biscuits and cakes
nutrition-score-uk_100g   : 13.0
nutrition-score-fr_100g   : 13.0
sodium_100g               : 0.248031496062992
salt_100g                 : 0.63
proteins_100g             : 5.0
energy_100g               : 1724.0
saturated-fat_100g        : 2.0
sugars_100g               : 31.0
fat_100g                  : 18.0
carbohydrates_100g        : 57.0

In [238]:
data.shape


Out[238]:
(12124, 14)