In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# read data
data = pd.read_table('../en.openfoodfacts.org.products.csv', sep='\t')
print('Initial number of products: {}'.format(len(data)))
# remove rows with an unknown country or an unknown manufacturing place
data.dropna(subset=['countries_en', 'manufacturing_places'], inplace=True)
print('Number of products sold to a known country and known manufacturing places: {}'.format(len(data)))
# keep only products sold in France
# (including products sold in France and other countries)
data = data[data.countries_en.str.contains('france', case=False)]
print('Number of products sold in France (and potentially other countries too): {}'.format(len(data)))
In [3]:
# number of distinct free-text manufacturing places
len(data.manufacturing_places.unique())
Out[3]:
In [4]:
# number of distinct manufacturing place tags
len(data.manufacturing_places_tags.unique())
Out[4]:
In [5]:
data['manufacturing_country'] = data.manufacturing_places.str.lower()
special_char = {
    'é': 'e',
    'è': 'e',
    'ë': 'e',
    'ñ': 'n',
    'ï': 'i',
    'ô': 'o',
}
# strip accents so the regex matching below stays simple
for sp_char, char in special_char.items():
    data['manufacturing_country'] = data.manufacturing_country.str.replace(sp_char, char, regex=False)
to_replace = [  # regex -> country; a list rather than a dict because the order matters
    (r'france|francia|frankreich', 'france'),
    (r'allemagne|deutschland|germany', 'allemagne'),
    (r'portugal', 'portugal'),
    (r'belgique', 'belgique'),
    (r'suisse', 'suisse'),
    (r'ital[iy]', 'italie'),
    (r'ecosse|angleterre|england|royaume[ -]uni|united[ -]kingdom|grande[ -]bretagne', 'royaume-uni'),
    (r'irland', 'irlande'),
    (r'chin[ea]', 'chine'),
    (r'jap[ao]n', 'japon'),
    (r'tcheque', 'tcheque'),
    (r'espagne|espana', 'espagne'),
    (r'switzerland', 'suisse'),
    (r'coree', 'coree'),
    (r'grece', 'grece'),
    (r'sloveni', 'slovenie'),
    (r'pays[ -]bas', 'pays-bas'),
    (r'thailand', 'thailande'),
    (r'viet ?nam', 'viet nam'),
    (r'mexic', 'mexique'),
    (r'bretagne', 'france'),  # must come AFTER "grande[ -]bretagne"
    (r'corse|vendee|alsace|vezelay|provence|ardeche|savoie|auvergne|lozere|strasbourg', 'france'),
    (r'pyrenee|centre|grenade|vosges|normandie|villeneuve|clohars|marseille', 'france'),
    (r'.*u[\.]?s[\.]?a.*', 'etats-unis'),
    (r'.*la reunion.*', 'reunion'),
    (r'.*maurice.*', 'maurice'),
    (r'.*europe.*', 'union europeenne'),  # keep at the end
    (r'.*u\.e[\.]?.*', 'union europeenne'),  # keep at the end
]
for regex, country in to_replace:
    data.loc[data.manufacturing_country.str.contains(regex, case=False), 'manufacturing_country'] = country
# remove "union europeenne" since the precise country is unknown
data = data.loc[data.manufacturing_country != 'union europeenne']
# drop manufacturing countries that appear on fewer than 3 products
mc_counts = data.manufacturing_country.value_counts()
data = data[data.manufacturing_country.isin(mc_counts[mc_counts >= 3].index)]
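A quick optional sanity check (a sketch added here, not part of the original pipeline): after the two filters above, every remaining manufacturing_country should appear at least 3 times and 'union europeenne' should be gone.
In [ ]:
# optional sanity check: verify the effect of the two filters above
counts = data.manufacturing_country.value_counts()
assert counts.min() >= 3
assert 'union europeenne' not in counts.index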
In [6]:
data.manufacturing_country.value_counts()
Out[6]:
In [7]:
country_code = pd.read_table('code_original.tsv', sep='\t', usecols=['alpha-3 ', 'Nom français '])
country_code = country_code.rename(columns={'alpha-3 ': 'code', 'Nom français ': 'country_fr'})
country_code.dropna(inplace=True)
country_code.country_fr = country_code.country_fr.str.strip()
country_code.code = country_code.code.str.strip()
country_code.country_fr = country_code.country_fr.str.lower()
country_code.country_fr = country_code.country_fr.str.split(',').map(lambda x: x[0]) # remove "republic of" and others
country_code = country_code.loc[country_code.code != "PRK"] # drop North Korea
special_char = {
    'é': 'e',
    'è': 'e',
    'ë': 'e',
    'ñ': 'n',
    'ï': 'i',
    'ô': 'o',
    'å': 'a',
    '’': '\''
}
# apply the same accent stripping to the reference country names
for sp_char, char in special_char.items():
    country_code.country_fr = country_code.country_fr.str.replace(sp_char, char, regex=False)
country_code.head()
Out[7]:
In [8]:
set_country_fr = set(country_code.country_fr)
set_manufacturing_country = set(data.manufacturing_country)
print('Manufacturing countries not matching the reference list:', set_manufacturing_country.difference(set_country_fr))
In [9]:
# left join the ISO country codes onto the products
data = data.merge(country_code, how='left', left_on='manufacturing_country', right_on='country_fr', suffixes=('', '_country'))
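Because this is a left join, any manufacturing_country still missing from country_code ends up with an empty ISO code. An optional check (a sketch, assuming the suffix puts the ISO code in code_country, as the later column selection suggests):
In [ ]:
# optional check: manufacturing countries that did not match the reference table
unmatched = data.loc[data.code_country.isnull(), 'manufacturing_country'].unique()
print('Countries without an ISO code after the join:', unmatched)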
In [11]:
# normalize a few inconsistent PNNS group labels
data.loc[data.pnns_groups_1 == 'fruits-and-vegetables', 'pnns_groups_1'] = 'Fruits and vegetables'
data.loc[data.pnns_groups_2 == 'fruits', 'pnns_groups_2'] = 'Fruits'
data.loc[data.pnns_groups_2 == 'vegetables', 'pnns_groups_2'] = 'Vegetables'
data.loc[data.pnns_groups_1 == 'sugary-snacks', 'pnns_groups_1'] = 'Sugary snacks'
data.loc[data.pnns_groups_2 == 'pastries', 'pnns_groups_2'] = 'Pastries'
In [12]:
data[['pnns_groups_1', 'pnns_groups_2', 'code']].groupby(['pnns_groups_1', 'pnns_groups_2']).count()
Out[12]:
In [13]:
subset = ['nutrition-score-uk_100g',
          'sodium_100g',
          'salt_100g',
          'proteins_100g',
          'energy_100g',
          'saturated-fat_100g',
          'sugars_100g',
          'fat_100g',
          'carbohydrates_100g']
data2 = data.dropna(subset=subset)
In [14]:
n = len(data2)
# count missing values among the nutritional ("_100g") and additive columns
nutritionals_null = {}
nutritionals_null_pct = {}
for col in data2:
    if '_100g' in col or 'additiv' in col:
        p = data2[col].isnull().sum()
        #print('{:40} : {:5} ({}%)'.format(col, p, p / n * 100))
        nutritionals_null[col] = p
        nutritionals_null_pct[col] = p / n * 100
nutritionals = pd.DataFrame({'Null': pd.Series(nutritionals_null),
                             'NullPct': pd.Series(nutritionals_null_pct)})
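seaborn and matplotlib are imported at the top but have not been used so far; an optional sketch of a horizontal bar chart makes these missing-value percentages easier to scan than the raw table:
In [ ]:
# optional sketch: share of missing values for the 20 most complete nutritional columns
most_complete = nutritionals.sort_values('NullPct').head(20)
plt.figure(figsize=(8, 6))
sns.barplot(x=most_complete.NullPct, y=most_complete.index, color='steelblue')
plt.xlabel('% of missing values')
plt.title('Missing values among nutritional columns')
plt.tight_layout()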
In [15]:
n
Out[15]:
In [16]:
nutritionals.sort_values('Null').head(15)
Out[16]:
In [17]:
subset = [
    'product_name',
    'code_country',
    'pnns_groups_1',
    'pnns_groups_2',
    'nutrition-score-uk_100g',
    'nutrition-score-fr_100g',
    'sodium_100g',
    'salt_100g',
    'proteins_100g',
    'energy_100g',
    'saturated-fat_100g',
    'sugars_100g',
    'fat_100g',
    'carbohydrates_100g']
In [18]:
data.dropna(subset=subset, inplace=True)
data = data[subset]
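Before exporting, an optional check (sketch) that the kept columns contain no missing values and that the shape is as expected:
In [ ]:
# optional check: the final table should have no missing values in the kept columns
assert not data.isnull().any().any()
print('{} products and {} columns ready for export'.format(*data.shape))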
In [19]:
data.head()
Out[19]:
In [22]:
data.to_csv('final/static/data/data.csv', index=False)
In [25]:
len(data.pnns_groups_2.unique())
Out[25]:
In [21]:
# print one example product, column by column
for c in data:
    print('{:25} : {}'.format(c, data[c].iloc[2]))
In [238]:
data.shape
Out[238]: