In [1]:
import numpy as np
import pandas as pd
import utilities as utils
import matplotlib.pyplot as plt
%matplotlib inline
import pylab as pl
from sklearn.cluster import KMeans
from sklearn.metrics import completeness_score, homogeneity_score
In [2]:
# Load the food-facts CSV export into a DataFrame.
data = pd.read_csv(filepath_or_buffer="world-food-facts/FoodFacts.csv")
In [3]:
# Keep only the rows whose main_category_en value is present.
data = data.dropna(subset=["main_category_en"])
In [4]:
# Boolean Series indexed by category name: True where the category occurs
# in more than 1000 rows.
more_than = data["main_category_en"].value_counts().gt(1000)
In [5]:
# Select the True entries of the boolean Series with itself and keep only
# their index, i.e. the names of the categories that passed the threshold.
more_than = more_than[more_than].index
more_than
Out[5]:
In [6]:
# Restrict the table to rows whose main category is one of the names
# collected in more_than.
data = data[data.main_category_en.isin(more_than)]

# print(...) with a single argument gives the same output on Python 2 and
# Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print(data.shape)
print(data.main_category_en.value_counts())
In [7]:
# Ground-truth labels: the main category of every remaining row. This is
# taken before `data` is narrowed to the nutriment columns below.
target = data["main_category_en"]
target.shape
Out[7]:
In [8]:
# Feature columns: every column whose name contains '_100g', except those
# whose name also contains 'score'.
nutriment_cols = [
    col
    for col in data.columns
    if '_100g' in col and 'score' not in col
]

# Narrow the frame to the feature columns and fill missing values with 0
# for now.
data = data.loc[:, nutriment_cols].fillna(0)
data.shape
Out[8]:
In [9]:
# Configure k-means with 12 clusters, matching the 12 main categories that
# are label-encoded further down. k-means++ seeding is repeated n_init=20
# times with up to 500 iterations per run.
# random_state pins the random initialisation so that the cluster ids, the
# bincounts and the completeness/homogeneity scores below are reproducible
# across kernel restarts.
km = KMeans(
    n_clusters=12,
    max_iter=500,
    n_init=20,
    init='k-means++',
    random_state=0,
)
In [10]:
# Fit the clustering on the nutriment feature table; left as the last
# expression so the notebook shows the fitted estimator.
km.fit(X=data)
Out[10]:
In [11]:
# Show the fitted centroids. The function-call form of print works on both
# Python 2 and Python 3; the bare print statement is a SyntaxError on 3.
print(km.cluster_centers_)
In [12]:
# Cluster id assigned by the fitted model to every row of the feature table.
c = km.predict(X=data)
In [13]:
# Count how many rows were assigned to each cluster id in c; left as the
# cell's last expression so the notebook displays the counts.
np.bincount(c)
Out[13]:
In [14]:
# Show the distinct values currently held in target (the main_category_en
# column captured earlier).
target.unique()
Out[14]:
In [15]:
# Encode each main-category name as a fixed integer class id so the ground
# truth can be scored against the cluster assignments.
category_ids = {
    'Plant-based foods and beverages': 0,
    'Beverages': 1,
    'Sugary snacks': 2,
    'Groceries': 3,
    'Meats': 4,
    'Meals': 5,
    'Dairies': 6,
    'Salty snacks': 7,
    'Frozen foods': 8,
    'Fruit juices': 9,
    'Canned foods': 10,
    'Fresh foods': 11,
}

# Series.map builds a new Series instead of assigning into `target` through
# .loc. `target` therefore keeps its category names, no
# SettingWithCopyWarning is triggered, and re-running the cell is safe.
encoded = target.map(category_ids)

# A category missing from category_ids maps to NaN. Fail loudly here rather
# than letting the integer cast below turn NaN into a meaningless value.
unmapped = target[encoded.isnull()].unique()
if len(unmapped) > 0:
    raise ValueError(
        "No class id defined for categories: %s" % list(unmapped)
    )

# .values replaces Series.as_matrix(), which was removed in pandas 1.0.
t = encoded.values.astype(int)
In [23]:
# Completeness of the clustering c against the true classes t.
# Arguments are passed as (labels_true, labels_pred).
# print(...) works on Python 2 and 3; the print statement is Python 2 only.
print(completeness_score(t, c))
In [24]:
# Homogeneity of the clustering c against the true classes t.
# Arguments are passed as (labels_true, labels_pred).
# print(...) works on Python 2 and 3; the print statement is Python 2 only.
print(homogeneity_score(t, c))
In [18]:
# Inspect the encoded ground-truth labels and how many rows carry each
# class id. Written as print(...) calls so the cell also runs on Python 3.
print(t)
print(np.bincount(t))
In [19]:
# Inspect the predicted cluster ids and the size of each cluster, for
# comparison with the class counts of t. Written as print(...) calls so the
# cell also runs on Python 3.
print(c)
print(np.bincount(c))
In [ ]: