In [1]:
import numpy as np
import pandas as pd
import utilities as utils
import matplotlib.pyplot as plt
%matplotlib inline

import pylab as pl
from sklearn.cluster import KMeans 
from sklearn.metrics import completeness_score, homogeneity_score

In [2]:
data = pd.read_csv("world-food-facts/FoodFacts.csv")


/Applications/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (0,3,5,27,36) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [3]:
data = data[data.main_category_en.notnull()]

In [4]:
more_than = data.main_category_en.value_counts() > 1000

In [5]:
more_than = more_than.index[more_than == True]
more_than


Out[5]:
Index([u'Plant-based foods and beverages', u'Sugary snacks', u'Beverages',
       u'Dairies', u'Groceries', u'Fresh foods', u'Meats', u'Meals',
       u'Canned foods', u'Salty snacks', u'Frozen foods', u'Fruit juices'],
      dtype='object')

In [6]:
data = data[data.main_category_en.isin(more_than)]
print data.shape
print data.main_category_en.value_counts()


(40222, 159)
Plant-based foods and beverages    11005
Sugary snacks                       5926
Beverages                           5503
Dairies                             3394
Groceries                           3185
Fresh foods                         2904
Meats                               2196
Meals                               1353
Canned foods                        1319
Salty snacks                        1202
Frozen foods                        1138
Fruit juices                        1097
Name: main_category_en, dtype: int64

In [7]:
target = data.main_category_en
target.shape


Out[7]:
(40222,)

In [8]:
nutriment_cols = [col for col in data.columns if '_100g' in col and not 'score' in col]
data = data[nutriment_cols]
data = data.fillna(0) #fill nulls with 0 for now
data.shape


Out[8]:
(40222, 94)

In [9]:
km = KMeans(n_clusters=12, max_iter=500, n_init=20, init='k-means++') # initialization

In [10]:
km.fit(data)


Out[10]:
KMeans(copy_x=True, init='k-means++', max_iter=500, n_clusters=12, n_init=20,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [11]:
print km.cluster_centers_


[[  8.26667349e+00   1.02590634e-02   7.65517391e-01 ...,   1.05694796e-01
    0.00000000e+00   6.49551621e-01]
 [  1.89356612e+03   1.92626914e+00   2.16426712e+01 ...,   5.66478646e-01
    0.00000000e+00   1.61039484e+00]
 [  7.73744382e+02   3.16816327e+00   9.02500301e+00 ...,  -1.93178806e-14
    0.00000000e+00   2.33001855e+00]
 ..., 
 [  1.11561241e+03   3.05276276e+00   1.35675673e+01 ...,  -2.03170814e-14
    0.00000000e+00   2.11951952e+00]
 [  3.58976635e+03   7.91666667e-01   9.42126189e+01 ...,   1.77635684e-15
    0.00000000e+00   1.36538462e+00]
 [  2.76527778e+03   2.40444444e+03   6.42611111e+01 ...,   3.33066907e-16
    0.00000000e+00   4.44089210e-16]]

In [12]:
c = km.predict(data)

In [13]:
np.bincount(c)


Out[13]:
array([13047,  2479,  2695,   664,  2826,  5454,  4783,   170,  4426,
        3330,   312,    36])

In [14]:
target.unique()


Out[14]:
array(['Plant-based foods and beverages', 'Beverages', 'Sugary snacks',
       'Groceries', 'Meats', 'Meals', 'Dairies', 'Salty snacks',
       'Frozen foods', 'Fruit juices', 'Canned foods', 'Fresh foods'], dtype=object)

In [15]:
target.loc[target == 'Plant-based foods and beverages'] = 0
target.loc[target == 'Beverages'] = 1
target.loc[target == 'Sugary snacks'] = 2
target.loc[target == 'Groceries'] = 3
target.loc[target == 'Meats'] = 4
target.loc[target == 'Meals'] = 5
target.loc[target == 'Dairies'] = 6
target.loc[target == 'Salty snacks'] = 7
target.loc[target == 'Frozen foods'] = 8
target.loc[target == 'Fruit juices'] = 9
target.loc[target == 'Canned foods'] = 10
target.loc[target == 'Fresh foods'] = 11
t = target.as_matrix()
t = t.astype(int)


/Applications/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py:128: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [23]:
print completeness_score(t,c)


0.192688190744

In [24]:
print homogeneity_score(t,c)


0.17652198907

In [18]:
print t
print np.bincount(t)


[0 0 1 ..., 1 4 7]
[11005  5503  5926  3185  2196  1353  3394  1202  1138  1097  1319  2904]

In [19]:
print c
print np.bincount(c)


[0 0 0 ..., 0 0 0]
[13047  2479  2695   664  2826  5454  4783   170  4426  3330   312    36]

In [ ]: