In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [3]:
'''钢炮'''
path = '/home/zongyi/bimbo_data/'
prod = pd.read_csv(path+'producto_tabla.csv')
cluster = pd.read_csv(path+'prod_cluster.csv')

In [2]:
'''MAC'''
path = '/Users/zonemercy/jupyter_notebook/bimbo_data/'
prod = pd.read_csv(path+'producto_tabla.csv')
cluster = pd.read_csv(path+'prod_cluster.csv')

In [4]:
cluster[['Producto_ID','p_name','p_clt']]=cluster[['Producto_ID','product_shortname','cluster']]
cluster=cluster[['Producto_ID','p_name','p_clt']]
cluster.head()


Out[4]:
Producto_ID p_name p_clt
0 9 Capuccino Moka 750g 23
1 41 Bimbollos Ext sAjonjoli 6p 480g 9
2 53 Burritos Sincro 170g 29
3 72 Div Tira Mini Doradita 4p 45g 23
4 73 Pan Multigrano Linaza 540g 1

In [5]:
'''p_cluster | p_name'''
prod = pd.merge(prod, cluster, how='left', on=['Producto_ID'])
prod.fillna(0, inplace=True)

In [6]:
'''prom'''
prod.drop(prod.index[[0]], inplace=True) #delete first row 'no indication'
prod['prom'] = prod['NombreProducto'].apply(lambda x: 1 if 'Prom' in x else 0)
len(prod[prod['prom'] == 1])


Out[6]:
204

In [7]:
# prod['short_name'] = prod.NombreProducto.str.extract('^(\D*)', expand=False)
prod.loc[prod['Producto_ID']==46711, 'NombreProducto'] = 'Coca Cola Zero Lata 355ml CC 46711' # 355 ml bug

prod['brand'] = prod.NombreProducto.str.extract('^.+\s(\D+) \d+$', expand=False)
w = prod.NombreProducto.str.extract('(\d+)(Kg|g)', expand=True)
prod['weight'] = w[0].astype('float')*w[1].map({'Kg':1000, 'g':1})
prod['pieces'] =  prod.NombreProducto.str.extract('(\d+)p ', expand=False).astype('float')
prod['pieces'].fillna(1, inplace=True)
prod['w_per_piece'] = prod['weight']/prod['pieces']

prod['healthy']= prod.NombreProducto.str.contains('100pct| 0pct|Multigrano|Integral|Fibra|Grasa|sin Azucar').astype(int)

In [8]:
'''drink'''
prod['drink'] = prod['NombreProducto'].apply(lambda x: 1 if 'ml' in x else 0)
len(prod[prod['drink'] == 1])
# prod['drink'] = 0
# for row in prod.iterrows():
#     if np.isnan(row[1]['weight']):
#             prod.set_value(row[0], 'drink', 1)


Out[8]:
49

In [9]:
prod.loc[prod['NombreProducto'].str.contains('ml'),'w_per_piece'] = \
                                            prod[prod['NombreProducto'].str.contains('ml')] \
                                            .NombreProducto.str.extract('(\d+)(ml)', expand=True)[0].astype('float') 

prod.loc[prod['NombreProducto'].str.contains('ml'),'weight'] = \
                                            prod.loc[prod['NombreProducto'].str.contains('ml'),'w_per_piece'] \
                                            *prod.loc[prod['NombreProducto'].str.contains('ml'),'pieces']

In [17]:
# prod[prod['NombreProducto'].str.contains('ml')].head()
prod.drop(['NombreProducto','p_name'],axis=1, inplace=True)


Out[17]:
Producto_ID p_clt prom brand weight pieces w_per_piece healthy drink
1 9 23.0 0 NES 750.0 1.0 750.00 0 0
2 41 9.0 0 BIM 480.0 6.0 80.00 0 0
3 53 29.0 0 LON 170.0 1.0 170.00 0 0
4 72 23.0 0 TR 45.0 4.0 11.25 0 0
5 73 1.0 0 BIM 540.0 1.0 540.00 1 0

In [20]:
prod['p_clt'] = prod['p_clt'].astype('int')
prod['pieces'] = prod['pieces'].astype('int')

In [23]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(prod['brand'])

print le.classes_

prod['brand'] = le.transform(prod['brand'])


[nan 'AM' 'AV' 'BAR' 'BIM' 'BRE' 'BRL' 'CAR' 'CC' 'CHK' 'COR' 'DH' 'DIF'
 'EMB' 'GBI' 'GV' 'JMX' 'KOD' 'LAR' 'LC' 'LON' 'MCM' 'MLA' 'MP' 'MR' 'MSK'
 'MTB' 'NAI' 'NEC' 'NES' 'ORO' 'PUL' 'RIC' 'SAN' 'SKD' 'SL' 'SUA' 'SUN'
 'THO' 'TR' 'TRI' 'VER' 'VR' 'WON']
/usr/local/lib/python2.7/dist-packages/numpy/lib/arraysetops.py:259: FutureWarning: numpy equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.
  return aux[:-1][aux[1:] == aux[:-1]]

In [28]:
price = pd.read_csv(path+'price.csv')
prod = pd.merge(prod, price, how='left', on=['Producto_ID'])

In [30]:
price_na = {}
for row in prod.iterrows():
    if np.isnan(row[1]['price']):
        cluster = prod[prod['p_clt']==row[1]['p_clt']]
        subcluster = cluster[cluster['brand']==row[1]['brand']]
        subsubcluster = cluster[cluster['weight']==row[1]['weight']]
        subsubmed =  subsubcluster['price'].median()
        submed =  subcluster['price'].median()
        med =  cluster['price'].median()
        if np.isnan(subsubmed):
            if np.isnan(submed):
                if np.isnan(med):
                    prod.set_value(row[0], 'price', 0)
                else:
                    prod.set_value(row[0], 'price', med)
            else:
                prod.set_value(row[0], 'price', submed)
        else:
            prod.set_value(row[0], 'price', subsubmed)

In [31]:
prod.head()


Out[31]:
Producto_ID p_clt prom brand weight pieces w_per_piece healthy drink price
0 9 23 0 29 750.0 1 750.00 0 0 8.8000
1 41 9 0 4 480.0 6 80.00 0 0 17.1317
2 53 29 0 20 170.0 1 170.00 0 0 5.7600
3 72 23 0 39 45.0 4 11.25 0 0 8.0000
4 73 1 0 4 540.0 1 540.00 1 0 35.6700

In [ ]:


In [32]:
prod.fillna(0, inplace=True)
prod.to_csv(path+'products.csv',index=False)

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [13]:
prod[(prod['prom']==1)&(prod['healthy']==1)]


Out[13]:
Producto_ID NombreProducto p_name p_clt prom brand weight pieces w_per_piece healthy drink
436 30403 Pan Integral 370g Prom BIM 30403 Pan Integral 370g 1.0 1 BIM 370.0 1.0 370.0 1 0
519 30904 Sandwich Integral 112g Prom MTA LON 30904 Sandwich Integral 112g 1.0 1 LON 112.0 1.0 112.0 1 0
605 31320 Barra Multigrano Nuez 12p Prom CU BIM 31320 Barra Multigrano Nuez 12p 27.0 1 BIM 0.0 12.0 0.0 1 0
761 32221 Sandwich Integral 112g Prom LON 32221 Sandwich Integral 112g 1.0 1 LON 112.0 1.0 112.0 1 0
937 32999 Sandwich Integral Cong 112g Prom CU LON 32999 Sandwich Integral Cong 112g 1.0 1 LON 112.0 1.0 112.0 1 0

In [12]:
cc = prod[prod['NombreProducto'].str.contains('Fanta')]
cc


Out[12]:
Producto_ID NombreProducto p_name p_clt prom brand weight pieces w_per_piece drink
392 30190 Fanta Lata 24p 355ml CC 30190 Fanta Lata 24p 355ml 22.0 0 CC 8520.0 24.0 355.0 1
2566 49765 Fanta Lata 355ml CC 49765 Fanta Lata 355ml 22.0 0 CC 355.0 1.0 355.0 1

对比一下 prom 936 32997 Agua Ciel Toronja 600ml CC 32997 0 CC 0.0 1.0 0.000000 Agua Ciel Toronja 600ml 17m 949 33074 Agua Ciel Toronja 600ml Prom CC 33074 1 CC 0.0 1.0 0.000000 Agua Ciel Toronja 600ml 17


In [ ]:


In [ ]:


In [ ]:


暂时用R的skmeans


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 1000) 

product_bag_words = vectorizer.fit_transform(products.short_name).toarray()
product_bag_words.shape


Out[5]:
(2592, 808)

In [6]:
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering

cltr = KMeans(n_clusters=100)
inds = cltr.fit_predict(product_bag_words)

In [7]:
products['short_name'].values[np.where(inds==0)]


Out[7]:
array(['Suavicremas Fresa ', 'Suavicremas Fresa KC ', 'Suavicremas Fresa ',
       'Suavicremas Fresa ', 'Suavicremas Fresa ', 'Suavicremas Fresa ',
       'Suavicremas Fresa ', 'Suavicremas Fresa ', 'Suavicremas Fresa ',
       'Suavicremas Fresa ', 'Suavicremas Fresa ', 'Suavicremas Fresa ',
       'Suavicremas Fresa ', 'Suavicremas Fresa Nuevo KC ',
       'Suavicremas Fresa ', 'Suavicremas Fresa MG ',
       'Suavicremas Fresa MG ', 'Suavicremas Fresa ', 'Suavicremas Fresa ',
       'Suavicremas Fresa '], dtype=object)

In [8]:
products['cluster_nombre'] = inds
products['drink'] = 0

In [9]:
price_na = {}
for row in products.iterrows():
    if np.isnan(row[1]['weight']):
        products.set_value(row[0], 'drink', 1)
    if np.isnan(row[1]['price']):
        cluster = products[products['cluster_nombre']==row[1]['cluster_nombre']]
        subcluster = cluster[cluster['brand']==row[1]['brand']]
        subsubcluster = cluster[cluster['weight']==row[1]['weight']]
        subsubmed =  subsubcluster['price'].median()
        submed =  subcluster['price'].median()
        med =  cluster['price'].median()
        if np.isnan(subsubmed):
            if np.isnan(submed):
                if np.isnan(med):
                    products.set_value(row[0], 'price', 0)
                else:
                    products.set_value(row[0], 'price', med)
            else:
                products.set_value(row[0], 'price', submed)
        else:
            products.set_value(row[0], 'price', subsubmed)


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-9-259d4a822de5> in <module>()
      3     if np.isnan(row[1]['weight']):
      4         products.set_value(row[0], 'drink', 1)
----> 5     if np.isnan(row[1]['price']):
      6         cluster = products[products['cluster_nombre']==row[1]['cluster_nombre']]
      7         subcluster = cluster[cluster['brand']==row[1]['brand']]

/usr/local/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
    558     def __getitem__(self, key):
    559         try:
--> 560             result = self.index.get_value(self, key)
    561 
    562             if not lib.isscalar(result):

/usr/local/lib/python2.7/site-packages/pandas/indexes/base.pyc in get_value(self, series, key)
   1923                     raise InvalidIndexError(key)
   1924                 else:
-> 1925                     raise e1
   1926             except Exception:  # pragma: no cover
   1927                 raise e1

KeyError: 'price'