In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
In [3]:
'''钢炮'''
path = '/home/zongyi/bimbo_data/'
prod = pd.read_csv(path+'producto_tabla.csv')
cluster = pd.read_csv(path+'prod_cluster.csv')
In [2]:
'''MAC'''
path = '/Users/zonemercy/jupyter_notebook/bimbo_data/'
prod = pd.read_csv(path+'producto_tabla.csv')
cluster = pd.read_csv(path+'prod_cluster.csv')
In [4]:
cluster[['Producto_ID','p_name','p_clt']]=cluster[['Producto_ID','product_shortname','cluster']]
cluster=cluster[['Producto_ID','p_name','p_clt']]
cluster.head()
Out[4]:
In [5]:
'''p_cluster | p_name'''
prod = pd.merge(prod, cluster, how='left', on=['Producto_ID'])
prod.fillna(0, inplace=True)
In [6]:
'''prom'''
prod.drop(prod.index[[0]], inplace=True) #delete first row 'no indication'
prod['prom'] = prod['NombreProducto'].apply(lambda x: 1 if 'Prom' in x else 0)
len(prod[prod['prom'] == 1])
Out[6]:
In [7]:
# prod['short_name'] = prod.NombreProducto.str.extract('^(\D*)', expand=False)
prod.loc[prod['Producto_ID']==46711, 'NombreProducto'] = 'Coca Cola Zero Lata 355ml CC 46711' # 355 ml bug
prod['brand'] = prod.NombreProducto.str.extract('^.+\s(\D+) \d+$', expand=False)
w = prod.NombreProducto.str.extract('(\d+)(Kg|g)', expand=True)
prod['weight'] = w[0].astype('float')*w[1].map({'Kg':1000, 'g':1})
prod['pieces'] = prod.NombreProducto.str.extract('(\d+)p ', expand=False).astype('float')
prod['pieces'].fillna(1, inplace=True)
prod['w_per_piece'] = prod['weight']/prod['pieces']
prod['healthy']= prod.NombreProducto.str.contains('100pct| 0pct|Multigrano|Integral|Fibra|Grasa|sin Azucar').astype(int)
In [8]:
'''drink'''
prod['drink'] = prod['NombreProducto'].apply(lambda x: 1 if 'ml' in x else 0)
len(prod[prod['drink'] == 1])
# prod['drink'] = 0
# for row in prod.iterrows():
# if np.isnan(row[1]['weight']):
# prod.set_value(row[0], 'drink', 1)
Out[8]:
In [9]:
prod.loc[prod['NombreProducto'].str.contains('ml'),'w_per_piece'] = \
prod[prod['NombreProducto'].str.contains('ml')] \
.NombreProducto.str.extract('(\d+)(ml)', expand=True)[0].astype('float')
prod.loc[prod['NombreProducto'].str.contains('ml'),'weight'] = \
prod.loc[prod['NombreProducto'].str.contains('ml'),'w_per_piece'] \
*prod.loc[prod['NombreProducto'].str.contains('ml'),'pieces']
In [17]:
# prod[prod['NombreProducto'].str.contains('ml')].head()
prod.drop(['NombreProducto','p_name'],axis=1, inplace=True)
Out[17]:
In [20]:
prod['p_clt'] = prod['p_clt'].astype('int')
prod['pieces'] = prod['pieces'].astype('int')
In [23]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(prod['brand'])
print le.classes_
prod['brand'] = le.transform(prod['brand'])
In [28]:
price = pd.read_csv(path+'price.csv')
prod = pd.merge(prod, price, how='left', on=['Producto_ID'])
In [30]:
price_na = {}
for row in prod.iterrows():
if np.isnan(row[1]['price']):
cluster = prod[prod['p_clt']==row[1]['p_clt']]
subcluster = cluster[cluster['brand']==row[1]['brand']]
subsubcluster = cluster[cluster['weight']==row[1]['weight']]
subsubmed = subsubcluster['price'].median()
submed = subcluster['price'].median()
med = cluster['price'].median()
if np.isnan(subsubmed):
if np.isnan(submed):
if np.isnan(med):
prod.set_value(row[0], 'price', 0)
else:
prod.set_value(row[0], 'price', med)
else:
prod.set_value(row[0], 'price', submed)
else:
prod.set_value(row[0], 'price', subsubmed)
In [31]:
prod.head()
Out[31]:
In [ ]:
In [32]:
prod.fillna(0, inplace=True)
prod.to_csv(path+'products.csv',index=False)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [13]:
prod[(prod['prom']==1)&(prod['healthy']==1)]
Out[13]:
In [12]:
cc = prod[prod['NombreProducto'].str.contains('Fanta')]
cc
Out[12]:
对比一下 prom 936 32997 Agua Ciel Toronja 600ml CC 32997 0 CC 0.0 1.0 0.000000 Agua Ciel Toronja 600ml 17m 949 33074 Agua Ciel Toronja 600ml Prom CC 33074 1 CC 0.0 1.0 0.000000 Agua Ciel Toronja 600ml 17
In [ ]:
In [ ]:
In [ ]:
In [5]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word", \
tokenizer = None, \
preprocessor = None, \
stop_words = None, \
max_features = 1000)
product_bag_words = vectorizer.fit_transform(products.short_name).toarray()
product_bag_words.shape
Out[5]:
In [6]:
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering
cltr = KMeans(n_clusters=100)
inds = cltr.fit_predict(product_bag_words)
In [7]:
products['short_name'].values[np.where(inds==0)]
Out[7]:
In [8]:
products['cluster_nombre'] = inds
products['drink'] = 0
In [9]:
price_na = {}
for row in products.iterrows():
if np.isnan(row[1]['weight']):
products.set_value(row[0], 'drink', 1)
if np.isnan(row[1]['price']):
cluster = products[products['cluster_nombre']==row[1]['cluster_nombre']]
subcluster = cluster[cluster['brand']==row[1]['brand']]
subsubcluster = cluster[cluster['weight']==row[1]['weight']]
subsubmed = subsubcluster['price'].median()
submed = subcluster['price'].median()
med = cluster['price'].median()
if np.isnan(subsubmed):
if np.isnan(submed):
if np.isnan(med):
products.set_value(row[0], 'price', 0)
else:
products.set_value(row[0], 'price', med)
else:
products.set_value(row[0], 'price', submed)
else:
products.set_value(row[0], 'price', subsubmed)