In [1]:
import os
import math
import pandas as pd
import numpy as np
from dask import dataframe as dd
import graphlab
import graphlab as gl
import graphlab.aggregate as agg
In [4]:
# Data directory. Defaults to the original location but can be overridden by
# setting the BIMBO_DATA_DIR environment variable, so the notebook is not tied
# to one machine. os.path.join(..., '') guarantees a trailing separator, which
# code that builds file names as `path + '<file>'` relies on.
path = os.path.join(os.environ.get('BIMBO_DATA_DIR', '/home/zongyi/bimbo_data/'), '')

# Full training table; verbose=False silences the CSV parser's progress output.
sf = gl.SFrame.read_csv(path + 'train.csv', verbose=False)

# Disabled loads of auxiliary tables. Note that they point at a different
# directory (/Users/zonemercy/...) than `path`; adjust before re-enabling.
# prom = gl.SFrame.read_csv('/Users/zonemercy/jupyter_notebook/bimbo_data/prom_prod.csv', verbose=False)
# cluster = gl.SFrame.read_csv('/Users/zonemercy/jupyter_notebook/bimbo_data/prod_cluster.csv', verbose=False)
# prod = gl.SFrame.read_csv('/Users/zonemercy/jupyter_notebook/bimbo_data/preprocessed_products.csv', verbose=False)
# state = gl.SFrame.read_csv('/Users/zonemercy/jupyter_notebook/bimbo_data/town_state.csv', verbose=False)
In [18]:
# Log-transformed demand: log(demand + 1). The column name's spelling
# ('Demada_log') is kept as-is because other cells read it by that name.
demand_plus_one = sf['Demanda_uni_equil'] + 1
sf['Demada_log'] = demand_plus_one.apply(math.log)

# Unit price of each row: sales amount divided by units sold.
# NOTE(review): rows where Venta_uni_hoy is 0 divide by zero here; the original
# "#plus 1" remark suggests adding 1 to the denominator was considered. Confirm
# how such rows should be handled before relying on aggregated prices.
sales_amount = sf['Venta_hoy']
units_sold = sf['Venta_uni_hoy']
sf['price'] = sales_amount / units_sold
In [22]:
# Mean unit price per product, written out as price.csv.
#
# The result is stored under its own name instead of overwriting `sf`: the
# aggregate only has the columns Producto_ID and price, so rebinding `sf` to it
# would strip the other columns (Semana, Demanda_uni_equil, Ruta_SAK, ...) that
# cells working on `sf` still need when the notebook is run top to bottom.
price_by_product = sf[['Producto_ID', 'price']].groupby(
    key_columns=['Producto_ID'],
    operations={'price': agg.MEAN('price')},
)
price_by_product.save(path + 'price.csv', format='csv')
In [16]:
# Pick one random row of `sf` and keep every row that shares its product ID.
#
# np.random.choice(n) without a size argument returns a scalar, which avoids
# converting a one-element array to int (deprecated in recent NumPy releases).
# No seed is set in this cell, so a different product may be drawn on each run.
random_row = int(np.random.choice(len(sf)))
pid = sf[random_row]['Producto_ID']
rpid = sf[sf['Producto_ID'] == pid]
In [17]:
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('darkgrid', {'grid.color': '.8','grid.linestyle': u'--'})
%matplotlib inline
figsize(16, 9)
fig, axs = plt.subplots(3, 3)
fig.subplots_adjust(hspace = .3) #, wspace=.1)
data = np.arange(3, 10)
for ax, d in zip(axs.ravel(), data):
ax.scatter(rpid[rpid['Semana']==d]['price'],
rpid[rpid['Semana']==d]['Demanda_uni_equil'],alpha=.1)
ax.set_title('Week '+str(d))
ax.set_ylim([0,6])
In [113]:
# Rows of the selected product with price below 4 and log-demand above 1.
low_price = rpid['price'] < 4
high_log_demand = rpid['Demada_log'] > 1
rpid[low_price & high_log_demand]
Out[113]:
In [112]:
# Price vs. demand for the selected product, excluding Ruta_SAK 900.
# x = rpid['price']
# y = rpid['Demanda_uni_equil']
# Apply the route filter once and take both columns from the same result.
rpid_without_900 = rpid[rpid['Ruta_SAK'] != 900]
x = rpid_without_900['price']
y = rpid_without_900['Demanda_uni_equil']
plt.scatter(x, y, alpha=0.2)
plt.xlabel('price')
plt.ylabel('Demanda_uni_equil')
plt.show()