In [1]:
import pandas as pd
import numpy as np
import os
import math
import graphlab
import graphlab as gl
import graphlab.aggregate as agg

In [2]:
'''钢炮'''
path = '/home/zongyi/bimbo_data/'
sf = gl.SFrame.read_csv(path + 'train.csv', verbose=False)
# town = gl.SFrame.read_csv(path + 'town_state.csv', verbose=False)


/usr/local/lib/python2.7/dist-packages/requests/packages/urllib3/util/ssl_.py:315: SNIMissingWarning: An HTTPS request has been made, but the SNI (Subject Name Indication) extension to TLS is not available on this platform. This may cause the server to present an incorrect TLS certificate, which can cause validation failures. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#snimissingwarning.
  SNIMissingWarning
/usr/local/lib/python2.7/dist-packages/requests/packages/urllib3/util/ssl_.py:120: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
[INFO] graphlab.cython.cy_server: GraphLab Create v2.0.1 started. Logging: /tmp/graphlab_server_1472162610.log
This non-commercial license of GraphLab Create for academic use is assigned to zong-yi.liu@irit.fr and will expire on July 13, 2017.

In [125]:
'''MAC'''
path = '/Users/zonemercy/jupyter_notebook/bimbo_data/'
sf = gl.SFrame.read_csv(path + 'train.csv', verbose=False)
# town = gl.SFrame.read_csv(path + 'town_state.csv', verbose=False)

In [3]:
t1 = town.groupby(key_columns=['Town'], operations={'t_c': agg.COUNT('Agencia_ID')})
town = town.join(t1, on='Town', how='left')
town = town['Agencia_ID','Town','t_c']
del t1


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-fad060748b3c> in <module>()
----> 1 t1 = town.groupby(key_columns=['Town'], operations={'t_c': agg.COUNT('Agencia_ID')})
      2 town = town.join(t1, on='Town', how='left')
      3 town = town['Agencia_ID','Town','t_c']
      4 del t1

NameError: name 'town' is not defined

In [ ]:
sf = sf.join(town,on='Agencia_ID',how='left')

In [8]:
print town.head()


+------------+-------------------------+-----+
| Agencia_ID |           Town          | t_c |
+------------+-------------------------+-----+
|    1110    |    2008 AG. LAGO FILT   |  4  |
|    1111    |  2002 AG. AZCAPOTZALCO  |  4  |
|    1112    |   2004 AG. CUAUTITLAN   |  4  |
|    1113    |    2008 AG. LAGO FILT   |  4  |
|    1114    |   2029 AG.IZTAPALAPA 2  |  6  |
|    1116    |   2011 AG. SAN ANTONIO  |  7  |
|    1117    |    2001 AG. ATIZAPAN    |  5  |
|    1118    |    2007 AG. LA VILLA    |  3  |
|    1119    | 2013 AG. MEGA NAUCALPAN |  8  |
|    1120    |  2018 AG. TEPALCATES 2  |  4  |
+------------+-------------------------+-----+
[10 rows x 3 columns]


In [9]:
tcc = sf.groupby(key_columns=['Town'], operations={'tcc':agg.COUNT('Cliente_ID')})
tcc['tcc'] = tcc['tcc']/100
tcc['tcc']=tcc['tcc'].astype(int)
print len(tcc['tcc'].unique())
tcc.sort('tcc')#,ascending=False)


251
Out[9]:
Town tcc
2089 AG. AZCAPOTZALCO
INSTITUCIONALES ...
0
2145 Cruce De Anden Norte 13
2152 Cruce De Anden
Noroeste ...
17
2169 Cruce De Andén
Sureste ...
32
2116 Cruce De Anden
Región Mexico ...
41
2655 LOS MOCHIS 51
2081 NAUCALPAN II 129
2410 CANCUN II 146
2647 BLM_AG. CAMPESINOS
AUTOSERVICI ...
152
2472 OJINAGA 156
[257 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [10]:
# sf = sf.join(tcc,on='Town',how='left')
town = town.join(tcc,on='Town',how='left')

In [11]:
print town


+------------+-------------------------+-----+------+
| Agencia_ID |           Town          | t_c | tcc  |
+------------+-------------------------+-----+------+
|    1110    |    2008 AG. LAGO FILT   |  4  | 3275 |
|    1111    |  2002 AG. AZCAPOTZALCO  |  4  | 4750 |
|    1112    |   2004 AG. CUAUTITLAN   |  4  | 3834 |
|    1113    |    2008 AG. LAGO FILT   |  4  | 3275 |
|    1114    |   2029 AG.IZTAPALAPA 2  |  6  | 3679 |
|    1116    |   2011 AG. SAN ANTONIO  |  7  | 7781 |
|    1117    |    2001 AG. ATIZAPAN    |  5  | 7784 |
|    1118    |    2007 AG. LA VILLA    |  3  | 6445 |
|    1119    | 2013 AG. MEGA NAUCALPAN |  8  | 8144 |
|    1120    |  2018 AG. TEPALCATES 2  |  4  | 5094 |
+------------+-------------------------+-----+------+
[790 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

tp_sum


In [12]:
sf['Demada_log'] = sf['Demanda_uni_equil'].apply(lambda x: math.log(x+1))

tp_sum = sf.groupby(key_columns=['Town','Producto_ID'], operations={'tp_sum':agg.SUM('Demada_log')})

In [13]:
town = town.join(tp_sum,on=['Town'],how='left')

In [14]:
sf = sf.join(tp_sum,on=['Town','Producto_ID'],how='left')

In [19]:
print town


+------------+------+-----+------+-------------+---------------+
| Agencia_ID | Town | t_c | tcc  | Producto_ID |     tp_sum    |
+------------+------+-----+------+-------------+---------------+
|   22187    |  84  |  1  | 3041 |    34768    | 177.708346361 |
|    1164    |  26  |  5  | 7154 |    48893    | 1542.84943441 |
|    1177    |  26  |  5  | 7154 |    48893    | 1542.84943441 |
|    1220    |  26  |  5  | 7154 |    48893    | 1542.84943441 |
|    1262    |  26  |  5  | 7154 |    48893    | 1542.84943441 |
|   20489    |  26  |  5  | 7154 |    48893    | 1542.84943441 |
|    1253    |  76  |  5  | 1266 |    33267    | 54.2948354786 |
|    2011    |  76  |  5  | 1266 |    33267    | 54.2948354786 |
|    2061    |  76  |  5  | 1266 |    33267    | 54.2948354786 |
|    2095    |  76  |  5  | 1266 |    33267    | 54.2948354786 |
+------------+------+-----+------+-------------+---------------+
[234840 rows x 6 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [18]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(town['Town'])

print le.classes_

town['Town'] = le.transform(town['Town'])


['2001 AG. ATIZAPAN' '2002 AG. AZCAPOTZALCO' '2003 AG. COACALCO'
 '2004 AG. CUAUTITLAN' '2007 AG. LA VILLA' '2008 AG. LAGO FILT'
 '2010 AG. LOS REYES' '2011 AG. SAN ANTONIO' '2013 AG. MEGA NAUCALPAN'
 '2014 AG. NEZA' '2015 AG. ROJO GOMEZ' '2016 AG. SAN LORENZO'
 '2017 AG. SANTA CLARA' '2018 AG. TEPALCATES 2' '2019 AG. XALOSTOC'
 '2021 AG. XOCHIMILCO 2' '2027 AG. VALLEJO' '2029 AG.IZTAPALAPA 2'
 '2031 AG. PANTITLAN' '2032 AG. SANTA LUCIA' '2034 ACAPULCO COSTERA'
 '2036 APIZACO MARINELA' '2040 AG. CENTRO' '2041 AG. TULTITLAN'
 '2042 AG. TEPOZOTLAN' '2044 CUERNAVACA BUENAVISTA' '2048 AG. IXTAPALUCA 1'
 '2049 AG. IZTAPALAPA 1' '2050 AG. MIXCOAC' '2053 OAXACA MARINELA'
 '2055 Pinotepa' '2056 PUEBLA NORTE MARINELA' '2057 PUEBLA SUR MARINELA'
 '2059 QUERETARO BALVANERA' '2059 Queretaro Balvanera'
 '2063 TEHUACAN MARINELA' '2064 AG. TEPALCATES 1' '2065 TOLUCA CENTRO'
 '2066 TOLUCA PONIENTE' '2070 AG. XOCHIMILCO 1' '2078 AG. TEXCOCO'
 '2080 TEXCOCO WONDER' '2081 NAUCALPAN II' '2083 AZCAPOTZALCO II MARINELA'
 '2087 AG. TIZAYUCA' '2088 AG. CEYLAN'
 '2089 AG. AZCAPOTZALCO INSTITUCIONALES' '2090 AG. TEPEJI DEL RIO'
 '2092 AG. SANTA CLARA AUTOSERVICIOS' '2094 CHALCO_BM'
 '2116 Cruce De Anden Regi\xc3\xb3n Mexico' '2145 Cruce De Anden Norte'
 '2151 ACAPULCO ESCENICA' '2152 ALTAMIRANO_BM'
 '2152 Cruce De Anden Noroeste' '2153 APATZINGAN' '2154 ATLACOMULCO'
 '2155 CELAYA NORPONIENTE' '2156 CELAYA INDUSTRIAL WONDER' '2157 CUAUTLA'
 '2158 CUERNAVACA CIVAC' '2159 CHILPANCINGO' '2160 IGUALA'
 '2161 IRAPUATO GUADALUPE' '2162 JOJUTLA_BM' '2163 LA PIEDAD'
 '2164 LAZARO CARDENAS' '2165 LEON SAN JUAN BOSCO' '2167 MORELIA LA HUERTA'
 '2168 MORELIA ABASTOS' '2169 Cruce De And\xc3\xa9n Sureste'
 '2169 PINOTEPA' '2170 PUERTO ESCONDIDO' '2171 QUERETARO SAN PABLO'
 '2173 SAN JUAN DEL RIO' '2174 SAN MIGUEL DE ALLENDE'
 '2175 TOLUCA AEROPUERTO' '2176 TOLUCA SAN ANTONIO' '2177 AGENCIA SUANDY'
 '2178 URIANGATO' '2179 URUAPAN' '2181 ZIHUATANEJO' '2182 ZITACUARO'
 '2186 LAGOS DE MORENO' '2187 SALAMANCA' '2188 MARAVATIO'
 '2189 ACAPULCO SAN ISIDRO' '2190 BI_SAN FRANCISCO DEL RINCON'
 '2251 AGUASCALIENTES NORTE' '2252 AGUASCALIENTES SIGLO XXI'
 '2253 ATOTONILCO' '2254 AUTLAN_BM' '2255 CD GUZMAN' '2256 CD. MANTE'
 '2257 CD. VALLES' '2258 COLIMA BIMBO' '2259 FRESNILLO BIMBO'
 '2260 GONZALEZ GALLO' '2262 LA PE\xc3\x91ITA' '2263 LIBERTAD'
 '2264 MANZANILLO BIMBO' '2265 MATEHUALA BIMBO' '2267 OCOTLAN BIMBO'
 '2268 PALOMAR' '2269 PUERTO VALLARTA BIMBO' '2271 RIO VERDE BIMBO'
 '2272 SAN LUIS REY' '2273 SAN PEDRO' '2274 TEPATITLAN BIMBO'
 '2275 TEPIC BIMBO' '2276 WONDER LA RAZA' '2277 ZACATECAS BIMBO'
 '2278 ZAPOPAN BIMBO' '2279 ACAPONETA' '2285 CD. CONSTITUCION'
 '2288 CULIACAN BIMBO' '2289 CULIACAN MARINELA' '2290 DURANGO BIMBO'
 '2293 GRANJAS MARINELA' '2294 GUADALAJARA' '2296 IRAPUATO SAN MIGUELITO'
 '2297 LA PAZ' '2300 LEON MILENIUM' '2301 CUBILETE' '2304 MARIANO OTERO_MM'
 '2305 MAZATLAN BIMBO' '2306 MAZATLAN MARINELA' '2309 NORTE'
 '2310 NUEVO IDEAL' '2313 SAN JOSE DEL CABO' '2314 SAN LUCAS'
 '2315 SAN LUIS POTOSI MARINELA' '2316 SANTA ROSALIA'
 '2321 ZACATECAS MARINELA' '2322 ZAMORA MADERO' '2325 TECOMAN'
 '2328 SANTIAGO IXCUINTLA' '2329 AG.HUEJUTLA' '2330 TLAQUEPARQUE'
 '2352 CORDOBA' '2353 HUAJUAPAN' '2354 ISLA' '2355 JALAPA I'
 '2358 MARTINEZ DE LA TORRE' '2359 AG.MIXQUIAHUALA' '2360 OAXACA BIMBO'
 '2361 ORIZABA' '2362 AG.PACHUCA' '2363 POZA RICA'
 '2364 PUEBLA NORTE BIMBO' '2365 PUEBLA SUR BIMBO' '2366 SAN ANDRES'
 '2367 SAN MARTIN T.' '2368 TAMPICO BIMBO' '2370 TEZIUTLAN'
 '2371 TIERRA BLANCA' '2372 AG.TULANCINGO' '2373 TUXPAM' '2374 TUXTEPEC'
 '2375 VERACRUZ NORTE' '2376 CONVENIENCIA VERACRUZ SUR'
 '2377 VERACRUZ SUR MLA' '2378 ACAYUCAN' '2379 ARRIAGA_BM' '2380 CAMPECHE'
 '2381 CANCUN_BM' '2382 CARDENAS' '2384 CIUDAD DEL CARMEN'
 '2386 COATZACOALCOS MLA.' '2387 COZUMEL' '2388 CHETUMAL'
 '2389 EMILIANO ZAPATA' '2390 INDUSTRIAL' '2391 IXTEPEC'
 '2392 MERIDA NORTE' '2393 MERIDA SUR' '2394 MINATITLAN'
 '2395 PLAYA DEL CARMEN' '2396 RUIZ CORTINEZ' '2397 SALINA CRUZ'
 '2398 SAN CRISTOBAL' '2399 TAPACHULA' '2401 TEKAX' '2402 TUXTLA'
 '2404 VALLADOLID' '2407 TEPEACA' '2408 ESCARCEGA' '2410 CANCUN II'
 '2411 COMITAN' '2412 APAN' '2413 CIUDAD SERD\xc3\x81N'
 '2414 IZUCAR DE MATAMOROS' '2415 JUXTLAHUACA' '2416 PACHUCA II'
 '2417 ZACATLAN' '2418 COMALCALCO' '2451 ACU\xc3\x91A' '2453 CASAS GRANDES'
 '2454 CUAHUTEMOC' '2455 CHIHUAHUA BIMBO' '2457 DELICIAS BIMBO'
 '2458 GOMEZ PALACIO BIMBO' '2461 HIDALGO BIMBO' '2462 JUAREZ BIMBO'
 '2465 LAREDO' '2466 LINARES' '2467 LINCOLN' '2468 MATAMOROS BIMBO'
 '2469 MONCLOVA BIMBO' '2472 OJINAGA' '2473 PARRAL BIMBO'
 '2474 PIEDRAS NEGRAS' '2475 REYNOSA BIMBO' '2476 SABINAS'
 '2477 SALTILLO BIMBO' '2478 SAN NICOLAS' '2479 SANTA CATARINA BIMBO'
 '2480 VICTORIA BIMBO' '2481 WONDER GUERRERO' '2485 CHIHUAHUA MARINELA'
 '2487 AEROPUERTO' '2488 JUAREZ' '2493 MITRAS' '2496 LA FE'
 '2499 REYNOSA MARINELA' '2502 SALTILLO MARINELA' '2504 TAMPICO MARINELA'
 '2505 TORREON' '2515 CAVAZOS' '2516 ESCOBEDO' '2517 SENDERO AUTOSERVICIO'
 '2520 HERMOSILLO SUR' '2551 AGUA PRIETA' '2552 CABORCA' '2553 CANANEA'
 '2554 CD. OBREGON' '2555 ENSENADA II' '2556 GUAMUCHIL' '2557 BW_GUASAVE'
 '2558 GUAYMAS' '2560 LAS TORRES' '2561 LOS MOCHIS' '2562 MEXICALI PLAZA'
 '2563 MEXICALI PONIENTE' '2564 MOCTEZUMA' '2565 NAVOJOA' '2566 NOGALES_BW'
 '2567 PUERTO PE\xc3\x91ASCO' '2568 SAN LUIS R.C.'
 '2569 SAN QUINT\xc3\x8dN' '2570 SANTA ANA' '2571 TECATE'
 '2572 TIJUANA EL FLORIDO' '2573 TIJUANA GATO BRONCO'
 '2574 TIJUANA PACIFICO' '2575 TIJUANA ROSARITO' '2576 GUERRERO NEGRO'
 '2647 BLM_AG. CAMPESINOS AUTOSERVICI' '2655 LOS MOCHIS'
 '3216 NUEVO LAREDO']

In [ ]:


In [5]:
'''n_t'''
test = gl.SFrame.read_csv(path + 'test.csv', verbose=False)

sf = sf[sf['Semana']==8]
del sf['Venta_uni_hoy']
del sf['Venta_hoy']
del sf['Dev_uni_proxima']
del sf['Dev_proxima']
del sf['Demanda_uni_equil']
# del sf['Demada_log']
# del sf['id']
del test['id']
sf = sf.append(test)

In [6]:
print sf


+--------+------------+----------+----------+------------+-------------+
| Semana | Agencia_ID | Canal_ID | Ruta_SAK | Cliente_ID | Producto_ID |
+--------+------------+----------+----------+------------+-------------+
|   8    |    1110    |    7     |   3301   |   15766    |     1212    |
|   8    |    1110    |    7     |   3301   |   15766    |     1216    |
|   8    |    1110    |    7     |   3301   |   15766    |     1220    |
|   8    |    1110    |    7     |   3301   |   15766    |     1238    |
|   8    |    1110    |    7     |   3301   |   15766    |     1240    |
|   8    |    1110    |    7     |   3301   |   15766    |     1242    |
|   8    |    1110    |    7     |   3301   |   15766    |     1250    |
|   8    |    1110    |    7     |   3301   |   15766    |     1309    |
|   8    |    1110    |    7     |   3301   |   15766    |     3894    |
|   8    |    1110    |    7     |   3301   |   15766    |    30531    |
+--------+------------+----------+----------+------------+-------------+
[17406119 rows x 6 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [7]:
town = gl.SFrame.read_csv(path + 'towns.csv', verbose=False)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(town['Town'])


town['Town'] = le.transform(town['Town']) 

town1 = town['Town','Agencia_ID']

In [8]:
print town1


+------+------------+
| Town | Agencia_ID |
+------+------------+
|  84  |   22187    |
|  26  |    1164    |
|  26  |    1177    |
|  26  |    1220    |
|  26  |    1262    |
|  26  |   20489    |
|  76  |    1253    |
|  76  |    2011    |
|  76  |    2061    |
|  76  |    2095    |
+------+------------+
[234840 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [9]:
sf = sf.join(town1,on='Agencia_ID',how='left')

In [10]:
n_t = sf.groupby(key_columns=['Semana','Town'], operations={'n_t':agg.COUNT('Town')})
n_t = n_t.groupby(key_columns=['Town'], operations={'n_t':agg.MEAN('n_t')})


---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-10-8a091fe2cce4> in <module>()
----> 1 n_t = sf.groupby(key_columns=['Semana','Town'], operations={'n_t':agg.COUNT('Town')})
      2 n_t = n_t.groupby(key_columns=['Town'], operations={'n_t':agg.MEAN('n_t')})

/usr/local/lib/python2.7/dist-packages/graphlab/data_structures/sframe.pyc in groupby(self, key_columns, operations, *args)
   4650                                                                   group_columns,
   4651                                                                   group_output_columns,
-> 4652                                                                   group_ops))
   4653 
   4654     def join(self, right, on=None, how='inner'):

/usr/local/lib/python2.7/dist-packages/graphlab/cython/context.pyc in __exit__(self, exc_type, exc_value, traceback)
     47             if not self.show_cython_trace:
     48                 # To hide cython trace, we re-raise from here
---> 49                 raise exc_type(exc_value)
     50             else:
     51                 # To show the full trace, we do nothing and let exception propagate

RuntimeError: Runtime Exception. Canceled by user

In [ ]:
n_t

In [ ]:
n_t.save(path+'town8.csv',format='csv')

In [ ]:


In [ ]:


In [13]:
town.save(path+'towns.csv',format='csv')

In [ ]:


In [ ]:


In [38]:
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('darkgrid', {'grid.color': '.8','grid.linestyle': u'--'}) 
%matplotlib inline

# figsize(16, 9)
# fig, axs = plt.subplots(3, 3)
# fig.subplots_adjust(hspace = .3) #, wspace=.1)

# data = np.arange(3, 10)
# for ax, d in zip(axs.ravel(), data):
x = sf1['tp_sum']
y = sf1['Demanda_uni_equil']
plot = plt.scatter(x, y, alpha=.2)
del x
del y
del plot