In [4]:
import os
# First of all, set up the workspace: resolve the path to the folder that
# contains the train and test files.
os.path.abspath("data")


Out[4]:
'C:\\Users\\basau\\TFMPython\\data'

In [5]:
# Verify that we are in the right working directory.
os.getcwd()


Out[5]:
'C:\\Users\\basau\\TFMPython'

In [12]:
import numpy as np
import pandas
import matplotlib.pyplot as plt

filename_train = 'data/TrainingDataset.csv'
filename_test = 'data/TestDataset.csv'
# Load the two CSV files with pandas.
dataframe_train = pandas.read_csv(filename_train)
dataframe_test = pandas.read_csv(filename_test)
# Concatenate them into a single dataframe.
dataframe = pandas.concat([dataframe_train, dataframe_test])

quantitative_columns = [col for col in dataframe.columns if col.startswith("Quan")]

plt.figure()

# Variables to plot on a logarithmic scale:

#to_log = ["Quan_4", "Quan_5", "Quan_6", "Quan_7", "Quan_8", "Quan_9", "Quan_10", "Quan_11", "Quan_12", "Quan_13", "Quan_14", "Quan_15", "Quan_16", "Quan_17", "Quan_18", "Quan_19", "Quan_21", "Quan_22", "Quan_27", "Quan_28", "Quan_29", "Quant_22", "Quant_24", "Quant_25"]
to_log = ["Quan_4", "Quan_15", "Quan_16", "Quan_17", "Quan_18", "Quan_19", "Quan_21", "Quan_22", "Quant_22", "Quant_24", "Quant_25"]
# Loop over the quantitative columns and draw a histogram for each.
for i, col in enumerate(quantitative_columns):
    a = dataframe[col]
    print col, pandas.isnull(a).sum()  # column name and its missing-value count
    plt.subplot(4, 8, i + 1)  # subplot indices start at 1
    if col in to_log:
        a = np.log(a)
   
    plt.hist(a[pandas.notnull(a)].tolist(), bins=30, label=col)
    plt.legend()
print len(quantitative_columns)


plt.show() # Needed if you are not running in interactive mode.


Quan_1 0
Quan_10 811
Quan_11 811
Quan_12 811
Quan_13 811
Quan_14 811
Quan_15 24
Quan_16 481
Quan_17 547
Quan_18 1147
Quan_19 940
Quan_2 22
Quan_20 1195
Quan_21 1094
Quan_22 1238
Quan_26 0
Quan_27 0
Quan_28 0
Quan_29 0
Quan_3 6
Quan_30 0
Quan_4 135
Quan_5 811
Quan_6 811
Quan_7 811
Quan_8 811
Quan_9 811
Quant_22 1159
Quant_23 1138
Quant_24 683
Quant_25 728
31
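
The to_log list above was chosen by eye from these histograms. A sketch of an automated alternative (suggest_log_columns is a hypothetical helper, not part of the original pipeline): flag strictly positive columns whose skewness drops noticeably after a log transform.

def suggest_log_columns(df, columns, min_gain=1.0):
    # keep strictly positive columns whose skewness improves by at least
    # min_gain (an arbitrary threshold) under a log transform
    suggested = []
    for col in columns:
        a = df[col].dropna()
        if len(a) == 0 or (a <= 0).any():
            continue  # log is undefined for non-positive values
        if a.skew() - np.log(a).skew() > min_gain:
            suggested.append(col)
    return suggested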

In [1]:
import numpy as np
import pandas
import pickle
import gzip
import datetime

# List of quantitative columns that are best represented on a log scale
# (selected with the exploration above / explore.py).
to_log = ["Quan_4", "Quan_5", "Quan_6", "Quan_7", "Quan_8", "Quan_9", "Quan_10", "Quan_11", "Quan_12", "Quan_13", "Quan_14", "Quan_15", "Quan_16", "Quan_17", "Quan_18", "Quan_19", "Quan_21", "Quan_22", "Quan_27", "Quan_28", "Quan_29", "Quant_22", "Quant_24", "Quant_25"]

def create_dataset(dataframe_train, dataframe_test):
    # reference the module-level list of log-scale columns
    global to_log
    # Concatenate train and test to build the full dataset.
    dataframe = pandas.concat([dataframe_train, dataframe_test])
    # Compute the difference between the two date columns.
    dataframe['Date_3'] = dataframe.Date_1 - dataframe.Date_2
    train_size = dataframe_train.shape[0]
    X_categorical = []
    X_quantitative = []
    X_date = []
    X_id = []
    # Allocate a zero matrix for the 12 monthly outcome targets.
    ys = np.zeros((train_size,12), dtype=np.int)
    columns = []
    for col in dataframe.columns:
        if col.startswith('Cat_'):
            columns.append(col)
            uni = np.unique(dataframe[col])
            uni = uni.tolist()
            if len(uni) > 1:
                # One-hot encode the categorical variable.
                X_categorical.append(uni==dataframe[col].values[:,None])
        elif col.startswith('Quan_') or col.startswith('Quant_'):
            columns.append(col)
            # Apply the log transform if the column is flagged in to_log.
            if col in to_log:
                dataframe[col] = np.log(dataframe[col])
            # Impute missing values with the median. Caveat: as written, a
            # quantitative column is only kept if it has more than one
            # missing value; fully observed columns are dropped here.
            if (pandas.isnull(dataframe[col])).sum() > 1:
                tmp = dataframe[col].copy()
                # median imputation:
                tmp = tmp.fillna(tmp.median())
                X_quantitative.append(tmp.values)
        elif col.startswith('Date_'):
            columns.append(col)
            # If the date column has missing values, fill them with the median:
            tmp = dataframe[col].copy()
            if (pandas.isnull(tmp)).sum() > 1:
                # median imputation:
                tmp = tmp.fillna(tmp.median())
            X_date.append(tmp.values[:,None])
            # Extract year, month, and day to capture seasonal effects in sales:
            year = np.zeros((tmp.size,1))
            month = np.zeros((tmp.size,1))
            day = np.zeros((tmp.size,1))
            for i, date_number in enumerate(tmp):
                date = datetime.date.fromordinal(int(date_number))
                year[i,0] = date.year
                month[i,0] = date.month
                day[i,0] = date.day
            X_date.append(year)
            X_date.append(month)
            X_date.append(day)
            # Treat year, month, and day as categorical variables and
            # build their one-hot representation:
            X_date.append((np.unique(year)==year).astype(np.int))
            X_date.append((np.unique(month)==month).astype(np.int))
            X_date.append((np.unique(day)==day).astype(np.int))
        elif col=='id':
            pass # X_id.append(dataframe[col].values)
        elif col.startswith('Outcome_'):
            outcome_col_number = int(col.split('M')[1]) - 1
            tmp = dataframe[col][:train_size].copy()
            # median imputation:
            tmp = tmp.fillna(tmp.median())
            ys[:,outcome_col_number] = tmp.values
        else:
            raise NameError("unexpected column: %s" % col)

    X_categorical = np.hstack(X_categorical).astype(np.float32)
    X_quantitative = np.vstack(X_quantitative).astype(np.float32).T
    X_date = np.hstack(X_date).astype(np.float32)

    X = np.hstack([X_categorical, X_quantitative, X_date])
    X_train = X[:train_size,:]
    X_test = X[train_size:,:]
    return X_train, X_test, ys, columns
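
# Note on the encoding trick used in create_dataset: comparing the (k,)
# vector of unique values against an (n, 1) column broadcasts to an (n, k)
# boolean indicator matrix, i.e. a one-hot encoding. A minimal
# self-contained illustration (toy values, not from the dataset):
def _onehot_demo():
    vals = np.array(['a', 'b', 'a'])
    uni = np.unique(vals)                         # array(['a', 'b'])
    return (uni == vals[:, None]).astype(np.int)  # [[1,0],[0,1],[1,0]]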


def redundant_columns(X):
    """Identificar columnas redundantes.
    """
    idx = []
    for i in range(X.shape[1]-1):
        for j in range(i+1, X.shape[1]):
            if (X[:,i] == X[:,j]).all() :
                print i, '==', j
                idx.append(j)
    return np.unique(idx)
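
# redundant_columns above is O(m^2) in the number of columns, which gets
# slow with ~2000 columns. A sketch of a faster alternative (an assumption,
# not the original code): group byte-identical columns through a dict so
# each column is visited only once.
def redundant_columns_fast(X):
    seen = {}
    idx = []
    for j in range(X.shape[1]):
        key = X[:, j].tobytes()  # tostring() on very old numpy
        if key in seen:
            print seen[key], '==', j
            idx.append(j)
        else:
            seen[key] = j
    return np.unique(idx)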


if __name__ == '__main__':

    np.random.seed(0)

    filename_train = 'data/TrainingDataset.csv'
    filename_test = 'data/TestDataset.csv'
    dataframe_train = pandas.read_csv(filename_train)
    dataframe_test = pandas.read_csv(filename_test)
    # Note that dataframe has its columns in a different order than
    # dataframe_train and dataframe_test.
    
    """print "dataframe_train:", dataframe_train
    print
    print "dataframe_test:", dataframe_test
    """
    ids = dataframe_test.values[:,0].astype(np.int)

    X_train, X_test, ys, columns = create_dataset(dataframe_train, dataframe_test)
    
    print "Este es el dataset de entrenamiento: ", X_train
    print
    print "este es el dataset de test: ", X_test
    
    print
    print "Calculando columnas redundantes"
    X = np.vstack([X_train, X_test])
    idx = redundant_columns(X)
    columns_to_keep = list(set(range(X.shape[1])).difference(set(idx.tolist())))
    X = X[:,columns_to_keep]
    X_train = X[:X_train.shape[0], :]
    X_test = X[X_train.shape[0]:, :]
    
    print "Saving dataset."
    all_data = {"X_train": X_train,
                "X_test": X_test,
                "columns": columns,
                "ys": ys,
                "ids": ids,
                "redundant": idx}
    pickle.dump(all_data, gzip.open('all_data.pickle.gz', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
    print("Dataset saved. Everything OK")


This is the training dataset:  [[ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 ..., 
 [ 1.  0.  1. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]]

This is the test dataset:  [[ 1.  0.  1. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 1.  0.  1. ...,  0.  0.  0.]
 ..., 
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 1.  0.  1. ...,  0.  0.  0.]]

Computing redundant columns
26 == 805
30 == 232
30 == 523
30 == 1493
30 == 1863
31 == 403
32 == 404
37 == 1580
38 == 1581
39 == 1582
40 == 1583
41 == 1584
42 == 1585
43 == 1586
44 == 1587
48 == 1588
49 == 1589
50 == 1590
51 == 1591
52 == 1592
53 == 1593
54 == 1594
55 == 1595
56 == 1596
57 == 1597
58 == 1600
59 == 1601
60 == 1602
61 == 1603
62 == 1604
63 == 1605
66 == 1606
67 == 1607
68 == 1608
69 == 1609
70 == 1610
71 == 1611
72 == 1612
73 == 1613
74 == 1614
75 == 1615
76 == 1616
77 == 1617
78 == 1620
79 == 1621
80 == 1622
81 == 1623
98 == 415
98 == 425
99 == 289
99 == 416
99 == 426
99 == 1505
117 == 1327
129 == 202
130 == 203
143 == 301
144 == 302
145 == 303
146 == 304
154 == 1571
160 == 196
161 == 197
175 == 734
225 == 584
225 == 1153
232 == 523
232 == 1493
232 == 1863
249 == 1209
251 == 638
252 == 1404
253 == 640
254 == 538
254 == 1258
255 == 677
255 == 1296
261 == 966
262 == 1288
263 == 292
263 == 1299
266 == 801
266 == 1516
269 == 714
269 == 1356
270 == 716
270 == 1359
271 == 720
271 == 1363
271 == 1837
273 == 1530
274 == 742
274 == 1403
275 == 790
275 == 1470
276 == 746
276 == 1411
277 == 749
277 == 1414
278 == 750
278 == 1417
281 == 755
283 == 767
283 == 1440
285 == 806
285 == 1524
286 == 770
286 == 1447
287 == 773
287 == 1452
288 == 778
288 == 1455
289 == 416
289 == 426
289 == 1505
290 == 800
290 == 1515
292 == 1299
306 == 1281
384 == 672
384 == 1286
384 == 1845
384 == 1891
384 == 1924
415 == 425
416 == 426
416 == 1505
418 == 595
418 == 1168
422 == 1429
426 == 1505
444 == 674
444 == 1521
452 == 1062
474 == 1010
489 == 1031
493 == 1036
505 == 1055
508 == 1058
509 == 1059
516 == 1068
520 == 1075
523 == 1493
523 == 1863
524 == 1484
524 == 1928
527 == 1083
529 == 1085
531 == 1201
532 == 1100
533 == 1199
534 == 1368
536 == 1088
537 == 1089
538 == 1258
539 == 1463
539 == 1869
543 == 1095
546 == 1422
552 == 1107
560 == 1120
562 == 1124
565 == 1127
567 == 1132
572 == 1140
581 == 1150
582 == 1332
584 == 1153
585 == 1156
589 == 1159
591 == 1162
593 == 1164
594 == 1167
595 == 1168
598 == 1172
600 == 1177
601 == 1180
604 == 1185
609 == 1192
610 == 1193
612 == 1196
615 == 1200
618 == 1208
620 == 1211
621 == 1212
622 == 1213
623 == 1215
624 == 1216
625 == 1217
628 == 1223
629 == 1224
630 == 1226
632 == 1229
634 == 1231
635 == 1234
639 == 1239
642 == 1242
645 == 1250
646 == 1251
646 == 1750
647 == 1252
648 == 1190
649 == 1255
651 == 1256
651 == 1862
652 == 1257
654 == 1260
655 == 1261
659 == 1266
661 == 1269
665 == 1385
666 == 1275
668 == 1278
670 == 1282
672 == 1286
672 == 1845
672 == 1891
672 == 1924
674 == 1521
675 == 1289
676 == 1291
677 == 1296
679 == 1298
682 == 1304
686 == 1312
687 == 1315
688 == 1372
692 == 1319
693 == 1321
694 == 1322
697 == 1333
698 == 1335
699 == 1337
700 == 1338
702 == 1346
702 == 1772
704 == 1349
705 == 1350
707 == 1352
708 == 1453
710 == 1353
711 == 1354
714 == 1356
715 == 1357
715 == 1561
715 == 1563
715 == 1565
716 == 1359
717 == 1360
718 == 1361
719 == 1362
720 == 1363
720 == 1837
723 == 1370
725 == 1374
726 == 1375
727 == 1376
728 == 1377
728 == 1718
729 == 1379
731 == 1387
732 == 1388
733 == 1390
735 == 1394
735 == 1867
737 == 1395
740 == 1400
742 == 1403
743 == 1405
744 == 1407
746 == 1411
747 == 1412
748 == 1413
749 == 1414
750 == 1417
751 == 1418
752 == 1309
753 == 1419
753 == 1929
754 == 1423
757 == 1461
760 == 1430
761 == 1895
761 == 1940
762 == 1432
763 == 1433
765 == 1438
767 == 1440
768 == 1444
769 == 1445
770 == 1447
771 == 1448
773 == 1452
774 == 1536
775 == 1553
775 == 1557
775 == 1559
775 == 1566
775 == 1770
776 == 1178
776 == 1889
776 == 1920
778 == 1455
780 == 1456
781 == 1457
783 == 1460
784 == 1462
787 == 1466
788 == 1468
789 == 905
789 == 1469
790 == 1470
791 == 1472
792 == 1476
793 == 1480
794 == 1494
795 == 1495
797 == 1506
798 == 1509
799 == 1512
800 == 1515
801 == 1516
802 == 1537
803 == 1529
804 == 1520
806 == 1524
807 == 833
807 == 1525
807 == 1894
807 == 1930
808 == 1528
810 == 1531
811 == 1535
833 == 1525
833 == 1894
833 == 1930
843 == 1420
905 == 1469
971 == 998
971 == 1952
972 == 1381
977 == 1473
997 == 1951
998 == 1952
1003 == 1630
1004 == 1631
1005 == 1632
1006 == 1633
1007 == 1634
1008 == 1635
1082 == 1844
1096 == 1568
1103 == 1846
1116 == 1957
1178 == 1889
1178 == 1920
1251 == 1750
1256 == 1862
1286 == 1845
1286 == 1891
1286 == 1924
1287 == 1858
1301 == 1664
1306 == 1851
1330 == 1892
1330 == 1926
1343 == 1756
1346 == 1772
1357 == 1561
1357 == 1563
1357 == 1565
1363 == 1837
1369 == 1714
1377 == 1718
1394 == 1867
1398 == 1866
1419 == 1929
1441 == 1896
1441 == 1932
1449 == 1809
1463 == 1869
1474 == 1757
1484 == 1928
1490 == 1871
1491 == 1805
1492 == 1936
1493 == 1863
1502 == 1934
1510 == 1865
1523 == 1807
1525 == 1894
1525 == 1930
1549 == 1569
1552 == 1556
1552 == 1558
1552 == 1769
1553 == 1557
1553 == 1559
1553 == 1566
1553 == 1770
1556 == 1558
1556 == 1769
1557 == 1559
1557 == 1566
1557 == 1770
1558 == 1769
1559 == 1566
1559 == 1770
1560 == 1562
1561 == 1563
1561 == 1565
1563 == 1565
1566 == 1770
1739 == 1798
1740 == 1799
1789 == 1850
1845 == 1891
1845 == 1924
1885 == 1912
1888 == 1919
1889 == 1920
1890 == 1921
1891 == 1924
1892 == 1926
1894 == 1930
1895 == 1940
1896 == 1932
1897 == 1933
1898 == 1939
1967 == 1970
1967 == 1971
1970 == 1971
1983 == 1986
1983 == 1987
1986 == 1987
Saving dataset.
Dataset saved. Everything OK

In [ ]:
"""
Simple blender para los valores de regresion deseados durante meses

"""

import numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
import load_data
from sklearn.cross_validation import KFold
from sklearn.linear_model import Ridge, RidgeCV, LinearRegression
import pickle
import gzip
import math

def rmsle_loop(y, y_pred):
    assert len(y) == len(y_pred)
    terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i in range(len(y))]
    return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5
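
# An equivalent vectorized form of the metric (a sketch; gives the same
# result as the loop above for non-negative inputs):
def rmsle(y, y_pred):
    y = np.asarray(y, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y)) ** 2))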

if __name__ == '__main__':
    
    # Seed the RNG and set up 3-fold cross-validation.

    np.random.seed(0)
    n_folds = 3
    
    # Load the dataset.

    X, X_submission, ys, ids, idx = load_data.load()    
    
    # Transform the targets to log scale so training matches the RMSLE-style evaluation:
    ys = np.log(ys/500.0 + 1.0)      
    y_submission = np.zeros((X_submission.shape[0], 12))    

    # regs = [RandomForestRegressor(n_estimators=100, n_jobs=-1, max_features='auto'),
    #         ExtraTreesRegressor(n_estimators=100, n_jobs=-1, max_features='auto'),
    #         GradientBoostingRegressor(learning_rate=0.001, subsample=0.5, max_depth=6, n_estimators=20000)]

    # Use n_estimators=1000 so the run finishes faster (vs. 20000 in the commented variant above).
    regs = [GradientBoostingRegressor(learning_rate=0.001, subsample=0.5, max_depth=6, n_estimators=1000)]

    dataset_blend_train = np.zeros((X.shape[0], 12*len(regs)), dtype=np.double)
    dataset_blend_submission = np.zeros((X_submission.shape[0], 12*len(regs), n_folds), dtype=np.double)
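    # Layout of the blend matrices: for base regressor k and month i, column
    # 12*k + i of dataset_blend_train holds the out-of-fold predictions for
    # the training rows, while dataset_blend_submission keeps one slice of
    # test predictions per fold, which is averaged over folds further below.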
    
    
    for i in range(12):
        print "Month", i
        y = ys[:,i]
        kfcv = KFold(n=X.shape[0], n_folds=n_folds)
        for j, (train, test) in enumerate(kfcv):
            print "Fold", j
            for k, reg in enumerate(regs):
                print reg
                # Replace any NaN or infinite values before fitting.
                y[train] = np.nan_to_num(y[train])
                X[train] = np.nan_to_num(X[train])
                X[test] = np.nan_to_num(X[test])
                X_submission = np.nan_to_num(X_submission)
                # check for NaN or infinite values
                print "y has infinite values: ", np.isinf(y[train]).any()
                print "y has NaN values: ", np.isnan(y[train]).any()
                print "X has NaN values: ", np.isnan(X[train]).any()
                print "X has infinite values: ", np.isinf(X[train]).any()
                reg.fit(X[train], y[train])
                # store the out-of-fold predictions
                dataset_blend_train[test,12*k+i] = reg.predict(X[test])
                dataset_blend_submission[:,12*k+i,j] = reg.predict(X_submission)

    
    dataset_blend_submission_final = dataset_blend_submission.mean(2)
    print "dataset_blend_submission_final:", dataset_blend_submission_final.shape

    print "Blending."
    for i in range(12):
        print "Month", i, '-',
        y = ys[:,i]
        reg = RidgeCV(alphas=np.logspace(-2,4,40))
        reg.fit(dataset_blend_train, y)
        print "best_alpha =", reg.alpha_
        y_submission[:,i] = reg.predict(dataset_blend_submission_final)
                
    # transform the predictions back to the original scale:
    y_submission = (np.exp(y_submission) - 1.0) * 500.0
    
    print "Saving results to test.csv..."
    np.savetxt("test.csv", np.hstack([ids[:,None], y_submission]), fmt="%d", delimiter=',')
    print "Results saved to test.csv"
    # Report the in-sample RMSLE of the blend on the training data (the
    # test targets are unknown). Negative predictions are clipped so the
    # logs stay defined.
    ys = (np.exp(ys) - 1.0) * 500.0
    y_train_blend = np.maximum((np.exp(y_train_blend) - 1.0) * 500.0, 0.0)
    print rmsle_loop(ys.ravel(), y_train_blend.ravel())


Month 0
Fold 0
GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.001,
             loss='ls', max_depth=6, max_features=None,
             max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=1000,
             random_state=None, subsample=0.5, verbose=0, warm_start=False)
y has infinite values:  False
y has NaN values:  False
X has NaN values:  False
X has infinite values:  False
Fold 1
GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.001,
             loss='ls', max_depth=6, max_features=None,
             max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=1000,
             random_state=None, subsample=0.5, verbose=0, warm_start=False)
y has infinite values:  False
y has NaN values:  False
X has NaN values:  False
X has infinite values:  False
Fold 2
GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.001,
             loss='ls', max_depth=6, max_features=None,
             max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=1000,
             random_state=None, subsample=0.5, verbose=0, warm_start=False)
y has infinite values:  False
y has NaN values:  False
X has NaN values:  False
X has infinite values:  False
Month 1
Fold 0
GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.001,
             loss='ls', max_depth=6, max_features=None,
             max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=1000,
             random_state=None, subsample=0.5, verbose=0, warm_start=False)
y has infinite values: 
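
The target transform used above, y -> log(y/500 + 1), compresses the heavy right tail of the sales figures before fitting; its exact inverse, z -> (exp(z) - 1) * 500, restores the original scale. A quick round-trip self-check (toy values, not from the dataset):

import numpy as np
y = np.array([0.0, 500.0, 12345.0])
z = np.log(y / 500.0 + 1.0)
assert np.allclose((np.exp(z) - 1.0) * 500.0, y)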

In [3]:
"""
    Un cargados simple del conjunto de datos y que mezcla las lineas para crear 
un conjunto de datos aleatorio

"""

import pickle
import gzip
import numpy as np

def load(filename='all_data.pickle.gz', shuffle_train=False):
    """Load dataset. Shuffle train data if requested
    """
    f = gzip.open(filename)
    all_data = pickle.load(f)
    X_train = all_data['X_train']
    X_test = all_data['X_test']
    ys = all_data['ys']
    ids = all_data['ids']
    idx = np.arange(X_train.shape[0])
    if shuffle_train:
        idx = np.random.permutation(idx)
        X_train = X_train[idx, :]
        ys = ys[idx, :]
    return X_train, X_test, ys, ids, idx


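A typical call to the loader (a sketch; it assumes all_data.pickle.gz was produced by the preprocessing script above):

X_train, X_test, ys, ids, idx = load('all_data.pickle.gz', shuffle_train=True)
print X_train.shape, X_test.shape, ys.shape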
