In [1]:
from IPython.display import Image
Image(filename='images/phd053104s.png')


Out[1]:

A stroll through pandas


In [2]:
from IPython.display import IFrame
IFrame('http://pandas.pydata.org/', width=900, height=350)


Out[2]:

We import the libraries we are going to use ...


In [7]:
import pandas as pd
import numpy as np

In [8]:
more data/train.csv

In [9]:
df = pd.read_csv('data/train.csv')
# Replace our 'y' (target) values with integer codes.
types = np.sort(np.unique(df['target']))
new_values = dict(zip(types, range(types.shape[0])))
df['target'] = df['target'].map(new_values).astype(np.int32)

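By the way, pd.factorize can do this encoding in one step; a sketch of the alternative (sort=True keeps the same ordering as np.sort above):

codes, classes = pd.factorize(df['target'], sort=True)  # codes: integer labels, classes: the distinct values in sorted order
# df['target'] = codes.astype(np.int32)
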
**Also see: read_excel, read_clipboard, read_fwf, read_html, read_json, read_sql**

Now let's take a look at how our dataset is put together ...


In [10]:
df.head(5)


Out[10]:
id feat_1 feat_2 feat_3 feat_4 feat_5 feat_6 feat_7 feat_8 feat_9 ... feat_85 feat_86 feat_87 feat_88 feat_89 feat_90 feat_91 feat_92 feat_93 target
0 1 1 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
1 2 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
2 3 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
3 4 1 0 0 1 6 1 5 0 0 ... 0 1 2 0 0 0 0 0 0 0
4 5 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 1 0 0 0 0

5 rows × 95 columns

To get a basic feel for how our data behaves we can ask pandas for summary statistics with the following method


In [11]:
df.describe()


Out[11]:
id feat_1 feat_2 feat_3 feat_4 feat_5 feat_6 feat_7 feat_8 feat_9 ... feat_85 feat_86 feat_87 feat_88 feat_89 feat_90 feat_91 feat_92 feat_93 target
count 61878.000000 61878.00000 61878.000000 61878.000000 61878.000000 61878.000000 61878.000000 61878.000000 61878.000000 61878.000000 ... 61878.000000 61878.000000 61878.000000 61878.000000 61878.000000 61878.000000 61878.000000 61878.000000 61878.000000 61878.000000
mean 30939.500000 0.38668 0.263066 0.901467 0.779081 0.071043 0.025696 0.193704 0.662433 1.011296 ... 0.532306 1.128576 0.393549 0.874915 0.457772 0.812421 0.264941 0.380119 0.126135 3.842335
std 17862.784315 1.52533 1.252073 2.934818 2.788005 0.438902 0.215333 1.030102 2.255770 3.474822 ... 1.900438 2.681554 1.575455 2.115466 1.527385 4.597804 2.045646 0.982385 1.201720 2.510794
min 1.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 15470.250000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
50% 30939.500000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.000000
75% 46408.750000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 ... 0.000000 1.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.000000
max 61878.000000 61.00000 51.000000 64.000000 70.000000 19.000000 10.000000 38.000000 76.000000 43.000000 ... 55.000000 65.000000 67.000000 30.000000 61.000000 130.000000 52.000000 19.000000 87.000000 8.000000

8 rows × 95 columns


In [12]:
df.corr()


Out[12]:
id feat_1 feat_2 feat_3 feat_4 feat_5 feat_6 feat_7 feat_8 feat_9 ... feat_85 feat_86 feat_87 feat_88 feat_89 feat_90 feat_91 feat_92 feat_93 target
id 1.000000 0.070691 0.185463 0.138980 1.460200e-01 0.030707 0.033410 0.136501 0.197418 -0.156082 ... -0.084507 -0.096484 0.097087 -0.215878 0.111519 0.188895 0.139078 0.131737 0.047944 0.977144
feat_1 0.070691 1.000000 0.031332 -0.027807 -2.752941e-02 0.042973 0.043603 0.298952 0.056321 -0.032285 ... -0.008739 0.107947 0.089374 0.020830 0.096851 0.010310 0.037264 0.054777 0.081783 0.072984
feat_2 0.185463 0.031332 1.000000 0.082573 1.349870e-01 0.020926 0.041343 0.222386 0.019815 -0.025630 ... -0.006764 -0.039090 0.047451 -0.047035 0.105527 0.515022 0.026383 -0.008219 0.054593 0.191739
feat_3 0.138980 -0.027807 0.082573 1.000000 5.835232e-01 0.010880 0.004288 0.001294 -0.053462 -0.063551 ... -0.048626 -0.096093 -0.009838 -0.082336 0.174781 -0.015068 -0.012417 0.066921 0.006814 0.148729
feat_4 0.146020 -0.027529 0.134987 0.583523 1.000000e+00 0.017290 0.014059 0.014490 -0.046184 -0.046250 ... -0.033153 -0.071029 0.005055 -0.067484 0.183715 0.009454 -0.010312 0.087631 0.015746 0.153458
feat_5 0.030707 0.042973 0.020926 0.010880 1.729026e-02 1.000000 0.145355 0.075047 0.035861 -0.024708 ... 0.034062 0.013879 0.013999 -0.019201 0.119951 0.004842 0.012012 0.065331 0.002038 0.036034
feat_6 0.033410 0.043603 0.041343 0.004288 1.405895e-02 0.145355 1.000000 0.088014 0.012867 -0.009373 ... 0.004290 0.010455 0.015256 -0.015437 0.035042 0.054034 0.012465 0.015479 0.008521 0.037116
feat_7 0.136501 0.298952 0.222386 0.001294 1.448981e-02 0.075047 0.088014 1.000000 0.038121 -0.027146 ... 0.037874 -0.009169 0.089574 -0.033646 0.063511 0.129578 0.068506 -0.032261 0.034912 0.142172
feat_8 0.197418 0.056321 0.019815 -0.053462 -4.618407e-02 0.035861 0.012867 0.038121 1.000000 -0.039281 ... -0.003416 -0.029395 0.059929 -0.050931 0.007974 0.026807 0.095990 0.013608 0.005131 0.209725
feat_9 -0.156082 -0.032285 -0.025630 -0.063551 -4.624977e-02 -0.024708 -0.009373 -0.027146 -0.039281 1.000000 ... -0.031462 -0.019144 -0.016925 0.001160 -0.019147 -0.020698 -0.014742 -0.069707 -0.006038 -0.175350
feat_10 0.096127 0.097776 0.051925 0.036944 5.951396e-02 0.091324 0.041940 0.194258 -0.000023 -0.024323 ... 0.086758 0.159447 0.077421 0.054635 0.061498 0.049908 0.024025 -0.006869 0.041316 0.095383
feat_11 0.179164 -0.042928 0.118534 0.596243 3.894092e-01 0.004882 0.014504 0.012418 -0.065923 -0.075820 ... -0.074293 -0.123339 -0.032969 -0.114491 0.137374 0.045074 -0.029511 0.013179 0.003326 0.189551
feat_12 0.079170 0.056934 0.090153 0.050037 5.743356e-02 0.036668 0.028588 0.056230 0.091424 -0.021885 ... 0.019283 -0.007214 0.016089 -0.024324 0.082220 0.062721 0.063965 0.063922 0.012722 0.082567
feat_13 0.184629 0.139254 0.157467 0.013870 2.897317e-02 0.059081 0.036293 0.199142 0.095365 -0.040164 ... 0.002594 0.004850 0.093870 -0.036259 0.062990 0.107722 0.044338 0.071953 0.038989 0.194200
feat_14 -0.346308 0.063517 -0.070057 -0.111105 -9.921490e-02 -0.037607 -0.027350 -0.044671 -0.061799 -0.110188 ... -0.021455 0.145787 -0.020229 0.323089 -0.038881 -0.060240 -0.038444 -0.040133 -0.018127 -0.365092
feat_15 -0.245562 -0.045738 -0.048798 -0.065285 -5.122155e-02 -0.007000 -0.018328 -0.035721 -0.056960 0.009858 ... 0.246847 -0.002529 -0.023191 0.010840 0.029547 -0.046616 -0.034402 -0.018206 -0.020369 -0.259047
feat_16 -0.004622 0.027086 0.108046 0.221426 2.110780e-01 0.062877 0.021934 0.043957 -0.004659 -0.082664 ... 0.110850 0.003610 0.077770 -0.007257 0.248364 0.016863 0.048494 0.210499 0.031467 -0.004812
feat_17 0.122884 0.053004 0.074902 -0.023093 -7.553867e-03 0.062197 0.015488 0.127245 0.173912 -0.028709 ... 0.015559 0.049102 0.214221 -0.034139 0.035390 0.045218 0.088508 -0.006538 0.056695 0.132741
feat_18 0.189280 0.084856 0.242716 0.115655 2.148952e-01 0.052186 0.048710 0.098972 0.087777 -0.043642 ... -0.001555 -0.029295 0.126886 -0.035981 0.247462 0.094336 0.037275 0.126640 0.058100 0.200805
feat_19 0.127893 0.002302 0.176655 -0.012228 -3.519107e-07 -0.008556 0.038493 0.058071 0.019387 -0.000167 ... -0.008292 -0.014560 0.000412 -0.018485 0.011116 0.450925 0.004085 -0.027662 0.014243 0.131549
feat_20 0.213630 0.070511 0.449160 -0.011069 4.465657e-02 0.046200 0.057813 0.364972 0.062595 -0.023397 ... 0.084570 0.016850 0.220475 0.004081 0.111231 0.370282 0.079181 -0.018715 0.110054 0.219909
feat_21 0.012886 -0.027026 0.014113 0.354925 2.329227e-01 0.003288 0.008046 -0.022908 -0.041095 -0.028409 ... -0.006180 -0.045562 -0.016862 -0.030401 0.105392 -0.033193 -0.019779 0.058008 -0.007677 0.013486
feat_22 0.190144 0.063283 0.215106 0.251082 2.477378e-01 0.075161 0.038939 0.162620 0.029032 -0.062348 ... 0.044396 -0.018347 0.219974 -0.045439 0.244779 0.098595 0.104921 0.200593 0.113276 0.200545
feat_23 0.102980 0.048686 0.162065 -0.002427 3.062225e-02 0.017281 0.043651 0.186462 0.012774 0.006940 ... 0.056994 0.121170 0.111837 -0.014039 0.059743 0.141869 0.010438 -0.031837 0.084945 0.104259
feat_24 0.215890 0.067255 0.253684 -0.031596 3.727726e-03 0.075222 0.082124 0.244813 0.161848 0.073618 ... -0.018990 0.015444 0.123298 -0.043479 0.023581 0.357270 0.090833 -0.024375 0.089200 0.226936
feat_25 -0.276763 0.187237 -0.096366 -0.157459 -1.342306e-01 -0.003610 -0.023319 -0.048820 -0.036939 -0.025279 ... 0.021119 0.263924 -0.011294 0.207974 -0.012866 -0.088187 -0.045759 0.030135 -0.015708 -0.294079
feat_26 0.133917 -0.022813 0.064856 0.268112 3.657567e-01 0.025116 0.004680 -0.008782 -0.041599 -0.066414 ... -0.048889 -0.072464 0.015937 -0.078470 0.094521 -0.021565 -0.018447 0.199974 0.016709 0.141617
feat_27 0.097531 -0.038826 0.037841 0.508370 3.086287e-01 0.002098 0.001943 -0.015429 -0.050272 -0.042531 ... -0.046053 -0.082510 -0.028097 -0.070194 0.099536 -0.025263 -0.018778 0.023790 0.000318 0.103288
feat_28 0.128876 -0.030257 0.072494 0.551398 4.864171e-01 0.047688 0.017132 0.000998 -0.036668 -0.055545 ... -0.039784 -0.080806 -0.002941 -0.074442 0.169794 -0.021330 -0.015242 0.122653 0.005275 0.136553
feat_29 0.117175 0.069266 0.025689 -0.004141 1.427066e-02 0.065957 0.002389 0.046231 0.104985 -0.021328 ... 0.013104 -0.011960 0.038800 -0.032585 0.055398 -0.000185 0.040526 0.084445 0.008301 0.122533
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
feat_65 0.064755 0.110041 0.078801 0.065492 6.228472e-02 0.228349 0.066867 0.202346 0.025544 -0.038163 ... 0.066922 0.030362 0.050138 -0.011600 0.125884 0.029076 0.001188 0.044286 0.015500 0.066557
feat_66 0.174734 0.053010 0.175620 0.088017 1.296545e-01 0.048364 0.033285 0.122660 0.115175 -0.001778 ... 0.031291 0.088686 0.206406 -0.000679 0.205289 0.094925 0.098063 0.123694 0.067957 0.183488
feat_67 0.186622 0.154301 0.068667 -0.110081 -8.045694e-02 0.061964 0.038289 0.148598 0.320949 0.176921 ... -0.020048 0.081445 0.295803 -0.058706 0.005220 0.089262 0.112052 -0.011247 0.129018 0.199869
feat_68 0.182707 0.014674 -0.012802 -0.030992 -2.009191e-02 0.107405 0.021619 0.040309 0.075384 -0.012192 ... -0.024620 -0.038904 0.001672 -0.059843 0.125150 -0.023839 0.022515 0.095970 -0.004602 0.190042
feat_69 0.210949 0.007544 0.307406 -0.032748 -1.446082e-02 -0.003294 0.074836 0.131430 0.046258 -0.029335 ... -0.006954 -0.025538 0.027690 -0.022918 0.011806 0.549489 0.041206 -0.037961 0.032052 0.217790
feat_70 0.043618 0.165442 0.112968 -0.018774 2.079779e-02 0.118510 0.052401 0.237907 0.023089 -0.056205 ... 0.361941 0.225792 0.212133 0.140850 0.163631 0.074178 0.030560 0.007310 0.093488 0.033185
feat_71 0.129365 0.013712 -0.002336 -0.053020 -4.241268e-02 0.056428 0.011901 0.115813 0.081664 0.043286 ... 0.013894 -0.015410 0.060004 -0.048676 0.076348 -0.019694 0.050622 0.000368 0.002001 0.135428
feat_72 -0.199876 -0.029983 -0.023267 -0.045339 -2.979578e-02 0.005177 -0.011090 -0.014921 -0.029868 -0.058147 ... 0.294384 0.008897 0.013536 0.004066 0.057040 -0.030673 -0.008936 0.005300 -0.008233 -0.208646
feat_73 0.075773 0.140815 0.039192 -0.013972 -1.128547e-02 0.001609 0.025023 0.022819 0.028999 0.022679 ... -0.010675 -0.000841 -0.004759 -0.026363 -0.006704 0.070001 0.007193 -0.024017 -0.000163 0.078975
feat_74 0.062870 0.051365 0.070724 0.041559 4.909735e-02 0.017265 0.043160 0.053059 -0.000431 0.007594 ... -0.000453 -0.015945 0.003992 -0.025207 0.042104 0.055372 0.016941 0.004497 0.021967 0.065925
feat_75 0.199319 0.011596 0.093689 -0.044724 -3.145389e-02 0.015279 0.006951 0.039865 0.031466 -0.027313 ... -0.026329 -0.031401 0.001201 -0.058630 -0.014925 0.160418 -0.002625 -0.037710 0.006208 0.206344
feat_76 0.189897 0.153808 0.259360 -0.028670 -1.379188e-02 0.035570 0.073867 0.375114 0.081682 -0.027424 ... 0.000682 0.010324 0.063411 -0.050417 0.023242 0.291884 0.175163 -0.050887 0.029426 0.199383
feat_77 0.053024 0.123752 0.014911 -0.001584 1.531773e-02 0.030462 0.006501 0.005769 0.027486 -0.020185 ... 0.005602 0.020294 0.019275 -0.007396 0.021591 -0.004988 0.026376 0.076551 0.001715 0.058228
feat_78 0.104017 0.279202 0.094256 -0.021979 -1.449856e-02 0.070709 0.061250 0.567084 0.079623 -0.015922 ... 0.004071 -0.018797 0.063539 -0.030010 0.014639 0.043339 0.068450 -0.028596 0.016047 0.109507
feat_79 0.162963 0.228912 0.033668 -0.020566 -1.083473e-02 0.055115 0.009942 0.066753 0.083714 -0.036116 ... 0.004663 0.095254 0.099579 -0.018615 0.073207 0.031099 0.021616 0.162033 0.029082 0.170379
feat_80 0.141716 -0.013303 0.155768 0.442036 4.057725e-01 0.026223 0.017648 0.028860 -0.038382 -0.046721 ... -0.035876 -0.081888 -0.004588 -0.076250 0.350787 0.012623 -0.017815 0.063401 0.012651 0.149499
feat_81 0.078358 0.032427 0.052101 0.013089 2.828377e-02 0.129333 0.044136 0.144308 0.035102 -0.005847 ... 0.054972 0.013808 0.084096 -0.017469 0.166234 0.009379 0.017243 0.018565 0.019378 0.082662
feat_82 0.113915 -0.026085 0.119109 0.438458 4.365413e-01 0.057400 0.014907 0.022059 -0.034409 -0.039806 ... -0.034368 -0.065189 -0.012153 -0.059553 0.266249 -0.001795 -0.014641 0.049661 0.005497 0.120733
feat_83 0.161417 0.059165 0.371691 -0.019914 -1.051874e-03 0.008006 0.035145 0.282069 0.033479 -0.032875 ... -0.009157 -0.029711 0.072006 -0.052930 0.035181 0.243942 0.095801 -0.018325 0.054188 0.165828
feat_84 -0.008192 0.049634 0.009845 0.011159 5.684499e-03 0.467329 0.177777 0.062634 0.005064 -0.013569 ... -0.010210 -0.003459 0.013631 -0.017903 0.103643 -0.006013 -0.003444 0.048431 0.003723 -0.003308
feat_85 -0.084507 -0.008739 -0.006764 -0.048626 -3.315343e-02 0.034062 0.004290 0.037874 -0.003416 -0.031462 ... 1.000000 0.109643 0.049250 0.027886 0.053582 -0.003931 -0.023091 -0.043484 0.023390 -0.102359
feat_86 -0.096484 0.107947 -0.039090 -0.096093 -7.102916e-02 0.013879 0.010455 -0.009169 -0.029395 -0.019144 ... 0.109643 1.000000 0.073685 0.426972 -0.011822 -0.019803 -0.024005 -0.049393 0.029035 -0.115179
feat_87 0.097087 0.089374 0.047451 -0.009838 5.054728e-03 0.013999 0.015256 0.089574 0.059929 -0.016925 ... 0.049250 0.073685 1.000000 0.023053 0.066008 0.014696 0.028850 0.001424 0.499990 0.101345
feat_88 -0.215878 0.020830 -0.047035 -0.082336 -6.748367e-02 -0.019201 -0.015437 -0.033646 -0.050931 0.001160 ... 0.027886 0.426972 0.023053 1.000000 -0.022552 -0.031679 -0.033653 -0.070120 -0.008631 -0.240481
feat_89 0.111519 0.096851 0.105527 0.174781 1.837145e-01 0.119951 0.035042 0.063511 0.007974 -0.019147 ... 0.053582 -0.011822 0.066008 -0.022552 1.000000 0.027764 0.015917 0.129622 0.030650 0.113492
feat_90 0.188895 0.010310 0.515022 -0.015068 9.454061e-03 0.004842 0.054034 0.129578 0.026807 -0.020698 ... -0.003931 -0.019803 0.014696 -0.031679 0.027764 1.000000 0.014812 -0.035311 0.039864 0.195439
feat_91 0.139078 0.037264 0.026383 -0.012417 -1.031241e-02 0.012012 0.012465 0.068506 0.095990 -0.014742 ... -0.023091 -0.024005 0.028850 -0.033653 0.015917 0.014812 1.000000 0.104226 -0.000045 0.146567
feat_92 0.131737 0.054777 -0.008219 0.066921 8.763105e-02 0.065331 0.015479 -0.032261 0.013608 -0.069707 ... -0.043484 -0.049393 0.001424 -0.070120 0.129622 -0.035311 0.104226 1.000000 -0.003653 0.145157
feat_93 0.047944 0.081783 0.054593 0.006814 1.574563e-02 0.002038 0.008521 0.034912 0.005131 -0.006038 ... 0.023390 0.029035 0.499990 -0.008631 0.030650 0.039864 -0.000045 -0.003653 1.000000 0.049821
target 0.977144 0.072984 0.191739 0.148729 1.534582e-01 0.036034 0.037116 0.142172 0.209725 -0.175350 ... -0.102359 -0.115179 0.101345 -0.240481 0.113492 0.195439 0.146567 0.145157 0.049821 1.000000

95 rows × 95 columns


In [13]:
df['feat_11'].corr(df['feat_90'])


Out[13]:
0.045073721280135606
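
Since the correlation matrix is just a DataFrame, we can slice it to spot the features that move most with the target; for example (the 0.3 threshold is arbitrary):

corr_with_target = df.corr()['target'].drop(['id', 'target'])
print corr_with_target[corr_with_target.abs() > 0.3]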

We can select just some of the columns of our DataFrame...


In [14]:
df[['feat_1', 'target']]


Out[14]:
feat_1 target
0 1 0
1 0 0
2 0 0
3 1 0
4 0 0
5 2 0
6 2 0
7 0 0
8 0 0
9 0 0
10 0 0
11 0 0
12 1 0
13 0 0
14 0 0
15 0 0
16 0 0
17 0 0
18 0 0
19 0 0
20 0 0
21 0 0
22 0 0
23 0 0
24 0 0
25 0 0
26 2 0
27 0 0
28 0 0
29 2 0
... ... ...
61848 0 8
61849 5 8
61850 0 8
61851 3 8
61852 0 8
61853 0 8
61854 0 8
61855 0 8
61856 0 8
61857 0 8
61858 4 8
61859 0 8
61860 0 8
61861 0 8
61862 2 8
61863 0 8
61864 0 8
61865 0 8
61866 0 8
61867 0 8
61868 0 8
61869 0 8
61870 1 8
61871 0 8
61872 0 8
61873 1 8
61874 4 8
61875 0 8
61876 1 8
61877 0 8

61878 rows × 2 columns


In [15]:
df.loc[2:5, ['id', 'target']]


Out[15]:
id target
2 3 0
3 4 0
4 5 0
5 6 0

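loc also accepts boolean masks, which is handy for filtering rows; a small sketch (the threshold is arbitrary):

mask = df['feat_1'] > 3                      # one boolean per row
df.loc[mask, ['id', 'feat_1', 'target']].head()
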
What do we do when we have incomplete data?


In [16]:
df.isnull().any()  # df.isnull() gives the full boolean matrix


Out[16]:
id         False
feat_1     False
feat_2     False
feat_3     False
feat_4     False
feat_5     False
feat_6     False
feat_7     False
feat_8     False
feat_9     False
feat_10    False
feat_11    False
feat_12    False
feat_13    False
feat_14    False
feat_15    False
feat_16    False
feat_17    False
feat_18    False
feat_19    False
feat_20    False
feat_21    False
feat_22    False
feat_23    False
feat_24    False
feat_25    False
feat_26    False
feat_27    False
feat_28    False
feat_29    False
           ...  
feat_65    False
feat_66    False
feat_67    False
feat_68    False
feat_69    False
feat_70    False
feat_71    False
feat_72    False
feat_73    False
feat_74    False
feat_75    False
feat_76    False
feat_77    False
feat_78    False
feat_79    False
feat_80    False
feat_81    False
feat_82    False
feat_83    False
feat_84    False
feat_85    False
feat_86    False
feat_87    False
feat_88    False
feat_89    False
feat_90    False
feat_91    False
feat_92    False
feat_93    False
target     False
dtype: bool

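If there were missing values, isnull().sum() would give a per-column count instead of just a flag; for instance:

print df.isnull().sum().sum()   # total number of missing cells in the whole DataFrame
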
And duplicates?


In [17]:
unique_df = df.drop_duplicates()
unique_df.shape, df.shape


Out[17]:
((61878, 95), (61878, 95))
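
drop_duplicates has a companion, duplicated(), which flags the repeated rows instead of removing them; for example:

print df.duplicated().sum()   # number of exact duplicate rows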

This dataset has no NaN values ... but here is a cheat sheet for the future ..


In [14]:
df_with_interpolated_values = df.interpolate(method='linear', axis=0)
# Different interpolation methods are available, and they can be applied along either axis
df_with_fill_values = df.fillna(df.mean())
**Also see:
- fillna has several options such as bfill and ffill.
- dropna removes every incomplete row from the table.
- interpolate supports several methods: spline, pchip, polynomial, etc.**
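
A couple of those variants sketched out (purely illustrative here, since this dataset has nothing to fill):

df_ffill = df.fillna(method='ffill')   # propagate the last valid observation forward
df_no_na = df.dropna()                 # drop every row that contains at least one NaN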

In [15]:
df_with_fill_values.info(), df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 61878 entries, 0 to 61877
Data columns (total 95 columns):
id         61878 non-null int64
feat_1     61878 non-null int64
feat_2     61878 non-null int64
feat_3     61878 non-null int64
feat_4     61878 non-null int64
feat_5     61878 non-null int64
feat_6     61878 non-null int64
feat_7     61878 non-null int64
feat_8     61878 non-null int64
feat_9     61878 non-null int64
feat_10    61878 non-null int64
feat_11    61878 non-null int64
feat_12    61878 non-null int64
feat_13    61878 non-null int64
feat_14    61878 non-null int64
feat_15    61878 non-null int64
feat_16    61878 non-null int64
feat_17    61878 non-null int64
feat_18    61878 non-null int64
feat_19    61878 non-null int64
feat_20    61878 non-null int64
feat_21    61878 non-null int64
feat_22    61878 non-null int64
feat_23    61878 non-null int64
feat_24    61878 non-null int64
feat_25    61878 non-null int64
feat_26    61878 non-null int64
feat_27    61878 non-null int64
feat_28    61878 non-null int64
feat_29    61878 non-null int64
feat_30    61878 non-null int64
feat_31    61878 non-null int64
feat_32    61878 non-null int64
feat_33    61878 non-null int64
feat_34    61878 non-null int64
feat_35    61878 non-null int64
feat_36    61878 non-null int64
feat_37    61878 non-null int64
feat_38    61878 non-null int64
feat_39    61878 non-null int64
feat_40    61878 non-null int64
feat_41    61878 non-null int64
feat_42    61878 non-null int64
feat_43    61878 non-null int64
feat_44    61878 non-null int64
feat_45    61878 non-null int64
feat_46    61878 non-null int64
feat_47    61878 non-null int64
feat_48    61878 non-null int64
feat_49    61878 non-null int64
feat_50    61878 non-null int64
feat_51    61878 non-null int64
feat_52    61878 non-null int64
feat_53    61878 non-null int64
feat_54    61878 non-null int64
feat_55    61878 non-null int64
feat_56    61878 non-null int64
feat_57    61878 non-null int64
feat_58    61878 non-null int64
feat_59    61878 non-null int64
feat_60    61878 non-null int64
feat_61    61878 non-null int64
feat_62    61878 non-null int64
feat_63    61878 non-null int64
feat_64    61878 non-null int64
feat_65    61878 non-null int64
feat_66    61878 non-null int64
feat_67    61878 non-null int64
feat_68    61878 non-null int64
feat_69    61878 non-null int64
feat_70    61878 non-null int64
feat_71    61878 non-null int64
feat_72    61878 non-null int64
feat_73    61878 non-null int64
feat_74    61878 non-null int64
feat_75    61878 non-null int64
feat_76    61878 non-null int64
feat_77    61878 non-null int64
feat_78    61878 non-null int64
feat_79    61878 non-null int64
feat_80    61878 non-null int64
feat_81    61878 non-null int64
feat_82    61878 non-null int64
feat_83    61878 non-null int64
feat_84    61878 non-null int64
feat_85    61878 non-null int64
feat_86    61878 non-null int64
feat_87    61878 non-null int64
feat_88    61878 non-null int64
feat_89    61878 non-null int64
feat_90    61878 non-null int64
feat_91    61878 non-null int64
feat_92    61878 non-null int64
feat_93    61878 non-null int64
target     61878 non-null int32
dtypes: int32(1), int64(94)
memory usage: 45.1 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 61878 entries, 0 to 61877
Data columns (total 95 columns):
id         61878 non-null int64
feat_1     61878 non-null int64
feat_2     61878 non-null int64
feat_3     61878 non-null int64
feat_4     61878 non-null int64
feat_5     61878 non-null int64
feat_6     61878 non-null int64
feat_7     61878 non-null int64
feat_8     61878 non-null int64
feat_9     61878 non-null int64
feat_10    61878 non-null int64
feat_11    61878 non-null int64
feat_12    61878 non-null int64
feat_13    61878 non-null int64
feat_14    61878 non-null int64
feat_15    61878 non-null int64
feat_16    61878 non-null int64
feat_17    61878 non-null int64
feat_18    61878 non-null int64
feat_19    61878 non-null int64
feat_20    61878 non-null int64
feat_21    61878 non-null int64
feat_22    61878 non-null int64
feat_23    61878 non-null int64
feat_24    61878 non-null int64
feat_25    61878 non-null int64
feat_26    61878 non-null int64
feat_27    61878 non-null int64
feat_28    61878 non-null int64
feat_29    61878 non-null int64
feat_30    61878 non-null int64
feat_31    61878 non-null int64
feat_32    61878 non-null int64
feat_33    61878 non-null int64
feat_34    61878 non-null int64
feat_35    61878 non-null int64
feat_36    61878 non-null int64
feat_37    61878 non-null int64
feat_38    61878 non-null int64
feat_39    61878 non-null int64
feat_40    61878 non-null int64
feat_41    61878 non-null int64
feat_42    61878 non-null int64
feat_43    61878 non-null int64
feat_44    61878 non-null int64
feat_45    61878 non-null int64
feat_46    61878 non-null int64
feat_47    61878 non-null int64
feat_48    61878 non-null int64
feat_49    61878 non-null int64
feat_50    61878 non-null int64
feat_51    61878 non-null int64
feat_52    61878 non-null int64
feat_53    61878 non-null int64
feat_54    61878 non-null int64
feat_55    61878 non-null int64
feat_56    61878 non-null int64
feat_57    61878 non-null int64
feat_58    61878 non-null int64
feat_59    61878 non-null int64
feat_60    61878 non-null int64
feat_61    61878 non-null int64
feat_62    61878 non-null int64
feat_63    61878 non-null int64
feat_64    61878 non-null int64
feat_65    61878 non-null int64
feat_66    61878 non-null int64
feat_67    61878 non-null int64
feat_68    61878 non-null int64
feat_69    61878 non-null int64
feat_70    61878 non-null int64
feat_71    61878 non-null int64
feat_72    61878 non-null int64
feat_73    61878 non-null int64
feat_74    61878 non-null int64
feat_75    61878 non-null int64
feat_76    61878 non-null int64
feat_77    61878 non-null int64
feat_78    61878 non-null int64
feat_79    61878 non-null int64
feat_80    61878 non-null int64
feat_81    61878 non-null int64
feat_82    61878 non-null int64
feat_83    61878 non-null int64
feat_84    61878 non-null int64
feat_85    61878 non-null int64
feat_86    61878 non-null int64
feat_87    61878 non-null int64
feat_88    61878 non-null int64
feat_89    61878 non-null int64
feat_90    61878 non-null int64
feat_91    61878 non-null int64
feat_92    61878 non-null int64
feat_93    61878 non-null int64
target     61878 non-null int32
dtypes: int32(1), int64(94)
memory usage: 45.1 MB
Out[15]:
(None, None)
**If the data you work with comes from different sources, look at the functions:
- join, concat, merge, combine, etc., to unify data from different origins and with different layouts.**
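
A tiny sketch of the two most common cases, using df itself so it runs as-is:

# stacking two DataFrames with the same columns on top of each other
stacked = pd.concat([df, df], ignore_index=True)
print stacked.shape                  # twice as many rows, same columns
# adding columns from another table, matching rows on a shared key
extra = df[['id', 'target']].rename(columns={'target': 'target_copy'})
print df.merge(extra, on='id', how='left').shape   # same rows, one extra column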

Now we want to split off the data that we will use as input to build our classifier...


In [20]:
y = df['target'].values.astype(np.int32)
xs = df[df.columns[1:-1]].values.astype(np.float32)

And, just in case, let's save our file with the new encoding of the target variable ..


In [18]:
df.to_csv('/tmp/this_is_my_out.txt', sep='\t', header=None, index=False)

In [18]:
df.loc[:5, :].to_clipboard()
**Also see:
- to_excel
- to_json
- to_html
- to_pickle ... and their parameters**
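
For instance, a quick round trip through pickle (the path is arbitrary):

df.to_pickle('/tmp/train_df.pkl')             # binary dump that preserves dtypes
df_back = pd.read_pickle('/tmp/train_df.pkl')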

A stroll through scikit-learn


In [3]:
from IPython.display import IFrame
IFrame('http://scikit-learn.org/stable/', width=900, height=350)


Out[3]:

In [20]:
from IPython.display import Image
Image(url='http://1.bp.blogspot.com/-ME24ePzpzIM/UQLWTwurfXI/AAAAAAAAANw/W3EETIroA80/s1600/drop_shadows_background.png',
      width=1000, height=1000)


Out[20]:

Our data has many dimensions, so it is not that easy to visualize ... if we want to see how it behaves we can use PCA to reduce its dimensionality.


In [21]:
# Project the data to a 2D space for visualization
from sklearn.decomposition import RandomizedPCA # using randomized Singular Value Decomposition 
Xp = RandomizedPCA(n_components=2, random_state=1).fit_transform(xs)
Xp


Out[21]:
array([[ 0.68992602, -1.4586081 ],
       [-2.64613954, -1.89425161],
       [-1.8834618 , -3.11100498],
       ..., 
       [ 4.22165023, -7.45878803],
       [-0.3476059 , -2.22537112],
       [ 1.50033683, -0.69861494]])

In [22]:
%matplotlib inline
import matplotlib.pyplot as plt

# get the product class 
product_class = np.unique(y)

colors = plt.get_cmap("hsv")

plt.figure(figsize=(10, 4))
for i, p in enumerate(product_class):
    mask = (y == p)
    plt.scatter(Xp[mask, 0], Xp[mask, 1], 
                c=colors(1. * i / 11), label=p, alpha=0.2)
    
plt.legend(loc="best")


Out[22]:
<matplotlib.legend.Legend at 0x7f08de09c250>

Before using any prediction algorithm we will want to split our dataset into the values we will use to train the classifier and the ones we will use to evaluate how well it classifies ...


In [23]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn import preprocessing

#X = preprocessing.scale(xs, axis=1)
X_train, X_test, y_train, y_test = train_test_split(xs, y,random_state=1)

print """X_train shape : {}, y_train shape : {}
X_test shape : {}, y_test shape : {}""".format(X_train.shape, y_train.shape, X_test.shape, y_test.shape)


X_train shape : (46408, 93), y_train shape : (46408,)
X_test shape : (15470, 93), y_test shape : (15470,)

Let's implement a small function that lets us see graphically how well our classifier performs


In [24]:
def plot_matrix(clf, X_test, y_test):
    plt.clf()
    plt.imshow(confusion_matrix(clf.predict(X_test), y_test),
               interpolation='nearest', cmap=plt.cm.binary)
    plt.colorbar()
    plt.xlabel("true label")
    plt.ylabel("predicted label")
    plt.show()

Now we create our classifiers ...


In [17]:
from sklearn.svm import SVC

sv = SVC(kernel='rbf', cache_size=1000)
sv.fit(X_train, y_train)

print classification_report(sv.predict(X_test), y_test)
print sv.score(X_test, y_test)
plot_matrix(sv, X_test, y_test)


             precision    recall  f1-score   support

          0       0.44      0.74      0.56       315
          1       0.88      0.70      0.78      5136
          2       0.44      0.63      0.52      1411
          3       0.32      0.82      0.46       249
          4       0.96      0.98      0.97       710
          5       0.91      0.94      0.93      3327
          6       0.54      0.72      0.62       535
          7       0.94      0.74      0.83      2576
          8       0.82      0.85      0.83      1211

avg / total       0.82      0.78      0.79     15470

0.781124757595

In [25]:
from sklearn.ensemble import ExtraTreesClassifier

clf = ExtraTreesClassifier(n_estimators=200,
                           max_features=0.2, 
                           n_jobs=2,
                           max_depth=None,
                           min_samples_split=1,
                           random_state=1).fit(X_train, y_train)
print classification_report(clf.predict(X_test), y_test)
print clf.score(X_test, y_test)
plot_matrix(clf, X_test, y_test)


             precision    recall  f1-score   support

          0       0.42      0.84      0.56       262
          1       0.88      0.72      0.79      5037
          2       0.50      0.65      0.57      1568
          3       0.44      0.83      0.58       346
          4       0.96      0.98      0.97       712
          5       0.95      0.93      0.94      3515
          6       0.60      0.81      0.69       526
          7       0.94      0.87      0.91      2190
          8       0.88      0.84      0.86      1314

avg / total       0.84      0.81      0.82     15470

0.811700064641

Can we see which variables carry the most weight in the decision to place an element in a given class?


In [26]:
importances = clf.feature_importances_

text = map(lambda i: df.columns[1:-1][i], range(93))
plt.figure(figsize=(20, 6))
print importances[::-1].shape
plt.bar(range(93),height=importances,  width=1.)
plt.xticks(np.arange(0.5, 93, 1.), text, rotation=90)
plt.xlim((0, 93))
plt.show()
# Top 10
indices = np.argsort(importances)[::-1]
for i in range(10):
    print importances[indices[i]], df.columns[1:-1][indices[i]]


(93,)
0.0490019250225 feat_11
0.0447674033442 feat_34
0.0423471301928 feat_60
0.0338275409266 feat_14
0.0264123595345 feat_40
0.0259991016933 feat_25
0.0246679553862 feat_15
0.0226700475102 feat_42
0.0225282480303 feat_26
0.020135275408 feat_75

How do we search for the best parameters for our model?


In [27]:
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier

parameter_grid = {
    'n_estimators': [100, 200],
    'max_features': [0.2, 0.5],
    #'max_depth': [5., None]
}

grid_search = GridSearchCV(ExtraTreesClassifier(n_jobs=4), parameter_grid,
                            cv=5, verbose=3)
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] max_features=0.2, n_estimators=100 ..............................
[CV] ..... max_features=0.2, n_estimators=100, score=0.806785 -  18.4s
[CV] max_features=0.2, n_estimators=100 ..............................
[CV] ..... max_features=0.2, n_estimators=100, score=0.813227 -  18.5s
[CV] max_features=0.2, n_estimators=100 ..............................
[CV] ..... max_features=0.2, n_estimators=100, score=0.802715 -  18.1s
[CV] max_features=0.2, n_estimators=100 ..............................
[CV] ..... max_features=0.2, n_estimators=100, score=0.810560 -  17.7s
[CV] max_features=0.2, n_estimators=100 ..............................
[CV] ..... max_features=0.2, n_estimators=100, score=0.812567 -  18.2s
[CV] max_features=0.2, n_estimators=200 ..............................
[CV] ..... max_features=0.2, n_estimators=200, score=0.807539 -  35.8s
[CV] max_features=0.2, n_estimators=200 ..............................
[CV] ..... max_features=0.2, n_estimators=200, score=0.817428 -  36.3s
[CV] max_features=0.2, n_estimators=200 ..............................
[CV] ..... max_features=0.2, n_estimators=200, score=0.805732 -  36.3s
[CV] max_features=0.2, n_estimators=200 ..............................
[CV] ..... max_features=0.2, n_estimators=200, score=0.813147 -  36.0s
[CV] max_features=0.2, n_estimators=200 ..............................
[CV] ..... max_features=0.2, n_estimators=200, score=0.811813 -  36.0s
[CV] max_features=0.5, n_estimators=100 ..............................
[CV] ..... max_features=0.5, n_estimators=100, score=0.811524 -  39.1s
[CV] max_features=0.5, n_estimators=100 ..............................
[CV] ..... max_features=0.5, n_estimators=100, score=0.818397 -  39.8s
[CV] max_features=0.5, n_estimators=100 ..............................
[CV] ..... max_features=0.5, n_estimators=100, score=0.805301 -  39.5s
[CV] max_features=0.5, n_estimators=100 ..............................
[CV] ..... max_features=0.5, n_estimators=100, score=0.813470 -  39.6s
[CV] max_features=0.5, n_estimators=100 ..............................
[CV] ..... max_features=0.5, n_estimators=100, score=0.814615 -  39.6s
[CV] max_features=0.5, n_estimators=200 ..............................
[CV] ..... max_features=0.5, n_estimators=200, score=0.811632 - 1.3min
[CV] max_features=0.5, n_estimators=200 ..............................
[CV] ..... max_features=0.5, n_estimators=200, score=0.818074 - 1.4min
[CV] max_features=0.5, n_estimators=200 ..............................
[CV] ..... max_features=0.5, n_estimators=200, score=0.806379 - 1.3min
[CV] max_features=0.5, n_estimators=200 ..............................
[CV] ..... max_features=0.5, n_estimators=200, score=0.816487 - 1.4min
[CV] max_features=0.5, n_estimators=200 ..............................
[CV] ..... max_features=0.5, n_estimators=200, score=0.816986 - 1.3min
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:   18.4s
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 14.5min finished

Out[27]:
GridSearchCV(cv=5, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'n_estimators': [100, 200], 'max_features': [0.2, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=3)

In [77]:
grid_search.grid_scores_


Out[77]:
[mean: 0.80796, std: 0.00395, params: {'max_features': 0.2, 'n_estimators': 100},
 mean: 0.81074, std: 0.00412, params: {'max_features': 0.2, 'n_estimators': 200},
 mean: 0.81339, std: 0.00300, params: {'max_features': 0.5, 'n_estimators': 100},
 mean: 0.81413, std: 0.00419, params: {'max_features': 0.5, 'n_estimators': 200}]
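
Once the search has finished, the best score, the winning parameters and the refit estimator are available as attributes:

print grid_search.best_score_, grid_search.best_params_
best_clf = grid_search.best_estimator_   # already refit on all of X_train (refit=True)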

How do we know whether our classifier is actually any good? We should have some kind of baseline. Let's think about how the worst classifier in the world would fare on our data ...


In [27]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy='most_frequent',random_state=0).fit(X_train, y_train)
print clf.score(X_test, y_test)
plot_matrix(clf, X_test, y_test)


0.2650937298

**Also see: Pipelines in scikit-learn.**
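
A minimal sketch of such a pipeline, chaining the scaling we commented out earlier with the SVC (the step names 'scale' and 'svc' are arbitrary):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipe = Pipeline([('scale', StandardScaler()),
                 ('svc', SVC(kernel='rbf', cache_size=1000))])
pipe.fit(X_train, y_train)
print pipe.score(X_test, y_test)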

A quick look at lasagne ...


In [8]:
from sklearn import preprocessing

In [9]:
df = pd.read_csv('data/train.csv')
X = df.values

np.random.shuffle(X)

X, labels = X[:, 1:-1].astype(np.float32), X[:, -1]
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(labels).astype(np.int32)
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)

In [10]:
from lasagne import layers 
from lasagne.nonlinearities import softmax
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import NeuralNet
import theano

net1 = NeuralNet(
    layers= [
            ('input', layers.InputLayer),
            ('hidden1', layers.DenseLayer), 
            ('dropout0', layers.DropoutLayer),
            ('hidden2', layers.DenseLayer),
            ('output', layers.DenseLayer),
            ],
    # layer parameters:
    input_shape = (None, 93),
    hidden1_num_units = 500,
    dropout0_p = 0.3,
    hidden2_num_units = 250,
    output_nonlinearity= softmax,  
    output_num_units= encoder.classes_.shape[0],

    # optimization method:
    update=nesterov_momentum,
    update_learning_rate=theano.shared(np.float32(0.03)),
    update_momentum=theano.shared(np.float32(0.9)),

    #on_epoch_finished=[
    #    AdjustVariable('update_learning_rate', start=0.03, stop=0.0001),
    #    AdjustVariable('update_momentum', start=0.9, stop=0.999),
        #EarlyStopping(patience=300)
    #    ],
    regression=False,  
    max_epochs=100,
    verbose=1,
    )

net1.fit(X, y)


Using gpu device 0: GeForce GT 635M
/home/celia/Downloads/src/lasagne/lasagne/init.py:86: UserWarning: The uniform initializer no longer uses Glorot et al.'s approach to determine the bounds, but defaults to the range (-0.01, 0.01) instead. Please use the new GlorotUniform initializer to get the old behavior. GlorotUniform is now the default for all layers.
  warnings.warn("The uniform initializer no longer uses Glorot et al.'s "
  input             	(None, 93)          	produces      93 outputs
  hidden1           	(None, 500)         	produces     500 outputs
  dropout0          	(None, 500)         	produces     500 outputs
  hidden2           	(None, 250)         	produces     250 outputs
  output            	(None, 9)           	produces       9 outputs
  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  -----
      1       0.72416       0.58657      1.23457      0.78109  1.37s
      2       0.58723       0.55946      1.04964      0.78482  1.27s
      3       0.55576       0.54064      1.02797      0.79241  1.27s
      4       0.53638       0.53297      1.00639      0.79409  1.27s
      5       0.51871       0.52702      0.98422      0.79441  1.27s
      6       0.50350       0.52198      0.96459      0.79828  1.27s
      7       0.49222       0.51321      0.95910      0.80213  1.28s
      8       0.48407       0.50851      0.95195      0.80418  1.27s
      9       0.47342       0.50727      0.93326      0.80180  1.27s
     10       0.46529       0.51027      0.91184      0.80318  1.45s
     11       0.45798       0.50284      0.91078      0.80465  1.60s
     12       0.44821       0.50661      0.88471      0.80671  1.61s
     13       0.44031       0.50404      0.87356      0.80471  1.62s
     14       0.43518       0.50391      0.86361      0.80887  1.60s
     15       0.42760       0.50075      0.85391      0.80780  1.60s
     16       0.42405       0.50587      0.83826      0.80774  1.60s
     17       0.41875       0.50503      0.82917      0.80670  1.67s
     18       0.41142       0.50574      0.81350      0.80998  1.62s
     19       0.40730       0.50283      0.81000      0.80806  1.61s
     20       0.39819       0.50154      0.79392      0.80820  1.62s
     21       0.39277       0.50259      0.78150      0.80906  1.61s
     22       0.39038       0.49828      0.78345      0.81167  1.62s
     23       0.38516       0.50340      0.76513      0.81145  1.66s
     24       0.38043       0.50618      0.75157      0.81154  1.69s
     25       0.37515       0.51446      0.72922      0.80763  1.61s
     26       0.37170       0.50897      0.73031      0.80611  1.65s
     27       0.37230       0.50921      0.73113      0.80513  1.62s
     28       0.36380       0.51565      0.70551      0.80933  1.63s
     29       0.35971       0.51057      0.70453      0.80823  1.62s
     30       0.35359       0.51519      0.68633      0.80885  1.63s
     31       0.35116       0.51461      0.68238      0.81038  1.64s
     32       0.34524       0.51990      0.66405      0.80478  1.69s
     33       0.34138       0.51768      0.65944      0.81000  1.65s
     34       0.34024       0.52704      0.64558      0.81041  1.62s
     35       0.33518       0.52718      0.63580      0.81124  1.65s
     36       0.33490       0.52074      0.64313      0.80719  1.61s
     37       0.33246       0.52708      0.63077      0.80817  1.65s
     38       0.32704       0.53166      0.61513      0.80703  1.61s
     39       0.32303       0.53418      0.60472      0.80632  1.60s
     40       0.32297       0.53073      0.60853      0.80773  1.65s
     41       0.31560       0.53619      0.58860      0.80933  1.62s
     42       0.31336       0.54226      0.57788      0.80781  1.62s
     43       0.31462       0.54176      0.58073      0.81026  1.61s
     44       0.31093       0.54612      0.56935      0.80459  1.62s
     45       0.30729       0.54595      0.56285      0.81155  1.62s
     46       0.30461       0.54560      0.55830      0.80934  1.62s
     47       0.29643       0.56000      0.52935      0.80812  1.62s
     48       0.29935       0.54724      0.54702      0.81031  1.62s
     49       0.29544       0.55324      0.53402      0.80976  1.62s
     50       0.29027       0.56169      0.51678      0.80701  1.61s
     51       0.28878       0.56202      0.51381      0.80612  1.62s
     52       0.28596       0.57100      0.50079      0.81045  1.62s
     53       0.28800       0.56849      0.50659      0.81069  1.62s
     54       0.28165       0.58102      0.48476      0.80430  1.62s
     55       0.28904       0.57535      0.50237      0.80350  1.62s
     56       0.28109       0.56791      0.49495      0.80806  1.62s
     57       0.27560       0.57797      0.47684      0.80757  1.62s
     58       0.27667       0.57735      0.47921      0.80892  1.61s
     59       0.27468       0.58504      0.46951      0.80918  1.61s
     60       0.27415       0.58381      0.46959      0.80798  1.62s
     61       0.26945       0.57916      0.46523      0.81207  1.62s
     62       0.26667       0.58842      0.45320      0.81020  1.64s
     63       0.26286       0.59291      0.44334      0.80967  1.66s
     64       0.26570       0.59542      0.44624      0.80886  1.64s
     65       0.26102       0.61152      0.42683      0.80596  1.62s
     66       0.26331       0.59913      0.43949      0.80883  1.63s
     67       0.26110       0.61007      0.42799      0.80914  1.61s
     68       0.25593       0.60861      0.42052      0.80246  1.67s
     69       0.25494       0.62655      0.40690      0.80991  1.62s
     70       0.25240       0.60906      0.41442      0.80512  1.62s
     71       0.24819       0.61691      0.40231      0.81028  1.64s
     72       0.24716       0.62290      0.39679      0.80806  1.62s
     73       0.24432       0.64377      0.37951      0.80510  1.62s
     74       0.24092       0.62105      0.38793      0.81135  1.62s
     75       0.24499       0.62283      0.39335      0.81271  1.62s
     76       0.24568       0.63266      0.38833      0.80285  1.61s
     77       0.24323       0.65336      0.37228      0.80975  1.61s
     78       0.23773       0.65139      0.36496      0.80677  1.62s
     79       0.23940       0.64610      0.37052      0.80908  1.61s
     80       0.23646       0.65991      0.35832      0.80636  1.61s
     81       0.23870       0.65253      0.36580      0.80838  1.62s
     82       0.23721       0.65243      0.36358      0.80857  1.67s
     83       0.23944       0.65476      0.36570      0.80669  1.61s
     84       0.23271       0.65678      0.35431      0.80878  1.65s
     85       0.22861       0.65566      0.34867      0.80749  1.90s
     86       0.22473       0.66446      0.33822      0.80920  1.75s
     87       0.22383       0.66661      0.33578      0.80793  1.74s
     88       0.22553       0.67273      0.33524      0.80340  1.61s
     89       0.22319       0.68162      0.32744      0.80724  1.62s
     90       0.22194       0.68806      0.32256      0.80456  1.62s
     91       0.22270       0.67390      0.33046      0.80725  1.61s
     92       0.22090       0.68044      0.32464      0.81015  1.61s
     93       0.21299       0.68284      0.31192      0.80318  1.62s
     94       0.21626       0.68805      0.31431      0.80957  1.61s
     95       0.21637       0.69689      0.31048      0.80723  1.60s
     96       0.21908       0.68275      0.32088      0.80604  1.61s
     97       0.21675       0.69357      0.31250      0.80770  1.63s
     98       0.21397       0.68933      0.31041      0.80780  1.61s
     99       0.21456       0.68251      0.31436      0.80937  1.61s
    100       0.20991       0.69972      0.29999      0.80891  1.61s
Out[10]:
NeuralNet(X_tensor_type=<function matrix at 0x7fd7f4ebd9b0>,
     batch_iterator_test=<nolearn.lasagne.base.BatchIterator object at 0x7fd7e3e1fd50>,
     batch_iterator_train=<nolearn.lasagne.base.BatchIterator object at 0x7fd7e81c2b10>,
     custom_score=None, dropout0_p=0.3, eval_size=0.2,
     hidden1_num_units=500, hidden2_num_units=250, input_shape=(None, 93),
     layers=[('input', <class 'lasagne.layers.input.InputLayer'>), ('hidden1', <class 'lasagne.layers.dense.DenseLayer'>), ('dropout0', <class 'lasagne.layers.noise.DropoutLayer'>), ('hidden2', <class 'lasagne.layers.dense.DenseLayer'>), ('output', <class 'lasagne.layers.dense.DenseLayer'>)],
     loss=None, max_epochs=100, more_params={},
     objective=<class 'lasagne.objectives.Objective'>,
     objective_loss_function=<function categorical_crossentropy at 0x7fd7f4b22410>,
     on_epoch_finished=[<nolearn.lasagne.util.PrintLog instance at 0x7fd7e3e32440>],
     on_training_finished=[],
     output_nonlinearity=<function softmax at 0x7fd7e3e18b90>,
     output_num_units=9, regression=False,
     update=<function nesterov_momentum at 0x7fd7e3995320>,
     update_learning_rate=<TensorType(float32, scalar)>,
     update_momentum=<TensorType(float32, scalar)>,
     use_label_encoder=False, verbose=1,
     y_tensor_type=TensorType(int32, vector))

Last trip of the day .. matplotlib


In [11]:
from IPython.display import Image
Image(filename='images/convincing.png')


Out[11]:

In [29]:
from IPython.display import IFrame
IFrame('http://matplotlib.org/', width=900, height=350)


Out[29]:

In [30]:
%matplotlib inline
import matplotlib.pyplot as plt

In [31]:
print plt.style.available
plt.style.use(plt.style.available[1])


[u'dark_background', u'bmh', u'grayscale', u'ggplot', u'fivethirtyeight']

In [32]:
df = pd.read_csv('data/train.csv')

In [33]:
plt.plot(df['feat_34'].cumsum(), 'b-')  # the letter sets the color, the next character the line/marker style.
plt.ylabel('This is the label for y')  # placing labels on the axes.
plt.xlabel('This label is for x')
#plt.xscale('log')


Out[33]:
<matplotlib.text.Text at 0x7f08da946890>

In [34]:
plt.plot(df['feat_34'].diff(), 'y-')  # the letter sets the color, the next character the line/marker style.
plt.ylabel('This is the label for y')  # placing labels on the axes.
plt.xlabel('This label is for x')
#plt.axis([0, 65000, -60, 60])


Out[34]:
<matplotlib.text.Text at 0x7f08ddf40d90>

In [35]:
from mpl_toolkits.mplot3d import Axes3D
for i, group in df.groupby('target'):
    threedee = plt.figure().gca(projection='3d')
    threedee.set_title("scatter plot of class number {}".format(i))
    X = group['feat_11']
    Y = group['feat_14']
    Z = group['feat_34']
    threedee.scatter(X, Y, Z)
    threedee.set_xlabel('feat_11')
    threedee.set_ylabel('feat_14')
    threedee.set_zlabel('feat_34')
plt.show()



In [33]:
from mpl_toolkits.mplot3d import Axes3D
from scipy.interpolate import griddata

for i, group in df.groupby('target'):
    threedee = plt.figure().gca(projection='3d')
    threedee.set_title("scatter plot of class number {}".format(i))
    X = group['feat_11']
    Y = group['feat_14']
    Z = group['feat_34']
    xi = np.linspace(X.min(),X.max(),100)
    yi = np.linspace(Y.min(),Y.max(),100)
    zi = griddata((X, Y), Z, (xi[None,:], yi[:,None]), method='cubic')
    
    xig, yig = np.meshgrid(xi, yi)
    surf = threedee.plot_surface(xig, yig, zi, linewidth=0)

    threedee.set_xlabel('feat_11')
    threedee.set_ylabel('feat_14')
    threedee.set_zlabel('feat_34')
plt.show()



In [34]:
for i, group in df.groupby('target'):
    threedee = plt.figure().gca(projection='3d')
    threedee.set_title("scatter plot of class number {}".format(i))
    X = group['feat_11']
    Y = group['feat_14']
    Z = group['feat_34']
    xi = np.linspace(X.min(),X.max(),100)
    yi = np.linspace(Y.min(),Y.max(),100)
    zi = griddata((X, Y), Z, (xi[None,:], yi[:,None]), method='cubic')
    
    xig, yig = np.meshgrid(xi, yi)
    contour = plt.contour(xi, yi, zi, 15, linewidths=0.5, colors='k')

    threedee.set_xlabel('feat_11')
    threedee.set_ylabel('feat_14')
    threedee.set_zlabel('feat_34')
plt.show()


We can also plot our data straight from pandas, which uses matplotlib underneath ..


In [36]:
interest_feat = ['feat_11', 'feat_34', 'feat_14', 'feat_60', 'target']

In [22]:
from pandas.tools.plotting import andrews_curves

andrews_curves(df[interest_feat], 'target')


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x4f88210>

In [26]:
from pandas.tools.plotting import parallel_coordinates

parallel_coordinates(df[interest_feat], 'target', alpha=0.5)


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x10e99cc90>

In [36]:
df_plot = df[interest_feat[:-1]].cumsum()
df_plot.plot(kind='area', alpha=0.5)


Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd7f182ca90>

In [37]:
df[interest_feat[:-1]].diff().hist(alpha=0.5, bins=10)


Out[37]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fd7f2041750>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fd7f168fc10>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fd7f21c9950>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fd7f1cc3fd0>]], dtype=object)

We can see how many elements of each class our sample contains


In [38]:
df.groupby(['target']).target.count().plot(kind='bar')


Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd7efae9950>

In [39]:
df.groupby(['target']).target.count().plot(kind='pie')


Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd7f15d4ed0>

In [40]:
df.boxplot(column='feat_34', by='target', grid=False)
for i, cls in enumerate(np.unique(df.target)):
    y = df.feat_34[df.target == cls].dropna()
    # Add some random "jitter" to the x-axis (boxplot positions start at 1)
    x = np.random.normal(i + 1, 0.04, size=len(y))
    plt.plot(x, y, 'r.', alpha=0.02)


If we want to visualize several variables at the same time we can use the scatter_matrix function


In [37]:
pd.scatter_matrix(df.loc[:,interest_feat[:-1]], figsize=(12,8), diagonal='kde')


Out[37]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f08daaf5a90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08da4b5cd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08da3471d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08da2aa6d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f08da22ca10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08da2dfc50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08da113110>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08da088f10>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f08dacc8b10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08dade2090>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08daba8f10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08da975350>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f08da3f82d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08dab29550>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08daaed550>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08dac019d0>]], dtype=object)