In [57]:
#Libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

In [46]:
# Read the wine dataset from CSV (comma-separated, first line is the header).
# A raw string keeps the backslashes of the Windows path literal instead of
# relying on unrecognised escape sequences (\A, \D, \h, ...), which newer
# Python versions warn about. The resulting path is exactly the same.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs elsewhere.
dataset = pd.read_csv(r'C:\Users\Ana\Documents\home\Master-TIC\Curso_2016-17\wine.csv', delimiter=',', header=0)
dataset


Out[46]:
Wine Alcohol Malic.acid Ash Acl Mg Phenols Flavanoids Nonflavanoid.phenols Proanth Color.int Hue OD Proline
0 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.640000 1.04 3.92 1065
1 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.380000 1.05 3.40 1050
2 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.680000 1.03 3.17 1185
3 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.800000 0.86 3.45 1480
4 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.320000 1.04 2.93 735
5 1 14.20 1.76 2.45 15.2 112 3.27 3.39 0.34 1.97 6.750000 1.05 2.85 1450
6 1 14.39 1.87 2.45 14.6 96 2.50 2.52 0.30 1.98 5.250000 1.02 3.58 1290
7 1 14.06 2.15 2.61 17.6 121 2.60 2.51 0.31 1.25 5.050000 1.06 3.58 1295
8 1 14.83 1.64 2.17 14.0 97 2.80 2.98 0.29 1.98 5.200000 1.08 2.85 1045
9 1 13.86 1.35 2.27 16.0 98 2.98 3.15 0.22 1.85 7.220000 1.01 3.55 1045
10 1 14.10 2.16 2.30 18.0 105 2.95 3.32 0.22 2.38 5.750000 1.25 3.17 1510
11 1 14.12 1.48 2.32 16.8 95 2.20 2.43 0.26 1.57 5.000000 1.17 2.82 1280
12 1 13.75 1.73 2.41 16.0 89 2.60 2.76 0.29 1.81 5.600000 1.15 2.90 1320
13 1 14.75 1.73 2.39 11.4 91 3.10 3.69 0.43 2.81 5.400000 1.25 2.73 1150
14 1 14.38 1.87 2.38 12.0 102 3.30 3.64 0.29 2.96 7.500000 1.20 3.00 1547
15 1 13.63 1.81 2.70 17.2 112 2.85 2.91 0.30 1.46 7.300000 1.28 2.88 1310
16 1 14.30 1.92 2.72 20.0 120 2.80 3.14 0.33 1.97 6.200000 1.07 2.65 1280
17 1 13.83 1.57 2.62 20.0 115 2.95 3.40 0.40 1.72 6.600000 1.13 2.57 1130
18 1 14.19 1.59 2.48 16.5 108 3.30 3.93 0.32 1.86 8.700000 1.23 2.82 1680
19 1 13.64 3.10 2.56 15.2 116 2.70 3.03 0.17 1.66 5.100000 0.96 3.36 845
20 1 14.06 1.63 2.28 16.0 126 3.00 3.17 0.24 2.10 5.650000 1.09 3.71 780
21 1 12.93 3.80 2.65 18.6 102 2.41 2.41 0.25 1.98 4.500000 1.03 3.52 770
22 1 13.71 1.86 2.36 16.6 101 2.61 2.88 0.27 1.69 3.800000 1.11 4.00 1035
23 1 12.85 1.60 2.52 17.8 95 2.48 2.37 0.26 1.46 3.930000 1.09 3.63 1015
24 1 13.50 1.81 2.61 20.0 96 2.53 2.61 0.28 1.66 3.520000 1.12 3.82 845
25 1 13.05 2.05 3.22 25.0 124 2.63 2.68 0.47 1.92 3.580000 1.13 3.20 830
26 1 13.39 1.77 2.62 16.1 93 2.85 2.94 0.34 1.45 4.800000 0.92 3.22 1195
27 1 13.30 1.72 2.14 17.0 94 2.40 2.19 0.27 1.35 3.950000 1.02 2.77 1285
28 1 13.87 1.90 2.80 19.4 107 2.95 2.97 0.37 1.76 4.500000 1.25 3.40 915
29 1 14.02 1.68 2.21 16.0 96 2.65 2.33 0.26 1.98 4.700000 1.04 3.59 1035
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
148 3 13.32 3.24 2.38 21.5 92 1.93 0.76 0.45 1.25 8.420000 0.55 1.62 650
149 3 13.08 3.90 2.36 21.5 113 1.41 1.39 0.34 1.14 9.400000 0.57 1.33 550
150 3 13.50 3.12 2.62 24.0 123 1.40 1.57 0.22 1.25 8.600000 0.59 1.30 500
151 3 12.79 2.67 2.48 22.0 112 1.48 1.36 0.24 1.26 10.800000 0.48 1.47 480
152 3 13.11 1.90 2.75 25.5 116 2.20 1.28 0.26 1.56 7.100000 0.61 1.33 425
153 3 13.23 3.30 2.28 18.5 98 1.80 0.83 0.61 1.87 10.520000 0.56 1.51 675
154 3 12.58 1.29 2.10 20.0 103 1.48 0.58 0.53 1.40 7.600000 0.58 1.55 640
155 3 13.17 5.19 2.32 22.0 93 1.74 0.63 0.61 1.55 7.900000 0.60 1.48 725
156 3 13.84 4.12 2.38 19.5 89 1.80 0.83 0.48 1.56 9.010000 0.57 1.64 480
157 3 12.45 3.03 2.64 27.0 97 1.90 0.58 0.63 1.14 7.500000 0.67 1.73 880
158 3 14.34 1.68 2.70 25.0 98 2.80 1.31 0.53 2.70 13.000000 0.57 1.96 660
159 3 13.48 1.67 2.64 22.5 89 2.60 1.10 0.52 2.29 11.750000 0.57 1.78 620
160 3 12.36 3.83 2.38 21.0 88 2.30 0.92 0.50 1.04 7.650000 0.56 1.58 520
161 3 13.69 3.26 2.54 20.0 107 1.83 0.56 0.50 0.80 5.880000 0.96 1.82 680
162 3 12.85 3.27 2.58 22.0 106 1.65 0.60 0.60 0.96 5.580000 0.87 2.11 570
163 3 12.96 3.45 2.35 18.5 106 1.39 0.70 0.40 0.94 5.280000 0.68 1.75 675
164 3 13.78 2.76 2.30 22.0 90 1.35 0.68 0.41 1.03 9.580000 0.70 1.68 615
165 3 13.73 4.36 2.26 22.5 88 1.28 0.47 0.52 1.15 6.620000 0.78 1.75 520
166 3 13.45 3.70 2.60 23.0 111 1.70 0.92 0.43 1.46 10.680000 0.85 1.56 695
167 3 12.82 3.37 2.30 19.5 88 1.48 0.66 0.40 0.97 10.260000 0.72 1.75 685
168 3 13.58 2.58 2.69 24.5 105 1.55 0.84 0.39 1.54 8.660000 0.74 1.80 750
169 3 13.40 4.60 2.86 25.0 112 1.98 0.96 0.27 1.11 8.500000 0.67 1.92 630
170 3 12.20 3.03 2.32 19.0 96 1.25 0.49 0.40 0.73 5.500000 0.66 1.83 510
171 3 12.77 2.39 2.28 19.5 86 1.39 0.51 0.48 0.64 9.899999 0.57 1.63 470
172 3 14.16 2.51 2.48 20.0 91 1.68 0.70 0.44 1.24 9.700000 0.62 1.71 660
173 3 13.71 5.65 2.45 20.5 95 1.68 0.61 0.52 1.06 7.700000 0.64 1.74 740
174 3 13.40 3.91 2.48 23.0 102 1.80 0.75 0.43 1.41 7.300000 0.70 1.56 750
175 3 13.27 4.28 2.26 20.0 120 1.59 0.69 0.43 1.35 10.200000 0.59 1.56 835
176 3 13.17 2.59 2.37 20.0 120 1.65 0.68 0.53 1.46 9.300000 0.60 1.62 840
177 3 14.13 4.10 2.74 24.5 96 2.05 0.76 0.56 1.35 9.200000 0.61 1.60 560

178 rows × 14 columns


In [47]:
# Header: the dataset's column labels as a plain Python list.
# (Iterating over a DataFrame yields its column labels, not its rows, so the
# labels are taken from the column index directly.)
header = dataset.columns.tolist()
header


Out[47]:
['Wine',
 'Alcohol',
 'Malic.acid',
 'Ash',
 'Acl',
 'Mg',
 'Phenols',
 'Flavanoids',
 'Nonflavanoid.phenols',
 'Proanth',
 'Color.int',
 'Hue',
 'OD',
 'Proline']

In [48]:
print(dataset.describe())  # Summary statistics of the data (count, mean, std, min, quartiles, max per column)


             Wine     Alcohol  Malic.acid         Ash         Acl          Mg  \
count  178.000000  178.000000  178.000000  178.000000  178.000000  178.000000   
mean     1.938202   13.000618    2.336348    2.366517   19.494944   99.741573   
std      0.775035    0.811827    1.117146    0.274344    3.339564   14.282484   
min      1.000000   11.030000    0.740000    1.360000   10.600000   70.000000   
25%      1.000000   12.362500    1.602500    2.210000   17.200000   88.000000   
50%      2.000000   13.050000    1.865000    2.360000   19.500000   98.000000   
75%      3.000000   13.677500    3.082500    2.557500   21.500000  107.000000   
max      3.000000   14.830000    5.800000    3.230000   30.000000  162.000000   

          Phenols  Flavanoids  Nonflavanoid.phenols     Proanth   Color.int  \
count  178.000000  178.000000            178.000000  178.000000  178.000000   
mean     2.295112    2.029270              0.361854    1.590899    5.058090   
std      0.625851    0.998859              0.124453    0.572359    2.318286   
min      0.980000    0.340000              0.130000    0.410000    1.280000   
25%      1.742500    1.205000              0.270000    1.250000    3.220000   
50%      2.355000    2.135000              0.340000    1.555000    4.690000   
75%      2.800000    2.875000              0.437500    1.950000    6.200000   
max      3.880000    5.080000              0.660000    3.580000   13.000000   

              Hue          OD      Proline  
count  178.000000  178.000000   178.000000  
mean     0.957449    2.611685   746.893258  
std      0.228572    0.709990   314.907474  
min      0.480000    1.270000   278.000000  
25%      0.782500    1.937500   500.500000  
50%      0.965000    2.780000   673.500000  
75%      1.120000    3.170000   985.000000  
max      1.710000    4.000000  1680.000000  

In [49]:
# Pairwise correlation matrix between the dataset's columns; kept in
# `correlation` so it can be plotted later, and displayed as the cell output.
correlation=dataset.corr()
correlation


Out[49]:
Wine Alcohol Malic.acid Ash Acl Mg Phenols Flavanoids Nonflavanoid.phenols Proanth Color.int Hue OD Proline
Wine 1.000000 -0.328222 0.437776 -0.049643 0.517859 -0.209179 -0.719163 -0.847498 0.489109 -0.499130 0.265668 -0.617369 -0.788230 -0.633717
Alcohol -0.328222 1.000000 0.094397 0.211545 -0.310235 0.270798 0.289101 0.236815 -0.155929 0.136698 0.546364 -0.071747 0.072343 0.643720
Malic.acid 0.437776 0.094397 1.000000 0.164045 0.288500 -0.054575 -0.335167 -0.411007 0.292977 -0.220746 0.248985 -0.561296 -0.368710 -0.192011
Ash -0.049643 0.211545 0.164045 1.000000 0.443367 0.286587 0.128980 0.115077 0.186230 0.009652 0.258887 -0.074667 0.003911 0.223626
Acl 0.517859 -0.310235 0.288500 0.443367 1.000000 -0.083333 -0.321113 -0.351370 0.361922 -0.197327 0.018732 -0.273955 -0.276769 -0.440597
Mg -0.209179 0.270798 -0.054575 0.286587 -0.083333 1.000000 0.214401 0.195784 -0.256294 0.236441 0.199950 0.055398 0.066004 0.393351
Phenols -0.719163 0.289101 -0.335167 0.128980 -0.321113 0.214401 1.000000 0.864564 -0.449935 0.612413 -0.055136 0.433681 0.699949 0.498115
Flavanoids -0.847498 0.236815 -0.411007 0.115077 -0.351370 0.195784 0.864564 1.000000 -0.537900 0.652692 -0.172379 0.543479 0.787194 0.494193
Nonflavanoid.phenols 0.489109 -0.155929 0.292977 0.186230 0.361922 -0.256294 -0.449935 -0.537900 1.000000 -0.365845 0.139057 -0.262640 -0.503270 -0.311385
Proanth -0.499130 0.136698 -0.220746 0.009652 -0.197327 0.236441 0.612413 0.652692 -0.365845 1.000000 -0.025250 0.295544 0.519067 0.330417
Color.int 0.265668 0.546364 0.248985 0.258887 0.018732 0.199950 -0.055136 -0.172379 0.139057 -0.025250 1.000000 -0.521813 -0.428815 0.316100
Hue -0.617369 -0.071747 -0.561296 -0.074667 -0.273955 0.055398 0.433681 0.543479 -0.262640 0.295544 -0.521813 1.000000 0.565468 0.236183
OD -0.788230 0.072343 -0.368710 0.003911 -0.276769 0.066004 0.699949 0.787194 -0.503270 0.519067 -0.428815 0.565468 1.000000 0.312761
Proline -0.633717 0.643720 -0.192011 0.223626 -0.440597 0.393351 0.498115 0.494193 -0.311385 0.330417 0.316100 0.236183 0.312761 1.000000

In [50]:
# Display the correlation matrix as a heat map with a blue-scale colormap.
# The figure is created explicitly and its own number is handed to matshow,
# so the plot always lands on this figure instead of on whichever figure
# happens to be number 1 (which would leave an empty figure behind).
fig = plt.figure()
plt.matshow(correlation, fignum=fig.number, cmap=plt.cm.Blues)
plt.ylabel("Attribute Index")
plt.show()



In [51]:
#### Scatter Matrix Plot

# scatter_matrix builds its own figure, so no plt.figure() call is made
# beforehand (that call only produced an extra, empty figure).
# The function is reached through the already-imported pandas namespace:
# pandas.plotting is its current home, pandas.tools.plotting has been
# removed from pandas.
pd.plotting.scatter_matrix(dataset, alpha=0.3, figsize=(20, 20), diagonal='kde')
plt.show()


<matplotlib.figure.Figure at 0x1fda40c1668>

In [52]:
#### Histogram Matrix Plot

# DataFrame.hist creates its own figure with one histogram per column, so no
# plt.figure() call is made beforehand (that call only produced an extra,
# empty figure).
dataset.hist(xlabelsize=0.5, ylabelsize=0.2, figsize=(10, 10))
# plt.xlabel acts on the current axes only, not on every subplot.
plt.xlabel("Data")
plt.show()


<matplotlib.figure.Figure at 0x1fda288e1d0>

In [ ]:


In [53]:
### Histogram of Alcohol variable

# One 30-bin histogram of Alcohol per wine class, all drawn on the same axes.
plt.figure(figsize=(8, 4))
for wine_class, colour in ((1, 'r'), (2, 'g'), (3, 'b')):
    alcohol_values = dataset.loc[dataset.Wine == wine_class, 'Alcohol']
    plt.hist(alcohol_values, 30, facecolor=colour)
plt.title('Histogram of Alcohol')
# Legend entries follow the drawing order of the three histograms.
plt.legend(['Wine A', 'Wine B', 'Wine C'])
plt.xlabel("Alcohol")
plt.grid(True)
plt.show()



In [54]:
### Scatter Plot Alcohol vs Ash

# One scatter series per wine class, coloured by class, on a single figure.
plt.figure()
for wine_class, colour in ((1, 'red'), (2, 'blue'), (3, 'green')):
    members = dataset[dataset.Wine == wine_class]
    plt.scatter(members.Alcohol, members.Ash, color=colour)
plt.title('Scatter Plot: Alcohol vs Ash')
plt.xlabel('Alcohol')
plt.ylabel('Ash')
# Legend entries follow the drawing order of the three series.
plt.legend(['A', 'B', 'C'])
plt.show()


División de la muestra en Train y Test de forma aleatoria


In [55]:
def dividir_ent_test(dataframe, porcentaje=0.7, semilla=None):
    """
    Randomly split a DataFrame into a training part and a test part.

    Every row is assigned independently: a uniform random number in [0, 1)
    is drawn per row, and the row goes to training when that number is below
    `porcentaje`. The training share is therefore `porcentaje` only on
    average; the actual sizes vary from one draw to the next.

    Arguments:
    - dataframe: DataFrame the rows are taken from
    - porcentaje: expected fraction of rows placed in the training part
    - semilla: optional seed for the random draw. When given, the same seed
      always yields the same split. When None (the default), NumPy's global
      random generator is used, as before.

    Returns:
    - train: DataFrame with the training rows
    - test: DataFrame with the remaining (test) rows
    """
    n_filas = len(dataframe)
    if semilla is None:
        # Keep drawing from NumPy's global generator so existing callers
        # (and any np.random.seed(...) they rely on) behave exactly as before.
        sorteo = np.random.rand(n_filas)
    else:
        # A private generator gives a reproducible split without touching
        # the global random state.
        sorteo = np.random.RandomState(semilla).rand(n_filas)
    mascara = sorteo < porcentaje
    train = dataframe[mascara]
    test = dataframe[~mascara]
    return train, test

# Fix the seed of NumPy's global random generator so the random split, and
# every result derived from it, is the same on each run of the notebook.
np.random.seed(42)
wine_train, wine_test = dividir_ent_test(dataset)
# Sanity check: the two parts together account for every row of the dataset.
assert len(wine_train) + len(wine_test) == len(dataset)
print ("wine_train", wine_train.shape,"\n")
print ("wine_test",wine_test.shape,"\n")


wine_train (121, 14) 

wine_test (57, 14) 


In [56]:
### Scatter Plot Alcohol vs Ash

# Side-by-side panels (train on the left, test on the right), each showing
# Alcohol vs Ash coloured by wine class, with identical axis limits so the
# two panels can be compared directly.
plt.figure(figsize=(8, 6))
panels = (
    (121, wine_train, 'Alcohol vs Ash; Train dataset\n'),
    (122, wine_test, 'Alcohol vs Ash; Test dataset \n'),
)
class_colours = ((1, 'red'), (2, 'blue'), (3, 'green'))
for position, subset, heading in panels:
    plt.subplot(position)
    for wine_class, colour in class_colours:
        members = subset[subset.Wine == wine_class]
        plt.scatter(members.Alcohol, members.Ash, color=colour)
    plt.title(heading, fontsize=10)
    plt.xlabel('Alcohol', fontsize=10)
    plt.ylabel('Ash')
    # Legend entries follow the drawing order of the three series.
    plt.legend(['A', 'B', 'C'])
    plt.xlim(10, 16)
    plt.ylim(1, 3.5)
plt.show()



In [ ]: