In this assignment, we will train a linear regression model using stochastic gradient descent on the Wine Quality dataset. The example assumes that a CSV copy of the dataset is in the current working directory under the file name winequality-white.csv.
The Wine Quality dataset involves predicting the quality of white wines on a scale, given chemical measurements of each wine. It is a multiclass classification problem, but it can also be framed as a regression problem. The number of observations per class is not balanced. There are 4,898 observations with 11 input variables and 1 output variable. The variable names, in file order, are: fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, and quality (the output).
The baseline performance of predicting the mean value is an RMSE of approximately 0.148 quality points.
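To see where that figure comes from (a quick sketch, not part of the original notebook): quality spans 3 to 9 in this file, so its raw standard deviation of roughly 0.89 quality points shrinks to about 0.89/6 ≈ 0.148 once the data is min-max normalized to [0, 1], which is the scale the tutorial works on:

import numpy as np
import pandas as pd

df = pd.read_csv('winequality-white.csv', delimiter=';')
q = df['quality'].values
print(np.sqrt(np.mean((q - q.mean()) ** 2)))       # RMSE of predicting the mean, raw 3-9 scale
q01 = (q - q.min()) / (q.max() - q.min())          # min-max normalize quality to [0, 1]
print(np.sqrt(np.mean((q01 - q01.mean()) ** 2)))   # ~0.148 on the normalized scale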
Use the example presented in the tutorial and modify it so that it loads the data and analyzes the accuracy of your solution.
In [2]:
import pandas as pd
import numpy as np
import sklearn as skt
In [10]:
dataset = pd.read_csv('winequality-white.csv', delimiter=";")
In [11]:
dataset.head()
Out[11]:
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.0              0.27         0.36            20.7      0.045
1            6.3              0.30         0.34             1.6      0.049
2            8.1              0.28         0.40             6.9      0.050
3            7.2              0.23         0.32             8.5      0.058
4            7.2              0.23         0.32             8.5      0.058

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 45.0                 170.0   1.0010  3.00       0.45
1                 14.0                 132.0   0.9940  3.30       0.49
2                 30.0                  97.0   0.9951  3.26       0.44
3                 47.0                 186.0   0.9956  3.19       0.40
4                 47.0                 186.0   0.9956  3.19       0.40

   alcohol  quality
0      8.8        6
1      9.5        6
2     10.1        6
3      9.9        6
4      9.9        6
In [1]:
def predict(row, coefficients):
    # intercept term
    yhat = coefficients[0]
    # weighted sum over the inputs; the last column of the row holds the target
    for i in range(len(row) - 1):
        yhat += coefficients[i + 1] * row[i]
    return yhat
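A quick sanity check with made-up numbers (not from the dataset) shows the convention: coefficients[0] is the intercept, and the last element of the row is skipped because it holds the target:

predict([2.0, 3.0, 5.0], [0.5, 1.0, 2.0])  # 0.5 + 1.0*2.0 + 2.0*3.0 = 8.5; the 5.0 target is ignored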
In [4]:
# Estimate linear regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch):
    # one coefficient per column: the intercept plus one weight per input variable
    coef = [0.0 for i in range(len(train[0]))]
    print('Initial coefficients=%s' % coef)
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            yhat = predict(row, coef)
            error = yhat - row[-1]
            sum_error += error ** 2
            # step the intercept, then each weight, against the gradient
            coef[0] = coef[0] - l_rate * error
            for i in range(len(row) - 1):
                coef[i + 1] = coef[i + 1] - l_rate * error * row[i]
        print('epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
    return coef
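One caveat before training: SGD with a fixed learning rate is sensitive to feature scale, and the columns here range from hundredths (chlorides) to hundreds (total sulfur dioxide), so running it on raw values tends to diverge. A min-max normalization pass in the spirit of the tutorial (a sketch; it rescales every column, including quality, to [0, 1]) keeps the updates stable:

# Sketch: rescale every column of a list-of-lists dataset to [0, 1]
def normalize_dataset(rows):
    minmax = [(min(col), max(col)) for col in zip(*rows)]
    for row in rows:
        for i, (lo, hi) in enumerate(minmax):
            row[i] = (row[i] - lo) / (hi - lo)
    return rows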
In [7]:
l_rate = 0.001
n_epoch = 10
In [12]:
coef = coefficients_sgd(dataset, l_rate, n_epoch)
print(coef)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-12-0dbea17cbc4e> in <module>()
----> 1 coef = coefficients_sgd(dataset, l_rate, n_epoch)
      2 print(coef)

<ipython-input-4-0e02069505eb> in coefficients_sgd(train, l_rate, n_epoch)
      2 def coefficients_sgd(train, l_rate, n_epoch):
----> 3     coef = [0.0 for i in range(len(train[0]))]

(pandas indexing frames elided)

KeyError: 0
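The KeyError arises because coefficients_sgd was handed the DataFrame itself: train[0] on a DataFrame is a lookup for a column named 0, which does not exist. The functions above expect a plain list of rows. One way to finish the exercise (a sketch combining the pieces above; the 80/20 split is an arbitrary choice) is to convert the DataFrame to a list of lists, normalize, train, and report the test RMSE the assignment asks for:

from math import sqrt

rows = normalize_dataset(dataset.values.tolist())  # plain list of rows, scaled to [0, 1]
split = int(0.8 * len(rows))                       # arbitrary 80/20 train/test split
train, test = rows[:split], rows[split:]

coef = coefficients_sgd(train, l_rate, n_epoch)
squared_errors = [(predict(row, coef) - row[-1]) ** 2 for row in test]
print('RMSE=%.3f' % sqrt(sum(squared_errors) / len(squared_errors)))

Since everything is on the normalized scale, the result can be compared directly against the ~0.148 baseline discussed above.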