In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use the ggplot style sheet and a larger default figure size for every plot below.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (13,8)
In [ ]:
# Load the red-wine quality data set from the working directory and preview the first rows.
# NOTE(review): the UCI copy of this file is semicolon-delimited; if the columns
# come out fused into one, pass sep=";" to read_csv — confirm against the local file.
df = pd.read_csv("./winequality-red.csv")
df.head()
In [ ]:
# (rows, columns) of the loaded data.
df.shape
In [ ]:
# Derive the binary target: category is 1 for wines rated above 5
# and 0 for wines rated 5 or below.
rated_high = df.quality > 5
rated_low = df.quality <= 5
df.loc[rated_high, 'category'] = 1
df.loc[rated_low, 'category'] = 0
The next cell shows the frequency count for each category (1 = quality above 5, 0 = quality of 5 or below).
In [ ]:
# Number of rows in each category, to check how balanced the two classes are.
df.category.value_counts()
In [ ]:
# Preview the data again, now including the derived category column.
df.head()
In [ ]:
# Pairwise correlation between the columns (the derived category column is included).
df.corr()
In [ ]:
# Pairwise scatter plots of all columns, with a density estimate on the diagonal.
# scatter_matrix lives in pandas.plotting; the old pandas.tools.plotting module
# was deprecated in pandas 0.20 and later removed, so fall back to it only on
# very old installations.
try:
    from pandas.plotting import scatter_matrix
except ImportError:  # pandas < 0.20
    from pandas.tools.plotting import scatter_matrix
scatter_matrix(df, figsize=(15,15), diagonal='kde')
In [ ]:
# Scatter of alcohol against the binary category label.
df.plot(x="alcohol", y="category", kind="scatter")
In [ ]:
In [ ]:
# Alcohol vs. volatile acidity, one colour per category, drawn on a shared axes.
acidity_scatter_kwargs = dict(x="alcohol", y="volatile acidity", kind="scatter", s=100, alpha=0.5)
ax = df[df.category == 1].plot(color="red", label="HIGH", **acidity_scatter_kwargs)
df[df.category == 0].plot(color="green", label="LOW", ax=ax, **acidity_scatter_kwargs)
In [ ]:
# Show floats with 3 decimal places in DataFrame output.
# Use the full option key: the bare "precision" pattern is ambiguous on current
# pandas (it also matches styler.format.precision) and raises OptionError.
pd.set_option("display.precision", 3)
Building a predictive model involves training the model with historical data, known as training data. Once the model is trained, it can predict labels (in this case, the category of wine) for given features (the test data). We have 1600 rows of wine data; let's split them in an 80:20 ratio into training and testing data.
Why do we need to do this?
We can compare the predicted label with the actual label. By doing this, we can measure how accurate our model is.
In [ ]:
# Check the row count before choosing the split point.
df.shape
In [ ]:
# Positional split without shuffling: the first 1280 rows train the model,
# the remaining rows are held out for testing.
df_train, df_test = df.iloc[:1280], df.iloc[1280:]
In [ ]:
# Training label (category) and the single predictor (volatile acidity).
y_train = df_train["category"]
X_train = df_train["volatile acidity"]
In [ ]:
# Held-out label (category) and the single predictor (volatile acidity).
y_test = df_test["category"]
X_test = df_test["volatile acidity"]
In [ ]:
# scikit-learn expects a 2-D feature matrix of shape (n_samples, n_features),
# so turn the single column into an (n, 1) array.
# Series.reshape no longer exists in pandas; going through np.asarray works for
# a Series and also when this cell is re-run on the already-converted array.
X_train = np.asarray(X_train).reshape(-1, 1)
X_test = np.asarray(X_test).reshape(-1, 1)
In [ ]:
from sklearn.linear_model import LogisticRegression
In [ ]:
# Logistic regression classifier with scikit-learn's default settings.
logistic_model = LogisticRegression()
In [ ]:
# Fit the classifier on the single-feature training data.
logistic_model.fit(X_train, y_train)
In [ ]:
# Seaborn's own logistic fit of category against alcohol, over the full data set.
# NOTE(review): this plots alcohol, whereas the model fitted above uses
# volatile acidity — confirm which feature was intended here.
sns.lmplot(data=df, x="alcohol", y="category", logistic=True)
It’s a bird… it’s a plane… it… depends on your classifier’s threshold -- Sancho McCann
In [ ]:
# Predicted class labels for the held-out rows.
predicted = logistic_model.predict(X_test)
In [ ]:
# Side-by-side table of true label, predicted label and the feature value,
# indexed like the held-out rows.
df_compare = pd.DataFrame(index=y_test.index)
df_compare["actual"] = y_test
df_compare["predicted"] = predicted
df_compare["volatile acidity"] = df_test["volatile acidity"]
In [ ]:
# Overlay actual (blue) and predicted (red) labels against volatile acidity.
compare_scatter_kwargs = dict(x="volatile acidity", kind="scatter")
ax = df_compare.plot(y="actual", color="blue", label="actual", **compare_scatter_kwargs)
df_compare.plot(y="predicted", color="red", label="predicted", ax=ax, **compare_scatter_kwargs)
Let's use more than one feature to predict the category. Candidates are volatile acidity, sulphates and alcohol; the model below uses two of them, sulphates and alcohol.
2 variable model
In [ ]:
# Same positional split as before: first 1280 rows for training, the rest for testing.
df_train, df_test = df.iloc[:1280], df.iloc[1280:]
In [ ]:
# Two-column feature frame (sulphates, alcohol) and the label for training.
train_feature_cols = ["sulphates", "alcohol"]
X_train = df_train[train_feature_cols]
y_train = df_train["category"]
In [ ]:
# Two-column feature frame (sulphates, alcohol) and the label for the held-out rows.
test_feature_cols = ["sulphates", "alcohol"]
X_test = df_test[test_feature_cols]
y_test = df_test["category"]
In [ ]:
# Fresh classifier, fitted on the two-feature training data.
# This rebinds logistic_model, replacing the single-feature model from earlier.
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
In [ ]:
# Predicted class labels for the held-out rows, from the two-feature model.
predicted = logistic_model.predict(X_test)
In [ ]:
# Side-by-side table of true label, predicted label and both feature values,
# indexed like the held-out rows.
df_compare = pd.DataFrame(index=y_test.index)
df_compare["actual"] = y_test
df_compare["predicted"] = predicted
for feature_name in ("sulphates", "alcohol"):
    df_compare[feature_name] = df_test[feature_name]
In [ ]:
# Preview the comparison table.
df_compare.head()
In [ ]:
# Held-out rows in the alcohol/sulphates plane, coloured by their ACTUAL category.
actual_scatter_kwargs = dict(x="alcohol", y="sulphates", kind="scatter", s=100, alpha=0.5)
ax = df_compare[df_compare.actual == 1].plot(color="red", label="HIGH", **actual_scatter_kwargs)
df_compare[df_compare.actual == 0].plot(color="green", label="LOW", ax=ax, **actual_scatter_kwargs)
In [ ]:
# Held-out rows in the alcohol/sulphates plane, coloured by their PREDICTED category.
predicted_scatter_kwargs = dict(x="alcohol", y="sulphates", kind="scatter", s=100, alpha=0.5)
ax = df_compare[df_compare.predicted == 1].plot(color="red", label="HIGH", **predicted_scatter_kwargs)
df_compare[df_compare.predicted == 0].plot(color="green", label="LOW", ax=ax, **predicted_scatter_kwargs)
In [ ]:
In [ ]:
In [ ]:
from sklearn import metrics
In [ ]:
# ROC curve of the two-feature model on the held-out rows.
# roc_curve sweeps a decision threshold over continuous scores; giving it the
# hard 0/1 output of predict() collapses the curve to a single operating point.
# Column 1 of predict_proba is the probability of the positive class (category 1).
positive_class_scores = logistic_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(df_compare.actual, positive_class_scores)
plt.plot(fpr, tpr)
# Diagonal reference line: the expected ROC of a random classifier.
plt.plot([0,1],[0,1])
In [ ]: