In [ ]:
import pandas as pd
from pandas import DataFrame
url="https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
df = pd.read_csv(url,header=None)
df.describe()
In [ ]:
pd.options.display.max_columns=70
df.describe()
In [ ]:
import numpy as np
import pylab
import scipy.stats as stats
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline
stats.probplot(df[4], dist="norm", plot=pylab)
pylab.show()
In [ ]:
df[60].unique()
In [ ]:
df.corr()
In [ ]:
import matplotlib.pyplot as plot
plot.pcolor(df.corr())
plot.show()
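In [ ]:
# Added sketch (not in the original): the same correlation heatmap with a
# colorbar, so the correlation scale is readable.
plot.pcolor(df.corr())
plot.colorbar()
plot.show()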
In [ ]:
df.corr()[0].plot()
In [ ]:
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
import pandas as pd
from pandas import DataFrame
w_df = pd.read_csv(url,header=0,sep=';')
w_df.describe()
In [ ]:
w_df['volatile acidity']
In [ ]:
w_df.corr()
In [ ]:
import matplotlib.pyplot as plot
plot.pcolor(w_df.corr())
plot.show()
In [ ]:
w_df.corr()['fixed acidity'].plot()
In [ ]:
from pandas.plotting import scatter_matrix
p=scatter_matrix(w_df, alpha=0.2, figsize=(12, 12), diagonal='kde')
In [ ]:
import numpy as np
import pylab
import scipy.stats as stats
%matplotlib inline
stats.probplot(w_df['alcohol'], dist="norm", plot=pylab)
pylab.show()
In [ ]:
import numpy
import random
from sklearn import datasets, linear_model
from sklearn.metrics import roc_curve, auc
import pylab as pl
In [ ]:
import pandas as pd
from pandas import DataFrame
url="https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
df = pd.read_csv(url,header=None)
df.describe()
In [ ]:
df[60]=np.where(df[60]=='R',0,1)
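In [ ]:
# Added check: confirm the label encoding worked and look at the class balance
# (R -> 0, M -> 1).
df[60].value_counts()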
In [ ]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size = 0.3)
x_train = train.iloc[0:,0:60]
y_train = train[60]
x_test = test.iloc[0:,0:60]
y_test = test[60]
y_train
In [ ]:
model = linear_model.LinearRegression()
model.fit(x_train,y_train)
In [ ]:
training_predictions = model.predict(x_train)
print(np.mean((training_predictions - y_train) ** 2))
In [ ]:
print('Train R-Square:',model.score(x_train,y_train))
print('Test R-Square:',model.score(x_test,y_test))
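In [ ]:
# Added check: out-of-sample mean squared error, for comparison with the
# training MSE computed earlier.
print(np.mean((model.predict(x_test) - y_test) ** 2))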
In [ ]:
print(max(training_predictions),min(training_predictions),np.mean(training_predictions))
In [ ]:
def confusion_matrix(predicted, actual, threshold):
    if len(predicted) != len(actual): return -1
    tp = 0.0
    fp = 0.0
    tn = 0.0
    fn = 0.0
    for i in range(len(actual)):
        if actual[i] > 0.5:  # labels that are 1.0 (positive examples)
            if predicted[i] > threshold:
                tp += 1.0  # correctly predicted positive
            else:
                fn += 1.0  # incorrectly predicted negative
        else:  # labels that are 0.0 (negative examples)
            if predicted[i] < threshold:
                tn += 1.0  # correctly predicted negative
            else:
                fp += 1.0  # incorrectly predicted positive
    rtn = [tp, fn, fp, tn]
    return rtn
In [ ]:
testing_predictions = model.predict(x_test)
confusion_matrix(testing_predictions,np.array(y_test),0.5)
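In [ ]:
# Added cross-check: scikit-learn's confusion_matrix should give the same
# counts once the continuous predictions are thresholded into hard 0/1 labels
# (it returns [[tn, fp], [fn, tp]] rather than the [tp, fn, fp, tn] list above;
# counts can differ only if a prediction equals the threshold exactly).
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
sk_confusion_matrix(y_test, (testing_predictions > 0.5).astype(int))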
In [ ]:
cm = confusion_matrix(testing_predictions,np.array(y_test),0.5)
misclassification_rate = (cm[1] + cm[2])/len(y_test)
misclassification_rate
In [ ]:
[tp, fn, fp, tn] = confusion_matrix(testing_predictions,np.array(y_test),0.5)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f_score = 2 * (precision * recall)/(precision + recall)
print(precision,recall,f_score)
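In [ ]:
# Added cross-check: precision, recall and F1 from scikit-learn at the same
# 0.5 threshold, computed on hard 0/1 labels.
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_test, (testing_predictions > 0.5).astype(int), average='binary')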
In [ ]:
[tp, fn, fp, tn] = confusion_matrix(testing_predictions,np.array(y_test),0.9)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f_score = 2 * (precision * recall)/(precision + recall)
print(precision,recall,f_score)
In [ ]:
positives = list()
negatives = list()
actual = np.array(y_train)
for i in range(len(y_train)):
    if actual[i]:
        positives.append(training_predictions[i])
    else:
        negatives.append(training_predictions[i])
In [ ]:
df_p = pd.DataFrame(positives)
df_n = pd.DataFrame(negatives)
fig, ax = plt.subplots()
a_heights, a_bins = np.histogram(df_p)
b_heights, b_bins = np.histogram(df_n, bins=a_bins)
width = (a_bins[1] - a_bins[0])/3
ax.bar(a_bins[:-1], a_heights, width=width, facecolor='cornflowerblue')
ax.bar(b_bins[:-1]+width, b_heights, width=width, facecolor='seagreen')
In [ ]:
positives = list()
negatives = list()
actual = np.array(y_test)
for i in range(len(y_test)):
    if actual[i]:
        positives.append(testing_predictions[i])
    else:
        negatives.append(testing_predictions[i])
df_p = pd.DataFrame(positives)
df_n = pd.DataFrame(negatives)
fig, ax = plt.subplots()
a_heights, a_bins = np.histogram(df_p)
b_heights, b_bins = np.histogram(df_n, bins=a_bins)
width = (a_bins[1] - a_bins[0])/3
ax.bar(a_bins[:-1], a_heights, width=width, facecolor='cornflowerblue')
ax.bar(b_bins[:-1]+width, b_heights, width=width, facecolor='seagreen')
In [ ]:
from sklearn.metrics import roc_curve, auc
In [ ]:
(fpr, tpr, thresholds) = roc_curve(y_train,training_predictions)
area = auc(fpr,tpr)
pl.clf() #Clear the current figure
pl.plot(fpr,tpr,label="In-Sample ROC Curve with area = %1.2f"%area)
pl.plot([0, 1], [0, 1], 'k') #reference diagonal: a random (equal-probability) classifier
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('In-sample ROC: rocks versus mines')
pl.legend(loc="lower right")
pl.show()
In [ ]:
(fpr, tpr, thresholds) = roc_curve(y_test,testing_predictions)
area = auc(fpr,tpr)
pl.clf() #Clear the current figure
pl.plot(fpr,tpr,label="Out-of-Sample ROC Curve with area = %1.2f"%area)
pl.plot([0, 1], [0, 1], 'k')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Out-of-sample ROC: rocks versus mines')
pl.legend(loc="lower right")
pl.show()
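In [ ]:
# Added cross-check: roc_auc_score should agree with the trapezoidal
# auc(fpr, tpr) values computed from roc_curve above.
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_train, training_predictions), roc_auc_score(y_test, testing_predictions))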
In [ ]:
(fpr, tpr, thresholds)
In [ ]:
cm = confusion_matrix(testing_predictions,np.array(y_test),.1)
cost1 = 1000*cm[0] + 300 * cm[2] + 200 * cm[1] + 200 * cm[3]
cm = confusion_matrix(testing_predictions,np.array(y_test),.9)
cost2 = 1000*cm[0] + 300 * cm[2] + 200 * cm[1] + 200 * cm[3]
print(cost1,cost2)
In [ ]:
cm = confusion_matrix(testing_predictions,np.array(y_test),.1)
cost1 = 0*cm[0] + 0 * cm[2] + 5000 * cm[1] + 0 * cm[3]
cm = confusion_matrix(testing_predictions,np.array(y_test),.9)
cost2 = 0*cm[0] + 0 * cm[2] + 5000 * cm[1] + 0 * cm[3]
print(cost1,cost2)
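In [ ]:
# Added sketch: evaluate the first cost/value expression over a grid of
# thresholds instead of just 0.1 and 0.9. Whether the best threshold is the
# min or the max depends on whether the coefficients represent costs or payoffs.
for t in np.arange(0.1, 1.0, 0.1):
    tp, fn, fp, tn = confusion_matrix(testing_predictions, np.array(y_test), t)
    print(round(t, 1), 1000 * tp + 300 * fp + 200 * fn + 200 * tn)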
In [ ]: