In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
In [2]:
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [285]:
from sklearn import preprocessing, svm
from sklearn.linear_model import LinearRegression, LogisticRegression
In [3]:
PATH = Path('data/datasets/paresh2047/uci-semcom')
In [4]:
df = pd.read_csv(PATH/'uci-secom.csv')
In [5]:
df.head() # -1 pass; +1 fail
Out[5]:
In [6]:
df
Out[6]:
In [21]:
df.values.shape
Out[21]:
In [22]:
col = df.columns[-1]
col
Out[22]:
In [23]:
passes = df.loc[df[col]==-1]
fails = df.loc[df[col]== 1]
In [24]:
plt.style.use('seaborn')
In [25]:
def plot_row(df, rows=0, show_nans=False, figsize=None, alpha=1.):
    if figsize is not None:
        plt.figure(figsize=figsize)
    if isinstance(rows, int):
        rows = [rows]
    for row in rows:
        vals = df.values[row][1:]  # skip the 'Time' column
        if show_nans:
            # mark missing sensor readings with short vertical lines
            nans = np.where(pd.isnull(vals))[0]
            ymax, ymin = np.nanmax(vals)/5, -np.nanmax(vals)/5
            plt.vlines(nans, ymin=ymin, ymax=ymax, linewidth=.5, color='firebrick')
        plt.plot(range(len(vals)), vals, alpha=alpha);
plot_row(df, figsize=(12,8), show_nans=True)
In [173]:
plot_row(df, np.random.randint(len(df), size=50), figsize=(12,8), alpha=0.1)
In [26]:
plot_row(fails, rows=range(len(fails)), figsize=(12,8), alpha=0.1)
In [30]:
plot_row(passes, rows=np.random.randint(len(passes), size=100), figsize=(12,8), alpha=0.1)
Eyeballing it isn't going to work.
Train / validation split: 80 / 20
In [44]:
def train_val_idxs(data, p=0.2):
    idxs = np.random.permutation(len(data))
    n_val = int(len(data)*p)
    return idxs[n_val:], idxs[:n_val]
In [160]:
train_idxs, val_idxs = train_val_idxs(df)
In [197]:
train.columns
Out[197]:
In [537]:
train = df.iloc[train_idxs]
valid = df.iloc[val_idxs]
# remove the first 'timestamp' column
train = train.drop(columns=['Time'])
valid = valid.drop(columns=['Time'])
In [538]:
len(train), len(valid)
Out[538]:
Since there are only 104 failing examples to 1,463 passing ones, I want to ensure there's a similar pass/fail ratio in the split datasets.
In [539]:
pos, neg = len(passes), len(fails)
pos, neg, neg/pos
Out[539]:
In [540]:
pos, neg = len(valid.loc[valid[col]==-1]), len(valid.loc[valid[col]== 1])
pos, neg, neg/pos
Out[540]:
In [541]:
pos, neg = len(train.loc[train[col]==-1]), len(train.loc[train[col]== 1])
pos, neg, neg/pos
Out[541]:
I could resample the failing examples to artificially balance the dataset, though I won't attempt to generate synthetic examples here.
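For reference, a minimal oversampling sketch (not used below); it just repeats failure rows with replacement until the classes are even. The names fails_tr, passes_tr, and train_balanced are hypothetical:

# hedged sketch: naive random oversampling of the minority (+1, fail) class
fails_tr  = train.loc[train[col] ==  1]
passes_tr = train.loc[train[col] == -1]
extra = fails_tr.iloc[np.random.choice(len(fails_tr), size=len(passes_tr) - len(fails_tr), replace=True)]
train_balanced = pd.concat([train, extra])  # hypothetical; the notebook keeps `train` as-is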
In [542]:
# replacing NaNs with the mean of each row
for rdx in range(len(train)):
    train.iloc[rdx] = train.iloc[rdx].fillna(train.iloc[rdx].mean())
for rdx in range(len(valid)):
    valid.iloc[rdx] = valid.iloc[rdx].fillna(valid.iloc[rdx].mean())
Separate data into inputs and labels:
In [543]:
x_train = train.drop(columns=[col]).values
y_train = train[col].values
x_valid = valid.drop(columns=[col]).values
y_valid = valid[col].values
Preprocessing: Center to Zero Mean and Scale to Unit Variance
In [544]:
x_train = preprocessing.scale(x_train)
x_valid = preprocessing.scale(x_valid)
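One caveat: preprocessing.scale standardizes each array with its own statistics, so the validation set is scaled using information the model wouldn't have at inference time. A sketch of the more conventional approach, fitting a StandardScaler on the training data only:

# alternative to the two scale() calls above: fit on train, apply to both
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_valid = scaler.transform(x_valid)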
Classifier:
In [277]:
clsfr = LinearRegression()
clsfr.fit(x_train, y_train)
# clsfr.fit(x_valid, y_valid)
Out[277]:
In [279]:
clsfr.score(x_train, y_train)
Out[279]:
In [278]:
clsfr.score(x_valid, y_valid)
Out[278]:
An R² score (the default scoring metric for LinearRegression) is 1 for a perfect fit and 0 for a model that always predicts the mean; anything below zero is worse than simply predicting the dataset's mean.
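Concretely, R² is one minus the ratio of residual to total sum of squares; a quick sketch of the computation behind the score above:

# R^2 = 1 - SS_res / SS_tot; should match clsfr.score(x_valid, y_valid)
preds  = clsfr.predict(x_valid)
ss_res = np.sum((y_valid - preds)**2)
ss_tot = np.sum((y_valid - y_valid.mean())**2)
1 - ss_res/ss_tot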
I wonder if I was just misusing this model; linear regression isn't really a classifier, and fitting a simple line to this dataset that generalizes was always going to be difficult.
In [280]:
clsfr = LogisticRegression()
clsfr.fit(x_train, y_train)
Out[280]:
In [281]:
clsfr.score(x_train, y_train)
Out[281]:
In [282]:
clsfr.score(x_valid, y_valid)
Out[282]:
This gives more expected results, though with roughly 93% of rows labeled 'pass', a classifier that always predicts the majority class would already score about 0.93, so accuracy alone says little here.
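A confusion matrix makes that concrete; a minimal sketch with sklearn.metrics, assuming the LogisticRegression fitted above:

from sklearn.metrics import confusion_matrix
# rows are true labels (-1 pass, +1 fail), columns are predictions
confusion_matrix(y_valid, clsfr.predict(x_valid), labels=[-1, 1])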
In [295]:
clsfr = svm.LinearSVC()
clsfr.fit(x_train, y_train)
Out[295]:
In [296]:
clsfr.score(x_train, y_train)
Out[296]:
In [297]:
clsfr.score(x_valid, y_valid)
Out[297]:
In [545]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from fastai.learner import *
In [546]:
from fastai.dataloader import DataLoader
In [547]:
import torchvision
In [548]:
class SimpleNet(nn.Module):
    def __init__(self, in_size):
        super().__init__()
        self.fc0 = nn.Linear(in_size, 80)
        self.fc1 = nn.Linear(80, 2)
    def forward(self, x):
        x = F.relu(self.fc0(x))
        x = F.log_softmax(self.fc1(x), dim=-1)
        return x
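A quick shape sanity check (assuming 590 input features, the number of sensor columns left after dropping Time and the label):

# two dummy rows in, two log-probabilities per row out
net = SimpleNet(590)
net(torch.randn(2, 590)).shape  # torch.Size([2, 2])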
In [549]:
from torch.utils.data import Dataset

class SignalDataset(Dataset):
    def __init__(self, x, y, transform=None):
        self.x = np.copy(x)
        self.y = np.copy(y)
        self.transform = transform
    def __len__(self):
        return len(self.x)
    def __getitem__(self, i):
        x = self.x[i]
        y = self.y[i]
        if self.transform is not None:
            x = self.transform(x)
        return (x, y)
One-Hot Encode -1/+1 pass/fail
In [550]:
y_train.shape
Out[550]:
In [551]:
def one_hot_y(y_data):
    # map -1 (pass) to column 0 and +1 (fail) to column 1
    y = np.zeros((y_data.shape[0], 2))
    for i, yi in enumerate(y_data):
        y[i][int((yi + 1)/2)] = 1
    return y
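A quick check of the mapping: -1 (pass) lands in column 0 and +1 (fail) in column 1:

one_hot_y(np.array([-1, 1]))  # -> array([[1., 0.], [0., 1.]])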
In [560]:
train_dataset = SignalDataset(x_train, one_hot_y(y_train))
valid_dataset = SignalDataset(x_valid, one_hot_y(y_valid))
In [561]:
train_dataset.x
Out[561]:
In [562]:
# shift each set to be non-negative, then rescale it to [0, 1]
minval = abs(np.min(train_dataset.x))
train_dataset.x += minval
train_dataset.x /= np.max(train_dataset.x)
minval = abs(np.min(valid_dataset.x))
valid_dataset.x += minval
valid_dataset.x /= np.max(valid_dataset.x)
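As with the standardization earlier, each set here is rescaled with its own min and max; a sketch that instead reuses the training set's statistics for both:

# alternative sketch: shift and scale both sets by the training set's stats
shift = abs(np.min(train_dataset.x))
train_dataset.x += shift
valid_dataset.x += shift
scale = np.max(train_dataset.x)
train_dataset.x /= scale
valid_dataset.x /= scale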
In [563]:
train_dataloader = DataLoader(train_dataset)
valid_dataloader = DataLoader(valid_dataset)
In [564]:
mdata = ModelData(PATH, train_dataloader, valid_dataloader)
In [565]:
network = SimpleNet(len(train_dataset.x[0]))
network
Out[565]:
In [566]:
learner = Learner.from_model_data(network, mdata)
In [567]:
learner.lr_find()
learner.sched.plot()
In [568]:
learner.fit(1e-4, n_cycle=5, wds=1e-6)
Out[568]:
In [569]:
log_preds = learner.predict()
In [570]:
np.exp(log_preds)[:40]
Out[570]:
In [571]:
train_dataset = SignalDataset(x_train, one_hot_y(y_train))
valid_dataset = SignalDataset(x_valid, one_hot_y(y_valid))
In [572]:
train_dataset.x
Out[572]:
In [573]:
# clip the standardized inputs to [0, 1], discarding negative values
train_dataset.x = np.clip(train_dataset.x, 0.0, 1.0)
valid_dataset.x = np.clip(valid_dataset.x, 0.0, 1.0)
In [574]:
train_dataloader = DataLoader(train_dataset)
valid_dataloader = DataLoader(valid_dataset)
In [575]:
mdata = ModelData(PATH, train_dataloader, valid_dataloader)
In [576]:
network = SimpleNet(len(train_dataset.x[0]))
network
Out[576]:
In [577]:
learner = Learner.from_model_data(network, mdata)
In [578]:
learner.lr_find()
learner.sched.plot()
In [579]:
learner.fit(1e-4, n_cycle=5, wds=1e-6)
Out[579]:
In [580]:
log_preds = learner.predict()
In [581]:
np.exp(log_preds)[:40]
Out[581]:
In [582]:
train_dataset = SignalDataset(x_train, one_hot_y(y_train))
valid_dataset = SignalDataset(x_valid, one_hot_y(y_valid))
In [583]:
train_dataset.x
Out[583]:
In [584]:
train_dataloader = DataLoader(train_dataset)
valid_dataloader = DataLoader(valid_dataset)
In [585]:
mdata = ModelData(PATH, train_dataloader, valid_dataloader)
In [586]:
network = SimpleNet(len(train_dataset.x[0]))
network
Out[586]:
In [587]:
learner = Learner.from_model_data(network, mdata)
In [588]:
learner.lr_find()
learner.sched.plot()
In [589]:
learner.fit(5e-4, n_cycle=5, wds=1e-6)
Out[589]:
In [590]:
log_preds = learner.predict()
In [591]:
np.exp(log_preds)[:40]
Out[591]:
More work is needed.