In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
sns.set()
from matplotlib import pyplot as plt
%matplotlib inline
In [2]:
# Read the training split; the path is relative to the current working directory.
train_data = pd.read_csv(filepath_or_buffer="hw1_train.csv")
# Show the first five rows as the cell output.
train_data.head(n=5)
Out[2]:
In [3]:
# Read the test split; the path is relative to the current working directory.
test_data = pd.read_csv(filepath_or_buffer="hw1_test.csv")
# Print the frame summary (columns, non-null counts, dtypes, memory usage).
test_data.info()
Plot the input signals of the train dataset and their FFTs as overlays, with a separate row of panels for each target class.
In [4]:
# Overlay the training signals (left column) and the real part of their FFTs
# (right column); the top row holds class 1, the bottom row class -1.
FIG_NUM = 1320  # maximum number of training rows to overlay
fig, axes = plt.subplots(2, 2, figsize=(20, 20))
t = np.arange(500)  # sample index; assumes every signal has 500 points -- TODO confirm
freq = np.fft.fftfreq(t.shape[-1])  # FFT bin frequencies; the plots below index bins by position instead
lbl1 = "Source signals. Target: 1"
lbl2 = "Source signals. Target: -1"
lbl3 = "FFT of signals. Target: 1"
lbl4 = "FFT of signals. Target: -1"

# Target value -> (row of panels, time-domain title, frequency-domain title).
panel_spec = {1.0: (0, lbl1, lbl3), -1.0: (1, lbl2, lbl4)}

# Titles and y-limits do not depend on the plotted row, so they are set once
# up front rather than on every loop iteration.
for panel_row, time_title, fft_title in panel_spec.values():
    axes[panel_row, 0].set_title(time_title)
    axes[panel_row, 0].set_ylim([-3, 3])
    axes[panel_row, 1].set_title(fft_title)
    axes[panel_row, 1].set_ylim([-200, 200])

# Extract the data once with explicit positional indexing: column 0 is compared
# against the class label and all remaining columns are treated as the signal.
# (Integer keys on a label-indexed Series, e.g. ``row[0]``, are deprecated.)
targets = train_data.iloc[:, 0].to_numpy()
signals = train_data.iloc[:, 1:].to_numpy()

# Clamp to the number of available rows so a shorter dataset cannot raise IndexError.
for i in range(min(FIG_NUM, len(train_data))):
    spec = panel_spec.get(targets[i])
    if spec is None:
        continue  # rows whose label is neither 1 nor -1 are not drawn
    panel_row = spec[0]
    axes[panel_row, 0].plot(t, signals[i])
    sp = np.fft.fft(signals[i])
    # Only the real part of the first 100 FFT bins is drawn.
    axes[panel_row, 1].plot(t[0:100], sp.real[0:100])
In [5]:
# Start each engineered-feature table with the label column only;
# the feature columns are added to these frames afterwards.
train_new_feat = train_data['Target'].to_frame(name='Target')
test_new_feat = test_data['Target'].to_frame(name='Target')
Feature 1. Based on section 2, let's consider the power of the input signal in the time domain. To suppress noise, the signal is smoothed first: a rolling window is applied and the mean is computed for each frame. The energy (sum of squares) of the smoothed signal is then added to the final dataframe.
In [6]:
import numpy as np


def smooth_signals(data, window=60):
    """Return the rolling-mean-smoothed signals of ``data``, one row per signal.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Target' column; every other column is treated as one
        time step of the signal.
    window : int, default 60
        Number of consecutive time steps averaged per output value.

    Returns
    -------
    pandas.DataFrame
        Smoothed signals with every column that contains a NaN removed.
    """
    signals = data.drop(columns=['Target'])
    # ``DataFrame.rolling(axis=1)`` is deprecated in pandas 2.1+, so the window
    # is rolled down the transposed frame and the result is transposed back.
    smoothed = signals.T.rolling(window=window).mean().T
    # Positions without a full window are NaN; dropna removes every column
    # that contains at least one NaN.
    return smoothed.dropna(axis=1)


# Energy of the smoothed signal: sum of squares across each row.
train_data_sm = smooth_signals(train_data)
train_new_feat['sig_energy'] = np.square(train_data_sm).sum(axis=1)
test_data_sm = smooth_signals(test_data)
test_new_feat['sig_energy'] = np.square(test_data_sm).sum(axis=1)
Feature 2. Moreover, the overlays in section 2 clearly show that the amplitude of the signals in the frequency domain is much higher for the '-1' class than for the '1' class. However, this holds only in the high-frequency region (approximately from the 45th to the 70th sample). So let's compute the power of the signals in this high-frequency region (here, the sum of the FFT magnitudes over those samples) and present it as the second feature.
In [7]:
def hf_band_magnitude(data, start=45, stop=70):
    """Sum the FFT magnitudes of each signal over the bins ``start`` .. ``stop - 1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Target' column; every other column is treated as one
        time step of the signal.
    start, stop : int, default 45 and 70
        Half-open range of FFT bin indices to accumulate.

    Returns
    -------
    numpy.ndarray
        One value per row of ``data``, in row order.
    """
    signals = data.drop(columns=['Target']).to_numpy()
    spectrum = np.fft.fft(signals, axis=-1)
    return np.abs(spectrum[:, start:stop]).sum(axis=1)


# The arrays are assigned positionally, so the result does not depend on the
# frames' index labels (assigning a DataFrame would align on the index and
# produce NaN for any non-default index).
fft_hf_power = hf_band_magnitude(train_data)
train_new_feat['fft_hf_weight'] = fft_hf_power
fft_hf_power = hf_band_magnitude(test_data)
test_new_feat['fft_hf_weight'] = fft_hf_power
Killer feature. Sometimes miracles happen, so let's consider a feature such as the number of unique values in each time series (i.e. how many distinct values occur in it).
In [8]:
# Number of distinct values in each row of signal samples (NaN is not counted).
train_new_feat['val_counts'] = train_data.drop(columns=['Target']).nunique(axis=1)
test_new_feat['val_counts'] = test_data.drop(columns=['Target']).nunique(axis=1)
In [9]:
# Separate the engineered features (X) from the labels (Y) for each split.
X_train, Y_train = train_new_feat.drop(columns=['Target']), train_new_feat['Target']
X_test, Y_test = test_new_feat.drop(columns=['Target']), test_new_feat['Target']
In [10]:
def plot_feature_pair(features, target, x_col, y_col, x_label, y_label, dataset_name):
    """Scatter one feature against another, coloured by the class label.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame holding the columns ``x_col`` and ``y_col``.
    target : array-like
        Class label per row; used as the colour value of each point.
    x_col, y_col : str
        Names of the feature columns drawn on the x and y axes.
    x_label, y_label : str
        Axis captions.
    dataset_name : str
        Prefix of the figure title, e.g. "Train" or "Test".

    Returns
    -------
    matplotlib.axes.Axes
        The axes the scatter was drawn on.
    """
    fig, ax = plt.subplots(figsize=(20, 6))
    ax.set_title(dataset_name + " dataset. Yellow - target '1'; Purple - target '-1'", size=16)
    ax.set_xlabel(x_label, size=16)
    ax.set_ylabel(y_label, size=16)
    points = ax.scatter(features[x_col], features[y_col], c=target, cmap='viridis')
    fig.colorbar(points, ax=ax)
    return ax


# Axis caption for every engineered feature.
FEATURE_LABELS = {
    'fft_hf_weight': "Signal energy in high freq. range",
    'sig_energy': "Total signal energy",
    'val_counts': "Number of unique values of time series",
}

# (x feature, y feature) combinations to draw; each one is shown for the train
# split first and then for the test split.
FEATURE_PAIRS = [
    ('fft_hf_weight', 'sig_energy'),
    ('fft_hf_weight', 'val_counts'),
    ('sig_energy', 'val_counts'),
]

for x_col, y_col in FEATURE_PAIRS:
    for dataset_name, features, target in (("Train", X_train, Y_train), ("Test", X_test, Y_test)):
        plot_feature_pair(
            features, target, x_col, y_col,
            FEATURE_LABELS[x_col], FEATURE_LABELS[y_col], dataset_name,
        )
Three features have already been introduced in section 4.
In [11]:
from sklearn.metrics import accuracy_score
# Simple decision rule taking into account energy characteristics
def predict_target(row, hf_threshold=258.623, energy_threshold=4.71):
    """Classify one sample from its two energy features.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys 'fft_hf_weight' and 'sig_energy'
        (for example a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).
    hf_threshold : float, default 258.623
        Inclusive upper bound on 'fft_hf_weight' for class 1.
    energy_threshold : float, default 4.71
        Inclusive upper bound on 'sig_energy' for class 1.

    Returns
    -------
    int
        1 when both features are at or below their thresholds, otherwise -1.
    """
    if row['fft_hf_weight'] <= hf_threshold and row['sig_energy'] <= energy_threshold:
        return 1
    return -1
# The simplest decision rule taking into account the killer feature
def predict2_target(row, min_unique=480):
    """Classify one sample from its count of distinct signal values.

    Parameters
    ----------
    row : mapping
        Anything indexable by the key 'val_counts'
        (for example a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).
    min_unique : int, default 480
        Exclusive lower bound on 'val_counts' for class 1.

    Returns
    -------
    int
        1 when 'val_counts' is strictly greater than ``min_unique``, otherwise -1.
    """
    if row['val_counts'] > min_unique:
        return 1
    return -1
# Evaluate the energy-based rule. The predictions are stored in a
# 'pred_target' column of the feature frames.
X_train['pred_target'] = X_train.apply(predict_target, axis=1)
X_test['pred_target'] = X_test.apply(predict_target, axis=1)
# Each label now names the split that is actually scored (they were swapped);
# arguments follow the (y_true, y_pred) order of accuracy_score.
print("Train data acc(energy features):", accuracy_score(Y_train, X_train['pred_target']))
print("Test data acc(energy features):", accuracy_score(Y_test, X_test['pred_target']))
print()
# Evaluate the unique-value rule. This overwrites the 'pred_target' column
# written above; the rule reads 'val_counts' by name, so the extra column
# already present in the frames does not affect it.
X_train['pred_target'] = X_train.apply(predict2_target, axis=1)
X_test['pred_target'] = X_test.apply(predict2_target, axis=1)
print("Train data acc(killer feature):", accuracy_score(Y_train, X_train['pred_target']))
print("Test data acc(killer feature):", accuracy_score(Y_test, X_test['pred_target']))