In [1]:
import os
import re
import pickle
import time
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from scipy.sparse import csr_matrix, vstack
%matplotlib inline
# Custom modules
import const
import func
In [2]:
# Numeric data
num_dat = func.load_data_file('train_numeric')
y = num_dat['data']['y']
data = num_dat['data']['features']
ids = num_dat['data']['ids']
n_f_names = num_dat['data']['feature_names'][1:]
del num_dat
In [3]:
# Load numeric data test
num_dat = func.load_data_file('test_numeric')
ids_test = num_dat['data']['ids']
data = vstack([data, num_dat['data']['features']], format='csr')
del num_dat
In [4]:
ids_all = pd.concat([ids, ids_test], axis=0)
In [5]:
n_1 = y[y.Response==1].index.values
n_0 = y[y.Response==0].index.values
In [6]:
ids_test.head(3)
Out[6]:
In [7]:
y = y.reindex(pd.concat([ids.Id, ids_test.Id]))
In [8]:
# Load look-up table
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.head(3)
Out[8]:
In [10]:
# Read start time per station (discard features related to t delta)
t_station = pd.read_csv(os.path.join(const.DATA_PATH, 'feat_set_date_station.csv')).iloc[:,:129]
t_station.head(3)
Out[10]:
In [11]:
data.shape[0]
Out[11]:
In [18]:
def visualize_detrended_numeric_feature(ft_nr, showplot=True, cluster = 'cluster_n8'):
# For a given numeric features
# Get station of feature
# Get station time per sample
# Get samples with value at this feature
# Combine both to dataframe
# Group dataframe by sec and take mean
# Subtract this mean from each value
#n = 0.0
# Get station number
station = lut[lut.feature_nr==ft_nr].station_V2.values[0]
# Column nr in sparse array
n_col = lut[lut.feature_nr==ft_nr].col_num.values[0]
if not n_col>=0:
return None
# Samples with value for this feature
i_n = data[:, n_col].nonzero()[0]
# Timestamps for these values from station
i_t = (~t_station.iloc[i_n, t_station.columns.get_loc('t_' + str(station))].isnull()).index.values
# Samples with value and timestamp
i = list(set(i_n) & set(i_t))
# Get value, timestamp, Response and cluster for each sample and combine in DataFrame
v = data[i, n_col].todense().A1
t = 1.0/100*t_station.iloc[i, t_station.columns.get_loc('t_' + str(station))].values
r = y.iloc[i]['Response'].values
c = cluster_info.iloc[i][cluster].values
c50 = cluster_info.iloc[i]['cluster_n50'].values
c150 = cluster_info.iloc[i]['cluster_n150'].values
c500 = cluster_info.iloc[i]['cluster_n500'].values
df = pd.DataFrame({'i':i, 't':t,'v':v, 'r':r, 'c':c, 'c50':c50, 'c150':c150, 'c500':c500})
# Check nan in t (if large, adjust code for t)
print('F nr: {} | Feature values: {} | Time values: {} | Nans found: {}'.format(ft_nr,
len(i_n),
len(i_t),
(np.isnan(t)).sum()))
# Prepare grouping window
t_window = 0.02
df['t_w'] = (df['t']/t_window).round(0)*t_window
# Calculate average numeric value per time window and add to DataFrame
mean_t_w = df.groupby('t_w')[['v']].agg('median').rename(columns={'v':'mu_v'})
df = df.merge(mean_t_w, how='left', left_on='t_w', right_index=True)
# Calculate average numeric value per time window and cluster and add to DataFrame
mean_t_w_c = df.groupby(['t_w','c'])[['v']].agg('median').rename(columns={'v':'mu_v_c'})
df = df.set_index(['t_w','c']).merge(mean_t_w_c, how='left', left_index=True, right_index=True).reset_index()
# Above is faster than transform:
# df['mu_v'] = df.groupby('t_w')[['v']].transform('mean')
# Calculate some differences
df['v_diff_abs'] = df['v'] - df['mu_v_c']
df['v_diff_abs_abs'] = df['v_diff_abs'].abs()
df['v_diff_rel'] = (df['v'] / df['mu_v']).abs()
if showplot:
# Define figure and axes
plt.figure(figsize=(16,10))
gs = gridspec.GridSpec(4, 5)
ax0 = plt.subplot(gs[0,:]) # Raw data
ax1 = plt.subplot(gs[1,:]) # Detrended data
ax5 = plt.subplot(gs[2,:]) # Detrended data
ax2 = plt.subplot(gs[3,0]) # Histogram of raw values
ax3 = plt.subplot(gs[3,1]) # Histogram of detrended values
ax4 = plt.subplot(gs[3,2]) # Histogram of relative values
ax6 = plt.subplot(gs[3,3]) # Cumulative plot
ax7 = plt.subplot(gs[3,4]) # Relative values per bin
means = df.groupby('r').mean()
xlim = [0, 1750]
plt.suptitle('Feature: {} | Station: {} | <v-> = {:.4f} | <v+> = {:.4f}'.format(ft_nr,
station,
means.loc[0,'v_diff_abs_abs'],
means.loc[1,'v_diff_abs_abs']))
colors = ['r','g','b','k','c','w','m','y']
# Raw values
for cl in cluster_info[cluster].unique():
cl_x = df[df['c']==cl].t
cl_y = df[df['c']==cl].v
if cl_x.shape[0]>0:
ax0.scatter(cl_x, cl_y, marker='.', color=colors[cl % 8], label='{}: {:.2f}%'.format(cl,
float(cl_x.shape[0])/df.shape[0]*100))
ax0.scatter(df.t_w, df.mu_v,color='c', marker='x', s=5)
ax0.set_ylabel('Raw value')
ax0.set_xlim(xlim)
# Add legend
ax0.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
ncol=cluster_info[cluster].nunique() + 1, mode="expand", borderaxespad=0.)
# Detrended values
ax1.scatter(df.t, df.v_diff_abs, marker='.', color='g')
ax1.scatter(df.t[df.r==1], df.v_diff_abs[df.r==1], color='r', marker='.', s=100)
ax1.set_ylabel('Detrended')
ax1.set_xlim(xlim)
# Error rates
e_rates = df.groupby('t_w')['r'].agg(['sum','count'])
e_rates['r'] = e_rates['sum'] / e_rates['count']
e_rates['r'].plot(marker='.', linestyle='None', markersize='10', ax=ax5)
ax5.set_ylabel('Error rate')
ax5.set_xlim(xlim)
# Histogram of raw values
nbins = 100
rmax = 0.4
freq_1, bins = np.histogram(df.loc[df['r']==1,'v'], bins=nbins, range=[-rmax, rmax], density=True)
freq_0, bins = np.histogram(df.loc[df['r']==0,'v'], bins=nbins, range=[-rmax, rmax], density=True)
ax2.bar(bins[1:], freq_0, color='g', width=2*rmax/nbins, alpha=0.5)
ax2.bar(bins[1:], freq_1, color='r', width=2*rmax/nbins, alpha=0.5)
ax2.set_ylabel('Raw values')
# Histogram of values after detrending
nbins = 100
rmax = 0.4
freq_1, bins = np.histogram(df.loc[df['r']==1,'v_diff_abs'], bins=nbins, range=[-rmax, rmax], density=True)
freq_0, bins = np.histogram(df.loc[df['r']==0,'v_diff_abs'], bins=nbins, range=[-rmax, rmax], density=True)
ax3.bar(bins[1:], freq_0, color='g', width=2*rmax/nbins, alpha=0.5)
ax3.bar(bins[1:], freq_1, color='r', width=2*rmax/nbins, alpha=0.5)
ax3.set_ylabel('Detrended')
# Relative values per bin
ax4.bar(bins[1:], freq_1/freq_0, color='b', width=2*rmax/nbins, alpha=0.5)
ax4.set_ylabel('Difference')
# Cumulative sum of abs values
df['dummy'] = 1
tmp0 = df[df['r']==0].set_index('v_diff_abs_abs').sort_index()['dummy'].cumsum()
tmp1 = df[df['r']==1].set_index('v_diff_abs_abs').sort_index()['dummy'].cumsum()
ax6.plot(tmp0.index, tmp0.values.astype(float) / tmp0.max(), color='g')
ax6.plot(tmp1.index, tmp1.values.astype(float) / tmp1.max(), color='r')
ax6.set_ylabel('Cumsum')
# Histogram of relative values by detrending
nbins = 40
rmax = 40
freq_0, bins = np.histogram(df.loc[df['r']==0,'v_diff_rel'], bins=nbins, range=[0, rmax], density=True)
freq_1, bins = np.histogram(df.loc[df['r']==1,'v_diff_rel'], bins=nbins, range=[0, rmax], density=True)
ax7.bar(bins[1:], freq_0, color='g', width=rmax/nbins, alpha=0.5)
ax7.bar(bins[1:], freq_1, color='r', width=rmax/nbins, alpha=0.5)
ax7.set_ylabel('Relative detrending')
plt.show()
return df
In [92]:
a = visualize_detrended_numeric_feature(3354, cluster='cluster_n25', showplot=False)
In [ ]:
data_detrended = data.copy()
for f_nr in lut['feature_nr'].unique():
a = visualize_detrended_numeric_feature(f_nr, showplot=False, cluster='cluster_n25')
if isinstance(a, pd.DataFrame):
# Update values in sparse array
n_col = lut[lut.feature_nr==f_nr].col_num.values[0]
i_n = data[:, n_col].nonzero()[0]
# To not mess up sparse structure
a.loc[a['v_diff_abs']==0,'v_diff_abs'] = 1e-5
data_detrended[a['i'].values, n_col] = a['v_diff_abs'].values.reshape(-1,1)
else:
print('No numeric data for feature {}'.format(f_nr))
In [36]:
with open(os.path.join(const.DATA_PATH, 'feat_set_numeric_detrended.pkl'), 'wb') as f:
pickle.dump(data_detrended, f, pickle.HIGHEST_PROTOCOL)
#pd.concat([ids,pd.DataFrame(data_detrended.todense())], axis=1).to_csv(os.path.join(const.DATA_PATH, 'feat_set_numeric_detrended.csv'), index=False)
In [21]:
def visualize_values_per_cluster(a):
''' Visualizes the distribution of numeric values per clusters
a is the output of visualize_detrended_numeric_feature()
'''
cl_col = 'c50'
n_col = 50
for cl in range(n_col):
print a[a[cl_col]==cl].shape[0], a[a[cl_col]==cl].v.nunique()
if a[a[cl_col]==cl].shape[0]>100:
f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharex=True, figsize=(16,6))
#a[a.c500==cl].v.hist(bins=100)
# Histogram of raw values
nbins = 50
rmax = 0.3
freq_2, bins = np.histogram(a.loc[(a[cl_col]==cl),'v'], bins=nbins, range=[-rmax, rmax])
freq_1, bins = np.histogram(a.loc[(a[cl_col]==cl) & (a['r']==1),'v'], bins=nbins, range=[-rmax, rmax], density=True)
freq_0, bins = np.histogram(a.loc[(a[cl_col]==cl) & (a['r']==0),'v'], bins=nbins, range=[-rmax, rmax], density=True)
freq_1r, bins = np.histogram(a.loc[(a[cl_col]==cl) & (a['r']==1),'v'], bins=nbins, range=[-rmax, rmax])
ax1.set_title('<v> = {:.4f}'.format(a.loc[(a[cl_col]==cl),'v'].mean()))
ax2.set_title('<v-> = {:.4f}'.format(a.loc[(a[cl_col]==cl) & (a['r']==0),'v'].mean()))
ax3.set_title('<v+> = {:.4f}'.format(a.loc[(a[cl_col]==cl) & (a['r']==1),'v'].mean()))
ax1.bar(bins[1:], freq_2, color='b', width=2*rmax/nbins, alpha=1)
ax2.bar(bins[1:], freq_0, color='g', width=2*rmax/nbins, alpha=0.5)
ax2.bar(bins[1:], freq_1, color='r', width=2*rmax/nbins, alpha=0.5)
ax3.bar(bins[1:], freq_1r, color='r', width=2*rmax/nbins, alpha=0.5)
plt.show()
def visualize_per_time(a):
''' Visualizes the distribution of numeric values per timeunit
a is the output of visualize_detrended_numeric_feature()
'''
# square root of number of time splits
n = 5
f, ax = plt.subplots(n,n,sharex=True, figsize=(16,16))
t_split = np.linspace(0, a['t'].max(), n ** 2 +1)
for i in range(n ** 2):
tmin = t_split[i]
tmax = t_split[i+1]
#print tmin, tmax
# Histogram of raw values
nbins = 50
rmax = 0.3
freq_2, bins = np.histogram(a.loc[(a['t']>=tmin) & (a['t']<=tmax),'v'], bins=nbins, range=[-rmax, rmax])
freq_1, bins = np.histogram(a.loc[(a['t']>=tmin) & (a['t']<=tmax) & (a['r']==1),'v'], bins=nbins, range=[-rmax, rmax], density=True)
freq_0, bins = np.histogram(a.loc[(a['t']>=tmin) & (a['t']<=tmax) & (a['r']==0),'v'], bins=nbins, range=[-rmax, rmax], density=True)
freq_1r, bins = np.histogram(a.loc[(a['t']>=tmin) & (a['t']<=tmax) & (a['r']==1),'v'], bins=nbins, range=[-rmax, rmax])
#ax1.set_title('<v> = {:.4f}'.format(a.loc[(a[cl_col]==cl),'v'].mean()))
#ax2.set_title('<v-> = {:.4f}'.format(a.loc[(a[cl_col]==cl) & (a['r']==0),'v'].mean()))
#ax3.set_title('<v+> = {:.4f}'.format(a.loc[(a[cl_col]==cl) & (a['r']==1),'v'].mean()))
ax[i / n, i % n].bar(bins[1:], freq_2, color='b', width=2*rmax/nbins, alpha=1)
#ax[i / 9, 9 % i].bar(bins[1:], freq_0, color='g', width=2*rmax/nbins, alpha=0.5)
#ax[i / 9, 9 % i].bar(bins[1:], freq_1, color='r', width=2*rmax/nbins, alpha=0.5)
#ax[i / 9, 9 % i].bar(bins[1:], freq_1r, color='r', width=2*rmax/nbins, alpha=0.5)
plt.show()
In [20]:
def check_timestamp_coverage_per_feature():
for n in lut.feature_nr.unique():
station = lut[lut.feature_nr==n].station_V2.values[0]
t_col = lut[lut.feature_nr==n].col_dat.values[0]
# TO-DO: GET TIMING FROM FEATURE WITH TIMES
n_col = lut[lut.feature_nr==n].col_num.values[0]
if np.isnan(n_col):
continue
i_n = data[:, n_col].nonzero()[0]
if np.isnan(t_col):
i_t = 0.0
else:
i_t = set(i_n) & set(t[:, t_col].nonzero()[0])
i_t = 1.0 * len(i_t)
i_ts = (~t_station.iloc[i_n, t_station.columns.get_loc('t_' + str(station))].isnull()).sum()
i_n = 1.0 * len(i_n)
print('F: {} | S: {} | n: {} | t: {} | r: {}% t(S): {}'.format(n,
station,
i_n,
i_t,
i_t/i_n*100,
i_ts))
In [31]:
def visualize_feature(f_nr):
t_col = lut[lut.feature_nr==f_nr].col_dat.values[0]
n_col = lut[lut.feature_nr==f_nr].col_num.values[0]
if np.isnan(n_col):
print('No numeric data for {}'.format(f_nr))
return
if np.isnan(t_col):
print('No timestamp data for {}'.format(f_nr))
return
#try:
if 1:
# Get rows that have values for t AND numeric data
nrow = list((set(t[:, t_col].nonzero()[0]) & set(data[:, n_col].nonzero()[0])))
nrow_1 = list(set(nrow) & set(n_1))
nrow_0 = list(set(nrow) & set(n_0))
t0 = t[nrow_0[:250000], t_col].todense()
d0 = data[nrow_0[:250000], n_col].todense()
t1 = t[nrow_1, t_col].todense()
d1 = data[nrow_1, n_col].todense()
# Sort arrays
s_t0 = np.argsort(t0, axis=0)
s_t1 = np.argsort(t1, axis=0)
t0 = t0[s_t0].ravel().T
d0 = d0[s_t0].ravel().T
t1 = t1[s_t1].ravel().T
d1 = d1[s_t1].ravel().T
df_0 = pd.DataFrame(np.concatenate([t0,d0], axis=1), columns=['t0','d0'])
df_1 = pd.DataFrame(np.concatenate([t1,d1], axis=1), columns=['t1','d1'])
df_0.loc[:, 't0'] = df_0['t0'].round(0)
df_1.loc[:, 't1'] = df_1['t1'].round(0)
df_0_agg = df_0.groupby('t0').agg(['mean','count'])
df_1_agg = df_1.groupby('t1').agg(['mean','count'])
df_all = df_0_agg.merge(df_1_agg, left_index=True, right_index=True, how='outer').fillna(0)
f, (ax1, ax2, ax3, ax4) = plt.subplots(4,1,figsize=(16,6), sharex=True)
ax1.plot(t0, d0, linestyle='None', marker='.', color='g')
ax1.plot(t1, d1, linestyle='None', marker='.', color='r')
ax1.set_ylabel('Value')
ax1.set_title('Line {} | Station {} | Feature {} '.format(lut[lut.feature_nr==f_nr].line.values[0],
lut[lut.feature_nr==f_nr].station_V2.values[0],
f_nr))
ax2.plot(df_0_agg.index, df_0_agg['d0']['mean'], color='g')
ax2.plot(df_1_agg.index, df_1_agg['d1']['mean'], color='r')
ax2.set_ylabel('Moving average')
ax3.plot(df_all.index, df_all['d1']['count'].astype(float).div(df_all['d0']['count']), color='b')
ax3.set_ylabel('Error rate')
ax3.set_xlabel('Timestamp')
ax3.set_ylim([0, 0.1])
plt.show()
return df_all
In [ ]: