In [1]:
import os
import re
import pickle
import time
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, vstack
%matplotlib inline
# Custom modules
import const
import func
In [2]:
const.TRAIN_FILES
Out[2]:
In [3]:
const.TEST_FILES
Out[3]:
In [4]:
num_data = func.load_data_file(const.TRAIN_FILES[0], ftype='bin')
cat_data = func.load_data_file(const.TRAIN_FILES[1], ftype='bin')
num_data_te = func.load_data_file(const.TEST_FILES[0], ftype='bin')
cat_data_te = func.load_data_file(const.TEST_FILES[1], ftype='bin')
y = num_data['data']['y']
ids = np.concatenate([num_data['data']['ids'].Id.values, num_data_te['data']['ids'].Id.values])
num_f_n = num_data['data']['feature_names'][1:]
cat_f_n = cat_data['data']['feature_names'][1:]
num_data = vstack([num_data['data']['features'], num_data_te['data']['features']], format='csr')
cat_data = vstack([cat_data['data']['features'], cat_data_te['data']['features']], format='csr')
In [5]:
num_data.shape
Out[5]:
In [6]:
del num_data_te, cat_data_te
In [7]:
lut = pd.read_csv(os.path.join(const.DATA_PATH, 'date_feat_lut_V2.csv'))
lut.head()
Out[7]:
In [8]:
def insert_col_num(x):
    '''Add the numeric and categorical column numbers to the lookup table'''
    try:
        x['col_num'] = num_f_n.index(x['name_num'])
    except ValueError:
        x['col_num'] = np.nan
    try:
        x['col_cat'] = cat_f_n.index(x['name_cat'])
    except ValueError:
        x['col_cat'] = np.nan
    return x
In [9]:
lut = lut.apply(lambda x: insert_col_num(x), axis=1)
In [10]:
# Create lookup table for the numeric matrix so we can easily go from feature nr to column no
lut_num_indices = lut.groupby(['line','station_V2']).col_num.agg(['min','max'])
lut_num_indices.head()
Out[10]:
In [11]:
# Create lookup table for the categorical matrix so we can easily go from feature nr to column no
lut_cat_indices = lut.groupby(['line','station_V2']).col_cat.agg(['min','max'])
lut_cat_indices.head()
Out[11]:
In [12]:
print(lut_num_indices.shape)
print(lut_cat_indices.shape)
In [13]:
n_stations = lut.station_V2.nunique()
n_features = lut.feature_nr.nunique()
n_samples = num_data.shape[0]
print('Number of stations: {}'.format(n_stations))
print('Number of features: {}'.format(n_features))
print('Number of samples: {}'.format(n_samples))
We assume that when a sample has a measurement value for at least one of the features of a station, it has passed that station. We also assume that samples always pass the stations in order.
To calculate this matrix we need to sum over all the features per station. The most efficient way to do this is a matrix multiplication followed by thresholding at zero, where A holds samples (rows) by features (columns) and B maps features (rows) to stations (columns):

    A            B         A x B      (A x B) > 0
[0 1 1]       [1 0]       [1 1]         [1 1]
[0 0 1]   x   [1 0]   =   [0 1]   ->    [0 1]
[1 1 0]       [0 1]       [2 0]         [1 0]

We have to do this for the numeric and categorical features separately and concatenate the results later.
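As a minimal sanity check of this idea (the arrays below are illustrative toy values, not taken from the data), the thresholded product can be computed directly with NumPy:

A = np.array([[0, 1, 1],
              [0, 0, 1],
              [1, 1, 0]])        # samples x features: 1 = feature has a value
B = np.array([[1, 0],
              [1, 0],
              [0, 1]])           # features x stations: 1 = feature belongs to station
passed = (A.dot(B) > 0).astype(int)   # samples x stations
print(passed)                    # [[1 1], [0 1], [1 0]]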
In [14]:
# Create matrix B for numeric features
station_filter_num = np.zeros((lut_num_indices.shape[0], num_data.shape[1]))
for i in range(lut_num_indices.shape[0]):
    if lut_num_indices.iloc[i]['min'] >= 0:
        i_s = int(lut_num_indices.iloc[i]['min'])
        i_e = int(lut_num_indices.iloc[i]['max'] + 1)
        # print(i_s, i_e)
        station_filter_num[i, i_s:i_e] = 1
station_filter_num = csr_matrix(station_filter_num)
In [15]:
# Create matrix C for numeric features
num_per_station = (num_data * station_filter_num.T)>0
In [16]:
# Create matrix B for categorical features
station_filter_cat = np.zeros((lut_cat_indices.shape[0], cat_data.shape[1]))
for i in range(lut_cat_indices.shape[0]):
    if lut_cat_indices.iloc[i]['min'] >= 0:
        i_s = int(lut_cat_indices.iloc[i]['min'])
        i_e = int(lut_cat_indices.iloc[i]['max'] + 1)
        # print(i_s, i_e)
        station_filter_cat[i, i_s:i_e] = 1
station_filter_cat = csr_matrix(station_filter_cat)
In [17]:
# Create matrix C for categorical features
cat_per_station = (cat_data * station_filter_cat.T)>0
In [18]:
# Double check dimensions
print(num_per_station.shape)
print(cat_per_station.shape)
In [19]:
# Now get an array of station numbers visited for each sample
a = time.time()
stations_passed = []
for i in range(num_data.shape[0]):
    if not i % 100000:
        print('Calculated stations for {} of {}'.format(i+1, num_data.shape[0]))
    stations_passed.append(np.unique(np.concatenate([num_per_station[i,:].nonzero()[1], cat_per_station[i,:].nonzero()[1]], axis=0)))
    # The alternative below is ~10% slower
    #stations_passed.append(np.asarray(set(num_data[i,:].nonzero()[1]) & set(cat_data[i,:].nonzero()[1])))
print(time.time() - a)
In [20]:
# Convert a list of stations visited to a path string (e.g. '01111000'), a binary array (e.g. [0,1,1,1,1,0,0,0]), or both
def to_path_info_per_item(items_visited, n_items, o_type='string'):
    tmp = np.zeros((n_items)).astype(int)
    for n in items_visited:
        tmp[n] = 1
    if o_type == 'string':
        return ''.join(map(str, tmp))
    elif o_type == 'array':
        return tmp
    elif o_type == 'both':
        return ''.join(map(str, tmp)), tmp
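For example (illustrative station numbers), a sample that visited stations 1 and 3 out of 6 would be encoded as:

to_path_info_per_item([1, 3], 6, o_type='both')
# -> ('010100', array([0, 1, 0, 1, 0, 0]))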
In [21]:
# Convert list of stations to binary vectors
ps_strings = []
ps_arr = []
for i in range(n_samples):
    if not i % 100000:
        print('Converted {} of {}'.format(i+1, n_samples))
    s_s, s_arr = to_path_info_per_item(stations_passed[i], n_stations, o_type='both')
    ps_strings.append(s_s)
    ps_arr.append(s_arr)
In [22]:
# Now finally get the unique values based on the strings
u_str, u_ix, i_ix = np.unique(ps_strings, return_index=True, return_inverse=True)
u_arr = np.array([ps_arr[n] for n in u_ix])
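A toy illustration (not real paths) of what np.unique returns here: u_ix holds the index of the first occurrence of each unique path (used above to pick its binary array), and i_ix maps every sample back to the index of its unique path.

s = ['0110', '1000', '0110']
u, u_ix_t, i_ix_t = np.unique(s, return_index=True, return_inverse=True)
# u -> ['0110', '1000'], u_ix_t -> [0, 1], i_ix_t -> [0, 1, 0]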
In [23]:
print('Number of unique paths: {}'.format(u_arr.shape[0]))
In [24]:
# Store the sample_id with unique path id
pd.DataFrame({'id': ids, 'u_arr_ix': i_ix}).to_csv(os.path.join(const.DATA_PATH, 'eda_product_flow_sample_paths_station.csv'),
index=False)
# Store the unique strings with id
pd.DataFrame({'id': range(u_arr.shape[0]), 'u_str': u_str}).to_csv(os.path.join(const.DATA_PATH, 'eda_product_flow_unique_paths_station.csv'),
index=False)
In [529]:
def distance_matrix(bin_arr_list, verbose=False):
    '''Pairwise Hamming distances (number of differing stations) between binary path arrays'''
    tmp = np.empty((bin_arr_list.shape[0], bin_arr_list.shape[0]))
    tmp[:] = np.nan
    for i in range(bin_arr_list.shape[0]):
        if verbose and not i % 1000:
            print('{}/{}'.format(i+1, bin_arr_list.shape[0]))
        for j in range(bin_arr_list.shape[0]):
            # The matrix is symmetric, so restricting to j > i would halve the work
            tmp[i, j] = np.count_nonzero(bin_arr_list[i] != bin_arr_list[j])
    return tmp
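A minimal vectorized sketch of the same computation, assuming scipy's pdist is acceptable here: the 'hamming' metric returns the fraction of differing positions, so multiplying by the number of stations recovers the counts produced by the loop above.

from scipy.spatial.distance import pdist, squareform
# Pairwise Hamming distances as counts of differing stations
dist = squareform(pdist(u_arr, metric='hamming')) * u_arr.shape[1]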
In [530]:
tmp = distance_matrix(u_arr, True)
In [545]:
from sklearn.cluster import KMeans
n_cluster=7
km = KMeans(n_clusters=n_cluster)
%time km.fit(u_arr)
clusters = km.labels_.tolist()
In [546]:
len(clusters)
Out[546]:
In [547]:
from sklearn import decomposition
In [548]:
ns=7600
ne=ns+20
print(clusters[ns:ne])
[np.count_nonzero( u_arr[i]!=u_arr[i+1]) for i in range(ns, ne)]
Out[548]:
In [491]:
u_arr
Out[491]:
In [498]:
u_arr[1]
Out[498]:
In [549]:
for n_clus in range(n_cluster):
    u_clus = np.array([u_arr[i] for i, n in enumerate(clusters) if n == n_clus])
    tmp2 = distance_matrix(u_clus)
    print('Mean of matrix: {}'.format(np.nanmean(tmp2)))
    print('Max of matrix: {}'.format(np.nanmax(tmp2)))
In [523]:
n_clus=3
In [543]:
nn_clus = np.array([n for i,n in enumerate(clusters) if n==n_clus])
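The plotting cell below uses 2-D coordinates xs and ys that are never computed in this notebook export. Given the sklearn.decomposition import above, a plausible (assumed, not from the original) way to obtain them is a 2-component PCA projection of the unique path arrays:

# Assumed: project the unique binary path arrays to 2-D for plotting
pca = decomposition.PCA(n_components=2)
xy = pca.fit_transform(u_arr)
xs, ys = xy[:, 0], xy[:, 1]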
In [552]:
titles = [str(i) for i in range(tmp.shape[0])]
#set up colors per clusters using a dict
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a',4: '#8d03d1', 5:'#2b09db', 6:'#466eb8'}
#set up cluster names using a dict
cluster_names = {0: '1',
1: '2',
2: '3',
3: '4',
4: '5',
5: '6',
6: '7'}
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles))
#group by cluster
groups = df.groupby('label')
# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
            label=cluster_names[name], color=cluster_colors[name],
            mec='none')
ax.set_aspect('auto')
ax.tick_params(
    axis='x',            # changes apply to the x-axis
    which='both',        # both major and minor ticks are affected
    bottom='off',        # ticks along the bottom edge are off
    top='off',           # ticks along the top edge are off
    labelbottom='off')
ax.tick_params(
    axis='y',            # changes apply to the y-axis
    which='both',        # both major and minor ticks are affected
    left='off',          # ticks along the left edge are off
    right='off',         # ticks along the right edge are off
    labelleft='off')
ax.legend(numpoints=1) #show legend with only 1 point
# Add each path index as a text label at its x, y position
for i in range(len(df)):
    ax.text(df.iloc[i]['x'], df.iloc[i]['y'], df.iloc[i]['title'], size=8)
plt.show() #show the plot
In [253]:
pd.DataFrame({'string':station_strings}).to_csv(os.path.join(const.DATA_PATH,'path_string_per_station.csv'), index_label='Id')
In [224]:
stations = []
for i in range(num_per_station.shape[0]):
    if not i % 100000:
        print('Calculated stations for {} of {}'.format(i+1, num_per_station.shape[0]))
    stations.append(np.unique(np.concatenate([num_per_station[i,:].nonzero()[1], cat_per_station[i,:].nonzero()[1]], axis=0)))
In [555]:
pd.DataFrame({'string': u_str}).to_csv(os.path.join(const.DATA_PATH,'path_unique_string_per_station.csv'), index_label='Id')
In [223]:
hashes = []
for i in range(num_per_station.shape[0]):
    if not i % 10000:
        print('Hashed {} of {}'.format(i+1, num_per_station.shape[0]))
    hashes.append(hash(str(np.concatenate([num_per_station[i,:].nonzero()[1], cat_per_station[i,:].nonzero()[1]], axis=0))))
In [562]:
lut.feature_nr.max()
Out[562]:
In [209]:
pd.Series(hashes).nunique()
Out[209]:
In [216]:
pd.Series(hashes).value_counts().head()
Out[216]:
In [210]:
hash_vals = pd.Series(hashes).value_counts()
In [213]:
hash_vals.hist(range=[0,100])
Out[213]:
In [215]:
(hash_vals<10).sum()
Out[215]:
This results in 24775 different hashes. The most common hash is 4906655664106956524, with 6679 samples.
Of the 24775 hashes, 12157 have only a single sample and 20205 have fewer than 10 samples.
In [34]:
def to_path_info_per_feature(features_visited, n_features, o_type='string'):
    tmp = np.zeros((n_features)).astype(int)
    for n in features_visited:
        tmp[n] = 1
    if o_type == 'string':
        return ''.join(map(str, tmp))
    elif o_type == 'array':
        return tmp
    elif o_type == 'both':
        return ''.join(map(str, tmp)), tmp
In [ ]:
# Generate a list of all the features products have values for
features_visited = []
for i in range(n_samples):
    if not i % 100000:
        print('Calculated {} of {}'.format(i+1, n_samples))
    #stations_visited.append(hash(str(np.concatenate([num_data[i,:].nonzero()[1], cat_data[i,:].nonzero()[1]], axis=0).data)))
    features_visited.append(np.unique(np.concatenate([num_data[i,:].nonzero()[1], cat_data[i,:].nonzero()[1]], axis=0)))
In [ ]:
# Convert the list of features visited to binary vectors
station_strings = []
station_arr = []
for i in range(n_samples):
    if not i % 100000:
        print('Converted {} of {}'.format(i+1, n_samples))
    s_s, s_arr = to_path_info_per_feature(features_visited[i], n_features, o_type='both')
    station_strings.append(s_s)
    station_arr.append(s_arr)
In [ ]:
# Now finally get the unique values based on the strings
u_str, u_ix, i_ix = np.unique(station_strings, return_index=True, return_inverse=True)
u_arr = np.array([station_arr[n] for n in u_ix])
In [ ]:
print('Number of unique paths: {}'.format(u_arr.shape[0]))
In [ ]:
# Store the sample_id with unique path id
pd.DataFrame({'id': ids, 'u_arr_ix': i_ix}).to_csv(os.path.join(const.DATA_PATH, 'eda_product_flow_sample_paths_per_feature.csv'),
index=False)
# Store the unique strings with id
pd.DataFrame({'id': range(u_arr.shape[0]), 'u_str': u_str}).to_csv(os.path.join(const.DATA_PATH, 'eda_product_flow_unique_paths_per_feature.csv'),
index=False)
In [ ]: