In [ ]:
## Copyright (c) 2016 Mikel Bober-Irizar
## Feature-Time Instability Metric
# Script to evaluate the change in properties of features over time
# To use, call the function find_ovalue(FEATURE, TARGET)
In [ ]:
import numpy as np
import pandas as pd
In [ ]:
########## CONFIG #########
# Resolution of time axis to test (number of time chunks the data is split into)
time_res = 10
# Resolution of feature value binning (number of histogram bins)
x_res = 10000
# Higher values will find smaller fluctuations in data but may have more noise
# Threshold of values that have to be in a histogram bin for it to be considered
# (used by the 'inter' method only):
thresh = 0.0001
# Method to measure stability, either 'inter' for histogram intersection or 'purity' for tree split purity
method = 'purity'
# Only used for the purity metric, gives weight to the different splits based on how many samples exist in the bin
weighted = True
# Purity metric only: if True, NaN purities from empty bins are left as NaN
# (and skipped by nansum) instead of being converted to 0
ignore_zero = False
######### END CONFIG ##########
In [ ]:
def hist_inter(a, b, bins):
    """Measure similarity of two samples via normalised histogram intersection.

    a, b: sequences of numeric feature values (non-empty).
    bins: number of histogram bins to use.
    Returns a float in [0, 1]; higher means the two distributions overlap
    more. Bins whose combined mass is at or below the module-level `thresh`
    are ignored.
    """
    # Shared range so both histograms use identical bin edges
    hist_max = max(max(a), max(b))
    hist_min = min(min(a), min(b))
    # NOTE: the `normed` argument was deprecated and then removed from
    # np.histogram (NumPy 1.24); raw counts are normalised manually below.
    hist_a = np.histogram(a, bins=bins, range=(hist_min, hist_max))[0]
    hist_b = np.histogram(b, bins=bins, range=(hist_min, hist_max))[0]
    # Normalise counts to fractions of each sample size
    hist_a = (hist_a / len(a)).tolist()
    hist_b = (hist_b / len(b)).tolist()
    k = 0
    # Histogram intersection: sum of per-bin minima over non-negligible bins
    for d in zip(hist_a, hist_b):
        if sum(d) > thresh:
            k += min(d)
    return k
In [ ]:
def hist_purity(a, b, target_a, target_b, bins, weighted=True, ignore_nan=False):
    """Compare the tree-split purity of a feature between two time chunks.

    a, b: lists of feature values from two different time chunks.
    target_a, target_b: binary labels (1 = positive) aligned with a and b.
    bins: number of histogram bins shared by both chunks.
    weighted: weight each bin's purity difference by its share of samples.
    ignore_nan: if True, keep NaN purities from empty bins (nansum skips
        them); if False, convert them to 0 first.
    Returns a float instability score; 0 means identical purity profiles.
    """
    # Shared range so all four histograms use identical bin edges
    hist_max = max(max(a), max(b))
    hist_min = min(min(a), min(b))
    hist = pd.DataFrame({'a': a, 'b': b, 'ta': target_a, 'tb': target_b})
    # Separate each chunk's feature values by label
    a_true = hist.loc[hist.ta == 1, 'a'].values
    a_false = hist.loc[hist.ta < 1, 'a'].values
    b_true = hist.loc[hist.tb == 1, 'b'].values
    b_false = hist.loc[hist.tb < 1, 'b'].values
    # Compute raw-count histograms (the removed `normed` argument is gone;
    # np.histogram returns counts by default)
    hist_a_true = np.histogram(a_true, bins=bins, range=(hist_min, hist_max))[0]
    hist_a_false = np.histogram(a_false, bins=bins, range=(hist_min, hist_max))[0]
    hist_b_true = np.histogram(b_true, bins=bins, range=(hist_min, hist_max))[0]
    hist_b_false = np.histogram(b_false, bins=bins, range=(hist_min, hist_max))[0]
    # Per-bin split purity = positives / total; empty bins yield NaN
    hist_a_tot = hist_a_true + hist_a_false
    hist_b_tot = hist_b_true + hist_b_false
    with np.errstate(divide='ignore', invalid='ignore'):
        hist_a_purity = hist_a_true / hist_a_tot
        hist_b_purity = hist_b_true / hist_b_tot
    if ignore_nan is False:
        hist_a_purity = np.nan_to_num(hist_a_purity)
        hist_b_purity = np.nan_to_num(hist_b_purity)
    # BUG FIX: the original expression divided only the second term by 2
    # (precedence), so the "average" weights summed to 1.5 instead of 1.
    # Each bin's weight is the mean of its sample shares in the two chunks.
    hist_weights = (hist_a_tot / np.sum(hist_a_tot)
                    + hist_b_tot / np.sum(hist_b_tot)) / 2
    if weighted:
        k = np.nansum(np.abs(hist_a_purity - hist_b_purity) * hist_weights)
    else:
        # Unweighted: mean absolute purity difference over all samples.
        # (Original used len(a + b); equivalent for the list inputs callers pass.)
        k = np.nansum(np.abs(hist_a_purity - hist_b_purity)) / (len(a) + len(b))
    return k
In [ ]:
def find_ovalue_inter(feature, target):
    """Instability of `feature` over time using histogram intersection.

    feature: list of feature values, assumed ordered by time.
    target: binary labels (1 = positive) aligned with `feature`.
    Splits each class into `time_res` time chunks and averages the pairwise
    histogram intersections; returns 1 - average, so higher = less stable.
    """
    # Separate into positive and negative samples
    ftr_true = [f for f, t in zip(feature, target) if t == 1]
    ftr_false = [f for f, t in zip(feature, target) if t != 1]
    # Split into time bins
    chunks_true = [x.tolist() for x in np.array_split(ftr_true, time_res)]
    chunks_false = [x.tolist() for x in np.array_split(ftr_false, time_res)]
    cross = []
    # BUG FIX: pair chunks by index so every unordered pair is compared
    # exactly once. The original compared chunk *values* (x != y and y > x),
    # which relied on list lexicographic ordering and silently skipped pairs
    # of identical chunks.
    for chunks in (chunks_true, chunks_false):
        for i in range(len(chunks)):
            for j in range(i + 1, len(chunks)):
                cross.append(hist_inter(chunks[i], chunks[j], x_res))
    return 1 - (sum(cross) / len(cross))
In [ ]:
def find_ovalue_purity(feature, target):
    """Instability of `feature` over time using tree-split purity.

    feature: list of feature values, assumed ordered by time.
    target: binary labels (1 = positive) aligned with `feature`.
    Splits the data into `time_res` time chunks and averages hist_purity
    over every unordered chunk pair. Returns -1 when no pair exists
    (e.g. time_res <= 1), preserving the original error sentinel.
    """
    feature_chunks = [x.tolist() for x in np.array_split(feature, time_res)]
    target_chunks = [x.tolist() for x in np.array_split(target, time_res)]
    cross = []
    # BUG FIX: pair chunks by index (yi > xi). The original compared the
    # chunk *contents* (y > x) despite having indices available, which
    # relied on lexicographic list ordering and skipped identical chunks.
    for xi, x in enumerate(feature_chunks):
        for yi, y in enumerate(feature_chunks):
            if yi > xi:  # each unordered pair exactly once
                dist = hist_purity(x, y, target_chunks[xi], target_chunks[yi],
                                   x_res, weighted, ignore_zero)
                cross.append(dist)
    try:
        return sum(cross) / len(cross)
    except ZeroDivisionError:
        # No chunk pairs were produced; keep the original -1 sentinel
        return -1
In [ ]:
# Bind find_ovalue to the implementation selected in CONFIG.
if method == 'inter':
    find_ovalue = find_ovalue_inter
elif method == 'purity':
    find_ovalue = find_ovalue_purity
else:
    # Fail fast: the original only printed a warning, leaving find_ovalue
    # undefined and deferring the error to a confusing NameError later.
    raise ValueError("Method must be set to either 'inter' or 'purity'")
In [ ]:
from pygoose import *
# Locate the project layout on disk; `kg` comes from pygoose's star import
project = kg.Project.discover()
In [ ]:
# Names of the precomputed feature lists to load and score for instability.
feature_lists = [
    'simple_summaries',
    'jaccard_ngrams',
    'fuzzy',
    'tfidf',
    'lda',
    'nlp_tags',
    'wordnet_similarity',
    'phrase_embedding',
    'wmd',
    'wm_intersect',
    '3rdparty_abhishek',
    '3rdparty_dasolmar_whq',
    '3rdparty_mephistopheies',
    '3rdparty_image_similarity',
    'magic_pagerank',
    'magic_frequencies',
    'magic_cooccurrence_matrix',
    'oofp_nn_mlp_with_magic',
    'oofp_nn_cnn_with_magic',
    'oofp_nn_bi_lstm_with_magic',
    'oofp_nn_siamese_lstm_attention',
]
In [ ]:
# Load the feature matrices for the configured feature lists
df_train, df_test, _ = project.load_feature_lists(feature_lists)
In [ ]:
# Sanitise: the histogram code cannot handle inf/NaN, so coerce them to 0
df_train = df_train.replace([np.inf, -np.inf], np.nan).fillna(0)
In [ ]:
y_train = kg.io.load(project.features_dir + 'y_train.pickle')
In [ ]:
# Attach the binary target so it travels alongside the feature columns
df_train['is_duplicate'] = y_train
In [ ]:
def process(c):
    """Score one feature column and append the result to a CSV file.

    c: name of a column in df_train (passed to find_ovalue as a list).
    Prints the column name and its instability score, then appends a
    `column,score` row to a results file named after the CONFIG values.
    """
    # Here is the function that evaluates the feature, please provide it with a LIST
    # It returns a float which is the overfitting value
    o = find_ovalue(df_train[c].tolist(), df_train['is_duplicate'].tolist())
    print(c.ljust(60) + ' ' + str(o))
    # Context manager guarantees the handle is closed even if the write fails
    with open(f'ftim-overfit-test-res-{time_res}-x-{x_res}-thresh-{thresh}-{method}.csv', 'a') as f:
        f.write(c + ',' + str(o) + '\n')
In [ ]:
# Score every column in the training frame (iterating a DataFrame yields
# its column labels, including the appended 'is_duplicate' target column).
for column in df_train:
    process(column)