This script shows a simple example of using the tf.contrib.learn library to create our model.
The code is divided into the following steps:
v0.1: Added code for data loading, modeling, and prediction.
v0.2: Removed unnecessary output logs.
PS: I was able to get a score of 1295.07972 using this script, with 70% of the train.csv data used for training and the rest for evaluation. The script took 2 hrs to train and used 3000 steps.
In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split  # sklearn.cross_validation is deprecated
from sklearn.preprocessing import StandardScaler, MinMaxScaler, minmax_scale
plt.rcParams['figure.dpi'] = 300
from sklearn.metrics import r2_score
from time import time
start0 = time()
In [2]:
spitzerDataRaw = pd.read_csv('pmap_ch2_0p1s_x4_rmulti_s3_7.csv')
In [3]:
PLDpixels = pd.DataFrame({key:spitzerDataRaw[key] for key in spitzerDataRaw.columns.values if 'pix' in key})
PLDpixels
Out[3]:
In [4]:
PLDnorm = np.sum(np.array(PLDpixels),axis=1)
In [5]:
PLDpixels = (PLDpixels.T / PLDnorm).T
PLDpixels
Out[5]:
In [6]:
[plt.plot(PLDpixels[key]) for key in PLDpixels.columns.values];
In [7]:
spitzerData = spitzerDataRaw.copy()
for key in spitzerDataRaw.columns:
    if key in PLDpixels.columns:
        spitzerData[key] = PLDpixels[key]
In [8]:
testPLD = np.array(pd.DataFrame({key:spitzerData[key] for key in spitzerData.columns.values if 'pix' in key}))
assert np.allclose(testPLD, np.array(PLDpixels))  # element-wise check that the normalized pixels were copied correctly
print('Confirmed that PLD Pixels have been Normalized to Spec')
In [9]:
notFeatures = ['flux', 'fluxerr', 'xerr', 'yerr', 'xycov']
feature_columns = spitzerData.drop(notFeatures,axis=1).columns.values
features = spitzerData.drop(notFeatures,axis=1).values
labels = spitzerData['flux'].values
In [10]:
x_val, x_traintest, y_val, y_traintest = train_test_split(features, labels, test_size=0.8, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x_traintest, y_traintest, test_size=0.5, random_state=42)
x_val = minmax_scale(x_val.astype('float32'))
x_train = minmax_scale(x_train.astype('float32'))
x_test = minmax_scale(x_test.astype('float32'))
y_val = minmax_scale(y_val.astype('float32'))
y_train = minmax_scale(y_train.astype('float32'))
y_test = minmax_scale(y_test.astype('float32'))
print(x_val.shape[0] , 'validation samples')
print(x_train.shape[0], 'train samples')
print(x_test.shape[0] , 'test samples')
In [11]:
train_df = pd.DataFrame(np.c_[x_train, y_train], columns=list(feature_columns) + ['flux'])
test_df = pd.DataFrame(np.c_[x_test , y_test ], columns=list(feature_columns) + ['flux'])
evaluate_df = pd.DataFrame(np.c_[x_val , y_val ], columns=list(feature_columns) + ['flux'])
In [12]:
plt.scatter(train_df['xpos'].values, train_df['ypos'].values, c=train_df['flux'].values, alpha=0.5);
# plt.scatter(test_df['xpos'].values, test_df['ypos'].values, c=test_df['flux'].values, alpha=0.1);
plt.colorbar();
We only take the first 1000 rows for training/testing and the last 500 rows for evaluation.
This is done so that this script does not consume a lot of Kaggle system resources.
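A minimal sketch of that subsetting (not applied in the cells below, which use the full split; the 1000/500 row counts come from the sentence above and are only illustrative):
In [ ]:
# Hypothetical subsetting sketch -- the remaining cells actually train on the full split.
train_df_small    = train_df.iloc[:1000]     # first 1000 rows for training
test_df_small     = test_df.iloc[:1000]      # first 1000 rows for testing
evaluate_df_small = evaluate_df.iloc[-500:]  # last 500 rows for evaluation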
In [13]:
# MODEL_DIR = "tf_model_spitzer/withNormalization_drop50/sigmoid"
MODEL_DIR = "tf_model_spitzer/randomforests"
print("train_df.shape = " , train_df.shape)
print("test_df.shape = " , test_df.shape)
print("evaluate_df.shape = ", evaluate_df.shape)
In [14]:
# categorical_features = [feature for feature in features if 'cat' in feature]
categorical_features = []
continuous_features = [feature for feature in train_df.columns]  # treats every column (including 'flux') as continuous
LABEL_COLUMN = 'flux'
When building a TF.Learn model, the input data is specified by means of an Input Builder function. This builder function will not be called until it is later passed to TF.Learn methods such as fit and evaluate. The purpose of this function is to construct the input data, which is represented in the form of Tensors or SparseTensors.
Note that input_fn is called while constructing the TensorFlow graph, not while running it. What it returns is a representation of the input data as the fundamental unit of TensorFlow computations, a Tensor (or SparseTensor).
More detail on input_fn can be found in the tf.contrib.learn documentation.
In [15]:
# Converting Data into Tensors
def input_fn(df, training=True):
    # Creates a dictionary mapping from each continuous feature column name (k) to
    # the values of that column stored in a constant Tensor.
    continuous_cols = {k: tf.constant(df[k].values)
                       for k in continuous_features}
    # Creates a dictionary mapping from each categorical feature column name (k)
    # to the values of that column stored in a tf.SparseTensor.
    # categorical_cols = {k: tf.SparseTensor(
    #     indices=[[i, 0] for i in range(df[k].size)],
    #     values=df[k].values,
    #     shape=[df[k].size, 1])
    #     for k in categorical_features}
    # Merges the two dictionaries into one.
    feature_cols = continuous_cols
    # feature_cols = dict(list(continuous_cols.items()) + list(categorical_cols.items()))
    if training:
        # Converts the label column into a constant Tensor.
        label = tf.constant(df[LABEL_COLUMN].values)
        # Returns the feature columns and the label.
        return feature_cols, label
    # Returns the feature columns only.
    return feature_cols

def train_input_fn():
    return input_fn(train_df, training=True)

def eval_input_fn():
    return input_fn(evaluate_df, training=True)

# def test_input_fn():
#     return input_fn(test_df.drop(LABEL_COLUMN, axis=1), training=False)

def test_input_fn():
    return input_fn(test_df, training=False)
We use tf.learn's concept of a FeatureColumn, which helps transform raw data into suitable input features.
These engineered features will be used when we construct our model.
In [16]:
engineered_features = []
for continuous_feature in continuous_features:
    engineered_features.append(
        tf.contrib.layers.real_valued_column(continuous_feature))
# for categorical_feature in categorical_features:
#     sparse_column = tf.contrib.layers.sparse_column_with_hash_bucket(
#         categorical_feature, hash_bucket_size=1000)
#     engineered_features.append(tf.contrib.layers.embedding_column(sparse_id_column=sparse_column, dimension=16,
#                                                                   combiner="sum"))
Following is the TensorForestEstimator (random forest) model from tf.contrib.tensor_forest. More detail about hyperparameters such as num_trees and max_nodes can be found in the TensorFlow documentation.
model_dir is used to save and restore our model. This way, once we have trained the model we don't need to train it again if we only want to predict on a new data set.
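Because checkpoints are written to model_dir, restoring the trained model for prediction only is just a matter of re-creating the estimator with the same model_dir. A minimal sketch, assuming the MODEL_DIR defined above and the hparams built in the cells below:
In [ ]:
# Hypothetical restore-and-predict sketch: pointing a fresh estimator at the same
# model_dir makes it load the latest checkpoint instead of training from scratch.
restored_regressor = tf.contrib.tensor_forest.random_forest.TensorForestEstimator(
    params=hparams.fill(),
    model_dir=MODEL_DIR)
# predictions = list(restored_regressor.predict(x_test))  # uses the saved checkpoint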
In [ ]:
del hparams, regressor  # reset any previously built model objects (raises NameError on a fresh run)
In [17]:
"""Tests multi-class classification using matrix data as input."""
regression = True
num_classes = 1
num_features = x_train.shape[1]  # 13
num_trees = 1000  # 3
max_nodes = 10000  # 1000
bagging_fraction = 0.5
num_splits_to_consider = 0
feature_bagging_fraction = 0.5
max_fertile_nodes = 0
split_after_samples = 250  # 20
min_split_samples = 5
valid_leaf_threshold = 1
dominate_method = 'bootstrap'
dominate_fraction = 0.99
ForestHParams = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams
hparams = ForestHParams(regression = regression,
num_trees = num_trees,
num_classes = num_classes,
num_features = num_features,
max_nodes = max_nodes,
bagging_fraction = bagging_fraction,
num_splits_to_consider = num_splits_to_consider,
feature_bagging_fraction = feature_bagging_fraction,
max_fertile_nodes = max_fertile_nodes,
split_after_samples = split_after_samples,
min_split_samples = min_split_samples,
valid_leaf_threshold = valid_leaf_threshold,
dominate_method = dominate_method,
dominate_fraction = dominate_fraction
)
In [18]:
RandomForestGraphs = tf.contrib.tensor_forest.python.tensor_forest.RandomForestGraphs
TensorForestEstimator = tf.contrib.tensor_forest.random_forest.TensorForestEstimator
params = hparams.fill()
device_assigner = None
model_dir = MODEL_DIR
graph_builder_class = RandomForestGraphs
config = None
weights_name = None
keys_name = None
feature_engineering_fn = None
early_stopping_rounds = 100
early_stopping_loss_threshold = 0.01
num_trainers = 1
trainer_id = 0
report_feature_importances = False
local_eval = False
version = None
head = None
regressor = TensorForestEstimator(params = params,
device_assigner = device_assigner,
model_dir = model_dir,
graph_builder_class = RandomForestGraphs,
config = config,
weights_name = weights_name,
keys_name = keys_name,
feature_engineering_fn = feature_engineering_fn,
early_stopping_rounds = early_stopping_rounds,
early_stopping_loss_threshold = early_stopping_loss_threshold,
num_trainers = num_trainers,
trainer_id = trainer_id,
report_feature_importances = report_feature_importances,
local_eval = local_eval,
version = version,
head = head
)
In [19]:
x = x_train
y = y_train
input_fn = None
steps = None
batch_size = 50
monitors = None
max_steps = None
regressor.fit(x = x_train,
y = y_train,
# input_fn = train_input_fn,
steps = steps,
batch_size = batch_size,
monitors = monitors,
max_steps = max_steps)
Out[19]:
In [20]:
x = x_test
y = y_test
input_fn = None
feed_fn = None
batch_size = None
steps = None
metrics = None
name = None
checkpoint_path = None
hooks = None
log_progress = True
regressor.evaluate(x = x_test,
y = y_test,
input_fn = input_fn,
feed_fn = feed_fn,
batch_size = batch_size,
steps = steps,
metrics = metrics,
name = name,
checkpoint_path = checkpoint_path,
hooks = hooks,
log_progress = log_progress,
)
Out[20]:
In [21]:
def de_median(x):
    return x - np.median(x)
In [23]:
predicted_output = np.array(list(regressor.predict(x_test)))  # predict on the test set, since we compare against test_df['flux'] below
predicted_scores = regressor.predict_scores(input_fn=test_input_fn)
In [ ]:
predicted_scores
In [ ]:
# print([predicted_output() for _ in range(10)])
plt.plot((predicted_output - np.median(predicted_output)) / np.std(predicted_output),'.',alpha=0.1);
plt.plot((test_df['flux'].values - np.median(test_df['flux'].values)) / np.std(test_df['flux'].values),'.',alpha=0.1);
In [ ]:
plt.plot(de_median(predicted_output - test_df['flux'].values)/predicted_output,'.',alpha=0.1);
plt.ylim(-1.0,1.0);
In [ ]:
test_df['flux'].values.size / 0.4  # the test set is ~40% of the data, so this approximates the full sample count
In [ ]:
r2_score(test_df['flux'].values, predicted_output)*100
In [ ]:
print('Full notebook took {} seconds'.format(time()-start0))