In [1]:
import sys, os, re
sys.path.append("lib")
import utils
import numpy as np
import sklearn
import iso8601
import datetime
print("Imports loaded...")
In [2]:
# Load and check the size of our training data. May take a minute.
print("Original JSON file size: {:,} Bytes".format(os.path.getsize("../data/simple_flight_delay_features.jsonl")))
training_data = utils.read_json_lines_file('../data/simple_flight_delay_features.jsonl')
print("Training items: {:,}".format(len(training_data))) # 457,013
print("Data loaded...")
In [3]:
# Inspect a record before we alter the data
print("Size of training data in RAM: {:,} Bytes".format(sys.getsizeof(training_data))) # 4MB
print(training_data[0])
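# Caveat: sys.getsizeof only counts the outer list (its pointer array), not
# the dicts it references. A rough deep estimate would be a sketch like:
# deep_bytes = sys.getsizeof(training_data) + sum(sys.getsizeof(r) for r in training_data)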
In [4]:
# # We need to sample our data to fit into RAM
# training_data = np.random.choice(training_data, 1000000) # 'Sample down to 1MM examples'
# print("Sampled items: {:,} Bytes".format(len(training_data)))
# print("Data sampled...")
In [5]:
# Separate our results from the rest of the data, vectorize and size up
results = [record['ArrDelay'] for record in training_data]
results_vector = np.array(results)
print("Results vectorized size: {:,}".format(sys.getsizeof(results_vector))) # 3,656,200 bytes
print("Results vectorized...")
In [6]:
# Remove the two delay fields and the flight date from our training data
for item in training_data:
    item.pop('ArrDelay', None)
    item.pop('FlightDate', None)
    item.pop('DepDelay', None)
print("ArrDelay, DepDelay and FlightDate removed from training data...")
In [7]:
# Convert the ISO 8601 datetime strings to Unix timestamps
for item in training_data:
    if isinstance(item['CRSArrTime'], str):
        dt = iso8601.parse_date(item['CRSArrTime'])
        unix_time = int(dt.timestamp())
        item['CRSArrTime'] = unix_time
    if isinstance(item['CRSDepTime'], str):
        dt = iso8601.parse_date(item['CRSDepTime'])
        unix_time = int(dt.timestamp())
        item['CRSDepTime'] = unix_time
print("CRSArr/DepTime converted to unix time...")
In [8]:
# Use DictVectorizer to convert feature dicts to vectors
from sklearn.feature_extraction import DictVectorizer
print("Sampled dimensions: [{:,}]".format(len(training_data)))
vectorizer = DictVectorizer()
training_vectors = vectorizer.fit_transform(training_data)
print("Size of DictVectorized vectors: {:,} Bytes".format(training_vectors.data.nbytes))
# Printing the full sparse matrix would flood the output; show its shape instead
print("Training vector dimensions: {}".format(training_vectors.shape))
print("Training data vectorized...")
In [9]:
from sklearn.model_selection import train_test_split
# Redo test/train split
X_train, X_test, y_train, y_test = train_test_split(
    training_vectors,
    results_vector,
    test_size=0.1,
    random_state=17
)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)
print("Test train split performed again...")
In [10]:
from sklearn.ensemble import GradientBoostingRegressor
regressor = GradientBoostingRegressor()
print("Gradient boosting regressor instantiated...!")
In [11]:
# Refit regression on new training data
regressor.fit(X_train, y_train)
print("Regressor fitted again...")
In [12]:
# Predict using the test data again. toarray() densifies the sparse test
# matrix; recent scikit-learn versions also accept sparse input directly.
predicted = regressor.predict(X_test.toarray())
print("Predictions made for X_test again...")
In [13]:
from sklearn.metrics import median_absolute_error, r2_score
# Get the median absolute error again
medae = median_absolute_error(y_test, predicted)
print("Median absolute error: {:.3g}".format(medae))
# Get the r2 score again
r2 = r2_score(y_test, predicted)
print("r2 score: {:.3g}".format(r2))
In [14]:
# Plot outputs, compare actual vs predicted values
import matplotlib.pyplot as plt
plt.scatter(
    y_test,
    predicted,
    color='blue',
    linewidth=1
)
# Passing empty tuples hides the tick marks
plt.xticks(())
plt.yticks(())
plt.show()
In [15]:
%matplotlib inline
# Plot outputs again, with default styling and a grid
import matplotlib.pyplot as plt
# Reset any styling changes so the matplotlib defaults apply
plt.rcdefaults()
plt.scatter(
    y_test,
    predicted,
    color='blue',
    linewidth=1
)
plt.grid(True)
plt.show()
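# An optional refinement (a sketch reusing y_test and predicted): label the
# axes and draw the y = x diagonal, where perfect predictions would fall.
lims = [min(y_test.min(), predicted.min()), max(y_test.max(), predicted.max())]
plt.scatter(y_test, predicted, color='blue', linewidth=1)
plt.plot(lims, lims, color='gray')
plt.xlabel('Actual arrival delay (minutes)')
plt.ylabel('Predicted arrival delay (minutes)')
plt.grid(True)
plt.show()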