In [1]:
%matplotlib inline
from __future__ import division
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sgd import *
from collections import Counter
import random
from probability import *
from stats import *
sns.set_context('poster')
In [2]:
def bucketize(point, bucket_size):
    """floor the point down to the bottom edge of its bucket"""
    return bucket_size * math.floor(point / bucket_size)

def make_histogram(points, bucket_size):
    """buckets the points and counts how many fall in each bucket"""
    return Counter(bucketize(point, bucket_size)
                   for point in points)

def plot_histogram(points, bucket_size, title=""):
    histogram = make_histogram(points, bucket_size)
    plt.bar(histogram.keys(), histogram.values(), width=bucket_size)
    plt.title(title)
    plt.show()
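A quick illustration (not part of the original notebook): bucketize floors each point to the bottom edge of its bucket, so make_histogram just counts points per bucket.
In [ ]:
points = [1.2, 1.8, 3.1, 3.4, 3.9]
[bucketize(p, 2) for p in points]  # [0.0, 0.0, 2.0, 2.0, 2.0]
make_histogram(points, 2)          # Counter({2.0: 3, 0.0: 2})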
In [3]:
random.seed(0)
uniform = [200 * random.random() - 100 for _ in range(10000)]
normal = [57 * inverse_normal_cdf(random.random())
          for _ in range(10000)]
In [4]:
plot_histogram(uniform, 10, "Uniform Histogram")
In [5]:
plot_histogram(normal, 10, "Normal Histogram")
In [6]:
def random_normal():
    """returns a random draw from a standard normal distribution"""
    return inverse_normal_cdf(random.random())
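A sanity check (illustrative, not from the original notebook): since random_normal inverts the normal CDF at a uniform random point, its draws should have sample mean near 0 and sample stdev near 1.
In [ ]:
samples = [random_normal() for _ in range(10000)]
mean(samples), standard_deviation(samples)  # should be close to (0, 1)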
In [7]:
xs = [random_normal() for _ in range(1000)]
ys1 = [x + random_normal() / 2 for x in xs]
ys2 = [-x + random_normal() / 2 for x in xs]
In [8]:
plot_histogram(ys1, 0.5)
plot_histogram(ys2, 0.5)
In [9]:
plt.scatter(xs, ys1, marker = '.', color = 'black', label = 'ys1')
plt.scatter(xs, ys2, marker = '.', color = 'gray', label = 'ys2')
plt.xlabel('xs')
plt.ylabel('ys')
plt.legend(loc = 'best')
Out[9]:
In [10]:
correlation(xs,ys1)
Out[10]:
In [11]:
correlation(xs,ys2)
Out[11]:
In [12]:
def correlation_matrix(data):
    """return the pairwise correlations between all columns of data"""
    _, num_columns = shape(data)
    def matrix_entry(i, j):
        return correlation(get_column(data, i), get_column(data, j))
    return make_matrix(num_columns, num_columns, matrix_entry)
In [13]:
correlation_matrix([[x_i,y_1_i,y_2_i] for x_i, y_1_i, y_2_i in zip(xs,ys1,ys2)])
Out[13]:
In [14]:
def try_or_none(f):
    """wraps f to return None if f raises an exception;
    assumes f takes only one input"""
    def f_or_none(x):
        try:
            return f(x)
        except Exception:
            return None
    return f_or_none

def parse_row(input_row, parsers):
    """given a list of parsers (some of which may be None),
    apply the appropriate one to each element of the input_row"""
    return [try_or_none(parser)(value) if parser is not None else value
            for value, parser in zip(input_row, parsers)]

def parse_rows_with(reader, parsers):
    """wrap a reader to apply the parsers to each of its rows"""
    for row in reader:
        yield parse_row(row, parsers)

def try_parse_field(field_name, value, parser_dict):
    """try to parse value using the appropriate parser from the dict"""
    parser = parser_dict.get(field_name)  # None if no parser for that field
    if parser is not None:
        return try_or_none(parser)(value)
    else:
        return value

def parse_dict(input_dict, parser_dict):
    return {field_name: try_parse_field(field_name, value, parser_dict)
            for field_name, value in input_dict.iteritems()}
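A sketch of how these parsers might be used together; the filename and the dateutil parser here are illustrative assumptions, not part of the original notebook.
In [ ]:
import csv
import dateutil.parser
# hypothetical file with comma-delimited rows like: 6/20/2014,AAPL,90.91
with open("comma_delimited_stock_prices.csv", "rb") as f:
    reader = csv.reader(f)
    # parse the first field as a date, leave the second alone,
    # and parse the third as a float
    data = [row for row in parse_rows_with(reader,
                                           [dateutil.parser.parse, None, float])]
# any field that failed to parse comes back as None
bad_rows = [row for row in data if any(x is None for x in row)]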
In [15]:
from collections import defaultdict

def picker(field_name):
    """returns a function that grabs a field from a dict"""
    return lambda row: row[field_name]

def pluck(field_name, rows):
    """turn a list of dicts into the list of field_name values"""
    return map(picker(field_name), rows)

def group_by(grouper, rows, value_transform=None):
    # key is the output of grouper, value is the list of rows with that key
    grouped = defaultdict(list)
    for row in rows:
        grouped[grouper(row)].append(row)
    if value_transform is None:
        return grouped
    else:
        return {key: value_transform(rows)
                for key, rows in grouped.iteritems()}

def percent_price_change(yesterday, today):
    return today["closing_price"] / yesterday["closing_price"] - 1

def day_over_day_changes(grouped_rows):
    # sort the rows by date
    ordered = sorted(grouped_rows, key=picker("date"))
    # zip with a shifted copy to make (yesterday, today) pairs,
    # then apply the transform
    return [{"symbol": today["symbol"],
             "date": today["date"],
             "change": percent_price_change(yesterday, today)}
            for yesterday, today in zip(ordered, ordered[1:])]

# e.g. max(all_changes, key=picker("change"))

def combine_pct_changes(pct_change1, pct_change2):
    return (1 + pct_change1) * (1 + pct_change2) - 1

def overall_change(changes):
    return reduce(combine_pct_changes, pluck("change", changes))
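A sketch of how these pieces fit together, using a tiny hypothetical price table (the symbols and prices below are made up for illustration).
In [ ]:
data = [{"symbol": "AAPL", "date": "2014-06-19", "closing_price": 91.86},
        {"symbol": "AAPL", "date": "2014-06-20", "closing_price": 90.91},
        {"symbol": "MSFT", "date": "2014-06-19", "closing_price": 41.51},
        {"symbol": "MSFT", "date": "2014-06-20", "closing_price": 41.68}]
# group the rows by symbol and compute each symbol's day-over-day changes
changes_by_symbol = group_by(picker("symbol"), data, day_over_day_changes)
# flatten into one list so we can find, e.g., the biggest single-day gain
all_changes = [change
               for changes in changes_by_symbol.values()
               for change in changes]
max(all_changes, key=picker("change"))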
In [16]:
def scale(data_matrix):
    """returns the means and standard deviations of the columns"""
    num_rows, num_cols = shape(data_matrix)
    means = [mean(get_column(data_matrix, i))
             for i in range(num_cols)]
    stdevs = [standard_deviation(get_column(data_matrix, i))
              for i in range(num_cols)]
    return means, stdevs

def rescale(data_matrix):
    """rescales the input data so that each column has mean 0 and stdev 1;
    leaves alone any column with no deviation"""
    means, stdevs = scale(data_matrix)
    def rescaled(i, j):
        if stdevs[j] > 0:
            return (data_matrix[i][j] - means[j]) / stdevs[j]
        else:
            return data_matrix[i][j]
    num_rows, num_cols = shape(data_matrix)
    return make_matrix(num_rows, num_cols, rescaled)
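A small sanity check (illustrative): after rescale, every column with positive deviation has mean 0 and stdev 1, while a constant column is left untouched.
In [ ]:
data = [[1, 20, 2],
        [2, 40, 2],
        [3, 60, 2]]
rescale(data)  # first two columns become [-1, 0, 1]; third is unchanged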
In [17]:
def de_mean_matrix(A):
    """returns the matrix that results from subtracting each column's mean
    from every value in that column (so every column has mean 0)"""
    nr, nc = shape(A)
    column_means, _ = scale(A)
    return make_matrix(nr, nc, lambda i, j: A[i][j] - column_means[j])
In [18]:
#de_mean_matrix([[x_i,y_1_i] for x_i, y_1_i in zip(xs,ys1)])
In [19]:
from functools import partial

def direction(w):
    """rescales w to have magnitude 1"""
    mag = magnitude(w)
    return [w_i / mag for w_i in w]

def directional_variance_i(x_i, w):
    """the variance of the row x_i in the direction determined by w"""
    return dot(x_i, direction(w)) ** 2

def directional_variance(X, w):
    """the variance of the data in the direction determined by w"""
    return sum(directional_variance_i(x_i, w)
               for x_i in X)

def directional_variance_gradient_i(x_i, w):
    """the contribution of row x_i to the gradient of the
    directional variance"""
    projection_length = dot(x_i, direction(w))
    return [2 * projection_length * x_ij for x_ij in x_i]

def directional_variance_gradient(X, w):
    return vector_sum(directional_variance_gradient_i(x_i, w)
                      for x_i in X)

def first_principal_component(X):
    guess = [1 for _ in X[0]]
    unscaled_maximizer = maximize_batch(
        partial(directional_variance, X),           # is now a function of w
        partial(directional_variance_gradient, X),  # is now a function of w
        guess)
    return direction(unscaled_maximizer)

def first_principal_component_sgd(X):
    guess = [1 for _ in X[0]]
    unscaled_maximizer = maximize_stochastic(
        lambda x, _, w: directional_variance_i(x, w),
        lambda x, _, w: directional_variance_gradient_i(x, w),
        X,
        [None for _ in X],  # the "y" values are unused here
        guess)
    return direction(unscaled_maximizer)
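One point worth checking (an illustrative aside): because every variance computation goes through direction(w), the objective depends only on the direction of w, not its magnitude, which is why the maximizer can safely be rescaled to a unit vector at the end.
In [ ]:
w = [3, 4]
direction(w)  # [0.6, 0.8], a unit vector
directional_variance([[1, 2]], w) == directional_variance([[1, 2]], [6, 8])  # True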
In [20]:
first_principal_component(de_mean_matrix([[x_i, y_2_i] for x_i, y_2_i in zip(xs, ys2)]))
Out[20]:
In [26]:
# This runs very slowly for some reason, so it's left commented out
#first_principal_component_sgd([[x_i, y_2_i] for x_i, y_2_i in zip(xs, ys2)])
In [27]:
def project(v, w):
    """return the projection of v onto the direction w
    (assumes w is a unit vector)"""
    projection_length = dot(v, w)
    return scalar_multiply(projection_length, w)

# To find further components, we simply 'remove' each found component from the data

def remove_projection_from_vector(v, w):
    """projects v onto w and subtracts the result from v"""
    return vector_subtract(v, project(v, w))

def remove_projection(X, w):
    """for each row of X, project the row onto w and remove the projection"""
    return [remove_projection_from_vector(x_i, w) for x_i in X]

def principal_component_analysis(X, num_components):
    components = []
    for _ in range(num_components):
        component = first_principal_component(X)
        components.append(component)
        X = remove_projection(X, component)
    return components

# Use the components to transform the data into the lower-dimensional space

def transform_vector(v, components):
    return [dot(v, w) for w in components]

def transform(X, components):
    return [transform_vector(x_i, components) for x_i in X]
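A sketch of the full pipeline on the data above (assuming the maximization converges): de-mean the data, extract the components, then project each row onto them.
In [ ]:
X = de_mean_matrix([[x_i, y_i] for x_i, y_i in zip(xs, ys1)])
components = principal_component_analysis(X, 2)
transformed = transform(X, components)  # each row is now [score_1, score_2]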