In [1]:
%%capture
%run "4 - Linear Algebra.ipynb"
%run "5 - Statistics.ipynb"
%run "6 - Probability.ipynb"
%run "8 - Gradient Descent.ipynb"

Exploring Your Data

Before you start building models and looking for answers, you should explore the data.

Exploring One-Dimensional Data

One-dimensional data is just a collection of values of a single type, e.g., numbers.

A good place to start is with summary statistics; a histogram then makes the shape of the distribution easy to see.
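
For example, a quick summary might look like this (a minimal sketch, assuming mean, median, and standard_deviation are available from the %run cells above):

def describe(points):
    """a few summary statistics for a one-dimensional collection"""
    return {'count': len(points),
            'min': min(points),
            'max': max(points),
            'mean': mean(points),
            'median': median(points),
            'standard deviation': standard_deviation(points)}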


In [2]:
def bucketize(point, bucket_size):
    """floor the point to the next lower multiple of bucket size"""
    return bucket_size * math.floor(point / bucket_size)

def make_histogram(points, bucket_size):
    """buckets the points and counts how many fall into each bucket"""
    return Counter(bucketize(point, bucket_size) for point in points)

def plot_histogram(points, bucket_size, title=''):
    """plots a bar chart with one bar per bucket"""
    histogram = make_histogram(points, bucket_size)
    plt.bar(list(histogram.keys()), histogram.values(), width=bucket_size, edgecolor='white')
    plt.title(title)
    plt.show()
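
As a quick illustration with made-up points, each point gets floored to its bucket and the buckets are counted:

make_histogram([1, 7, 12, 23, 26, 28], 10)
# -> Counter({20: 3, 0: 2, 10: 1})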

In [3]:
import random

random.seed(0)

# uniform numbers between -100 and 100
uniform = [200 * random.random() - 100 for _ in range(10000)]

# normal numbers drawn from a normal distribution with mean 0 and standard deviation 57
normal = [57 * inverse_normal_cdf(random.random()) for _ in range(10000)]

In [4]:
plot_histogram(uniform, 10, 'Uniform Histogram')



In [5]:
plot_histogram(normal, 10, 'Normal Histogram')


Two Dimensions


In [6]:
def random_normal():
    """returns a random draw from a standard normal distribution"""
    return inverse_normal_cdf(random.random())

xs = [random_normal() for _ in range(1000)]
ys1 = [x + random_normal() / 2 for x in xs]
ys2 = [-x + random_normal() / 2 for x in xs]
ys3 = [0 if x < 0 else 6 for x in xs]

In [7]:
plt.scatter(xs, ys1, marker='.', color='black', label='ys1');
plt.scatter(xs, ys2, marker='.', color='gray', label='ys2');
plt.xlabel('xs');
plt.ylabel('ys');
plt.legend(loc=5);
plt.title('Very Different Joint Distributions');



In [8]:
correlation(xs, ys1), correlation(xs, ys2)


Out[8]:
(0.9010493686379609, -0.8920981526880033)

Many Dimensions

An easy way to investigate relationships between many dimensions is with a correlation matrix.


In [9]:
def correlation_matrix(data):
    """returns a matrix where x_ij is the correlation between col i and col j"""
    _, num_cols = shape(data)
    
    def matrix_entry(i, j):
        return correlation(get_column(data, i), get_column(data, j))
    
    return make_matrix(num_cols, num_cols, matrix_entry)

In [10]:
data = [[x, y2, y1, y3] for (x, y1, y2, y3) in zip(xs, ys1, ys2, ys3)]
correlation_matrix(data)


Out[10]:
[[1.0, -0.8920981526880033, 0.9010493686379609, 0.7997519975096119],
 [-0.8920981526880033, 1.0, -0.8072731481569311, -0.7145022734332825],
 [0.9010493686379609, -0.8072731481569311, 1.0, 0.7037027015911191],
 [0.7997519975096119, -0.7145022734332824, 0.7037027015911193, 1.0]]

In [11]:
_, num_columns = shape(data)
fig, ax = plt.subplots(num_columns, num_columns)

for i in range(num_columns):
    for j in range(num_columns):
        # scatter column_j on the x-axis vs column_i on the y-axis
        if i != j: ax[i][j].scatter(get_column(data, j), get_column(data, i))
            
        # unless i == j, in which case show the series name
        else: ax[i][j].annotate("series " + str(i), (0.5, 0.5), xycoords='axes fraction', ha="center", va="center")
            
        # then hide axis labels except left and bottom charts
        if i < num_columns - 1: ax[i][j].xaxis.set_visible(False)
        if j > 0: ax[i][j].yaxis.set_visible(False)
        
# fix the bottom right and top left axis labels, which are wrong because
# their charts only have text in them
ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
ax[0][0].set_ylim(ax[0][1].get_ylim())


Out[11]:
(-5.0, 4.0)

Cleaning And Munging


In [12]:
def parse_row(input_row, parsers):
    """given a list of parsers (some of which may be None)
    apply the appropriate one to each element of the input_row"""
    return [parser(value) if parser is not None else value for value, parser in zip(input_row, parsers)]

def parse_rows_with(reader, parsers):
    """wrap a reader to apply the parsers to each of its rows"""
    for row in reader:
        yield parse_row(row, parsers)

In [13]:
def try_or_none(f):
    """wraps f to return None if f raises an exception
    assumes f takes only one input"""
    def f_or_none(x):
        try: return f(x)
        except: return None
    return f_or_none

In [14]:
def parse_row(input_row, parsers):
    """like the earlier parse_row, but returns None for any value that fails to parse"""
    return [try_or_none(parser)(value) if parser is not None else value
            for value, parser in zip(input_row, parsers)]
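
For example (a small sketch with a made-up row), a value that fails to parse becomes None instead of raising:

parse_row(["2014-06-20", "AAPL", "n/a"], [None, None, float])
# -> ['2014-06-20', 'AAPL', None], since "n/a" cannot be parsed as a float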

In [15]:
import dateutil.parser
import csv

data = []

with open("stocks.csv", "r") as f:
    reader = csv.reader(f)
    for line in parse_rows_with(reader, [dateutil.parser.parse, None, float]):
        data.append(line)
        
data


Out[15]:
[[datetime.datetime(2014, 6, 20, 0, 0), 'AAPL', 90.91],
 [datetime.datetime(2014, 6, 20, 0, 0), 'MSFT', 41.68],
 [datetime.datetime(2014, 6, 20, 0, 0), 'FB', 64.5],
 [datetime.datetime(2014, 6, 21, 0, 0), 'AAPL', 91.28],
 [datetime.datetime(2014, 6, 21, 0, 0), 'MSFT', 40.37],
 [datetime.datetime(2014, 6, 21, 0, 0), 'FB', 64.92]]

Manipulating Data


In [16]:
max_aapl_price = max(row[2]
                     for row in data
                     if row[1] == "AAPL")
max_aapl_price


Out[16]:
91.28
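
In the same spirit, here is a sketch using only the standard library to find the highest closing price for every symbol (assuming each row is [date, symbol, price] as above):

from collections import defaultdict

# track the highest price seen so far for each symbol
max_price_by_symbol = defaultdict(lambda: float('-inf'))
for _, symbol, closing_price in data:
    if closing_price is not None and closing_price > max_price_by_symbol[symbol]:
        max_price_by_symbol[symbol] = closing_price

dict(max_price_by_symbol)
# -> {'AAPL': 91.28, 'MSFT': 41.68, 'FB': 64.92}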

Rescaling

Many calculations are sensitive to the scale of the data. Consider clustering as an example:


In [17]:
import pandas as pd

pd.DataFrame({'Height (inches)': [63, 67, 70], 'Height (centimeters)': [160, 170.2, 177.8], 'Weight': [150, 160, 171]}, index=['A', 'B', 'C'])


Out[17]:
   Height (centimeters)  Height (inches)  Weight
A                 160.0               63     150
B                 170.2               67     160
C                 177.8               70     171

If we use inches, B's closest neighbor is A:


In [18]:
a_to_b = distance([63, 150], [67, 160])
a_to_c = distance([63, 150], [70, 171])
b_to_c = distance([67, 160], [70, 171])

a_to_b, a_to_c, b_to_c


Out[18]:
(10.770329614269007, 22.135943621178654, 11.40175425099138)

But if we use centimeters, B's closest neighbor is C:


In [19]:
a_to_b = distance([160, 150], [170.2, 160])
a_to_c = distance([160, 150], [177.8, 171])
b_to_c = distance([170.2, 160], [177.8, 171])

a_to_b, a_to_c, b_to_c


Out[19]:
(14.284257068535268, 27.52889391167034, 13.370115930686627)

When results are affected by units of measure, as in the example above, rescaling helps. Rescaling transforms each dimension so that it has mean 0 and standard deviation 1, effectively converting the data into a common unit of "standard deviations from the mean."


In [20]:
def scale(data_matrix):
    """returns the means and standard deviations of each column"""
    num_rows, num_cols = shape(data_matrix)
    means = [mean(get_column(data_matrix,j)) for j in range(num_cols)]
    stdevs = [standard_deviation(get_column(data_matrix,j)) for j in range(num_cols)]
    
    return means, stdevs

In [21]:
def rescale(data_matrix):
    """rescales the input data so that each column
    has mean 0 and standard deviation 1
    leaves alone columns with no deviation"""
    means, stdevs = scale(data_matrix)
    
    def rescaled(i, j):
        if stdevs[j] > 0:
            return (data_matrix[i][j] - means[j]) / stdevs[j]
        else:
            return data_matrix[i][j]
    
    num_rows, num_cols = shape(data_matrix)
    return make_matrix(num_rows, num_cols, rescaled)

Rescale our heights and weights so that the units no longer matter:


In [22]:
hts_in = [[63], [67], [70]]
hts_cm = [[160], [170.2], [177.8]]
wts = [[150], [160], [171]]

hts_in_rescaled = rescale(hts_in)
hts_cm_rescaled = rescale(hts_cm)
wts_rescaled = rescale(wts)

hts_in_rescaled, hts_cm_rescaled, wts_rescaled


Out[22]:
([[-1.0440737953277504], [0.09491579957524855], [0.9491579957524977]],
 [[-1.0449798276855173], [0.09703384114222419], [0.9479459865432901]],
 [[-0.9837552647618352], [-0.03173404079876975], [1.0154893055606022]])

Now when we calculate distances, the results are nearly the same whether we start from inches or from centimeters:


In [23]:
a_to_b_rescaled = distance([hts_in_rescaled[0][0], wts_rescaled[0][0]], [hts_in_rescaled[1][0], wts_rescaled[1][0]])
a_to_c_rescaled = distance([hts_in_rescaled[0][0], wts_rescaled[0][0]], [hts_in_rescaled[2][0], wts_rescaled[2][0]])
b_to_c_rescaled = distance([hts_in_rescaled[1][0], wts_rescaled[1][0]], [hts_in_rescaled[2][0], wts_rescaled[2][0]])

a_to_b_rescaled, a_to_c_rescaled, b_to_c_rescaled


Out[23]:
(1.4844668093876099, 2.823110310444266, 1.3514460651057634)

In [24]:
a_to_b_rescaled = distance([hts_cm_rescaled[0][0], wts_rescaled[0][0]], [hts_cm_rescaled[1][0], wts_rescaled[1][0]])
a_to_c_rescaled = distance([hts_cm_rescaled[0][0], wts_rescaled[0][0]], [hts_cm_rescaled[2][0], wts_rescaled[2][0]])
b_to_c_rescaled = distance([hts_cm_rescaled[1][0], wts_rescaled[1][0]], [hts_cm_rescaled[2][0], wts_rescaled[2][0]])

a_to_b_rescaled, a_to_c_rescaled, b_to_c_rescaled


Out[24]:
(1.4867883610875934, 2.8228942865405537, 1.349343624267431)

Dimensionality Reduction

Sometimes the "actual" (or most useful) dimensions of the data don't correspond to the dimensions we have. Principal component analysis (PCA) extracts one or more dimensions that capture as much of the variation in the data as possible.


In [25]:
pca_xs = [random_normal() for _ in range(100)]
pca_ys1 = [x + random_normal() / 2 for x in pca_xs]
pca_ys2 = [-x + random_normal() / 2 for x in pca_xs]
pca_ys3 = [0 if x < 0 else 6 for x in pca_xs]
pca_data = [[x, y1, y2, y3] for (x, y1, y2, y3) in zip(pca_xs, pca_ys1, pca_ys2, pca_ys3)]

plt.scatter(pca_xs, pca_ys1);
plt.scatter(pca_xs, pca_ys2);
plt.scatter(pca_xs, pca_ys3);



In [26]:
def de_mean_matrix(A):
    """returns A with each column's mean subtracted from every
    value in that column, so that every column has mean 0"""
    nr, nc = shape(A)
    column_means, _ = scale(A)
    return make_matrix(nr, nc, lambda i, j: A[i][j] - column_means[j])

pca_data_de_mean = de_mean_matrix(pca_data)
pca_data_de_mean[:5]


Out[26]:
[[0.15799179077148437, 0.952357006072998, -0.3080911636352539, 2.76],
 [-0.49382553100585935, -0.6675827980041504, 0.7918386459350586, -3.24],
 [0.9878410339355469, 1.295984935760498, -0.6932516098022461, 2.76],
 [0.7367576599121094, 0.700739574432373, -1.081254005432129, 2.76],
 [0.5204452514648438, -0.04740839004516602, -1.1748952865600586, 2.76]]

In [27]:
plt.scatter([x[0] for x in pca_data_de_mean], [0 for _ in pca_data_de_mean]);
plt.scatter([x[1] for x in pca_data_de_mean], [0.1 for _ in pca_data_de_mean]);
plt.scatter([x[2] for x in pca_data_de_mean], [0.2 for _ in pca_data_de_mean]);
plt.scatter([x[3] for x in pca_data_de_mean], [0.3 for _ in pca_data_de_mean]);
frame = plt.gca()
frame.axes.yaxis.set_visible(False);
plt.ylim((-0.1, 1));



In [28]:
plt.scatter([x[0] for x in pca_data_de_mean], [x[1] for x in pca_data_de_mean]);
plt.scatter([x[0] for x in pca_data_de_mean], [x[2] for x in pca_data_de_mean]);
plt.scatter([x[0] for x in pca_data_de_mean], [x[3] for x in pca_data_de_mean]);


With the de-meaned matrix, we can figure out which direction captures the most variance in the data:


In [29]:
def direction(w):
    mag = magnitude(w)
    return [w_i / mag for w_i in w]

pca_data_direction = [direction(w) for w in pca_data_de_mean]
pca_data_direction[:5]


Out[29]:
[[0.05373601425953709,
  0.3239147389153165,
  -0.10478766701421215,
  0.9387285164128337],
 [-0.1436721210365911,
  -0.1942245399127823,
  0.23037516417692525,
  -0.9426358965491234],
 [0.30123818988911794,
  0.39520544577573136,
  -0.2114043180030233,
  0.8416510101646703],
 [0.23510213703683877,
  0.22360863065202327,
  -0.34503221505299353,
  0.8807263684222605],
 [0.17092660150423958,
  -0.015570043093677903,
  -0.38586356180564746,
  0.9064496579109792]]
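
As a quick sanity check (assuming magnitude comes from the linear algebra notebook), normalizing a simple vector:

direction([3, 4])
# -> [0.6, 0.8], since the magnitude of [3, 4] is 5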

In [30]:
plt.scatter([x[0] for x in pca_data_direction], [x[1] for x in pca_data_direction]);
plt.scatter([x[0] for x in pca_data_direction], [x[2] for x in pca_data_direction]);
plt.scatter([x[0] for x in pca_data_direction], [x[3] for x in pca_data_direction]);


Now we can compute the variance of the data in a given direction w:


In [31]:
def directional_variance_i(x_i, w):
    """the variance of the row x_i in the direction determined by w"""
    return dot(x_i, direction(w)) ** 2

def directional_variance(X, w):
    """the variance of the data in the direction determined by w"""
    return sum(directional_variance_i(x_i, w) for x_i in X)

def directional_variance_gradient_i(x_i, w):
    """the contribution of row x_i to the gradient of
    the direction-w variance"""
    projection_length = dot(x_i, direction(w))
    return [2 * projection_length * x_ij for x_ij in x_i]

def directional_variance_gradient(X, w):
    return vector_sum(directional_variance_gradient_i(x_i, w) for x_i in X)

pca_data_variance = [directional_variance(pca_data_direction, w) for w in pca_data_direction]
pca_data_variance[:5]


Out[31]:
[85.34343865640452,
 89.55570491740234,
 87.94532120355373,
 89.7202732375027,
 82.8163472587505]

The first principal component is the direction that maximizes the directional variance above.


In [32]:
def first_principal_component(X):
    guess = [1 for _ in X[0]]
    unscaled_maximizer = maximize_batch(
        partial(directional_variance, X), # is now a function of w
        partial(directional_variance_gradient, X), # is now a function of w
        guess)
    return direction(unscaled_maximizer)

first_principal_component(pca_data)


Out[32]:
[0.15818217241815652,
 0.17523078357890395,
 -0.15429505560345033,
 0.9594089892385438]

We can use stochastic gradient descent here:


In [33]:
def first_principal_component_sgd(X):
    guess = [1 for _ in X[0]]
    unscaled_maximizer = maximize_stochastic(
        lambda x, _, w: directional_variance_i(x, w),
        lambda x, _, w: directional_variance_gradient_i(x, w),
        X,
        [None for _ in X], # the fake "y"
        guess)
    return direction(unscaled_maximizer)

first_principal_component_sgd(pca_data)


Out[33]:
[0.5, 0.5, 0.5, 0.5]

In [34]:
def project(v, w):
    """returns the projection of v onto direction w"""
    projection_length = dot(v, w)
    return scalar_multiply(projection_length, w)

In [35]:
def remove_projection_from_vector(v, w):
    """projects v onto w and subtracts the result from v"""
    return vector_subtract(v, project(v, w))

def remove_projection(X, w):
    return [remove_projection_from_vector(x_i, w) for x_i in X]
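
A tiny sanity check (assuming dot, scalar_multiply, and vector_subtract come from the linear algebra notebook): projecting [3, 4] onto the unit direction [1, 0] keeps only the x part, and removing that projection leaves only the y part:

project([3, 4], [1, 0]), remove_projection_from_vector([3, 4], [1, 0])
# -> ([3, 0], [0, 4])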

Removing the projection onto the first component leaves data that is effectively one dimension lower; a three-dimensional data set, for example, becomes effectively two-dimensional.


In [36]:
pca_data_fc_removed = remove_projection(pca_data, first_principal_component_sgd(pca_data))
pca_data_fc_removed[:5]


Out[36]:
[[-1.5223469734191895,
  -0.6775679588317871,
  -2.0588221549987793,
  4.258737087249756],
 [-0.3812074661254883,
  -0.5045509338378906,
  0.8340644836425781,
  0.05169391632080078],
 [-0.8895769119262695,
  -0.5310192108154297,
  -2.641061782836914,
  4.061657905578613],
 [-0.8320775032043457,
  -0.8176817893981934,
  -2.7204813957214355,
  4.370240688323975],
 [-0.7838644981384277,
  -1.3013043403625488,
  -2.5495972633361816,
  4.634766101837158]]

In [37]:
plt.scatter([x[0] for x in pca_data_fc_removed], [x[1] for x in pca_data_fc_removed]);
plt.scatter([x[0] for x in pca_data_fc_removed], [x[2] for x in pca_data_fc_removed]);
plt.scatter([x[0] for x in pca_data_fc_removed], [x[3] for x in pca_data_fc_removed]);


For higher-dimensional data sets, iterate to find as many components as you like:


In [38]:
def principal_component_analysis(X, num_components):
    components = []
    for _ in range(num_components):
        component = first_principal_component(X)
        components.append(component)
        X = remove_projection(X, component)
        
    return components

Then transform the data into a lower-dimensional space:


In [39]:
def transform_vector(v, components):
    return [dot(v, w) for w in components]

def transform(X, components):
    return [transform_vector(x_i, components) for x_i in X]

In [40]:
pca_components = principal_component_analysis(pca_data, 3)
pca_transformed = transform(pca_data, pca_components)
pca_transformed[:5]


Out[40]:
[[6.026472455495795, -0.7658842129800711, 0.5196024118460548],
 [-0.28666441226063133, -0.9784118272777231, 0.1944623816288471],
 [6.277382355395372, 0.07957798611090272, 0.39542078922307095],
 [6.193226981955014, -0.2043747241300513, -0.26510838715361756],
 [6.0423600478958726, -0.7193120119032888, -0.8372513770564836]]

In [41]:
plt.scatter([x[0] for x in pca_transformed], [x[1] for x in pca_transformed]);
plt.scatter([x[0] for x in pca_transformed], [x[2] for x in pca_transformed]);
plt.title('3-dimensions n=' + str(len(pca_transformed)));



In [42]:
pca_components = principal_component_analysis(pca_data, 2)
pca_transformed = transform(pca_data, pca_components)
pca_transformed[:5]


Out[42]:
[[6.026472455495795, -0.7658842129800711],
 [-0.28666441226063133, -0.9784118272777231],
 [6.277382355395372, 0.07957798611090272],
 [6.193226981955014, -0.2043747241300513],
 [6.0423600478958726, -0.7193120119032888]]

In [43]:
plt.scatter([x[0] for x in pca_transformed], [x[1] for x in pca_transformed]);
plt.title('2-dimensions n=' + str(len(pca_transformed)));



In [44]:
pca_components = principal_component_analysis(pca_data, 1)
pca_transformed = transform(pca_data, pca_components)
pca_transformed[:5]


Out[44]:
[[6.026472455495795],
 [-0.28666441226063133],
 [6.277382355395372],
 [6.193226981955014],
 [6.0423600478958726]]

In [45]:
plt.scatter([x[0] for x in pca_transformed], [0 for _ in pca_transformed]);
plt.title('1-dimension n=' + str(len(pca_transformed)));


Dimensionality reduction helps with analysis in two ways:

  • it removes noisy dimensions and consolidates highly correlated dimensions (the sketch below gives a feel for this on our example data).
  • it enables the use of techniques that do not work well with high-dimensional data.

One downside, however, is that it can make models harder to interpret, since the original variables get collapsed together.
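
To get a feel for the first point, we could compare the correlation matrix of the original columns with that of a two-component transform (a sketch reusing the helpers above; since these components come from approximate gradient ascent on data that was never de-meaned, don't expect perfectly uncorrelated columns):

# hypothetical check: how correlated are the columns before and after the transform?
pca_components_2d = principal_component_analysis(pca_data, 2)
pca_transformed_2d = transform(pca_data, pca_components_2d)

correlation_matrix(pca_data), correlation_matrix(pca_transformed_2d)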

