In [38]:
# Tell IPython that we want to show matplotlib plots using the inline renderer.
%matplotlib inline

import pandas as pd
from pandas.stats.api import ols
import numpy as np

In [39]:
# Build some fake data on which to do a regression.
x_start = 50
x_stop = 150

# Draw the noise from a dedicated, seeded generator so that the table, the
# regression fit and the plot are identical on every Restart & Run All.
# Change RANDOM_SEED to get a different (but still repeatable) sample.
RANDOM_SEED = 42
rng = np.random.RandomState(RANDOM_SEED)

# np.arange(X, Y) returns a numpy array containing integer values in [X,Y).
x_vals = np.arange(x_start, x_stop)

# Build fake observations by taking a linear model (y = 5.6 + 3x)
# and adding a random fuzz value to each entry. The fuzz is a standard-normal
# draw scaled by 15, and each trial gets its own independent draws.
base = 5.6 + (3 * x_vals)
observations_0 = base + 15 * rng.randn(len(x_vals))
observations_1 = base + 15 * rng.randn(len(x_vals))

df = pd.DataFrame(
    {
        'x': x_vals,
        'y0': observations_0,
        'y1': observations_1,
    },
    # This isn't strictly necessary, but will make it easier to align
    # our plots later.
    index=x_vals,
)
df


Out[39]:
x y0 y1
50 50 172.854190 145.772328
51 51 146.172851 177.290998
52 52 150.585412 166.771689
53 53 167.788975 164.723109
54 54 192.526672 176.700035
55 55 173.087062 196.051053
56 56 169.319521 179.012183
57 57 171.688592 174.852734
58 58 169.765965 192.957446
59 59 195.677395 165.677555
60 60 211.965887 187.194226
61 61 174.972141 200.603028
62 62 214.138771 192.090558
63 63 206.196159 209.814485
64 64 191.253703 225.309439
65 65 210.448703 192.892762
66 66 211.215431 206.644236
67 67 193.563510 203.880360
68 68 197.270521 201.113099
69 69 220.618016 201.688256
70 70 207.151778 196.817555
71 71 213.060158 232.184963
72 72 247.754713 211.653061
73 73 213.057432 234.618208
74 74 226.267727 226.889442
75 75 232.616644 243.718635
76 76 238.005354 220.478286
77 77 227.736492 238.212674
78 78 263.640435 261.701072
79 79 230.519727 245.474255
... ... ... ...
120 120 364.626304 347.354081
121 121 398.163848 372.939704
122 122 354.719352 357.856101
123 123 381.885688 375.899306
124 124 386.171630 366.608254
125 125 383.883295 385.406969
126 126 390.743554 347.937944
127 127 408.241602 377.825537
128 128 370.128210 389.405398
129 129 392.087396 377.953093
130 130 390.125402 396.472032
131 131 401.071693 395.520709
132 132 391.293586 384.902955
133 133 419.354889 398.454692
134 134 414.409858 408.179517
135 135 397.060690 396.944569
136 136 423.654914 372.181352
137 137 416.444577 421.441720
138 138 414.907222 408.083432
139 139 438.579730 418.188859
140 140 419.688440 437.514931
141 141 449.265980 421.402262
142 142 424.936533 421.858118
143 143 448.530916 451.793766
144 144 403.320395 448.833260
145 145 448.641680 438.132560
146 146 464.186444 438.631212
147 147 439.629830 457.988423
148 148 449.603870 459.015180
149 149 477.391707 469.187296

100 rows × 3 columns


In [40]:
# Stack both trials end to end into a single response series. Each trial keeps
# its original x-valued index, so every index label appears twice in the result.
stacked_observations = pd.concat([df.y0, df.y1])

# Fit one ordinary-least-squares model of the stacked responses against x.
# NOTE(review): this presumably relies on ols aligning x to y by index label so
# that both trials are paired with the same x values -- confirm the reported
# number of observations is 200 after running.
regression = ols(x=df.x, y=stacked_observations)
regression


Out[40]:
-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <x> + <intercept>

Number of Observations:         200
Number of Degrees of Freedom:   2

R-squared:         0.9741
Adj R-squared:     0.9740

Rmse:             14.0016

F-stat (1, 198):  7446.3785, p-value:     0.0000

Degrees of Freedom: model 1, resid 198

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
             x     2.9597     0.0343      86.29     0.0000     2.8925     3.0269
     intercept     9.9445     3.5534       2.80     0.0056     2.9799    16.9092
---------------------------------End of Summary---------------------------------

Plot our data to get a visual sense of whether this is a reasonable model.


In [65]:
import matplotlib.pyplot as plt
from matplotlib import lines

# Layer 1: scatter the first trial. df.plot returns the matplotlib axes it drew
# on; the later layers reuse it so everything lands on a single figure.
axis = df.plot(
    x='x',
    y='y0',
    kind='scatter',
    # Everything after this is optional.
    marker='x',
    linewidth=1.5,  # numeric rather than a string, so no implicit conversion is needed
    color='red',
    xlim=(45, 155),
    ylim=(125, 485),
    figsize=(12, 7),
    label='Trial 0',
)

# Layer 2: scatter the second trial on the same axes.
df.plot(
    x='x',
    y='y1',
    kind='scatter',
    ax=axis, # This tells matplotlib to add this plot on top of the previous one.
    # Everything after this is optional.
    marker='x',
    linewidth=1.5,
    color='blue',
    label='Trial 1',
)

# Layer 3: the fitted values from the `regression` result computed above.
# (The fit must be read from `regression`: that is the only name the notebook
# assigns the OLS result to, so any other name fails on a fresh kernel.)
regression.y_fitted.plot(
    ax=axis,
    color='purple',
    linewidth=2,
    # See https://docs.python.org/2/library/string.html#format-specification-mini-language
    # for an explanation of the format of the strings inside the curly braces here.
    label='Regression: Y = {m:.4f}X + {b:.5}'.format(
        m=regression.beta.x,
        b=regression.beta.intercept,
    )
)

# Axis labels, legend and title are applied to the current axes.
plt.xlabel('My Awesome X Axis')
plt.ylabel('My Awesome Y Axis')

plt.legend(
    loc='upper left',
    scatterpoints=1,  # show a single marker per scatter series in the legend
)

plt.title('My Awesome Regression Analysis')


Out[65]:
<matplotlib.text.Text at 0x110802bd0>