In [ ]:
import pandas as pd
from scipy import stats
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
In [ ]:
brain = pd.read_csv("data/brain_size.csv", index_col=0, sep=';', na_values=".")
wages = pd.read_csv("data/wages.csv")
In [ ]:
stats.normaltest(brain['VIQ'])
In [ ]:
mean = brain['VIQ'].mean()
std = brain['VIQ'].std()
estimate = stats.norm.rvs(mean, std, size=(brain.shape[0],))
brain['VIQ'].plot(kind="kde", label="original")
pd.Series(estimate).plot(kind="kde", label="estimate")
plt.legend()
In [ ]:
# enter code here
In [ ]:
mbrain = brain[brain['Gender'] == "Male"]
fbrain = brain[brain['Gender'] == "Female"]
stats.ttest_ind(mbrain['VIQ'], fbrain['VIQ'])
In [ ]:
# enter code here
In [ ]:
# simulate some data
import numpy as np
x = np.linspace(-5, 5, 20)
np.random.seed(1)
# normal distributed noise
y = -5 + 3*x + 4 * np.random.normal(size=x.shape)
# Create a data frame containing all the relevant variables
df = pd.DataFrame({'x': x, 'y': y})
plt.scatter(x, y)
plt.xlabel("$x$")
plt.ylabel("$y$")
In [ ]:
# fitting an OLS model
model = sm.formula.ols("y ~ x", df).fit()
In [ ]:
print(model.summary())
In [ ]:
# reconstruction
alpha_est = 2.9369
beta_est = -5.5335
epsilon_est = 1.036
y_hat = x * alpha_est + beta_est + epsilon_est
plt.scatter(x, y, label="$y$")
plt.plot(x, y_hat, label="$\hat{y}$")
plt.legend()
In [ ]:
# enter code here
In [ ]:
from mpl_toolkits.mplot3d import Axes3D
x = np.linspace(-5, 5, 21)
# We generate a 2D grid
X, Y = np.meshgrid(x, x)
# To get reproducable values, provide a seed value
np.random.seed(1)
# Z is the elevation of this 2D grid
Z = -5 + 3*X - 0.5*Y + 8 * np.random.normal(size=X.shape)
# Plot the data
fig = plt.figure()
ax = fig.gca(projection='3d')
surf = ax.plot_surface(X, Y, Z, cmap=plt.cm.coolwarm,
rstride=1, cstride=1)
ax.view_init(20, -120)
ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
ax.set_zlabel('$z$')
In [ ]:
iris = pd.read_csv("data/iris.csv")
iris.head()
In [ ]:
iris.plot.scatter("petal_length", "petal_width")
In [ ]:
# color above plot by species
gb = iris.groupby("name")
fig = plt.figure()
ax = fig.add_subplot(111)
colors = ['r', 'g', 'b']
for i, (name, xdf) in enumerate(gb):
xdf.plot.scatter("sepal_width", "petal_length", ax=ax, label=name, c=colors[i])
plt.legend(loc="upper left")
In [ ]:
model = sm.formula.ols("sepal_width ~ C(name) + petal_length", data=iris).fit()
In [ ]:
print(model.summary())
In [ ]:
# reconstruction from model
x_versicolor = iris[iris['name'] == "versicolor"]['petal_length']
x_virginica = iris[iris['name'] == "virginica"]['petal_length']
y_versicolor_est = x_versicolor * 0.2983 - 1.4821 + 2.9813
y_virginica_est = x_virginica * 0.2983 - 1.6635 + 2.9813
# plot estimated data
fig = plt.figure()
ax = fig.add_subplot(111)
plt.scatter(y_versicolor_est, x_versicolor, c="g", marker="x")
plt.scatter(y_virginica_est, x_virginica, c="b", marker="x")
# overlay original data
gb = iris.groupby("name")
colors = ['r', 'g', 'b']
for i, (name, xdf) in enumerate(gb):
xdf.plot.scatter("sepal_width", "petal_length", ax=ax, label=name, c=colors[i])
In [ ]:
# enter code here