In [ ]:
from azureml import Workspace
## Connect to the workspace. No arguments are passed here, so the connection
## details presumably come from the notebook environment -- TODO confirm.
ws = Workspace()
## Look up the raw automobile price dataset by its name
ds = ws.datasets["Automobile price data (Raw)"]
## Load the dataset into a data frame used by all of the cells below
frame = ds.to_dataframe()
In [ ]:
## Show the data frame as the cell output
frame
In [ ]:
import pandas as pd

## Columns that must be numeric for the statistics and plots below
cols = ["price", "bore", "stroke", "horsepower", "peak-rpm"]
## DataFrame.convert_objects() was deprecated and later removed from pandas.
## pd.to_numeric(errors="coerce") performs the same coercion: any value that
## cannot be parsed as a number becomes NaN.
frame[cols] = frame[cols].apply(pd.to_numeric, errors="coerce")
## Drop every row that has a missing value in any column (not only in cols)
frame.dropna(axis=0, inplace=True)
## Show the resulting column types as the cell output
frame.dtypes
In [ ]:
def describe(df, col):
    """Return the summary statistics of one column of a data frame.

    The result is what ``df[col].describe()`` produces, except that the
    ``"50%"`` entry (when present) is labelled ``"median"``.

    Args:
        df: Data frame holding the column.
        col: Label of the column to summarize.

    Returns:
        A Series of summary statistics indexed by statistic name.
    """
    ## Compute the summary stats
    desc = df[col].describe()
    ## Rename the 50% entry to median. Renaming by label instead of by
    ## position keeps this correct for columns whose summary has a different
    ## layout (e.g. non-numeric columns, which have no percentile entries).
    return desc.rename(index={"50%": "median"})
In [ ]:
## Summary statistics of the price column
describe(frame, "price")
In [ ]:
def plot_stats(df, col):
    """Draw a box plot and a histogram of one column, one above the other.

    Args:
        df: Data frame holding the column.
        col: Label of the numeric column to plot.

    Returns:
        A one-element list containing ``col``.
    """
    import matplotlib.pyplot as plt

    ## Setup for plotting two charts one over the other
    _, ax = plt.subplots(2, 1, figsize=(12, 8))
    ## First a box plot
    df.dropna().boxplot(col, ax=ax[0], vert=False, return_type="dict")
    ## Plot the histogram. Series.as_matrix() was removed from pandas, so use
    ## .values; missing values are dropped because the histogram cannot bin NaN.
    values = df[col].dropna().values
    ax[1].hist(values, bins=30, alpha=0.7)
    ## Label the histogram axes explicitly rather than relying on whichever
    ## axes pyplot currently treats as active.
    ax[1].set_ylabel("Number of Cars")
    ax[1].set_xlabel(col)
    return [col]
In [ ]:
## Box plot and histogram of the price column
plot_stats(frame, "price")
In [ ]:
## Summary statistics of the horsepower column
describe(frame, "horsepower")
In [ ]:
## Box plot and histogram of the horsepower column
plot_stats(frame, "horsepower")