In [ ]:
from azureml import Workspace
ws = Workspace()
experiment = ws.experiments['6aa07b1af33b4b2bb32adc11b6e40bdc.f-id.299df9aa87674e0da3349774b2df8879']
ds = experiment.get_intermediate_dataset(
    node_id='e77d289d-9cde-4a3f-a13d-42425508c22b-239',
    port_name='Results dataset',
    data_type_id='GenericCSV'
)
auto_price = ds.to_dataframe()

In [ ]:
auto_price.dtypes

In [ ]:
plot_cols = ["wheel-base",
              "width",
              "height",
              "curb-weight",
              "engine-size",
              "bore",
              "compression-ratio",
              "city-mpg",
              "price",
              "lnprice"]

In [ ]:
## Create pair-wise scatter plots         
def auto_pairs(plot_cols, df):
    import matplotlib.pyplot as plt
    from pandas.tools.plotting import scatter_matrix
    fig = plt.figure(figsize=(12, 12))
    fig.clf()
    ax = fig.gca()
    scatter_matrix(df[plot_cols], alpha=0.3, 
               diagonal='kde', ax = ax)
    return 'Done'

In [ ]:
auto_pairs(plot_cols, auto_price)

In [ ]:
plot_cols2 = ["length",
               "curb-weight",
               "engine-size",
               "city-mpg",
               "price"]

In [ ]:
## Function to plot conditioned histograms
def cond_hists(df, plot_cols, grid_col):
    import matplotlib.pyplot as plt
    import pandas.tools.rplot as rplot
    ## Loop over the list of columns
    for col in plot_cols:
        ## Define figure
        fig = plt.figure(figsize=(14, 4))
        fig.clf()
        ax = fig.gca()
        ## Setup plot and grid and plot the data
        plot = rplot.RPlot(df, x = col, 
                                  y = '.') 
        plot.add(rplot.TrellisGrid(['.', grid_col]))
        plot.add(rplot.GeomHistogram())
        ax.set_title('Histograms of ' + col + ' conditioned by ' + grid_col + '\n')
        plot.render()
    return grid_col

In [ ]:
cond_hists(auto_price, plot_cols2, 'drive-wheels')

In [ ]:
## Create boxplots of data
def auto_boxplot(df, plot_cols, by):
    import matplotlib.pyplot as plt
    for col in plot_cols:
        fig = plt.figure(figsize=(9, 6))
        ax = fig.gca()
        df.boxplot(column = col, by = by, ax = ax)
        ax.set_title('Box plots of ' + col + ' by ' + by)
        ax.set_ylabel(col)
    return by

In [ ]:
auto_boxplot(auto_price, plot_cols2, 'drive-wheels')

In [ ]:
## Define columns for making scatter plots
plot_cols3 = ["length",
               "curb-weight",
               "engine-size",
               "city-mpg"]

In [ ]:
## Create scatter plot
def auto_scatter(df, plot_cols):
    import matplotlib.pyplot as plt
    for col in plot_cols:
        fig = plt.figure(figsize=(8, 8))
        ax = fig.gca()
        temp1 = df.ix[df['fuel-type'] == 'gas']       
        temp2 = df.ix[df['fuel-type'] == 'diesel']
        if temp1.shape[0] > 0:                    
            temp1.plot(kind = 'scatter', x = col, y = 'price' , 
                           ax = ax, color = 'DarkBlue')                          
        if temp2.shape[0] > 0:                    
            temp2.plot(kind = 'scatter', x = col, y = 'price' , 
                           ax = ax, color = 'Red') 
        ax.set_title('Scatter plot of price vs. ' + col)
    return plot_cols

In [ ]:
auto_scatter(auto_price, plot_cols3)

In [ ]:
## Create conditioned scatter plots
def auto_scatter_cond(df, plot_cols, y, cond_var1, cond_var2):
    for col in plot_cols:
        condPltsCol(df, col, y, cond_var1, cond_var2)
    return cond_var1, cond_var2

def condPltsCol(df, col1, col2, var1, var2):
    import matplotlib.pyplot as plt
    
    ## Find the levels of the conditioning variables
    levs1 = df[var1].unique().tolist()
    num1 = len(levs1)
    levs2 = df[var2].unique().tolist()
    num2 = len(levs2)   
    
    ## Determine the limits for the plots
    xlims = (df[col1].min(), df[col1].max())
    ylims = (df[col2].min(), df[col2].max())
    
    ## Define a figure and axes for the plot
    fig, ax = plt.subplots(num1, num2, figsize = (12, 8))

    ## Loop over conditioning variables subset the data
    ## and set create scatter plots for each conditioning
    ## variable pair and with data both gas and diesel cars    
    for i, val1 in enumerate(levs1):
        for j, val2 in enumerate(levs2):
            temp1 = df.ix[(df[var1] == val1) & (df[var2] == val2) & (df['fuel-type'] == 'gas')]       
            temp2 = df.ix[(df[var1] == val1) & (df[var2] == val2) & (df['fuel-type'] == 'diesel')]
            if temp1.shape[0] > 0:                    
                temp1.plot(kind = 'scatter', x = col1, y = col2 , ax = ax[i,j],
                          xlim = xlims, ylim = ylims, color = 'DarkBlue')                          
            if temp2.shape[0] > 0:                    
                temp2.plot(kind = 'scatter', x = col1, y = col2 , ax = ax[i,j],
                          xlim = xlims, ylim = ylims, color = 'Red')    
            ax[i,j].set_title(val1 + ' and ' + val2 )
            ax[i,j].set_xlabel('')
    
    ## Some lables for the x axis    
    ax[i,j].set_xlabel(col1)
    ax[i,(j-1)].set_xlabel(col1)
    return col1, col2

In [ ]:
auto_scatter_cond(auto_price, plot_cols3, 'price', 'body-style', 'num-cylinders')