Notebook that accompanies article on Practical Business Python
In [1]:
import pandas as pd
import seaborn as sns
In [2]:
%matplotlib inline
In [3]:
# Define the headers since the data does not have any
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
"num_doors", "body_style", "drive_wheels", "engine_location",
"wheel_base", "length", "width", "height", "curb_weight",
"engine_type", "num_cylinders", "engine_size", "fuel_system",
"bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
"city_mpg", "highway_mpg", "price"]
In [4]:
# Read in the CSV file and convert "?" to NaN
df_raw = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
header=None, names=headers, na_values="?" )
df_raw.head()
Out[4]:
In [5]:
# Take a quick look at all the values in the data
df_raw.describe()
Out[5]:
In [6]:
# Filter out the top 8 manufacturers
models = ["toyota","nissan","mazda", "honda", "mitsubishi", "subaru", "volkswagen", "volvo"]
In [7]:
df = df_raw[df_raw.make.isin(models)].copy()
In [8]:
df.head()
Out[8]:
In [9]:
### Basic Crosstab functions
In [10]:
# Create a simple crosstab that counts the number of occurences of each combination
pd.crosstab(df.make, df.num_doors)
Out[10]:
In [11]:
# Add a subtotal
pd.crosstab(df.make, df.num_doors, margins=True, margins_name="Total")
Out[11]:
In [12]:
# Another example, this time of make and body_style
pd.crosstab(df.make, df.body_style)
Out[12]:
In [13]:
# Add custom names for the rows and columns?
pd.crosstab(df.make, df.body_style, rownames=['Auto Manufacturer'], colnames=['Body Style'])
Out[13]:
In [14]:
# Convert the occurrences to percentages
pd.crosstab(df.make, df.body_style, normalize=True)
Out[14]:
In [15]:
# Convert the occurrences to percentages for each row
pd.crosstab(df.make, df.body_style, normalize='index')
Out[15]:
In [16]:
# Convert the occurrences to percentages for each column
pd.crosstab(df.make, df.body_style, normalize='columns')
Out[16]:
In [17]:
# If you want to make the percentages a little easier to see, multiple all values by 100
pd.crosstab(df.make, df.body_style, normalize='columns').mul(100).round(0)
Out[17]:
In [18]:
# Perform aggregation functions - not just a simple count
pd.crosstab(df.make, df.body_style, values=df.curb_weight, aggfunc='mean').round(0)
Out[18]:
In [19]:
pd.crosstab(df.make, [df.body_style, df.drive_wheels], values=df.curb_weight, aggfunc='mean').fillna('-')
Out[19]:
In [20]:
# Crosstab supports grouping as well. In this case, group the columns
pd.crosstab(df.make, [df.body_style, df.drive_wheels])
Out[20]:
In [21]:
# A more complex example showing the grouping of rows and columns
pd.crosstab([df.make, df.num_doors], [df.body_style, df.drive_wheels],
rownames=['Auto Manufacturer', "Doors"],
colnames=['Body Style', "Drive Type"],
dropna=False)
Out[21]:
In [22]:
# You can also use agg functions when grouping
pd.crosstab(df.make, [df.body_style, df.drive_wheels], values=df.curb_weight, aggfunc='mean').fillna('-')
Out[22]:
In [23]:
# You can also use margins when grouping
pd.crosstab(df.make, [df.body_style, df.drive_wheels],
values=df.curb_weight, aggfunc='mean', margins=True,
margins_name='Average').fillna('-').round(0)
Out[23]:
In [24]:
# Seaborn's heatmap can visualize the final results of the crosstab
sns.heatmap(pd.crosstab(df.drive_wheels, df.make))
Out[24]:
In [25]:
# This is a more complex customization of a heatmap
sns.heatmap(pd.crosstab([df.make, df.num_doors], [df.body_style, df.drive_wheels]), cmap="YlGnBu",
annot=True, cbar=False)
Out[25]:
In [26]:
sns.heatmap(pd.crosstab(df.make, df.body_style, values=df.curb_weight, aggfunc='mean').round(0))
Out[26]:
In [27]:
sns.heatmap(pd.crosstab(df.make, [df.body_style, df.drive_wheels],
values=df.curb_weight, aggfunc='mean', margins=True, margins_name='Average'),
cmap="YlGnBu", annot=True, cbar=False, fmt='.0f')
Out[27]: