In [8]:
%matplotlib inline
import seaborn
import pandas
Pandas allows us to combine two sets of data using `merge`, `join`, and `concat`.
In [26]:
# Load the red-wine data set; the file is semicolon-delimited.
# No `parse_dates` argument is passed: read_csv only accepts a bool, a list or
# a dict for it (a bare string such as 'time' is rejected), and the same file
# is loaded further down without any date parsing.
df = pandas.read_csv('data/red_wine.csv', delimiter=';')
df.head()
In [10]:
# Distinct values found in the 'quality' column.
df.loc[:, 'quality'].unique()
Out[10]:
`get_dummies` converts a categorical variable into indicator (dummy) variables, i.e. one column per category holding 1 or 0.
In [11]:
# Build one indicator column per distinct quality value; the column names
# carry the 'quality' prefix.
quality_dummies = df['quality'].pipe(pandas.get_dummies, prefix='quality')
quality_dummies.head(n=5)
Out[11]:
Join this new data frame with the original, row for row:
In [12]:
# Index-aligned left join (join's default): the indicator columns are
# appended to the right of the original columns.
joined_df = df.join(other=quality_dummies, how='left')
joined_df.head(n=5)
Out[12]:
We can accomplish something similar with `concat`:
In [13]:
# Column-wise concatenation; here the indicator columns come first,
# followed by the original columns.
joined_df2 = pandas.concat(objs=[quality_dummies, df], axis='columns')
joined_df2.head(n=5)
Out[13]:
Let's combine multiple data sources.
In [14]:
# Load the two wine data sets; both files are semicolon-separated.
red_wines_df = pandas.read_csv('data/red_wine.csv', sep=';')
white_wines_df = pandas.read_csv('data/white_wine.csv', sep=';')
In [15]:
# Mean fixed acidity of the red wines for each quality value.
# The column is selected *before* aggregating, so only that column is
# averaged instead of averaging every column and discarding all but one.
# This also keeps the cell runnable when the frame holds columns that
# cannot be averaged (e.g. a categorical column added by a later cell).
red_wines_quality_df = (
    red_wines_df
    .groupby('quality')['fixed acidity']
    .mean()
    .reset_index()
)
red_wines_quality_df
Out[15]:
In [16]:
# Mean fixed acidity of the white wines for each quality value.
# The column is selected *before* aggregating, so only that column is
# averaged instead of averaging every column and discarding all but one.
white_wines_quality_df = (
    white_wines_df
    .groupby('quality')['fixed acidity']
    .mean()
    .reset_index()
)
white_wines_quality_df
Out[16]:
In [17]:
# Put the per-quality means of both wine types side by side.
# merge defaults to an inner join, so only quality values present in both
# frames appear; the suffixes tell the two 'fixed acidity' columns apart.
red_wines_quality_df.merge(
    white_wines_quality_df,
    on='quality',
    suffixes=(' red', ' white'),
)
Out[17]:
Let's take another look at the `fixed acidity` column.
In [25]:
# Histogram of the red wines' fixed acidity, using the default binning.
red_wines_df['fixed acidity'].plot(kind='hist')
Out[25]:
`cut` allows us to turn a column of continuous data into a categorical one by specifying the bins to place the values in.
In [19]:
# Bucket fixed acidity into unit-wide bins with edges 4, 5, ..., 16 and label
# each bin with its lower edge (4 ... 15).
# right=False makes the bins left-closed -- [4, 5), [5, 6), ... -- so a value
# that is exactly an integer (e.g. 8.0) is labelled with that integer. With
# the default right-closed bins it would fall in (7, 8] and be labelled 7.
# Values outside [4, 16) become NaN.
fixed_acidity_class = pandas.cut(
    red_wines_df['fixed acidity'],
    bins=range(4, 17),
    labels=range(4, 16),
    right=False,
)
fixed_acidity_class.head(20)
Out[19]:
In [20]:
# Attach the binned acidity to the frame as the 'fa_class' column.
# assign() overwrites an existing 'fa_class' column, so re-running this cell
# leaves a single column instead of appending a duplicate each time, which is
# what concatenating along the columns would do.
fixed_acidity_class.name = 'fa_class'
red_wines_df = red_wines_df.assign(fa_class=fixed_acidity_class)
In [21]:
# Preview the first five rows, now including the 'fa_class' column.
red_wines_df.head(n=5)
Out[21]:
Get the mean residual sugar for each quality category/fixed acidity class pair using a `pivot_table`. `mean` is the default aggregation function.
In [22]:
# Residual sugar by quality (rows) and fixed-acidity class (columns).
# No aggfunc is passed, so pivot_table applies its default aggregation (mean).
red_wines_df.pivot_table(
    values='residual sugar',
    index='quality',
    columns='fa_class',
)
Out[22]:
Change the aggregation function to `max`:
In [23]:
# Largest residual sugar for each quality / fixed-acidity-class pair.
# The aggregation is named with the string 'max' rather than the builtin
# `max`: recent pandas versions emit a FutureWarning when handed the builtin
# callable and recommend the string form to keep the current behaviour.
pandas.pivot_table(
    red_wines_df,
    values='residual sugar',
    index='quality',
    columns='fa_class',
    aggfunc='max',
)
Out[23]:
Change the aggregation function to `min`:
In [24]:
# Smallest residual sugar for each quality / fixed-acidity-class pair.
# The aggregation is named with the string 'min' rather than the builtin
# `min`: recent pandas versions emit a FutureWarning when handed the builtin
# callable and recommend the string form to keep the current behaviour.
pandas.pivot_table(
    red_wines_df,
    values='residual sugar',
    index='quality',
    columns='fa_class',
    aggfunc='min',
)
Out[24]: