In [8]:
%matplotlib inline
import seaborn
import pandas

Pandas: Combining Datasets

Pandas allows us to combine two sets of data using merge, join, and concat.


In [26]:
df = pandas.read_csv('data/red_wine.csv', delimiter=';', parse_dates='time')
df.head()


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-26-cd978df3fb26> in <module>()
----> 1 df = pandas.read_csv('data/red_wine.csv', delimiter=';', parse_dates='time')
      2 df.head()

/Library/Python/2.7/site-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    644                     skip_blank_lines=skip_blank_lines)
    645 
--> 646         return _read(filepath_or_buffer, kwds)
    647 
    648     parser_f.__name__ = name

/Library/Python/2.7/site-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
    387 
    388     # Create the parser.
--> 389     parser = TextFileReader(filepath_or_buffer, **kwds)
    390 
    391     if (nrows is not None) and (chunksize is not None):

/Library/Python/2.7/site-packages/pandas/io/parsers.pyc in __init__(self, f, engine, **kwds)
    728             self.options['has_index_names'] = kwds['has_index_names']
    729 
--> 730         self._make_engine(self.engine)
    731 
    732     def close(self):

/Library/Python/2.7/site-packages/pandas/io/parsers.pyc in _make_engine(self, engine)
    921     def _make_engine(self, engine='c'):
    922         if engine == 'c':
--> 923             self._engine = CParserWrapper(self.f, **self.options)
    924         else:
    925             if engine == 'python':

/Library/Python/2.7/site-packages/pandas/io/parsers.pyc in __init__(self, src, **kwds)
   1376         kwds = kwds.copy()
   1377 
-> 1378         ParserBase.__init__(self, kwds)
   1379 
   1380         if 'utf-16' in (kwds.get('encoding') or ''):

/Library/Python/2.7/site-packages/pandas/io/parsers.pyc in __init__(self, kwds)
   1030 
   1031         self.parse_dates = _validate_parse_dates_arg(
-> 1032             kwds.pop('parse_dates', False))
   1033         self.date_parser = kwds.pop('date_parser', None)
   1034         self.dayfirst = kwds.pop('dayfirst', False)

/Library/Python/2.7/site-packages/pandas/io/parsers.pyc in _validate_parse_dates_arg(parse_dates)
   1010         if is_scalar(parse_dates):
   1011             if not lib.is_bool(parse_dates):
-> 1012                 raise TypeError(msg)
   1013 
   1014         elif not isinstance(parse_dates, (list, dict)):

TypeError: Only booleans, lists, and dictionaries are accepted for the 'parse_dates' parameter

In [10]:
df['quality'].unique()


Out[10]:
array([5, 6, 7, 4, 8, 3])

get_dummies converts a categorical variable into indicator variables, i.e. 1 or 0.


In [11]:
quality_dummies = pandas.get_dummies(df['quality'], prefix='quality')
quality_dummies.head()


Out[11]:
quality_3 quality_4 quality_5 quality_6 quality_7 quality_8
0 0 0 1 0 0 0
1 0 0 1 0 0 0
2 0 0 1 0 0 0
3 0 0 0 1 0 0
4 0 0 1 0 0 0

Join this new data frame with the original, row for row:


In [12]:
joined_df = df.join(quality_dummies)
joined_df.head()


Out[12]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality quality_3 quality_4 quality_5 quality_6 quality_7 quality_8
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5 0 0 1 0 0 0
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5 0 0 1 0 0 0
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5 0 0 1 0 0 0
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6 0 0 0 1 0 0
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5 0 0 1 0 0 0

We can accomplish something similar with concat:


In [13]:
joined_df2 = pandas.concat([quality_dummies, df], axis=1)
joined_df2.head()


Out[13]:
quality_3 quality_4 quality_5 quality_6 quality_7 quality_8 fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 0 0 1 0 0 0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
1 0 0 1 0 0 0 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5
2 0 0 1 0 0 0 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5
3 0 0 0 1 0 0 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6
4 0 0 1 0 0 0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5

Let's combine multiple data sources.


In [14]:
red_wines_df = pandas.read_csv('data/red_wine.csv', delimiter=';')
white_wines_df = pandas.read_csv('data/white_wine.csv', delimiter=';')

In [15]:
red_wines_quality_df = red_wines_df.groupby('quality').mean()['fixed acidity'].reset_index()
red_wines_quality_df


Out[15]:
quality fixed acidity
0 3 8.360000
1 4 7.779245
2 5 8.167254
3 6 8.347179
4 7 8.872362
5 8 8.566667

In [16]:
white_wines_quality_df = white_wines_df.groupby('quality').mean()['fixed acidity'].reset_index()
white_wines_quality_df


Out[16]:
quality fixed acidity
0 3 7.600000
1 4 7.129448
2 5 6.933974
3 6 6.837671
4 7 6.734716
5 8 6.657143
6 9 7.420000

In [17]:
pandas.merge(red_wines_quality_df, white_wines_quality_df, on=['quality'], suffixes=[' red', ' white'])


Out[17]:
quality fixed acidity red fixed acidity white
0 3 8.360000 7.600000
1 4 7.779245 7.129448
2 5 8.167254 6.933974
3 6 8.347179 6.837671
4 7 8.872362 6.734716
5 8 8.566667 6.657143

Pivot Tables

Let's take another look at the fixed acidity column.


In [25]:
red_wines_df['fixed acidity'].plot.hist()


Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ec8ca10>

cut allows us to turn a column with continuous data into categoricals by specifying bins to place them in.


In [19]:
fixed_acidity_class = pandas.cut(red_wines_df['fixed acidity'], bins=range(4, 17), labels=range(4, 16))
fixed_acidity_class.head(20)


Out[19]:
0      7
1      7
2      7
3     11
4      7
5      7
6      7
7      7
8      7
9      7
10     6
11     7
12     5
13     7
14     8
15     8
16     8
17     8
18     7
19     7
Name: fixed acidity, dtype: category
Categories (12, int64): [4 < 5 < 6 < 7 ... 12 < 13 < 14 < 15]

In [20]:
fixed_acidity_class.name = 'fa_class'
red_wines_df = pandas.concat([red_wines_df, fixed_acidity_class], axis=1)

In [21]:
red_wines_df.head()


Out[21]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality fa_class
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5 7
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5 7
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5 7
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6 11
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5 7

Get the mean residual sugar for each quality category/fixed acidity pair using a pivot_table. mean is the default agregation function.


In [22]:
pandas.pivot_table(red_wines_df, values='residual sugar', index='quality', columns='fa_class')


Out[22]:
fa_class 4 5 6 7 8 9 10 11 12 13 14 15
quality
3 NaN NaN 1.500000 3.537500 3.400000 NaN 1.800000 2.200000 NaN NaN NaN NaN
4 1.75 5.300000 2.714286 2.453846 2.583333 2.433333 2.566667 1.500000 4.500000 NaN NaN NaN
5 1.60 1.850000 2.492623 2.441331 2.496786 2.675000 3.238889 2.770000 2.393333 3.133333 NaN 5.025
6 2.35 2.886538 2.556767 2.167027 2.281731 2.801562 2.910345 2.524359 2.912500 2.850000 1.8 NaN
7 2.10 1.900000 2.595000 2.655000 2.796429 2.862500 2.718000 2.638889 4.150000 2.800000 2.2 3.700
8 2.00 1.600000 NaN 2.316667 1.800000 2.166667 3.866667 5.200000 2.200000 NaN NaN NaN

Change the aggregation function to max:


In [23]:
pandas.pivot_table(red_wines_df, values='residual sugar', index='quality', 
                   columns='fa_class', aggfunc=max)


Out[23]:
fa_class 4 5 6 7 8 9 10 11 12 13 14 15
quality
3 NaN NaN 1.8 5.7 3.4 NaN 2.10 2.20 NaN NaN NaN NaN
4 2.1 12.9 5.6 4.4 6.3 3.4 3.40 1.60 4.5 NaN NaN NaN
5 1.6 2.5 7.9 8.1 7.9 13.8 15.50 5.15 4.6 4.8 NaN 7.5
6 4.3 13.9 10.7 5.5 5.1 11.0 15.40 6.20 4.3 3.8 1.8 NaN
7 2.1 2.2 6.0 8.3 6.2 8.9 6.55 4.40 5.8 2.8 2.2 3.7
8 2.0 1.8 NaN 3.6 1.8 2.8 6.40 5.20 2.2 NaN NaN NaN

Change the aggregation function to min:


In [24]:
pandas.pivot_table(red_wines_df, values='residual sugar', index='quality', 
                   columns='fa_class', aggfunc=min)


Out[24]:
fa_class 4 5 6 7 8 9 10 11 12 13 14 15
quality
3 NaN NaN 1.2 2.1 3.4 NaN 1.5 2.2 NaN NaN NaN NaN
4 1.4 1.5 1.6 1.3 1.5 1.6 2.1 1.4 4.5 NaN NaN NaN
5 1.6 1.4 1.4 1.3 1.2 1.3 1.9 1.8 1.4 1.9 NaN 4.2
6 1.2 1.2 1.4 0.9 1.3 1.5 1.4 1.4 2.0 2.2 1.8 NaN
7 2.1 1.2 1.4 1.2 1.7 1.4 1.6 1.4 2.3 2.8 2.2 3.7
8 2.0 1.4 NaN 1.7 1.8 1.8 2.6 5.2 2.2 NaN NaN NaN