Pandas

If you've never used pandas before, it's amazingly useful, and at times frustrating.

Recommended links:

Read through this full series of excellent blog posts by Tom Augspurger.

High level tip

  • try to represent data in the proper format
    • floats as floats; ints as ints; etc.
    • Especially if you have dates, or timestamps, or datetimestamps, keep them in that format.

This pdf Tidy Data by Hadley Wickham is an excellent read with a lot that relates to data analysis in any language.


In [1]:
from __future__ import absolute_import, division, print_function

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
import seaborn as sns
sns.set_context('poster')
# sns.set_style('whitegrid') 
sns.set_style('darkgrid') 
plt.rcParams['figure.figsize'] = 12, 8  # plotsize

In [3]:
import numpy as np
import pandas as pd
from pandas.tools.plotting import scatter_matrix
from sklearn.datasets import load_boston

Note

Using cleaned data from Data Cleaning Notebook. See Notebook for details.


In [4]:
df = pd.read_csv("../data/coal_prod_cleaned.csv")

Notebook Extensions -- qgrid


In [5]:
!conda install qgrid -y


Fetching package metadata .........
Solving package specifications: ..........

# All requested packages already installed.
# packages in environment at /Users/jonathan/miniconda3/envs/py3:
#
qgrid                     0.3.2                    py35_0  

In [6]:
# Check out http://nbviewer.ipython.org/github/quantopian/qgrid/blob/master/qgrid_demo.ipynb for more (including demo)
import qgrid # Put imports at the top
qgrid.nbinstall(overwrite=True)

In [7]:
df.head()


Out[7]:
MSHA_ID Average_Employees Company_Type Labor_Hours Mine_Basin Mine_County Mine_Name Mine_State Mine_Status Mine_Type Operating_Company Operating_Company_Address Operation_Type Production_short_tons Union_Code Year
0 103295 18.0 Independent Producer Operator 39175.0 Appalachia Southern Bibb Seymour Mine Alabama Active Surface Hope Coal Company Inc P.O. Box 249, Maylene, AL 35114 Mine only 105082.0 NaN 2008
1 103117 19.0 Operating Subsidiary 29926.0 Appalachia Southern Cullman Mine #2, #3, #4 Alabama Active, men working, not producing Surface Twin Pines Coal Company Inc 1874 County Road 15, Bremen, AL 35033 Mine only 10419.0 NaN 2008
2 103361 20.0 Operating Subsidiary 42542.0 Appalachia Southern Cullman Cold Springs West Mine Alabama Active Surface Twin Pines Coal Company 74 Industrial Parkway, Jasper, AL 35502 Mine only 143208.0 NaN 2008
3 100759 395.0 Operating Subsidiary 890710.0 Appalachia Southern Fayette North River # 1 Underground Mi Alabama Active Underground Chevron Mining Inc 3114 County Road 63 S, Berry, AL 35546 Mine and Preparation Plant 2923261.0 United Mine Workers of America 2008
4 103246 22.0 Independent Producer Operator 55403.0 Appalachia Southern Franklin Bear Creek Alabama Active Surface Birmingham Coal & Coke Co., In 912 Edenton Street, Birmingham, AL 35242 Mine only 183137.0 NaN 2008

In [7]:
qgrid.show_grid(df[['MSHA_ID', 'Year', 'Mine_Name', 'Mine_State', 'Mine_County']], remote_js=True)

In [8]:
%matplotlib inline

In [28]:
%matplotlib notebook

In [12]:
sns.set_context('poster')
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = 12, 8  # plotsize

In [7]:
import mpld3

In [31]:
mpld3.enable_notebook()

In [49]:
mpld3.disable_notebook()

In [8]:
plt.scatter(df.Average_Employees, 
            df.Labor_Hours)
plt.xlabel("Number of Employees")
plt.ylabel("Total Hours Worked");



In [9]:
colors = sns.color_palette(n_colors=5)

In [10]:
color_dict = {key: value for key, value in zip(sorted(df.Year.unique()), colors)}

In [11]:
color_dict


Out[11]:
{2008: (0.2980392156862745, 0.4470588235294118, 0.6901960784313725),
 2009: (0.3333333333333333, 0.6588235294117647, 0.40784313725490196),
 2010: (0.7686274509803922, 0.3058823529411765, 0.3215686274509804),
 2011: (0.5058823529411764, 0.4470588235294118, 0.6980392156862745),
 2012: (0.8, 0.7254901960784313, 0.4549019607843137)}

In [12]:
for year in sorted(df.Year.unique()[[0, 2, -1]]):
    plt.scatter(df[df.Year == year].Labor_Hours,
                df[df.Year == year].Production_short_tons, 
                c=color_dict[year],
                s=50,
                label=year,
               )
plt.xlabel("Total Hours Worked")
plt.ylabel("Total Amount Produced")
plt.legend()
plt.savefig("ex1.png")



In [13]:
import matplotlib as mpl

In [14]:
mpl.style.use('bmh')

In [15]:
plt.style.available


Out[15]:
['fivethirtyeight',
 'seaborn-bright',
 'grayscale',
 'seaborn-whitegrid',
 'classic',
 'seaborn-poster',
 'seaborn-pastel',
 'ggplot',
 'seaborn-ticks',
 'seaborn-darkgrid',
 'seaborn-deep',
 'seaborn-muted',
 'seaborn-colorblind',
 'seaborn-dark',
 'seaborn-dark-palette',
 'seaborn-talk',
 'seaborn-notebook',
 'dark_background',
 'seaborn-white',
 'bmh',
 'seaborn-paper']

In [16]:
for year in sorted(df.Year.unique()[[0, 2, -1]]):
    plt.scatter(df[df.Year == year].Labor_Hours,
                df[df.Year == year].Production_short_tons, 
                c=color_dict[year],
                s=50,
                label=year,
               )
plt.xlabel("Total Hours Worked")
plt.ylabel("Total Amount Produced")
plt.legend()
# plt.savefig("ex1.png")


Out[16]:
<matplotlib.legend.Legend at 0x1126a7a90>

In [17]:
df_dict = load_boston()
features = pd.DataFrame(data=df_dict.data, columns = df_dict.feature_names)
target = pd.DataFrame(data=df_dict.target, columns = ['MEDV'])
df = pd.concat([features, target], axis=1)
df.head()


Out[17]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 36.2

In [18]:
# Target variable
fig, ax = plt.subplots(figsize=(10,8))
sns.distplot(df.MEDV, ax=ax, rug=True, hist=False)


/Users/jonathan/miniconda3/envs/py3/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x1126c9080>

In [19]:
fig, ax = plt.subplots(figsize=(10,7))
sns.kdeplot(df.LSTAT,
            df.MEDV,
            ax=ax)


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x115453ba8>

In [20]:
fig, ax = plt.subplots(figsize=(10, 10))
scatter_matrix(df[['MEDV', 'LSTAT', 'CRIM', 'RM', 'NOX', 'DIS']], alpha=0.2, diagonal='hist', ax=ax);


/Users/jonathan/miniconda3/envs/py3/lib/python3.5/site-packages/pandas/tools/plotting.py:3369: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared
  "the passed axes is being cleared", UserWarning)

In [ ]:


In [ ]:


In [ ]:


In [ ]: