If you've never used pandas before: it's amazingly useful and, at times, frustrating.
Recommended links:
Read through this full series of excellent blog posts by Tom Augspurger.
High level tip
The paper "Tidy Data" (PDF) by Hadley Wickham is an excellent read; much of it applies to data analysis in any language.
In [1]:
from __future__ import absolute_import, division, print_function
%matplotlib inline
import matplotlib.pyplot as plt
In [2]:
import seaborn as sns
# Global plot styling for the notebook: seaborn 'poster' context on the
# 'darkgrid' style, then the default matplotlib figure size.
sns.set_context(context='poster')
# Alternative style kept for reference: sns.set_style('whitegrid')
sns.set_style(style='darkgrid')
plt.rcParams['figure.figsize'] = (12, 8)  # plotsize
In [3]:
import numpy as np
import pandas as pd
# `pandas.tools.plotting` was relocated to `pandas.plotting` (pandas 0.20) and
# the old module path has since been removed. Try the current location first
# and fall back to the legacy one so this cell imports on old and new pandas.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
from sklearn.datasets import load_boston
This section uses the cleaned data produced by the Data Cleaning notebook; see that notebook for details.
In [4]:
# Load the cleaned coal production CSV into `df`. The path is relative, so it
# resolves against the notebook's working directory.
df = pd.read_csv("../data/coal_prod_cleaned.csv")
In [5]:
# Shell escape: install qgrid with conda, auto-confirming the prompt (-y).
# NOTE(review): this is an unpinned install performed at run time, and it goes
# to whichever `conda` the shell resolves -- confirm that it is the kernel's
# environment, or move the dependency into environment setup.
!conda install qgrid -y
In [6]:
# Check out http://nbviewer.ipython.org/github/quantopian/qgrid/blob/master/qgrid_demo.ipynb for more (including demo)
# NOTE(review): this import sits mid-notebook; as the inline remark says, it
# belongs with the other imports at the top.
import qgrid # Put imports at the top
# Presumably copies qgrid's front-end assets into the notebook install,
# replacing any existing copy (overwrite=True) -- TODO confirm against the
# installed qgrid version.
qgrid.nbinstall(overwrite=True)
In [7]:
# Preview the first rows of the coal data loaded into `df`.
df.head()
Out[7]:
In [7]:
# Render an interactive qgrid table restricted to the mine-identifying columns.
qgrid.show_grid(
    df[['MSHA_ID', 'Year', 'Mine_Name', 'Mine_State', 'Mine_County']],
    remote_js=True,
)
In [8]:
# Select the static `inline` matplotlib backend (the first cell already does
# this; repeated here to switch back after trying other backends).
%matplotlib inline
In [28]:
# Select the interactive `notebook` matplotlib backend.
# NOTE(review): execution counts around this cell are out of order, so which
# backend is active for the later plots depends on the order cells were run.
%matplotlib notebook
In [12]:
# Re-apply the notebook's plot styling: seaborn 'poster' context, 'darkgrid'
# style, and the default figure size.
sns.set_context(context='poster')
sns.set_style(style='darkgrid')
plt.rcParams['figure.figsize'] = (12, 8)  # plotsize
In [7]:
import mpld3
In [31]:
# Turn on mpld3's notebook integration (presumably figures drawn after this
# are rendered through mpld3 -- confirm against the mpld3 docs).
mpld3.enable_notebook()
In [49]:
# Turn mpld3's notebook integration back off so later figures use the
# regular matplotlib backend again.
mpld3.disable_notebook()
In [8]:
# Scatter of average head-count against labor hours for every row in `df`.
plt.scatter(df["Average_Employees"], df["Labor_Hours"])
plt.xlabel("Number of Employees")
# Trailing semicolon suppresses the text repr of the last call.
plt.ylabel("Total Hours Worked");
In [9]:
# Build one palette entry per distinct value of `df.Year`, not a fixed five.
# The palette is later zipped against the sorted unique years, and zip stops
# at the shorter input. A hard-coded size would therefore drop every year
# past the fifth from the mapping, and looking one of those up would raise
# KeyError. seaborn cycles the palette when asked for more colours than it
# holds, so any number of years is served.
colors = sns.color_palette(n_colors=len(df.Year.unique()))
In [10]:
# Map each distinct year (ascending) to the palette entry at the same position.
color_dict = dict(zip(sorted(df.Year.unique()), colors))
In [11]:
# Display the year -> colour mapping built in the previous cell.
color_dict
Out[11]:
In [12]:
# Overlay one scatter series per selected year: labor hours (x) against
# production in short tons (y). The years are picked by position from
# `df.Year.unique()` (first, third and last entries) and drawn in ascending order.
for year in sorted(df.Year.unique()[[0, 2, -1]]):
    # Apply the boolean mask once per year instead of once per axis.
    year_rows = df[df.Year == year]
    plt.scatter(year_rows.Labor_Hours,
                year_rows.Production_short_tons,
                # Use `color=` rather than `c=`: matplotlib cannot tell a
                # single RGB tuple passed as `c` apart from a sequence of
                # values to colour-map, and mis-colours the points when the
                # series happens to have as many rows as the tuple has entries.
                color=color_dict[year],
                s=50,
                label=year,
                )
plt.xlabel("Total Hours Worked")
plt.ylabel("Total Amount Produced")
plt.legend()
# Writes the figure to the current working directory.
plt.savefig("ex1.png")
In [13]:
import matplotlib as mpl
In [14]:
# Switch matplotlib to the 'bmh' style sheet; this affects figures created
# after this point.
mpl.style.use('bmh')
In [15]:
# Show the names of the style sheets matplotlib has available.
plt.style.available
Out[15]:
In [16]:
# Same per-year overlay of labor hours (x) against production in short tons
# (y), redrawn after the matplotlib style sheet was changed. The years are
# picked by position from `df.Year.unique()` (first, third and last entries)
# and drawn in ascending order.
for year in sorted(df.Year.unique()[[0, 2, -1]]):
    # Apply the boolean mask once per year instead of once per axis.
    year_rows = df[df.Year == year]
    plt.scatter(year_rows.Labor_Hours,
                year_rows.Production_short_tons,
                # Use `color=` rather than `c=`: matplotlib cannot tell a
                # single RGB tuple passed as `c` apart from a sequence of
                # values to colour-map, and mis-colours the points when the
                # series happens to have as many rows as the tuple has entries.
                color=color_dict[year],
                s=50,
                label=year,
                )
plt.xlabel("Total Hours Worked")
plt.ylabel("Total Amount Produced")
plt.legend()
# plt.savefig("ex1.png")
Out[16]:
In [17]:
# Assemble the Boston housing data into one frame: the feature columns plus
# the target, which is given the column name 'MEDV'.
# NOTE(review): this rebinds `df`, which earlier cells use for the coal data,
# so those cells will not work if re-run after this one.
# NOTE(review): `load_boston` has been removed from recent scikit-learn
# releases -- confirm the version this notebook is pinned to.
df_dict = load_boston()
features = pd.DataFrame(df_dict.data, columns=df_dict.feature_names)
target = pd.DataFrame(df_dict.target, columns=['MEDV'])
df = pd.concat((features, target), axis=1)
df.head()
Out[17]:
In [18]:
# Target variable
fig, ax = plt.subplots(figsize=(10,8))
sns.distplot(df.MEDV, ax=ax, rug=True, hist=False)
Out[18]:
In [19]:
# Two-variable kernel density estimate of LSTAT against MEDV, drawn on a
# dedicated axes.
fig, ax = plt.subplots(figsize=(10, 7))
sns.kdeplot(df['LSTAT'], df['MEDV'], ax=ax)
Out[19]:
In [20]:
# Pairwise scatter plots for the target and a handful of features, with
# histograms on the diagonal; the trailing semicolon hides the returned array's repr.
fig, ax = plt.subplots(figsize=(10, 10))
scatter_matrix(
    df[['MEDV', 'LSTAT', 'CRIM', 'RM', 'NOX', 'DIS']],
    ax=ax,
    diagonal='hist',
    alpha=0.2,
);
In [ ]:
In [ ]:
In [ ]:
In [ ]: