If you've never used pandas before: it's amazingly useful and, at times, frustrating.
Recommended links:
Read through this full series of excellent blog posts by Tom Augspurger.
High-level tip:
The paper "Tidy Data" by Hadley Wickham (PDF) is an excellent read, and much of it applies to data analysis in any language.
In [ ]:
from __future__ import absolute_import, division, print_function
%matplotlib inline
import matplotlib.pyplot as plt
In [ ]:
import seaborn as sns
# Global plot styling: "poster" context and a white background with a grid.
sns.set_context('poster')
sns.set_style('whitegrid')  # alternative look: sns.set_style('darkgrid')
# Default size used for every new figure.
plt.rcParams['figure.figsize'] = (12, 8)
In [ ]:
import numpy as np
import pandas as pd
# `pandas.tools.plotting` was deprecated in pandas 0.20 and later removed;
# prefer the current location and only fall back for very old installations.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
from sklearn.datasets import load_boston
import warnings
# Silence every warning for the rest of the session to keep cell output tidy.
warnings.filterwarnings(action='ignore')
This uses the cleaned data produced by the Data Cleaning notebook; see that notebook for details.
In [ ]:
# Load the cleaned coal-production table from the data directory.
coal_csv_path = "../data/coal_prod_cleaned.csv"
df = pd.read_csv(coal_csv_path)
In [ ]:
# Preview the first five rows of the loaded table.
df.head(n=5)
In [ ]:
# Scatter the Average_Employees column against the Labor_Hours column,
# one point per row of df.
employees = df['Average_Employees']
hours_worked = df['Labor_Hours']
plt.scatter(employees, hours_worked)
plt.xlabel("Number of Employees")
plt.ylabel("Total Hours Worked");
In [ ]:
# Ask seaborn for one palette colour per distinct value in df.Year.
n_years = df.Year.nunique()
colors = sns.color_palette(n_colors=n_years)
In [ ]:
# Pair each distinct year (in ascending order) with the palette colour at
# the same position.
years_ascending = sorted(df.Year.unique())
color_dict = dict(zip(years_ascending, colors))
In [ ]:
# Display the year -> colour mapping.
color_dict
In [ ]:
# Plot production against labor hours for three years: the entries at
# positions 0, 2 and -1 of df.Year.unique(), drawn in ascending year order.
for year in sorted(df.Year.unique()[[0, 2, -1]]):
    # Filter the rows for this year once and reuse the result for both axes.
    year_rows = df[df.Year == year]
    # Use `color=` rather than `c=`: an RGB tuple passed through `c` is
    # ambiguous and is read as per-point values when a year has exactly
    # three rows.
    plt.scatter(year_rows.Labor_Hours,
                year_rows.Production_short_tons,
                color=color_dict[year],
                s=50,
                label=year,
                )
plt.xlabel("Total Hours Worked")
plt.ylabel("Total Amount Produced")
plt.legend()
# Write the finished figure to disk.
plt.savefig("ex1.png")
In [ ]:
import matplotlib as mpl
In [ ]:
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# "seaborn-v0_8-*" and later releases dropped the old names, so fall back
# to the new name when the original one is not available.
style_name = 'seaborn-colorblind'
if style_name not in mpl.style.available:
    style_name = 'seaborn-v0_8-colorblind'
mpl.style.use(style_name)
In [ ]:
# Display the names of the style sheets matplotlib can load.
plt.style.available
In [ ]:
# Plot production against labor hours for three years: the entries at
# positions 0, 2 and -1 of df.Year.unique(), drawn in ascending year order.
for year in sorted(df.Year.unique()[[0, 2, -1]]):
    # Filter the rows for this year once and reuse the result for both axes.
    year_rows = df[df.Year == year]
    # Use `color=` rather than `c=`: an RGB tuple passed through `c` is
    # ambiguous and is read as per-point values when a year has exactly
    # three rows.
    plt.scatter(year_rows.Labor_Hours,
                year_rows.Production_short_tons,
                color=color_dict[year],
                s=50,
                label=year,
                )
plt.xlabel("Total Hours Worked")
plt.ylabel("Total Amount Produced")
plt.legend();
# plt.savefig("ex1.png")
In [ ]:
# Assemble the Boston housing data into one frame: the feature columns
# followed by the target column, named MEDV. Note that this rebinds `df`,
# replacing the coal-production table used in the cells above.
df_dict = load_boston()
features = pd.DataFrame(df_dict.data, columns=df_dict.feature_names)
target = pd.DataFrame({'MEDV': df_dict.target})
df = pd.concat((features, target), axis=1)
df.head()
In [ ]:
# Target variable: distribution plot of MEDV on a 6x4 figure, with the rug
# switched on and the histogram switched off.
fig, ax = plt.subplots(figsize=(6, 4))
sns.distplot(df['MEDV'], hist=False, rug=True, ax=ax)
In [ ]:
# Two-variable kdeplot: LSTAT is passed first and MEDV second, drawn on a
# dedicated 10x7 axes.
fig, ax = plt.subplots(figsize=(10, 7))
sns.kdeplot(df['LSTAT'], df['MEDV'], ax=ax)
In [ ]:
# Scatter matrix of the target (MEDV) and five selected feature columns,
# with histograms on the diagonal and mostly transparent points.
matrix_columns = ['MEDV', 'LSTAT', 'CRIM', 'RM', 'NOX', 'DIS']
fig, ax = plt.subplots(figsize=(10, 10))
scatter_matrix(df[matrix_columns], alpha=0.2, diagonal='hist', ax=ax);
In [ ]:
# pd.cut needs both the values to bin and the bins; calling it with no
# arguments raises TypeError. Illustrative call: split the MEDV column into
# four equal-width intervals (the column and bin count are example choices).
pd.cut(df.MEDV, bins=4)
In [ ]:
In [ ]:
In [ ]: