In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import textwrap
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# special things
from pivottablejs import pivot_ui
from ipywidgets import FloatSlider, interactive, IntSlider
from scipy import interpolate

# sql
%load_ext sql_magic
import sqlalchemy
import sqlite3
from sqlalchemy import create_engine
sqlite_engine = create_engine('sqlite://')
%config SQL.conn_name = "sqlite_engine"
%config SQL.output_result = False

# autoreload
%load_ext autoreload
%autoreload 1
# %aimport module_to_reload

import ipyvolume as ipv
import altair as alt
from vega_datasets import data

import seaborn as sns
sns.set_context('poster', font_scale=1.3)

import qgrid



In [2]:
coal = pd.read_csv("../data/coal_prod_cleaned.csv")
coal.head()


Out[2]:
MSHA_ID Average_Employees Company_Type Labor_Hours Mine_Basin Mine_County Mine_Name Mine_State Mine_Status Mine_Type Operating_Company Operating_Company_Address Operation_Type Production_short_tons Union_Code Year
0 103295 18.0 Independent Producer Operator 39175.0 Appalachia Southern Bibb Seymour Mine Alabama Active Surface Hope Coal Company Inc P.O. Box 249, Maylene, AL 35114 Mine only 105082.0 NaN 2008
1 103117 19.0 Operating Subsidiary 29926.0 Appalachia Southern Cullman Mine #2, #3, #4 Alabama Active, men working, not producing Surface Twin Pines Coal Company Inc 1874 County Road 15, Bremen, AL 35033 Mine only 10419.0 NaN 2008
2 103361 20.0 Operating Subsidiary 42542.0 Appalachia Southern Cullman Cold Springs West Mine Alabama Active Surface Twin Pines Coal Company 74 Industrial Parkway, Jasper, AL 35502 Mine only 143208.0 NaN 2008
3 100759 395.0 Operating Subsidiary 890710.0 Appalachia Southern Fayette North River # 1 Underground Mi Alabama Active Underground Chevron Mining Inc 3114 County Road 63 S, Berry, AL 35546 Mine and Preparation Plant 2923261.0 United Mine Workers of America 2008
4 103246 22.0 Independent Producer Operator 55403.0 Appalachia Southern Franklin Bear Creek Alabama Active Surface Birmingham Coal & Coke Co., In 912 Edenton Street, Birmingham, AL 35242 Mine only 183137.0 NaN 2008

In [3]:
coal.to_sql('coal', con=sqlite_engine, if_exists='append', index=False)

In [4]:
%%read_sql df
SELECT 
    Mine_State,
    Labor_Hours,
    sum(Production_short_tons) as Production_short_tons
FROM coal
WHERE Mine_State != "Wyoming"
GROUP BY Mine_State


Query started at 05:17:42 PM PDT; Query executed in 0.00 m

In [5]:
df.head()


Out[5]:
Mine_State Labor_Hours Production_short_tons
0 Alabama 39175.0 97714698.0
1 Alaska 234809.0 9689484.0
2 Arizona 999459.0 38855497.0
3 Arkansas 1310.0 337588.0
4 Colorado 5112.0 140913919.0

In [6]:
df = pd.read_csv("../data/coal_prod_cleaned.csv")
df.shape


Out[6]:
(9042, 16)

In [7]:
df.columns


Out[7]:
Index(['MSHA_ID', 'Average_Employees', 'Company_Type', 'Labor_Hours',
       'Mine_Basin', 'Mine_County', 'Mine_Name', 'Mine_State', 'Mine_Status',
       'Mine_Type', 'Operating_Company', 'Operating_Company_Address',
       'Operation_Type', 'Production_short_tons', 'Union_Code', 'Year'],
      dtype='object')

In [8]:
qgrid_widget = qgrid.show_grid(
    df[["Year", "Mine_State", "Labor_Hours", "Production_short_tons"]],
    show_toolbar=True,
)
qgrid_widget


QGrid

Interactive pandas dataframes: https://github.com/quantopian/qgrid

pip install qgrid --upgrade

In [9]:
df2 = df[df['Mine_State'] != "Wyoming"].groupby('Mine_State').sum()

In [10]:
df3 = df.groupby('Mine_State').sum()

In [11]:
# have to run this from the home dir of this repo
# cd insight/
# python setup.py develop

In [12]:
%aimport insight.plotting

In [13]:
insight.plotting.plot_prod_vs_hours(df3, color_index=1)
# insight.plotting.plot_prod_vs_hours(df2, color_index=1)



In [ ]:


In [14]:
def plot_prod_vs_hours(
    df, color_index=0, output_file="../img/production-vs-hours-worked.png"
):
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.regplot(
        df["Labor_Hours"],
        df["Production_short_tons"],
        ax=ax,
        color=sns.color_palette()[color_index],
    )
    ax.set_xlabel("Labor Hours Worked")
    ax.set_ylabel("Total Amount Produced")
    x = ax.set_xlim(-9506023.213266129, 204993853.21326613)
    y = ax.set_ylim(-51476801.43653282, 746280580.4034251)
    fig.tight_layout()
    fig.savefig(output_file)

In [15]:
plot_prod_vs_hours(df2, color_index=0)



In [16]:
plot_prod_vs_hours(df3, color_index=1)



In [ ]:
# make a change via qgrid
df3 = qgrid_widget.get_changed_df()

In [ ]:


In [ ]:
qgrid_widget = qgrid.show_grid(
    df2[["Year", "Labor_Hours", "Production_short_tons"]],
    show_toolbar=True,
)
qgrid_widget

In [ ]:


In [ ]: