If you've never used pandas
before, it's amazing. It will also frustrate you to tears.
High level tip -- try to represent data in the proper format: floats as floats; ints as ints; etc. Especially if you have dates, or timestamps, or datetimestamps, keep them in that format. The temptation to operate on them like strings may be overwhelming, but resist! In the long run you might prevail. :\
In [1]:
# %install_ext http://raw.github.com/jrjohansson/version_information/master/version_information.py
%load_ext version_information
%reload_ext version_information
%version_information numpy, scipy, matplotlib, pandas
Out[1]:
In [3]:
# Doing this in python 2.7 code allows for most of the code to be python 3 portable.
# But you have to write your print functions: print("Hello world.")
# from __future__ import division, absolute_import, print_function, unicode_literals
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('talk')
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = 12, 8 # plotsize
import numpy as np
import pandas as pd
Using cleaned data from Data Cleaning Notebook. See Notebook for details.
In [4]:
dframe = pd.read_csv("../data/coal_prod_cleaned.csv")
In [5]:
# Check out http://nbviewer.ipython.org/github/quantopian/qgrid/blob/master/qgrid_demo.ipynb for more (including demo)
import qgrid # Best practices is to put imports at the top of the Notebook.
qgrid.nbinstall(overwrite=True)
In [6]:
dframe.head()
Out[6]:
In [7]:
qgrid.show_grid(dframe[['MSHA_ID', 'Year', 'Mine_Name', 'Mine_State', 'Mine_County']], remote_js=True)
In [8]:
plt.scatter(dframe.Average_Employees, dframe.Labor_Hours)
plt.xlabel("Number of Employees")
plt.ylabel("Total Hours Worked")
Out[8]:
In [9]:
plt.scatter(dframe.Labor_Hours, dframe.Production_short_tons, )
plt.xlabel("Total Hours Worked")
plt.ylabel("Total Amount Produced")
Out[9]:
In [10]:
colors = sns.color_palette(n_colors=11)
In [11]:
color_dict = {key: value for key, value in zip(sorted(dframe.Year.unique()), colors)}
In [12]:
color_dict
Out[12]:
In [13]:
for year in sorted(dframe.Year.unique()[[0,2, 5, -1]]):
plt.scatter(dframe[dframe.Year == year].Labor_Hours,
dframe[dframe.Year == year].Production_short_tons,
c=color_dict[year],
s=50,
label=year,
)
plt.xlabel("Total Hours Worked")
plt.ylabel("Total Amount Produced")
plt.legend()
plt.savefig("ex1.png")
In [ ]:
# facet grid
In [14]:
for col in dframe.columns:
print col
In [ ]:
In [15]:
# An updated implementation from Christian Perez at SVDS https://github.com/cfperez/ipython-sql
%load_ext sql
%reload_ext sql
In [16]:
coalproduction = dframe.copy()
In [17]:
%config SqlMagic.autopandas=True
In [18]:
%%sql sqlite://
PERSIST coalproduction
Out[18]:
In [19]:
%%sql sqlite://
SELECT DISTINCT company_type FROM coalproduction
WHERE msha_id = 5000030
Out[19]:
In [20]:
dbtest = %sql SELECT * FROM coalproduction
In [21]:
type(dbtest)
Out[21]:
In [22]:
dbtest.head()
Out[22]:
In [ ]: