Pandas

If you've never used pandas before, it's amazingly useful, and at times frustrating.

Note

Using cleaned data from Data Cleaning Notebook. See Notebook for details.



In [4]:

    
df = pd.read_csv("../data/coal_prod_cleaned.csv")

Notebook Extensions -- qgrid



In [5]:

    
!conda install qgrid -y









    



Fetching package metadata .........
Solving package specifications: ..........

# All requested packages already installed.
# packages in environment at /Users/jonathan/miniconda3/envs/py3:
#
qgrid                     0.3.2                    py35_0



In [6]:

    
# Check out http://nbviewer.ipython.org/github/quantopian/qgrid/blob/master/qgrid_demo.ipynb for more (including demo)
import qgrid # Put imports at the top
qgrid.nbinstall(overwrite=True)



In [7]:

    
df.head()









    Out[7]:






  
    
      
      MSHA_ID
      Average_Employees
      Company_Type
      Labor_Hours
      Mine_Basin
      Mine_County
      Mine_Name
      Mine_State
      Mine_Status
      Mine_Type
      Operating_Company
      Operating_Company_Address
      Operation_Type
      Production_short_tons
      Union_Code
      Year
    
  
  
    
      0
      103295
      18.0
      Independent Producer Operator
      39175.0
      Appalachia Southern
      Bibb
      Seymour Mine
      Alabama
      Active
      Surface
      Hope Coal Company Inc
      P.O. Box 249, Maylene, AL 35114
      Mine only
      105082.0
      NaN
      2008
    
    
      1
      103117
      19.0
      Operating Subsidiary
      29926.0
      Appalachia Southern
      Cullman
      Mine #2, #3, #4
      Alabama
      Active, men working, not producing
      Surface
      Twin Pines Coal Company Inc
      1874 County Road 15, Bremen, AL 35033
      Mine only
      10419.0
      NaN
      2008
    
    
      2
      103361
      20.0
      Operating Subsidiary
      42542.0
      Appalachia Southern
      Cullman
      Cold Springs West Mine
      Alabama
      Active
      Surface
      Twin Pines Coal Company
      74 Industrial Parkway, Jasper, AL 35502
      Mine only
      143208.0
      NaN
      2008
    
    
      3
      100759
      395.0
      Operating Subsidiary
      890710.0
      Appalachia Southern
      Fayette
      North River # 1 Underground Mi
      Alabama
      Active
      Underground
      Chevron Mining Inc
      3114 County Road 63 S, Berry, AL 35546
      Mine and Preparation Plant
      2923261.0
      United Mine Workers of America
      2008
    
    
      4
      103246
      22.0
      Independent Producer Operator
      55403.0
      Appalachia Southern
      Franklin
      Bear Creek
      Alabama
      Active
      Surface
      Birmingham Coal & Coke Co., In
      912 Edenton Street, Birmingham, AL 35242
      Mine only
      183137.0
      NaN
      2008



In [7]:

    
qgrid.show_grid(df[['MSHA_ID', 'Year', 'Mine_Name', 'Mine_State', 'Mine_County']], remote_js=True)



In [8]:

    
%matplotlib inline



In [28]:

    
%matplotlib notebook



In [12]:

    
sns.set_context('poster')
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = 12, 8  # plotsize



In [7]:

    
import mpld3



In [31]:

    
mpld3.enable_notebook()



In [49]:

    
mpld3.disable_notebook()



In [8]:

    
plt.scatter(df.Average_Employees, 
            df.Labor_Hours)
plt.xlabel("Number of Employees")
plt.ylabel("Total Hours Worked");



In [9]:

    
colors = sns.color_palette(n_colors=5)



In [10]:

    
color_dict = {key: value for key, value in zip(sorted(df.Year.unique()), colors)}



In [11]:

    
color_dict









    Out[11]:





{2008: (0.2980392156862745, 0.4470588235294118, 0.6901960784313725),
 2009: (0.3333333333333333, 0.6588235294117647, 0.40784313725490196),
 2010: (0.7686274509803922, 0.3058823529411765, 0.3215686274509804),
 2011: (0.5058823529411764, 0.4470588235294118, 0.6980392156862745),
 2012: (0.8, 0.7254901960784313, 0.4549019607843137)}



In [12]:

    
for year in sorted(df.Year.unique()[[0, 2, -1]]):
    plt.scatter(df[df.Year == year].Labor_Hours,
                df[df.Year == year].Production_short_tons, 
                c=color_dict[year],
                s=50,
                label=year,
               )
plt.xlabel("Total Hours Worked")
plt.ylabel("Total Amount Produced")
plt.legend()
plt.savefig("ex1.png")



In [13]:

    
import matplotlib as mpl



In [14]:

    
mpl.style.use('bmh')



In [15]:

    
plt.style.available









    Out[15]:





['fivethirtyeight',
 'seaborn-bright',
 'grayscale',
 'seaborn-whitegrid',
 'classic',
 'seaborn-poster',
 'seaborn-pastel',
 'ggplot',
 'seaborn-ticks',
 'seaborn-darkgrid',
 'seaborn-deep',
 'seaborn-muted',
 'seaborn-colorblind',
 'seaborn-dark',
 'seaborn-dark-palette',
 'seaborn-talk',
 'seaborn-notebook',
 'dark_background',
 'seaborn-white',
 'bmh',
 'seaborn-paper']



In [16]:

    
for year in sorted(df.Year.unique()[[0, 2, -1]]):
    plt.scatter(df[df.Year == year].Labor_Hours,
                df[df.Year == year].Production_short_tons, 
                c=color_dict[year],
                s=50,
                label=year,
               )
plt.xlabel("Total Hours Worked")
plt.ylabel("Total Amount Produced")
plt.legend()
# plt.savefig("ex1.png")









    Out[16]:





<matplotlib.legend.Legend at 0x1126a7a90>



In [17]:

    
df_dict = load_boston()
features = pd.DataFrame(data=df_dict.data, columns = df_dict.feature_names)
target = pd.DataFrame(data=df_dict.target, columns = ['MEDV'])
df = pd.concat([features, target], axis=1)
df.head()



In [18]:

    
# Target variable
fig, ax = plt.subplots(figsize=(10,8))
sns.distplot(df.MEDV, ax=ax, rug=True, hist=False)









    



/Users/jonathan/miniconda3/envs/py3/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[18]:





<matplotlib.axes._subplots.AxesSubplot at 0x1126c9080>



In [19]:

    
fig, ax = plt.subplots(figsize=(10,7))
sns.kdeplot(df.LSTAT,
            df.MEDV,
            ax=ax)









    Out[19]:





<matplotlib.axes._subplots.AxesSubplot at 0x115453ba8>



In [20]:

    
fig, ax = plt.subplots(figsize=(10, 10))
scatter_matrix(df[['MEDV', 'LSTAT', 'CRIM', 'RM', 'NOX', 'DIS']], alpha=0.2, diagonal='hist', ax=ax);









    



/Users/jonathan/miniconda3/envs/py3/lib/python3.5/site-packages/pandas/tools/plotting.py:3369: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared
  "the passed axes is being cleared", UserWarning)



In [ ]:



In [ ]:



In [ ]:



In [ ]:

	CRIM	ZN	INDUS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	MEDV
0	0.00632	18.0	2.31	0.538	6.575	65.2	4.0900	1.0	296.0	15.3	396.90	4.98	24.0
1	0.02731	0.0	7.07	0.469	6.421	78.9	4.9671	2.0	242.0	17.8	396.90	9.14	21.6
2	0.02729	0.0	7.07	0.469	7.185	61.1	4.9671	2.0	242.0	17.8	392.83	4.03	34.7
3	0.03237	0.0	2.18	0.458	6.998	45.8	6.0622	3.0	222.0	18.7	394.63	2.94	33.4
4	0.06905	0.0	2.18	0.458	7.147	54.2	6.0622	3.0	222.0	18.7	396.90	5.33	36.2

	MSHA_ID	Average_Employees	Company_Type	Labor_Hours	Mine_Basin	Mine_County	Mine_Name	Mine_State	Mine_Status	Mine_Type	Operating_Company	Operating_Company_Address	Operation_Type	Production_short_tons	Union_Code	Year
0	103295	18.0	Independent Producer Operator	39175.0	Appalachia Southern	Bibb	Seymour Mine	Alabama	Active	Surface	Hope Coal Company Inc	P.O. Box 249, Maylene, AL 35114	Mine only	105082.0	NaN	2008
1	103117	19.0	Operating Subsidiary	29926.0	Appalachia Southern	Cullman	Mine #2, #3, #4	Alabama	Active, men working, not producing	Surface	Twin Pines Coal Company Inc	1874 County Road 15, Bremen, AL 35033	Mine only	10419.0	NaN	2008
2	103361	20.0	Operating Subsidiary	42542.0	Appalachia Southern	Cullman	Cold Springs West Mine	Alabama	Active	Surface	Twin Pines Coal Company	74 Industrial Parkway, Jasper, AL 35502	Mine only	143208.0	NaN	2008
3	100759	395.0	Operating Subsidiary	890710.0	Appalachia Southern	Fayette	North River # 1 Underground Mi	Alabama	Active	Underground	Chevron Mining Inc	3114 County Road 63 S, Berry, AL 35546	Mine and Preparation Plant	2923261.0	United Mine Workers of America	2008
4	103246	22.0	Independent Producer Operator	55403.0	Appalachia Southern	Franklin	Bear Creek	Alabama	Active	Surface	Birmingham Coal & Coke Co., In	912 Edenton Street, Birmingham, AL 35242	Mine only	183137.0	NaN	2008