CSV.reader()
function in the standard librarynumpy.loadtxt()
functionpandas.read_csv()
function
In [1]:
# Allow Jupyter Notebook to render plots inline
%matplotlib inline
In [2]:
####### IGNORE Warning that appears when executing this cell, it is an implementation issue for the seaborn module
# Load modules
import matplotlib
import matplotlib.pyplot as plt
import os as os
import pandas as pd
import sqlalchemy as sa
import seaborn as sns
# Load classes
from time import strftime
from pandas.tools.plotting import scatter_matrix
In [3]:
# create engine
engine_name = 'mssql+pyodbc'
host_name = 'au04db502nml3'
db_name = 'OneViewStaging_Telfer'
driver_name = 'SQL+Server+Native+Client+11.0'
cx = engine_name + '://' + host_name + '/' + db_name + '?driver=' + driver_name
engine = sa.create_engine(cx)
In [4]:
# define the query here to select the data wanted
sql = """
select
[StartDateTime]
,[Total Material Moved (t)]
,[Total ExPit Material Moved (t)]
,[Total ExPit Ore (t)]
,[Direct Feed & OP Rehandle (t)]
,[Direct Feed (t)]
,[Leach Ore (t)]
,[ExPit Ore (t)]
,[Waste (t)]
,[Rehandle (t)]
,[Relocation (t)]
,[TopSoil (t)]
,[Scats (t)]
,[MReef (OP) (t)]
,[SLC (t)]
,[M35 (t)]
from
[dbo].[tb_OPCO_TELReconcilorData]
order by
StartDateTime
;"""
In [5]:
# get the data from the db
df = pd.read_sql(sql, engine)
In [6]:
# We're' going to 'clean' the data of null values; the following columns have no
# data in them so we need to remove them to ensure our cleaning in the next step
# doesn't delete all the data
cols_to_drop = ['TopSoil (t)']
df.drop(cols_to_drop, axis=1, inplace=True)
# get rid of rows with nulls
# and add column with day of week to use for analysis
df.dropna(axis=0, how='any', inplace=True)
df['Day Of Week'] = (df['StartDateTime']).dt.dayofweek
In [7]:
# Summary stats for the results in the query
df.describe().transpose()
Out[7]:
In [17]:
# Lets view the first 5 rows of the data set
df.head()
Out[17]:
In [9]:
# Use default plotting options to forma scatter matrix
fig1 = scatter_matrix(df)
In [10]:
# Lets format the scatter matrix differently
matplotlib.style.use('ggplot')
fig2 = scatter_matrix(df, alpha=0.2, figsize=(15, 15), diagonal='kde')
In [11]:
# Perform scatter matrix with another viz library - seaborn - using its defaults
fig3 = sns.pairplot(df, hue="Day Of Week")
In [12]:
# Lets do the same thing again removing the analysis at the "Day of Week" level
# The upper half of the matrix triangle will have regression plots for the pairs
# The diagonal will have distribution plots for the variable/feature in question
# The lower half of the triangle will have residual plots for the the pairs
fig4 = sns.PairGrid(df)
fig4.map_upper(sns.regplot)
fig4.map_lower(sns.residplot)
fig4.map_diag(plt.hist)
for ax in fig4.axes.flat:
plt.setp(ax.get_xticklabels(), rotation=45)
fig4.add_legend()
fig4.set(alpha=0.5)
Out[12]:
In [13]:
# Lets view "Total ExPit Ore (t)" vs "Waste (t)" as there potentially
# looks to be a linear relationship
fig5 = sns.jointplot("Total ExPit Ore (t)", "Waste (t)", data=df, kind='kde')
In [14]:
# Lets view the same relationship, just styled differently
fig6 = sns.jointplot("Total ExPit Ore (t)", "Waste (t)", data=df, kind='reg')
In [15]:
# Look at the relationship through the days of the week
fig7 = sns.lmplot(x="Total ExPit Ore (t)", y="Waste (t)", data=df, col="Day Of Week", order=1);
In [16]:
# Create a box plot of the varaibles/features
date_cols_to_drop = ["Day Of Week", "StartDateTime"]
df = df.drop(date_cols_to_drop[1], axis=1)
fig8 = sns.boxplot(data=df, orient="h")