In [2]:
import pandas as pd
import sqlite3
import statsmodels.api as sm
import numpy as np # might not need this
import seaborn as sns
from pylab import savefig
from IPython.display import HTML
from premailer import transform
import matplotlib.pyplot as plt
In [3]:
# all of these imports are for the mail() function
import smtplib
from email.mime.image import MIMEImage
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.text import MIMEText
from email import encoders
import os
from sys import exit
In [4]:
#to view graphs in ipython
%matplotlib inline
In [5]:
# if this .sqlite db doesn't already exists, this will create it
# if the .sqlite db *does* already exist, this establishes the desired connection
con = sqlite3.connect("sql_sample_db_2.sqlite")
In [6]:
# create pandas dataframes from each .csv file:
sales_table = pd.read_csv('https://raw.githubusercontent.com/DaveBackus/Data_Bootcamp/master/Code/SQL/sales_table.csv')
car_table = pd.read_csv('https://raw.githubusercontent.com/DaveBackus/Data_Bootcamp/master/Code/SQL/car_table.csv')
salesman_table = pd.read_csv('https://raw.githubusercontent.com/DaveBackus/Data_Bootcamp/master/Code/SQL/salesman_table.csv')
cust_table = pd.read_csv('https://raw.githubusercontent.com/DaveBackus/Data_Bootcamp/master/Code/SQL/cust_table.csv')
In [7]:
# make a list of the tables (dataframes) and table names:
tables = [sales_table, car_table, salesman_table, cust_table]
table_names = ['sales_table', 'car_table', 'salesman_table', 'cust_table']
In [8]:
# drop each table name if it already exists to avoid error if you rerun this bit of code
# then add it back (or add it for the first time, if the table didn't already exist)
for i in range(len(tables)):
table_name = table_names[i]
table = tables[i]
con.execute("DROP TABLE IF EXISTS {}".format(table_name))
pd.io.sql.to_sql(table, "{}".format(table_name), con, index=False)
In [9]:
#quick function that's useful to turn things into a comma separated string, useful for SQL
def to_list_string(series):
try:
series.iloc[0] + 1
return ', '.join(["{:.0f}".format(row[1]) for row in series.iteritems()])
except TypeError:
return ', '.join(["'" + row[1] + "'" for row in series.iteritems()])
In [10]:
# these just provide formatting to enable images to appear in-line in emails
def to_cid(path):
return '<img src="cid:{}">'.format(path)
def to_image(path):
return dict(title=u'Graph', path=u'{}'.format(path), cid='{}'.format(path))
image_list = []
In [11]:
def html_prep(tables):
no_underscores = tables.replace('_', ' ')
no_indents = no_underscores.replace('<UL>', '')
no_bullet_points = no_indents.replace('<LI>', '')
no_nans = no_bullet_points.replace('NaN', '0')
no_nans2 = no_nans.replace('nan', '0')
no_weird_stuff1 = no_nans2.replace('<', '<')
no_weird_stuff2 = no_weird_stuff1.replace('>', '>')
return no_weird_stuff2
In [12]:
# makes it easy to send out an email with inline images and attachments
def mail(to, subject, text, html, attach=None, image=None):
sendMail = True
if type(to) == str:
tempList = []
tempList.append(to)
to = tempList
elif type(to) == list or type(to) == tuple:
pass
else:
print('Unrecognized address input type')
sendMail = False
if sendMail == True:
msg = MIMEMultipart('alternative')
msg['From'] = 'reports@peanutlabs.com'
msg['To'] = ','.join(to)
msg['Subject'] = subject
part1 = MIMEText(text,'plain')
part2 = MIMEText(html,'html')
msg.attach(part1)
msg.attach(part2)
def attacher(msg, item):
part = MIMEBase('application', 'octet-stream')
part.set_payload(open(item, 'rb').read())
Encoders.encode_base64(part)
part.add_header('Content-Disposition',
'attachment; filename="%s"' % os.path.basename(item))
msg.attach(part)
if attach:
if isinstance(attach, basestring):
attacher(msg, attach)
else:
for item in attach:
attacher(msg, item)
def embed_image(msg, image):
#img = dict(title=u'Picture report', path=u'TESTING.png', cid=cid)
with open(image['path'], 'rb') as file:
msg_image = MIMEImage(file.read(), name=os.path.basename(image['path']))
msg.attach(msg_image)
msg_image.add_header('Content-ID', '<{}>'.format(image['cid']))
if image:
if isinstance(image, dict):
embed_image(msg, image)
else:
for item in image:
embed_image(msg, item)
mailServer = smtplib.SMTP("smtp.gmail.com", 587)
mailServer.ehlo()
mailServer.starttls()
mailServer.ehlo()
mailServer.login('elizabethjbecket@gmail.com','fakeemailpassword')
mailServer.sendmail('elizabethjbecket@gmail.com', to, msg.as_string())
# Should be mailServer.quit(), but that crashes...
mailServer.close()
In [13]:
# Function to make it easy to run queries on this mini-database
def run(query):
results = pd.read_sql("{}".format(query), con).fillna(0)
return results
In [79]:
# actual query
df1 = run('''
SELECT
M.last_name,
first_name || ' ' || last_name as Salesperson,
COUNT(*) as Cars_Sold,
sum(revenue) as Revenue
FROM
sales_table S
JOIN salesman_table M ON S.salesman_id = M.id
GROUP BY
Salesperson
ORDER BY
Cars_Sold DESC
''')
df_html = html_prep(df1[['Salesperson','Cars_Sold', 'Revenue']].to_html(index=False))
HTML(df_html)
Out[79]:
In [80]:
# visualize results
sns.set_style('white')
sns.set_context('notebook')
plot = df1.plot(
x = 'Salesperson',
y = 'Cars_Sold',
kind='barh',
linewidth = 0,
colormap = 'Paired',
grid = False,
xticks = [1,3,5,7,9,11,13,15],
fontsize = 13,
style = 'bms'
)
plot.set_xlabel('Cars Sold')
sns.despine()
image_path = 'test.png'
savefig(image_path, bbox_inches='tight')
new_cid = to_cid(image_path)
image_list.append(to_image(image_path))
In [81]:
top_seller = df1.last_name[0]
In [82]:
top_seller_df = run('''
SELECT
salesman_id,
first_name || ' ' || last_name as Top_Seller,
round(avg(revenue), 2) Rev_per_Sale,
1 - round(avg(revenue/sticker_price), 3) Avg_Customer_Discount,
sum(revenue) Total_Revenue
FROM
sales_table S
JOIN salesman_table M on S.salesman_id = M.id
JOIN car_table C on S.model_id = C.model_id
WHERE
last_name = '{}'
'''.format(top_seller))
In [83]:
all_others = run('''
SELECT
revenue as Rev_per_Sale,
1-revenue/sticker_price as Avg_Customer_Discount
FROM
sales_table S
JOIN salesman_table M on S.salesman_id = M.id
JOIN car_table C on S.model_id = C.model_id
WHERE
last_name != '{}'
'''.format(top_seller))
In [84]:
stdev = all_others.Avg_Customer_Discount.std().round(3)
mean = all_others.Avg_Customer_Discount.mean().round(3)
top_seller_discount = round(top_seller_df.Avg_Customer_Discount.item(), 3)
print('The mean discount given to customers is',
mean,'with a', stdev, 'standard deviation.'
' {} has been giving customers a'.format(top_seller_df.Top_Seller.item()), top_seller_discount ,
'discount to customers, which is', round(top_seller_discount/stdev, 2),
'standard deviations from the mean. Action is advised.')
In [85]:
if top_seller_discount/stdev >= 2:
pass
else:
sys.exit()
In [57]:
stat_report = pd.DataFrame({'{}'.format(top_seller_df.Top_Seller.item()) : pd.Series([top_seller_discount]),
'Mean' : pd.Series([mean]),
'Standard Deviation' : pd.Series([stdev]),
'Deviations from the Mean' : pd.Series([round(top_seller_discount/stdev, 2)])})
In [58]:
cols = ['Jared Case', 'Mean', 'Standard Deviation', 'Deviations from the Mean']
In [59]:
stat_report = stat_report[cols]
In [60]:
report_html = html_prep(stat_report.to_html(index=False))
In [61]:
HTML(report_html)
Out[61]:
In [62]:
# now for a multiple regression analysis
# because data are randomly generated, statistical significance will be extremely low
seller0 = df1.last_name[0]
seller1 = df1.last_name[1]
seller2 = df1.last_name[2]
seller3 = df1.last_name[3]
seller4 = df1.last_name[4]
In [63]:
# query with all factors we want to consider in the regression analysis
sales_table = run('''
SELECT
date,
revenue,
case when payment_type = 'cash' then 1 end Cash_Payment,
case when gender = 'male' then 1 end Male_Customer,
age as Age,
case when make = 'Toyota' then 1 end Toyota,
case when make = 'Subaru' then 1 end Subaru,
case when last_name = '{seller0}' then 1 end '{seller0}',
case when last_name = '{seller1}' then 1 end {seller1},
case when last_name = '{seller2}' then 1 end {seller2},
case when last_name = '{seller3}' then 1 end {seller3},
case when last_name = '{seller4}' then 1 end {seller4}
FROM
sales_table S
JOIN car_table C on S.model_id = C.model_id
JOIN cust_table CUST on CUST.customer_id = S.customer_id
JOIN salesman_table M on S.salesman_id = M.id
'''.format(seller0=seller0, seller1=seller1, seller2=seller2, seller3=seller3,seller4=seller4))
In [64]:
date_df = sales_table.groupby('date').sum()
In [65]:
X = date_df[['Cash_Payment', 'Male_Customer', 'Age', 'Toyota', 'Subaru', '%s' % seller0, '%s' % seller1, '%s' % seller2, '%s' % seller3,
'%s' % seller4]]
y = date_df['revenue']
In [66]:
X = sm.add_constant(X)
results = sm.OLS(y, X).fit()
mr_analysis = html_prep(results.summary().as_html())
In [67]:
HTML(mr_analysis)
Out[67]:
In [68]:
top_seller_id = top_seller_df.salesman_id.item()
In [69]:
customer_bias = run('''
SELECT
CASE WHEN age BETWEEN 18 AND 24 THEN '18-24 years'
WHEN age BETWEEN 25 AND 34 THEN '25-34 years'
WHEN age BETWEEN 35 AND 44 THEN '35-45 years'
WHEN age BETWEEN 45 AND 54 THEN '45-54 years'
WHEN age BETWEEN 55 AND 64 THEN '55-64 years'
END Age_Group,
CASE WHEN gender = 'male' then 1-round(sum(revenue)/sum(sticker_price), 2) end Male_Customer,
CASE WHEN gender = 'female' then 1-round(sum(revenue)/sum(sticker_price), 2) end Female_Customer
FROM
cust_table CUST
JOIN sales_table S on S.customer_id = CUST.customer_id
JOIN car_table C on C.model_id = S.model_id
WHERE
salesman_id = {}
GROUP BY
Age_Group
'''.format(top_seller_id))
In [70]:
customer_bias
Out[70]:
In [71]:
# visualize results
sns.set_style('white')
sns.set_context('notebook')
plot = customer_bias.plot(
x = 'Age_Group',
kind='bar',
linewidth = 0,
colormap = 'Accent',
grid = False,
fontsize = 13,
rot = 30
)
plot.set_xlabel('Avg Customer Discount by Demographic')
sns.despine()
image_path = 'customer.png'
savefig(image_path, bbox_inches='tight')
cust_cid = to_cid(image_path)
image_list.append(to_image(image_path))
In [86]:
# some simple CSS to make the tables look cleaner in the email we'll be sending out
css_header = '<html>\n <head>\n <style>\n table, td, th\n {\n border:2px solid #f7f7f7;\n }\n th\n {\n background-color:#4b63e3;\n color:white;\n }\n table\n {\n border-collapse:collapse;\n }\n td, th\n {\n padding:3px;\n }\n td\n {\n font:12px arial,sans-serif;\n }\n th\n {\n font: 13px arial,sans-serif;\n }\n td\n {\n text-align:center;\n }\n th\n {\n text-align:left;\n }\n </style>\n </head>\n <body>'
css_footer = '</body>\n </html>'
In [87]:
header = '<font size="3"><b>Salesperson Report</b></font> <br>'
cust_discount = '<br><br><font size="2"><b>Negotiation Practices indicated by average customer discounts:</b></font><br>'
mr_title = '<br><br><font size="2"><b>Multiple Regression Analysis:</b></font><br>'
other = '<br><br><font size="2"><b>Total Team Performance:</b></font><br>'
In [88]:
to_email = transform(css_header + header + cust_discount + report_html + cust_cid + mr_title + mr_analysis + other + df_html + new_cid + css_footer)
In [89]:
HTML(to_email)
Out[89]:
In [90]:
mail('sbh355@stern.nyu.edu', 'Automated Report', 'Automated Report', to_email, image=image_list)
In [40]: