In [2]:
import pandas as pd
import sqlite3
import statsmodels.api as sm
import numpy as np # might not need this
import seaborn as sns
from pylab import savefig
from IPython.display import HTML
from premailer import transform
import matplotlib.pyplot as plt

In [3]:
# all of these imports are for the mail() function
import smtplib
from email.mime.image import MIMEImage
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.text import MIMEText
from email import encoders
import os
from sys import exit

In [4]:
#to view graphs in ipython
%matplotlib inline

In [5]:
# if this .sqlite db doesn't already exists, this will create it
# if the .sqlite db *does* already exist, this establishes the desired connection
con = sqlite3.connect("sql_sample_db_2.sqlite")

In [6]:
# create pandas dataframes from each .csv file:
sales_table = pd.read_csv('https://raw.githubusercontent.com/DaveBackus/Data_Bootcamp/master/Code/SQL/sales_table.csv')
car_table = pd.read_csv('https://raw.githubusercontent.com/DaveBackus/Data_Bootcamp/master/Code/SQL/car_table.csv')
salesman_table = pd.read_csv('https://raw.githubusercontent.com/DaveBackus/Data_Bootcamp/master/Code/SQL/salesman_table.csv')
cust_table = pd.read_csv('https://raw.githubusercontent.com/DaveBackus/Data_Bootcamp/master/Code/SQL/cust_table.csv')

In [7]:
# make a list of the tables (dataframes) and table names:
tables = [sales_table, car_table, salesman_table, cust_table]
table_names = ['sales_table', 'car_table', 'salesman_table', 'cust_table']

In [8]:
# drop each table name if it already exists to avoid error if you rerun this bit of code
# then add it back (or add it for the first time, if the table didn't already exist)
for i in range(len(tables)):
    table_name = table_names[i]
    table = tables[i]
    con.execute("DROP TABLE IF EXISTS {}".format(table_name))
    pd.io.sql.to_sql(table, "{}".format(table_name), con, index=False)

In [9]:
#quick function that's useful to turn things into a comma separated string, useful for SQL
def to_list_string(series):
    try:
        series.iloc[0] + 1
        return ', '.join(["{:.0f}".format(row[1]) for row in series.iteritems()])
    except TypeError:
        return ', '.join(["'" + row[1] + "'" for row in series.iteritems()])

In [10]:
# these just provide formatting to enable images to appear in-line in emails
def to_cid(path):
    return '<img src="cid:{}">'.format(path)
def to_image(path):
    return dict(title=u'Graph', path=u'{}'.format(path), cid='{}'.format(path))
image_list = []

In [11]:
def html_prep(tables):
    no_underscores = tables.replace('_', ' ')
    no_indents = no_underscores.replace('<UL>', '')
    no_bullet_points = no_indents.replace('<LI>', '')
    no_nans = no_bullet_points.replace('NaN', '0')
    no_nans2 = no_nans.replace('nan', '0')
    no_weird_stuff1 = no_nans2.replace('&lt;', '<')
    no_weird_stuff2 = no_weird_stuff1.replace('&gt;', '>')
    return no_weird_stuff2

In [12]:
# makes it easy to send out an email with inline images and attachments
def mail(to, subject, text, html, attach=None, image=None):
    sendMail = True
    
    if type(to) == str:
        tempList = []
        tempList.append(to)
        to = tempList
    elif type(to) == list or type(to) == tuple:
        pass
    else:
        print('Unrecognized address input type')
        sendMail = False
        
    if sendMail == True:
        msg = MIMEMultipart('alternative')

        msg['From'] = 'reports@peanutlabs.com'
        msg['To'] = ','.join(to)
        msg['Subject'] = subject
        

        part1 = MIMEText(text,'plain')
        part2 = MIMEText(html,'html')

        msg.attach(part1)
        msg.attach(part2)

        def attacher(msg, item):
            part = MIMEBase('application', 'octet-stream')
            part.set_payload(open(item, 'rb').read())
            Encoders.encode_base64(part)
            part.add_header('Content-Disposition',
                'attachment; filename="%s"' % os.path.basename(item))
            msg.attach(part)

        if attach:
            if isinstance(attach, basestring):
                attacher(msg, attach)
            else:
                for item in attach:
                    attacher(msg, item)

        def embed_image(msg, image):
            #img = dict(title=u'Picture report', path=u'TESTING.png', cid=cid)
            with open(image['path'], 'rb') as file:
                msg_image = MIMEImage(file.read(), name=os.path.basename(image['path']))
                msg.attach(msg_image)
            
            msg_image.add_header('Content-ID', '<{}>'.format(image['cid']))

        if image:
            if isinstance(image, dict):
                embed_image(msg, image)
            else:
                for item in image:
                    embed_image(msg, item)


        mailServer = smtplib.SMTP("smtp.gmail.com", 587)
        mailServer.ehlo()
        mailServer.starttls()
        mailServer.ehlo()
        mailServer.login('elizabethjbecket@gmail.com','fakeemailpassword')
        mailServer.sendmail('elizabethjbecket@gmail.com', to, msg.as_string())
        # Should be mailServer.quit(), but that crashes...
        mailServer.close()

In [13]:
# Function to make it easy to run queries on this mini-database
def run(query):
    results = pd.read_sql("{}".format(query), con).fillna(0)
    return results

In [79]:
# actual query
df1 = run('''
    SELECT
        M.last_name,
        first_name ||  ' ' || last_name as Salesperson,
        COUNT(*) as Cars_Sold,
        sum(revenue) as Revenue
    FROM
        sales_table S
        JOIN salesman_table M ON S.salesman_id = M.id
    GROUP BY 
        Salesperson
    ORDER BY
        Cars_Sold DESC
    ''') 
df_html = html_prep(df1[['Salesperson','Cars_Sold', 'Revenue']].to_html(index=False))
HTML(df_html)


Out[79]:
Salesperson Cars Sold Revenue
Jared Case 14 265323
Claudine Hatch 11 196790
Michael Hill 11 201558
Rosemarie Self 11 214085
Elton Elzy 10 194669
Matthew Luna 10 213492
Joseph Seney 9 180020
Justin Avellaneda 9 173182
Samantha Douglas 8 153193
Kathleen March 7 148940

In [80]:
# visualize results
sns.set_style('white')
sns.set_context('notebook')
plot = df1.plot(
    x = 'Salesperson',
    y = 'Cars_Sold', 
    kind='barh', 
    linewidth = 0,
    colormap = 'Paired',
    grid = False,
    xticks = [1,3,5,7,9,11,13,15],
    fontsize = 13,
    style = 'bms'
    )
plot.set_xlabel('Cars Sold')
sns.despine()
image_path = 'test.png'
savefig(image_path, bbox_inches='tight')
new_cid = to_cid(image_path)
image_list.append(to_image(image_path))



In [81]:
top_seller = df1.last_name[0]

In [82]:
top_seller_df = run('''
    SELECT
        salesman_id,
        first_name ||  ' ' || last_name as Top_Seller,
        round(avg(revenue), 2) Rev_per_Sale,
        1 - round(avg(revenue/sticker_price), 3) Avg_Customer_Discount,
        sum(revenue) Total_Revenue
    FROM
        sales_table S
        JOIN salesman_table M on S.salesman_id = M.id
        JOIN car_table C on S.model_id = C.model_id
    WHERE
        last_name = '{}'
'''.format(top_seller))

In [83]:
all_others = run('''
    SELECT
        revenue as Rev_per_Sale,
        1-revenue/sticker_price as Avg_Customer_Discount
    FROM
        sales_table S
        JOIN salesman_table M on S.salesman_id = M.id
        JOIN car_table C on S.model_id = C.model_id
    WHERE
        last_name != '{}'
        
'''.format(top_seller))

In [84]:
stdev = all_others.Avg_Customer_Discount.std().round(3)
mean = all_others.Avg_Customer_Discount.mean().round(3)
top_seller_discount = round(top_seller_df.Avg_Customer_Discount.item(), 3)
print('The mean discount given to customers is', 
                   mean,'with a', stdev, 'standard deviation.' 
                   ' {} has been giving customers a'.format(top_seller_df.Top_Seller.item()), top_seller_discount ,
                   'discount to customers, which is', round(top_seller_discount/stdev, 2), 
                   'standard deviations from the mean. Action is advised.')


The mean discount given to customers is 0.13 with a 0.074 standard deviation. Jared Case has been giving customers a 0.154 discount to customers, which is 2.08 standard deviations from the mean. Action is advised.

In [85]:
if top_seller_discount/stdev >= 2:
    pass
else:
    sys.exit()

In [57]:
stat_report = pd.DataFrame({'{}'.format(top_seller_df.Top_Seller.item()) : pd.Series([top_seller_discount]),
                           'Mean' : pd.Series([mean]),
                           'Standard Deviation' : pd.Series([stdev]),
                           'Deviations from the Mean' : pd.Series([round(top_seller_discount/stdev, 2)])})

In [58]:
cols = ['Jared Case', 'Mean', 'Standard Deviation', 'Deviations from the Mean']

In [59]:
stat_report = stat_report[cols]

In [60]:
report_html = html_prep(stat_report.to_html(index=False))

In [61]:
HTML(report_html)


Out[61]:
Jared Case Mean Standard Deviation Deviations from the Mean
0.154 0.13 0.074 2.08

In [62]:
# now for a multiple regression analysis
# because data are randomly generated, statistical significance will be extremely low 
seller0 = df1.last_name[0]
seller1 = df1.last_name[1]
seller2 = df1.last_name[2]
seller3 = df1.last_name[3]
seller4 = df1.last_name[4]

In [63]:
# query with all factors we want to consider in the regression analysis
sales_table = run('''
    SELECT 
        date,
        revenue,
        case when payment_type = 'cash' then 1 end Cash_Payment,
        case when gender = 'male' then 1 end Male_Customer,
        age as Age,
        case when make = 'Toyota' then 1 end Toyota,
        case when make = 'Subaru' then 1 end Subaru,
        case when last_name = '{seller0}' then 1 end '{seller0}',
        case when last_name = '{seller1}' then 1 end {seller1},
        case when last_name = '{seller2}' then 1 end {seller2},
        case when last_name = '{seller3}' then 1 end {seller3},
        case when last_name = '{seller4}' then 1 end {seller4}
    FROM
        sales_table S
        JOIN car_table C on S.model_id = C.model_id
        JOIN cust_table CUST on CUST.customer_id = S.customer_id
        JOIN salesman_table M on S.salesman_id = M.id
'''.format(seller0=seller0, seller1=seller1, seller2=seller2, seller3=seller3,seller4=seller4))

In [64]:
date_df = sales_table.groupby('date').sum()

In [65]:
X = date_df[['Cash_Payment', 'Male_Customer', 'Age', 'Toyota', 'Subaru', '%s' % seller0, '%s' % seller1, '%s' % seller2, '%s' % seller3,
             '%s' % seller4]]
y = date_df['revenue']

In [66]:
X = sm.add_constant(X)
results = sm.OLS(y, X).fit()

mr_analysis = html_prep(results.summary().as_html())

In [67]:
HTML(mr_analysis)


Out[67]:
OLS Regression Results
Dep. Variable: revenue R-squared: 0.942
Model: OLS Adj. R-squared: 0.910
Method: Least Squares F-statistic: 29.18
Date: Thu, 05 Mar 2015 Prob (F-statistic): 4.35e-09
Time: 12:26:17 Log-Likelihood: -299.97
No. Observations: 29 AIC: 621.9
Df Residuals: 18 BIC: 637.0
Df Model: 10
coef std err t P>|t| [95.0% Conf. Int.]
const 6379.7470 4862.512 1.312 0.206 -3836.011 1.66e+04
Cash Payment 2538.6395 3267.185 0.777 0.447 -4325.462 9402.741
Male Customer 1374.2896 2225.105 0.618 0.545 -3300.482 6049.062
Age 213.9769 67.032 3.192 0.005 73.148 354.806
Toyota 7183.8143 2347.486 3.060 0.007 2251.929 1.21e+04
Subaru 1.003e+04 3328.096 3.014 0.007 3039.368 1.7e+04
Case 393.5815 3182.790 0.124 0.903 -6293.212 7080.375
Hatch -2661.7417 4604.299 -0.578 0.570 -1.23e+04 7011.532
Hill 6258.0356 3382.020 1.850 0.081 -847.324 1.34e+04
Self 2567.4752 4453.775 0.576 0.571 -6789.558 1.19e+04
Elzy 2310.8865 5592.478 0.413 0.684 -9438.475 1.41e+04
Omnibus: 6.172 Durbin-Watson: 2.066
Prob(Omnibus): 0.046 Jarque-Bera (JB): 4.422
Skew: -0.879 Prob(JB): 0.110
Kurtosis: 3.752 Cond. No. 540.

In [68]:
top_seller_id = top_seller_df.salesman_id.item()

In [69]:
customer_bias = run('''
    SELECT
        CASE WHEN age BETWEEN 18 AND 24 THEN '18-24 years'
             WHEN age BETWEEN 25 AND 34 THEN '25-34 years'
             WHEN age BETWEEN 35 AND 44 THEN '35-45 years'
             WHEN age BETWEEN 45 AND 54 THEN '45-54 years'
             WHEN age BETWEEN 55 AND 64 THEN '55-64 years'
             END Age_Group,
        CASE WHEN gender = 'male' then 1-round(sum(revenue)/sum(sticker_price), 2) end Male_Customer,
        CASE WHEN gender = 'female' then 1-round(sum(revenue)/sum(sticker_price), 2) end Female_Customer
    FROM
        cust_table CUST
        JOIN sales_table S on S.customer_id = CUST.customer_id
        JOIN car_table C on C.model_id = S.model_id
    WHERE
        salesman_id = {}
    GROUP BY
        Age_Group
    '''.format(top_seller_id))

In [70]:
customer_bias


Out[70]:
Age_Group Male_Customer Female_Customer
0 18-24 years 0.00 0.24
1 25-34 years 0.15 0.00
2 35-45 years 0.09 0.00
3 45-54 years 0.16 0.00
4 55-64 years 0.18 0.00

In [71]:
# visualize results
sns.set_style('white')
sns.set_context('notebook')
plot = customer_bias.plot(
    x = 'Age_Group',
    kind='bar', 
    linewidth = 0,
    colormap = 'Accent',
    grid = False,
    fontsize = 13,
    rot = 30
    )
plot.set_xlabel('Avg Customer Discount by Demographic')
sns.despine()
image_path = 'customer.png'
savefig(image_path, bbox_inches='tight')
cust_cid = to_cid(image_path)
image_list.append(to_image(image_path))



In [86]:
# some simple CSS to make the tables look cleaner in the email we'll be sending out
css_header = '<html>\n <head>\n <style>\n table, td, th\n {\n border:2px solid #f7f7f7;\n }\n th\n {\n background-color:#4b63e3;\n color:white;\n }\n table\n {\n border-collapse:collapse;\n }\n td, th\n {\n padding:3px;\n }\n td\n {\n font:12px arial,sans-serif;\n }\n th\n {\n font: 13px arial,sans-serif;\n }\n td\n {\n text-align:center;\n }\n th\n {\n text-align:left;\n }\n </style>\n </head>\n <body>'
css_footer = '</body>\n </html>'

In [87]:
header = '<font size="3"><b>Salesperson Report</b></font> <br>'
cust_discount = '<br><br><font size="2"><b>Negotiation Practices indicated by average customer discounts:</b></font><br>'
mr_title = '<br><br><font size="2"><b>Multiple Regression Analysis:</b></font><br>'
other = '<br><br><font size="2"><b>Total Team Performance:</b></font><br>'

In [88]:
to_email = transform(css_header + header + cust_discount + report_html + cust_cid + mr_title + mr_analysis + other + df_html + new_cid + css_footer)

In [89]:
HTML(to_email)


Out[89]:
Salesperson Report


Negotiation Practices indicated by average customer discounts:
Jared Case Mean Standard Deviation Deviations from the Mean
0.154 0.13 0.074 2.08


Multiple Regression Analysis:
OLS Regression Results
Dep. Variable: revenue R-squared: 0.942
Model: OLS Adj. R-squared: 0.910
Method: Least Squares F-statistic: 29.18
Date: Thu, 05 Mar 2015 Prob (F-statistic): 4.35e-09
Time: 12:26:17 Log-Likelihood: -299.97
No. Observations: 29 AIC: 621.9
Df Residuals: 18 BIC: 637.0
Df Model: 10
coef std err t P>|t| [95.0% Conf. Int.]
const 6379.7470 4862.512 1.312 0.206 -3836.011 1.66e+04
Cash Payment 2538.6395 3267.185 0.777 0.447 -4325.462 9402.741
Male Customer 1374.2896 2225.105 0.618 0.545 -3300.482 6049.062
Age 213.9769 67.032 3.192 0.005 73.148 354.806
Toyota 7183.8143 2347.486 3.060 0.007 2251.929 1.21e+04
Subaru 1.003e+04 3328.096 3.014 0.007 3039.368 1.7e+04
Case 393.5815 3182.790 0.124 0.903 -6293.212 7080.375
Hatch -2661.7417 4604.299 -0.578 0.570 -1.23e+04 7011.532
Hill 6258.0356 3382.020 1.850 0.081 -847.324 1.34e+04
Self 2567.4752 4453.775 0.576 0.571 -6789.558 1.19e+04
Elzy 2310.8865 5592.478 0.413 0.684 -9438.475 1.41e+04
Omnibus: 6.172 Durbin-Watson: 2.066
Prob(Omnibus): 0.046 Jarque-Bera (JB): 4.422
Skew: -0.879 Prob(JB): 0.110
Kurtosis: 3.752 Cond. No. 540.


Total Team Performance:
Salesperson Cars Sold Revenue
Jared Case 14 265323
Claudine Hatch 11 196790
Michael Hill 11 201558
Rosemarie Self 11 214085
Elton Elzy 10 194669
Matthew Luna 10 213492
Joseph Seney 9 180020
Justin Avellaneda 9 173182
Samantha Douglas 8 153193
Kathleen March 7 148940

In [90]:
mail('sbh355@stern.nyu.edu', 'Automated Report', 'Automated Report', to_email, image=image_list)

In [40]: