notebook.community

Edit and run



In [2]:

    
import pandas as pd
import sqlite3
import statsmodels.api as sm
import numpy as np # might not need this
import seaborn as sns
from pylab import savefig
from IPython.display import HTML
from premailer import transform
import matplotlib.pyplot as plt



In [3]:

    
# all of these imports are for the mail() function
import smtplib
from email.mime.image import MIMEImage
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.text import MIMEText
from email import encoders
import os
from sys import exit



In [4]:

    
#to view graphs in ipython
%matplotlib inline



In [5]:

    
# if this .sqlite db doesn't already exists, this will create it
# if the .sqlite db *does* already exist, this establishes the desired connection
con = sqlite3.connect("sql_sample_db_2.sqlite")



In [6]:

    
# create pandas dataframes from each .csv file:
sales_table = pd.read_csv('https://raw.githubusercontent.com/DaveBackus/Data_Bootcamp/master/Code/SQL/sales_table.csv')
car_table = pd.read_csv('https://raw.githubusercontent.com/DaveBackus/Data_Bootcamp/master/Code/SQL/car_table.csv')
salesman_table = pd.read_csv('https://raw.githubusercontent.com/DaveBackus/Data_Bootcamp/master/Code/SQL/salesman_table.csv')
cust_table = pd.read_csv('https://raw.githubusercontent.com/DaveBackus/Data_Bootcamp/master/Code/SQL/cust_table.csv')



In [7]:

    
# make a list of the tables (dataframes) and table names:
tables = [sales_table, car_table, salesman_table, cust_table]
table_names = ['sales_table', 'car_table', 'salesman_table', 'cust_table']



In [8]:

    
# drop each table name if it already exists to avoid error if you rerun this bit of code
# then add it back (or add it for the first time, if the table didn't already exist)
for i in range(len(tables)):
    table_name = table_names[i]
    table = tables[i]
    con.execute("DROP TABLE IF EXISTS {}".format(table_name))
    pd.io.sql.to_sql(table, "{}".format(table_name), con, index=False)



In [9]:

    
#quick function that's useful to turn things into a comma separated string, useful for SQL
def to_list_string(series):
    try:
        series.iloc[0] + 1
        return ', '.join(["{:.0f}".format(row[1]) for row in series.iteritems()])
    except TypeError:
        return ', '.join(["'" + row[1] + "'" for row in series.iteritems()])



In [10]:

    
# these just provide formatting to enable images to appear in-line in emails
def to_cid(path):
    return '<img src="cid:{}">'.format(path)
def to_image(path):
    return dict(title=u'Graph', path=u'{}'.format(path), cid='{}'.format(path))
image_list = []



In [11]:

    
def html_prep(tables):
    no_underscores = tables.replace('_', ' ')
    no_indents = no_underscores.replace('<UL>', '')
    no_bullet_points = no_indents.replace('<LI>', '')
    no_nans = no_bullet_points.replace('NaN', '0')
    no_nans2 = no_nans.replace('nan', '0')
    no_weird_stuff1 = no_nans2.replace('&lt;', '<')
    no_weird_stuff2 = no_weird_stuff1.replace('&gt;', '>')
    return no_weird_stuff2



In [12]:

    
# makes it easy to send out an email with inline images and attachments
def mail(to, subject, text, html, attach=None, image=None):
    sendMail = True
    
    if type(to) == str:
        tempList = []
        tempList.append(to)
        to = tempList
    elif type(to) == list or type(to) == tuple:
        pass
    else:
        print('Unrecognized address input type')
        sendMail = False
        
    if sendMail == True:
        msg = MIMEMultipart('alternative')

        msg['From'] = 'reports@peanutlabs.com'
        msg['To'] = ','.join(to)
        msg['Subject'] = subject
        

        part1 = MIMEText(text,'plain')
        part2 = MIMEText(html,'html')

        msg.attach(part1)
        msg.attach(part2)

        def attacher(msg, item):
            part = MIMEBase('application', 'octet-stream')
            part.set_payload(open(item, 'rb').read())
            Encoders.encode_base64(part)
            part.add_header('Content-Disposition',
                'attachment; filename="%s"' % os.path.basename(item))
            msg.attach(part)

        if attach:
            if isinstance(attach, basestring):
                attacher(msg, attach)
            else:
                for item in attach:
                    attacher(msg, item)

        def embed_image(msg, image):
            #img = dict(title=u'Picture report', path=u'TESTING.png', cid=cid)
            with open(image['path'], 'rb') as file:
                msg_image = MIMEImage(file.read(), name=os.path.basename(image['path']))
                msg.attach(msg_image)
            
            msg_image.add_header('Content-ID', '<{}>'.format(image['cid']))

        if image:
            if isinstance(image, dict):
                embed_image(msg, image)
            else:
                for item in image:
                    embed_image(msg, item)


        mailServer = smtplib.SMTP("smtp.gmail.com", 587)
        mailServer.ehlo()
        mailServer.starttls()
        mailServer.ehlo()
        mailServer.login('elizabethjbecket@gmail.com','fakeemailpassword')
        mailServer.sendmail('elizabethjbecket@gmail.com', to, msg.as_string())
        # Should be mailServer.quit(), but that crashes...
        mailServer.close()



In [13]:

    
# Function to make it easy to run queries on this mini-database
def run(query):
    results = pd.read_sql("{}".format(query), con).fillna(0)
    return results



In [79]:

    
# actual query
df1 = run('''
    SELECT
        M.last_name,
        first_name ||  ' ' || last_name as Salesperson,
        COUNT(*) as Cars_Sold,
        sum(revenue) as Revenue
    FROM
        sales_table S
        JOIN salesman_table M ON S.salesman_id = M.id
    GROUP BY 
        Salesperson
    ORDER BY
        Cars_Sold DESC
    ''') 
df_html = html_prep(df1[['Salesperson','Cars_Sold', 'Revenue']].to_html(index=False))
HTML(df_html)









    Out[79]:





  
    
      Salesperson
      Cars Sold
      Revenue
    
  
  
    
              Jared Case
       14
       265323
    
    
          Claudine Hatch
       11
       196790
    
    
            Michael Hill
       11
       201558
    
    
          Rosemarie Self
       11
       214085
    
    
              Elton Elzy
       10
       194669
    
    
            Matthew Luna
       10
       213492
    
    
            Joseph Seney
        9
       180020
    
    
       Justin Avellaneda
        9
       173182
    
    
        Samantha Douglas
        8
       153193
    
    
          Kathleen March
        7
       148940



In [80]:

    
# visualize results
sns.set_style('white')
sns.set_context('notebook')
plot = df1.plot(
    x = 'Salesperson',
    y = 'Cars_Sold', 
    kind='barh', 
    linewidth = 0,
    colormap = 'Paired',
    grid = False,
    xticks = [1,3,5,7,9,11,13,15],
    fontsize = 13,
    style = 'bms'
    )
plot.set_xlabel('Cars Sold')
sns.despine()
image_path = 'test.png'
savefig(image_path, bbox_inches='tight')
new_cid = to_cid(image_path)
image_list.append(to_image(image_path))



In [81]:

    
top_seller = df1.last_name[0]



In [82]:

    
top_seller_df = run('''
    SELECT
        salesman_id,
        first_name ||  ' ' || last_name as Top_Seller,
        round(avg(revenue), 2) Rev_per_Sale,
        1 - round(avg(revenue/sticker_price), 3) Avg_Customer_Discount,
        sum(revenue) Total_Revenue
    FROM
        sales_table S
        JOIN salesman_table M on S.salesman_id = M.id
        JOIN car_table C on S.model_id = C.model_id
    WHERE
        last_name = '{}'
'''.format(top_seller))



In [83]:

    
all_others = run('''
    SELECT
        revenue as Rev_per_Sale,
        1-revenue/sticker_price as Avg_Customer_Discount
    FROM
        sales_table S
        JOIN salesman_table M on S.salesman_id = M.id
        JOIN car_table C on S.model_id = C.model_id
    WHERE
        last_name != '{}'
        
'''.format(top_seller))



In [84]:

    
stdev = all_others.Avg_Customer_Discount.std().round(3)
mean = all_others.Avg_Customer_Discount.mean().round(3)
top_seller_discount = round(top_seller_df.Avg_Customer_Discount.item(), 3)
print('The mean discount given to customers is', 
                   mean,'with a', stdev, 'standard deviation.' 
                   ' {} has been giving customers a'.format(top_seller_df.Top_Seller.item()), top_seller_discount ,
                   'discount to customers, which is', round(top_seller_discount/stdev, 2), 
                   'standard deviations from the mean. Action is advised.')









    



The mean discount given to customers is 0.13 with a 0.074 standard deviation. Jared Case has been giving customers a 0.154 discount to customers, which is 2.08 standard deviations from the mean. Action is advised.



In [85]:

    
if top_seller_discount/stdev >= 2:
    pass
else:
    sys.exit()



In [57]:

    
stat_report = pd.DataFrame({'{}'.format(top_seller_df.Top_Seller.item()) : pd.Series([top_seller_discount]),
                           'Mean' : pd.Series([mean]),
                           'Standard Deviation' : pd.Series([stdev]),
                           'Deviations from the Mean' : pd.Series([round(top_seller_discount/stdev, 2)])})



In [58]:

    
cols = ['Jared Case', 'Mean', 'Standard Deviation', 'Deviations from the Mean']



In [59]:

    
stat_report = stat_report[cols]



In [60]:

    
report_html = html_prep(stat_report.to_html(index=False))



In [61]:

    
HTML(report_html)









    Out[61]:





  
    
      Jared Case
      Mean
      Standard Deviation
      Deviations from the Mean
    
  
  
    
       0.154
       0.13
       0.074
       2.08



In [62]:

    
# now for a multiple regression analysis
# because data are randomly generated, statistical significance will be extremely low 
seller0 = df1.last_name[0]
seller1 = df1.last_name[1]
seller2 = df1.last_name[2]
seller3 = df1.last_name[3]
seller4 = df1.last_name[4]



In [63]:

    
# query with all factors we want to consider in the regression analysis
sales_table = run('''
    SELECT 
        date,
        revenue,
        case when payment_type = 'cash' then 1 end Cash_Payment,
        case when gender = 'male' then 1 end Male_Customer,
        age as Age,
        case when make = 'Toyota' then 1 end Toyota,
        case when make = 'Subaru' then 1 end Subaru,
        case when last_name = '{seller0}' then 1 end '{seller0}',
        case when last_name = '{seller1}' then 1 end {seller1},
        case when last_name = '{seller2}' then 1 end {seller2},
        case when last_name = '{seller3}' then 1 end {seller3},
        case when last_name = '{seller4}' then 1 end {seller4}
    FROM
        sales_table S
        JOIN car_table C on S.model_id = C.model_id
        JOIN cust_table CUST on CUST.customer_id = S.customer_id
        JOIN salesman_table M on S.salesman_id = M.id
'''.format(seller0=seller0, seller1=seller1, seller2=seller2, seller3=seller3,seller4=seller4))



In [64]:

    
date_df = sales_table.groupby('date').sum()



In [65]:

    
X = date_df[['Cash_Payment', 'Male_Customer', 'Age', 'Toyota', 'Subaru', '%s' % seller0, '%s' % seller1, '%s' % seller2, '%s' % seller3,
             '%s' % seller4]]
y = date_df['revenue']



In [66]:

    
X = sm.add_constant(X)
results = sm.OLS(y, X).fit()

mr_analysis = html_prep(results.summary().as_html())



In [67]:

    
HTML(mr_analysis)









    Out[67]:





OLS Regression Results

  Dep. Variable:          revenue        R-squared:             0.942


  Model:                    OLS          Adj. R-squared:        0.910


  Method:              Least Squares     F-statistic:           29.18


  Date:              Thu, 05 Mar 2015    Prob (F-statistic):  4.35e-09


  Time:                  12:26:17        Log-Likelihood:      -299.97


  No. Observations:           29         AIC:                   621.9


  Df Residuals:               18         BIC:                   637.0


  Df Model:                   10                                     




                   coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const           6379.7470   4862.512      1.312   0.206  -3836.011  1.66e+04


  Cash Payment    2538.6395   3267.185      0.777   0.447  -4325.462  9402.741


  Male Customer   1374.2896   2225.105      0.618   0.545  -3300.482  6049.062


  Age              213.9769     67.032      3.192   0.005     73.148   354.806


  Toyota          7183.8143   2347.486      3.060   0.007   2251.929  1.21e+04


  Subaru          1.003e+04   3328.096      3.014   0.007   3039.368   1.7e+04


  Case             393.5815   3182.790      0.124   0.903  -6293.212  7080.375


  Hatch          -2661.7417   4604.299     -0.578   0.570  -1.23e+04  7011.532


  Hill            6258.0356   3382.020      1.850   0.081   -847.324  1.34e+04


  Self            2567.4752   4453.775      0.576   0.571  -6789.558  1.19e+04


  Elzy            2310.8865   5592.478      0.413   0.684  -9438.475  1.41e+04




  Omnibus:         6.172    Durbin-Watson:         2.066


  Prob(Omnibus):   0.046    Jarque-Bera (JB):      4.422


  Skew:           -0.879    Prob(JB):              0.110


  Kurtosis:        3.752    Cond. No.               540.



In [68]:

    
top_seller_id = top_seller_df.salesman_id.item()



In [69]:

    
customer_bias = run('''
    SELECT
        CASE WHEN age BETWEEN 18 AND 24 THEN '18-24 years'
             WHEN age BETWEEN 25 AND 34 THEN '25-34 years'
             WHEN age BETWEEN 35 AND 44 THEN '35-45 years'
             WHEN age BETWEEN 45 AND 54 THEN '45-54 years'
             WHEN age BETWEEN 55 AND 64 THEN '55-64 years'
             END Age_Group,
        CASE WHEN gender = 'male' then 1-round(sum(revenue)/sum(sticker_price), 2) end Male_Customer,
        CASE WHEN gender = 'female' then 1-round(sum(revenue)/sum(sticker_price), 2) end Female_Customer
    FROM
        cust_table CUST
        JOIN sales_table S on S.customer_id = CUST.customer_id
        JOIN car_table C on C.model_id = S.model_id
    WHERE
        salesman_id = {}
    GROUP BY
        Age_Group
    '''.format(top_seller_id))



In [70]:

    
customer_bias









    Out[70]:






  
    
      
      Age_Group
      Male_Customer
      Female_Customer
    
  
  
    
      0
       18-24 years
       0.00
       0.24
    
    
      1
       25-34 years
       0.15
       0.00
    
    
      2
       35-45 years
       0.09
       0.00
    
    
      3
       45-54 years
       0.16
       0.00
    
    
      4
       55-64 years
       0.18
       0.00



In [71]:

    
# visualize results
sns.set_style('white')
sns.set_context('notebook')
plot = customer_bias.plot(
    x = 'Age_Group',
    kind='bar', 
    linewidth = 0,
    colormap = 'Accent',
    grid = False,
    fontsize = 13,
    rot = 30
    )
plot.set_xlabel('Avg Customer Discount by Demographic')
sns.despine()
image_path = 'customer.png'
savefig(image_path, bbox_inches='tight')
cust_cid = to_cid(image_path)
image_list.append(to_image(image_path))



In [86]:

    
# some simple CSS to make the tables look cleaner in the email we'll be sending out
css_header = '<html>\n <head>\n <style>\n table, td, th\n {\n border:2px solid #f7f7f7;\n }\n th\n {\n background-color:#4b63e3;\n color:white;\n }\n table\n {\n border-collapse:collapse;\n }\n td, th\n {\n padding:3px;\n }\n td\n {\n font:12px arial,sans-serif;\n }\n th\n {\n font: 13px arial,sans-serif;\n }\n td\n {\n text-align:center;\n }\n th\n {\n text-align:left;\n }\n </style>\n </head>\n <body>'
css_footer = '</body>\n </html>'



In [87]:

    
header = '<font size="3"><b>Salesperson Report</b></font> <br>'
cust_discount = '<br><br><font size="2"><b>Negotiation Practices indicated by average customer discounts:</b></font><br>'
mr_title = '<br><br><font size="2"><b>Multiple Regression Analysis:</b></font><br>'
other = '<br><br><font size="2"><b>Total Team Performance:</b></font><br>'



In [88]:

    
to_email = transform(css_header + header + cust_discount + report_html + cust_cid + mr_title + mr_analysis + other + df_html + new_cid + css_footer)



In [89]:

    
HTML(to_email)









    Out[89]:





 
 
 
Salesperson Report 


Negotiation Practices indicated by average customer discounts:

  
    
      Jared Case
      Mean
      Standard Deviation
      Deviations from the Mean
    
  
  
    
       0.154
       0.13
       0.074
       2.08
    
  



Multiple Regression Analysis:

OLS Regression Results

  Dep. Variable:          revenue        R-squared:             0.942


  Model:                    OLS          Adj. R-squared:        0.910


  Method:              Least Squares     F-statistic:           29.18


  Date:              Thu, 05 Mar 2015    Prob (F-statistic):  4.35e-09


  Time:                  12:26:17        Log-Likelihood:      -299.97


  No. Observations:           29         AIC:                   621.9


  Df Residuals:               18         BIC:                   637.0


  Df Model:                   10                                     




                   coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const           6379.7470   4862.512      1.312   0.206  -3836.011  1.66e+04


  Cash Payment    2538.6395   3267.185      0.777   0.447  -4325.462  9402.741


  Male Customer   1374.2896   2225.105      0.618   0.545  -3300.482  6049.062


  Age              213.9769     67.032      3.192   0.005     73.148   354.806


  Toyota          7183.8143   2347.486      3.060   0.007   2251.929  1.21e+04


  Subaru          1.003e+04   3328.096      3.014   0.007   3039.368   1.7e+04


  Case             393.5815   3182.790      0.124   0.903  -6293.212  7080.375


  Hatch          -2661.7417   4604.299     -0.578   0.570  -1.23e+04  7011.532


  Hill            6258.0356   3382.020      1.850   0.081   -847.324  1.34e+04


  Self            2567.4752   4453.775      0.576   0.571  -6789.558  1.19e+04


  Elzy            2310.8865   5592.478      0.413   0.684  -9438.475  1.41e+04




  Omnibus:         6.172    Durbin-Watson:         2.066


  Prob(Omnibus):   0.046    Jarque-Bera (JB):      4.422


  Skew:           -0.879    Prob(JB):              0.110


  Kurtosis:        3.752    Cond. No.               540.




Total Team Performance:

  
    
      Salesperson
      Cars Sold
      Revenue
    
  
  
    
              Jared Case
       14
       265323
    
    
          Claudine Hatch
       11
       196790
    
    
            Michael Hill
       11
       201558
    
    
          Rosemarie Self
       11
       214085
    
    
              Elton Elzy
       10
       194669
    
    
            Matthew Luna
       10
       213492
    
    
            Joseph Seney
        9
       180020
    
    
       Justin Avellaneda
        9
       173182
    
    
        Samantha Douglas
        8
       153193
    
    
          Kathleen March
        7
       148940



In [90]:

    
mail('sbh355@stern.nyu.edu', 'Automated Report', 'Automated Report', to_email, image=image_list)



In [40]:

Salesperson	Cars Sold	Revenue
Jared Case	14	265323
Claudine Hatch	11	196790
Michael Hill	11	201558
Rosemarie Self	11	214085
Elton Elzy	10	194669
Matthew Luna	10	213492
Joseph Seney	9	180020
Justin Avellaneda	9	173182
Samantha Douglas	8	153193
Kathleen March	7	148940

Dep. Variable:	revenue	R-squared:	0.942
Model:	OLS	Adj. R-squared:	0.910
Method:	Least Squares	F-statistic:	29.18
Date:	Thu, 05 Mar 2015	Prob (F-statistic):	4.35e-09
Time:	12:26:17	Log-Likelihood:	-299.97
No. Observations:	29	AIC:	621.9
Df Residuals:	18	BIC:	637.0
Df Model:	10

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
const	6379.7470	4862.512	1.312	0.206	-3836.011 1.66e+04
Cash Payment	2538.6395	3267.185	0.777	0.447	-4325.462 9402.741
Male Customer	1374.2896	2225.105	0.618	0.545	-3300.482 6049.062
Age	213.9769	67.032	3.192	0.005	73.148 354.806
Toyota	7183.8143	2347.486	3.060	0.007	2251.929 1.21e+04
Subaru	1.003e+04	3328.096	3.014	0.007	3039.368 1.7e+04
Case	393.5815	3182.790	0.124	0.903	-6293.212 7080.375
Hatch	-2661.7417	4604.299	-0.578	0.570	-1.23e+04 7011.532
Hill	6258.0356	3382.020	1.850	0.081	-847.324 1.34e+04
Self	2567.4752	4453.775	0.576	0.571	-6789.558 1.19e+04
Elzy	2310.8865	5592.478	0.413	0.684	-9438.475 1.41e+04

Omnibus:	6.172	Durbin-Watson:	2.066
Prob(Omnibus):	0.046	Jarque-Bera (JB):	4.422
Skew:	-0.879	Prob(JB):	0.110
Kurtosis:	3.752	Cond. No.	540.

	Age_Group	Male_Customer	Female_Customer
0	18-24 years	0.00	0.24
1	25-34 years	0.15	0.00
2	35-45 years	0.09	0.00
3	45-54 years	0.16	0.00
4	55-64 years	0.18	0.00