notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np



In [2]:

    
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



In [3]:

    
# Load the advertising dataset from the CSV file (path is relative to the
# notebook's working directory).
ad_data = pd.read_csv(filepath_or_buffer="advertising.csv")



In [4]:

    
# Preview the first five rows to check the column names and sample values.
ad_data.head(n=5)









    Out[4]:







  
    
      
      Daily Time Spent on Site
      Age
      Area Income
      Daily Internet Usage
      Ad Topic Line
      City
      Male
      Country
      Timestamp
      Clicked on Ad
    
  
  
    
      0
      68.95
      35
      61833.90
      256.09
      Cloned 5thgeneration orchestration
      Wrightburgh
      0
      Tunisia
      2016-03-27 00:53:11
      0
    
    
      1
      80.23
      31
      68441.85
      193.77
      Monitored national standardization
      West Jodi
      1
      Nauru
      2016-04-04 01:39:02
      0
    
    
      2
      69.47
      26
      59785.94
      236.50
      Organic bottom-line service-desk
      Davidton
      0
      San Marino
      2016-03-13 20:35:42
      0
    
    
      3
      74.15
      29
      54806.18
      245.89
      Triple-buffered reciprocal time-frame
      West Terrifurt
      1
      Italy
      2016-01-10 02:31:19
      0
    
    
      4
      68.37
      35
      73889.99
      225.58
      Robust logistical utilization
      South Manuel
      0
      Iceland
      2016-06-03 03:36:18
      0



In [5]:

    
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
ad_data.describe(percentiles=[0.25, 0.5, 0.75])









    Out[5]:







  
    
      
      Daily Time Spent on Site
      Age
      Area Income
      Daily Internet Usage
      Male
      Clicked on Ad
    
  
  
    
      count
      1000.000000
      1000.000000
      1000.000000
      1000.000000
      1000.000000
      1000.00000
    
    
      mean
      65.000200
      36.009000
      55000.000080
      180.000100
      0.481000
      0.50000
    
    
      std
      15.853615
      8.785562
      13414.634022
      43.902339
      0.499889
      0.50025
    
    
      min
      32.600000
      19.000000
      13996.500000
      104.780000
      0.000000
      0.00000
    
    
      25%
      51.360000
      29.000000
      47031.802500
      138.830000
      0.000000
      0.00000
    
    
      50%
      68.215000
      35.000000
      57012.300000
      183.130000
      0.000000
      0.50000
    
    
      75%
      78.547500
      42.000000
      65470.635000
      218.792500
      1.000000
      1.00000
    
    
      max
      91.430000
      61.000000
      79484.800000
      269.960000
      1.000000
      1.00000



In [6]:

    
# Histogram of the "Age" column, split into 30 bins.
age_column = ad_data["Age"]
age_column.hist(bins=30)









    Out[6]:





<matplotlib.axes._subplots.AxesSubplot at 0x1b1f065ae48>



In [7]:

    
# Joint plot of "Area Income" against "Age".
# x and y are given by keyword (as column names of `data`): seaborn >= 0.12
# makes them keyword-only, so the positional form raises a TypeError there.
# The keyword form is also accepted by older seaborn releases.
sns.jointplot(x="Age", y="Area Income", data=ad_data)









    Out[7]:





<seaborn.axisgrid.JointGrid at 0x1b1f067e2b0>



In [8]:

    
# Kernel-density joint plot of "Daily Time Spent on Site" against "Age".
# x and y are given by keyword (as column names of `data`): seaborn >= 0.12
# makes them keyword-only, so the positional form raises a TypeError there.
sns.jointplot(x="Age", y="Daily Time Spent on Site", data=ad_data, kind="kde")









    Out[8]:





<seaborn.axisgrid.JointGrid at 0x1b1f1f7c5c0>



In [9]:

    
# Joint plot of "Daily Time Spent on Site" against "Daily Internet Usage".
# x and y are given by keyword (as column names of `data`): seaborn >= 0.12
# makes them keyword-only, so the positional form raises a TypeError there.
sns.jointplot(x="Daily Internet Usage", y="Daily Time Spent on Site", data=ad_data)









    Out[9]:





<seaborn.axisgrid.JointGrid at 0x1b1f2148b38>



In [10]:

    
# Pairwise plots of the dataset's columns, coloured by the "Clicked on Ad" label.
sns.pairplot(data=ad_data, hue="Clicked on Ad")









    Out[10]:





<seaborn.axisgrid.PairGrid at 0x1b1f2148940>



In [13]:

    
from sklearn.model_selection import train_test_split



In [22]:

    
# Remove the free-text, location and timestamp columns before modelling.
preX = ad_data.drop(columns=["Ad Topic Line", "City", "Country", "Timestamp"])

# Features: everything that is left except the label column.
X = preX.drop(columns="Clicked on Ad")

# Target: the "Clicked on Ad" label column.
y = ad_data.loc[:, "Clicked on Ad"]



In [23]:

    
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)



In [24]:

    
from sklearn.linear_model import LogisticRegression



In [25]:

    
# Pin the hyperparameters shown in the recorded fit output (L2 penalty, C=1.0,
# liblinear solver) instead of relying on library defaults: scikit-learn 0.22
# changed the default solver to lbfgs, so a bare LogisticRegression() no longer
# reproduces the recorded model on current versions.
logreg = LogisticRegression(penalty="l2", C=1.0, solver="liblinear")



In [26]:

    
# Fit the classifier on the training split.
logreg.fit(X=X_train, y=y_train)









    Out[26]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [27]:

    
# Predict the click label for every row of the held-out test split.
predictions = logreg.predict(X=X_test)



In [29]:

    
from sklearn.metrics import classification_report



In [30]:

    
# Per-class precision, recall, F1 and support on the test split.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)









    



             precision    recall  f1-score   support

          0       0.84      0.97      0.90       146
          1       0.96      0.82      0.89       154

avg / total       0.90      0.89      0.89       300



In [ ]:

	Daily Time Spent on Site	Age	Area Income	Daily Internet Usage	Ad Topic Line	City	Male	Country	Timestamp	Clicked on Ad
0	68.95	35	61833.90	256.09	Cloned 5thgeneration orchestration	Wrightburgh	0	Tunisia	2016-03-27 00:53:11	0
1	80.23	31	68441.85	193.77	Monitored national standardization	West Jodi	1	Nauru	2016-04-04 01:39:02	0
2	69.47	26	59785.94	236.50	Organic bottom-line service-desk	Davidton	0	San Marino	2016-03-13 20:35:42	0
3	74.15	29	54806.18	245.89	Triple-buffered reciprocal time-frame	West Terrifurt	1	Italy	2016-01-10 02:31:19	0
4	68.37	35	73889.99	225.58	Robust logistical utilization	South Manuel	0	Iceland	2016-06-03 03:36:18	0

	Daily Time Spent on Site	Age	Area Income	Daily Internet Usage	Male	Clicked on Ad
count	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.00000
mean	65.000200	36.009000	55000.000080	180.000100	0.481000	0.50000
std	15.853615	8.785562	13414.634022	43.902339	0.499889	0.50025
min	32.600000	19.000000	13996.500000	104.780000	0.000000	0.00000
25%	51.360000	29.000000	47031.802500	138.830000	0.000000	0.00000
50%	68.215000	35.000000	57012.300000	183.130000	0.000000	0.50000
75%	78.547500	42.000000	65470.635000	218.792500	1.000000	1.00000
max	91.430000	61.000000	79484.800000	269.960000	1.000000	1.00000