A Logistic Regression Example from the book "Principles of Data Science"



In [1]:

    
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import metrics
%matplotlib inline



In [2]:

    
# Load the bike-share dataset directly from the DAT8 course repository on GitHub.
# NOTE(review): the URL tracks the `master` branch, so the file could change
# upstream — consider pinning a commit hash for reproducibility.
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/bikeshare.csv'
bikes = pd.read_csv(url)
# Preview the first rows to check that the columns parsed as expected.
bikes.head()









    Out[2]:






  
    
      
      datetime
      season
      holiday
      workingday
      weather
      temp
      atemp
      humidity
      windspeed
      casual
      registered
      count
    
  
  
    
      0
      2011-01-01 00:00:00
      1
      0
      0
      1
      9.84
      14.395
      81
      0.0
      3
      13
      16
    
    
      1
      2011-01-01 01:00:00
      1
      0
      0
      1
      9.02
      13.635
      80
      0.0
      8
      32
      40
    
    
      2
      2011-01-01 02:00:00
      1
      0
      0
      1
      9.02
      13.635
      80
      0.0
      5
      27
      32
    
    
      3
      2011-01-01 03:00:00
      1
      0
      0
      1
      9.84
      14.395
      75
      0.0
      3
      10
      13
    
    
      4
      2011-01-01 04:00:00
      1
      0
      0
      1
      9.84
      14.395
      75
      0.0
      0
      1
      1



In [4]:

    
# Mean of the `count` column; the following cell uses it as the cut-off for the
# binary `above_average` target.
average_bike_rental = bikes['count'].mean()
print(average_bike_rental)









    



191.57413191254824



In [5]:

    
# Binary target: True for rows whose rental count is at or above the overall mean.
bikes['above_average'] = bikes['count'].ge(average_bike_rental)
# Class balance as proportions; the larger share is the accuracy obtained by
# always predicting the majority class.
bikes['above_average'].value_counts(normalize=True)









    Out[5]:





False    0.599853
True     0.400147
Name: above_average, dtype: float64



In [7]:

    
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Predict the binary `above_average` target from temperature alone.
features_cols = ['temp']
x = bikes[features_cols]
y = bikes['above_average']

# Fix the shuffle seed: without `random_state` every run draws a different
# train/test split, so the reported accuracy cannot be reproduced.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

logreg = LogisticRegression()
logreg.fit(x_train, y_train)

# Accuracy on the held-out split. Judge it against the ~0.60 obtained by always
# predicting the majority class (False), per the class proportions of
# `above_average` computed earlier in the notebook.
logreg.score(x_test, y_test)









    Out[7]:





0.65319617927994122



In [8]:

    
# Re-inspect the frame to confirm the derived `above_average` column is present.
bikes.head()









    Out[8]:






  
    
      
      datetime
      season
      holiday
      workingday
      weather
      temp
      atemp
      humidity
      windspeed
      casual
      registered
      count
      above_average
    
  
  
    
      0
      2011-01-01 00:00:00
      1
      0
      0
      1
      9.84
      14.395
      81
      0.0
      3
      13
      16
      False
    
    
      1
      2011-01-01 01:00:00
      1
      0
      0
      1
      9.02
      13.635
      80
      0.0
      8
      32
      40
      False
    
    
      2
      2011-01-01 02:00:00
      1
      0
      0
      1
      9.02
      13.635
      80
      0.0
      5
      27
      32
      False
    
    
      3
      2011-01-01 03:00:00
      1
      0
      0
      1
      9.84
      14.395
      75
      0.0
      3
      10
      13
      False
    
    
      4
      2011-01-01 04:00:00
      1
      0
      0
      1
      9.84
      14.395
      75
      0.0
      0
      1
      1
      False



In [ ]:

	datetime	season	weather	temp	atemp	humidity	casual	registered	count
0	2011-01-01 00:00:00	1	1	9.84	14.395	81	3	13	16
1	2011-01-01 01:00:00	1	1	9.02	13.635	80	8	32	40
2	2011-01-01 02:00:00	1	1	9.02	13.635	80	5	27	32
3	2011-01-01 03:00:00	1	1	9.84	14.395	75	3	10	13
4	2011-01-01 04:00:00	1	1	9.84	14.395	75	0	1	1