A Logistic Regression Example from the book "Principles of Data Science"


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import metrics
%matplotlib inline

In [2]:
# Bike-share records hosted in the justmarkham/DAT8 GitHub repository.
# The CSV is read straight from the raw URL, so this cell needs network access.
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/bikeshare.csv'
bikes = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows to check the columns that were parsed.
bikes.head(n=5)


Out[2]:
datetime season holiday workingday weather temp atemp humidity windspeed casual registered count
0 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81 0.0 3 13 16
1 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80 0.0 8 32 40
2 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80 0.0 5 27 32
3 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75 0.0 3 10 13
4 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75 0.0 0 1 1

In [4]:
# Mean of the `count` column over every row in the dataset. This value is the
# cut-off used to build the binary `above_average` target.
average_bike_rental = bikes.loc[:, 'count'].mean()
print(average_bike_rental)


191.57413191254824

In [5]:
# Binary target: True for rows whose rental count is at or above the overall
# mean, False otherwise. Stored as a new column on `bikes`.
bikes['above_average'] = bikes['count'].ge(average_bike_rental)

# Class balance as fractions rather than raw counts; the share of the larger
# class is the accuracy a "always predict the majority" baseline would get.
bikes.loc[:, 'above_average'].value_counts(normalize=True)


Out[5]:
False    0.599853
True     0.400147
Name: above_average, dtype: float64

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Fixed seed for the train/test shuffle. Without it every run draws a
# different split, so the accuracy reported by this cell is not reproducible
# under Restart & Run All.
RANDOM_STATE = 42

# Single predictor: temperature. Indexing with a list keeps `x` a DataFrame
# (2-D), which is the shape scikit-learn estimators expect for features.
features_cols = ['temp']
x = bikes[features_cols]
y = bikes['above_average']

# Hold out part of the rows for evaluation (scikit-learn's default split
# sizes are used); the seed makes the partition deterministic.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=RANDOM_STATE
)

# Fit a logistic regression with default hyper-parameters on the training rows.
logreg = LogisticRegression()
logreg.fit(x_train, y_train)

# Mean accuracy on the held-out rows. Compare it with the majority-class share
# shown by the earlier `value_counts(normalize=True)` cell to judge how much
# `temp` alone adds over always predicting the larger class.
logreg.score(x_test, y_test)


Out[7]:
0.65319617927994122

In [8]:
# Look at the frame again to confirm the `above_average` column added earlier
# now appears alongside the original columns.
bikes.head(n=5)


Out[8]:
datetime season holiday workingday weather temp atemp humidity windspeed casual registered count above_average
0 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81 0.0 3 13 16 False
1 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80 0.0 8 32 40 False
2 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80 0.0 5 27 32 False
3 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75 0.0 3 10 13 False
4 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75 0.0 0 1 1 False

In [ ]: