In [1]:
from IPython.display import HTML
HTML('<iframe src=http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data width=300 height=200></iframe>')
Out[1]:
In [2]:
# import load_iris function from datasets module
from sklearn.datasets import load_iris
# save "bunch" object containing iris dataset and its attributes
iris = load_iris()
type(iris)
Out[2]:
In [3]:
# print the first 7 rows of the iris data
print iris.data[:7]
In [4]:
# print the names of the four features
print "names:"
print iris.feature_names
print "Y catgorical data names:"
# print the encoding scheme for species: 0 = setosa, 1 = versicolor, 2 = virginica
print iris.target_names
print "y:"
# print integers representing the species of each observation
print iris.target
In [5]:
# check the types of the features and response
print type(iris.data)
print type(iris.target)
In [6]:
# check the shape of the features (first dimension = number of observations, second dimension = number of features)
print iris.data.shape
# check the shape of the response (single dimension matching the number of observations)
print iris.target.shape
In [7]:
# store feature matrix in "X"
X = iris.data
# store response vector in "y"
y = iris.target
# print the shapes of X and y
print X.shape
print y.shape
In [8]:
from sklearn.neighbors import KNeighborsClassifier
# instantiate the classifier (using the value K=1)
knn = KNeighborsClassifier(n_neighbors=1)
print knn
In [9]:
knn.fit(X,y)
Out[9]:
Predict the response for new observations
In [10]:
# new observations are passed as a list of rows (one row per observation, four features each)
X_new = [[3, 5, 4, 2], [5, 4, 3, 2]]
knn.predict(X_new)
Out[10]:
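The classifier returns the encoded class integers. A minimal sketch (assuming `knn`, `X_new`, and the `iris` bunch from the cells above are still in scope) that maps those integers back to species names:
In [ ]:
# translate predicted class integers into species names via the target_names array
pred = knn.predict(X_new)
print(iris.target_names[pred])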
In [11]:
# instantiate the model (using the value K=5)
knn = KNeighborsClassifier(n_neighbors=5)
# fit the model with data
knn.fit(X, y)
# predict the response for new observations
knn.predict(X_new)
Out[11]:
In [12]:
# import the class
from sklearn.linear_model import LogisticRegression
# instantiate the model (using the default parameters)
logreg = LogisticRegression()
# fit the model with data
logreg.fit(X, y)
# predict the response for new observations
logreg.predict(X_new)
Out[12]:
In [13]:
# import the class
from sklearn.linear_model import LogisticRegression
# instantiate the model (using the default parameters)
logreg = LogisticRegression()
# fit the model with data
logreg.fit(X, y)
# predict the response values for the observations in X
print("Printing prediction...")
print(logreg.predict(X))
# store the predicted response values
y_pred = logreg.predict(X)
# check how many predictions were generated
print("size train: " + str(len(y_pred)))
# compute classification accuracy for the logistic regression model
from sklearn import metrics
## accuracy = number of correct predictions / total number of predictions
print metrics.accuracy_score(y, y_pred)
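As a sanity check, accuracy is simply the fraction of observations whose predicted label matches the true label. A minimal sketch (assuming `y` and `y_pred` from the cell above are still in scope):
In [ ]:
import numpy as np
# accuracy by hand: number of correct predictions divided by total observations
print(np.mean(y == y_pred))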
In [14]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
y_pred = knn.predict(X)
print metrics.accuracy_score(y, y_pred)
In [15]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X, y)
y_pred = knn.predict(X)
print metrics.accuracy_score(y, y_pred)
print(y)
print(y_pred)
In [16]:
# print the shapes of X and y
print X.shape
print y.shape
# STEP 1: split X and y into training and testing sets
from sklearn.cross_validation import train_test_split
## random_state is the seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)
print("train size: " + str(y_train.shape))
print("test size: " + str(y_test.shape))
In [17]:
# STEP 2: train the model on the training set
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
# STEP 3: make predictions on the testing set
y_pred = logreg.predict(X_test)
# compare actual response values (y_test) with predicted response values (y_pred)
print metrics.accuracy_score(y_test, y_pred)
In [18]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print metrics.accuracy_score(y_test, y_pred)
In [19]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print metrics.accuracy_score(y_test, y_pred)
In [20]:
# try K=1 through K=25 and record testing accuracy
k_range = range(1, 26)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, y_pred))
# import Matplotlib (scientific plotting library)
import matplotlib.pyplot as plt
# allow plots to appear within the notebook
%matplotlib inline
# plot the relationship between K and testing accuracy
plt.plot(k_range, scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Testing Accuracy')
Out[20]:
In [21]:
# instantiate the model with the best known parameters
knn = KNeighborsClassifier(n_neighbors=11)
# train the model with X and y (not X_train and y_train)
knn.fit(X, y)
# make a prediction for an out-of-sample observation
knn.predict([[3, 5, 4, 2]])
Out[21]:
In [22]:
# use the %cd magic so the directory change persists (each ! command runs in its own subshell)
%cd /home/leandroohf/Documents/kaggle/python_tutorial
!pwd
In [23]:
# conventional way to import pandas
import pandas as pd
# read CSV file directly from a URL and save the results
data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
# display the first 5 rows
data.head()
Out[23]:
In [24]:
# display the last 5 rows
data.tail()
Out[24]:
In [25]:
# check the shape of the DataFrame (rows, columns)
data.shape
Out[25]:
In [26]:
# conventional way to import seaborn
import seaborn as sns
# allow plots to appear within the notebook
%matplotlib inline
# visualize the relationship between the features and the response using scatterplots
sns.pairplot(data, x_vars=['TV','Radio','Newspaper'], y_vars='Sales', size=7, aspect=0.7, kind='reg')
Out[26]:
In [27]:
# create a Python list of feature names
feature_cols = ['TV', 'Radio', 'Newspaper']
# use the list to select a subset of the original DataFrame
X = data[feature_cols]
# equivalent command to do this in one line
X = data[['TV', 'Radio', 'Newspaper']]
# print the first 5 rows
X.head()
Out[27]:
In [28]:
# check the type and shape of X
print type(X)
print X.shape
# select a Series from the DataFrame
y = data['Sales']
# equivalent command that works if there are no spaces in the column name
y = data.Sales
# print the first 5 values
y.head()
Out[28]:
In [29]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
In [30]:
# import model
from sklearn.linear_model import LinearRegression
# instantiate
linreg = LinearRegression()
# fit the model to the training data (learn the coefficients)
linreg.fit(X_train, y_train)
Out[30]:
In [31]:
# print the intercept and coefficients
print linreg.intercept_
print linreg.coef_
# pair the feature names with the coefficients
zip(feature_cols, linreg.coef_)
Out[31]:
In [32]:
# make predictions on the testing set
y_pred = linreg.predict(X_test)
print(y_pred)
In [33]:
## Understanding regression error metrics in Python
# define true and predicted response values
true = [100, 50, 30, 20]
pred = [90, 50, 50, 30]
# MAE: Mean Absolute Error
print("MAE: Mean abs error:")
# calculate MAE by hand.
print (10 + 0 + 20 + 10)/4.
# calculate MAE using scikit-learn
from sklearn import metrics
print metrics.mean_absolute_error(true, pred)
print("MSE:")
# calculate MSE by hand
print (10**2 + 0**2 + 20**2 + 10**2)/4.
# calculate MSE using scikit-learn
print metrics.mean_squared_error(true, pred)
print("RMSE:")
# calculate RMSE by hand
import numpy as np
print np.sqrt((10**2 + 0**2 + 20**2 + 10**2)/4.)
# calculate RMSE using scikit-learn
print np.sqrt(metrics.mean_squared_error(true, pred))
In [38]:
# RMSE for the Sales predictions on the testing set
print np.sqrt(metrics.mean_squared_error(y_test, y_pred))
In [34]:
# the ggplot package bundles example datasets such as diamonds and meat
from ggplot import *
diamonds.head()
Out[34]:
In [35]:
p = ggplot(aes(x='date', y='beef'), data=meat)
p = p + geom_point() + stat_smooth(color='blue')
p
Out[35]:
In [46]:
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
# read in the iris data
iris = load_iris()
# create X (features) and y (response)
X = iris.data
y = iris.target
# use train/test split (try different random_state values to see how the accuracy estimate changes)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)
# check classification accuracy of KNN with K=5
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print metrics.accuracy_score(y_test, y_pred)
In [53]:
# simulate splitting a dataset of 25 observations into 5 folds
from sklearn.cross_validation import KFold
kf = KFold(25, n_folds=5, shuffle=False)
# print the contents of each training and testing set
print '{} {:^61} {}'.format('Iteration', 'Training set observations', 'Testing set observations')
for iteration, data in enumerate(kf, start=1):
    print '{:^9} {} {:^25}'.format(iteration, data[0], data[1])
1. K can be any number, but K=10 is generally recommended
2. For classification problems, stratified sampling is recommended for creating the folds (see the sketch after this list)
* Each response class should be represented with equal proportions in each of the K folds
* scikit-learn's cross_val_score function does this by default
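A minimal sketch of what stratification looks like, using the same pre-0.18 `sklearn.cross_validation` API as above and assuming `y` is still the iris target array:
In [ ]:
import numpy as np
from sklearn.cross_validation import StratifiedKFold
# each fold preserves the class proportions of y (here: 10 observations of each species per fold)
skf = StratifiedKFold(y, n_folds=5, shuffle=False)
for train_index, test_index in skf:
    print(np.bincount(y[test_index]))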
In [52]:
from sklearn.cross_validation import cross_val_score
# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print "Scores:"
print scores
# use average accuracy as an estimate of out-of-sample accuracy
print "Mean scores:"
print scores.mean()
In [60]:
# search for an optimal value of K for KNN
k_range = range(1, 31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())
import matplotlib.pyplot as plt
%matplotlib inline
# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
Out[60]:
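To read a good K off the curve programmatically rather than by eye, one possibility is to list every K that ties for the best mean accuracy (a sketch assuming `k_range` and `k_scores` from the cell above):
In [ ]:
# list every K whose mean cross-validated accuracy equals the best observed value
best = max(k_scores)
print([k for k, s in zip(k_range, k_scores) if s == best])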
In [62]:
# 10-fold cross-validation with the best KNN model
knn = KNeighborsClassifier(n_neighbors=20)
print "knn mean score:"
print cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean()
# 10-fold cross-validation with logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
print "logisc regression mean score:"
print cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean()
In [93]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
# read in the advertising dataset
data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
# create a Python list of three feature names
feature_cols = ['TV', 'Radio', 'Newspaper']
# use the list to select a subset of the DataFrame (X)
X = data[feature_cols]
# select the Sales column as the response (y)
y = data.Sales
# 10-fold cross-validation with all three features
lm = LinearRegression()
scores = cross_val_score(lm, X, y, cv=10, scoring='mean_squared_error')
print scores
# scikit-learn reports MSE as a negative number (so that larger is always better); flip the sign
mse_scores = -scores
# convert from MSE to RMSE
rmse_scores = np.sqrt(mse_scores)
# 10-fold cross-validation with two features (excluding Newspaper)
feature_cols = ['TV', 'Radio']
X = data[feature_cols]
print "Features TV , Radio mean scores: "
print np.sqrt(-cross_val_score(lm, X, y, cv=10, scoring='mean_squared_error')).mean()
print "All features mean scores: "
# calculate the average RMSE
print rmse_scores.mean()
Out[93]:
In [73]:
# read in the iris data
iris = load_iris()
# create X (features) and y (response)
X = iris.data
y = iris.target
# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
# use average accuracy as an estimate of out-of-sample accuracy
print scores.mean()
from sklearn.grid_search import GridSearchCV
# define the parameter values that should be searched
k_range = range(1, 31)
print k_range
# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_neighbors=k_range)
print param_grid
# instantiate the grid
# n_jobs = -1 to run computations in parallel (if supported by your computer and OS)
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy',n_jobs=-1)
# fit the grid with data
grid.fit(X, y)
# view the complete results (list of named tuples)
grid.grid_scores_
Out[73]:
In [74]:
# examine the first tuple
print grid.grid_scores_[0].parameters
print grid.grid_scores_[0].cv_validation_scores
print grid.grid_scores_[0].mean_validation_score
In [76]:
# create a list of the mean scores only
grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
# plot the results
plt.plot(k_range, grid_mean_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
Out[76]:
In [78]:
# examine the best model
print grid.best_score_
print grid.best_params_
print grid.best_estimator_
Example: tuning max_depth and min_samples_leaf for a DecisionTreeClassifier
You could tune the parameters independently: change max_depth while leaving min_samples_leaf at its default value, and vice versa
But the best performance might be achieved when neither parameter is at its default value, so the two should be searched jointly (see the sketch below)
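A sketch of searching both parameters jointly with the same pre-0.18 GridSearchCV API used above; the candidate values are illustrative, not tuned recommendations, and `X`, `y` are assumed to be the iris data from earlier:
In [ ]:
# joint grid search over two DecisionTreeClassifier parameters
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV
tree_param_grid = dict(max_depth=[1, 3, 5, None], min_samples_leaf=[1, 5, 10])
tree_grid = GridSearchCV(DecisionTreeClassifier(random_state=0), tree_param_grid, cv=10, scoring='accuracy')
tree_grid.fit(X, y)
print(tree_grid.best_params_)
print(tree_grid.best_score_)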
In [82]:
# define the parameter values that should be searched
k_range = range(1, 31)
weight_options = ['uniform', 'distance']
# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_neighbors=k_range, weights=weight_options)
print param_grid
# instantiate and fit the grid
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
grid.fit(X, y)
# print the first few results (this is not the last expression in the cell, so it must be printed explicitly)
print grid.grid_scores_[:7]
# examine the best model
print "Best model:"
print grid.best_score_
print grid.best_params_
In [87]:
# Using the best model
# train your model using all data and the best known parameters
knn = KNeighborsClassifier(n_neighbors=13, weights='uniform')
knn.fit(X, y)
# make a prediction on out-of-sample data
print knn.predict([[3, 5, 4, 2]])
# shortcut: GridSearchCV automatically refits the best model using all of the data
print grid.predict([[3, 5, 4, 2]])
In [89]:
from sklearn.grid_search import RandomizedSearchCV
## Important: specify a continuous distribution (rather than a list of values) for any continuous parameters
# specify "parameter distributions" rather than a "parameter grid"
param_dist = dict(n_neighbors=k_range, weights=weight_options)
# n_iter controls the number of parameter combinations that are sampled
rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=5)
rand.fit(X, y)
rand.grid_scores_[:7]
Out[89]:
In [91]:
# examine the best model
print rand.best_score_
print rand.best_params_
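Because the search is random, one way to gauge how stable the result is: repeat it with several seeds and compare the best scores (a sketch, assuming `knn` and `param_dist` from the cells above):
In [ ]:
# rerun the randomized search with different seeds and collect the best score from each run
from sklearn.grid_search import RandomizedSearchCV
best_scores = []
for seed in range(5):
    rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=seed)
    rand.fit(X, y)
    best_scores.append(round(rand.best_score_, 3))
print(best_scores)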