In [1]:
from sklearn.datasets import load_boston
boston = load_boston()
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(boston.data, boston.target, test_size=0.2, random_state=0)
In [2]:
from sklearn.linear_model import LinearRegression
regr = LinearRegression()
regr.fit(X_train, Y_train)
Y_pred = regr.predict(X_test)
from sklearn.metrics import mean_absolute_error
print "MAE", mean_absolute_error(Y_test, Y_pred)
In [3]:
%timeit regr.fit(X_train, Y_train)
In [4]:
import numpy as np
avg_price_house = np.average(boston.target)
high_priced_idx = (Y_train >= avg_price_house)
Y_train[high_priced_idx] = 1
Y_train[np.logical_not(high_priced_idx)] = 0
Y_train = Y_train.astype(np.int8)
high_priced_idx = (Y_test >= avg_price_house)
Y_test[high_priced_idx] = 1
Y_test[np.logical_not(high_priced_idx)] = 0
Y_test = Y_test.astype(np.int8)
In [5]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
from sklearn.metrics import classification_report
print classification_report(Y_test, Y_pred)
In [6]:
%timeit clf.fit(X_train, Y_train)
In [7]:
from sklearn import datasets
iris = datasets.load_iris()
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=0)
In [8]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
In [9]:
from sklearn.metrics import classification_report
print classification_report(Y_test, Y_pred)
In [10]:
%timeit clf.fit(X_train, Y_train)
In [11]:
from sklearn.utils import shuffle
from sklearn.datasets import fetch_mldata
from sklearn.cross_validation import train_test_split
In [12]:
mnist = fetch_mldata("MNIST original")
In [13]:
mnist.data, mnist.target = shuffle(mnist.data, mnist.target)
In [14]:
# We reduce the dataset size, otherwise it'll take too much time to run
mnist.data = mnist.data[:1000]
mnist.target = mnist.target[:1000]
X_train, X_test, Y_train, Y_test = train_test_split(mnist.data, mnist.target, test_size=0.8, random_state=0)
In [15]:
from sklearn.neighbors import KNeighborsClassifier
# KNN: K=3, default distance metric (Euclidean)
clf = KNeighborsClassifier(3)
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
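As a small optional variation, just to show the API: the distance metric of KNeighborsClassifier can be switched from the default Euclidean to Manhattan by setting the Minkowski power parameter p=1. The snippet below is only a sketch that reuses the MNIST split from above.
from sklearn.metrics import classification_report
clf_l1 = KNeighborsClassifier(n_neighbors=3, p=1)  # p=1 -> Manhattan (L1) distance
clf_l1.fit(X_train, Y_train)
print classification_report(Y_test, clf_l1.predict(X_test))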
In [16]:
from sklearn.metrics import classification_report
print classification_report(Y_test, Y_pred)
In [17]:
%timeit clf.fit(X_train, Y_train)
In [18]:
%timeit clf.predict(X_test)
In [20]:
import urllib2
target_page = 'http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.bz2'
with open('ijcnn1.bz2','wb') as W:
    W.write(urllib2.urlopen(target_page).read())
In [21]:
from sklearn.datasets import load_svmlight_file
X_train, y_train = load_svmlight_file('ijcnn1.bz2')
first_rows = 2500
X_train, y_train = X_train[:first_rows,:], y_train[:first_rows]
In [22]:
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVC
hypothesis = SVC(kernel='rbf', degree=2, random_state=101)  # note: degree has no effect with the rbf kernel
scores = cross_val_score(hypothesis, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
print "SVC with rbf kernel -> cross validation accuracy: mean = %0.3f std = %0.3f" % (np.mean(scores), np.std(scores))
In [23]:
import urllib2
target_page = 'http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/poker.bz2'
with open('poker.bz2','wb') as W:
    W.write(urllib2.urlopen(target_page).read())
In [24]:
from sklearn.datasets import load_svmlight_file
X_train, y_train = load_svmlight_file('poker.bz2')
from sklearn.preprocessing import OneHotEncoder
hot_encoding = OneHotEncoder(sparse=True)
X_train = hot_encoding.fit_transform(X_train.toarray())
In [26]:
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
hypothesis = LinearSVC(dual=False)
scores = cross_val_score(hypothesis, X_train, y_train, cv=3, scoring='accuracy', n_jobs=-1)
print "LinearSVC -> cross validation accuracy: mean = %0.3f std = %0.3f" % (np.mean(scores), np.std(scores))
In [29]:
import urllib2
target_page = 'http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/cadata'
from sklearn.datasets import load_svmlight_file
X_train, y_train = load_svmlight_file(urllib2.urlopen(target_page))
from sklearn.preprocessing import scale
first_rows = 2000
X_train = scale(X_train[:first_rows,:].toarray())
y_train = y_train[:first_rows]/10**4.0
In [30]:
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVR
hypothesis = SVR()
scores = cross_val_score(hypothesis, X_train, y_train, cv=3, scoring='mean_absolute_error', n_jobs=-1)
print "SVR -> cross validation accuracy: mean = %0.3f std = %0.3f" % (np.mean(scores), np.std(scores))
In [31]:
X_train, y_train = load_svmlight_file('ijcnn1.bz2')
first_rows = 2500
X_train, y_train = X_train[:first_rows,:], y_train[:first_rows]
from sklearn.svm import SVC
from sklearn.grid_search import RandomizedSearchCV
hypothesis = SVC(kernel='rbf', random_state=101)
search_dict = {'degree':[2,3], 'C': [0.01, 0.1, 1, 10, 100, 1000], 'gamma': [0.1, 0.01, 0.001, 0.0001]}  # note: degree is ignored by the rbf kernel, so only C and gamma actually vary the model
search_func = RandomizedSearchCV(estimator=hypothesis, param_distributions=search_dict, n_iter=30, scoring='accuracy', n_jobs=-1, iid=True, refit=True, cv=5, random_state=101)
search_func.fit(X_train, y_train)
print 'Best parameters %s' % search_func.best_params_
print 'Cross validation accuracy: mean = %0.3f' % search_func.best_score_
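Since the search was run with refit=True, the tuned model is immediately usable; the lines below are just an illustrative follow-up showing how to grab it (the in-sample score is only a rough check, not a proper validation).
best_model = search_func.best_estimator_  # the SVC refitted on the whole training set with the best parameters
print 'In-sample accuracy of the refitted model: %0.3f' % best_model.score(X_train, y_train)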
In [33]:
from sklearn.datasets import fetch_covtype
covertype_dataset = fetch_covtype(random_state=101, shuffle=True)
print covertype_dataset.DESCR
covertype_X = covertype_dataset.data[:15000,:]
covertype_y = covertype_dataset.target[:15000]
covertypes = ['Spruce/Fir', 'Lodgepole Pine', 'Ponderosa Pine', 'Cottonwood/Willow', 'Aspen', 'Douglas-fir', 'Krummholz']
In [34]:
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
hypothesis = BaggingClassifier(KNeighborsClassifier(n_neighbors=1), max_samples=0.7, max_features=0.7, n_estimators=100)
scores = cross_val_score(hypothesis, covertype_X, covertype_y, cv=3, scoring='accuracy', n_jobs=-1)
print "BaggingClassifier -> cross validation accuracy: mean = %0.3f std = %0.3f" % (np.mean(scores), np.std(scores))
In [35]:
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
hypothesis = RandomForestClassifier(n_estimators=100, random_state=101)
scores = cross_val_score(hypothesis, covertype_X, covertype_y, cv=3, scoring='accuracy', n_jobs=-1)
print "RandomForestClassifier -> cross validation accuracy: mean = %0.3f std = %0.3f" % (np.mean(scores), np.std(scores))
In [36]:
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
hypothesis = ExtraTreesClassifier(n_estimators=100, random_state=101)
scores = cross_val_score(hypothesis, covertype_X, covertype_y, cv=3, scoring='accuracy', n_jobs=-1)
print "ExtraTreesClassifier -> cross validation accuracy: mean = %0.3f std = %0.3f" % (np.mean(scores), np.std(scores))
In [37]:
import urllib2
target_page = 'http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/cadata'
from sklearn.datasets import load_svmlight_file
X_train, y_train = load_svmlight_file(urllib2.urlopen(target_page))
from sklearn.preprocessing import scale
first_rows = 2000
In [38]:
X_train = scale(X_train[:first_rows,:].toarray())
y_train = y_train[:first_rows]/10**4.
from sklearn.ensemble import RandomForestRegressor
hypothesis = RandomForestRegressor(n_estimators=300, random_state=101)
scores = cross_val_score(hypothesis, X_train, y_train, cv=3, scoring='mean_absolute_error', n_jobs=-1)
print "RandomForestClassifier -> cross validation accuracy: mean = %0.3f std = %0.3f" % (np.mean(scores), np.std(scores))
In [39]:
from sklearn.ensemble import AdaBoostClassifier
hypothesis = AdaBoostClassifier(n_estimators=300, random_state=101)
scores = cross_val_score(hypothesis, covertype_X, covertype_y, cv=3, scoring='accuracy', n_jobs=-1)
print "Adaboost -> cross validation accuracy: mean = %0.3f std = %0.3f" % (np.mean(scores), np.std(scores))
In [40]:
import urllib2
target_page = 'http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/poker.bz2'
with open('poker.bz2','wb') as W:
    W.write(urllib2.urlopen(target_page).read())
from sklearn.datasets import load_svmlight_file
X_train, y_train = load_svmlight_file('poker.bz2')
from sklearn.preprocessing import OneHotEncoder
hot_encoding = OneHotEncoder(sparse=True)
X_train = hot_encoding.fit_transform(X_train.toarray()).toarray()[:2500,:]
y_train = y_train[:2500]
In [41]:
from sklearn.ensemble import GradientBoostingClassifier
hypothesis = GradientBoostingClassifier(max_depth=5, n_estimators=300, random_state=101)
scores = cross_val_score(hypothesis, X_train, y_train, cv=3, scoring='accuracy', n_jobs=-1)
print "GradientBoostingClassifier -> cross validation accuracy: mean = %0.3f std = %0.3f" % (np.mean(scores), np.std(scores))
In [44]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
newsgroups_dataset = fetch_20newsgroups(shuffle=True, remove=('headers', 'footers', 'quotes'), random_state=6)
print 'Posts inside the data: %s' % np.shape(newsgroups_dataset.data)
print 'Average number of words per post: %0.0f' % np.mean([len(text.split(' ')) for text in newsgroups_dataset.data])
In [45]:
# Attention: this may take a while
from sklearn.datasets import make_classification
X,y = make_classification(n_samples=10**5, n_features=5, n_informative=3, random_state=101)
D = np.c_[y,X]
np.savetxt('huge_dataset_10__5.csv', D, delimiter=",") # the saved file should be around 14.6 MB
del(D, X, y)
X,y = make_classification(n_samples=10**6, n_features=5, n_informative=3, random_state=101)
D = np.c_[y,X]
np.savetxt('huge_dataset_10__6.csv', D, delimiter=",") # the saved file should be around 146 MB
del(D, X, y)
X,y = make_classification(n_samples=10**7, n_features=5, n_informative=3, random_state=101)
D = np.c_[y,X]
np.savetxt('huge_dataset_10__7.csv', D, delimiter=",") # the saved file should be around 1.46 GB
del(D, X, y)
In [46]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
streaming = pd.read_csv('huge_dataset_10__7.csv', header=None, chunksize=10000)
learner = SGDClassifier(loss='log')
minmax_scaler = MinMaxScaler(feature_range=(0, 1))
cumulative_accuracy = list()
for n,chunk in enumerate(streaming):
    if n == 0:
        minmax_scaler.fit(chunk.ix[:,1:].values)
    X = minmax_scaler.transform(chunk.ix[:,1:].values)
    X[X>1] = 1
    X[X<0] = 0
    y = chunk.ix[:,0]
    if n > 8:
        cumulative_accuracy.append(learner.score(X,y))
    learner.partial_fit(X,y,classes=np.unique(y))
print 'Progressive validation mean accuracy %0.3f' % np.mean(cumulative_accuracy)
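If you want to keep the incrementally trained model around for scoring later chunks or new data, a minimal sketch using joblib could look like the following (the filename is just a placeholder):
from sklearn.externals import joblib
joblib.dump(learner, 'sgd_streaming_model.pkl')   # hypothetical filename
# later on: learner = joblib.load('sgd_streaming_model.pkl') and keep calling partial_fit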
In [48]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
import pandas as pd
from datetime import datetime
classifiers = {
'SGDClassifier hinge loss' : SGDClassifier(loss='hinge', random_state=101),
'SGDClassifier log loss' : SGDClassifier(loss='log', random_state=101),
'Perceptron' : Perceptron(random_state=101),
'BernoulliNB' : BernoulliNB(),
'PassiveAggressiveClassifier' : PassiveAggressiveClassifier(random_state=101)
}
huge_dataset = 'huge_dataset_10__6.csv'
for algorithm in classifiers:
    start = datetime.now()
    minmax_scaler = MinMaxScaler(feature_range=(0, 1))
    streaming = pd.read_csv(huge_dataset, header=None, chunksize=100)
    learner = classifiers[algorithm]
    cumulative_accuracy = list()
    for n,chunk in enumerate(streaming):
        y = chunk.ix[:,0]
        X = chunk.ix[:,1:]
        if n > 50:
            cumulative_accuracy.append(learner.score(X,y))
        learner.partial_fit(X,y,classes=np.unique(y))
    elapsed_time = datetime.now() - start
    print '%s : mean accuracy %0.3f in %s secs' % (algorithm, np.mean(cumulative_accuracy), elapsed_time.total_seconds())
In [49]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer
def streaming():
    for response, item in zip(newsgroups_dataset.target, newsgroups_dataset.data):
        yield response, item
hashing_trick = HashingVectorizer(stop_words='english', norm = 'l2', non_negative=True)
learner = SGDClassifier(random_state=101)
texts = list()
targets = list()
for n,(target, text) in enumerate(streaming()):
    texts.append(text)
    targets.append(target)
    if n % 1000 == 0 and n > 0:
        learning_chunk = hashing_trick.transform(texts)
        if n > 1000:
            last_validation_score = learner.score(learning_chunk, targets)
        learner.partial_fit(learning_chunk, targets, classes=[k for k in range(20)])
        texts, targets = list(), list()
print 'Last validation score: %0.3f' % last_validation_score
In [50]:
New_text = ['A 2014 red Toyota Prius v Five with fewer than 14K miles. Powered by a reliable 1.8L four cylinder hybrid engine that averages 44mpg in the city and 40mpg on the highway.']
text_vector = hashing_trick.transform(New_text)
print np.shape(text_vector), type(text_vector)
print 'Predicted newsgroup: %s' % newsgroups_dataset.target_names[learner.predict(text_vector)[0]]
In [51]:
my_text = "The sexy job in the next 10 years will be statisticians. People think I'm joking, but who would've guessed that computer engineers would've been the sexy job of the 1990s?"
simple_tokens = my_text.split(' ')
print simple_tokens
When executing this or other nltk calls, you may get an error saying: "Resource u'tokenizers/punkt/english.pickle' not found." If that happens, just run nltk.download() from your console and choose either to download everything or to browse for the specific missing resource that triggered the warning.
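If you prefer a non-interactive fix, a minimal sketch that fetches only the tokenizer models mentioned in the error (assuming a working internet connection) is:
import nltk
nltk.download('punkt')   # downloads just the Punkt sentence/word tokenizer models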
In [55]:
import nltk
nltk_tokens = nltk.word_tokenize(my_text)
print nltk_tokens
In [57]:
from nltk.stem import *
stemmer = LancasterStemmer()
print [stemmer.stem(word) for word in nltk_tokens]
In [58]:
print nltk.pos_tag(nltk_tokens)
In [59]:
text = "Elvis Aaron Presley was an American singer and actor. Born in Tupelo, Mississippi, when Presley was 13 years old he and his family relocated to Memphis, Tennessee."
chunks = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
print chunks
In [60]:
from sklearn.feature_extraction import text
stop_words = text.ENGLISH_STOP_WORDS
print stop_words
In [61]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
categories = ['sci.med', 'sci.space']
to_remove = ('headers', 'footers', 'quotes')
twenty_sci_news_train = fetch_20newsgroups(subset='train', remove=to_remove, categories=categories)
twenty_sci_news_test = fetch_20newsgroups(subset='test', remove=to_remove, categories=categories)
In [62]:
tf_vect = TfidfVectorizer()
X_train = tf_vect.fit_transform(twenty_sci_news_train.data)
X_test = tf_vect.transform(twenty_sci_news_test.data)
Y_train = twenty_sci_news_train.target
Y_test = twenty_sci_news_test.target
In [63]:
clf = SGDClassifier()
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
print "Accuracy=", accuracy_score(Y_test, Y_pred)
In [64]:
def clean_and_stem_text(text):
    tokens = nltk.word_tokenize(text.lower())
    clean_tokens = [word for word in tokens if word not in stop_words]
    stem_tokens = [stemmer.stem(token) for token in clean_tokens]
    return " ".join(stem_tokens)
cleaned_docs_train = [clean_and_stem_text(text) for text in twenty_sci_news_train.data]
cleaned_docs_test = [clean_and_stem_text(text) for text in twenty_sci_news_test.data]
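As a quick, purely illustrative check of what the helper produces, you can feed it the my_text string defined earlier: stop words disappear and the remaining tokens come out Lancaster-stemmed.
print clean_and_stem_text(my_text)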
In [65]:
X1_train = tf_vect.fit_transform(cleaned_docs_train)
X1_test = tf_vect.transform(cleaned_docs_test)
clf.fit(X1_train, Y_train)
Y1_pred = clf.predict(X1_test)
print "Accuracy=", accuracy_score(Y_test, Y1_pred)
In [67]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
In [69]:
from sklearn import datasets
N_samples = 2000
dataset_1 = np.array(datasets.make_circles(n_samples=N_samples, noise=0.05, factor=0.3)[0])
dataset_2 = np.array(datasets.make_blobs(n_samples=N_samples, centers=4, cluster_std=0.4, random_state=0)[0])
In [70]:
plt.scatter(dataset_2[:,0], dataset_2[:,1], c='k', alpha=0.8, s=5.0)
plt.show()
In [71]:
from sklearn.cluster import KMeans
K_dataset_1 = 2
km_1 = KMeans(n_clusters=K_dataset_1)
labels_1 = km_1.fit(dataset_1).labels_
In [72]:
plt.scatter(dataset_1[:,0], dataset_1[:,1], c=labels_1, alpha=0.8, s=5.0, lw = 0)
plt.scatter(km_1.cluster_centers_[:,0], km_1.cluster_centers_[:,1], s=100, c=np.unique(labels_1), lw=0.2)
plt.show()
In [73]:
K_dataset_2 = 4
km_2 = KMeans(n_clusters=K_dataset_2)
labels_2 = km_2.fit(dataset_2).labels_
In [74]:
plt.scatter(dataset_2[:,0], dataset_2[:,1], c=labels_2, alpha=0.8, s=5.0, lw = 0)
plt.scatter(km_2.cluster_centers_[:,0], km_2.cluster_centers_[:,1], s=100, c=np.unique(labels_2), lw=0.2)
plt.show()
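An optional way to double-check the chosen number of clusters (not part of the original flow) is the silhouette coefficient, which ranges from -1 to 1 and is higher for well-separated clusters; the sketch below scores the two K-means solutions obtained above.
from sklearn.metrics import silhouette_score
print 'Silhouette, dataset_1 (K=%i): %0.3f' % (K_dataset_1, silhouette_score(dataset_1, labels_1))
print 'Silhouette, dataset_2 (K=%i): %0.3f' % (K_dataset_2, silhouette_score(dataset_2, labels_2))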
In [75]:
from sklearn.cluster import DBSCAN
dbs_1 = DBSCAN(eps=0.4)
labels_1 = dbs_1.fit(dataset_1).labels_
In [76]:
plt.scatter(dataset_1[:,0], dataset_1[:,1], c=labels_1, alpha=0.8, s=5.0, lw = 0)
plt.show()
In [77]:
np.unique(labels_1)
Out[77]:
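DBSCAN uses the label -1 for noise points, that is, samples it could not assign to any dense region; a one-line sketch (reusing labels_1 from above) to count them:
print 'Noise points in dataset_1: %i' % np.sum(labels_1 == -1)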
In [79]:
dbs_2 = DBSCAN(eps=0.5)
labels_2 = dbs_2.fit(dataset_2).labels_
In [80]:
plt.scatter(dataset_2[:,0], dataset_2[:,1], c=labels_2, alpha=0.8, s=5.0, lw = 0)
plt.show()
In [81]:
np.unique(labels_2)
Out[81]: