In [90]:
from IPython.display import Image
Image(filename='images/phd053104s.png')
Out[90]:
A learning problem considers a set of n samples of data and then tries to predict properties of unknown data. If each sample is more than a single number, for instance a multi-dimensional entry (aka multivariate data), it is said to have several attributes or features.
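As a minimal sketch with made-up values (the array X_example and its numbers are purely illustrative), such multivariate data is usually stored as a 2-D array of shape (n_samples, n_features):
import numpy as np
# 3 samples, each described by 4 features (illustrative values only)
X_example = np.array([[5.1, 3.5, 1.4, 0.2],
                      [4.9, 3.0, 1.4, 0.2],
                      [6.2, 3.4, 5.4, 2.3]])
X_example.shape  # (3, 4) -> (n_samples, n_features)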
We can separate learning problems into a few broad categories:
In [41]:
from IPython.display import Image
Image(filename='images/instalacion.png')
Out[41]:
In [1]:
import numpy as np
In [2]:
a = np.array(range(64), dtype=np.int32)
a
Out[2]:
In [3]:
a = a.reshape((8, 8))
a
Out[3]:
In [4]:
b = np.random.rand(5, 5)
b.shape
Out[4]:
In [5]:
a.mean(), a.std(), a.cumsum()
Out[5]:
In [6]:
a.cumsum() * 2
Out[6]:
In [8]:
c = _  # in IPython, `_` holds the previous cell's output
c
Out[8]:
In [9]:
c[:5]
Out[9]:
In [11]:
c[-1]
Out[11]:
In [56]:
from IPython.display import Image
Image(filename='images/numpy.jpg', width=300)
Out[56]:
In [12]:
from IPython.display import IFrame
IFrame('http://matplotlib.org/', width=900, height=350)
Out[12]:
In [58]:
from IPython.display import Image
Image(filename='images/convincing.png')
Out[58]:
In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
In [15]:
print plt.style.available
plt.style.use(plt.style.available[0])
In [16]:
x = np.linspace(0, 2, 10)
x
Out[16]:
In [18]:
plt.plot(x, x, 'o--', label='linear')
plt.plot(x, x ** 2, 'x-', label='quadratic')
plt.legend(loc='best')
plt.title('Linear vs Quadratic progression')
plt.xlabel('Input')
plt.ylabel('Output')
Out[18]:
In [19]:
plt.scatter(np.random.rand(100,), np.random.rand(100,))
Out[19]:
In [20]:
from IPython.display import IFrame
IFrame('http://pandas.pydata.org/', width=900, height=350)
Out[20]:
In [4]:
import pandas as pd
In [23]:
pd.read_csv?
In [5]:
bandas = pd.read_csv('data/rec 1a validacion Vanesa B 1 2 3 4 5 7 S_.txt',index_col=None, header=4, sep=';')
bandas
Out[5]:
In [6]:
bandas.describe()
Out[6]:
In [7]:
bandas.corr()
Out[7]:
In [27]:
from IPython.display import Image
Image(url='http://1.bp.blogspot.com/-ME24ePzpzIM/UQLWTwurfXI/AAAAAAAAANw/W3EETIroA80/s1600/drop_shadows_background.png',
      width=1000, height=1000)
Out[27]:
In [28]:
from IPython.display import IFrame
IFrame('http://scikit-learn.org/stable/index.html', width=900, height=350)
Out[28]:
In [8]:
bandas_cut = pd.read_csv('data/rec 1a validacion Vanesa B 1 2 3 4 5 7 S_.txt',index_col=None, header=4, sep=';', nrows=10000)
bandas_cut
Out[8]:
In [9]:
bandas_cut.columns
Out[9]:
In [10]:
X = bandas_cut[bandas_cut.columns[:-1]].values
y = bandas_cut[bandas_cut.columns[-1]].values
In [13]:
import numpy as np
In [11]:
from sklearn.decomposition import PCA
Xp = PCA(n_components=2).fit_transform(X)
Xp
Out[11]:
In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
# get the unique target classes
product_class = np.unique(y)
colors = plt.get_cmap("hsv")
plt.figure(figsize=(10, 4))
for i, p in enumerate(product_class):
    mask = (y == p)
    plt.scatter(Xp[mask, 0], Xp[mask, 1],
                c=colors(1. * i / 11), label=p, alpha=0.2)
plt.legend(loc="best")
plt.xlabel('PC 1')
plt.ylabel('PC 2')
Out[14]:
In [16]:
from sklearn.cluster import MeanShift, estimate_bandwidth
bandwidth = estimate_bandwidth(Xp, n_samples=1000)
In [17]:
# Create Model
ms = MeanShift(bandwidth=bandwidth)
In [18]:
# Train the model without y data!
ms.fit(Xp)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)
In [19]:
import matplotlib.pyplot as plt
from itertools import cycle
plt.figure()
plt.clf()
colors = cycle('rc')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.plot(Xp[my_members, 0], Xp[my_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
In [20]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
print """X_train shape : {}, y_train shape : {}
X_test shape : {}, y_test shape : {}""".format(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
In [21]:
def plot_matrix(clf, X_test, y_test):
    plt.clf()
    plt.imshow(confusion_matrix(clf.predict(X_test), y_test),
               interpolation='nearest', cmap=plt.cm.Blues)
    plt.colorbar()
    plt.xlabel("true label")
    plt.ylabel("predicted label")
    plt.show()
An SVM model is a representation of the examples as points in space, mapped so that the examples of the separate categories are divided by a clear gap that is as wide as possible. New examples are then mapped into that same space and predicted to belong to a category based on which side of the gap they fall on.
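As a minimal sketch of this idea with made-up toy data (X_toy, y_toy and toy_svm are illustrative names, not part of the dataset used below), a linear SVC fit on two well-separated point clouds learns the widest-margin boundary between them, and decision_function reports on which side of that boundary a new point falls:
from sklearn.svm import SVC
import numpy as np
# Toy 2-D data: two well-separated groups of points (illustrative values only)
X_toy = np.array([[0., 0.], [0.5, 0.5], [1., 0.], [4., 4.], [4.5, 3.5], [5., 4.]])
y_toy = np.array([0, 0, 0, 1, 1, 1])
toy_svm = SVC(kernel='linear').fit(X_toy, y_toy)
print(toy_svm.predict([[0.2, 0.3], [4.2, 4.1]]))   # one point per group -> [0 1]
print(toy_svm.decision_function([[0.2, 0.3]]))     # sign gives the side of the separating hyperplane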
In [48]:
from sklearn.svm import SVC
sv = SVC(kernel='linear', cache_size=1000, probability=True)
In [49]:
sv.fit(X_train, y_train)
Out[49]:
In [50]:
y_pred = sv.predict(X_test)
In [26]:
y_pred, X_test
Out[26]:
In [29]:
confusion_matrix(sv.predict(X_test), y_test)
Out[29]:
In [27]:
print classification_report(y_pred, y_test)
print sv.score(X_test, y_test)
plot_matrix(sv, X_test, y_test)
In [45]:
confusion_matrix(y_pred, y_test)
Out[45]:
It essentially consists of randomizing, partially or totally, both the attribute and the cut-point choice when splitting a tree node. In the extreme case, it builds totally randomized trees whose structures are independent of the output values of the learning sample. The strength of the randomization can be tuned to the specifics of the problem by choosing the appropriate parameter.
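As a rough sketch of that tuning knob (parameter values here are illustrative, not the ones used below), in scikit-learn's ExtraTreesClassifier the randomization strength is governed mainly by max_features, the number of attributes drawn at random as split candidates at each node:
from sklearn.ensemble import ExtraTreesClassifier
# max_features=1 -> fully randomized split choice; larger values -> closer to a greedy tree
very_random = ExtraTreesClassifier(n_estimators=100, max_features=1, random_state=0)
less_random = ExtraTreesClassifier(n_estimators=100, max_features=0.5, random_state=0)
# Both are fit the usual way, e.g. very_random.fit(X_train, y_train)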
In [47]:
from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier(n_estimators=200,
                           max_features=0.2,
                           n_jobs=2,
                           max_depth=None,
                           min_samples_split=1,
                           random_state=1).fit(X_train, y_train)
print classification_report(clf.predict(X_test), y_test)
print "Score over Testing Data {}".format(clf.score(X_test, y_test))
print "Score over Training Data {}".format(clf.score(X_train, y_train))
plot_matrix(clf, X_test, y_test)
In [31]:
importances = clf.feature_importances_
text = map(lambda i: bandas.columns[:-1][i], range(6))
plt.figure(figsize=(20, 6))
print importances[::-1].shape
plt.bar(range(6),height=importances, width=1.)
plt.xticks(np.arange(0.5, 6, 1.), text, rotation=90)
plt.xlim((0, 6))
plt.show()
indices = np.argsort(importances)[::-1]
for i in range(3):
    print importances[indices[i]], bandas_cut.columns[:-1][indices[i]]
In [32]:
from sklearn.linear_model import LinearRegression
In [33]:
X_reg = np.random.random(size=(200, 1))
y_reg = 3 * X_reg[:, 0] + 2 + np.random.normal(size=200)
In [35]:
model = LinearRegression()
model.fit(X_reg, y_reg)
print("Model coefficient: %.5f, and intercept: %.5f"% (model.coef_, model.intercept_))
In [36]:
# Plot the data and the model prediction
X_test_reg = np.linspace(0, 1, 100)[:, np.newaxis]
y_test_reg = model.predict(X_test_reg)
plt.plot(X_reg[:, 0], y_reg, 'o')
plt.plot(X_test_reg[:, 0], y_test_reg)
plt.title('Linear regression with a single input variable');
In [37]:
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
parameter_grid = {
    'n_estimators': [100, 200],
    'max_features': [0.2, 0.5],
    #'max_depth': [5., None]
}
grid_search = GridSearchCV(ExtraTreesClassifier(n_jobs=4), parameter_grid, cv=5, verbose=3)
grid_search.fit(X_train, y_train)
Out[37]:
In [39]:
grid_search.best_params_, grid_search.best_estimator_
Out[39]:
In [40]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy='most_frequent',random_state=0).fit(X, y)
#print clf.score(X_test, y_test)
#plot_matrix(clf, X_test, y_test)
clf.score(X, y)
Out[40]:
In [43]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
def plot_roc_curve(target_test, target_predicted_proba):
    fpr, tpr, thresholds = roc_curve(target_test, target_predicted_proba[:, 1])
    roc_auc = auc(fpr, tpr)
    # Plot ROC curve
    plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate or (1 - Specificity)')
    plt.ylabel('True Positive Rate or (Sensitivity)')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
In [51]:
plot_roc_curve(y_test, sv.predict_proba(X_test))
In [52]:
from sklearn.cross_validation import cross_val_score, ShuffleSplit
cv = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.1, random_state=0)
test_scores = cross_val_score(sv, X, y, cv=cv, n_jobs=2)
print "scores: {} mean: {} std: {}".format(str(test_scores), np.mean(test_scores), np.std(test_scores))
In [57]:
import theano
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import BatchIterator
from nolearn.lasagne import NeuralNet
from lasagne.layers import InputLayer, Conv2DLayer, DropoutLayer,\
MaxPool2DLayer, DenseLayer
from lasagne.nonlinearities import softmax
from sklearn.preprocessing import MinMaxScaler, label_binarize
In [82]:
import lasagne
lasagne.__version__
Out[82]:
In [58]:
X = bandas_cut[bandas_cut.columns[:-1]].values
y = bandas_cut[bandas_cut.columns[-1]].values
In [59]:
X_net = X.astype(np.float32)
y_net = y.astype(np.int32)
In [60]:
X_scaler = MinMaxScaler()
X_net = X_scaler.fit_transform(X_net)
In [61]:
X_train, X_test, y_train, y_test = train_test_split(X_net, y_net,
                                                    test_size=0.3,
                                                    random_state=42)
print "X_train.shape -> {}, X_test.shape -> {} ".format(X_train.shape,
                                                        X_test.shape)
print "y_train.shape -> {}, y_test.shape -> {} ".format(y_train.shape,
                                                        y_test.shape)
print X.min(), X.max()
print y.min(), y.max()
In [62]:
layers_0 = [
    (InputLayer, {'shape': (None, 6)}),
    (DenseLayer, {'num_units': 100}),
    (DropoutLayer, {}),
    (DenseLayer, {'num_units': 100}),
    (DenseLayer, {'num_units': 2, 'nonlinearity': softmax}),
]
In [63]:
def create_network(npochs=50, batch_size=200):
    return NeuralNet(
        layers=layers_0,
        update=nesterov_momentum,
        update_learning_rate=theano.shared(np.float32(0.01)),
        update_momentum=theano.shared(np.float32(0.9)),
        regression=False,
        batch_iterator_train=BatchIterator(batch_size=batch_size),
        max_epochs=npochs,
        verbose=1)
net0 = create_network()
In [64]:
net0.fit(X_train, y_train)
Out[64]:
In [66]:
import cPickle as pickle
with open('data/aguatierra_simpleNN.pickle', 'wb') as f:
    pickle.dump(net0, f, -1)
In [67]:
import cPickle as pickle
net0 = None
fnames_nets = ['data/aguatierra_simpleNN.pickle']
nets = [net0]
for n, fnames in enumerate(fnames_nets):
    with open(fnames, 'rb') as f:
        nets[n] = pickle.load(f)
In [68]:
from nolearn.lasagne import PrintLayerInfo
layer_info = PrintLayerInfo()
nets[0].verbose = 3
nets[0].initialize()
layer_info(nets[0])
In [69]:
%matplotlib inline
plt.clf()
plt.figure(figsize=(15,5))
for net, net_name, color in zip(nets, ['Arch0'], ['m']):
    train_loss = np.array([i["train_loss"] for i in net.train_history_])
    valid_loss = np.array([i["valid_loss"] for i in net.train_history_])
    plt.plot(train_loss, '--{}'.format(color), linewidth=2, label="{} train".format(net_name))
    plt.plot(valid_loss, '-{}'.format(color), linewidth=2, label="{} valid".format(net_name))
plt.grid()
plt.legend()
plt.xlabel("epoch")
plt.ylabel("loss")
plt.yscale("log")
plt.show()
In [70]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn import cross_validation
for net in nets:
    y_pred = net.predict(X_net)
    print classification_report(y_net, y_pred)
    print "[Test dataset] Score: %.5f" % net.score(X_test, y_test)