In [1]:
%matplotlib inline
In [2]:
import matplotlib.pyplot as plt
In [12]:
import sklearn
from sklearn import datasets
from sklearn import svm
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
In [135]:
import nltk
In [4]:
import numpy as np
import scipy.io  # import the io submodule explicitly; `import scipy` alone does not load scipy.io
In [5]:
import re
import os, sys
print(os.getcwd())
os.listdir( os.getcwd()+"/ex6/" )
Out[5]:
In [99]:
# Import BeautifulSoup into your workspace
from bs4 import BeautifulSoup
In [6]:
# Load saved matrices from file
ex6data1_mat_data = scipy.io.loadmat( os.getcwd()+"/ex6/ex6data1.mat")
print(type(ex6data1_mat_data))
print(ex6data1_mat_data.keys())
In [17]:
print(type(ex6data1_mat_data["y"]))
print(ex6data1_mat_data['y'].shape)
print(type(ex6data1_mat_data["X"]))
print(ex6data1_mat_data['X'].shape)
In [21]:
plt.scatter( ex6data1_mat_data['X'][:,0] ,ex6data1_mat_data['X'][:,1] )
Out[21]:
In [22]:
y=ex6data1_mat_data['y']
In [25]:
np.where(y==1)
Out[25]:
In [31]:
# ex6data1_mat_data['X'][np.where(y==1)[0],0] gives the x-coordinates of the
# input points with label y=1, and so on; plot the two classes separately:
plt.scatter( ex6data1_mat_data['X'][np.where(y==0)[0],0], ex6data1_mat_data['X'][np.where(y==0)[0],1],
             s=35, c='y', marker='o', label='y=0' )
plt.scatter( ex6data1_mat_data['X'][np.where(y==1)[0],0], ex6data1_mat_data['X'][np.where(y==1)[0],1],
             s=75, c='b', marker='+', label='y=1' )
plt.legend(loc=6)
plt.show()
In [9]:
print("\nTraining Linear SVM ...\n")
Try changing the $C$ value below to see how the decision boundary varies (e.g., try $C=1000$, as in the sketch below).
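For instance, a minimal sketch of a large-$C$ fit (the name clf_largeC is introduced here; the sweep over several $C$ values comes further below):
In [ ]:
# a large-C linear SVM penalizes misclassified points heavily,
# yielding a tighter (less regularized) decision boundary
clf_largeC = svm.SVC(kernel='linear', C=1000.).fit( ex6data1_mat_data['X'],
                                                    ex6data1_mat_data['y'].flatten() )
print( clf_largeC.n_support_ )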
In [10]:
C=1
In [13]:
clf=svm.SVC() # C=1. default, kernel='rbf' default, gamma : float (default='auto');
              # if gamma is 'auto' then 1/n_features will be used instead
cf. sklearn.svm.SVC's fit(X, y, sample_weight=None), where
X : array-like or sparse matrix, shape (n_samples, n_features)
y : array-like, shape (n_samples,)
In [15]:
print( ex6data1_mat_data['X'].shape )
print( ex6data1_mat_data['y'].shape )
In [18]:
clf.fit( ex6data1_mat_data['X'], ex6data1_mat_data['y'].flatten())
Out[18]:
In [19]:
# get support vectors
clf.support_vectors_
Out[19]:
In [20]:
# get indices of support vectors
clf.support_
Out[20]:
In [21]:
# get number of support vectors for each class
clf.n_support_
Out[21]:
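A quick sanity check of the fit (a sketch; SVC.score reports mean accuracy on the given data):
In [ ]:
# mean training-set accuracy of the fitted (default RBF-kernel) SVC
print( clf.score( ex6data1_mat_data['X'], ex6data1_mat_data['y'].flatten() ) )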
In [31]:
h=.02 # step size in the mesh
# create a mesh to plot in
X = ex6data1_mat_data['X']
x_min, x_max = X[:,0].min()-1, X[:,0].max()+1
y_min, y_max = X[:,1].min()-1, X[:,1].max()+1
xx,yy = np.meshgrid(np.arange(x_min,x_max,h), np.arange(y_min,y_max,h))
Z = clf.predict(np.c_[xx.ravel(),yy.ravel()]) # translates slice objects to concatenation along the second axis
print(Z.shape)
Z = Z.reshape(xx.shape)
plt.contourf(xx,yy,Z, cmap=plt.cm.coolwarm, alpha=0.8)
# Plot also the training points
plt.scatter(X[:,0],X[:,1],c=ex6data1_mat_data['y'].flatten(), cmap=plt.cm.coolwarm) # flatten y to (n,) for use as colors
Out[31]:
In [32]:
print(xx.shape); print(Z.shape)
In [33]:
clf_lin=svm.SVC(kernel='linear',C=C).fit(ex6data1_mat_data['X'], ex6data1_mat_data['y'].flatten())
In [34]:
h=.02 # step size in the mesh
# create a mesh to plot in
X = ex6data1_mat_data['X']
x_min, x_max = X[:,0].min()-1, X[:,0].max()+1
y_min, y_max = X[:,1].min()-1, X[:,1].max()+1
xx,yy = np.meshgrid(np.arange(x_min,x_max,h), np.arange(y_min,y_max,h))
Z = clf_lin.predict(np.c_[xx.ravel(),yy.ravel()]) # translates slice objects to concatenation along the second axis
print(Z.shape)
Z = Z.reshape(xx.shape)
plt.contourf(xx,yy,Z, cmap=plt.cm.coolwarm, alpha=0.8)
# Plot also the training points
plt.scatter(X[:,0],X[:,1],c=ex6data1_mat_data['y'].flatten(), cmap=plt.cm.coolwarm)
Out[34]:
In [44]:
C_lst = [0.01,0.1,1.,100.]
In [45]:
clf_lst = [svm.SVC(kernel='linear',C=C).fit(ex6data1_mat_data['X'], ex6data1_mat_data['y'].flatten()) for C in C_lst]
In [46]:
h=.02 # step size in the mesh
# create a mesh to plot in
X = ex6data1_mat_data['X']
x_min, x_max = X[:,0].min()-1, X[:,0].max()+1
y_min, y_max = X[:,1].min()-1, X[:,1].max()+1
xx,yy = np.meshgrid(np.arange(x_min,x_max,h), np.arange(y_min,y_max,h))
# title for the plots
titles = ['SVC linear kernel C='+str(C) for C in C_lst]
for i, clf in enumerate(clf_lst):
    # Plot the decision boundary. For that, we'll assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max]
    plt.subplot(2,2,i+1)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.contourf(xx,yy,Z,cmap=plt.cm.coolwarm, alpha=0.8)
    # Plot also the training points
    plt.scatter(X[:,0], X[:,1],c=ex6data1_mat_data['y'].flatten(), cmap=plt.cm.coolwarm)
    plt.title(titles[i])
plt.show()
In [39]:
titles
Out[39]:
In [37]:
x1=np.array([1,2,1])
x2=np.array([0,4,-1])
sigma=2
In [42]:
sum( -(x1-x2)**2 )/(2.*2**2) # -||x1-x2||^2 / (2*sigma^2), with sigma=2 hard-coded
Out[42]:
In [44]:
np.exp( sum( -(x1-x2)**2 )/(2.*2**2) ) # the Gaussian kernel value, computed by hand
Out[44]:
In [45]:
def gaussianKernel(x1,x2,sigma):
    """ gaussianKernel : computes the Gaussian kernel between x1 and x2
        and returns the similarity value in sim
    """
    sim = np.exp( -np.sum( (x1-x2)**2/(2.*sigma**2) ) )
    return sim
In [46]:
gaussianKernel(x1,x2,sigma)
Out[46]:
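scikit-learn's built-in RBF kernel is $\exp(-\gamma\|x_1-x_2\|^2)$, so with $\gamma=1/(2\sigma^2)$ it should reproduce the value above; a quick cross-check:
In [ ]:
from sklearn.metrics.pairwise import rbf_kernel
# rbf_kernel expects 2-D arrays, so reshape each vector to (1, n_features)
rbf_kernel( x1.reshape(1,-1), x2.reshape(1,-1), gamma=1./(2.*sigma**2) )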
In [47]:
# Load saved matrices from file
ex6data2_mat_data = scipy.io.loadmat( os.getcwd()+"/ex6/ex6data2.mat")
print(type(ex6data2_mat_data))
print(ex6data2_mat_data.keys())
ex6data3_mat_data = scipy.io.loadmat( os.getcwd()+"/ex6/ex6data3.mat")
print(type(ex6data3_mat_data))
print(ex6data3_mat_data.keys()) # Xval, yval are CROSS VALIDATION set data
In [48]:
X = ex6data2_mat_data['X']
print(X.shape)
plt.scatter(X[:,0],X[:,1],c=ex6data2_mat_data['y'].flatten(), cmap=plt.cm.coolwarm)
Out[48]:
In [58]:
C=1.
sigma=0.1
gamma_gaussiankernel = 1./(2.*sigma**2) # scikit-learn SVC's gamma = 1/(2*sigma^2) matches the exercise's Gaussian kernel (see the rbf_kernel cross-check above)
clf=svm.SVC(kernel='rbf',C=C,gamma=gamma_gaussiankernel)
In [59]:
clf.fit( ex6data2_mat_data['X'], ex6data2_mat_data['y'].flatten())
Out[59]:
In [60]:
h=.02 # step size in the mesh
# create a mesh to plot in
X = ex6data2_mat_data['X']
x_min, x_max = X[:,0].min()-.1, X[:,0].max()+.1
y_min, y_max = X[:,1].min()-.1, X[:,1].max()+.1
xx,yy = np.meshgrid(np.arange(x_min,x_max,h), np.arange(y_min,y_max,h))
Z = clf.predict(np.c_[xx.ravel(),yy.ravel()]) # translates slice objects to concatenation along the second axis
print(Z.shape)
Z = Z.reshape(xx.shape)
plt.contourf(xx,yy,Z, cmap=plt.cm.coolwarm, alpha=0.8)
# Plot also the training points
plt.scatter(X[:,0],X[:,1],c=ex6data2_mat_data['y'].flatten(), cmap=plt.cm.coolwarm)
Out[60]:
In [67]:
X = ex6data3_mat_data['X']
y = ex6data3_mat_data['y']
Xval = ex6data3_mat_data['Xval']
yval = ex6data3_mat_data['yval']
C_lst = [0.0001,0.001,0.003,0.01,0.03,0.1,0.3,1.,10.]
sigma_lst = [0.0001,0.001,0.003,0.01,0.03,0.1,0.3,1.,10.]
models = [ [svm.SVC(kernel='rbf', C=C, gamma=1./(2.*sigma**2)).fit(X, y.flatten())
            for C in C_lst] for sigma in sigma_lst]
In [65]:
ex6data3_mat_data.keys()
Out[65]:
In [75]:
# fraction of misclassified validation examples; flatten yval to (n,) so it
# compares elementwise with the (n,) predictions instead of broadcasting
(models[0][0].predict(Xval) != yval.flatten()).astype('int').mean()
Out[75]:
In [76]:
predict_errs = np.array( [[(model.predict(Xval) != yval.flatten()).astype('int').mean() for model in rowmodel] for rowmodel in models] )
In [93]:
predict_errs
Out[93]:
In [92]:
# predict_errs has shape (len(sigma_lst), len(C_lst)) = (9,9):
# rows index sigma, columns index C
predict_errs[predict_errs.argmin() // 9 , predict_errs.argmin() % 9]
Out[92]:
In [90]:
print( sigma_lst[predict_errs.argmin()//9] ) # best sigma (row index)
print( C_lst[predict_errs.argmin() % 9] )    # best C (column index)
Indeed, retraining with this best $(C,\sigma)$ pair reproduces the minimal validation error:
In [88]:
C=1.0
sigma=0.03
clf = svm.SVC(kernel='rbf',C=C,gamma=1./(2.*sigma**2)).fit(X,y.flatten())
In [89]:
(clf.predict(Xval) != yval.flatten()).astype('int').mean()
Out[89]:
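The same $(C,\sigma)$ grid search can be written with scikit-learn's own machinery; a sketch assuming sklearn.model_selection is available (scikit-learn 0.18+), using PredefinedSplit so the fixed train/validation split is respected:
In [ ]:
from sklearn.model_selection import GridSearchCV, PredefinedSplit
# stack training and validation sets; -1 marks rows always kept for training,
# 0 marks rows forming the single validation fold
X_all = np.vstack([X, Xval])
y_all = np.concatenate([y.flatten(), yval.flatten()])
test_fold = np.concatenate([ -np.ones(X.shape[0]), np.zeros(Xval.shape[0]) ]).astype(int)
param_grid = {'C': C_lst, 'gamma': [1./(2.*sigma**2) for sigma in sigma_lst]}
search = GridSearchCV( svm.SVC(kernel='rbf'), param_grid, cv=PredefinedSplit(test_fold) )
search.fit(X_all, y_all)
print( search.best_params_ )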
In [95]:
# Extract Features
f = open(os.getcwd()+"/ex6/emailSample1.txt",'r')
file_contents = f.read()
f.close()
In [96]:
file_contents
Out[96]:
In [97]:
# Lower case
file_contents.lower()
Out[97]:
In [100]:
# Strip all HTML
# The course does this with a regex that replaces any expression starting
# with < and ending with > (with no < or > inside) by a space; here
# BeautifulSoup's get_text() drops the tags and markup instead
BeautifulSoup( file_contents.lower(), "lxml" )
BeautifulSoup( file_contents.lower(), "lxml" ).get_text()
# Calling get_text() gives you the text of the review, without tags or markup.
# cf. https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words
Out[100]:
In [102]:
import re
# Use regular expressions to do a find-and-replace
# Handle Numbers
# look for 1 or more characters between 0-9
email_contents = re.sub("[0-9]+", # The pattern to search for
                        "number", # The pattern to replace it with
                        BeautifulSoup( file_contents.lower(), "lxml" ).get_text() ) # The text to search
In [103]:
# Handle URLS
# Look for strings starting with http:// or https://
re.sub( r'(http|https)://[^\s]*', 'httpaddr', email_contents)
Out[103]:
In [104]:
# Handle Email Addresses
# Look for strings with @ in the middle
re.sub( r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
Out[104]:
In [105]:
# Handle $ sign
re.sub('[$]+','dollar', email_contents)
Out[105]:
In [119]:
def processEmail_regex(email_contents):
    """ processEmail_regex - preprocess an email with regular expressions.
    """
    # Lower case
    email_contents = email_contents.lower()
    # Strip all HTML: get_text() drops the tags and markup, keeping only
    # the text content (the course instead replaces each <...> tag, with no
    # < or > inside, by a space)
    email_contents = BeautifulSoup( email_contents, "lxml" ).get_text()
    # Handle Numbers
    # look for 1 or more characters between 0-9
    email_contents = re.sub("[0-9]+", # The pattern to search for
                            "number", # The pattern to replace it with
                            email_contents ) # The text to search
    # Handle URLs
    # Look for strings starting with http:// or https://
    email_contents = re.sub( r'(http|https)://[^\s]*', 'httpaddr', email_contents)
    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub( r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)
    # Remove any non-alphanumeric characters
    email_contents = re.sub('[^a-zA-Z0-9]', ' ', email_contents)
    return email_contents
In [120]:
f = open(os.getcwd()+"/ex6/emailSample1.txt",'r')
file_contents = f.read()
f.close()
email_contents = processEmail_regex(file_contents)
In [121]:
email_contents
Out[121]:
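The course's processEmail goes one step further and stems each word; a minimal sketch using NLTK's PorterStemmer (the original MATLAB code uses a Porter-stemmer variant; stemmed_email_contents is a name introduced here):
In [ ]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
# stem each (already cleaned) word, e.g. "expecting" -> "expect"
stemmed_email_contents = ' '.join( stemmer.stem(w) for w in email_contents.split() )
stemmed_email_contents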
In [111]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
In [126]:
test_email_vec = count_vect.fit_transform( [email_contents,])
In [129]:
print( type(test_email_vec) );
print( test_email_vec.shape )
In [133]:
test_email_vec[0][0] # row-indexing a CSR matrix twice just returns the same 1 x n_features sparse row
Out[133]:
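To see which token each column of the document-term matrix stands for, inspect the fitted vocabulary (vocabulary_ is CountVectorizer's dict mapping token to column index):
In [ ]:
print( len(count_vect.vocabulary_) ) # number of distinct tokens, i.e. n_features
sorted( count_vect.vocabulary_.items() )[:10]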
In [138]:
nltk.download()
Out[138]:
In [151]:
tokens_email_contents = nltk.word_tokenize(email_contents)
In [137]:
email_contents
Out[137]:
In [152]:
tokens_email_contents
Out[152]:
In [153]:
tagged_email_contents = nltk.pos_tag( tokens_email_contents )
In [154]:
tagged_email_contents
Out[154]:
In [156]:
entities_email_contents = nltk.chunk.ne_chunk( tagged_email_contents )
In [158]:
type(entities_email_contents)
Out[158]:
In [159]:
entities_email_contents
Out[159]:
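To pull out just the named-entity chunks that ne_chunk found (a sketch; subtrees whose label differs from the root 'S' are the entities):
In [ ]:
for subtree in entities_email_contents.subtrees():
    if subtree.label() != 'S':
        print( subtree.label(), ' '.join( word for word, tag in subtree.leaves() ) )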
cf. scikit-learn's Non-linear SVM example
In [139]:
X = np.random.randn(300,2)
y = np.logical_xor(X[:,0] > 0, X[:,1] > 0)
In [144]:
print(X.shape)
print(y.shape)
print(X.max())
print(X.min())
print(y.max()); print(y.min())
In [143]:
plt.scatter(X[:,0],X[:,1],s=30,c=y,cmap=plt.cm.Paired)
Out[143]:
In [145]:
y = y.astype("int")
In [147]:
print(y.max());print(y.min())
plt.scatter(X[:,0],X[:,1],s=30,c=y,cmap=plt.cm.Paired)
Out[147]:
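Following that example, a sketch that fits an RBF-kernel SVM to the XOR data and plots its decision function (the scikit-learn example uses svm.NuSVC; plain svm.SVC with the default RBF kernel behaves similarly, and clf_xor is a name introduced here):
In [ ]:
clf_xor = svm.SVC(kernel='rbf').fit(X, y)
xx, yy = np.meshgrid( np.linspace(-3,3,500), np.linspace(-3,3,500) )
Z = clf_xor.decision_function( np.c_[xx.ravel(), yy.ravel()] ).reshape(xx.shape)
plt.imshow(Z, interpolation='nearest', extent=(-3,3,-3,3), origin='lower', cmap=plt.cm.PuOr_r)
plt.contour(xx, yy, Z, levels=[0], linewidths=2) # the decision boundary, where f(x)=0
plt.scatter(X[:,0], X[:,1], s=30, c=y, cmap=plt.cm.Paired)
plt.show()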
In [149]:
# we create 40 separable points
np.random.seed(0)
X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
y = [0] * 20 + [1] * 20
In [150]:
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
Out[150]:
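And for these linearly separable points, a sketch of the maximum-margin separating hyperplane, along the lines of scikit-learn's separating-hyperplane example (clf_sep is a name introduced here):
In [ ]:
clf_sep = svm.SVC(kernel='linear', C=1.).fit(X, y)
w = clf_sep.coef_[0]
a = -w[0] / w[1] # slope of the separating line w.x + b = 0
xs = np.linspace(-5, 5)
ys = a * xs - clf_sep.intercept_[0] / w[1]
margin = 1. / np.sqrt(np.sum(w ** 2)) # geometric margin = 1/||w||
plt.plot(xs, ys, 'k-') # separating hyperplane
plt.plot(xs, ys - np.sqrt(1. + a**2) * margin, 'k--') # margin boundaries
plt.plot(xs, ys + np.sqrt(1. + a**2) * margin, 'k--')
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.show()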