In [ ]:
from bs4 import BeautifulSoup
import requests
import re
import urllib2
import os
def get_soup(url):
    """
    Download `url` and return its body parsed as a BeautifulSoup document.

    Raises requests.HTTPError when the server answers with a 4xx/5xx status,
    so an error page is never silently parsed as if it were search results.
    """
    response = requests.get(url)
    response.raise_for_status()
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the resulting tree can
    # differ from one machine to the next.
    return BeautifulSoup(response.text, "html.parser")
# Scrape Bing image search thumbnails for `query` and save each one as
# images/<image_type>_<n>.
image_type = "license"
query = "drivers license sample"
url = "http://www.bing.com/images/search?q=" + query + \
    "&qft=+filterui:color2-bw+filterui:imagesize-large&FORM=R5IR3"
soup = get_soup(url)
# Keep only <img> tags whose src matches the "mm.bing.net" pattern.
images = [a['src'] for a in soup.find_all("img", {"src": re.compile("mm.bing.net")})]

# os.listdir() below raises if the target directory is missing, so create it first.
if not os.path.isdir("images"):
    os.makedirs("images")

for img in images:
    response = urllib2.urlopen(img)
    try:
        raw_img = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()
    # Next free index = number of files already saved for this image type + 1.
    cntr = len([i for i in os.listdir("images") if image_type in i]) + 1
    # `with` guarantees the file is flushed and closed if the write raises.
    with open("images/" + image_type + "_" + str(cntr), 'wb') as f:
        f.write(raw_img)
In [1]:
# Set up a standard image size; this will distort some images but gets
# everything into the same shape.
import numpy as np  # used by the helpers below but never imported in this notebook

try:
    # Pillow only exposes the module under the PIL package.
    from PIL import Image
except ImportError:
    # Fall back to the legacy top-level module of the original PIL distribution.
    import Image

STANDARD_SIZE = (300, 167)
def img_to_matrix(filename, verbose=False):
    """
    Load an image file and return its pixels as a numpy array of RGB values.

    The image is converted to RGB and resized to STANDARD_SIZE, so every
    image yields one row per pixel with three columns (R, G, B).

    filename -- path of the image file to open
    verbose  -- when truthy, print the original and target sizes
    """
    img = Image.open(filename)
    if verbose:
        # Parenthesised form behaves the same as a statement (Python 2) and
        # as a function call (Python 3).
        print("changing size from %s to %s" % (str(img.size), str(STANDARD_SIZE)))
    # Normalise the mode first: grayscale/palette images return plain ints
    # from getdata() (which cannot be turned into per-pixel lists) and RGBA
    # images return 4 values, either of which breaks stacking downstream.
    img = img.convert("RGB").resize(STANDARD_SIZE)
    # Build a real list rather than relying on map(), which is lazy on Python 3.
    pixels = [list(pixel) for pixel in img.getdata()]
    return np.array(pixels)
def flatten_image(img):
    """
    Flatten a numpy array into a 1-D array.

    For an (m, n) input the result has shape (m * n,), with elements in
    row-major order. Inputs with any other number of dimensions are
    flattened the same way.
    """
    # reshape(-1) gives the same result as reshape(1, m * n)[0] for 2-D
    # input, without hard-coding the number of axes.
    return img.reshape(-1)
In [2]:
import os

img_dir = "/home/bakuda/ageekrepo/images/"

# Full path of every entry found in the image directory.
images = []
for entry in os.listdir(img_dir):
    images.append(img_dir + entry)

# A file counts as a "cheque" when its base name contains that word;
# everything else is labelled "license".
labels = []
for path in images:
    base_name = path.split('/')[-1]
    labels.append("cheque" if "cheque" in base_name else "license")

# One flattened pixel vector per image, stacked into a single array.
data = np.array([flatten_image(img_to_matrix(path)) for path in images])
data
Out[2]:
In [15]:
try:
    from sklearn.decomposition import RandomizedPCA
except ImportError:
    # Newer scikit-learn releases dropped RandomizedPCA in favour of
    # PCA(svd_solver="randomized"). Keep the old name bound so every cell
    # that calls RandomizedPCA(...) keeps working.
    from functools import partial
    from sklearn.decomposition import PCA
    RandomizedPCA = partial(PCA, svd_solver="randomized")
import pandas as pd
import pylab as pl

# The randomized solver is stochastic; fix the seed so the 2-D projection
# is the same on every run.
pca = RandomizedPCA(n_components=2, random_state=42)
# Project each flattened image onto the first two principal components.
X = pca.fit_transform(data)
X.shape
Out[15]:
In [39]:
# Build the plotting frame in one step. `df` was never created before being
# assigned into, and `labels` is a plain list, so `labels == "cheque"` is
# just False and np.where labelled every row "license". The list already
# holds the "cheque"/"license" strings, so it is used directly.
df = pd.DataFrame({"x": X[:, 0], "y": X[:, 1], "label": labels})

# Scatter the two principal components, one colour per label.
colors = ["red", "yellow"]
for label, color in zip(df['label'].unique(), colors):
    mask = df['label'] == label
    # .loc selects rows and column in one step instead of chained indexing.
    pl.scatter(df.loc[mask, 'x'], df.loc[mask, 'y'], c=color, label=label)
pl.legend()
pl.show()
In [10]:
# Show the dimensions of the stacked image array.
data.shape
Out[10]:
In [31]:
# Show the dimensions of the plotting DataFrame.
df.shape
Out[31]:
In [23]:
# Display the 'y' column (filled from the second column of X).
df['y']
Out[23]:
In [42]:
# Display the 'label' column to check which class each row received.
df['label']
Out[42]:
In [29]:
# Peek at the first five values of the second column of X.
X[:5,1]
Out[29]:
In [37]:
# `labels` is a plain Python list of strings, so `labels == 1` is the scalar
# False and np.where returned a single "license". Compare element-wise
# against the actual label text instead.
np.where(np.asarray(labels) == "cheque", "cheque", "license")
Out[37]:
In [43]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# `train_x`, `test_x` and `train_y` were never defined in this notebook.
# Build them here from `data`/`labels` with a seeded random split of
# roughly 70/30 so the cell runs on a fresh kernel.
# NOTE(review): the split ratio and seed are assumptions - confirm.
rng = np.random.RandomState(42)
is_train = rng.uniform(0, 1, len(data)) <= 0.7
y = np.asarray(labels)
train_raw, train_y = data[is_train], y[is_train]
test_raw, test_y = data[~is_train], y[~is_train]

# Fit the projection on the training rows only, then apply it to the test
# rows. Reading from the *_raw arrays keeps the cell safe to re-run.
pca = RandomizedPCA(n_components=5, random_state=42)
train_x = pca.fit_transform(train_raw)
test_x = pca.transform(test_raw)
print(train_x[:5])
# Sample output kept from the original cell (presumably from an earlier run
# on different data - values will not match):
#array([[ 12614.55016475, -9156.62662224, -7649.37090539, -3230.94749506,
# 2495.71170459],
# [ 16111.39363837, -259.55063579, 699.60464599, 3058.59026495,
# -1552.34714653],
# [ 15019.71069584, -6403.86621428, 1968.44401114, 2896.76676466,
# -2157.76499726],
# [ 13410.53053415, -1658.3751377 , 261.26829049, 1991.33404567,
# -486.60683822],
# [ 12717.28773107, -1544.27233216, -1279.70167969, 503.33658729,
# -38.00244617]])

# Train a k-nearest-neighbours classifier on the 5-component projection.
knn = KNeighborsClassifier()
knn.fit(train_x, train_y)
In [ ]: