In [ ]:
from bs4 import BeautifulSoup
import requests
import re
import urllib2
import os


def get_soup(url):
    return BeautifulSoup(requests.get(url).text)

image_type = "license"
query = "drivers license sample"
url = "http://www.bing.com/images/search?q=" + query + \
    "&qft=+filterui:color2-bw+filterui:imagesize-large&FORM=R5IR3"

soup = get_soup(url)
images = [a['src'] for a in soup.find_all("img", {"src": re.compile("mm.bing.net")})]

for img in images:
    raw_img = urllib2.urlopen(img).read()
    cntr = len([i for i in os.listdir("images") if image_type in i]) + 1
    f = open("images/" + image_type + "_"+ str(cntr), 'wb')
    f.write(raw_img)
    f.close()

In [1]:
#setup a standard image size; this will distort some images but will get everything into the same shape
import Image
STANDARD_SIZE = (300, 167)
def img_to_matrix(filename, verbose=False):
    """
    takes a filename and turns it into a numpy array of RGB pixels
    """
    img = Image.open(filename)
    if verbose==True:
        print "changing size from %s to %s" % (str(img.size), str(STANDARD_SIZE))
    img = img.resize(STANDARD_SIZE)
    img = list(img.getdata())
    img = map(list, img)
    img = np.array(img)
    return img
 
def flatten_image(img):
    """
    takes in an (m, n) numpy array and flattens it 
    into an array of shape (1, m * n)
    """
    s = img.shape[0] * img.shape[1]
    img_wide = img.reshape(1, s)
    return img_wide[0]

In [2]:
import os
img_dir = "/home/bakuda/ageekrepo/images/"
images = [img_dir+ f for f in os.listdir(img_dir)]
labels = ["cheque" if "cheque" in f.split('/')[-1] else "license" for f in images]
 
data = []
for image in images:
    img = img_to_matrix(image)
    img = flatten_image(img)
    data.append(img)
 
data = np.array(data)
data


Out[2]:
array([[ 55,  51,  48, ...,  45,  45,  45],
       [255, 251, 255, ..., 244, 244, 246],
       [255, 247, 255, ..., 248, 248, 248],
       ..., 
       [196, 224, 228, ..., 244, 202, 203],
       [ 32,  39,  47, ...,  29,  38,  37],
       [244, 244, 244, ..., 252, 252, 252]])

In [15]:
from sklearn.decomposition import RandomizedPCA
import pandas as pd
import pylab as pl
pca = RandomizedPCA(n_components=2)
X = pca.fit_transform(data)
X.shape


Out[15]:
(140, 2)

In [39]:
df['x'] = X[:,0]
df['y'] = X[:,1]
df['label'] = (np.where(labels=="cheque", "cheque", "license"))

#df = pd.DataFrame({"x": X[:, 0], "y": X[:, 1], "label":np.where(X[:,1]==1, "cheque", "license")})
#df = pd.DataFrame({X,y,label})

colors = ["red", "yellow"]
for label, color in zip(df['label'].unique(), colors):
    mask = df['label']==label
    pl.scatter(df[mask]['x'], df[mask]['y'], c=color, label=label)
pl.legend()
pl.show()



In [10]:
data.shape


Out[10]:
(140, 150300)

In [31]:
df.shape


Out[31]:
(140, 3)

In [23]:
df['y']


Out[23]:
0      3042.493873
1     14552.904449
2     14769.595902
3     -6454.433768
4     14552.904449
5    -14446.198279
6      4093.367570
7      1113.761665
8     -7731.597506
9      1911.510269
10      877.590197
11     4093.367570
12     -360.783995
13    -5214.419131
14     4845.093785
...
125    -5086.711868
126    -4505.311451
127    -2283.676091
128     2303.336319
129      514.758486
130    -7731.597506
131      686.232581
132     4093.367570
133     2083.428705
134   -14446.198279
135     3042.493873
136     7054.812728
137     1773.281029
138   -21693.212090
139     2633.373259
Name: y, Length: 140, dtype: float64

In [42]:
df['label']


Out[42]:
0     license
1     license
2     license
3     license
4     license
5     license
6     license
7     license
8     license
9     license
10    license
11    license
12    license
13    license
14    license
...
125    license
126    license
127    license
128    license
129    license
130    license
131    license
132    license
133    license
134    license
135    license
136    license
137    license
138    license
139    license
Name: label, Length: 140, dtype: object

In [29]:
X[:5,1]


Out[29]:
array([  3042.49387327,  14552.9044491 ,  14769.59590244,  -6454.43376819,
        14552.9044491 ])

In [37]:
np.where(labels==1, "cheque", "license")


Out[37]:
array('license', 
      dtype='|S7')

In [43]:
pca = RandomizedPCA(n_components=5)
train_x = pca.fit_transform(train_x)
test_x = pca.transform(test_x)
 
print train_x[:5]
#array([[ 12614.55016475,  -9156.62662224,  -7649.37090539,  -3230.94749506,
#          2495.71170459],
#       [ 16111.39363837,   -259.55063579,    699.60464599,   3058.59026495,
#         -1552.34714653],
#       [ 15019.71069584,  -6403.86621428,   1968.44401114,   2896.76676466,
#         -2157.76499726],
#       [ 13410.53053415,  -1658.3751377 ,    261.26829049,   1991.33404567,
#          -486.60683822],
#       [ 12717.28773107,  -1544.27233216,  -1279.70167969,    503.33658729,
#           -38.00244617]])
 
knn = KNeighborsClassifier()
knn.fit(train_x, train_y)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-43-5de7f732a8be> in <module>()
      1 pca = RandomizedPCA(n_components=5)
----> 2 train_x = pca.fit_transform(train_x)
      3 test_x = pca.transform(test_x)
      4 
      5 print train_x[:5]

NameError: name 'train_x' is not defined

In [ ]: