In [300]:

    
import numpy as np
from skimage import io



In [301]:

    
import os
train_directory = "./img/grey/"

Getting the data



In [302]:

    
def images(image_directory):
    return [image_directory+image for image in os.listdir(image_directory)]

images(train_directory)









    Out[302]:





['./img/grey/kfc.1.jpg',
 './img/grey/kfc.10.jpg',
 './img/grey/kfc.11.jpg',
 './img/grey/kfc.12.jpg',
 './img/grey/kfc.13.jpg',
 './img/grey/kfc.14.jpg',
 './img/grey/kfc.15.jpg',
 './img/grey/kfc.16.jpg',
 './img/grey/kfc.17.jpg',
 './img/grey/kfc.18.jpg',
 './img/grey/kfc.19.jpg',
 './img/grey/kfc.2.jpg',
 './img/grey/kfc.20.jpg',
 './img/grey/kfc.3.jpg',
 './img/grey/kfc.4.jpg',
 './img/grey/kfc.5.jpg',
 './img/grey/kfc.6.jpg',
 './img/grey/kfc.7.jpg',
 './img/grey/kfc.8.jpg',
 './img/grey/kfc.9.jpg',
 './img/grey/mcd.1.jpg',
 './img/grey/mcd.10.jpg',
 './img/grey/mcd.11.jpg',
 './img/grey/mcd.12.jpg',
 './img/grey/mcd.13.jpg',
 './img/grey/mcd.14.jpg',
 './img/grey/mcd.15.jpg',
 './img/grey/mcd.16.jpg',
 './img/grey/mcd.2.jpg',
 './img/grey/mcd.3.jpg',
 './img/grey/mcd.4.jpg',
 './img/grey/mcd.5.jpg',
 './img/grey/mcd.6.jpg',
 './img/grey/mcd.7.jpg',
 './img/grey/mcd.8.jpg',
 './img/grey/mcd.9.jpg',
 './img/grey/sub.1.jpg',
 './img/grey/sub.10.jpg',
 './img/grey/sub.11.jpg',
 './img/grey/sub.12.jpg',
 './img/grey/sub.13.jpg',
 './img/grey/sub.14.jpg',
 './img/grey/sub.15.jpg',
 './img/grey/sub.16.jpg',
 './img/grey/sub.17.jpg',
 './img/grey/sub.2.jpg',
 './img/grey/sub.3.jpg',
 './img/grey/sub.4.jpg',
 './img/grey/sub.5.jpg',
 './img/grey/sub.6.jpg',
 './img/grey/sub.7.jpg',
 './img/grey/sub.8.jpg',
 './img/grey/sub.9.jpg']



In [303]:

    
train_image_names = images(train_directory)

Extracting labels



In [304]:

    
# Function to extract labels
def extract_labels(file_names):
    '''Create labels from file names: kfc = 0 and mcd = 1 and sub = 2'''
    
    # Create empty vector of length = no. of files, filled with zeros 
    n = len(file_names)
    y = np.zeros(n, dtype = np.uint8)
    
    # Enumerate gives index
    for i, filename in enumerate(file_names):
        
        # If 'kfc' string is in file name assign '0'
        if 'kfc' in str(filename):
            y[i] = 0
        elif 'mcd' in str(filename):
            y[i] = 1
        else :
            y[i] = 2
    return y
     

extract_labels(train_image_names)









    Out[304]:





array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2], dtype=uint8)



In [305]:

    
y = extract_labels(train_image_names)

# Save labels: np.save(file or string, array)
np.save('y', y)



In [306]:

    
from PIL import Image



In [307]:

    
def img_to_matrix(filename):
        
    '''
    takes a filename and turns it into a numpy array of RGB pixels
    '''
    img = Image.open(filename)
    # img = Image.fromarray(filename)
    img = list(img.getdata())
    img = np.asarray(img)
    return img



In [308]:

    
data = []
for i in images(train_directory):
    img = img_to_matrix(i)
    data.append(img)
    
data = np.array(data)
data.shape









    Out[308]:





(53, 50000)



In [309]:

    
np.savetxt("./img/train.txt", data)



In [310]:

    
import numpy as np
import math
import matplotlib.pyplot as plt
from PIL import Image



In [311]:

    
X_train = np.loadtxt('./img/train.txt')



In [312]:

    
print("Shape of training set: {}".format(X_train.shape))









    



Shape of training set: (53, 50000)



In [313]:

    
def image_grid(D,H,W,cols=10,scale=1):
    """ display a grid of images
        H,W: Height and width of the images
        cols: number of columns = number of images in each row
        scale: 1 to fill screen
    """
    n = np.shape(D)[0]
    rows = int(math.ceil((n+0.0)/cols))
    fig = plt.figure(1,figsize=[scale*20.0/H*W,scale*20.0/cols*rows],dpi=300)
    for i in range(n):
        plt.subplot(rows,cols,i+1)
        fig=plt.imshow(np.reshape(D[i,:],[H,W]), cmap = plt.get_cmap("gray"))
        plt.axis('off')

H = 100
W = 500



In [314]:

    
mean_image = np.mean(X_train, axis=0)

plt.imshow(np.reshape(mean_image,[H,W]), cmap = plt.get_cmap("gray"))
plt.show()

Reducing dimensionality using PCA



In [315]:

    
from sklearn.decomposition import PCA
n_components = 40



In [316]:

    
pca = PCA(n_components=n_components, svd_solver='randomized',
          whiten=True).fit(X_train)



In [317]:

    
pca_result = pca.transform(X_train)

print(X_train.shape)
print(pca_result.shape)
print(y.shape)

x_train= X_train.copy() #for autoML
Y_train= y.copy









    



(53, 50000)
(53, 40)
(53,)



In [318]:

    
%matplotlib inline
plt.hist(pca.explained_variance_ratio_, bins=n_components, log=True)









    Out[318]:





(array([ 22.,   6.,   4.,   2.,   1.,   1.,   0.,   1.,   0.,   0.,   1.,
          0.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   1.]),
 array([ 0.0051866 ,  0.01159281,  0.01799902,  0.02440524,  0.03081145,
         0.03721766,  0.04362387,  0.05003009,  0.0564363 ,  0.06284251,
         0.06924873,  0.07565494,  0.08206115,  0.08846737,  0.09487358,
         0.10127979,  0.10768601,  0.11409222,  0.12049843,  0.12690465,
         0.13331086,  0.13971707,  0.14612329,  0.1525295 ,  0.15893571,
         0.16534193,  0.17174814,  0.17815435,  0.18456056,  0.19096678,
         0.19737299,  0.2037792 ,  0.21018542,  0.21659163,  0.22299784,
         0.22940406,  0.23581027,  0.24221648,  0.2486227 ,  0.25502891,
         0.26143512]),
 <a list of 40 Patch objects>)



In [319]:

    
import pandas as pd
pca.explained_variance_ratio_.sum()









    Out[319]:





0.95073380151022924



In [320]:

    
labels= pd.DataFrame(y)
value= pd.DataFrame(X_train)

from sklearn.decomposition import PCA
n_components = 100

pca = PCA(n_components=n_components, svd_solver='randomized',
          whiten=True).fit(value)


pca_result = pca.transform(X_train)

print(value.shape)
print(pca_result.shape)









    



(53, 50000)
(53, 53)



In [321]:

    
value.head()









    Out[321]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      49990
      49991
      49992
      49993
      49994
      49995
      49996
      49997
      49998
      49999
    
  
  
    
      0
      171.0
      171.0
      172.0
      169.0
      159.0
      141.0
      120.0
      105.0
      96.0
      96.0
      ...
      50.0
      52.0
      49.0
      48.0
      54.0
      64.0
      133.0
      206.0
      219.0
      227.0
    
    
      1
      44.0
      38.0
      43.0
      47.0
      42.0
      44.0
      49.0
      46.0
      38.0
      17.0
      ...
      15.0
      15.0
      14.0
      13.0
      12.0
      13.0
      18.0
      18.0
      18.0
      18.0
    
    
      2
      255.0
      253.0
      252.0
      254.0
      255.0
      255.0
      253.0
      246.0
      253.0
      255.0
      ...
      182.0
      171.0
      152.0
      154.0
      173.0
      184.0
      177.0
      173.0
      164.0
      153.0
    
    
      3
      28.0
      27.0
      27.0
      26.0
      25.0
      24.0
      24.0
      24.0
      23.0
      22.0
      ...
      197.0
      195.0
      194.0
      193.0
      189.0
      186.0
      189.0
      190.0
      191.0
      192.0
    
    
      4
      255.0
      255.0
      255.0
      255.0
      255.0
      255.0
      255.0
      255.0
      255.0
      255.0
      ...
      16.0
      16.0
      16.0
      16.0
      15.0
      15.0
      16.0
      16.0
      18.0
      20.0
    
  

5 rows × 50000 columns

KNN, SVM and KMeans algorithms used to classify images



In [322]:

    
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
kmeans = KMeans(n_clusters=3)

ksv= kmeans.fit(value, labels) #kmeans



In [323]:

    
tsv= SVC(kernel='rbf').fit(value, labels) # svm









    



C:\Users\sam\Anaconda3\lib\site-packages\sklearn\utils\validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)



In [324]:

    
knn= KNeighborsClassifier() #knn
knn.fit(value, labels)









    



C:\Users\sam\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  






    Out[324]:





KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')



In [325]:

    
tt= np.loadtxt('./img/test_x.txt')
value2= pd.DataFrame(tt)
value2.head()









    Out[325]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      49990
      49991
      49992
      49993
      49994
      49995
      49996
      49997
      49998
      49999
    
  
  
    
      0
      125.0
      28.0
      25.0
      25.0
      30.0
      23.0
      23.0
      33.0
      25.0
      24.0
      ...
      96.0
      72.0
      86.0
      72.0
      73.0
      78.0
      75.0
      59.0
      62.0
      44.0
    
    
      1
      145.0
      146.0
      147.0
      148.0
      149.0
      149.0
      149.0
      149.0
      148.0
      148.0
      ...
      158.0
      158.0
      159.0
      159.0
      160.0
      161.0
      161.0
      161.0
      161.0
      161.0
    
    
      2
      5.0
      4.0
      3.0
      2.0
      2.0
      2.0
      2.0
      3.0
      3.0
      4.0
      ...
      9.0
      10.0
      10.0
      10.0
      9.0
      8.0
      8.0
      8.0
      7.0
      7.0
    
    
      3
      4.0
      3.0
      3.0
      2.0
      2.0
      3.0
      3.0
      4.0
      5.0
      4.0
      ...
      207.0
      207.0
      207.0
      207.0
      207.0
      207.0
      206.0
      206.0
      206.0
      206.0
    
    
      4
      177.0
      170.0
      182.0
      86.0
      77.0
      81.0
      67.0
      68.0
      72.0
      73.0
      ...
      130.0
      175.0
      91.0
      85.0
      73.0
      112.0
      94.0
      101.0
      103.0
      96.0
    
  

5 rows × 50000 columns



In [326]:

    
knn.predict(value2)









    Out[326]:





array([1, 1, 1, 0, 1, 1, 1, 1, 0, 0], dtype=uint8)



In [327]:

    
ksv.predict(value2)









    Out[327]:





array([2, 1, 0, 0, 2, 0, 0, 2, 2, 2])



In [328]:

    
tsv.predict(value2)









    Out[328]:





array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=uint8)



In [329]:

    
from sklearn.metrics import precision_score



In [330]:

    
#print("accuracy"+ str(roc_auc_score(value2, tsv.predict(value2))))



In [331]:

    
from sklearn.model_selection import train_test_split



In [332]:

    
pca_result = data



In [333]:

    
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2,
random_state=0)



In [334]:

    
knn = KNeighborsClassifier(n_neighbors=1)
model= knn.fit(X_train, y_train)



In [335]:

    
predictions= knn.predict(X_test)



In [336]:

    
print (model.score(X_test, y_test))









    



0.272727272727



In [337]:

    
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)



In [338]:

    
Kmeans_model= kmeans.fit(X_train, y_test)



In [339]:

    
predictions= kmeans.predict(X_test)



In [340]:

    
print(Kmeans_model.score(X_test, y_test))









    



-2483701112.3



In [341]:

    
from sklearn import svm
svm= svm.SVC(kernel='linear', C=1)



In [342]:

    
svm_model= svm.fit(X_train, y_train)



In [343]:

    
predictions= svm.predict(X_test)



In [344]:

    
print(svm_model.score(X_test, y_test))









    



0.454545454545



In [345]:

    
from sklearn.model_selection import cross_val_score
from sklearn import metrics



In [346]:

    
scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
print("knn")
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))









    



knn
Accuracy: 0.44 (+/- 0.27)



In [ ]:



In [347]:

    
scores = cross_val_score(kmeans, X_train, y_train, cv=5, scoring='accuracy')
print("kmeans")
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))









    



kmeans
Accuracy: 0.24 (+/- 0.16)



In [ ]:



In [348]:

    
scores = cross_val_score(svm, X_train, y_train, cv=5, scoring='accuracy')
print("svm")
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))









    



svm
Accuracy: 0.47 (+/- 0.17)

cv scores of svm



In [349]:

    
scores









    Out[349]:





array([ 0.5       ,  0.44444444,  0.375     ,  0.625     ,  0.42857143])

Accuracy scores

knn = 0.44 svm = 0.47 kmeans = 0.24

Trying autoML procedure



In [350]:

    
""""
from tpot import TPOTClassifier
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, n_jobs=-1, cv= 5)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

Best pipeline: GaussianNB(LogisticRegression(input_matrix, C=0.1, dual=False, penalty=l1))
0.363636363636

No significant improvement on this dataset
"""









    Out[350]:





'"\nfrom tpot import TPOTClassifier\ntpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, n_jobs=-1, cv= 5)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\n\nBest pipeline: GaussianNB(LogisticRegression(input_matrix, C=0.1, dual=False, penalty=l1))\n0.363636363636\n'



In [ ]:

The generated pipeline from the autoML does not have a better accuracy than our best performing SVM classifier.

Improvements can be made by using Convolutional Neural Networks, increasing the training data or by creating an ensemble of algorithms.

Ensemble of knn, kmeans and svm by stacking



In [351]:

    
import mlxtend
from mlxtend.classifier import StackingClassifier
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

sclf = StackingClassifier(classifiers=[kmeans, svm, knn], 
                          meta_classifier=lr)



In [352]:

    
print('5-fold cross validation:\n')

for clf, label in zip([kmeans, svm, knn, sclf], 
                      ['kmeans', 
                       'svm', 
                       'knn',
                       'StackingClassifier']):
    
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))









    



5-fold cross validation:

Accuracy: 0.36 (+/- 0.11) [kmeans]
Accuracy: 0.47 (+/- 0.09) [svm]
Accuracy: 0.44 (+/- 0.13) [knn]
Accuracy: 0.53 (+/- 0.03) [StackingClassifier]

By stacking the classifiers, we are able to improve the overall accuracy to 0.53



In [ ]:

	0	1	2	3	4	5	6	7	8	9	...	49990	49991	49992	49993	49994	49995	49996	49997	49998	49999
0	171.0	171.0	172.0	169.0	159.0	141.0	120.0	105.0	96.0	96.0	...	50.0	52.0	49.0	48.0	54.0	64.0	133.0	206.0	219.0	227.0
1	44.0	38.0	43.0	47.0	42.0	44.0	49.0	46.0	38.0	17.0	...	15.0	15.0	14.0	13.0	12.0	13.0	18.0	18.0	18.0	18.0
2	255.0	253.0	252.0	254.0	255.0	255.0	253.0	246.0	253.0	255.0	...	182.0	171.0	152.0	154.0	173.0	184.0	177.0	173.0	164.0	153.0
3	28.0	27.0	27.0	26.0	25.0	24.0	24.0	24.0	23.0	22.0	...	197.0	195.0	194.0	193.0	189.0	186.0	189.0	190.0	191.0	192.0
4	255.0	255.0	255.0	255.0	255.0	255.0	255.0	255.0	255.0	255.0	...	16.0	16.0	16.0	16.0	15.0	15.0	16.0	16.0	18.0	20.0

	0	1	2	3	4	5	6	7	8	9	...	49990	49991	49992	49993	49994	49995	49996	49997	49998	49999
0	125.0	28.0	25.0	25.0	30.0	23.0	23.0	33.0	25.0	24.0	...	96.0	72.0	86.0	72.0	73.0	78.0	75.0	59.0	62.0	44.0
1	145.0	146.0	147.0	148.0	149.0	149.0	149.0	149.0	148.0	148.0	...	158.0	158.0	159.0	159.0	160.0	161.0	161.0	161.0	161.0	161.0
2	5.0	4.0	3.0	2.0	2.0	2.0	2.0	3.0	3.0	4.0	...	9.0	10.0	10.0	10.0	9.0	8.0	8.0	8.0	7.0	7.0
3	4.0	3.0	3.0	2.0	2.0	3.0	3.0	4.0	5.0	4.0	...	207.0	207.0	207.0	207.0	207.0	207.0	206.0	206.0	206.0	206.0
4	177.0	170.0	182.0	86.0	77.0	81.0	67.0	68.0	72.0	73.0	...	130.0	175.0	91.0	85.0	73.0	112.0	94.0	101.0	103.0	96.0