In [15]:
import os
import csv
import numpy as np
import pylab as pl
from PIL import Image 

from sklearn import datasets
from sklearn.cross_validation import cross_val_score
from sklearn import svm, metrics

from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

import pylab as pl
import numpy as np

import matplotlib.image as mpimg
import cv2
#from skimage.color import rgb2gray
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler

In [16]:
import re

def tryint(s):
    """Return int(s) when s parses as an integer, otherwise return s unchanged.

    Used by alphanum_key to build natural-sort keys from re.split chunks.
    Fix: the original bare `except:` swallowed *every* exception (including
    KeyboardInterrupt/SystemExit); catch only the conversion failures.
    """
    try:
        return int(s)
    except (ValueError, TypeError):
        return s
     
def alphanum_key(s):
    """Turn a string into a list of string and number chunks.
        "z23a" -> ["z", 23, "a"]

    Splitting on digit runs (which re.split keeps, because the pattern is
    a capture group) and converting the numeric chunks to int gives a key
    that sorts "file2" before "file10" (natural / human order).
    """
    parts = []
    for chunk in re.split('([0-9]+)', s):
        try:
            parts.append(int(chunk))
        except ValueError:
            parts.append(chunk)
    return parts

In [17]:
def myrgb2gray(rgb):
    """Collapse the trailing RGB(A) channel axis of an image array to grayscale.

    Uses the classic luminance weights (0.2989 R + 0.587 G + 0.114 B);
    any channels beyond the first three (e.g. alpha) are ignored.
    """
    red = rgb[..., 0]
    green = rgb[..., 1]
    blue = rgb[..., 2]
    return 0.2989 * red + 0.587 * green + 0.114 * blue



# skimage.rgb2gray Normalized Data, unlike other methods
def readDataGray(path):
    os.chdir(path)
    data = []
    # the files needs to be read in numerically increasing order
    # thats the order we need for label mapping, else use some sor of dict
    for fileName in sorted(os.listdir("."), key=alphanum_key):
        if fileName.endswith(".png"):
            fullPath = os.path.join(path, fileName)
            #print fileName, fullPath
            #png data is of type float
            img = mpimg.imread(fullPath)
            data.append(myrgb2gray(img).reshape(1024,))
            
            #using opencv:
            #image = cv2.imread(fullPath)
            #img_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            
            # try with skimage.color.rgb2gray - returns normalized array
            #data.append(rgb2gray(img).reshape(1024,))
            
    data = np.asarray(data, dtype='float32')
    return data



def readTrainingLabel(path):
    """Read integer class labels from a CSV with an `id,label` header row.

    Returns a uint8 numpy array of the `label` column (header skipped).

    Fix: the original used the Python-2-only `file()` builtin and never
    closed the handle; `with open(...)` closes it deterministically and
    works on both Python 2 and 3.
    """
    with open(path) as f:
        # skip the first row: id,label
        label = [row[1] for row in csv.reader(f)][1:]
    return np.asarray(label, dtype='uint8')
    

def display_scores(params, scores, append_star=False):
    """Format the mean score +/- standard error for a parameter dict.

    Fix: `sem` is never imported anywhere in this file, so calling this
    raised NameError; import it locally from scipy.stats.
    """
    from scipy.stats import sem

    params = ", ".join("{0}={1}".format(k, v)
                      for k, v in params.items())
    line = "{0}:\t{1:.3f} (+/-{2:.3f})".format(
        params, np.mean(scores), sem(scores))
    if append_star:
        # star marks models statistically indistinguishable from the best
        line += " *"
    return line

def display_grid_scores(grid_scores, top=None):
    """Helper function to format a report on a grid of scores.

    Each entry of `grid_scores` is a (params, mean_score, scores) triple.
    Entries are printed best-first; a model is starred when its mean plus
    two standard errors overlaps the best model's lower bound.

    Fix: `sem` is never imported anywhere in this file, so calling this
    raised NameError; import it locally from scipy.stats.
    """
    from scipy.stats import sem

    grid_scores = sorted(grid_scores, key=lambda x: x[1], reverse=True)
    if top is not None:
        grid_scores = grid_scores[:top]

    # Compute a threshold for starring models with overlapping stderr:
    _, best_mean, best_scores = grid_scores[0]
    threshold = best_mean - 2 * sem(best_scores)

    for params, mean_score, scores in grid_scores:
        append_star = mean_score + 2 * sem(scores) > threshold
        print(display_scores(params, scores, append_star=append_star))
        
def writePrediction(predictions, outFile):
    """Write one prediction per line (str(pred) + newline) to `outFile`.

    Fix: the original opened the file without ever closing it, so the
    write could stay buffered; `with` guarantees flush + close.
    """
    with open(outFile, "w") as dataFile:
        dataFile.write("".join(str(res) + "\n" for res in predictions))

In [7]:
# Load the 10k grayscale training images and their integer labels.
# NOTE(review): hardcoded absolute local paths — move to a configurable
# DATA_DIR so the notebook runs on other machines.
X_10k = readDataGray("/home/bakuda/ageekrepo/kaggle/object_recognition_in_images/train-10000/")
y_10k = readTrainingLabel("/home/bakuda/ageekrepo/kaggle/object_recognition_in_images/trainLabels_int.txt")

In [23]:
# Work with the first 4000 samples only, to keep model fitting fast.
X_4k = X_10k[:4000]
y_4k = y_10k[:4000]

In [10]:
digits = datasets.load_digits()

In [12]:
# Unpack the digits Bunch into feature matrix and target vector.
X_digits = digits.data
y_digits = digits.target

In [13]:
X_digits.shape, y_digits.shape


Out[13]:
((1797, 64), (1797,))

In [25]:
X_train, X_test, y_train,y_test = train_test_split(X_4k, y_4k, test_size=.2)

In [28]:
X_train.shape, y_test.shape


Out[28]:
((3200, 1024), (800,))

In [26]:
# Baseline: logistic regression on raw grayscale pixels.
# NOTE(review): train score 0.63 vs test score 0.21 — severe overfitting;
# raw pixels with default C give near-chance generalization here.
clf_lr = LogisticRegression().fit(X_train, y_train)
print clf_lr.score(X_train, y_train)
print clf_lr.score(X_test, y_test)


0.628125
0.20875

In [29]:
clf_lr


Out[29]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [47]:
# Try a linear model trained with SGD (log loss ~= logistic regression,
# elasticnet regularization). Scores below are worse than plain LR.
# NOTE(review): import belongs in the top imports cell; `n_iter` was
# renamed `max_iter` in later sklearn versions — verify before upgrading.
from sklearn.linear_model import SGDClassifier
clf_sgd = SGDClassifier(loss='log', penalty='elasticnet', n_iter=30).fit(X_train, y_train)
print clf_sgd.score(X_train, y_train)
print clf_sgd.score(X_test, y_test)


0.4196875
0.16875

In [50]:
# Feature selection via stability selection on randomized logistic regression.
# NOTE(review): import belongs in the top imports cell; this estimator was
# removed from modern sklearn — verify availability before re-running.
from sklearn.linear_model import RandomizedLogisticRegression
clf_rand_lr = RandomizedLogisticRegression().fit(X_train, y_train)

In [52]:
# Project train/test sets onto the selected feature subset.
# NOTE(review): names X_1/X_2 are opaque — X_train_sel/X_test_sel would be clearer.
X_1 = clf_rand_lr.transform(X_train)
X_2 = clf_rand_lr.transform(X_test)

In [55]:
X_1.shape, X_2.shape, X_train.shape


Out[55]:
((3200, 156), (800, 156), (3200, 1024))

In [63]:


In [ ]: