In [15]:
import csv
import os

import cv2
import matplotlib.image as mpimg
import numpy as np
import pylab as pl
from PIL import Image
from scipy.stats import sem
from sklearn import datasets
from sklearn import svm, metrics
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
#from skimage.color import rgb2gray
In [16]:
import re
def tryint(s):
    """Return ``int(s)`` when ``s`` can be converted, otherwise ``s`` unchanged.

    Only the errors that a failed ``int()`` conversion raises are caught, so
    ``KeyboardInterrupt`` / ``SystemExit`` are no longer silently swallowed.
    """
    try:
        return int(s)
    except (TypeError, ValueError, OverflowError):
        return s
def alphanum_key(s):
    """Split a string into alternating text and number chunks for sorting.

    Runs of ASCII digits become ints (via ``tryint``); everything else stays
    a string, e.g. "z23a" -> ["z", 23, "a"].
    """
    chunks = re.split('([0-9]+)', s)
    return list(map(tryint, chunks))
In [17]:
def myrgb2gray(rgb):
    """Reduce an image array to grayscale with a weighted sum of its channels.

    Only the first three entries along the last axis are used, so any extra
    channel (e.g. a fourth one) is ignored.
    """
    channel_weights = [0.2989, 0.587, 0.114]
    colour_part = rgb[..., :3]
    return np.dot(colour_part, channel_weights)
# Note: skimage's rgb2gray returns normalized data, unlike the other grayscale conversion methods.
def readDataGray(path):
    """Load every ``.png`` file in ``path`` as a flattened grayscale vector.

    Files are visited in natural (numerically increasing) filename order,
    because that is the order needed to map rows onto the training labels.

    Parameters
    ----------
    path : str
        Directory containing the PNG images.

    Returns
    -------
    numpy.ndarray
        ``float32`` array holding one 1024-element row per image.
    """
    data = []
    # List the directory directly rather than os.chdir(path): changing the
    # process-wide working directory is a hidden side effect that silently
    # breaks any relative path used later in the notebook.
    # The files need to be read in numerically increasing order; that is the
    # order needed for label mapping (otherwise some sort of dict is required).
    for fileName in sorted(os.listdir(path), key=alphanum_key):
        if fileName.endswith(".png"):
            fullPath = os.path.join(path, fileName)
            # PNG data is read as floats (per the original author's note).
            img = mpimg.imread(fullPath)
            # Each image must contain exactly 1024 pixels or reshape raises.
            data.append(myrgb2gray(img).reshape(1024,))
            # Alternatives noted by the original author: OpenCV
            # (cv2.imread + cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)) or
            # skimage.color.rgb2gray, which returns a normalized array.
    data = np.asarray(data, dtype='float32')
    return data
def readTrainingLabel(path):
    """Read the second column of a CSV label file into a ``uint8`` array.

    The first row is treated as the ``id,label`` header and skipped. Blank
    rows are ignored instead of raising ``IndexError``.

    Parameters
    ----------
    path : str
        Path to the CSV file.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array with one label per data row, in file order.
    """
    # ``open`` inside ``with`` replaces the Python-2-only ``file(path)`` call
    # and guarantees the handle is closed.
    with open(path) as labelFile:
        rows = csv.reader(labelFile)
        next(rows, None)  # skip the first row: id,label
        label = [row[1] for row in rows if row]
    return np.asarray(label, dtype='uint8')
def display_scores(params, scores, append_star=False):
    """Format the mean score +/- std error for params.

    Relies on ``sem`` (``scipy.stats.sem``) being imported at file level; the
    original notebook used it without importing it.

    Parameters
    ----------
    params : dict
        Parameter names and values, rendered as ``name=value`` pairs.
    scores : sequence of float
        Scores to summarise.
    append_star : bool, optional
        When true, `` *`` is appended to the line.

    Returns
    -------
    str
        ``"<params>:\\t<mean> (+/-<sem>)"`` with three decimals each.
    """
    # Use a separate name so the ``params`` argument is not rebound.
    formatted_params = ", ".join("{0}={1}".format(k, v)
                                 for k, v in params.items())
    line = "{0}:\t{1:.3f} (+/-{2:.3f})".format(
        formatted_params, np.mean(scores), sem(scores))
    if append_star:
        line += " *"
    return line
def display_grid_scores(grid_scores, top=None):
    """Helper function to print a report on a grid of scores.

    Entries are sorted by mean score, best first, and printed one per line
    through ``display_scores``. Relies on ``sem`` (``scipy.stats.sem``) being
    imported at file level.

    Parameters
    ----------
    grid_scores : iterable of (params, mean_score, scores)
        One 3-tuple per evaluated parameter setting.
    top : int or None, optional
        When given, only the ``top`` best entries are reported.

    Returns
    -------
    None
    """
    grid_scores = sorted(grid_scores, key=lambda x: x[1], reverse=True)
    if top is not None:
        grid_scores = grid_scores[:top]

    # Nothing to report (empty grid or top=0); the original raised IndexError
    # on grid_scores[0] here.
    if not grid_scores:
        return

    # Compute a threshold for starring models whose stderr overlaps the best
    # model's: two standard errors below the best mean.
    _, best_mean, best_scores = grid_scores[0]
    threshold = best_mean - 2 * sem(best_scores)

    for params, mean_score, scores in grid_scores:
        append_star = mean_score + 2 * sem(scores) > threshold
        print(display_scores(params, scores, append_star=append_star))
def writePrediction(predictions, outFile):
    """Write each prediction on its own line to ``outFile``.

    Parameters
    ----------
    predictions : iterable
        Values to write; each is converted with ``str``.
    outFile : str
        Destination path; an existing file is overwritten.
    """
    # ``with`` closes (and therefore flushes) the file; the original never
    # closed the handle. ``join`` avoids repeated string concatenation.
    with open(outFile, "w") as dataFile:
        dataFile.write("".join(str(res) + "\n" for res in predictions))
In [7]:
X_10k = readDataGray("/home/bakuda/ageekrepo/kaggle/object_recognition_in_images/train-10000/")
y_10k = readTrainingLabel("/home/bakuda/ageekrepo/kaggle/object_recognition_in_images/trainLabels_int.txt")
In [23]:
X_4k = X_10k[:4000]
y_4k = y_10k[:4000]
In [10]:
digits = datasets.load_digits()
In [12]:
X_digits = digits.data
y_digits = digits.target
In [13]:
X_digits.shape, y_digits.shape
Out[13]:
In [25]:
X_train, X_test, y_train,y_test = train_test_split(X_4k, y_4k, test_size=.2)
In [28]:
X_train.shape, y_test.shape
Out[28]:
In [26]:
clf_lr = LogisticRegression().fit(X_train, y_train)
print clf_lr.score(X_train, y_train)
print clf_lr.score(X_test, y_test)
In [29]:
clf_lr
Out[29]:
In [47]:
from sklearn.linear_model import SGDClassifier
# Logistic loss with an elastic-net penalty, trained for 30 passes.
# random_state pins the estimator's randomness so scores are repeatable.
clf_sgd = SGDClassifier(loss='log', penalty='elasticnet', n_iter=30,
                        random_state=0).fit(X_train, y_train)
# print(...) with a single argument behaves identically on Python 2 and 3.
print(clf_sgd.score(X_train, y_train))  # accuracy on the training split
print(clf_sgd.score(X_test, y_test))    # accuracy on the held-out split
In [50]:
from sklearn.linear_model import RandomizedLogisticRegression
clf_rand_lr = RandomizedLogisticRegression().fit(X_train, y_train)
In [52]:
X_1 = clf_rand_lr.transform(X_train)
X_2 = clf_rand_lr.transform(X_test)
In [55]:
X_1.shape, X_2.shape, X_train.shape
Out[55]:
In [63]:
In [ ]: