In [1]:
%pylab inline
import copy
import numpy as np
import pandas as pd
import sys
import os
import re
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD, RMSprop
from keras.layers.normalization import BatchNormalization
from keras.layers.wrappers import TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM, GRU
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from gensim.models import word2vec
First, load the STL-10 training and test data:
In [2]:
dir_in = "../../../class_data/stl10/"
X_train = np.genfromtxt(dir_in + 'X_train_new.csv', delimiter=',')
Y_train = np.genfromtxt(dir_in + 'Y_train.csv', delimiter=',')
X_test = np.genfromtxt(dir_in + 'X_test_new.csv', delimiter=',')
Y_test = np.genfromtxt(dir_in + 'Y_test.csv', delimiter=',')
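A quick check of the dimensions (the label matrices should be one-hot with 10 columns; the feature width depends on the preprocessing):
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)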
And construct a flattened (integer-class) version of the one-hot labels, for use with the linear models:
In [3]:
Y_train_flat = np.zeros(Y_train.shape[0])
Y_test_flat = np.zeros(Y_test.shape[0])
for i in range(10):
    Y_train_flat[Y_train[:,i] == 1] = i
    Y_test_flat[Y_test[:,i] == 1] = i
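Equivalently, since each row of the label matrix is one-hot, numpy's argmax recovers the class index in a single call:
# same result as the loop above: the column index of the 1 in each row
Y_train_flat = Y_train.argmax(axis=1)
Y_test_flat = Y_test.argmax(axis=1)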
Now fit a dense neural network with three hidden layers, each using ReLU activations, batch normalization, and dropout:
In [4]:
model = Sequential()
model.add(Dense(1024, input_shape = (X_train.shape[1],)))
model.add(Activation("relu"))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(1024))
model.add(Activation("relu"))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(1024))
model.add(Activation("relu"))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(10))
model.add(Activation('softmax'))
rms = RMSprop()
model.compile(loss='categorical_crossentropy', optimizer=rms,
              metrics=['accuracy'])
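Before fitting, it can be helpful to print the architecture and parameter counts:
model.summary()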
In [5]:
model.fit(X_train, Y_train, batch_size=32, nb_epoch=5, verbose=1)
Out[5]:
In [6]:
test_rate = model.evaluate(X_test, Y_test)[1]
print("Test classification rate %0.05f" % test_rate)
For comparison, fit a linear support vector machine on the same features:
In [7]:
svc_obj = SVC(kernel='linear', C=1)
svc_obj.fit(X_train, Y_train_flat)
pred = svc_obj.predict(X_test)
print(pd.crosstab(pred, Y_test_flat))
c_rate = sum(pred == Y_test_flat) / len(pred)
print("Test classification rate %0.05f" % c_rate)
And an L1-penalized logistic regression:
In [8]:
lr = LogisticRegression(penalty = 'l1')
lr.fit(X_train, Y_train_flat)
pred = lr.predict(X_test)
print(pd.crosstab(pred, Y_test_flat))
c_rate = sum(pred == Y_test_flat) / len(pred)
print("Test classification rate %0.05f" % c_rate)
Next, load the Chicago crime data:
In [9]:
dir_in = "../../../class_data/chi_python/"
X_train = np.genfromtxt(dir_in + 'chiCrimeMat_X_train.csv', delimiter=',')
Y_train = np.genfromtxt(dir_in + 'chiCrimeMat_Y_train.csv', delimiter=',')
X_test = np.genfromtxt(dir_in + 'chiCrimeMat_X_test.csv', delimiter=',')
Y_test = np.genfromtxt(dir_in + 'chiCrimeMat_Y_test.csv', delimiter=',')
Now, build a neural network for this model:
In [10]:
model = Sequential()
model.add(Dense(1024, input_shape = (434,)))
model.add(Activation("relu"))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(1024))
model.add(Activation("relu"))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(1024))
model.add(Activation("relu"))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(5))
model.add(Activation('softmax'))
rms = RMSprop()
model.compile(loss='categorical_crossentropy', optimizer=rms,
              metrics=['accuracy'])
In [11]:
# downsample, if need be:
num_sample = X_train.shape[0]
model.fit(X_train[:num_sample], Y_train[:num_sample], batch_size=32,
          nb_epoch=10, verbose=1)
Out[11]:
In [12]:
test_rate = model.evaluate(X_test, Y_test)[1]
print("Test classification rate %0.05f" % test_rate)
Finally, turn to the IMDB sentiment data, this time initializing the embedding layer with pre-trained word2vec vectors:
In [13]:
path = "../../../class_data/aclImdb/"
ff = [path + "train/pos/" + x for x in os.listdir(path + "train/pos")] + \
     [path + "train/neg/" + x for x in os.listdir(path + "train/neg")] + \
     [path + "test/pos/" + x for x in os.listdir(path + "test/pos")] + \
     [path + "test/neg/" + x for x in os.listdir(path + "test/neg")]
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
input_label = ([1] * 12500 + [0] * 12500) * 2
input_text = []
for f in ff:
    with open(f) as fin:
        input_text += [remove_tags(" ".join(fin.readlines()))]
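As a quick check, the regex strips anything that looks like an HTML tag (a hypothetical review snippet):
remove_tags("One of the <b>best</b> films.<br /><br />Recommended.")
# -> 'One of the best films.Recommended.'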
I'll fit a significantly larger vocabulary this time, as the embeddings are essentially given to us.
In [14]:
num_words = 5000
max_len = 400
tok = Tokenizer(num_words)
tok.fit_on_texts(input_text[:25000])
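The tokenizer now maps each of the 5,000 most frequent words to an integer rank, silently dropping rarer words. For example (a hypothetical sentence):
tok.texts_to_sequences(["this movie was great"])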
In [15]:
X_train = tok.texts_to_sequences(input_text[:25000])
X_test = tok.texts_to_sequences(input_text[25000:])
y_train = input_label[:25000]
y_test = input_label[25000:]
X_train = sequence.pad_sequences(X_train, maxlen=max_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_len)
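After padding and truncating, every review is exactly max_len tokens long:
print(X_train.shape, X_test.shape)  # expect (25000, 400) for each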
In [16]:
# collect the retained words, ordered by frequency rank (1 = most common)
words = []
for i in range(num_words):
    words += [key for key, value in tok.word_index.items() if value == i + 1]
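This loop rescans the whole word index once per rank; inverting the mapping first does the same thing in a single pass (a minimal alternative sketch, assuming the corpus has at least num_words distinct words):
# invert word -> rank, then read off ranks 1..num_words in order
index_word = {rank: word for word, rank in tok.word_index.items()}
words = [index_word[i + 1] for i in range(num_words)]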
Load the pre-trained GoogleNews word2vec vectors:
In [17]:
loc = "/Users/taylor/files/word2vec_python/GoogleNews-vectors-negative300.bin"
w2v = word2vec.Word2Vec.load_word2vec_format(loc, binary=True)
Build the 5000-by-300 embedding weight matrix, leaving rows of zeros for words missing from word2vec:
In [18]:
weights = np.zeros((num_words, 300))
for idx, w in enumerate(words):
    try:
        weights[idx,:] = w2v[w]
    except KeyError:
        pass
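It is worth checking how much of the vocabulary word2vec actually covers; rows left entirely zero correspond to missing words (an approximate check, since a genuine all-zero vector would also be counted):
n_missing = int((np.abs(weights).sum(axis=1) == 0).sum())
print("%d of %d words missing from word2vec" % (n_missing, num_words))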
Now build a GRU model on top of the embedding layer, initializing it with the word2vec weights and freezing it so they are not updated during training:
In [19]:
model = Sequential()
model.add(Embedding(num_words, 300, input_length=max_len))
model.add(Dropout(0.5))
model.add(GRU(16,activation='relu'))
model.add(Dense(128))
model.add(Dropout(0.5))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.layers[0].set_weights([weights])
model.layers[0].trainable = False
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
In [22]:
model.fit(X_train, y_train, batch_size=32, nb_epoch=5, verbose=1,
          validation_data=(X_test, y_test))
Out[22]:
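Once trained, new reviews can be scored through the same tokenize-and-pad pipeline (a sketch with hypothetical sentences):
new_text = ["a wonderful, moving film", "dull and painfully slow"]
new_seq = sequence.pad_sequences(tok.texts_to_sequences(new_text), maxlen=max_len)
model.predict(new_seq)  # predicted probability that each review is positive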