In [1]:
import sys
import math
import numpy as np
import pandas as pd
from util.reader import reader
from numpy.random import RandomState
from scipy.ndimage import convolve
# note: sklearn.cross_validation requires scikit-learn < 0.20; later releases moved it to model_selection
from sklearn import linear_model, datasets, metrics, cross_validation, preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.neural_network import BernoulliRBM
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Binarizer
dataset = '/Users/jordansilva/Documents/Jordan/Mestrado/Lorien/code/output/vector.rbm'
r = reader(dataset)
data, labels, data_full = r.load(size=sys.maxsize, progress=False)
print('loaded')
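A quick sanity check on what the reader returned; this is a minimal sketch that assumes `data` is a rectangular list of numeric vectors and `labels` holds one scalar label per vector, which the custom `reader` is not guaranteed to provide:

X_raw = np.asarray(data, 'float32')
print(X_raw.shape, len(labels))
print(pd.Series(labels).value_counts())  # label distribution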
In [2]:
import pandas as pd
df = pd.read_csv('reviews.csv', header=None)
#df.head()
index = ('review, uid, bid, active_life, arts_entertainment, automotive, beauty_spas, '
         'education, event_planning_services, financial_services, food, health_medical, '
         'home_services, hotels_travel, local_flavor, local_services, mass_media, '
         'nightlife, pets, professional_services, public_services_government, '
         'real_estate, religious_organizations, restaurants, shopping, weather, '
         'distance, daysOfWeek, isWeekend, month, season').split(', ')
df.columns = index
labels3 = df.review.map(float)
data3 = df.loc[:,index[1:]]
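Because the CSV is read with `header=None` and the 31 names are assigned by hand, it is worth confirming the widths agree before relying on the columns; a minimal check:

assert df.shape[1] == len(index), (df.shape, len(index))
df.head()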
In [3]:
# data2: vectors without context (first 24 columns only)
data2 = []
for d in data:
    data2.append(d[:24])

# data4: vectors with the first two columns (presumably the uid/bid identifiers) dropped
data4 = []
for d in data:
    data4.append(d[2:])
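The slice bounds above are magic numbers; a one-line width check makes the layout explicit (the expected widths depend on how `reader` lays the vectors out, so treat the comment as an assumption):

print(len(data[0]), len(data2[0]), len(data4[0]))  # raw / context-free / no-id widths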
In [90]:
b = Binarizer(copy=True)
data4 = b.fit_transform(data4)
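`Binarizer` maps every value above its threshold (0.0 by default) to 1 and everything else to 0, which matches the binary visible units that `BernoulliRBM` expects. A toy illustration:

print(Binarizer(copy=True).fit_transform([[0.5, 0.0, -2.0], [3.0, 0.1, 0.0]]))
# [[ 1.  0.  0.]
#  [ 1.  1.  0.]]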
In [9]:
print('starting')
X = np.asarray(data2, 'float32')
y = np.asarray(labels3, 'float32')
N = len(y)
kf = cross_validation.KFold(N, n_folds=5)
fold = 1; mae = []; rmse = []
logistic = linear_model.LogisticRegression()
logistic.C = 6000.0
sgd = linear_model.SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
eta0=0.0, fit_intercept=True, l1_ratio=0.15,
learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
penalty='l2', power_t=0.5, random_state=None, shuffle=True,
verbose=0, warm_start=False)
prng = RandomState(1234567890)
rbm = BernoulliRBM(random_state=prng, verbose=1, batch_size=100, learning_rate=0.01, n_iter=20, n_components=256)
prng2 = RandomState()  # note: unused below; rbm2 also reuses prng
rbm2 = BernoulliRBM(random_state=prng, verbose=0, batch_size=100, learning_rate=0.08, n_iter=40, n_components=128)
rbm3 = BernoulliRBM(verbose=0, batch_size=10, learning_rate=0.08, n_iter=20, n_components=1)
b = Binarizer(copy=False)
classifier = Pipeline(steps=[('binarizer', b), ('rbm', rbm), ('rbm2', rbm2), ('sgd', sgd)])
#classifier = Pipeline(steps=[('binarizer', b), ('rbm', rbm), ('rbm2', rbm2), ('rbm3', rbm3), ('logistic', logistic)])
xtest = 0
ytest = 0
np.set_printoptions(threshold=np.inf)  # threshold='nan' breaks under Python 3 / newer NumPy
for train_index, test_index in kf:
    print("FOLD:", fold, "TRAIN:", len(train_index), "TEST:", len(test_index)); fold += 1
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]
    #logistic = linear_model.LogisticRegression()
    #logistic.C = 6000.0
    #classifier = Pipeline(steps=[('rbm', rbm), ('sgd', sgd)])
    #classifier = Pipeline(steps=[('rbm', rbm), ('rbm2', rbm2), ('sgd', sgd)])
    classifier.fit(X_train, y_train)
    # gibbs = rbm.gibbs(X_train)
    # np.savetxt('output/rbm-gibbs-fold' + str(fold) + '.txt', gibbs, delimiter=';')
    #print('rbm')
    #np.savetxt('output/rmb1-fold' + str(fold) + '.txt', rbm.components_, delimiter=';')
    #print('rbm2')
    #np.savetxt('output/rmb2-fold' + str(fold) + '.txt', rbm2.components_, delimiter=';')
    y_pred = classifier.predict(X_test)
    print('===============')
    print("SGD classifier on stacked-RBM features:\n%s\n" % metrics.classification_report(y_test, y_pred))
    #print(y_test)
    #np.savetxt('output/y_pred-fold' + str(fold) + '.txt', y_pred, delimiter=';')
    #np.savetxt('output/y_pred-result-fold' + str(fold) + '.txt', y_test, delimiter=';')
    mae.append(mean_absolute_error(y_test, y_pred))
    rmse.append(math.sqrt(mean_squared_error(y_test, y_pred)))
    #print(mae)
    #print(rmse)
print("MAE: ", sum(mae)/len(mae))
print("RMSE: ", sum(rmse)/len(rmse))
In [194]:
import random
# probe the trained pipeline with random binary vectors and collect the
# distinct predictions it can produce
pxx = []
for x in range(0, 100000):
    a57 = []
    for i in range(0, 57):
        a57.append(random.randint(0, 1))
    a57 = np.asarray(a57).reshape(1, -1)  # predict() expects a 2-D array
    #print(rbm2.transform(rbm.transform(a57)))
    predict = classifier.predict(a57)[0]
    if predict not in pxx:
        pxx.append(predict)
        print(predict)
In [4]:
print('starting')
X = np.asarray(data, 'float32')
y = np.asarray(labels3, 'float32')
N = len(y)
kf = cross_validation.KFold(N, n_folds=5)
fold = 1; rmse = []; mae = []
#model = RandomForestRegressor(n_estimators=100, n_jobs=4)
model = RandomForestClassifier(n_estimators=60, n_jobs=4, warm_start=True)
for train_index, test_index in kf:
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]
    print("FOLD:", fold, "TRAIN:", len(X_train), "TEST:", len(y_test)); fold += 1
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    #print(y_pred)
    #print(y_test)
    mae.append(mean_absolute_error(y_test, y_pred))
    rmse.append(math.sqrt(mean_squared_error(y_test, y_pred)))
    # warm_start=True keeps the trees already grown, so doubling n_estimators
    # makes each fold add as many new trees as the forest already has
    model.n_estimators += model.n_estimators
print("RMSE: ", sum(rmse)/len(rmse))
print("MAE: ", sum(mae)/len(mae))
In [9]:
sorted(zip(model.feature_importances_,index[1:]),reverse=True)
Out[9]:
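The same ranking reads more compactly as a pandas Series; an equivalent sketch (note that if the model was fitted on the full `data` vectors rather than `data3`, the importance vector may be longer than `index[1:]` and `zip` silently truncates it):

pd.Series(model.feature_importances_, index=index[1:]).sort_values(ascending=False).head(10)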