In [1]:
import sys
import math
import numpy as np
import pandas as pd
from util.reader import reader
from numpy.random import RandomState
from scipy.ndimage import convolve
from sklearn import linear_model, datasets, metrics, cross_validation, preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.neural_network import BernoulliRBM
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Binarizer

dataset = '/Users/jordansilva/Documents/Jordan/Mestrado/Lorien/code/output/vector.rbm'

r = reader(dataset)
data, labels, data_full = r.load(size=sys.maxsize, progress=False)
print('loaded')


loaded

In [2]:
import pandas as pd
df = pd.read_csv('reviews.csv',header=None)
#df.head()

index = 'review, uid, bid, active_life, arts_entertainment, automotive, beauty_spas, education, event_planning_services, financial_services, food, health_medical, home_services, hotels_travel, local_flavor, local_services, mass_media, nightlife, pets, professional_services, public_services_government, real_estate, religious_organizations, restaurants, shopping, weather, distance, daysOfWeek, isWeekend, month, season'.split(', ')
df.columns = index

labels3 = df.review.map(float)
data3 = df.loc[:,index[1:]]

In [3]:
#data without context
data2 = []
for d in data:
    data2.append(d[:24])

data4 = []
for d in data:
    data4.append(d[2:])


[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]

In [90]:
b = Binarizer(copy=True)
data4 = b.fit_transform(data4)


[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0
 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]

In [9]:
print('starting')
X = np.asarray(data2, 'float32')
y = np.asarray(labels3, 'float32')
N = len(y)
kf = cross_validation.KFold(N, n_folds=5)
fold = 1 ; mae = []; rmse = [];

logistic = linear_model.LogisticRegression()
logistic.C = 6000.0

sgd = linear_model.SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
        eta0=0.0, fit_intercept=True, l1_ratio=0.15,
        learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
        penalty='l2', power_t=0.5, random_state=None, shuffle=True,
        verbose=0, warm_start=False)

prng = RandomState(1234567890)
rbm = BernoulliRBM(random_state=prng, verbose=1, batch_size=100, learning_rate=0.01, n_iter=20, n_components=256)

prng2 = RandomState()
rbm2 = BernoulliRBM(random_state=prng, verbose=0, batch_size=100, learning_rate=0.08, n_iter=40, n_components=128)

rbm3 = BernoulliRBM(verbose=0, batch_size=10, learning_rate=0.08, n_iter=20, n_components=1)

b = Binarizer(copy=False)

classifier = Pipeline(steps=[('binarizer', b), ('rbm', rbm), ('rbm2', rbm2), ('sgd', sgd)])
#classifier = Pipeline(steps=[('binarizer', b), ('rbm', rbm), ('rbm2', rbm2), ('rbm3', rbm3), ('logistic', logistic)])
xtest = 0
ytest = 0
np.set_printoptions(threshold='nan')
for train_index, test_index in kf:
    print("FOLD:",fold,"TRAIN:", len(train_index), "TEST:", len(test_index)); fold+=1
    X_train = X[train_index]
    y_train = y[train_index]

    X_test = X[test_index]
    y_test = y[test_index]

    #logistic = linear_model.LogisticRegression()
    #logistic.C = 6000.0

    #classifier = Pipeline(steps=[('rbm', rbm), ('sgd', sgd)])
    #classifier = Pipeline(steps=[('rbm', rbm), ('rbm2', rbm2), ('sgd', sgd)])

    classifier.fit(X_train, y_train)
#    gibbs = rbm.gibbs(X_train)
#    np.savetxt('output/rbm-gibbs-fold' + str(fold) + '.txt', gibbs, delimiter=';')
    #print 'rbm'
    #np.savetxt('output/rmb1-fold' + str(fold) + '.txt', rbm.components_, delimiter=';')
    #print 'rbm2'
    #np.savetxt('output/rmb2-fold' + str(fold) + '.txt', rbm2.components_, delimiter=';')

    y_pred = classifier.predict(X_test)    
    print '==============='
    print("Logistic regression using RBM features:\n%s\n" % (metrics.classification_report(y_test, y_pred)))
    #print y_test
    #np.savetxt('output/y_pred-fold' + str(fold) + '.txt', y_pred, delimiter=';')
    #np.savetxt('output/y_pred-result-fold' + str(fold) + '.txt', y_test, delimiter=';')
    mae.append(mean_absolute_error(y_test,y_pred))
    rmse.append(math.sqrt(mean_squared_error(y_test,y_pred)))

    #print mae
    #print rmse
    
print("MAE: ", sum(mae)/len(mae))
print("RMSE: ", sum(rmse)/len(rmse))


starting
('FOLD:', 1, 'TRAIN:', 251643, 'TEST:', 62911)
[BernoulliRBM] Iteration 1, pseudo-likelihood = -3.08, time = 9.20s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -3.04, time = 17.93s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -3.08, time = 17.65s
[BernoulliRBM] Iteration 4, pseudo-likelihood = -3.02, time = 16.00s
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-9-bd6835da2fc4> in <module>()
     44     #classifier = Pipeline(steps=[('rbm', rbm), ('rbm2', rbm2), ('sgd', sgd)])
     45 
---> 46     classifier.fit(X_train, y_train)
     47 #    gibbs = rbm.gibbs(X_train)
     48 #    np.savetxt('output/rbm-gibbs-fold' + str(fold) + '.txt', gibbs, delimiter=';')

/usr/local/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit(self, X, y, **fit_params)
    138             the pipeline.
    139         """
--> 140         Xt, fit_params = self._pre_transform(X, y, **fit_params)
    141         self.steps[-1][-1].fit(Xt, y, **fit_params)
    142         return self

/usr/local/lib/python2.7/site-packages/sklearn/pipeline.pyc in _pre_transform(self, X, y, **fit_params)
    119         for name, transform in self.steps[:-1]:
    120             if hasattr(transform, "fit_transform"):
--> 121                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
    122             else:
    123                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

/usr/local/lib/python2.7/site-packages/sklearn/base.pyc in fit_transform(self, X, y, **fit_params)
    434         else:
    435             # fit method of arity 2 (supervised transformation)
--> 436             return self.fit(X, y, **fit_params).transform(X)
    437 
    438 

/usr/local/lib/python2.7/site-packages/sklearn/neural_network/rbm.pyc in fit(self, X, y)
    351         for iteration in xrange(1, self.n_iter + 1):
    352             for batch_slice in batch_slices:
--> 353                 self._fit(X[batch_slice], rng)
    354 
    355             if verbose:

/usr/local/lib/python2.7/site-packages/sklearn/neural_network/rbm.pyc in _fit(self, v_pos, rng)
    268         h_pos = self._mean_hiddens(v_pos)
    269         v_neg = self._sample_visibles(self.h_samples_, rng)
--> 270         h_neg = self._mean_hiddens(v_neg)
    271 
    272         lr = float(self.learning_rate) / v_pos.shape[0]

/usr/local/lib/python2.7/site-packages/sklearn/neural_network/rbm.pyc in _mean_hiddens(self, v)
    138         p = safe_sparse_dot(v, self.components_.T)
    139         p += self.intercept_hidden_
--> 140         return expit(p, out=p)
    141 
    142     def _sample_hiddens(self, v, rng):

KeyboardInterrupt: 

In [194]:
import random
pxx = []
for x in range(0,100000):
    a57 = []
    for i in range(0,57):
        a57.append(random.randint(0,1))
    a57 = np.asarray(a57)
    #print rbm2.transform(rbm.transform(a57))
    predict = classifier.predict(a57)
    if predict not in pxx:
        pxx.append(predict)
        print predict


[ 5.]
[ 2.]
[ 4.]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-194-849754dd1a1f> in <module>()
      7     a57 = np.asarray(a57)
      8     #print rbm2.transform(rbm.transform(a57))
----> 9     predict = classifier.predict(a57)
     10     if predict not in pxx:
     11         pxx.append(predict)

/usr/local/lib/python2.7/site-packages/sklearn/utils/metaestimators.pyc in <lambda>(*args, **kwargs)
     35             self.get_attribute(obj)
     36         # lambda, but not partial, allows help() to work with update_wrapper
---> 37         out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
     38         # update the docstring of the returned function
     39         update_wrapper(out, self.fn)

/usr/local/lib/python2.7/site-packages/sklearn/pipeline.pyc in predict(self, X)
    177         Xt = X
    178         for name, transform in self.steps[:-1]:
--> 179             Xt = transform.transform(Xt)
    180         return self.steps[-1][-1].predict(Xt)
    181 

/usr/local/lib/python2.7/site-packages/sklearn/preprocessing/data.pyc in transform(self, X, y, copy)
    791         """
    792         copy = copy if copy is not None else self.copy
--> 793         return binarize(X, threshold=self.threshold, copy=copy)
    794 
    795 

/usr/local/lib/python2.7/site-packages/sklearn/preprocessing/data.pyc in binarize(X, threshold, copy)
    715     :class:`sklearn.pipeline.Pipeline`)
    716     """
--> 717     X = check_array(X, accept_sparse=['csr', 'csc'], copy=copy)
    718     if sparse.issparse(X):
    719         if threshold < 0:

/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features)
    350                              array.ndim)
    351         if force_all_finite:
--> 352             _assert_all_finite(array)
    353 
    354     shape_repr = _shape_repr(array.shape)

/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.pyc in _assert_all_finite(X)
     43 def _assert_all_finite(X):
     44     """Like assert_all_finite, but only for ndarray."""
---> 45     X = np.asanyarray(X)
     46     # First try an O(n) time, O(1) space solution for the common case that
     47     # everything is finite; fall back to O(n) space np.isfinite to prevent

/usr/local/lib/python2.7/site-packages/numpy/core/numeric.pyc in asanyarray(a, dtype, order)
    512 
    513     """
--> 514     return array(a, dtype, copy=False, order=order, subok=True)
    515 
    516 def ascontiguousarray(a, dtype=None):

KeyboardInterrupt: 

In [4]:
print('starting')
X = np.asarray(data, 'float32')
y = np.asarray(labels3, 'float32')
N = len(y)
kf = cross_validation.KFold(N, n_folds=5)
fold = 1 ; rmse = []; mae = [];
#model = RandomForestRegressor(n_estimators=100, n_jobs=4) # n_jobs=4
model = RandomForestClassifier(n_estimators=60, n_jobs=4, warm_start=True) # n_jobs=4    
for train_index, test_index in kf:
    X_train = X[train_index]
    y_train = y[train_index]

    X_test = X[test_index]
    y_test = y[test_index]

    print("FOLD:",fold,"TRAIN:", len(X_train), "TEST:", len(y_test)); fold+=1
    model.fit(X_train,y_train)
    

    y_pred = model.predict(X_test)
    #print y_pred
    #print y_test
    mae.append(mean_absolute_error(y_test,y_pred))
    rmse.append(math.sqrt(mean_squared_error(y_test,y_pred)))
    
    model.n_estimators += model.n_estimators 

print("RMSE: ", sum(rmse)/len(rmse))
print("MAE: ", sum(mae)/len(mae))


starting
('FOLD:', 1, 'TRAIN:', 251643, 'TEST:', 62911)
('FOLD:', 2, 'TRAIN:', 251643, 'TEST:', 62911)
('FOLD:', 3, 'TRAIN:', 251643, 'TEST:', 62911)
('FOLD:', 4, 'TRAIN:', 251643, 'TEST:', 62911)
('FOLD:', 5, 'TRAIN:', 251644, 'TEST:', 62910)
('RMSE: ', 1.0366112143593573)
('MAE: ', 0.4950845718383789)

In [9]:
sorted(zip(model.feature_importances_,index[1:]),reverse=True)


Out[9]:
[(0.36288522843037246, 'bid'),
 (0.35236295541624507, 'uid'),
 (0.0097089792477287155, 'weather'),
 (0.0096741798840783641, 'nightlife'),
 (0.0096544773190209432, 'food'),
 (0.0091768878306588975, 'month'),
 (0.0084236673352251958, 'isWeekend'),
 (0.0072587134769545982, 'restaurants'),
 (0.0063720867336881891, 'arts_entertainment'),
 (0.0048327478975741193, 'shopping'),
 (0.0034990593324511435, 'event_planning_services'),
 (0.0033083079672388744, 'active_life'),
 (0.0029843261509949958, 'beauty_spas'),
 (0.0026272468252846928, 'daysOfWeek'),
 (0.0024822476770182166, 'hotels_travel'),
 (0.0020837981928097465, 'automotive'),
 (0.0020175237374475026, 'health_medical'),
 (0.0018861845469913584, 'home_services'),
 (0.0015102695526123839, 'local_services'),
 (0.0012498377103550124, 'season'),
 (0.0012048223762613735, 'pets'),
 (0.00116570690528272, 'public_services_government'),
 (0.00084231461283984759, 'professional_services'),
 (0.00079295913971628746, 'education'),
 (0.00064536965313162975, 'distance'),
 (0.00063265705065798867, 'local_flavor'),
 (0.00054800441027946691, 'financial_services'),
 (0.00049751374259131135, 'real_estate'),
 (0.00023733794672887407, 'religious_organizations'),
 (8.2756835444395418e-05, 'mass_media')]