In [3]:
import numpy as np
import pysal
import scipy.sparse as sp
import itertools as iter
from scipy.stats import f, chisqprob
import numpy.linalg as la
import pandas as pd
from datetime import datetime as dt
import matplotlib.pyplot as plt
%pylab inline
In [4]:
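# OLD: superseded implementation of spcategorical2, kept for reference only.
# It is disabled by wrapping it in the string literal below.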
"""
def spcategorical2(n_cat_ids):
'''
Returns a dummy matrix given an array of categorical variables.
Parameters
----------
n_cat_ids : array
A 1d vector of the categorical labels for n observations.
Returns
--------
dummy : array
A sparse matrix of dummy (indicator/binary) variables for the
categorical data.
'''
if np.squeeze(n_cat_ids).ndim == 1:
cat_set = np.unique(n_cat_ids)
n = len(n_cat_ids)
C = len(cat_set)
row_map = dict((id, np.where(cat_set == id)[0]) for id in n_cat_ids)
indices = np.array([row_map[row] for row in n_cat_ids]).flatten()
indptr = np.zeros((n + 1, ), dtype=int)
indptr[:-1] = list(np.arange(n))
indptr[-1] = n
return sp.csr_matrix((np.ones(n), indices, indptr))
else:
raise IndexError("The index %s is not understood" % col)
"""
def spcategorical2(n_cat_ids):
    '''
    Returns a dummy matrix given an array of categorical variables.

    Each observation becomes one row holding a single 1.0 in the column of
    its category. Columns follow the sorted order of the distinct labels, so
    labels do not have to be the integers 0..C-1.

    Parameters
    ----------
    n_cat_ids : array
                A 1d vector of the categorical labels for n observations.
                Singleton dimensions are squeezed away first.

    Returns
    --------
    dummy     : array
                A sparse (CSR) n x C matrix of dummy (indicator/binary)
                variables for the categorical data, where C is the number
                of distinct labels.

    Raises
    ------
    IndexError
                If the input is not one-dimensional after squeezing.
    '''
    # atleast_1d keeps a single observation usable: squeezing it yields 0-d.
    labels = np.atleast_1d(np.squeeze(n_cat_ids))
    if labels.ndim != 1:
        raise IndexError("Expected a 1d vector of categorical labels, "
                         "got shape %s" % (np.shape(n_cat_ids),))
    # return_inverse gives, per observation, the position of its label in
    # the sorted unique labels -- exactly the CSR column index needed.
    cat_set, indices = np.unique(labels, return_inverse=True)
    n = labels.shape[0]
    # One stored value per row, so the row pointer is simply 0, 1, ..., n.
    indptr = np.arange(n + 1, dtype=int)
    return sp.csr_matrix((np.ones(n), indices, indptr),
                         shape=(n, len(cat_set)))
In [5]:
def spcategorical1(data):
    '''
    Returns a dummy matrix given an array of categorical variables.

    One sparse indicator row is built per distinct label; the rows are
    stacked once and transposed so that observations end up as rows and
    categories (in sorted label order) as columns.

    Parameters
    ----------
    data : array
           A 1d vector of the categorical variable. Singleton dimensions
           are squeezed away first.

    Returns
    --------
    dummy_matrix
           A sparse n x C matrix of dummy (indicator/binary) variables for
           the categorical data.

    Raises
    ------
    IndexError
           If the input is not one-dimensional after squeezing.
    '''
    # atleast_1d keeps a single observation usable: squeezing it yields 0-d.
    labels = np.atleast_1d(np.squeeze(data))
    if labels.ndim != 1:
        raise IndexError("Expected a 1d vector of categorical labels, "
                         "got shape %s" % (np.shape(data),))
    n = labels.shape[0]
    # The leading 0 x n block fixes the column count even with no categories.
    blocks = [sp.csr_matrix((0, n))]
    for category in np.unique(labels):
        indicator = (labels == category).astype(float).reshape(1, n)
        blocks.append(sp.csr_matrix(indicator))
    # Stack once: calling vstack inside the loop re-copies everything
    # accumulated so far on every iteration.
    return sp.vstack(blocks).T
In [6]:
def spcategorical1a(data):
    '''
    Returns a dummy matrix given an array of categorical variables.

    Fills a DOK matrix one category column at a time and converts the
    result to CSR. Columns follow the sorted order of the distinct labels.

    Parameters
    ----------
    data : array
           A 1d vector of the categorical variable. Singleton dimensions
           are squeezed away first.

    Returns
    --------
    dummy_matrix
           A sparse (CSR) n x C matrix of dummy (indicator/binary)
           variables for the categorical data.

    Raises
    ------
    IndexError
           If the input is not one-dimensional after squeezing.
    '''
    # atleast_1d keeps a single observation usable: squeezing it yields 0-d.
    labels = np.atleast_1d(np.squeeze(data))
    if labels.ndim != 1:
        raise IndexError("Expected a 1d vector of categorical labels, "
                         "got shape %s" % (np.shape(data),))
    categories = np.unique(labels)
    n = labels.shape[0]
    dummy = sp.dok_matrix((n, len(categories)))
    # Index columns by the label's position, not its value, so labels need
    # not be the integers 0..C-1.
    for column, category in enumerate(categories):
        indicator = (labels == category).astype(float)
        dummy[:, column] = indicator.reshape((n, 1))
    return dummy.tocsr()
In [7]:
def spcategorical1b(data):
    '''
    Returns a dummy matrix given an array of categorical variables.

    Fills a LIL matrix one category column at a time and converts the
    result to CSR. Columns follow the sorted order of the distinct labels.

    Parameters
    ----------
    data : array
           A 1d vector of the categorical variable. Singleton dimensions
           are squeezed away first.

    Returns
    --------
    dummy_matrix
           A sparse (CSR) n x C matrix of dummy (indicator/binary)
           variables for the categorical data.

    Raises
    ------
    IndexError
           If the input is not one-dimensional after squeezing.
    '''
    # atleast_1d keeps a single observation usable: squeezing it yields 0-d.
    labels = np.atleast_1d(np.squeeze(data))
    if labels.ndim != 1:
        raise IndexError("Expected a 1d vector of categorical labels, "
                         "got shape %s" % (np.shape(data),))
    categories = np.unique(labels)
    n = labels.shape[0]
    dummy = sp.lil_matrix((n, len(categories)))
    # Index columns by the label's position, not its value, so labels need
    # not be the integers 0..C-1.
    for column, category in enumerate(categories):
        indicator = (labels == category).astype(float)
        dummy[:, column] = indicator.reshape((n, 1))
    return dummy.tocsr()
In [8]:
# Sanity check: every alternative implementation must reproduce the dummy
# matrix of spcategorical1 on the labels 0..19 repeated 20 times.
n = 20
o = np.tile(np.arange(n), n)
# Build the reference once instead of once per comparison.
reference = spcategorical1(o).toarray()
# print(...) with a single argument behaves the same on Python 2 and 3.
print(np.allclose(reference, spcategorical2(o).toarray()))
print(np.allclose(reference, spcategorical1a(o).toarray()))
print(np.allclose(reference, spcategorical1b(o).toarray()))
In [9]:
# Time spcategorical1 on n*n observations spread evenly over n categories.
spcat1 = []
for n in np.arange(25, 250, 25):
    o = np.tile(np.arange(n), n)
    s = dt.now()
    # Pass ``o`` directly: wrapping it in np.array() copies the whole n*n
    # vector inside the timed window and inflates the measurement.
    a = spcategorical1(o)
    e = dt.now()
    spcat1.append((e - s).total_seconds())
In [ ]:
# Time spcategorical1a on n*n observations spread evenly over n categories.
spcat1a = []
for n in np.arange(25, 250, 25):
    o = np.tile(np.arange(n), n)
    s = dt.now()
    # Pass ``o`` directly: wrapping it in np.array() copies the whole n*n
    # vector inside the timed window and inflates the measurement.
    b = spcategorical1a(o)
    e = dt.now()
    spcat1a.append((e - s).total_seconds())
In [ ]:
# Time spcategorical1b on n*n observations spread evenly over n categories.
spcat1b = []
for n in np.arange(25, 250, 25):
    o = np.tile(np.arange(n), n)
    s = dt.now()
    # Pass ``o`` directly: wrapping it in np.array() copies the whole n*n
    # vector inside the timed window and inflates the measurement.
    b = spcategorical1b(o)
    e = dt.now()
    spcat1b.append((e - s).total_seconds())
In [ ]:
# Time spcategorical2 on n*n observations spread evenly over n categories.
spcat2 = []
for n in np.arange(25, 250, 25):
    o = np.tile(np.arange(n), n)
    s = dt.now()
    # Pass ``o`` directly: wrapping it in np.array() copies the whole n*n
    # vector inside the timed window, which is a large share of the cost
    # for an implementation this cheap.
    b = spcategorical2(o)
    e = dt.now()
    spcat2.append((e - s).total_seconds())
In [ ]:
# Show the elapsed times (in seconds) collected for spcategorical1.
spcat1
In [ ]:
# Show the elapsed times (in seconds) collected for spcategorical1a.
spcat1a
In [ ]:
# Show the elapsed times (in seconds) collected for spcategorical1b.
spcat1b
In [ ]:
# Show the elapsed times (in seconds) collected for spcategorical2.
spcat2
In [ ]:
# Plot the elapsed time of all four variants against the loop values n
# used by the timing cells (25, 50, ..., 225).
x = np.arange(25, 250, 25)
# One plot call per series; the labels are picked up by plt.legend().
plt.plot(x, spcat1, label='spcat1')
plt.plot(x, spcat1a, label='spcat1a')
plt.plot(x, spcat1b, label='spcat1b')
plt.plot(x, spcat2, label='spcat2')
plt.legend()
plt.title('Speed of Sparse Dummy Functions')
plt.xlabel('Sample Size')
plt.ylabel('Seconds')
In [ ]:
# Re-time spcategorical1 on larger inputs: n*n observations over n
# categories for n = 100, 200, ..., 900. This overwrites the earlier spcat1.
spcat1 = []
for n in np.arange(100, 1000, 100):
    o = np.tile(np.arange(n), n)
    s = dt.now()
    # Pass ``o`` directly: wrapping it in np.array() copies the whole n*n
    # vector inside the timed window and inflates the measurement.
    b = spcategorical1(o)
    e = dt.now()
    spcat1.append((e - s).total_seconds())
In [ ]:
# Re-time spcategorical2 on larger inputs: n*n observations over n
# categories for n = 100, 200, ..., 900. This overwrites the earlier spcat2.
spcat2 = []
for n in np.arange(100, 1000, 100):
    o = np.tile(np.arange(n), n)
    s = dt.now()
    # Pass ``o`` directly: wrapping it in np.array() copies the whole n*n
    # vector inside the timed window, which is a large share of the cost
    # for an implementation this cheap.
    b = spcategorical2(o)
    e = dt.now()
    spcat2.append((e - s).total_seconds())
In [ ]:
# Show the elapsed times (in seconds) collected for spcategorical1.
spcat1
In [ ]:
# Show the elapsed times (in seconds) collected for spcategorical2.
spcat2
In [ ]:
# Plot the elapsed time of spcategorical1 and spcategorical2 against the
# loop values n used by the larger timing runs (100, 200, ..., 900).
x = np.arange(100, 1000, 100)
# One plot call per series; the labels are picked up by plt.legend().
plt.plot(x, spcat1, label='spcat1')
plt.plot(x, spcat2, label='spcat2')
plt.legend()
plt.title('Speed of Sparse Dummy Functions')
plt.xlabel('Sample Size')
plt.ylabel('Seconds')
In [10]:
def concatenate_csc_matrices_by_columns(matrix1, matrix2):
    '''
    Concatenates two sparse matrices column-wise (matrix1 to the left of
    matrix2) by joining their CSC arrays directly.

    Parameters
    ----------
    matrix1 : sparse matrix
              Left block. Converted to CSC if it is in another format.
    matrix2 : sparse matrix
              Right block, with the same number of rows as matrix1.

    Returns
    --------
    stacked : csc_matrix
              Matrix of shape (rows, matrix1 columns + matrix2 columns).

    Raises
    ------
    ValueError
              If the two matrices do not have the same number of rows.
    '''
    # The array surgery below is only valid for column-compressed storage.
    left = sp.csc_matrix(matrix1)
    right = sp.csc_matrix(matrix2)
    if left.shape[0] != right.shape[0]:
        raise ValueError("Cannot concatenate by columns: row counts differ "
                         "(%d != %d)" % (left.shape[0], right.shape[0]))
    # Use only the entries the column pointers actually address.
    left_nnz = int(left.indptr[-1])
    right_nnz = int(right.indptr[-1])
    new_data = np.concatenate((left.data[:left_nnz], right.data[:right_nnz]))
    new_indices = np.concatenate((left.indices[:left_nnz],
                                  right.indices[:right_nnz]))
    # Shift the right block's column pointers past the left block's entries.
    # Its leading 0 is dropped because it coincides with left.indptr[-1].
    # Widen to int64 first so the shift cannot overflow a 32-bit pointer.
    shifted = right.indptr[1:].astype(np.int64) + left_nnz
    new_ind_ptr = np.concatenate((left.indptr, shifted))
    # Pass the shape explicitly: inferring it would drop trailing empty rows.
    shape = (left.shape[0], left.shape[1] + right.shape[1])
    return sp.csc_matrix((new_data, new_indices, new_ind_ptr), shape=shape)
def spcategorical2a(n_cat_ids, unique=None):
    '''
    Returns a dummy matrix given an array of categorical variables.

    Memory-lean variant: the stored values are int8 and the index arrays
    use 32-bit integers whenever the number of observations allows it.
    Columns follow the sorted order of the distinct labels, so labels do
    not have to be the integers 0..C-1.

    Parameters
    ----------
    n_cat_ids : array
                A 1d vector of the categorical labels for n observations.
                Singleton dimensions are squeezed away first.
    unique    : optional
                Accepted for interface compatibility; currently ignored.

    Returns
    --------
    dummy     : array
                A sparse (CSR) n x C matrix of int8 dummy
                (indicator/binary) variables for the categorical data.

    Raises
    ------
    IndexError
                If the input is not one-dimensional after squeezing.
    '''
    # atleast_1d keeps a single observation usable: squeezing it yields 0-d.
    labels = np.atleast_1d(np.squeeze(n_cat_ids))
    if labels.ndim != 1:
        raise IndexError("Expected a 1d vector of categorical labels, "
                         "got shape %s" % (np.shape(n_cat_ids),))
    # return_inverse gives, per observation, the position of its label in
    # the sorted unique labels -- exactly the CSR column index needed.
    cat_set, indices = np.unique(labels, return_inverse=True)
    n = labels.shape[0]
    # 32-bit indices halve the index memory; fall back to 64-bit only when
    # the row pointer (values up to n) would not fit.
    if n < np.iinfo(np.int32).max:
        idx_dtype = np.int32
    else:
        idx_dtype = np.int64
    # One stored value per row, so the row pointer is simply 0, 1, ..., n.
    indptr = np.arange(n + 1, dtype=idx_dtype)
    data = np.ones(n, dtype=np.int8)
    return sp.csr_matrix((data, indices.astype(idx_dtype), indptr),
                         shape=(n, len(cat_set)))
In [13]:
# Time spcategorical2 on 3500*3500 observations spread over 3500 categories.
n = 3500
# np.uint16 is spelled out: the bare name only resolves when %pylab has
# injected numpy's namespace into the notebook.
o = np.tile(np.arange(n, dtype=np.uint16), n)
s = dt.now()
b2 = spcategorical2(o)
e = dt.now()
# print(...) with a single argument behaves the same on Python 2 and 3.
print(e - s)
b2
Out[13]:
In [14]:
# Time spcategorical2a on 3500*3500 observations spread over 3500 categories.
n = 3500
# np.uint16 is spelled out: the bare name only resolves when %pylab has
# injected numpy's namespace into the notebook.
o = np.tile(np.arange(n, dtype=np.uint16), n)
s = dt.now()
b2a = spcategorical2a(o)
e = dt.now()
# print(...) with a single argument behaves the same on Python 2 and 3.
print(e - s)
b2a
Out[14]:
In [ ]: