In [2]:
from scipy.special import comb
import numpy as np
def how_many(max_n = 6, length = 16):
    """
    Count the binary vectors of a given length whose number of ones is allowed.

    Args:
        max_n (int or list of int): if an int, every count of ones from 1 up to
            and including max_n is allowed (the all-zeros vector is NOT counted);
            if a list, exactly the listed counts are allowed.
        length (int): number of components of each vector.

    Returns:
        int: the sum of C(length, i) over the allowed counts i.

    Raises:
        TypeError: if max_n is neither an int nor a list.
    """
    # `elif` is essential here: with two independent `if` statements an int
    # argument would fall through to the `else` branch and raise TypeError.
    if isinstance(max_n, int):
        indexes = range(1, max_n + 1)
    elif isinstance(max_n, list):
        indexes = max_n
    else:
        raise TypeError("how_many(x,y) requires x to be either list or int")
    # exact=True keeps the arithmetic in Python integers (no float rounding).
    return sum(comb(length, i, exact=True) for i in indexes)
def binary_vectors(length = 16, max_n = 6, one_hot = False):
    """
    Enumerate every binary vector of a given length with at most max_n ones.

    All integers from 0 to 2**length - 1 are expanded into their binary
    representation (most significant bit first) and the rows with more than
    max_n ones are dropped. The all-zeros vector is kept, so the result has
    sum(C(length, k) for k in 0..max_n) rows.

    Args:
        length (int): number of components of each vector.
        max_n (int): maximum number of ones allowed in a row.
        one_hot (bool): unused; kept only so existing calls keep working.
            No label array is produced by this function.

    Returns:
        ndarray of float, shape (n_rows, length): the selected binary vectors,
        ordered by the integer they encode.
    """
    # The matrix width follows `length` (it used to be hard-coded to 16, which
    # broke every call with a different length).
    rows_n = 2 ** length
    integers = np.arange(rows_n, dtype=np.int64)[:, None]
    # Shifting by length-1 ... 0 extracts the bits most-significant first,
    # i.e. the same column layout as np.binary_repr(i, length).
    shifts = np.arange(length - 1, -1, -1, dtype=np.int64)
    locations = ((integers >> shifts) & 1).astype(float)
    # Extract the vectors within range
    return locations[np.sum(locations, axis=1) <= max_n]
# The 50.000 inputs
# Repeat the matrix 4 times and cut the excess
# inputs = np.tile(locations,(4,1))
# inputs = inputs[0:50000,:]
# labels = np.sum(inputs, axis=1).reshape(50000,1)
# First we store the
# print("vector {} has label {}".format(inputs[2532,:], labels[2532,:]))
In [3]:
# def binary_vector_2(rows_n = [2,4,6,8,10], columns_n = 10):
# rows = how_many(rows_n, 10)
# index = 0
# locations = np.zeros((rows, columns_n))
# for i in rows_n:
# for bin_string in kbits(10,i):
# bin_array = np.fromstring(bin_string,'u1') - ord('0')
# locations[index,:] = bin_array
# index = index+1
# return locations
# inputs = binary_vector_2()
# labels = find_labels(inputs, one_hot=True)
# #dataset_ver = Dataset(inputs, labels)
# #pickle_test(dataset_ver)
# inputs.shape
In [4]:
import numpy as np
import itertools
from scipy.special import comb
def kbits(n, k):
    """ List every binary string of length n that contains exactly k ones.

    The strings follow the order in which itertools.combinations yields the
    positions of the ones.

    Args:
        n (int): string length (set cardinality)
        k (int): number of ones (subset cardinality)
    Returns:
        list of str: one string of '0'/'1' characters per combination
    """
    strings = []
    for ones in itertools.combinations(range(n), k):
        chosen = set(ones)
        # Write a '1' at every selected position and a '0' everywhere else.
        strings.append(''.join('1' if pos in chosen else '0' for pos in range(n)))
    return strings
def binary_vector_2(rows_n = [2,4,6,8,10], distribution=[45], columns_n = 10):
    """ Matrix of randomly sampled binary vectors, grouped by number of ones.

    For each entry k of rows_n, the matching number of vectors given by
    distribution is drawn uniformly WITH replacement from all vectors of
    length columns_n that contain exactly k ones. The clusters are stacked in
    the order of rows_n. Because of the replacement, passing
    distribution=comb(columns_n, rows_n) does not guarantee that every
    combination appears exactly once.

    Args:
        rows_n (int, array_like): number of ones for each cluster
        distribution (int, array_like): number of samples for each cluster;
            a single value is repeated for every cluster. Integer-valued
            floats (e.g. the output of scipy.special.comb) are accepted.
        columns_n (int): length of each binary vector
    Returns:
        ndarray of float, shape (sum(distribution), columns_n)
    Raises:
        AssertionError: if any value is not positive, a cluster asks for more
            ones than columns, distribution is not integer-valued, or the
            lengths of rows_n and distribution do not match.
    TODO: check inputs, here given as list, but should it be a ndarray?
          add a flag to sample without replacement
    """
    rows_n = np.atleast_1d(np.asarray(rows_n))
    distribution = np.atleast_1d(np.asarray(distribution))
    assert np.all(rows_n >0)
    assert np.all(rows_n <= columns_n), "Cannot place {} ones in {} columns".format(rows_n, columns_n)
    assert np.all(distribution >0), "Distribution values must be positive. {} provided".format(distribution)
    # Sample counts end up as array shapes and sizes, which must be true
    # integers; comb() without exact=True hands back floats.
    assert np.allclose(distribution, np.rint(distribution)), "Distribution values must be whole numbers. {} provided".format(distribution)
    distribution = np.rint(distribution).astype(int)
    if len(distribution) == 1:
        distribution = np.repeat(distribution, len(rows_n))
    assert len(distribution) == len(rows_n)
    locations = np.zeros((int(np.sum(distribution)), columns_n))
    index = 0
    for ones, n_samples in zip(rows_n, distribution):
        # Every vector of this cluster as a 0/1 matrix. The width follows
        # columns_n (it used to be hard-coded to 10).
        cluster = np.array([[int(ch) for ch in bin_string]
                            for bin_string in kbits(columns_n, int(ones))])
        take_this = np.random.randint(len(cluster), size=n_samples)
        locations[index:index + n_samples, :] = cluster[take_this]
        index += n_samples
    return locations
In [5]:
import numpy as np
class accumulatorMatrix(object):
    """
    Accumulator coding of the numbers up to max_number.

    Row i of `outputs` starts with a run of ones whose size grows by `times`
    from one row to the next; the rest of the row is zero. When zero is True
    an all-zeros first row is added and `max_number` is increased by one.

    Args:
        max_number (int): the greatest number to be represented
        length (int): vector length; when omitted, times * max_number is used
        times (int): how many ones represent one unit
        zero (bool): whether the zero vector is included or excluded
    Attributes:
        outputs (int, ndarray): max_number x length ndarray
    """

    def __init__(self, max_number, length=None, times=2, zero=False):
        self.max_number = max_number
        self.times = times
        self.zero = zero
        # Fall back to the shortest length that can hold the largest number.
        self.length = length if length else self.times * self.max_number
        assert self.max_number == self.length/times
        if self.zero:
            # One extra row; the first row then carries no ones at all.
            self.max_number += 1
            offset = 0
        else:
            offset = 1
        self.outputs = np.zeros((self.max_number, self.length), dtype=int)
        for row in range(self.max_number):
            self.outputs[row, :self.times * (row + offset)] = 1

    def shuffle_(self):
        """Shuffle the rows of `outputs` in place using numpy's global RNG."""
        np.random.shuffle(self.outputs)

    # def unshuffle(self):
    # We want to access the random shuffle in order to have the list
    # http://stackoverflow.com/questions/19306976/python-shuffling-with-a-parameter-to-get-the-same-result

    def replicate(self, times=1):
        """Replace `outputs` with `times` vertically stacked copies of itself."""
        self.outputs = np.tile(self.outputs, (times, 1))
In [6]:
import warnings
def accumulator_matrix(max_number, length=None, times=2, zero=False):
    """
    Deprecated function form of the accumulator coding (emits a DeprecationWarning).

    Each row starts with a run of ones whose size grows by `times` from one
    row to the next; the rest of the row is zero. When zero is True an
    all-zeros first row is added.

    Args:
        max_number (int): the greatest number to be represented
        length (int): vector length; when omitted, times * max_number is used
        times (int): how many ones represent one unit
        zero (bool): whether the zero vector is included or excluded
    Returns:
        outputs (int, ndarray): one row per represented number, `length` columns
    """
    warnings.warn("shouldn't use this function anymore! Now use the class accumulatorMatrix.",DeprecationWarning)
    # Fall back to the shortest length that can hold the largest number.
    width = length if length else times * max_number
    assert max_number == width/times
    # With the zero vector there is one extra row and the first row stays empty.
    n_rows, offset = (max_number + 1, 0) if zero else (max_number, 1)
    outputs = np.zeros((n_rows, width), dtype=int)
    for row in range(n_rows):
        outputs[row, :times * (row + offset)] = 1
    return outputs
# np.random.seed(105)
# Weights = np.random.rand(5,10)
In [7]:
def find_labels(inputs, multiple=1, one_hot=False):
    """
    Derive a label for every row of a binary matrix.

    The label of a row is its sum divided by `multiple`, truncated to an
    integer. With one_hot=True the integer labels are expanded into a matrix
    with max(label) + 1 columns holding a single 1 per row.

    Args:
        inputs (int, ndarray): one sample per row
        multiple (int): length of the unity representation
        one_hot (bool): False for integer labels, True for one-hot encoded labels
    Returns:
        labels (ndarray): integer labels, or the one-hot encoded label matrix
    """
    row_totals = np.sum(inputs, axis=1) / multiple
    labels = row_totals.astype(int)
    if not one_hot:
        return labels
    n_samples = labels.shape[0]
    # One column per possible label value, from 0 up to the largest label seen.
    encoded = np.zeros((n_samples, np.max(labels) + 1))
    encoded[np.arange(n_samples), labels] = 1
    return encoded
In [8]:
from collections import namedtuple
# Created once at module level so that every record shares a single class.
# Building the namedtuple inside the function produced a brand-new class on
# each call, which made records from different calls have different types and
# prevented pickle from resolving the class by name.
DatasetRecord = namedtuple('DatasetRecord', ['data', 'labels'])


def Dataset(inputs, labels):
    """Bundle inputs and their labels into a named tuple.

    Args:
        inputs (array): the samples
        labels (array): corresponding labels
    Returns:
        DatasetRecord: named tuple with the fields `data` and `labels`
    """
    return DatasetRecord(inputs, labels)
In [9]:
from collections import namedtuple
# Module-level record type with the fields `data` and `labels`.
# NOTE(review): this rebinds the name `Dataset`, which an earlier cell defines
# as a function; from here on `Dataset` refers to this namedtuple class.
Dataset = namedtuple('Dataset', ['data', 'labels'])
#data_verguts = Dataset(inputs, labels)
import pickle
def pickle_test(Data, name):
    """Append a pickled copy of `Data` to the file `<name>.pickle`.

    Args:
        Data: any picklable object
        name (str): path of the target file without the '.pickle' extension
    """
    # The file is opened in append mode, so calling this again with the same
    # name adds another pickle record after the existing ones; a single
    # pickle.load() on the file returns the FIRST record written.
    # The `with` block closes the file even when pickle.dump raises.
    with open(name + '.pickle', 'ab') as f:
        pickle.dump(Data, f)
#pickle_test(data_verguts, "verguts")
# # Test opening the pickle
# pickle_in = open("Data.pickle", "rb")
# ex = pickle.load(pickle_in)
# ex.labels[25]
We now pickle the namedtuple (cf. "When to pickle").
See http://localhost:8888/notebooks/Dropbox/Programming/Jupyter/Competitive-Unsupervised/NNTf.ipynb for creating a pandas DataFrame out of the namedtuple, and http://stackoverflow.com/questions/16377215/how-to-pickle-a-namedtuple-instance-correctly for how to pickle a namedtuple instance correctly.
In [ ]:
In [10]:
# Draw, for every cluster, as many samples as there are distinct vectors in it.
rows_n = [2,4,6,8,10]
#comb(10, rows_n)
# comb() without exact=True returns floats; the sample counts are used as
# array sizes, so they are rounded and cast to integers before being passed on.
inputs = binary_vector_2(rows_n=rows_n, distribution=np.rint(comb(10, rows_n)).astype(int))
# Two ones encode one unit, hence multiple=2.
labels = find_labels(inputs, multiple=2, one_hot=True)
# Print every row next to the number it encodes and its one-hot label.
count = 0
for i in inputs:
    print(count, i, int(np.sum(i)/2), labels[count])
    count +=1
In [12]:
# Accumulator-coded inputs for a maximum number of 5, with two ones per unit.
inputs = accumulatorMatrix(5, times=2).outputs
# One-hot labels; dividing each row sum by multiple=2 gives the encoded number.
labels = find_labels(inputs, multiple=2, one_hot=True)
# NOTE(review): this re-creates the Dataset namedtuple class that an earlier
# cell already defines with the same fields.
Dataset = namedtuple('Dataset', ['data', 'labels'])
verguts2004 = Dataset(inputs, labels)
# Written to "verguts_accumulator.pickle"; pickle_test opens the file in
# append mode, so re-running this cell adds another record to the same file.
pickle_test(verguts2004, "verguts_accumulator")
In [98]:
# Display the label matrix stored in the dataset tuple.
verguts2004.labels
Out[98]:
In [ ]: