In [1]:
import tohu
from tohu import *
from utils import print_generated_sequence
In [2]:
# Show which tohu version produced the outputs in this notebook.
print(f"Tohu version: {tohu.__version__}")
This notebook contains high-level tests for tohu's "standard" generators.
Generates random integers in the range [low, high].
In [3]:
# Integer generator configured with low=100 and high=200.
g = Integer(low=100, high=200)
In [4]:
# First run: seed 12345.
g.reset(seed=12345)
print_generated_sequence(g, num=15)

# Second run: seed 9999.
g.reset(seed=9999)
print_generated_sequence(g, num=15)
In [5]:
# Call generate() with a count of 5 and an explicit seed; the result is iterated in the next cell.
some_integers = g.generate(5, seed=99999)
In [6]:
# Print each generated item on its own line.
# (The loop body must be indented; without it the cell is a syntax error.)
for x in some_integers:
    print(x)
The default distribution is "uniform". Other distributions supported by numpy may also be usable (TODO: confirm which ones are actually supported).
In [7]:
# TODO(review): disabled placeholder for a non-default `distribution` example.
# Confirm the intended argument value before enabling.
#g = Integer(low=100, high=200, distribution=None)
Generates random floating point numbers in the range [low, high].
In [8]:
# Float generator configured with low=2.71828 and high=3.14159.
g = Float(low=2.71828, high=3.14159)
In [9]:
# First run: seed 12345.
g.reset(seed=12345)
print_generated_sequence(g, num=4)

# Second run: seed 9999.
g.reset(seed=9999)
print_generated_sequence(g, num=4)
Generates random numbers using one of the random number generators supported by numpy.
In [10]:
# One generator per numpy method name. The extra keyword arguments are
# presumably forwarded to the corresponding numpy routine - TODO confirm.
g1 = NumpyRandomGenerator(method="normal", loc=3.0, scale=5.0)
g2 = NumpyRandomGenerator(method="poisson", lam=30)
g3 = NumpyRandomGenerator(method="exponential", scale=0.3)
In [11]:
# All three generators are reset with the same seed before printing.
g1.reset(seed=12345)
print_generated_sequence(g1, num=4)

g2.reset(seed=12345)
print_generated_sequence(g2, num=15)

g3.reset(seed=12345)
print_generated_sequence(g3, num=4)
It is also possible to use any generator provided by the faker library.
In [12]:
# `method` names the faker provider method to use.
# g2 uses the same method as g1 but with an explicit locale.
g1 = FakerGenerator(method="name")
g2 = FakerGenerator(method="name", locale='hi_IN')
g3 = FakerGenerator(method="phone_number")
g4 = FakerGenerator(method="job")
In [13]:
# All four generators are reset with the same seed before printing.
g1.reset(seed=12345)
print_generated_sequence(g1, num=4)

g2.reset(seed=12345)
print_generated_sequence(g2, num=4)

g3.reset(seed=12345)
print_generated_sequence(g3, num=4)

g4.reset(seed=12345)
print_generated_sequence(g4, num=4)
Generates a sequence repeating the same element indefinitely.
In [14]:
# Constant with a string value.
g = Constant("Foobar")
print_generated_sequence(g, num=10)

# Constant with an integer value.
g = Constant(42)
print_generated_sequence(g, num=20)
Generates a sequence of sequentially numbered strings with a given prefix.
In [15]:
# Sequential generator configured with prefix 'Foo_' and digits=3.
g = Sequential(prefix='Foo_', digits=3)
Calling reset() on the generator makes the numbering start from 1 again.
In [16]:
# Reset once, then print two batches back to back with no reset in between.
g.reset()
print_generated_sequence(g, num=5)
print_generated_sequence(g, num=5)
print("-----------------------------")
# Reset again before printing a third batch.
g.reset()
print_generated_sequence(g, num=5)
Note: the method Sequential.reset() supports the seed argument for consistency with other generators, but its value is ignored; the generator is simply reset to its initial value. This is illustrated here:
In [17]:
# Reset with seed 12345.
g.reset(seed=12345)
print_generated_sequence(g, num=5)

# Reset with a different seed, 9999.
g.reset(seed=9999)
print_generated_sequence(g, num=5)
If a new Sequential generator is created from an existing one via the _spawn() method then its count will start again from 1.
In [18]:
g1 = Sequential(prefix="Quux_", digits=2)
g1.reset(seed=12345)
# Take 5 items from g1 before spawning.
print_generated_sequence(g1, num=5)
# g2 is spawned after g1 has already produced items.
g2 = g1._spawn()
# Print 5 more from g1, then 5 from the spawned g2, for comparison.
print_generated_sequence(g1, num=5)
print_generated_sequence(g2, num=5)
In [19]:
# The values list deliberately mixes types (str, int, bool, float).
g = SelectOne(values=['foobar', 42, 'quux', True, 1.2345])
In [20]:
# First run: seed 12345.
g.reset(seed=12345)
print_generated_sequence(g, num=15)

# Second run: seed 9999.
g.reset(seed=9999)
print_generated_sequence(g, num=15)
It is possible to specify different probabilities for each element to be chosen.
In [21]:
# `p` has one entry per value: 'aa' -> 0.8, 'bb' -> 0.15, 'cc' -> 0.05.
g = SelectOne(values=['aa', 'bb', 'cc'], p=[0.8, 0.15, 0.05])

g.reset(seed=12345)
print_generated_sequence(g, num=20)
In [22]:
# SelectMultiple over five values, configured with a fixed size=3.
g = SelectMultiple(values=['foobar', 42, 'quux', True, 1.2345], size=3)
In [23]:
# First run: seed 12345.
g.reset(seed=12345)
print_generated_sequence(g, num=4)

# Second run: seed 99999.
g.reset(seed=99999)
print_generated_sequence(g, num=4)
Similarly to SelectOne, one can pass a list of probabilities for the values to be chosen.
In [24]:
# `p` has one entry per value, in the same order as `values`.
g = SelectMultiple(values=['aa', 'bb', 'cc', 'dd', 'ee'], size=3, p=[0.6, 0.1, 0.2, 0.05, 0.05])
In [25]:
# Single run with seed 12345.
g.reset(seed=12345)
print_generated_sequence(g, num=4)
It is also possible to pass a random generator for the argument size. This produces tuples of varying length, where the length of each tuple is determined by the values produced by this generator.
In [26]:
# Integer generator (low=2, high=5) that is passed as `size` in the next cell.
rand_nums = Integer(low=2, high=5)
In [27]:
# `size` is a generator here rather than a fixed integer.
g = SelectMultiple(values=['a', 'b', 'c', 'd', 'e'], size=rand_nums)
In [28]:
# Single run with seed 11111; items are separated by newlines.
g.reset(seed=11111)
print_generated_sequence(g, num=10, sep='\n')
The Subsample generator can extract a subsample from a given set of values, where each individual element is chosen with a given probability p.
In [29]:
# The integers 0 through 49.
values = list(range(50))
In [30]:
# Subsample over `values` with per-element probability p=0.3.
g = Subsample(values, p=0.3)
In [31]:
# Single run with seed 12345; items are separated by newlines.
g.reset(seed=12345)
print_generated_sequence(g, num=10, sep='\n')
In [32]:
# CharString configured with length=15.
g = CharString(length=15)

# First run: seed 12345.
g.reset(seed=12345)
print_generated_sequence(g, num=5)

# Second run: seed 9999.
g.reset(seed=9999)
print_generated_sequence(g, num=5)
It is possible to vary the length of generated character strings, and to specify the character set.
In [33]:
# Length bounds 4..12 and a charset made of the 26 uppercase ASCII letters.
g = CharString(min_length=4, max_length=12, charset="ABCDEFGHIJKLMNOPQRSTUVWXYZ")
In [34]:
# Single run with seed 12345; items are separated by newlines.
g.reset(seed=12345)
print_generated_sequence(g, num=5, sep='\n')
In [35]:
# DigitString configured with length=15.
g = DigitString(length=15)

# First run: seed 12345.
g.reset(seed=12345)
print_generated_sequence(g, num=5)

# Second run: seed 9999.
g.reset(seed=9999)
print_generated_sequence(g, num=5)
In [36]:
# DigitString configured with min_length=5 and max_length=20.
g = DigitString(min_length=5, max_length=20)

g.reset(seed=9999)
print_generated_sequence(g, num=10, sep='\n')
In [37]:
# HashDigest configured with length=8.
g = HashDigest(length=8)

# First run: seed 12345.
g.reset(seed=12345)
print_generated_sequence(g, num=9)

# Second run: seed 9999.
g.reset(seed=9999)
print_generated_sequence(g, num=9)
In [38]:
# HashDigest configured with length=20.
g = HashDigest(length=20)

# First run: seed 12345.
g.reset(seed=12345)
print_generated_sequence(g, num=4)

# Second run: seed 9999.
g.reset(seed=9999)
print_generated_sequence(g, num=4)
In [39]:
# HashDigest configured with min_length=6 and max_length=20.
g = HashDigest(min_length=6, max_length=20)

g.reset(seed=12345)
print_generated_sequence(g, num=5, sep='\n')
In [40]:
# Same generator class as above, now with as_bytes=True
# (presumably yields bytes objects instead of str - confirm from the printed output).
g = HashDigest(length=16, as_bytes=True)
In [41]:
# Single run with seed 12345; items are separated by newlines.
g.reset(seed=12345)
print_generated_sequence(g, num=3, sep='\n')
In [42]:
# GeolocationPair takes no arguments here.
g = GeolocationPair()

g.reset(seed=12345)
print_generated_sequence(g, num=5, sep='\n')
In [43]:
from tohu.generators import TimestampNEW
In [44]:
# Bounds given as date-only strings.
g = TimestampNEW(start='2016-02-14', end='2016-02-18')
In [45]:
# Single run with seed 12345; items are separated by newlines.
g.reset(seed=12345)
print_generated_sequence(g, num=5, sep='\n')
In [46]:
# Bounds given with a time component; start and end are 15 seconds apart.
g = TimestampNEW(start='1998-03-01 00:02:00', end='1998-03-01 00:02:15')
In [47]:
# Single run with seed 99999; items are separated by newlines.
g.reset(seed=99999)
print_generated_sequence(g, num=10, sep='\n')
Note that the generated items are datetime objects (even though they appear as strings when printed above).
In [48]:
# Pull one more item from g and display its type as the cell output.
type(next(g))
Out[48]:
The GeoJSONGeolocationPair generator allows generating points within a geographical area given by a GeoJSON object.
In [49]:
import json
from shapely.geometry import MultiPoint
In [50]:
# Load the GeoJSON document that defines the area to sample points from.
# GeoJSON is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform default.
with open('./data/ne_110m_admin_1_states_provinces_shp.geojson', 'r', encoding='utf-8') as f:
    geojson = json.load(f)
In [51]:
# Build the generator from the parsed GeoJSON object loaded above.
g = GeoJSONGeolocationPair(geojson)
In [52]:
# Generate 200 items in one call with a fixed seed.
pts = g.generate(N=200, seed=12345)
In [53]:
# Display only the first ten generated items.
list(pts)[:10]
Out[53]:
In [54]:
# Wrap the generated items in a shapely MultiPoint; the object is the cell output.
MultiPoint(pts)
Out[54]:
In [55]:
class QuuxGenerator(CustomGenerator):
    # Custom generator with two generator-valued class attributes.
    # The names `aaa` and `bbb` are the ones passed to ExtractAttribute
    # further down in this notebook.
    # (The class body must be indented; without it the cell is a syntax error.)
    aaa = Integer(0, 100)
    bbb = HashDigest(length=6)
In [56]:
# Instantiate the custom generator defined in the previous cell.
g = QuuxGenerator()
Using ExtractAttribute we can produce "derived" generators which extract the attributes aaa, bbb from the elements produced by g.
In [57]:
# One derived generator per attribute name of the items produced by g.
h1 = ExtractAttribute(g, 'aaa')
h2 = ExtractAttribute(g, 'bbb')
In [58]:
# Only the parent generator g is reset here.
g.reset(seed=99999)
print_generated_sequence(g, num=5, sep='\n')
In [59]:
# h1 and h2 are printed without being reset in this cell.
print_generated_sequence(h1, num=5)
print_generated_sequence(h2, num=5)
In [60]:
# Five-element list to iterate over.
seq = ['aa', 'bb', 'cc', 'dd', 'ee']
In [61]:
# Generator backed by the fixed list `seq`.
g = IterateOver(seq)
In [62]:
# Ask for fewer items (3) than `seq` holds.
g.reset()
print(list(g.generate(N=3)))

# Ask for more items (10) than `seq` holds.
g.reset()
print(list(g.generate(N=10)))

# Exhaust the generator by iterating it directly.
g.reset()
print(list(g))
Each tohu generator can also be used as a Python iterator producing an (infinite) series of elements.
In [63]:
# reset() is chained onto the constructor, so its return value is what gets iterated below.
int_generator = Integer(low=100, high=500).reset(seed=99999)

# The iterator is infinite, so stop explicitly after 21 items (i = 0..20).
# (The nested bodies must be indented; without it the cell is a syntax error.)
for i, x in enumerate(int_generator):
    if i > 20:
        break
    print(x, end=" ")
The .generate() method produces an ItemList instance.
In [64]:
# HashDigest configured with length=6, used for the ItemList examples below.
g = HashDigest(length=6)
In [65]:
# Generate 10 items with a fixed seed and print the resulting object itself (not a list copy).
item_list = g.generate(N=10, seed=12345)
print(item_list)
Fundamentally an ItemList behaves like a regular list.
In [66]:
# Convert to a plain Python list before printing.
print(list(item_list))
In [67]:
# Seed the item list once; the three subsample calls below run without reseeding in between.
item_list.reset(seed=999999)
# Each call requests a subsample with num=6.
print(list(item_list.subsample(num=6)))
print(list(item_list.subsample(num=6)))
print(list(item_list.subsample(num=6)))
In [68]:
# Seed the item list once; the three subsample calls below run without reseeding in between.
item_list.reset(seed=99999)
# Each call requests a subsample with p=0.4 instead of a fixed count.
print(list(item_list.subsample(p=0.4)))
print(list(item_list.subsample(p=0.4)))
print(list(item_list.subsample(p=0.4)))
In [ ]: