Custom data types

Python is an object-oriented programming language. OOP is an important programming paradigm that you need to know about in order to fully understand Python.

Fundamental ideas:

  • Objects
  • Inheritance

Objects and classes

The 'class' keyword allows us to define custom data types.


In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
dir(list)


Out[2]:
['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']

In [3]:
class Rectangle(object):
    """
    Rectangular objects.

    coords : 2 x N np.array of points in the plane traversed
    counterclockwise -- row 0 holds x-coordinates, row 1 holds
    y-coordinates.  NOTE(review): the original docstring required a
    2 x 5 array with the first point repeated last, but plot() works
    for any 2 x N array (the example in this notebook passes 2 x 4)
    -- confirm the intended contract.
    """
    def __init__(self, coords=None):
        """
        Store the coordinate array on the instance.

        C++/Java/Fortran/etc. programmers - this is like the constructor
        """
        self.coords = coords
        
    def plot(self, **kwargs):
        """
        Draw the shape as a filled region between the y-values and 0.

        class method - generally public in Python; extra keyword
        arguments are forwarded to plt.fill_between.
        """
        plt.fill_between(self.coords[0],self.coords[1], **kwargs)

In [4]:
unit_square = Rectangle(coords=np.array([[0, 1, 1, 0], [0, 0, 1, 1]]))

In [5]:
dir(unit_square)


Out[5]:
['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'coords',
 'plot']

In [6]:
unit_square.plot(lw=5)



In [7]:
fig, ax = plt.subplots()
unit_square.plot(lw=5)
plt.ylim(-1, 2)
plt.xlim(-1, 2);



In [8]:
class Rectangle(object):
    """
    Rectangular objects.

    Requires a 2 x 5 np.array corresponding to four points in the plane
    traversed counterclockwise, with two sides parallel to the
    horizontal axis and the first set of coordinates repeated as the
    last.  Row 0 holds x-coordinates, row 1 holds y-coordinates.
    """
    def __init__(self, coords=None):
        # coords: 2 x 5 np.array (see class docstring)
        self.coords = coords
        
    def plot(self, **kwargs):
        """
        Basic mechanism to plot the rectangle.

        Extra keyword arguments are forwarded to plt.fill_between.
        """
        plt.fill_between(self.coords[0], self.coords[1], **kwargs)
    
    def get_area(self):
        """
        Return the area as (max_x - min_x) * (max_y - min_y).

        Document class methods just like any other function.
        """
        # BUG FIX: the original misplaced a parenthesis,
        # np.max(self.coords[1] - np.min(self.coords[1])), closing np.max
        # around the subtraction.  That happens to yield the same number
        # (max of x - min(x) equals max(x) - min(x)) but obscures intent;
        # compute width and height with balanced parentheses.
        width = np.max(self.coords[0]) - np.min(self.coords[0])
        height = np.max(self.coords[1]) - np.min(self.coords[1])
        return width * height

In [9]:
rect = Rectangle(coords=np.array([[0, 2, 2, 0, 0], [0, 0, 1, 1, 0]]))
print('rectangle area = %d' % rect.get_area())
rect.plot(lw=5)


rectangle area = 2

In [10]:
dir(rect)


Out[10]:
['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'coords',
 'get_area',
 'plot']

In [11]:
rect.__dir__()


Out[11]:
['__class__',
 '__str__',
 '__repr__',
 '__subclasshook__',
 '__setattr__',
 '__weakref__',
 '__format__',
 'coords',
 '__le__',
 '__dir__',
 '__hash__',
 '__reduce__',
 '__sizeof__',
 '__ne__',
 '__gt__',
 '__getattribute__',
 '__delattr__',
 '__lt__',
 '__ge__',
 '__new__',
 '__dict__',
 '__eq__',
 '__module__',
 'get_area',
 '__doc__',
 '__reduce_ex__',
 '__init__',
 'plot']

In [12]:
dir(rect).sort() == rect.__dir__().sort()


Out[12]:
True

Check out our documentation:


In [13]:
rect.plot?

Objects are dicts in a sense - instance attributes live in the magic `__dict__` attribute.


In [14]:
rect.__dict__


Out[14]:
{'coords': array([[0, 2, 2, 0, 0],
        [0, 0, 1, 1, 0]])}

Inheritance

Classes share many common attributes and methods. And just like in real life, there are general and specialized types of things.


In [15]:
# objects are the most basic Python types
class EuclideanShape2D(object):
    """
    Generic base class for 2-D shapes.

    All shapes have an area, so subclasses are expected to override
    get_area (and plot); the implementations here are placeholders.
    """

    def __init__(self):
        pass

    def get_area(self):
        """Placeholder -- concrete shapes compute their own area."""
        pass

    def plot(self):
        """Placeholder that only reports plotting is not implemented."""
        print('no plot method defined - TO DO')

In [16]:
object?

In [17]:
class Rectangle(EuclideanShape2D):
    """
    Rectangular objects.

    Requires a 2 x 5 np.array corresponding to four points in the plane
    traversed counterclockwise, with two sides parallel to the
    horizontal axis.  Row 0 holds x-coordinates, row 1 holds
    y-coordinates.
    """
    def __init__(self, coords=None):
        # coords: 2 x 5 np.array (see class docstring)
        self.coords = coords
        
    def plot(self, **kwargs):
        """
        Basic mechanism to plot the rectangle.

        Extra keyword arguments are forwarded to plt.fill_between.
        """
        plt.fill_between(self.coords[0], self.coords[1], **kwargs)
    
    def get_area(self):
        """
        Return the area as (max_x - min_x) * (max_y - min_y).
        """
        # BUG FIX: the original misplaced a parenthesis,
        # np.max(self.coords[1] - np.min(self.coords[1])); the value is
        # coincidentally identical, but the expression is misleading.
        width = np.max(self.coords[0]) - np.min(self.coords[0])
        height = np.max(self.coords[1]) - np.min(self.coords[1])
        return width * height

In [18]:
class Circle(EuclideanShape2D):
    """
    Circular objects.

    Requires a center (an (x, y) pair) and a radius.
    """
    def __init__(self, center=None, radius=None):
        self.center = center
        self.radius = radius
        
    def plot(self, **kwargs):
        """
        Draw the circle on a fresh 6x6 figure with one unit of margin
        around it on every side.

        NOTE(review): **kwargs is accepted but unused here, matching the
        original behavior -- confirm whether it should be forwarded to
        plt.Circle.
        """
        c = plt.Circle(self.center, self.radius)
        fig, ax = plt.subplots(figsize=(6, 6))
        ax.set_ylim(self.center[1] - self.radius - 1, self.center[1] + self.radius + 1)
        # BUG FIX: the original upper x-limit used self.center[1] (the
        # y-coordinate) instead of self.center[0], so any circle whose
        # center is off the diagonal was clipped horizontally.
        ax.set_xlim(self.center[0] - self.radius - 1, self.center[0] + self.radius + 1)
        ax.add_artist(c)
        
    def get_area(self):
        """Return pi * r**2."""
        return np.pi*self.radius**2
    
    def get_circumference(self):
        """Return 2 * pi * r."""
        return 2*np.pi*self.radius

In [19]:
circy = Circle(center=(1,1), radius=3)
circy.plot(lw=5)



In [20]:
circy.get_circumference()


Out[20]:
18.84955592153876

In [21]:
class Triangle(EuclideanShape2D):
    """
    Triangle class.

    Requires a 2 x 3 np.array corresponding to three points in the
    plane.  Assumes the base is parallel to the x-axis and the starting
    point is the lower left.
    """

    def __init__(self, coords):
        self.coords = coords

    def get_area(self):
        """Return 0.5 * base * height for the stored triangle."""
        base = self.coords[0, 1] - self.coords[0, 0]
        height = self.coords[1, 2] - self.coords[1, 1]
        return 0.5 * base * height

    # no plotting method defined - what happens?

In [22]:
tri = Triangle(coords=np.array([[0, 2, 1], [0, 0, 3]]))
tri.get_area()


Out[22]:
3.0

In [23]:
tri.plot()


no plot method defined - TO DO

In [24]:
from scipy.stats import bernoulli

In [25]:
?bernoulli

In [26]:
dir(bernoulli)


Out[26]:
['__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_argcheck',
 '_cdf',
 '_cdf_single',
 '_cdfvec',
 '_construct_argparser',
 '_construct_default_doc',
 '_construct_doc',
 '_ctor_param',
 '_entropy',
 '_isf',
 '_logcdf',
 '_logpmf',
 '_logsf',
 '_munp',
 '_nonzero',
 '_parse_args',
 '_parse_args_rvs',
 '_parse_args_stats',
 '_pmf',
 '_ppf',
 '_ppfvec',
 '_random_state',
 '_rvs',
 '_sf',
 '_stats',
 '_stats_has_moments',
 '_updated_ctor_param',
 'a',
 'b',
 'badvalue',
 'cdf',
 'entropy',
 'expect',
 'extradoc',
 'freeze',
 'generic_moment',
 'inc',
 'interval',
 'isf',
 'logcdf',
 'logpmf',
 'logsf',
 'mean',
 'median',
 'moment',
 'moment_tol',
 'name',
 'numargs',
 'pmf',
 'ppf',
 'random_state',
 'return_integers',
 'rvs',
 'sf',
 'shapes',
 'stats',
 'std',
 'var',
 'vec_generic_moment',
 'vecentropy']

lots of stuff - what do the underscores and double-underscores all mean? https://www.python.org/dev/peps/pep-0008/#method-names-and-instance-variables


In [27]:
bernoulli.__doc__


Out[27]:
'A Bernoulli discrete random variable.\n\n    As an instance of the `rv_discrete` class, `bernoulli` object inherits from it\n    a collection of generic methods (see below for the full list),\n    and completes them with details specific for this particular distribution.\n    \n    Methods\n    -------\n    ``rvs(p, loc=0, size=1, random_state=None)``\n        Random variates.\n    ``pmf(x, p, loc=0)``\n        Probability mass function.\n    ``logpmf(x, p, loc=0)``\n        Log of the probability mass function.\n    ``cdf(x, p, loc=0)``\n        Cumulative density function.\n    ``logcdf(x, p, loc=0)``\n        Log of the cumulative density function.\n    ``sf(x, p, loc=0)``\n        Survival function  (also defined as ``1 - cdf``, but `sf` is sometimes more accurate).\n    ``logsf(x, p, loc=0)``\n        Log of the survival function.\n    ``ppf(q, p, loc=0)``\n        Percent point function (inverse of ``cdf`` --- percentiles).\n    ``isf(q, p, loc=0)``\n        Inverse survival function (inverse of ``sf``).\n    ``stats(p, loc=0, moments=\'mv\')``\n        Mean(\'m\'), variance(\'v\'), skew(\'s\'), and/or kurtosis(\'k\').\n    ``entropy(p, loc=0)``\n        (Differential) entropy of the RV.\n    ``expect(func, args=(p,), loc=0, lb=None, ub=None, conditional=False)``\n        Expected value of a function (of one argument) with respect to the distribution.\n    ``median(p, loc=0)``\n        Median of the distribution.\n    ``mean(p, loc=0)``\n        Mean of the distribution.\n    ``var(p, loc=0)``\n        Variance of the distribution.\n    ``std(p, loc=0)``\n        Standard deviation of the distribution.\n    ``interval(alpha, p, loc=0)``\n        Endpoints of the range that contains alpha percent of the distribution\n\n    Notes\n    -----\n    The probability mass function for `bernoulli` is::\n\n       bernoulli.pmf(k) = 1-p  if k = 0\n                        = p    if k = 1\n\n    for ``k`` in ``{0, 1}``.\n\n    `bernoulli` takes ``p`` as shape 
parameter.\n\n    The probability mass function above is defined in the "standardized" form.\n    To shift distribution use the ``loc`` parameter.\n    Specifically, ``bernoulli.pmf(k, p, loc)`` is identically\n    equivalent to ``bernoulli.pmf(k - loc, p)``.\n\n    Examples\n    --------\n    >>> from scipy.stats import bernoulli\n    >>> import matplotlib.pyplot as plt\n    >>> fig, ax = plt.subplots(1, 1)\n    \n    Calculate a few first moments:\n    \n    >>> p = 0.3\n    >>> mean, var, skew, kurt = bernoulli.stats(p, moments=\'mvsk\')\n    \n    Display the probability mass function (``pmf``):\n    \n    >>> x = np.arange(bernoulli.ppf(0.01, p),\n    ...               bernoulli.ppf(0.99, p))\n    >>> ax.plot(x, bernoulli.pmf(x, p), \'bo\', ms=8, label=\'bernoulli pmf\')\n    >>> ax.vlines(x, 0, bernoulli.pmf(x, p), colors=\'b\', lw=5, alpha=0.5)\n    \n    Alternatively, the distribution object can be called (as a function)\n    to fix the shape and location. This returns a "frozen" RV object holding \n    the given parameters fixed. \n    \n    Freeze the distribution and display the frozen ``pmf``:\n    \n    >>> rv = bernoulli(p)\n    >>> ax.vlines(x, 0, rv.pmf(x), colors=\'k\', linestyles=\'-\', lw=1,\n    ...         label=\'frozen pmf\')\n    >>> ax.legend(loc=\'best\', frameon=False)\n    >>> plt.show()\n    \n    Check accuracy of ``cdf`` and ``ppf``:\n    \n    >>> prob = bernoulli.cdf(x, p)\n    >>> np.allclose(x, bernoulli.ppf(prob, p))\n    True\n    \n    Generate random numbers:\n    \n    >>> r = bernoulli.rvs(p, size=1000)\n\n    '

Here is how we can use some of these magic methods to build a custom data structure - a list with additional capabilities, like 'head' and 'tail.'


In [28]:
class SpecialList:
    '''
    A class slightly extending the capabilities of list.

    Wraps a plain list (``values``) and implements the container
    protocol (__len__, __getitem__, __setitem__, __delitem__,
    __iter__, __reversed__) plus head/tail convenience methods.
    '''

    def __init__(self, values=None):
        # None sentinel avoids the mutable-default-argument trap:
        # each instance gets its own fresh list.
        if values is None:
            self.values = []
        else:
            self.values = values

    def __len__(self):
        return len(self.values)

    def __getitem__(self, key):
        return self.values[key]

    def __setitem__(self, key, value):
        self.values[key] = value

    def __delitem__(self, key):
        del self.values[key]

    def __iter__(self):
        return iter(self.values)

    def __reversed__(self):
        # BUG FIX: the original stored the lazy ``reversed`` iterator as
        # ``values``, so len()/indexing/head() on the result raised
        # TypeError and a second iteration yielded nothing.  Materialize
        # the reversed sequence into a real list first.
        return SpecialList(list(reversed(self.values)))

    def append(self, value):
        self.values.append(value)

    def head(self, n=5):
        '''Return the first n values (default 5).'''
        return self.values[:n]

    def tail(self, n=5):
        '''Return the last n values (default 5).'''
        return self.values[-n:]

In [29]:
x = SpecialList(values=list(range(100)))

x.head()


Out[29]:
[0, 1, 2, 3, 4]

In [30]:
x.head(10)


Out[30]:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [31]:
len(x)


Out[31]:
100

In [32]:
y=iter(x)

In [33]:
next(y)


Out[33]:
0

In [34]:
next(y)


Out[34]:
1

In [35]:
del x[5]  # behavior defined by __delitem__!

In [36]:
x.head(10)


Out[36]:
[0, 1, 2, 3, 4, 6, 7, 8, 9, 10]

In [37]:
next(y)


Out[37]:
2

A more real-world example

Here is an example of a class built for stochastic gradient descent. Need our helper functions.


In [38]:
def sigmoid(x):
    """Logistic sigmoid 1 / (1 + exp(-x)); works elementwise on arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def ll(act, pred, threshold=1e-15):
    """
    Mean negative log-likelihood (log loss) of predictions ``pred``
    against 0/1 targets ``act``.

    Predictions are clipped into [threshold, 1 - threshold] so the
    logarithms stay finite at pred == 0 or pred == 1.
    """
    pred = np.clip(pred, threshold, 1 - threshold)
    cross_entropy = (1 - act) * np.log(1 - pred) + act * np.log(pred)
    return (-1 / len(act)) * np.sum(cross_entropy)

In [39]:
class SGDClassifier(object):
    """
    Logistic-regression classifier trained with stochastic gradient
    descent, one sample at a time.

    Weights are randomly initialized (scaled by 1/sqrt(n_features)).
    Relies on the module-level helpers ``sigmoid`` and ``ll`` and on
    ``np``/``plt`` being imported at module level.
    """
    
    def __init__(self, data=None, label=None, alpha=None, max_epochs=None):
        # don't introduce a new class attribute outside of init!
        # data: 2-D np.array of samples (rows) x features (columns);
        #       presumably column 0 is a bias column of ones -- see plot().
        # label: 1-D np.array of 0/1 targets, one per row of data.
        # alpha: learning rate (defaults to 0.0001 below).
        # max_epochs: number of passes over the training set (default 10000).
        self.data=data
        self.label = label
        
        if data is not None:
            self.w = np.random.randn(self.data.shape[1]) / np.sqrt(self.data.shape[1])  # Xavier initialization
        else:
            self.w = None
        
        if alpha is None:
            self.alpha = 0.0001
        else:
            self.alpha = alpha
            
        if max_epochs is None:
            self.max_epochs = 10000
        else:
            self.max_epochs = max_epochs
            
        # losses recorded every 100th epoch by fit()
        self.train_losses=[]
        self.val_losses=[]
        
    def fit(self):
        """
        Train the weights in place with per-sample SGD updates.

        Splits the data into a random 80/20 train/validation split,
        runs max_epochs passes over the training rows, and every 100
        epochs records and prints the train/validation log loss.
        Prints a message and returns early if data or labels are missing.
        """
        if self.data is None:
            print('No data, nothing to fit')
            return
        if self.label is None:
            print('No labels, can\'t fit')
            return
        
        # cross-validation: random 80% of row indices for training,
        # the remainder for validation
        train_idx = np.random.choice(range(self.data.shape[0]), replace=False,
                                 size=int(np.floor(0.8 * self.data.shape[0])))
        val_idx = [i for i in range(self.data.shape[0]) if i not in train_idx]
        trainX = self.data[train_idx, :]
        valX = self.data[val_idx, :]
        trainy = self.label[train_idx]
        valy = self.label[val_idx]

        n = trainX.shape[0]
        
        for i in range(self.max_epochs):
            # Update weights - where the magic happens:
            # per-sample gradient step w += alpha * (y - sigmoid(x.w)) * x
            for j in range(n):
                self.w += self.alpha * (trainy[j] -
                              sigmoid(np.dot(trainX[j, :], self.w))) * trainX[j, :]
            if i % 100 == 0:
                # track log loss on both splits every 100 epochs
                current_train_loss = ll(trainy, sigmoid(np.dot(trainX, self.w)))
                self.train_losses.append(current_train_loss)
                current_val_loss = ll(valy, sigmoid(np.dot(valX, self.w)))
                self.val_losses.append(current_val_loss)
                print('epoch {}: train loss {:.5f}\tvalidation loss {:.5f}'.format(i, current_train_loss, current_val_loss))
    
    def predict(self):
        # TO_DO
        pass
    
    def plot(self, **kwargs):
        """
        Plot the learned decision boundary over a scatter of the data.

        Only works for 3-column data (bias column plus two features).
        NOTE(review): the scatter hard-codes rows [:500] as red and
        [500:] as blue, i.e. it assumes the first 500 rows are one class
        -- confirm against how the data was constructed.
        """
        if self.data.shape[1] != 3:
            print('wrong dimensions for plot != 2')
            return
        x = np.linspace(-5, 5, 100)
        # boundary: w0 + w1*x + w2*y = 0  =>  y = -w0/w2 - (w1/w2)*x
        plt.plot(x, -self.w[0] / self.w[2] - (self.w[1] /self.w[2]) * x, **kwargs)
        plt.scatter(self.data[:500, 1], self.data[:500, 2], color='red')
        plt.scatter(self.data[500:, 1], self.data[500:, 2], color='blue')
        plt.ylim(-5, 5)

In [40]:
# grab our synthetic data from last time
data = np.genfromtxt('../data/synthetic_data.txt', delimiter=' ')
label = np.genfromtxt('../data/label.txt')

In [41]:
d =SGDClassifier(data=data, label=label, alpha=0.0001, max_epochs=500)

In [42]:
whos


Variable           Type             Data/Info
---------------------------------------------
Circle             type             <class '__main__.Circle'>
EuclideanShape2D   type             <class '__main__.EuclideanShape2D'>
Rectangle          type             <class '__main__.Rectangle'>
SGDClassifier      type             <class '__main__.SGDClassifier'>
SpecialList        type             <class '__main__.SpecialList'>
Triangle           type             <class '__main__.Triangle'>
ax                 AxesSubplot      Axes(0.125,0.125;0.775x0.775)
bernoulli          bernoulli_gen    <scipy.stats._discrete_di<...>en object at 0x10fedbac8>
circy              Circle           <__main__.Circle object at 0x10e8aaa90>
d                  SGDClassifier    <__main__.SGDClassifier object at 0x1100edc18>
data               ndarray          1000x3: 3000 elems, type `float64`, 24000 bytes
fig                Figure           Figure(480x320)
label              ndarray          1000: 1000 elems, type `float64`, 8000 bytes
ll                 function         <function ll at 0x10ffad6a8>
np                 module           <module 'numpy' from '/Us<...>kages/numpy/__init__.py'>
plt                module           <module 'matplotlib.pyplo<...>es/matplotlib/pyplot.py'>
rect               Rectangle        <__main__.Rectangle object at 0x10e8a0198>
sigmoid            function         <function sigmoid at 0x10ffad598>
tri                Triangle         <__main__.Triangle object at 0x10eb3add8>
unit_square        Rectangle        <__main__.Rectangle object at 0x10bf130f0>
x                  SpecialList      <__main__.SpecialList object at 0x1100ed0f0>
y                  list_iterator    <list_iterator object at 0x1100ed5f8>

In [43]:
d.w


Out[43]:
array([-0.31657565, -0.24357673, -0.39779973])

In [44]:
d.fit()


epoch 0: train loss 1.10715	validation loss 1.10270
epoch 100: train loss 0.19798	validation loss 0.21928
epoch 200: train loss 0.14537	validation loss 0.16397
epoch 300: train loss 0.11993	validation loss 0.13700
epoch 400: train loss 0.10484	validation loss 0.12100

In [45]:
d.plot(color='steelblue', lw=5)



In [46]:
d.w


Out[46]:
array([-2.78574653,  1.62061323,  0.22228673])

In [47]:
d.val_losses[-1]


Out[47]:
0.12100106897339304

In [48]:
SGDClassifier(data=data, label=label).w


Out[48]:
array([ 0.19376242, -0.819977  , -0.16605056])

In [49]:
# messier data

x1 = np.random.normal(loc=1, scale=3, size=500)
x2 = np.random.normal(loc=3, scale=3, size=500)
y1 = np.random.normal(loc=1, scale=3, size=500)
y2 = np.random.normal(loc=3, scale=3, size=500)

x = np.hstack([x1, x2])
y = np.hstack([y1, y2])
ones = np.ones(1000)
data = np.vstack([ones, x, y])
data = data.T

lab1 = np.zeros(500)
lab2 = np.ones(500)
labs = np.hstack([lab1, lab2]).T

In [50]:
d2 = SGDClassifier(data, labs, max_epochs=500)
d2.fit()


epoch 0: train loss 1.24212	validation loss 1.43132
epoch 100: train loss 0.58038	validation loss 0.62481
epoch 200: train loss 0.57707	validation loss 0.62276
epoch 300: train loss 0.57646	validation loss 0.62268
epoch 400: train loss 0.57634	validation loss 0.62279

In [51]:
d2.plot(lw=5)



In [52]:
d2.val_losses[-1]


Out[52]:
0.62278729003135991

In [ ]: