Tutorial Setup

Check your install


In [55]:
import numpy

In [56]:
import scipy

In [57]:
import matplotlib

In [58]:
import sklearn

In [59]:
import psutil

In [60]:
import pandas

In [61]:
import IPython.parallel


/usr/local/lib/python2.7/dist-packages/IPython/parallel.py:13: ShimWarning: The `IPython.parallel` package has been deprecated. You should import from ipyparallel instead.
  "You should import from ipyparallel instead.", ShimWarning)

Finding the location of an installed package and its version:


In [62]:
numpy.__path__


Out[62]:
['/usr/local/lib/python2.7/dist-packages/numpy']

In [63]:
numpy.__version__


Out[63]:
'1.10.1'

Check that you have the datasets


In [64]:
%run ../fetch_data.py


Using existing dataset folder:/home/janus/21_perspective/github/21v-python/unit_20/parallel_ml_tutorial-master/datasets
Checking availability of the 20 newsgroups dataset
Found archive: /home/janus/21_perspective/github/21v-python/unit_20/parallel_ml_tutorial-master/datasets/20news-bydate.tar.gz
Checking that the 20 newsgroups files exist...
=> Success!

In [1]:
import os
for fname in os.listdir('../datasets/'):
    print(fname)


20news-bydate.tar.gz
20news-bydate-train
20news-bydate-test

A NumPy primer

NumPy array dtypes and shapes


In [2]:
import numpy as np

In [3]:
a = np.array([1, 2, 3])

In [4]:
a


Out[4]:
array([1, 2, 3])

In [5]:
b = np.array([[0, 2, 4], [1, 3, 5]])

In [6]:
b


Out[6]:
array([[0, 2, 4],
       [1, 3, 5]])

In [7]:
b.shape


Out[7]:
(2, 3)

In [8]:
b.dtype


Out[8]:
dtype('int64')

In [9]:
a.shape


Out[9]:
(3,)

In [10]:
a.dtype


Out[10]:
dtype('int64')

In [11]:
np.zeros(5)


Out[11]:
array([ 0.,  0.,  0.,  0.,  0.])

In [12]:
np.ones(shape=(3, 4), dtype=np.int32)


Out[12]:
array([[1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1]], dtype=int32)

Common array operations


In [13]:
c = b * 0.5

In [14]:
c


Out[14]:
array([[ 0. ,  1. ,  2. ],
       [ 0.5,  1.5,  2.5]])

In [15]:
c.shape


Out[15]:
(2, 3)

In [16]:
c.dtype


Out[16]:
dtype('float64')

In [17]:
a


Out[17]:
array([1, 2, 3])

In [18]:
d = a + c

In [19]:
d


Out[19]:
array([[ 1. ,  3. ,  5. ],
       [ 1.5,  3.5,  5.5]])

In [20]:
d[0]


Out[20]:
array([ 1.,  3.,  5.])

In [21]:
d[0, 0]


Out[21]:
1.0

In [22]:
d[:, 0]


Out[22]:
array([ 1. ,  1.5])

In [23]:
d.sum()


Out[23]:
19.5

In [24]:
d.mean()


Out[24]:
3.25

In [25]:
d.sum(axis=0)


Out[25]:
array([  2.5,   6.5,  10.5])

In [26]:
d.mean(axis=1)


Out[26]:
array([ 3. ,  3.5])

Reshaping and inplace update


In [27]:
e = np.arange(12)

In [28]:
e


Out[28]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [29]:
f = e.reshape(3, 4)

In [30]:
f


Out[30]:
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [31]:
e


Out[31]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [32]:
e[5:] = 0

In [33]:
e


Out[33]:
array([0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0])

In [34]:
f


Out[34]:
array([[0, 1, 2, 3],
       [4, 0, 0, 0],
       [0, 0, 0, 0]])

Combining arrays


In [35]:
a


Out[35]:
array([1, 2, 3])

In [36]:
b


Out[36]:
array([[0, 2, 4],
       [1, 3, 5]])

In [37]:
d


Out[37]:
array([[ 1. ,  3. ,  5. ],
       [ 1.5,  3.5,  5.5]])

In [38]:
np.concatenate([a, a, a])


Out[38]:
array([1, 2, 3, 1, 2, 3, 1, 2, 3])

In [39]:
np.vstack([a, b, d])


Out[39]:
array([[ 1. ,  2. ,  3. ],
       [ 0. ,  2. ,  4. ],
       [ 1. ,  3. ,  5. ],
       [ 1. ,  3. ,  5. ],
       [ 1.5,  3.5,  5.5]])

In [40]:
np.hstack([b, d])


Out[40]:
array([[ 0. ,  2. ,  4. ,  1. ,  3. ,  5. ],
       [ 1. ,  3. ,  5. ,  1.5,  3.5,  5.5]])

A Matplotlib primer


In [41]:
%matplotlib inline

In [42]:
import matplotlib.pyplot as plt

In [43]:
x = np.linspace(0, 2, 10)

In [44]:
x


Out[44]:
array([ 0.        ,  0.22222222,  0.44444444,  0.66666667,  0.88888889,
        1.11111111,  1.33333333,  1.55555556,  1.77777778,  2.        ])

In [45]:
plt.plot(x, 'o-');



In [46]:
plt.plot(x, x, 'o-', label='linear')
plt.plot(x, x ** 2, 'x-', label='quadratic')

plt.legend(loc='best')
plt.title('Linear vs Quadratic progression')
plt.xlabel('Input')
plt.ylabel('Output');



In [47]:
samples = np.random.normal(loc=1.0, scale=0.5, size=1000)

In [48]:
samples.shape


Out[48]:
(1000,)

In [49]:
samples.dtype


Out[49]:
dtype('float64')

In [50]:
samples[:30]


Out[50]:
array([ 0.9723313 ,  1.05145447,  0.31986674, -0.26642655,  1.79545186,
        0.89663891, -0.78769912,  0.67095478,  1.28211239,  0.66237962,
        1.12488518,  1.76541164,  0.69160337,  0.05183348,  0.64934675,
        1.38717818,  0.37051531,  0.52509259,  1.17117495,  1.8224232 ,
        0.97875658,  1.60924242,  1.09170888,  0.40106831,  1.06174804,
        0.54542997,  1.21400516,  1.11310423, -0.1808943 , -0.36915236])

In [51]:
plt.hist(samples, bins=50);



In [52]:
samples_1 = np.random.normal(loc=1, scale=.5, size=10000)
samples_2 = np.random.standard_t(df=10, size=10000)

In [53]:
bins = np.linspace(-3, 3, 50)
_ = plt.hist(samples_1, bins=bins, alpha=0.5, label='samples 1')
_ = plt.hist(samples_2, bins=bins, alpha=0.5, label='samples 2')
plt.legend(loc='upper left');



In [54]:
plt.scatter(samples_1, samples_2, alpha=0.1);



In [ ]:


In [ ]:


In [ ]:


In [ ]: