Tutorial Setup

Check your install


In [0]:
import numpy

In [1]:
import scipy

In [2]:
import matplotlib

In [3]:
import sklearn

In [4]:
import psutil

In [5]:
import pandas

In [6]:
import IPython.parallel
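
To check everything in one go, the cell below is a small sketch (not part of the original notebook) that loops over the module names imported above and prints the installed version of each, or flags the package as missing:


In [ ]:
import importlib

for name in ('numpy', 'scipy', 'matplotlib', 'sklearn',
             'psutil', 'pandas', 'IPython.parallel'):
    try:
        module = importlib.import_module(name)
        # some modules do not expose __version__, hence the default
        print(name, getattr(module, '__version__', 'no __version__ attribute'))
    except ImportError:
        print(name, 'is MISSING')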

Finding the location of an installed package and its version:


In [7]:
numpy.__path__


Out[7]:
['/Users/ogrisel/venvs/py34/lib/python3.4/site-packages/numpy']

In [8]:
numpy.__version__


Out[8]:
'1.9.2'

Check that you have the datasets


In [9]:
%run ../fetch_data.py


Using existing dataset folder:/Users/ogrisel/code/parallel_ml_tutorial/datasets
Checking availability of the 20 newsgroups dataset
Found archive: /Users/ogrisel/code/parallel_ml_tutorial/datasets/20news-bydate.tar.gz
Checking that the 20 newsgroups files exist...
=> Success!

In [10]:
import os
for fname in os.listdir('../datasets/'):
    print(fname)


20news-bydate-test
20news-bydate-train
20news-bydate.tar.gz
sentiment140
titanic_train.csv
trainingandtestdata.zip

A NumPy primer

NumPy array dtypes and shapes


In [11]:
import numpy as np

In [12]:
a = np.array([1, 2, 3])

In [13]:
a


Out[13]:
array([1, 2, 3])

In [14]:
b = np.array([[0, 2, 4], [1, 3, 5]])

In [15]:
b


Out[15]:
array([[0, 2, 4],
       [1, 3, 5]])

In [16]:
b.shape


Out[16]:
(2, 3)

In [17]:
b.dtype


Out[17]:
dtype('int64')

In [18]:
a.shape


Out[18]:
(3,)

In [19]:
a.dtype


Out[19]:
dtype('int64')

In [20]:
np.zeros(5)


Out[20]:
array([ 0.,  0.,  0.,  0.,  0.])

In [21]:
np.ones(shape=(3, 4), dtype=np.int32)


Out[21]:
array([[1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1]], dtype=int32)
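
As a quick aside (not one of the original cells), an array's dtype can also be converted after creation with astype, which returns a new array rather than modifying the original; the variable name x below is just for illustration:


In [ ]:
x = np.array([1, 2, 3], dtype=np.float64)
print(x.dtype)             # float64
print(x.astype(np.int32))  # [1 2 3]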

Common array operations


In [22]:
c = b * 0.5

In [23]:
c


Out[23]:
array([[ 0. ,  1. ,  2. ],
       [ 0.5,  1.5,  2.5]])

In [24]:
c.shape


Out[24]:
(2, 3)

In [25]:
c.dtype


Out[25]:
dtype('float64')

In [26]:
a


Out[26]:
array([1, 2, 3])

In [27]:
d = a + c

In [28]:
d


Out[28]:
array([[ 1. ,  3. ,  5. ],
       [ 1.5,  3.5,  5.5]])

In [29]:
d[0]


Out[29]:
array([ 1.,  3.,  5.])

In [30]:
d[0, 0]


Out[30]:
1.0

In [31]:
d[:, 0]


Out[31]:
array([ 1. ,  1.5])

In [32]:
d.sum()


Out[32]:
19.5

In [33]:
d.mean()


Out[33]:
3.25

In [34]:
d.sum(axis=0)


Out[34]:
array([  2.5,   6.5,  10.5])

In [35]:
d.mean(axis=1)


Out[35]:
array([ 3. ,  3.5])
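
Note that d = a + c above adds an array of shape (3,) to an array of shape (2, 3): NumPy broadcasts a against each row of c. The cell below is a small sketch (not in the original notebook) of the same rule with a (2, 1) column named col, which broadcasts across the columns instead:


In [ ]:
col = np.array([[10.], [20.]])
print(col.shape)  # (2, 1)
print(c + col)    # adds 10. to the first row of c and 20. to the second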

Reshaping and in-place updates


In [36]:
e = np.arange(12)

In [37]:
e


Out[37]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [38]:
f = e.reshape(3, 4)

In [39]:
f


Out[39]:
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [40]:
e


Out[40]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [41]:
e[5:] = 0

In [42]:
e


Out[42]:
array([0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0])

In [43]:
f


Out[43]:
array([[0, 1, 2, 3],
       [4, 0, 0, 0],
       [0, 0, 0, 0]])
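
The reason the update to e also shows up in f is that reshape returns a view on the same data whenever it can (as it does here) rather than a copy. The cell below is a small sketch (not one of the original cells) that makes this explicit with np.may_share_memory and shows that an explicit copy breaks the link:


In [ ]:
print(np.may_share_memory(e, f))  # True: f is a view on e's data
g = f.copy()                      # an independent copy
e[:] = 1                          # updates e and, through the view, f
print(g)                          # g keeps the old values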

Combining arrays


In [44]:
a


Out[44]:
array([1, 2, 3])

In [45]:
b


Out[45]:
array([[0, 2, 4],
       [1, 3, 5]])

In [46]:
d


Out[46]:
array([[ 1. ,  3. ,  5. ],
       [ 1.5,  3.5,  5.5]])

In [47]:
np.concatenate([a, a, a])


Out[47]:
array([1, 2, 3, 1, 2, 3, 1, 2, 3])

In [48]:
np.vstack([a, b, d])


Out[48]:
array([[ 1. ,  2. ,  3. ],
       [ 0. ,  2. ,  4. ],
       [ 1. ,  3. ,  5. ],
       [ 1. ,  3. ,  5. ],
       [ 1.5,  3.5,  5.5]])

In [49]:
np.hstack([b, d])


Out[49]:
array([[ 0. ,  2. ,  4. ,  1. ,  3. ,  5. ],
       [ 1. ,  3. ,  5. ,  1.5,  3.5,  5.5]])
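
For 2-D inputs such as b and d, vstack and hstack are equivalent to concatenating along axis 0 and axis 1 respectively; the cell below (not part of the original notebook) checks this:


In [ ]:
print(np.concatenate([b, d], axis=0))  # same result as np.vstack([b, d])
print(np.concatenate([b, d], axis=1))  # same result as np.hstack([b, d])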

A Matplotlib primer


In [50]:
%matplotlib inline

In [51]:
import matplotlib.pyplot as plt

In [52]:
x = np.linspace(0, 2, 10)

In [53]:
x


Out[53]:
array([ 0.        ,  0.22222222,  0.44444444,  0.66666667,  0.88888889,
        1.11111111,  1.33333333,  1.55555556,  1.77777778,  2.        ])

In [54]:
plt.plot(x, 'o-');  # with a single array, the values are plotted against their index



In [55]:
plt.plot(x, x, 'o-', label='linear')
plt.plot(x, x ** 2, 'x-', label='quadratic')

plt.legend(loc='best')
plt.title('Linear vs Quadratic progression')
plt.xlabel('Input')
plt.ylabel('Output');



In [56]:
samples = np.random.normal(loc=1.0, scale=0.5, size=1000)

In [57]:
samples.shape


Out[57]:
(1000,)

In [58]:
samples.dtype


Out[58]:
dtype('float64')

In [59]:
samples[:30]


Out[59]:
array([ 0.49154735, -0.20568649,  1.31377114,  2.00566731,  0.7841002 ,
        1.37162944,  0.35393214,  0.64864944,  0.61681463,  1.42298751,
        1.10847225,  1.90760358,  0.98976634,  1.32393257,  1.15057275,
        0.71417411,  1.27197596,  1.82152059,  1.12728921,  0.93116918,
        0.54941321,  1.18379504,  0.60626759,  1.16439308,  2.41235109,
        1.92930117,  0.30859505, -0.12063577,  2.29732949,  1.47543464])

In [60]:
plt.hist(samples, bins=50);



In [61]:
samples_1 = np.random.normal(loc=1, scale=.5, size=10000)
samples_2 = np.random.standard_t(df=10, size=10000)

In [62]:
bins = np.linspace(-3, 3, 50)
_ = plt.hist(samples_1, bins=bins, alpha=0.5, label='samples 1')
_ = plt.hist(samples_2, bins=bins, alpha=0.5, label='samples 2')
plt.legend(loc='upper left');



In [63]:
plt.scatter(samples_1, samples_2, alpha=0.1);
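
As a closing sketch (not part of the original notebook), the histogram comparison and the scatter plot can also be drawn side by side with plt.subplots, which returns a figure and one axes object per panel; ax0 and ax1 are just illustrative names:


In [ ]:
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
ax0.hist(samples_1, bins=bins, alpha=0.5, label='samples 1')
ax0.hist(samples_2, bins=bins, alpha=0.5, label='samples 2')
ax0.legend(loc='upper left')
ax1.scatter(samples_1, samples_2, alpha=0.1)
ax1.set_title('samples_1 vs samples_2');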



In [64]: