## Chapter 2: Basic Operations

### Vectors, Data, Matrices and Subsetting

``````

In [1]:

from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

``````
``````

In [2]:

x = np.array([2,7,5])    # explicit vector creation from a Python list
x

``````
``````

Out[2]:

array([2, 7, 5])

``````
``````

In [3]:

y = np.arange(4, 13, 3)  # vector creation from a sequence (start, stop, step); stop is exclusive
y

``````
``````

Out[3]:

array([ 4,  7, 10])

``````
``````

In [4]:

x + y    # vectors can be added (element by element)

``````
``````

Out[4]:

array([ 6, 14, 15])

``````
``````

In [5]:

x / y    # divided element by element; true (float) division -- on Python 2 this relies on `from __future__ import division`

``````
``````

Out[5]:

array([ 0.5,  1. ,  0.5])

``````
``````

In [6]:

x ** y    # exponentiated element by element: x[i] raised to the power y[i]

``````
``````

Out[6]:

array([     16,  823543, 9765625])

``````
``````

In [7]:

x[1]    # vector elements can be selected by position (0-based, so this is the second element)

``````
``````

Out[7]:

7

``````
``````

In [8]:

x[1:3]  # multiple elements can be selected using slices (start inclusive, stop exclusive)

``````
``````

Out[8]:

array([7, 5])

``````
``````

In [9]:

x[-2]  # elements can be specified as offset from end (-1 is the last element)

``````
``````

Out[9]:

7

``````
``````

In [10]:

x[np.array([0,1])]  # elements can be selected with an array of integer positions

``````
``````

Out[10]:

array([2, 7])

``````
``````

In [11]:

# Build a 4x3 matrix from the integers 1..12; NumPy fills it row by row, as the output shows.
Z = np.matrix(np.arange(1,13)).reshape((4, 3))
Z                 # note: R arranges the elements column-wise

``````
``````

Out[11]:

matrix([[ 1,  2,  3],
[ 4,  5,  6],
[ 7,  8,  9],
[10, 11, 12]])

``````
``````

In [12]:

Z[2:4, 1:3]    # rows 2-3, columns 1-2. R is 1-based and includes ending index, Python is 0 based and does not.

``````
``````

Out[12]:

matrix([[ 8,  9],
[11, 12]])

``````
``````

In [13]:

Z[:, 1:3]    # column slice: all rows, columns 1 and 2

``````
``````

Out[13]:

matrix([[ 2,  3],
[ 5,  6],
[ 8,  9],
[11, 12]])

``````
``````

In [14]:

Z.shape    # (number of rows, number of columns)

``````
``````

Out[14]:

(4, 3)

``````

### Generating Random Data, Graphics

``````

In [15]:

# 50 draws from the uniform distribution on [0, 1); no seed is set, so the values differ on every run.
# Note: this rebinds x, replacing the small vector used in the earlier cells.
x = np.random.uniform(0.0, 1.0, 50)
x

``````
``````

Out[15]:

array([ 0.65268288,  0.55438063,  0.12332376,  0.73656161,  0.172588  ,
0.6952661 ,  0.36758793,  0.42034067,  0.54835434,  0.85493252,
0.36666796,  0.14029571,  0.16910043,  0.18956811,  0.77016189,
0.1914512 ,  0.48202498,  0.98936099,  0.43339457,  0.3690124 ,
0.91976653,  0.42354589,  0.23428098,  0.24846295,  0.20409963,
0.71024602,  0.57361799,  0.12510961,  0.70550836,  0.62120022,
0.7089459 ,  0.67670746,  0.39836928,  0.17109468,  0.57367169,
0.60584339,  0.72445482,  0.67991423,  0.94658844,  0.91068048,
0.65489981,  0.05985463,  0.56797862,  0.67604148,  0.989085  ,
0.79619792,  0.46406902,  0.91227664,  0.38043679,  0.04246386])

``````
``````

In [16]:

# 50 draws from the normal distribution with mean 0 and standard deviation 1 (also unseeded).
# Note: this rebinds y, replacing the small vector used in the earlier cells.
y = np.random.normal(0.0, 1.0, 50)
y

``````
``````

Out[16]:

array([ -5.95648785e-01,   1.54445571e+00,   1.35312925e+00,
1.88123098e+00,  -3.80353324e-01,  -9.67551972e-04,
5.97422145e-01,   3.76114342e-02,  -1.33779968e+00,
-5.95407697e-01,   1.27206236e+00,  -1.70114410e+00,
-1.84147571e+00,   7.16611666e-01,   2.33006825e+00,
-6.56709283e-01,   8.26951067e-01,   4.64854930e-01,
8.79290232e-01,  -7.42861574e-02,   4.88797828e-01,
1.50860241e+00,   1.07467313e+00,  -7.99843607e-01,
6.81719603e-02,   5.06424430e-01,   2.15770333e-01,
1.67843398e+00,  -5.01086886e-01,   1.51407152e-01,
-3.30648417e-02,  -8.50683506e-01,  -1.56531920e+00,
-6.43438018e-01,  -9.55514605e-01,   1.51696803e-01,
-9.87417666e-01,  -4.44820480e-01,   1.21902188e-01,
-5.20474443e-01,   8.31544982e-01,  -6.75527626e-01,
1.00537660e+00,   4.87948062e-01,   3.43123002e-01,
3.95408496e-01,   4.27046930e-01,   3.69945068e-01,
1.36728064e+00,  -3.15495180e-01])

``````
``````

In [17]:

# Scatter plot of y against x, drawn directly on the axes created for this figure.
fig, ax = plt.subplots()
ax.scatter(x, y)

``````
``````

Out[17]:

<matplotlib.collections.PathCollection at 0x420b410>

``````
``````

In [18]:

# Same scatter plot, now with axis labels and a custom marker/colour, using the axes object
# returned by plt.subplots() rather than the implicit "current axes".
fig, ax = plt.subplots()
ax.set_xlabel("Random Uniform")
ax.set_ylabel("Random Normal")
ax.scatter(x, y, marker='o', color='red')  # plot customizations

``````
``````

Out[18]:

<matplotlib.collections.PathCollection at 0x4241f50>

``````
``````

In [19]:

plt.subplot(121)    # 3-digit code: 1 row, 2 columns, first panel (left)
plt.scatter(x, y)
plt.subplot(122)    # same 1x2 grid, second panel (right)
plt.hist(y)         # histogram of y; returns (bin counts, bin edges, patches)

``````
``````

Out[19]:

(array([ 3.,  1.,  9.,  5.,  9.,  9.,  6.,  3.,  4.,  1.]),
array([-1.84147571, -1.42432131, -1.00716692, -0.59001252, -0.17285812,
0.24429627,  0.66145067,  1.07860506,  1.49575946,  1.91291385,
2.33006825]),
<a list of 10 Patch objects>)

``````

``````

In [20]:

# data comes from ISL book site: http://www-bcf.usc.edu/~gareth/ISL/data.html
# Load the Auto data set into a DataFrame so that auto_df is defined on a fresh kernel.
# NOTE(review): the path assumes Auto.csv was downloaded to ../data/ -- adjust to your layout.
auto_df = pd.read_csv("../data/Auto.csv")
auto_df.columns     # column names

``````
``````

Out[20]:

Index([u'mpg', u'cylinders', u'displacement', u'horsepower', u'weight', u'acceleration', u'year', u'origin', u'name'], dtype='object')

``````
``````

In [21]:

auto_df.shape    # number of rows, number of columns

``````
``````

Out[21]:

(397, 9)

``````
``````

In [22]:

type(auto_df)    # the data set is held in a pandas DataFrame

``````
``````

Out[22]:

pandas.core.frame.DataFrame

``````
``````

In [23]:

auto_df.describe()  # rough equivalent of R's summary() on a data frame; horsepower and name are left out of the result

``````
``````

Out[23]:

mpg
cylinders
displacement
weight
acceleration
year
origin

count
397.000000
397.000000
397.000000
397.000000
397.000000
397.000000
397.000000

mean
23.515869
5.458438
193.532746
2970.261965
15.555668
75.994962
1.574307

std
7.825804
1.701577
104.379583
847.904119
2.749995
3.690005
0.802549

min
9.000000
3.000000
68.000000
1613.000000
8.000000
70.000000
1.000000

25%
17.500000
4.000000
104.000000
2223.000000
13.800000
73.000000
1.000000

50%
23.000000
4.000000
146.000000
2800.000000
15.500000
76.000000
1.000000

75%
29.000000
8.000000
262.000000
3609.000000
17.100000
79.000000
2.000000

max
46.600000
8.000000
455.000000
5140.000000
24.800000
82.000000
3.000000

8 rows × 7 columns

``````
``````

In [24]:

# Plot MPG against number of cylinders as points.
# The y label is set on the Axes that DataFrame.plot returns: calling plt.ylabel() beforehand
# would label whatever axes was current at that moment, not the one pandas draws this plot on.
ax = auto_df.plot(x="cylinders", y="mpg", style='o')
ax.set_ylabel("MPG")
ax    # keep the Axes as the cell's displayed value

``````
``````

Out[24]:

<matplotlib.axes.AxesSubplot at 0x46ef650>

``````
``````

In [25]:

auto_df.boxplot(column="mpg", by="cylinders") # MPG distribution by number of cylinders (one box per cylinder count)

``````
``````

Out[25]:

<matplotlib.axes.AxesSubplot at 0x499f510>

``````
``````

In [26]:

# similar to R pairs, shows pairwise scatter plots between columns and the distribution of each
# column along the diagonal.
# The R version that uses formulas does not seem to have a Python equivalent (and doesn't seem
# to be very useful for exploratory analysis IMO).
# scatter_matrix lives in pd.plotting in newer pandas releases and in pd.tools.plotting in
# older ones, so use whichever location the installed version provides.
try:
    scatter_matrix = pd.plotting.scatter_matrix
except AttributeError:  # older pandas without the pd.plotting namespace
    scatter_matrix = pd.tools.plotting.scatter_matrix
axes = scatter_matrix(auto_df, color="brown")

``````
``````

``````