## Chapter 2: Basic Operations

### Vectors, Data, Matrices and Subsetting

In [1]:

from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:

x = np.array([2,7,5])    # explicit vector creation
x

Out[2]:

array([2, 7, 5])

In [3]:

y = np.arange(4, 13, 3)  # vector creation from a sequence (start, stop, step); stop itself is excluded
y

Out[3]:

array([ 4,  7, 10])

In [4]:

x + y    # vectors can be added

Out[4]:

array([ 6, 14, 15])

In [5]:

x / y    # divided element-wise; true (float) division because of the __future__ import at the top

Out[5]:

array([ 0.5,  1. ,  0.5])

In [6]:

x ** y    # exponentiated

Out[6]:

array([     16,  823543, 9765625])

In [7]:

x[1]    # vector elements can be selected by position

Out[7]:

7

In [8]:

x[1:3]  # multiple elements can be selected using slices

Out[8]:

array([7, 5])

In [9]:

x[-2]  # elements can be specified as offset from end

Out[9]:

7

In [10]:

x[np.array([0,1])]  # elements can be specified as an array

Out[10]:

array([2, 7])

In [11]:

Z = np.matrix(np.arange(1,13)).reshape((4, 3))
Z                 # note: NumPy fills the 4x3 matrix row by row, whereas R arranges the elements column-wise

Out[11]:

matrix([[ 1,  2,  3],
[ 4,  5,  6],
[ 7,  8,  9],
[10, 11, 12]])

In [12]:

Z[2:4, 1:3]    # rows 2-3, columns 1-2: Python is 0-based and excludes the ending index; R is 1-based and includes it.

Out[12]:

matrix([[ 8,  9],
[11, 12]])

In [13]:

Z[:, 1:3]    # column slice

Out[13]:
matrix([[ 2,  3],
[ 5,  6],
[ 8,  9],
[11, 12]])

In [14]:

Z.shape

Out[14]:

(4, 3)

### Generating Random Data, Graphics

In [15]:

x = np.random.uniform(0.0, 1.0, 50)
x

Out[15]:

array([ 0.65268288,  0.55438063,  0.12332376,  0.73656161,  0.172588  ,
0.6952661 ,  0.36758793,  0.42034067,  0.54835434,  0.85493252,
0.36666796,  0.14029571,  0.16910043,  0.18956811,  0.77016189,
0.1914512 ,  0.48202498,  0.98936099,  0.43339457,  0.3690124 ,
0.91976653,  0.42354589,  0.23428098,  0.24846295,  0.20409963,
0.71024602,  0.57361799,  0.12510961,  0.70550836,  0.62120022,
0.7089459 ,  0.67670746,  0.39836928,  0.17109468,  0.57367169,
0.60584339,  0.72445482,  0.67991423,  0.94658844,  0.91068048,
0.65489981,  0.05985463,  0.56797862,  0.67604148,  0.989085  ,
0.79619792,  0.46406902,  0.91227664,  0.38043679,  0.04246386])

In [16]:

y = np.random.normal(0.0, 1.0, 50)
y

Out[16]:

array([ -5.95648785e-01,   1.54445571e+00,   1.35312925e+00,
1.88123098e+00,  -3.80353324e-01,  -9.67551972e-04,
5.97422145e-01,   3.76114342e-02,  -1.33779968e+00,
-5.95407697e-01,   1.27206236e+00,  -1.70114410e+00,
-1.84147571e+00,   7.16611666e-01,   2.33006825e+00,
-6.56709283e-01,   8.26951067e-01,   4.64854930e-01,
8.79290232e-01,  -7.42861574e-02,   4.88797828e-01,
1.50860241e+00,   1.07467313e+00,  -7.99843607e-01,
6.81719603e-02,   5.06424430e-01,   2.15770333e-01,
1.67843398e+00,  -5.01086886e-01,   1.51407152e-01,
-3.30648417e-02,  -8.50683506e-01,  -1.56531920e+00,
-6.43438018e-01,  -9.55514605e-01,   1.51696803e-01,
-9.87417666e-01,  -4.44820480e-01,   1.21902188e-01,
-5.20474443e-01,   8.31544982e-01,  -6.75527626e-01,
1.00537660e+00,   4.87948062e-01,   3.43123002e-01,
3.95408496e-01,   4.27046930e-01,   3.69945068e-01,
1.36728064e+00,  -3.15495180e-01])

In [17]:

# Create a fresh figure/axes pair and draw the scatter directly on that axes.
fig, ax = plt.subplots()
ax.scatter(x, y)

Out[17]:

<matplotlib.collections.PathCollection at 0x420b410>

In [18]:

# Same scatter as before, now with axis labels and a custom marker/colour,
# all applied through the axes object returned by plt.subplots().
fig, ax = plt.subplots()
ax.set_xlabel("Random Uniform")
ax.set_ylabel("Random Normal")
ax.scatter(x, y, marker='o', color='red')  # plot customizations

Out[18]:

<matplotlib.collections.PathCollection at 0x4241f50>

In [19]:

plt.subplot(121)    # 121 = 1 row, 2 columns, first panel
plt.scatter(x, y)
plt.subplot(122)    # second panel of the same 1x2 grid
plt.hist(y)

Out[19]:

(array([ 3.,  1.,  9.,  5.,  9.,  9.,  6.,  3.,  4.,  1.]),
array([-1.84147571, -1.42432131, -1.00716692, -0.59001252, -0.17285812,
0.24429627,  0.66145067,  1.07860506,  1.49575946,  1.91291385,
2.33006825]),
<a list of 10 Patch objects>)

In [20]:

# data comes from ISL book site: http://www-bcf.usc.edu/~gareth/ISL/data.html
# NOTE(review): no visible cell defines auto_df, so a fresh kernel would fail here with a
# NameError. The frame is loaded explicitly; the path is an assumption -- point it at
# wherever your local copy of Auto.csv lives.
auto_df = pd.read_csv("../data/Auto.csv")
auto_df.columns     # column names

Out[20]:

Index([u'mpg', u'cylinders', u'displacement', u'horsepower', u'weight', u'acceleration', u'year', u'origin', u'name'], dtype='object')

In [21]:

auto_df.shape    # number of rows, number of columns

Out[21]:

(397, 9)

In [22]:

type(auto_df)

Out[22]:

pandas.core.frame.DataFrame

In [23]:

auto_df.describe()  # rough equivalent of R's summary(); only the numeric columns are summarized

Out[23]:

              mpg   cylinders  displacement       weight  acceleration        year      origin
count  397.000000  397.000000    397.000000   397.000000    397.000000  397.000000  397.000000
mean    23.515869    5.458438    193.532746  2970.261965     15.555668   75.994962    1.574307
std      7.825804    1.701577    104.379583   847.904119      2.749995    3.690005    0.802549
min      9.000000    3.000000     68.000000  1613.000000      8.000000   70.000000    1.000000
25%     17.500000    4.000000    104.000000  2223.000000     13.800000   73.000000    1.000000
50%     23.000000    4.000000    146.000000  2800.000000     15.500000   76.000000    1.000000
75%     29.000000    8.000000    262.000000  3609.000000     17.100000   79.000000    2.000000
max     46.600000    8.000000    455.000000  5140.000000     24.800000   82.000000    3.000000

8 rows × 7 columns

In [24]:

# Plot MPG against cylinder count as unconnected points.
# The y label is set on the axes returned by DataFrame.plot: calling plt.ylabel() first
# labels whatever axes happen to be current before the plot's own axes exist, so the
# label would not appear on this chart.
ax = auto_df.plot(x="cylinders", y="mpg", style='o')
ax.set_ylabel("MPG")
ax    # keep the axes as the cell's displayed value

Out[24]:

<matplotlib.axes.AxesSubplot at 0x46ef650>

In [25]:

auto_df.boxplot(column="mpg", by="cylinders") # MPG distribution by number of cylinders

Out[25]:

<matplotlib.axes.AxesSubplot at 0x499f510>

In [26]:

# Similar to R's pairs(): pairwise scatter plots between columns, with the distribution of
# each column along the diagonal.
# The R version that uses formulas does not seem to have a Python equivalent (and doesn't seem
# to be very useful for exploratory analysis IMO).
# scatter_matrix moved from pd.tools.plotting to pd.plotting in newer pandas releases, so
# look it up in the new location first and fall back to the old one.
try:
    scatter_matrix = pd.plotting.scatter_matrix
except AttributeError:
    scatter_matrix = pd.tools.plotting.scatter_matrix
axes = scatter_matrix(auto_df, color="brown")

