Week 3



In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib notebook

2D NumPy Arrays



In [3]:
# Subway ridership for 5 stations on 10 different days.
# Each row holds one day and each column holds one station, so the array
# has shape (10, 5).
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

In [4]:
# Accessing elements.
# print(...) with a single argument produces the same output on Python 2
# and Python 3, whereas the bare `print x` statement only parses on Python 2.
print(ridership[1, 3])      # a single element: row 1, column 3
print(ridership[1:3, 3:5])  # a 2x2 sub-array: rows 1-2, columns 3-4
print(ridership[1, :])      # all of row 1

# Vectorized operations on rows or columns
print(ridership[0, :] + ridership[1, :])  # element-wise sum of rows 0 and 1
print(ridership[:, 0] + ridership[:, 1])  # element-wise sum of columns 0 and 1

# Vectorized operations on entire arrays
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
print(a + b)


2328
[[2328 2539]
 [6461 2691]]
[1478 3877 3674 2328 2539]
[1478 3877 3676 2333 2539]
[   0 5355 5701 4952 6410 5509  324    2 5223 5385]
[[ 2  3  4]
 [ 6  7  8]
 [10 11 12]]

In [7]:
def mean_riders_for_max_station(ridership):
    '''
    Compare overall mean ridership with the mean for the station that was
    busiest on the first day.

    `ridership` is a 2D NumPy array indexed as [day, station]. The station
    is picked by taking the argmax of row 0; ties resolve to the first
    (lowest-index) maximum, as np.argmax does.

    Returns a tuple (overall_mean, mean_for_max) where overall_mean is the
    mean of every entry in the array and mean_for_max is the mean of the
    selected station's column.
    '''
    first_day = ridership[0, :]
    busiest_station = np.argmax(first_day)
    busiest_station_riders = ridership[:, busiest_station]

    return (np.mean(ridership), np.mean(busiest_station_riders))

# Evaluate on the sample array; as the last expression in the cell, the
# returned (overall_mean, mean_for_max) tuple is shown as the cell output.
mean_riders_for_max_station(ridership)


Out[7]:
(2342.5999999999999, 3239.9000000000001)

In [8]:
# NumPy `axis` argument: selects which dimension a reduction collapses.
# print(...) is used instead of the Python 2-only print statement so the
# cell runs unchanged on Python 2 and Python 3.
a = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
])

print(a.sum())        # no axis: the sum of every element
print(a.sum(axis=0))  # axis=0: one sum per column
print(a.sum(axis=1))  # axis=1: one sum per row


45
[12 15 18]
[ 6 15 24]

In [10]:
# Subway ridership for 5 stations on 10 different days.
# Each row holds one day and each column holds one station, so reducing
# along axis=0 gives one value per station.
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

def min_and_max_riders_per_day(ridership):
    '''
    Find the largest and smallest per-station mean daily ridership.

    `ridership` is a 2D array indexed as [day, station]. The mean over
    days (axis=0) is taken for every station, and the extremes of those
    per-station means are returned.

    Returns a tuple (max_daily_ridership, min_daily_ridership). Note the
    order: the maximum comes first, even though the function name mentions
    "min" first.
    '''
    # Compute the per-station means once and reuse them for both extremes.
    mean_per_station = ridership.mean(axis=0)

    max_daily_ridership = np.max(mean_per_station)
    min_daily_ridership = np.min(mean_per_station)

    return (max_daily_ridership, min_daily_ridership)

# print(...) with one argument behaves identically on Python 2 and 3;
# the bare print statement is a SyntaxError on Python 3.
print(min_and_max_riders_per_day(ridership))


(3239.9000000000001, 1071.2)

Pandas Data Frames



In [11]:
# Subway ridership for 5 stations on 10 different days.
# Same values as the `ridership` NumPy array, but labelled: each row is
# indexed by a day string ('05-01-11' through '05-10-11') and each column
# is named for a station ('R003' through 'R007').
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)

In [16]:
# DataFrame creation.
# print(...) replaces the Python 2-only print statement so the cell runs
# unchanged on Python 2 and Python 3.

# You can create a DataFrame out of a dictionary mapping column names to values
df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print(df_1)

# You can also use a list of lists or a 2D NumPy array (one inner list per row)
df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
print(df_2)

# Pandas axis
df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print(df.sum())         # default axis=0: one sum per column
print(df.sum(axis=1))   # axis=1: one sum per row
print(df.values.sum())  # via the underlying NumPy array: sum of every value


   A  B
0  0  3
1  1  4
2  2  5
   A  B  C
0  0  1  2
1  3  4  5
A     3
B    12
dtype: int64
0    3
1    5
2    7
dtype: int64
15

In [15]:
# Accessing elements.
# print(...) replaces the Python 2-only print statement so the cell runs
# unchanged on Python 2 and Python 3.
print(ridership_df.iloc[0])           # row by integer position
print(ridership_df.loc['05-05-11'])   # row by index label
print(ridership_df['R003'])           # column by name
print(ridership_df.iloc[1, 3])        # single value by (row, column) position

# Accessing multiple rows
print(ridership_df.iloc[1:4])

# Accessing multiple columns
print(ridership_df[['R003', 'R005']])


R003    0
R004    0
R005    2
R006    5
R007    0
Name: 05-01-11, dtype: int64
R003    1608
R004    4802
R005    3932
R006    4477
R007    2705
Name: 05-05-11, dtype: int64
05-01-11       0
05-02-11    1478
05-03-11    1613
05-04-11    1560
05-05-11    1608
05-06-11    1576
05-07-11      95
05-08-11       2
05-09-11    1438
05-10-11    1342
Name: R003, dtype: int64
2328
          R003  R004  R005  R006  R007
05-02-11  1478  3877  3674  2328  2539
05-03-11  1613  4088  3991  6461  2691
05-04-11  1560  3392  3826  4787  2613
          R003  R005
05-01-11     0     2
05-02-11  1478  3674
05-03-11  1613  3991
05-04-11  1560  3826
05-05-11  1608  3932
05-06-11  1576  3909
05-07-11    95   255
05-08-11     2     1
05-09-11  1438  3589
05-10-11  1342  4009
A     3
B    12
dtype: int64
0    3
1    5
2    7
dtype: int64
15

In [30]:
def mean_riders_for_max_station(ridership):
    '''
    Compare overall mean ridership with the mean for the station that was
    busiest on the first day.

    This is the same as a previous exercise, but this time the
    input is a Pandas DataFrame rather than a 2D NumPy array: one row per
    day and one column per station.

    Returns a tuple (overall_mean, mean_for_max) where overall_mean is the
    mean of every value in the frame and mean_for_max is the mean of the
    column whose first-row value is largest.
    '''
    overall_mean = ridership.values.mean()

    # idxmax() returns the column *label* of the first-row maximum.
    # Series.argmax() only did that in old pandas releases; in current
    # pandas it returns an integer position, which is not a valid column
    # key here (KeyError, or silently the wrong column when the labels
    # happen to be integers).
    busiest_station = ridership.iloc[0].idxmax()
    mean_for_max = ridership[busiest_station].mean()

    return (overall_mean, mean_for_max)

# print(...) with one argument behaves identically on Python 2 and 3;
# the bare print statement is a SyntaxError on Python 3.
print(mean_riders_for_max_station(ridership_df))


(2342.5999999999999, 3239.9000000000001)

Reading Subway Data



In [31]:
# Load the subway/weather data set; the path is relative to the notebook's
# working directory.
subway_df = pd.read_csv('nyc_subway_weather.csv')
# Preview the first five rows. print(...) replaces the Python 2-only print
# statement so the cell also runs on Python 3.
print(subway_df.head())


   UNIT     DATEn     TIMEn  ENTRIESn   EXITSn  ENTRIESn_hourly  \
0  R003  05-01-11  00:00:00   4388333  2911002                0   
1  R003  05-01-11  04:00:00   4388333  2911002                0   
2  R003  05-01-11  12:00:00   4388333  2911002                0   
3  R003  05-01-11  16:00:00   4388333  2911002                0   
4  R003  05-01-11  20:00:00   4388333  2911002                0   

   EXITSn_hourly             datetime  hour  day_week     ...       pressurei  \
0              0  2011-05-01 00:00:00     0         6     ...           30.22   
1              0  2011-05-01 04:00:00     4         6     ...           30.25   
2              0  2011-05-01 12:00:00    12         6     ...           30.28   
3              0  2011-05-01 16:00:00    16         6     ...           30.26   
4              0  2011-05-01 20:00:00    20         6     ...           30.28   

  rain  tempi  wspdi meanprecipi  meanpressurei  meantempi  meanwspdi  \
0    0   55.9    3.5           0         30.258      55.98       7.86   
1    0   52.0    3.5           0         30.258      55.98       7.86   
2    0   62.1    6.9           0         30.258      55.98       7.86   
3    0   57.9   15.0           0         30.258      55.98       7.86   
4    0   52.0   10.4           0         30.258      55.98       7.86   

   weather_lat  weather_lon  
0    40.700348   -73.887177  
1    40.700348   -73.887177  
2    40.700348   -73.887177  
3    40.700348   -73.887177  
4    40.700348   -73.887177  

[5 rows x 27 columns]

In [52]:
# Pearson correlation coefficient (Pearson's r)
def correlation(x, y):
    '''
    Return the Pearson correlation between two input variables.

    Each input is either a NumPy array or a Pandas Series. Both are first
    converted to standard units (subtract the mean, divide by the standard
    deviation); the correlation is then the mean of the element-wise
    product of the two standardized variables.

    The standard deviation is taken with ddof=0. This matters for Pandas,
    whose std() would otherwise default to ddof=1.
    '''
    def in_standard_units(values):
        # Centre on the mean, then scale by the ddof=0 standard deviation.
        return (values - values.mean()) / values.std(ddof=0)

    products = in_standard_units(x) * in_standard_units(y)
    return products.mean()

In [51]:
# Pull out the columns to compare.
entries = subway_df['ENTRIESn_hourly']
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']

# Pairwise Pearson correlations. print(...) replaces the Python 2-only
# print statement so the cell also runs on Python 3.
print(correlation(rain, temp))
print(correlation(entries, rain))
print(correlation(entries, temp))
print(correlation(entries, cum_entries))


-0.229034323408
0.0356485157722
-0.0266933483216
0.585895470766

Vectorized Operations on Pandas



In [69]:
# Cumulative entries and exits for one station for a few hours.
# Both columns are running totals (every value is >= the one before it),
# so the count for a single period is the difference between successive rows.
entries_and_exits = pd.DataFrame({
    'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
                 3144808, 3144895, 3144905, 3144941, 3145094],
    'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
               1088317, 1088328, 1088331, 1088420, 1088753]
})

In [53]:
# DataFrame addition aligns on row and column *labels*, not on position.
# print(...) replaces the Python 2-only print statement so the cell runs
# unchanged on Python 2 and Python 3.

# Adding DataFrames with the same column names
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]})
print(df1 + df2)

# Adding DataFrames with overlapping column names:
# columns present in only one frame ('a', 'd') come out as NaN
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'d': [10, 20, 30], 'c': [40, 50, 60], 'b': [70, 80, 90]})
print(df1 + df2)

# Adding DataFrames with overlapping row indexes:
# rows present in only one frame ('row1', 'row4') come out as NaN
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
                    index=['row1', 'row2', 'row3'])
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]},
                    index=['row4', 'row3', 'row2'])
print(df1 + df2)


    a   b   c
0  11  44  77
1  22  55  88
2  33  66  99
    a   b   c   d
0 NaN  74  47 NaN
1 NaN  85  58 NaN
2 NaN  96  69 NaN
       a   b   c
row1 NaN NaN NaN
row2  32  65  98
row3  23  56  89
row4 NaN NaN NaN

In [72]:
def get_hourly_entries_and_exits(entries_and_exits):
    '''
    Convert cumulative entry/exit counts into per-row increments.

    Takes a DataFrame with cumulative entries and exits (entries in the
    first column, exits in the second) and returns a DataFrame with the
    same columns in which each row holds that row's value minus the
    previous row's value.

    The first row has no previous row, so its difference is NaN; every NaN
    in the result is replaced with 0.0.
    '''
    previous_reading = entries_and_exits.shift()
    increments = entries_and_exits - previous_reading
    return increments.fillna(0.0)

# print(...) with one argument behaves identically on Python 2 and 3;
# the bare print statement is a SyntaxError on Python 3.
print(get_hourly_entries_and_exits(entries_and_exits))


   ENTRIESn  EXITSn
0         0       0
1        23       8
2        18      18
3        71      54
4       170      44
5       214      42
6        87      11
7        10       3
8        36      89
9       153     333

In [73]:
# A second, smaller cumulative example whose differences are easy to check
# by hand. Note that this rebinds the `entries_and_exits` name.
entries_and_exits = pd.DataFrame(
    {'ENTRIESn': [10, 40, 60, 65, 85], 'EXITSn': [0, 10, 20, 60, 60]},
    index=[0, 1, 2, 3, 4])

# print(...) replaces the Python 2-only print statement so the cell also
# runs on Python 3.
print(get_hourly_entries_and_exits(entries_and_exits))


   ENTRIESn  EXITSn
0         0       0
1        30      10
2        20      10
3         5      40
4        20       0

In [ ]: