In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib notebook
In [3]:
# Subway ridership for 5 stations on 10 different days
ridership = np.array([
[ 0, 0, 2, 5, 0],
[1478, 3877, 3674, 2328, 2539],
[1613, 4088, 3991, 6461, 2691],
[1560, 3392, 3826, 4787, 2613],
[1608, 4802, 3932, 4477, 2705],
[1576, 3933, 3909, 4979, 2685],
[ 95, 229, 255, 496, 201],
[ 2, 0, 1, 27, 0],
[1438, 3785, 3589, 4174, 2215],
[1342, 4043, 4009, 4665, 3033]
])
In [4]:
# Accessing elements
print ridership[1, 3]
print ridership[1:3, 3:5]
print ridership[1, :]
# Vectorized operations on rows or columns
print ridership[0, :] + ridership[1, :]
print ridership[:, 0] + ridership[:, 1]
# Vectorized operations on entire arrays
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
print a + b
In [7]:
def mean_riders_for_max_station(ridership):
'''
Fill in this function to find the station with the maximum riders on the
first day, then return the mean riders per day for that station. Also
return the mean ridership overall for comparsion.
Hint: NumPy's argmax() function might be useful:
http://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html
'''
overall_mean = np.mean(ridership)
mean_for_max = np.mean(ridership[:,np.argmax(ridership[0,:])])
return (overall_mean, mean_for_max)
mean_riders_for_max_station(ridership)
Out[7]:
In [8]:
# numpy axis argument
a = np.array([
[1, 2, 3],
[4, 5, 6],
[7, 8, 9]
])
print a.sum()
print a.sum(axis=0)
print a.sum(axis=1)
In [10]:
# Subway ridership for 5 stations on 10 different days
ridership = np.array([
[ 0, 0, 2, 5, 0],
[1478, 3877, 3674, 2328, 2539],
[1613, 4088, 3991, 6461, 2691],
[1560, 3392, 3826, 4787, 2613],
[1608, 4802, 3932, 4477, 2705],
[1576, 3933, 3909, 4979, 2685],
[ 95, 229, 255, 496, 201],
[ 2, 0, 1, 27, 0],
[1438, 3785, 3589, 4174, 2215],
[1342, 4043, 4009, 4665, 3033]
])
def min_and_max_riders_per_day(ridership):
'''
Fill in this function. First, for each subway station, calculate the
mean ridership per day. Then, out of all the subway stations, return the
maximum and minimum of these values. That is, find the maximum
mean-ridership-per-day and the minimum mean-ridership-per-day for any
subway station.
'''
max_daily_ridership = np.max(ridership.mean(axis=0))
min_daily_ridership = np.min(ridership.mean(axis=0))
return (max_daily_ridership, min_daily_ridership)
print min_and_max_riders_per_day(ridership)
In [11]:
# Subway ridership for 5 stations on 10 different days
ridership_df = pd.DataFrame(
data=[[ 0, 0, 2, 5, 0],
[1478, 3877, 3674, 2328, 2539],
[1613, 4088, 3991, 6461, 2691],
[1560, 3392, 3826, 4787, 2613],
[1608, 4802, 3932, 4477, 2705],
[1576, 3933, 3909, 4979, 2685],
[ 95, 229, 255, 496, 201],
[ 2, 0, 1, 27, 0],
[1438, 3785, 3589, 4174, 2215],
[1342, 4043, 4009, 4665, 3033]],
index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
'05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
columns=['R003', 'R004', 'R005', 'R006', 'R007']
)
In [16]:
# DataFrame creation
# You can create a DataFrame out of a dictionary mapping column names to values
df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print df_1
# You can also use a list of lists or a 2D NumPy array
df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
print df_2
# Pandas axis
df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print df.sum()
print df.sum(axis=1)
print df.values.sum()
In [15]:
# Accessing elements
print ridership_df.iloc[0]
print ridership_df.loc['05-05-11']
print ridership_df['R003']
print ridership_df.iloc[1, 3]
# Accessing multiple rows
print ridership_df.iloc[1:4]
# Accessing multiple columns
print ridership_df[['R003', 'R005']]
In [30]:
def mean_riders_for_max_station(ridership):
'''
Fill in this function to find the station with the maximum riders on the
first day, then return the mean riders per day for that station. Also
return the mean ridership overall for comparsion.
This is the same as a previous exercise, but this time the
input is a Pandas DataFrame rather than a 2D NumPy array.
'''
overall_mean = ridership.values.mean()
mean_for_max = ridership[ridership.iloc[0].argmax()].mean()
return (overall_mean, mean_for_max)
print mean_riders_for_max_station(ridership_df)
In [31]:
subway_df = pd.read_csv('nyc_subway_weather.csv')
print subway_df.head()
In [52]:
# computing the pearson correlation
def correlation(x, y):
'''
Fill in this function to compute the correlation between the two
input variables. Each input is either a NumPy array or a Pandas
Series.
correlation = average of (x in standard units) times (y in standard units)
Remember to pass the argument "ddof=0" to the Pandas std() function!
'''
xstd = (x - x.mean())/x.std(ddof=0)
ystd = (y - y.mean())/y.std(ddof=0)
return (xstd*ystd).mean()
In [51]:
entries = subway_df['ENTRIESn_hourly']
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']
print correlation(rain, temp)
print correlation(entries, rain)
print correlation(entries, temp)
print correlation(entries, cum_entries)
In [69]:
# Cumulative entries and exits for one station for a few hours
entries_and_exits = pd.DataFrame({
'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
3144808, 3144895, 3144905, 3144941, 3145094],
'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
1088317, 1088328, 1088331, 1088420, 1088753]
})
In [53]:
# Adding DataFrames with the column names
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]})
print df1 + df2
# Adding DataFrames with overlapping column names
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'d': [10, 20, 30], 'c': [40, 50, 60], 'b': [70, 80, 90]})
print df1 + df2
# Adding DataFrames with overlapping row indexes
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
index=['row1', 'row2', 'row3'])
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]},
index=['row4', 'row3', 'row2'])
print df1 + df2
In [72]:
def get_hourly_entries_and_exits(entries_and_exits):
'''
Fill in this function to take a DataFrame with cumulative entries
and exits (entries in the first column, exits in the second) and
return a DataFrame with hourly entries and exits (entries in the
first column, exits in the second).
'''
return (entries_and_exits - entries_and_exits.shift()).fillna(0.0)
print get_hourly_entries_and_exits(entries_and_exits)
In [73]:
entries_and_exits = pd.DataFrame(
{'ENTRIESn': [10, 40, 60, 65, 85], 'EXITSn': [0, 10, 20, 60, 60]},
index=[0, 1, 2, 3, 4])
print get_hourly_entries_and_exits(entries_and_exits)
In [ ]: