a[1, 3] vs. a[1][3]
Exercise for getting familiar with arrays
In [1]:
import numpy as np
import pandas as pd
In [2]:
# Ridership for 10 days across 5 stations
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])
In [3]:
print ridership[1, 3]
print ridership[1:3, 3:5]
print ridership[1, :]
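As the title suggests, there are two ways to index a 2D array, and they are not equivalent under the hood: a[1][3] is chained indexing that first materializes row 1 as a new 1D array and then indexes into it, while a[1, 3] indexes both axes in a single step. A minimal sketch of the difference (both print the same value here):

print ridership[1][3]  # chained: builds the 1D array ridership[1], then takes element 3
print ridership[1, 3]  # tuple indexing: one lookup; also the only form that supports
                       # multi-axis slices such as ridership[1:3, 3:5]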
In [4]:
print ridership[0, :] + ridership[1, :]
print ridership[:, 0] + ridership[:, 1]
In [5]:
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
a + b
Out[5]:
In [6]:
def mean_riders_for_max_station(ridership):
    '''
    Fill in this function to find the station with the maximum riders on the
    first day, then return the mean riders per day for that station. Also
    return the mean ridership overall for comparison.

    Hint: NumPy's argmax() function might be useful:
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html
    '''
    max_station_first_day = ridership[0, :].argmax()
    print max_station_first_day  # index of the station with the most riders on day 1
    overall_mean = ridership.mean()
    mean_for_max = ridership[:, max_station_first_day].mean()
    return (overall_mean, mean_for_max)
In [7]:
mean_riders_for_max_station(ridership)
Out[7]:
In [8]:
a = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
])
In [9]:
print a.sum()        # sum of all elements
print a.sum(axis=0)  # sum down each column
print a.sum(axis=1)  # sum across each row
Exercise for finding the min and max riders per day
In [10]:
def min_and_max_riders_per_day(ridership):
    '''
    Fill in this function. First, for each subway station, calculate the
    mean ridership per day. Then, out of all the subway stations, return the
    maximum and minimum of these values. That is, find the maximum
    mean-ridership-per-day and the minimum mean-ridership-per-day for any
    subway station.
    '''
    mean_ridership = ridership.mean(axis=0)
    max_daily_ridership = mean_ridership.max()
    min_daily_ridership = mean_ridership.min()
    return (max_daily_ridership, min_daily_ridership)
In [11]:
min_and_max_riders_per_day(ridership)
Out[11]:
In [12]:
enrollments_df = pd.DataFrame({
    'account_key': [448, 448, 448],
    'status': ['canceled', 'canceled', 'canceled'],
    'days_to_cancel': [65, 5, 0]
})
In [13]:
enrollments_df
Out[13]:
In [14]:
enrollments_df.mean()
Out[14]:
In [15]:
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)
In [16]:
ridership_df
Out[16]:
We can access the data in a DataFrame in several ways.
By row index
In [17]:
ridership_df.loc['05-02-11']
Out[17]:
By position
In [18]:
ridership_df.iloc[9]
Out[18]:
By row and column position
In [19]:
ridership_df.iloc[0, 3]
Out[19]:
By row index and column name
In [20]:
ridership_df.loc['05-04-11', 'R004']
Out[20]:
In [21]:
ridership_df['R004']
Out[21]:
We can get the underlying NumPy array
In [22]:
ridership_df.values
Out[22]:
In [23]:
ridership_df.values.mean()
Out[23]:
In [24]:
# Creating a DataFrame from a list of lists
pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
Out[24]:
In [25]:
# Accessing multiple columns
ridership_df[['R003', 'R005']]
Out[25]:
Let us rewrite, for DataFrames, a function that we wrote earlier for NumPy arrays.
In [26]:
def mean_riders_for_max_station(ridership):
    '''
    Fill in this function to find the station with the maximum riders on the
    first day, then return the mean riders per day for that station. Also
    return the mean ridership overall for comparison.

    This is the same as a previous exercise, but this time the
    input is a Pandas DataFrame rather than a 2D NumPy array.
    '''
    max_station_first_day = ridership.iloc[0].idxmax()  # column label of the busiest station on day 1
    overall_mean = ridership.values.mean()
    mean_for_max = ridership[max_station_first_day].mean()
    return (overall_mean, mean_for_max)
In [27]:
mean_riders_for_max_station(ridership_df)
Out[27]:
In [28]:
subway_df = pd.read_csv('nyc_subway_weather.csv')
In [29]:
subway_df.head()
Out[29]:
In [30]:
subway_df.describe()
Out[30]:
Subtracting the mean from each variable divides the scatter plot into four quadrants around the point of means.
Pearson's correlation coefficient is the average of (x in standard units) * (y in standard units), i.e. r = mean(((x - mean(x)) / std(x)) * ((y - mean(y)) / std(y))).
Note that pandas' std() function computes the standard deviation using Bessel's correction (dividing by n - 1). Calling std(ddof=0) ensures that Bessel's correction is not used.
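To see the difference, here is a minimal sketch comparing the two on a small illustrative Series:

s = pd.Series([1, 2, 3, 4])
print s.std()        # ~1.291, divides by n - 1 (Bessel's correction)
print s.std(ddof=0)  # ~1.118, divides by n (population standard deviation)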
In [31]:
def standardize_1D(x):
    return (x - x.mean()) / x.std(ddof=0)

def correlation(x, y):
    return (standardize_1D(x) * standardize_1D(y)).mean()
In [32]:
correlation(subway_df['ENTRIESn_hourly'], subway_df['meanprecipi'])
Out[32]:
This correlation is not very strong, so mean precipitation is related to hourly entries, but only weakly.
In [33]:
correlation(subway_df['ENTRIESn_hourly'], subway_df['ENTRIESn'])
Out[33]:
This correlation is stronger, as we would expect cumulative entries to be related to hourly entries.
In [34]:
entries_and_exits = pd.DataFrame({
    'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
                 3144808, 3144895, 3144905, 3144941, 3145094],
    'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
               1088317, 1088328, 1088331, 1088420, 1088753]
})
entries_and_exits
Out[34]:
In [35]:
# Adding DataFrames with matching column names
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]})
print df1 + df2
In [36]:
# Adding DataFrames with overlapping column names
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'d': [10, 20, 30], 'c': [40, 50, 60], 'b': [70, 80, 90]})
print df1 + df2
In [37]:
# Adding DataFrames with overlapping row indexes
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
                   index=['row1', 'row2', 'row3'])
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]},
                   index=['row4', 'row3', 'row2'])
print df1 + df2
Documentation for the pandas shift() function is here: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.shift.html
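Before using it, a minimal sketch of what shift(1) does: each value moves down one position and NaN fills the vacated first slot, so subtracting the shifted Series gives consecutive differences.

s = pd.Series([10, 20, 35, 55])
print s.shift(1)      # NaN, 10.0, 20.0, 35.0
print s - s.shift(1)  # NaN, 10.0, 15.0, 20.0 (the element-wise differences)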
In [38]:
def get_hourly_entries_and_exits(entries_and_exits):
    '''
    Fill in this function to take a DataFrame with cumulative entries
    and exits (entries in the first column, exits in the second) and
    return a DataFrame with hourly entries and exits (entries in the
    first column, exits in the second).
    '''
    return entries_and_exits - entries_and_exits.shift(1)
In [39]:
get_hourly_entries_and_exits(entries_and_exits)
Out[39]:
In [40]:
# diff() computes the same differences with a single built-in call
entries_and_exits.diff()
Out[40]:
In [41]:
df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [10, 20, 30],
    'c': [5, 10, 15]
})

def add_one(x):
    return x + 1

df.applymap(add_one)
Out[41]:
In [42]:
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio',
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)
grades_df
Out[42]:
In [43]:
def convert_grades(grades):
    '''
    Fill in this function to convert the given DataFrame of numerical
    grades to letter grades. Return a new DataFrame with the converted
    grade.

    The conversion rule is:
        90-100 -> A
        80-89  -> B
        70-79  -> C
        60-69  -> D
        0-59   -> F
    '''
    def convert_grade(num):
        if num >= 90:
            return 'A'
        elif num >= 80:
            return 'B'
        elif num >= 70:
            return 'C'
        elif num >= 60:
            return 'D'
        else:
            return 'F'

    return grades.applymap(convert_grade)
In [44]:
convert_grades(grades_df)
Out[44]:
In [45]:
def convert_grades_curve(exam_grades):
    # Pandas has a built-in function that will perform this calculation.
    # This will give the bottom 0% to 10% of students the grade 'F',
    # 10% to 20% the grade 'D', and so on. You can read more about
    # the qcut() function here:
    # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.qcut.html
    return pd.qcut(exam_grades,
                   [0, 0.1, 0.2, 0.5, 0.8, 1],
                   labels=['F', 'D', 'C', 'B', 'A'])

# qcut() operates on a list, array, or Series. This is the
# result of running the function on a single column of the
# DataFrame.
print convert_grades_curve(grades_df['exam1'])

# qcut() does not work on DataFrames, but we can use apply()
# to call the function on each column separately.
print grades_df.apply(convert_grades_curve)
In [46]:
def standardize(df):
    '''
    Fill in this function to standardize each column of the given
    DataFrame. To standardize a variable, convert each value to the
    number of standard deviations it is above or below the mean.
    '''
    return df.apply(standardize_1D)
In [47]:
standardize(grades_df)
Out[47]:
Here, since we know the scores of all the students (the full population rather than a sample), we used ddof=0 in standardize_1D: we do not want Bessel's correction.
Another use case of apply() is collapsing each column into a single value rather than producing a new column, e.g. finding the maximum value in a column.
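For instance, a small sketch (using np.max as the reducing function) that collapses each column of grades_df into its highest score:

print grades_df.apply(np.max)  # one value per column: the top score on each exam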
Exercise to find the 2nd largest value in each column of a DataFrame
In [48]:
df = pd.DataFrame({
    'a': [4, 5, 3, 1, 2],
    'b': [20, 10, 40, 50, 30],
    'c': [25, 20, 5, 15, 10]
})
df
Out[48]:
In [49]:
def second_largest_in_column(column):
    sorted_column = column.sort_values(ascending=False)
    return sorted_column.iloc[1]
In [50]:
second_largest_in_column(df['a'])
Out[50]:
In [51]:
def second_largest(df):
    return df.apply(second_largest_in_column)
In [52]:
second_largest(df)
Out[52]:
In [53]:
# Adding a Series to a square DataFrame
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
    0: [10, 20, 30, 40],
    1: [50, 60, 70, 80],
    2: [90, 100, 110, 120],
    3: [130, 140, 150, 160]
})
print df
print '' # Create a blank line between outputs
print df + s
In [54]:
# Adding a Series to a one-row DataFrame
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})
print df
print '' # Create a blank line between outputs
print df + s
In [55]:
# Adding a Series to a one-column DataFrame
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({0: [10, 20, 30, 40]})
print df
print '' # Create a blank line between outputs
print df + s
In [56]:
df.add(s, axis='index')  # match the Series index against the DataFrame's row index
Out[56]:
In [57]:
# Adding when DataFrame column names match Series index
s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
df = pd.DataFrame({
    'a': [10, 20, 30, 40],
    'b': [50, 60, 70, 80],
    'c': [90, 100, 110, 120],
    'd': [130, 140, 150, 160]
})
print df
print '' # Create a blank line between outputs
print df + s
In [58]:
# Adding when DataFrame column names don't match Series index
s = pd.Series([1, 2, 3, 4], index=['b', 'c', 'd', 'e'])
df = pd.DataFrame({
    'a': [10, 20, 30, 40],
    'b': [50, 60, 70, 80],
    'c': [90, 100, 110, 120],
    'd': [130, 140, 150, 160]
})
print df
print '' # Create a blank line between outputs
print df + s
In [59]:
grades_df
Out[59]:
In [60]:
#Standardize each column
(grades_df - grades_df.mean()) / grades_df.std()
Out[60]:
Standardizing each row is a little trickier, so we will break it into steps.
In [61]:
# Find the mean of each row
mean_rows = grades_df.mean(axis='columns')
By default, DataFrame minus Series matches the Series index against the DataFrame's column names, subtracting each Series element from the matching column. Here we want to subtract along rows instead, so we have to specify axis='index'.
In [62]:
# Subtract each row's mean
mean_diffs = grades_df.sub(mean_rows, axis='index')
mean_diffs
Out[62]:
In [63]:
# Divide by each row's standard deviation to finish standardizing
mean_diffs.div(grades_df.std(axis='columns'), axis='index')
Out[63]:
In [64]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
In [65]:
values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame({
    'value': values,
    'even': values % 2 == 0,
    'above_three': values > 3
}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])
In [66]:
values
Out[66]:
In [67]:
example_df
Out[67]:
In [68]:
# Examine groups
grouped_data = example_df.groupby('even')
# The groups attribute is a dictionary mapping keys to lists of row indexes
print grouped_data.groups
In [69]:
# Group by multiple columns
grouped_data = example_df.groupby(['even', 'above_three'])
print grouped_data.groups
In [70]:
# Get sum of each group
grouped_data = example_df.groupby('even')
print grouped_data.sum()
In [71]:
# Limit columns in result
grouped_data = example_df.groupby('even')
# You can take one or more columns from the result DataFrame
print grouped_data.sum()['value']
print '\n' # Blank line to separate results
# You can also take a subset of columns from the grouped data before
# collapsing to a DataFrame. In this case, the result is the same.
print grouped_data['value'].sum()
EXERCISE: Write code here to group the subway data by a variable of your choice, then either print out the mean ridership within each group or create a plot.
Let's see what the subway DataFrame looks like, for reference.
In [72]:
subway_df.columns
Out[72]:
In [73]:
subway_df.head()
Out[73]:
In [74]:
ridership_by_day = subway_df.groupby('day_week').mean()['ENTRIESn_hourly']
In [75]:
ridership_by_day
Out[75]:
In [76]:
# Relabel the index, assuming day_week runs 0-6 with 0 = Monday
ridership_by_day.index = ['Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun']
In [77]:
ridership_by_day
Out[77]:
In [78]:
%pylab inline
figsize(10, 6)  # Changes plot width and height
import seaborn as sns
ridership_plot = ridership_by_day.plot()
ridership_plot.set_ylabel('Hourly entries')
Out[78]:
In the quiz where we calculated hourly entries and exits, we did so for a single set of cumulative counts. In the original data, however, there is a separate set of counts for each station.
Thus, to calculate the hourly entries and exits correctly, it is necessary to group by station and day, then calculate the hourly entries and exits within each group.
So now we are going to write a function to do that.
In [79]:
ridership_df = pd.DataFrame({
    'UNIT': ['R051', 'R079', 'R051', 'R079', 'R051', 'R079', 'R051', 'R079', 'R051'],
    'TIMEn': ['00:00:00', '02:00:00', '04:00:00', '06:00:00', '08:00:00',
              '10:00:00', '12:00:00', '14:00:00', '16:00:00'],
    'ENTRIESn': [3144312, 8936644, 3144335, 8936658, 3144353, 8936687,
                 3144424, 8936819, 3144594],
    'EXITSn': [1088151, 13755385, 1088159, 13755393, 1088177, 13755598,
               1088231, 13756191, 1088275]
})
In [80]:
ridership_df
Out[80]:
In [81]:
ridership_df.groupby('UNIT')[['ENTRIESn', 'EXITSn']].apply(get_hourly_entries_and_exits)
Out[81]:
We get NaNs in the first row for each of the two stations, since there is no previous reading to compare against.
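If those NaN rows are unwanted, one option (a sketch, not part of the original exercise) is to drop or zero-fill them after grouping:

hourly = ridership_df.groupby('UNIT')[['ENTRIESn', 'EXITSn']].apply(get_hourly_entries_and_exits)
print hourly.dropna()   # discard each station's first reading
print hourly.fillna(0)  # or treat the first reading as zero hourly entries/exits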
In [82]:
subway_df = pd.DataFrame({
    'UNIT': ['R003', 'R003', 'R003', 'R003', 'R003', 'R004', 'R004', 'R004',
             'R004', 'R004'],
    'DATEn': ['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
              '05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11'],
    'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'ENTRIESn': [4388333, 4388348, 4389885, 4391507, 4393043, 14656120,
                 14656174, 14660126, 14664247, 14668301],
    'EXITSn': [2911002, 2911036, 2912127, 2913223, 2914284, 14451774,
               14451851, 14454734, 14457780, 14460818],
    'latitude': [40.689945, 40.689945, 40.689945, 40.689945, 40.689945,
                 40.69132, 40.69132, 40.69132, 40.69132, 40.69132],
    'longitude': [-73.872564, -73.872564, -73.872564, -73.872564, -73.872564,
                  -73.867135, -73.867135, -73.867135, -73.867135, -73.867135]
})

weather_df = pd.DataFrame({
    'DATEn': ['05-01-11', '05-01-11', '05-02-11', '05-02-11', '05-03-11',
              '05-03-11', '05-04-11', '05-04-11', '05-05-11', '05-05-11'],
    'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'latitude': [40.689945, 40.69132, 40.689945, 40.69132, 40.689945,
                 40.69132, 40.689945, 40.69132, 40.689945, 40.69132],
    'longitude': [-73.872564, -73.867135, -73.872564, -73.867135, -73.872564,
                  -73.867135, -73.872564, -73.867135, -73.872564, -73.867135],
    'pressurei': [30.24, 30.24, 30.32, 30.32, 30.14, 30.14, 29.98, 29.98,
                  30.01, 30.01],
    'fog': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'rain': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'tempi': [52.0, 52.0, 48.9, 48.9, 54.0, 54.0, 57.2, 57.2, 48.9, 48.9],
    'wspdi': [8.1, 8.1, 6.9, 6.9, 3.5, 3.5, 15.0, 15.0, 15.0, 15.0]
})
In [83]:
def combine_dfs(subway_df, weather_df):
    '''
    Fill in this function to take 2 DataFrames, one with subway data and one with weather data,
    and return a single dataframe with one row for each date, hour, and location. Only include
    times and locations that have both subway data and weather data available.
    '''
    return subway_df.merge(weather_df,
                           on=['DATEn', 'hour', 'latitude', 'longitude'],
                           how='inner')
In [84]:
combine_dfs(subway_df, weather_df)
Out[84]:
The four columns I joined on had the same names in both DataFrames. If the names had differed, we could have used the left_on and right_on arguments to specify them.
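A minimal self-contained sketch of that variant, with hypothetical DataFrames whose key columns have different names (note that both key columns are kept in the result):

left = pd.DataFrame({'date_left': ['05-01-11', '05-02-11'], 'riders': [100, 200]})
right = pd.DataFrame({'date_right': ['05-01-11', '05-02-11'], 'rain': [0, 1]})
print left.merge(right, left_on='date_left', right_on='date_right', how='inner')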
In [85]:
subway_df = pd.read_csv('nyc_subway_weather.csv')
In [86]:
subway_df.columns
Out[86]:
In [87]:
data_by_location = subway_df.groupby(['latitude', 'longitude'],
                                     as_index=False).mean()
data_by_location.head()
Out[87]:
In [88]:
scaled_entries = (data_by_location['ENTRIESn_hourly'] / data_by_location['ENTRIESn_hourly'].std())
In [89]:
plt.scatter(data_by_location['latitude'], data_by_location['longitude'],
            s=scaled_entries)  # marker sizes proportional to scaled ridership
Out[89]: