Before we get started, a couple of reminders to keep in mind when using IPython notebooks:
In [2]:
import unicodecsv
## Longer version of code (replaced with shorter, equivalent version below)
# enrollments = []
# f = open('enrollments.csv', 'rb')
# reader = unicodecsv.DictReader(f)
# for row in reader:
#     enrollments.append(row)
# f.close()
with open('enrollments.csv', 'rb') as f:
    reader = unicodecsv.DictReader(f)
    enrollments = list(reader)
In [3]:
#####################################
# 1 #
#####################################
## Read in the data from daily_engagement.csv and project_submissions.csv
## and store the results in the below variables.
## Then look at the first row of each table.
import unicodecsv
with open('daily_engagement.csv', 'rb') as f:
    reader1 = unicodecsv.DictReader(f)
    daily_engagement = list(reader1)
print daily_engagement[0]

with open('project_submissions.csv', 'rb') as f:
    reader2 = unicodecsv.DictReader(f)
    project_submissions = list(reader2)
print project_submissions[0]
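# Since the same open/DictReader/list pattern is repeated for every file, it could be
# factored into a small helper; this is only a sketch (the name read_csv is my own):
def read_csv(filename):
    with open(filename, 'rb') as f:
        reader = unicodecsv.DictReader(f)
        return list(reader)

# e.g. daily_engagement = read_csv('daily_engagement.csv')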
In [4]:
from datetime import datetime as dt
# Takes a date as a string, and returns a Python datetime object.
# If there is no date given, returns None
def parse_date(date):
    if date == '':
        return None
    else:
        return dt.strptime(date, '%Y-%m-%d')

# Takes a string which is either an empty string or represents an integer,
# and returns an int or None.
def parse_maybe_int(i):
    if i == '':
        return None
    else:
        return int(i)
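# Quick sanity check of the two parsers (sample values are illustrative):
# parse_date('2015-01-14')  -> datetime.datetime(2015, 1, 14, 0, 0)
# parse_date('')            -> None
# parse_maybe_int('65')     -> 65
# parse_maybe_int('')       -> None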
# Clean up the data types in the enrollments table
for enrollment in enrollments:
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'
    enrollment['join_date'] = parse_date(enrollment['join_date'])
enrollments[0]
Out[4]:
In [5]:
# Clean up the data types in the engagement table
for engagement_record in daily_engagement:
    engagement_record['lessons_completed'] = int(float(engagement_record['lessons_completed']))
    engagement_record['num_courses_visited'] = int(float(engagement_record['num_courses_visited']))
    engagement_record['projects_completed'] = int(float(engagement_record['projects_completed']))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])
daily_engagement[0]
Out[5]:
In [6]:
# Clean up the data types in the submissions table
for submission in project_submissions:
    submission['completion_date'] = parse_date(submission['completion_date'])
    submission['creation_date'] = parse_date(submission['creation_date'])
project_submissions[0]
Out[6]:
In [7]:
#####################################
# 2 #
#####################################
## Find the total number of rows and the number of unique students (account keys)
## in each table.
enrollment_num_rows = len(enrollments)
list_of_unique_enrl_keys = []
for item in enrollments:
    if int(item.get('account_key')) not in list_of_unique_enrl_keys:
        list_of_unique_enrl_keys.append(int(item.get('account_key')))
enrollment_num_unique_students = len(list_of_unique_enrl_keys)

engagement_num_rows = len(daily_engagement)
list_of_unique_eng_keys = []
for item in daily_engagement:
    if int(item.get('acct')) not in list_of_unique_eng_keys:
        list_of_unique_eng_keys.append(int(item.get('acct')))
engagement_num_unique_students = len(list_of_unique_eng_keys)

submission_num_rows = len(project_submissions)
list_of_unique_keys = []
for item in project_submissions:
    if int(item.get('account_key')) not in list_of_unique_keys:
        list_of_unique_keys.append(int(item.get('account_key')))
submission_num_unique_students = len(list_of_unique_keys)
print enrollment_num_rows
print engagement_num_rows
print submission_num_rows
print enrollment_num_unique_students
print engagement_num_unique_students
print submission_num_unique_students
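# The three blocks above repeat the same membership-test loop; a set makes the unique
# count more direct. A possible helper (a sketch only, not part of the original analysis):
def count_unique_students(data, key='account_key'):
    return len(set(row[key] for row in data))

# e.g. count_unique_students(enrollments), or count_unique_students(daily_engagement, key='acct')
# before the 'acct' column is renamed in the next cell.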
In [8]:
#####################################
# 3 #
#####################################
## Rename the "acct" column in the daily_engagement table to "account_key".
for item in daily_engagement:
    item['account_key'] = item.pop('acct')
In [9]:
#####################################
# 4 #
#####################################
## Find any one student enrollments where the student is missing from the daily engagement table.
## Output that enrollment.
# Here the goal is to get the account keys of enrollments where the student did not cancel
# on the same day they joined, days_to_cancel is not None, and the account key is missing
# from the list of accounts in the engagement table
good_enrollments = []
for item in enrollments:
    if item.get('join_date') != item.get('cancel_date') and item.get('days_to_cancel') is not None:
        good_enrollments.append(item.get('account_key'))

bad_ids = []
for pid in good_enrollments:
    if int(pid) not in list_of_unique_eng_keys:
        bad_ids.append(int(pid))
print len(bad_ids)
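# A variant of the check above that prints the surprising enrollment records themselves
# rather than just counting them (reusing list_of_unique_eng_keys built earlier):
for enrollment in enrollments:
    if (int(enrollment['account_key']) not in list_of_unique_eng_keys
            and enrollment['join_date'] != enrollment['cancel_date']):
        print enrollment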
In [10]:
#####################################
# 5 #
#####################################
## Find the number of surprising data points (enrollments missing from
## the engagement table) that remain, if any.
test_enrolment_ids = []
for item in enrollments:
    if item.get('is_udacity') and int(item.get('account_key')) not in test_enrolment_ids:
        test_enrolment_ids.append(int(item.get('account_key')))

bad_ids = []
num_problem_students = 0
for item in daily_engagement:
    if int(item.get('account_key')) in test_enrolment_ids and int(item.get('account_key')) not in bad_ids:
        bad_ids.append(int(item.get('account_key')))
num_problem_students = len(bad_ids)
print num_problem_students
In [11]:
# Create a set of the account keys for all Udacity test accounts
udacity_test_accounts = set()
for enrollment in enrollments:
    if enrollment['is_udacity']:
        udacity_test_accounts.add(enrollment['account_key'])
len(udacity_test_accounts)
Out[11]:
In [12]:
# Given some data with an account_key field, removes any records corresponding to Udacity test accounts
def remove_udacity_accounts(data):
    non_udacity_data = []
    for data_point in data:
        if data_point['account_key'] not in udacity_test_accounts:
            non_udacity_data.append(data_point)
    return non_udacity_data
In [13]:
# Remove Udacity test accounts from all three tables
non_udacity_enrollments = remove_udacity_accounts(enrollments)
non_udacity_engagement = remove_udacity_accounts(daily_engagement)
non_udacity_submissions = remove_udacity_accounts(project_submissions)
print len(non_udacity_enrollments)
print len(non_udacity_engagement)
print len(non_udacity_submissions)
In [14]:
#####################################
# 6 #
#####################################
## Create a dictionary named paid_students containing all students who either
## haven't canceled yet or who remained enrolled for more than 7 days. The keys
## should be account keys, and the values should be the date the student enrolled.
paid_students = {}
for enrollment in non_udacity_enrollments:
    if not enrollment['is_canceled'] or enrollment['days_to_cancel'] > 7:
        account_key = enrollment['account_key']
        enrollment_date = enrollment['join_date']
        if account_key not in paid_students or enrollment_date > paid_students[account_key]:
            paid_students[account_key] = enrollment_date
len(paid_students)
Out[14]:
In [15]:
# Takes a student's join date and the date of a specific engagement record,
# and returns True if that engagement record happened within one week
# of the student joining.
def within_one_week(join_date, engagement_date):
    time_delta = engagement_date - join_date
    return 0 <= time_delta.days < 7
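# For example, with an assumed join date of 2015-01-01:
# within_one_week(parse_date('2015-01-01'), parse_date('2015-01-07')) -> True   (6 days later)
# within_one_week(parse_date('2015-01-01'), parse_date('2015-01-08')) -> False  (7 days later)
# within_one_week(parse_date('2015-01-01'), parse_date('2014-12-31')) -> False  (before joining)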
In [16]:
# Function to remove trials
def remove_free_trial_cancels(data):
    new_data = []
    for data_point in data:
        if data_point.get('account_key') in paid_students:
            new_data.append(data_point)
    return new_data
In [17]:
paid_enrollments = remove_free_trial_cancels(non_udacity_enrollments)
print len(paid_enrollments)
paid_engagement = remove_free_trial_cancels(non_udacity_engagement)
print len(paid_engagement)
paid_submissions = remove_free_trial_cancels(non_udacity_submissions)
print len(paid_submissions)
In [18]:
#####################################
# 7 #
#####################################
## Create a list of rows from the engagement table including only rows where
## the student is one of the paid students you just found, and the date is within
## one week of the student's join date.
# paid_students maps paid account keys to enrollment dates
# need engagement records where the account key is in paid_students and the
# utc_date falls within one week of that student's join date
paid_engagement_in_first_week = []
for engagement_record in paid_engagement:
    account_key = engagement_record.get('account_key')
    join_date = paid_students[account_key]
    engagement_record_date = engagement_record.get('utc_date')
    if within_one_week(join_date, engagement_record_date):
        paid_engagement_in_first_week.append(engagement_record)
len(paid_engagement_in_first_week)
Out[18]:
In [19]:
from collections import defaultdict
# Create a dictionary of engagement grouped by student.
# The keys are account keys, and the values are lists of engagement records.
engagement_by_account = defaultdict(list)
for engagement_record in paid_engagement_in_first_week:
    account_key = engagement_record['account_key']
    engagement_by_account[account_key].append(engagement_record)
In [20]:
# Create a dictionary with the total minutes each student spent in the classroom during the first week.
# The keys are account keys, and the values are numbers (total minutes)
total_minutes_by_account = {}
for account_key, engagement_for_student in engagement_by_account.items():
    total_minutes = 0
    for engagement_record in engagement_for_student:
        total_minutes += engagement_record['total_minutes_visited']
    total_minutes_by_account[account_key] = total_minutes
In [21]:
import numpy as np
# Summarize the data about minutes spent in the classroom
total_minutes = total_minutes_by_account.values()
print 'Mean:', np.mean(total_minutes)
print 'Standard deviation:', np.std(total_minutes)
print 'Minimum:', np.min(total_minutes)
print 'Maximum:', np.max(total_minutes)
In [22]:
#####################################
# 8 #
#####################################
## Go through a similar process as before to see if there is a problem.
## Locate at least one surprising piece of data, output it, and take a look at it.
max_time_spent = 0
bad_account = 0
for account_key, time_spent in total_minutes_by_account.items():
    if time_spent > max_time_spent:
        max_time_spent = time_spent
        bad_account = account_key
print "Bad Account = " + str(bad_account) + " Time Spent = " + str(max_time_spent)
In [23]:
bad_engagements = []
for engagement in paid_engagement_in_first_week:
    if engagement.get('account_key') == bad_account:
        bad_engagements.append(engagement)
print bad_engagements
In [24]:
#####################################
# 9 #
#####################################
## Adapt the code above to find the mean, standard deviation, minimum, and maximum for
## the number of lessons completed by each student during the first week. Try creating
## one or more functions to re-use the code above.
lessons_completed_by_student = {}
lesson_metrics = []
for account_key, engagement_for_student in engagement_by_account.items():
    lessons_completed = 0
    for record in engagement_for_student:
        lessons_completed += int(record.get('lessons_completed'))
    lesson_metrics.append(lessons_completed)
    lessons_completed_by_student[account_key] = lessons_completed

# Summarize the data about lessons completed
print 'Mean:', np.mean(lesson_metrics)
print 'Standard deviation:', np.std(lesson_metrics)
print 'Minimum:', np.min(lesson_metrics)
print 'Maximum:', np.max(lesson_metrics)
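# Section 9 suggests factoring the repeated summary lines into a function; a possible
# helper (a sketch only; the name describe_data is my own):
def describe_data(data, label=''):
    if label:
        print label
    print 'Mean:', np.mean(data)
    print 'Standard deviation:', np.std(data)
    print 'Minimum:', np.min(data)
    print 'Maximum:', np.max(data)

# e.g. describe_data(lesson_metrics, 'Lessons completed in first week:')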
In [25]:
######################################
# 10 #
######################################
## Find the mean, standard deviation, minimum, and maximum for the number of
## days each student visits the classroom during the first week.
days_visited_by_student = {}
days_visited = []
for account_key, engagement_for_student in engagement_by_account.items():
    days = 0
    for item in engagement_for_student:
        if item['num_courses_visited'] > 0:
            days += 1
    days_visited.append(days)
    days_visited_by_student[account_key] = days

# Summarize the data about days visited
print 'Mean:', np.mean(days_visited)
print 'Standard deviation:', np.std(days_visited)
print 'Minimum:', np.min(days_visited)
print 'Maximum:', np.max(days_visited)
In [63]:
######################################
# 11 #
######################################
## Create two lists of engagement data for paid students in the first week.
## The first list should contain data for students who eventually pass the
## subway project, and the second list should contain data for students
## who do not.
subway_project_lesson_keys = ['746169184', '3176718735']
passing_subway = []
non_passing_subway_temp = []
non_passing_subway = []
lesson_ids = []
for submission in paid_submissions:
    pass_fail = str(submission.get('assigned_rating'))
    lesson_key = str(submission.get('lesson_key'))
    account_key = int(submission.get('account_key'))
    if lesson_key not in lesson_ids:
        lesson_ids.append(lesson_key)
    if lesson_key in subway_project_lesson_keys and \
            (pass_fail == 'PASSED' or pass_fail == 'DISTINCTION') and \
            account_key not in passing_subway:
        passing_subway.append(account_key)
    elif lesson_key in subway_project_lesson_keys and \
            pass_fail != 'PASSED' and pass_fail != 'DISTINCTION' and \
            pass_fail != 'UNGRADED':
        non_passing_subway_temp.append(account_key)

for submission in non_passing_subway_temp:
    if submission not in passing_subway and submission not in non_passing_subway:
        non_passing_subway.append(submission)

print "Students Passed Subway = " + str(len(passing_subway))
print "Students Failed Subway = " + str(len(non_passing_subway))
print "Overlap = " + str(len([i for i in passing_subway if i in non_passing_subway]))
passing_engagement = []
non_passing_engagement = []
for engagement in paid_engagement_in_first_week:
    acc_key = int(engagement.get('account_key'))
    if acc_key in passing_subway:
        passing_engagement.append(engagement)
    else:
        non_passing_engagement.append(engagement)
print len(passing_engagement)
print len(non_passing_engagement)
print len(set(e['account_key'] for e in passing_engagement) &
          set(e['account_key'] for e in non_passing_engagement))
In [104]:
######################################
# 12 #
######################################
## Compute some metrics you're interested in and see how they differ for
## students who pass the subway project vs. students who don't. A good
## starting point would be the metrics we looked at earlier (minutes spent
## in the classroom, lessons completed, and days visited).
minutes_passing = []
minutes_non_passing = []
lesson_count_passing = []
lesson_count_non_passing = []
days_visited_passing = []
days_visited_non_passing = []
for account_key, engagement_for_student in engagement_by_account.items():
    account_key = int(account_key)
    minutes = 0
    lesson_count = 0
    days_count = 0
    if account_key in passing_subway:
        for engagement in engagement_for_student:
            minutes += float(engagement.get('total_minutes_visited'))
            lesson_count += int(engagement.get('lessons_completed'))
            if float(engagement.get('total_minutes_visited')) > 0:
                days_count += 1
        minutes_passing.append(minutes)
        lesson_count_passing.append(lesson_count)
        days_visited_passing.append(days_count)
    else:
        for engagement in engagement_for_student:
            minutes += float(engagement.get('total_minutes_visited'))
            lesson_count += int(engagement.get('lessons_completed'))
            if float(engagement.get('total_minutes_visited')) > 0:
                days_count += 1
        minutes_non_passing.append(minutes)
        lesson_count_non_passing.append(lesson_count)
        days_visited_non_passing.append(days_count)

print 'Minutes Spent in Classroom:'
print ''
print '------------------------------------------------'
print 'Passing Metrics:'
print 'Mean Passing:', np.mean(minutes_passing)
print 'Standard deviation:', np.std(minutes_passing)
print 'Minimum:', np.min(minutes_passing)
print 'Maximum:', np.max(minutes_passing)
print '------------------------------------------------'
print 'Non-Passing Metrics:'
print 'Mean Non-Passing:', np.mean(minutes_non_passing)
print 'Standard deviation:', np.std(minutes_non_passing)
print 'Minimum:', np.min(minutes_non_passing)
print 'Maximum:', np.max(minutes_non_passing)
print ''
print ''
print 'Student Lesson Count:'
print ''
print '------------------------------------------------'
print 'Passing Metrics:'
print 'Mean Passing:', np.mean(lesson_count_passing)
print 'Standard deviation:', np.std(lesson_count_passing)
print 'Minimum:', np.min(lesson_count_passing)
print 'Maximum:', np.max(lesson_count_passing)
print '------------------------------------------------'
print 'Non-Passing Metrics:'
print 'Mean Non-Passing:', np.mean(lesson_count_non_passing)
print 'Standard deviation:', np.std(lesson_count_non_passing)
print 'Minimum:', np.min(lesson_count_non_passing)
print 'Maximum:', np.max(lesson_count_non_passing)
print ''
print ''
print 'Student Days Visited:'
print ''
print '------------------------------------------------'
print 'Passing Metrics:'
print 'Mean Passing:', np.mean(days_visited_passing)
print 'Standard deviation:', np.std(days_visited_passing)
print 'Minimum:', np.min(days_visited_passing)
print 'Maximum:', np.max(days_visited_passing)
print '------------------------------------------------'
print 'Non-Passing Metrics:'
print 'Mean Non-Passing:', np.mean(days_visited_non_passing)
print 'Standard deviation:', np.std(days_visited_non_passing)
print 'Minimum:', np.min(days_visited_non_passing)
print 'Maximum:', np.max(days_visited_non_passing)
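# The long block of prints above could be collapsed with the describe_data helper
# sketched earlier, e.g.:
# describe_data(minutes_passing, 'Minutes Spent in Classroom (passing):')
# describe_data(minutes_non_passing, 'Minutes Spent in Classroom (non-passing):')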
In [206]:
######################################
# 13 #
######################################
## Make histograms of the three metrics we looked at earlier for both
## students who passed the subway project and students who didn't. You
## might also want to make histograms of any other metrics you examined.
%matplotlib inline
# Above is for jupyter to render in the notebook
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(20, 20))
fig.subplots_adjust(hspace=0.5, wspace=0.15)
ax1 = fig.add_subplot(3, 2, 1)
ax1.set_xlabel('Minutes Spent')
ax1.set_ylabel('Student Count')
ax1.set_title('Minutes Spent-Passing Students')
ax2 = fig.add_subplot(3, 2, 2)
ax2.set_xlabel('Minutes Spent')
ax2.set_ylabel('Student Count')
ax2.set_title('Minutes Spent-Failing Students')
ax3 = fig.add_subplot(3, 2, 3)
ax3.set_xlabel('Lessons Completed')
ax3.set_ylabel('Students')
ax3.set_title('Lessons Completed-Passing Students')
ax4 = fig.add_subplot(3, 2, 4)
ax4.set_xlabel('Lessons Completed')
ax4.set_ylabel('Students')
ax4.set_title('Lessons Completed-Failing Students')
ax5 = fig.add_subplot(3, 2, 5)
ax5.set_xlabel('Days Visited')
ax5.set_ylabel('Students')
ax5.set_title('Days Spent-Passing Students')
ax6 = fig.add_subplot(3, 2, 6)
ax6.set_xlabel('Days Visited')
ax6.set_ylabel('Students')
ax6.set_title('Days Spent-Failing Students')
ax1.hist(minutes_passing, bins=30)
ax2.hist(minutes_non_passing, bins=30)
ax3.hist(lesson_count_passing)
ax4.hist(lesson_count_non_passing)
ax5.hist(days_visited_passing)
ax6.hist(days_visited_non_passing)
In [228]:
## Comparison for description on git
%matplotlib inline
# Above is for jupyter to render in the notebook
import matplotlib.pyplot as plt
plt.hist(minutes_passing, bins=30, color=['lightgreen'], label=['Passing Students'])
plt.hist(minutes_non_passing, bins=30, color=['pink'], label=['Failing Students'])
plt.xlabel('Time Spent (Minutes)')
plt.ylabel('# of Students')
plt.title('Student Success - Time Investment', fontsize=16)
plt.grid(True)
plt.legend()
Out[228]:
In [229]:
## Comparison for description on git
%matplotlib inline
# Above is for jupyter to render in the notebook
import matplotlib.pyplot as plt
plt.hist(days_visited_passing, color=['lightgreen'], label=['Passing Students'])
plt.hist(days_visited_non_passing, color=['pink'], label=['Failing Students'])
plt.xlabel('Course Usage - Logins by Day')
plt.ylabel('# of Students')
plt.title('Student Success - Course Usage', fontsize=16)
plt.grid(True)
plt.legend()
Out[229]:
In [282]:
######################################
# 14 #
######################################
## Make a more polished version of at least one of your visualizations
## from earlier. Try importing the seaborn library to make the visualization
## look better, adding axis labels and a title, and changing one or more
## arguments to the hist() function.
import seaborn as sns
sns.distplot(minutes_passing, kde=False, bins=10, label="Passing Students");
sns.distplot(minutes_non_passing, kde=False, bins=10, color='green', label="Failing Students");
plt.legend()
plt.xlabel('Time Spent (Minutes)')
plt.ylabel('# of Students')
plt.title('Student Success - Time Investment', fontsize=16)
Out[282]:
In [283]:
sns.distplot(days_visited_passing, kde=False, bins=10, label="Passing Students");
sns.distplot(days_visited_non_passing, kde=False, bins=10, color='green', label="Failing Students");
plt.legend()
plt.xlabel('Course Usage - Logins by Day')
plt.ylabel('# of Students')
plt.title('Student Success - Course Usage', fontsize=16)
Out[283]:
In [285]:
# Setup pandas to read in the full engagement table
import pandas as pd
daily_engagement = pd.read_csv('daily_engagement_full.csv')
len(daily_engagement.get('acct').unique())
Out[285]:
In [286]:
import numpy as np
# First 20 countries with employment data
countries = np.array([
'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
'Belize', 'Benin', 'Bhutan', 'Bolivia',
'Bosnia and Herzegovina'
])
# Employment data in 2007 for those 20 countries
employment = np.array([
55.70000076, 51.40000153, 50.5 , 75.69999695,
58.40000153, 40.09999847, 61.5 , 57.09999847,
60.90000153, 66.59999847, 60.40000153, 68.09999847,
66.90000153, 53.40000153, 48.59999847, 56.79999924,
71.59999847, 58.40000153, 70.40000153, 41.20000076
])
In [329]:
# Change False to True for each block of code to see what it does
# Accessing elements
if True:
print countries[0]
print countries[3]
# Slicing
if False:
print countries[0:3]
print countries[:3]
print countries[17:]
print countries[:]
# Element types
if False:
print countries.dtype
print employment.dtype
print np.array([0, 1, 2, 3]).dtype
print np.array([1.0, 1.5, 2.0, 2.5]).dtype
print np.array([True, False, True]).dtype
print np.array(['AL', 'AK', 'AZ', 'AR', 'CA']).dtype
# Looping
if False:
for country in countries:
print 'Examining country {}'.format(country)
for i in range(len(countries)):
country = countries[i]
country_employment = employment[i]
print 'Country {} has employment {}'.format(country,
country_employment)
# Numpy functions
if False:
print employment.mean()
print employment.std()
print employment.max()
print employment.sum()
In [340]:
def max_employment(countries, employment):
    '''
    Fill in this function to return the name of the country
    with the highest employment in the given employment
    data, and the employment in that country.
    '''
    max_country = countries[np.where(employment == employment.max())[0][0]]
    max_value = employment.max()
    return (max_country, max_value)
# Alternate Instructor solution
# return (countries[employment.argmax()], employment.max())
max_employment(countries, employment)
Out[340]:
In [342]:
import numpy as np
# Change False to True for each block of code to see what it does
# Arithmetic operations between 2 NumPy arrays
if False:
a = np.array([1, 2, 3, 4])
b = np.array([1, 2, 1, 2])
print a + b
print a - b
print a * b
print a / b
print a ** b
# Arithmetic operations between a NumPy array and a single number
if True:
a = np.array([1, 2, 3, 4])
b = 2
print a + b
print a - b
print a * b
print a / b
print a ** b
# Logical operations with NumPy arrays
if False:
a = np.array([True, True, False, False])
b = np.array([True, False, True, False])
print a & b
print a | b
print ~a
print a & True
print a & False
print a | True
print a | False
# Comparison operations between 2 NumPy Arrays
if False:
a = np.array([1, 2, 3, 4, 5])
b = np.array([5, 4, 3, 2, 1])
print a > b
print a >= b
print a < b
print a <= b
print a == b
print a != b
# Comparison operations between a NumPy array and a single number
if False:
a = np.array([1, 2, 3, 4])
b = 2
print a > b
print a >= b
print a < b
print a <= b
print a == b
print a != b
In [362]:
# First 20 countries with school completion data
countries = np.array([
'Algeria', 'Argentina', 'Armenia', 'Aruba', 'Austria','Azerbaijan',
'Bahamas', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Bolivia',
'Botswana', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi',
'Cambodia', 'Cameroon', 'Cape Verde'
])
# Female school completion rate in 2007 for those 20 countries
female_completion = np.array([
97.35583, 104.62379, 103.02998, 95.14321, 103.69019,
98.49185, 100.88828, 95.43974, 92.11484, 91.54804,
95.98029, 98.22902, 96.12179, 119.28105, 97.84627,
29.07386, 38.41644, 90.70509, 51.7478 , 95.45072
])
# Male school completion rate in 2007 for those 20 countries
male_completion = np.array([
95.47622, 100.66476, 99.7926 , 91.48936, 103.22096,
97.80458, 103.81398, 88.11736, 93.55611, 87.76347,
102.45714, 98.73953, 92.22388, 115.3892 , 98.70502,
37.00692, 45.39401, 91.22084, 62.42028, 90.66958
])
def overall_completion_rate(female_completion, male_completion):
'''
Fill in this function to return a NumPy array containing the overall
school completion rate for each country. The arguments are NumPy
arrays giving the female and male completion of each country in
the same order.
'''
return (female_completion + male_completion)/2
overall_completion_rate(female_completion, male_completion)
Out[362]:
In [365]:
import numpy as np
# First 20 countries with employment data
countries = np.array([
'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
'Belize', 'Benin', 'Bhutan', 'Bolivia',
'Bosnia and Herzegovina'
])
# Employment data in 2007 for those 20 countries
employment = np.array([
55.70000076, 51.40000153, 50.5 , 75.69999695,
58.40000153, 40.09999847, 61.5 , 57.09999847,
60.90000153, 66.59999847, 60.40000153, 68.09999847,
66.90000153, 53.40000153, 48.59999847, 56.79999924,
71.59999847, 58.40000153, 70.40000153, 41.20000076
])
# Change this country name to change what country will be printed when you
# click "Test Run". Your function will be called to determine the standardized
# score for this country for each of the given 5 Gapminder variables in 2007.
# The possible country names are available in the Downloadables section.
country_name = 'United States'
def standardize_data(values):
'''
Fill in this function to return a standardized version of the given values,
which will be in a NumPy array. Each value should be translated into the
number of standard deviations that value is away from the mean of the data.
(A positive number indicates a value higher than the mean, and a negative
number indicates a value lower than the mean.)
The formula is effectively the difference between each value and the mean, divided
by the standard deviation.
'''
return (values-np.mean(values))/np.std(values)
standardize_data(employment)
Out[365]:
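# A quick worked example: for np.array([1., 2., 3.]) the mean is 2 and the population
# standard deviation is sqrt(2/3) ~= 0.816, so
# standardize_data(np.array([1., 2., 3.])) -> array([-1.2247..., 0., 1.2247...])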
In [367]:
import numpy as np
# Change False to True for each block of code to see what it does
# Using index arrays
if False:
a = np.array([1, 2, 3, 4])
b = np.array([True, True, False, False])
print a[b]
print a[np.array([True, False, True, False])]
# Creating the index array using vectorized operations
if False:
a = np.array([1, 2, 3, 2, 1])
b = (a >= 2)
print a[b]
print a[a >= 2]
# Creating the index array using vectorized operations on another array
if False:
a = np.array([1, 2, 3, 4, 5])
b = np.array([1, 2, 3, 2, 1])
print b == 2
print a[b == 2]
def mean_time_for_paid_students(time_spent, days_to_cancel):
'''
Fill in this function to calculate the mean time spent in the classroom
for students who stayed enrolled at least (greater than or equal to) 7 days.
Unlike in Lesson 1, you can assume that days_to_cancel will contain only
integers (there are no students who have not canceled yet).
The arguments are NumPy arrays. time_spent contains the amount of time spent
in the classroom for each student, and days_to_cancel contains the number
of days until each student canceled. The data is given in the same order
in both arrays.
'''
return np.mean(time_spent[days_to_cancel>=7])
# Time spent in the classroom in the first week for 20 students
time_spent = np.array([
12.89697233, 0. , 64.55043217, 0. ,
24.2315615 , 39.991625 , 0. , 0. ,
147.20683783, 0. , 0. , 0. ,
45.18261617, 157.60454283, 133.2434615 , 52.85000767,
0. , 54.9204785 , 26.78142417, 0.
])
# Days to cancel for 20 students
days_to_cancel = np.array([
4, 5, 37, 3, 12, 4, 35, 38, 5, 37, 3, 3, 68,
38, 98, 2, 249, 2, 127, 35
])
mean_time_for_paid_students(time_spent, days_to_cancel)
Out[367]:
In [468]:
import pandas as pd
countries = ['Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda',
'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia']
life_expectancy_values = [74.7, 75. , 83.4, 57.6, 74.6, 75.4, 72.3, 81.5, 80.2,
70.3, 72.1, 76.4, 68.1, 75.2, 69.8, 79.4, 70.8, 62.7,
67.3, 70.6]
gdp_values = [ 1681.61390973, 2155.48523109, 21495.80508273, 562.98768478,
13495.1274663 , 9388.68852258, 1424.19056199, 24765.54890176,
27036.48733192, 1945.63754911, 21721.61840978, 13373.21993972,
483.97086804, 9783.98417323, 2253.46411147, 25034.66692293,
3680.91642923, 366.04496652, 1175.92638695, 1132.21387981]
# Life expectancy and gdp data in 2007 for 20 countries
life_expectancy = pd.Series(life_expectancy_values)
gdp = pd.Series(gdp_values)
# Change False to True for each block of code to see what it does
# Accessing elements and slicing
if False:
print life_expectancy[0]
print gdp[3:6]
# Looping
if False:
for country_life_expectancy in life_expectancy:
print 'Examining life expectancy {}'.format(country_life_expectancy)
# Pandas functions
if False:
print life_expectancy.mean()
print life_expectancy.std()
print gdp.max()
print gdp.sum()
# Vectorized operations and index arrays
if False:
a = pd.Series([1, 2, 3, 4])
b = pd.Series([1, 2, 1, 2])
print a + b
print a * 2
print a >= 3
print a[a >= 3]
def variable_correlation(variable1, variable2):
both_up = (variable1 > variable1.mean()) & (variable2 > variable2.mean())
up = len(both_up[both_up==True])
both_down = (variable1 < variable1.mean()) & (variable2 < variable2.mean())
down = len(both_down[both_down==True])
num_same_direction = up + down
num_different_direction = len(variable1) - num_same_direction
return (num_same_direction, num_different_direction)
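# Note: a boolean Series can also be summed directly (True counts as 1), so
# up = both_up.sum() and down = both_down.sum() would give the same counts as the
# len(...) expressions above.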
variable_correlation(life_expectancy, gdp)
Out[468]:
In [472]:
import pandas as pd
countries = [
'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
'Belize', 'Benin', 'Bhutan', 'Bolivia',
'Bosnia and Herzegovina'
]
employment_values = [
55.70000076, 51.40000153, 50.5 , 75.69999695,
58.40000153, 40.09999847, 61.5 , 57.09999847,
60.90000153, 66.59999847, 60.40000153, 68.09999847,
66.90000153, 53.40000153, 48.59999847, 56.79999924,
71.59999847, 58.40000153, 70.40000153, 41.20000076
]
# Employment data in 2007 for 20 countries
employment = pd.Series(employment_values, index=countries)
def max_employment(employment):
'''
Fill in this function to return the name of the country
with the highest employment in the given employment
data, and the employment in that country.
The input will be a Pandas series where the values
are employment and the index is country names.
Try using the Pandas argmax() function. Documentation is
here: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.argmax.html
'''
max_country = employment.argmax()
max_value = employment.max()
return (max_country, max_value)
max_employment(employment)
Out[472]:
In [479]:
import pandas as pd
# Change False to True for each block of code to see what it does
# Addition when indexes are the same
if False:
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
print s1 + s2
# Indexes have same elements in a different order
if False:
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['b', 'd', 'a', 'c'])
print s1 + s2
# Indexes overlap, but do not have exactly the same elements
if True:
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])
print s1 + s2
# Indexes do not overlap
if False:
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['e', 'f', 'g', 'h'])
print s1 + s2
In [519]:
import pandas as pd
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])
# Try to write code that will add the 2 previous series together,
# but treating missing values from either series as 0. The result
# when printed out should be similar to the following line:
# print pd.Series([1, 2, 13, 24, 30, 40], index=['a', 'b', 'c', 'd', 'e', 'f'])
print s2.add(s1, fill_value=0)
In [555]:
import pandas as pd
# Change False to True to see what the following block of code does
# Example pandas apply() usage (although this could have been done
# without apply() using vectorized operations)
if False:
s = pd.Series([1, 2, 3, 4, 5])
def add_one(x):
return x + 1
print s.apply(add_one)
names = pd.Series([
'Andre Agassi',
'Barry Bonds',
'Christopher Columbus',
'Daniel Defoe',
'Emilio Estevez',
'Fred Flintstone',
'Greta Garbo',
'Humbert Humbert',
'Ivan Ilych',
'James Joyce',
'Keira Knightley',
'Lois Lane',
'Mike Myers',
'Nick Nolte',
'Ozzy Osbourne',
'Pablo Picasso',
'Quirinus Quirrell',
'Rachael Ray',
'Susan Sarandon',
'Tina Turner',
'Ugueth Urbina',
'Vince Vaughn',
'Woodrow Wilson',
'Yoji Yamada',
'Zinedine Zidane'
])
def reverse_names(name):
    '''
    Fill in this function to return a new series where each name
    in the input series has been transformed from the format
    "Firstname Lastname" to "Lastname, Firstname".
    Try to use the Pandas apply() function rather than a loop.
    (apply() calls this function once per element, so the argument
    here is a single name string.)
    '''
    temp = name.split(' ')
    return temp[1] + ', ' + temp[0]
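# e.g. reverse_names('Andre Agassi') -> 'Agassi, Andre'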
names.apply(reverse_names)
Out[555]:
In [734]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt  # needed for the figure/axes created below
# The following code reads all the Gapminder data into Pandas DataFrames. You'll
# learn about DataFrames next lesson.
path = '/Users/Vinay/Documents/Code/Udacity/numpy_pandas_learning/'
employment = pd.read_csv(path + 'employment_above_15.csv', index_col='Country')
female_completion = pd.read_csv(path + 'female_completion_rate.csv', index_col='Country')
male_completion = pd.read_csv(path + 'male_completion_rate.csv', index_col='Country')
life_expectancy = pd.read_csv(path + 'life_expectancy.csv', index_col='Country')
gdp = pd.read_csv(path + 'gdp_per_capita.csv', index_col='Country')
# The following code creates a Pandas Series for each variable for the United States.
# You can change the string 'United States' to a country of your choice.
employment_us = employment.loc['United States']
female_completion_us = female_completion.loc['United States']
male_completion_us = male_completion.loc['United States']
life_expectancy_us = life_expectancy.loc['United States']
gdp_us = gdp.loc['United States']
# Use the Series defined above to create a plot of each variable over time for
# the country of your choice. You will only be able to display one plot at a time
# with each "Test Run".
year_list = employment_us.index.values
life = life_expectancy_us.get(year_list)
gdp = gdp_us.get(year_list)
m_ed = male_completion_us.get(year_list)
f_ed = female_completion_us.get(year_list)
ed = (f_ed.add(m_ed, fill_value=0).fillna(0))/2
# Here, we're only plotting Employment vs Life Expectancy as the completion stat seems
# > 100% and the gdp chart didn't add much to the story (other than being correlated
# to Life Expectancy)
fig = plt.figure() # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.
employment_us.plot(ax=ax, label='Employment - USA')
life.plot(ax=ax2, color='green', label='Life Expectancy - USA')
ax.set_xlabel('Time - (Years)')
ax.set_ylabel('Employment (%)')
ax2.set_ylabel('Life Expectancy (Years)')
ax.legend(loc='upper left')
ax2.legend(loc='upper right')
ax.set_title('Demographic Statistics - USA', fontsize=16)
plt.show()
In [735]:
import pandas as pd
import seaborn as sns
# The following code reads all the Gapminder data into Pandas DataFrames. You'll
# learn about DataFrames next lesson.
path = '/Users/Vinay/Documents/Code/Udacity/numpy_pandas_learning/'
employment = pd.read_csv(path + 'employment_above_15.csv', index_col='Country')
female_completion = pd.read_csv(path + 'female_completion_rate.csv', index_col='Country')
male_completion = pd.read_csv(path + 'male_completion_rate.csv', index_col='Country')
life_expectancy = pd.read_csv(path + 'life_expectancy.csv', index_col='Country')
gdp = pd.read_csv(path + 'gdp_per_capita.csv', index_col='Country')
# The following code creates a Pandas Series for each variable for Spain
# (same template as the United States cell above; the _us variable names are kept from that template).
employment_us = employment.loc['Spain']
female_completion_us = female_completion.loc['Spain']
male_completion_us = male_completion.loc['Spain']
life_expectancy_us = life_expectancy.loc['Spain']
gdp_us = gdp.loc['Spain']
# Use the Series defined above to create a plot of each variable over time for
# the country of your choice. You will only be able to display one plot at a time
# with each "Test Run".
year_list = employment_us.index.values
life = life_expectancy_us.get(year_list)
gdp = gdp_us.get(year_list)
m_ed = male_completion_us.get(year_list)
f_ed = female_completion_us.get(year_list)
ed = (f_ed.add(m_ed, fill_value=0).fillna(0))/2
# Here, we're only plotting Employment vs Life Expectancy as the completion stat seems
# > 100% and the gdp chart didn't add much to the story (other than being correlated
# to Life Expectancy)
fig = plt.figure() # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.
employment_us.plot(ax=ax, label='Employment - Spain')
life.plot(ax=ax2, color='green', label='Life Expectancy - Spain')
ax.set_xlabel('Time - (Years)')
ax.set_ylabel('Employment (%)')
ax2.set_ylabel('Life Expectancy (Years)')
ax.legend(loc='upper left')
ax2.legend(loc='upper right')
ax.set_title('Demographic Statistics - Spain', fontsize=16)
plt.show()
In [736]:
import pandas as pd
import seaborn as sns
# The following code reads all the Gapminder data into Pandas DataFrames. You'll
# learn about DataFrames next lesson.
path = '/Users/Vinay/Documents/Code/Udacity/numpy_pandas_learning/'
employment = pd.read_csv(path + 'employment_above_15.csv', index_col='Country')
female_completion = pd.read_csv(path + 'female_completion_rate.csv', index_col='Country')
male_completion = pd.read_csv(path + 'male_completion_rate.csv', index_col='Country')
life_expectancy = pd.read_csv(path + 'life_expectancy.csv', index_col='Country')
gdp = pd.read_csv(path + 'gdp_per_capita.csv', index_col='Country')
# The following code creates a Pandas Series for each variable for Pakistan
# (same template as the United States cell above; the _us variable names are kept from that template).
employment_us = employment.loc['Pakistan']
female_completion_us = female_completion.loc['Pakistan']
male_completion_us = male_completion.loc['Pakistan']
life_expectancy_us = life_expectancy.loc['Pakistan']
gdp_us = gdp.loc['Pakistan']
# Use the Series defined above to create a plot of each variable over time for
# the country of your choice. You will only be able to display one plot at a time
# with each "Test Run".
year_list = employment_us.index.values
life = life_expectancy_us.get(year_list)
gdp = gdp_us.get(year_list)
m_ed = male_completion_us.get(year_list)
f_ed = female_completion_us.get(year_list)
ed = (f_ed.add(m_ed, fill_value=0).fillna(0))/2
# Here, we're only plotting Employment vs Life Expectancy as the completion stat seems
# > 100% and the gdp chart didn't add much to the story (other than being correlated
# to Life Expectancy)
fig = plt.figure() # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.
employment_us.plot(ax=ax, label='Employment - Pakistan')
life.plot(ax=ax2, color='green', label='Life Expectancy - Pakistan')
ax.set_xlabel('Time - (Years)')
ax.set_ylabel('Employment (%)')
ax2.set_ylabel('Life Expectancy (Years)')
ax.legend(loc='upper left')
ax2.legend(loc='upper right')
ax.set_title('Demographic Statistics - Pakistan', fontsize=16)
plt.show()
In [738]:
import pandas as pd
import seaborn as sns
# The following code reads all the Gapminder data into Pandas DataFrames. You'll
# learn about DataFrames next lesson.
path = '/Users/Vinay/Documents/Code/Udacity/numpy_pandas_learning/'
employment = pd.read_csv(path + 'employment_above_15.csv', index_col='Country')
female_completion = pd.read_csv(path + 'female_completion_rate.csv', index_col='Country')
male_completion = pd.read_csv(path + 'male_completion_rate.csv', index_col='Country')
life_expectancy = pd.read_csv(path + 'life_expectancy.csv', index_col='Country')
gdp = pd.read_csv(path + 'gdp_per_capita.csv', index_col='Country')
# The following code creates a Pandas Series for each variable for India
# (same template as the United States cell above; the _us variable names are kept from that template).
employment_us = employment.loc['India']
female_completion_us = female_completion.loc['India']
male_completion_us = male_completion.loc['India']
life_expectancy_us = life_expectancy.loc['India']
gdp_us = gdp.loc['India']
# Use the Series defined above to create a plot of each variable over time for
# the country of your choice. You will only be able to display one plot at a time
# with each "Test Run".
year_list = employment_us.index.values
life = life_expectancy_us.get(year_list)
gdp = gdp_us.get(year_list)
m_ed = male_completion_us.get(year_list)
f_ed = female_completion_us.get(year_list)
ed = (f_ed.add(m_ed, fill_value=0).fillna(0))/2
# Here, we're only plotting Employment vs Life Expectancy as the completion stat seems
# > 100% and the gdp chart didn't add much to the story (other than being correlated
# to Life Expectancy)
fig = plt.figure() # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.
employment_us.plot(ax=ax, label='Employment - India')
life.plot(ax=ax2, color='green', label='Life Expectancy - India')
ax.set_xlabel('Time - (Years)')
ax.set_ylabel('Employment (%)')
ax2.set_ylabel('Life Expectancy (Years)')
ax.legend(loc='upper left')
ax2.legend(loc='upper right')
ax.set_title('Demographic Statistics - India', fontsize=16)
plt.show()
In [784]:
import numpy as np
# Subway ridership for 5 stations on 10 different days
ridership2 = np.array([
[ 0, 0, 2, 5, 0],
[1478, 3877, 3674, 2328, 2539],
[1613, 4088, 3991, 6461, 2691],
[1560, 3392, 3826, 4787, 2613],
[1608, 4802, 3932, 4477, 2705],
[1576, 3933, 3909, 4979, 2685],
[ 95, 229, 255, 496, 201],
[ 2, 0, 1, 27, 0],
[1438, 3785, 3589, 4174, 2215],
[1342, 4043, 4009, 4665, 3033]
])
ridership1 = np.array([
[ 5, 10, 15, 20, 25],
[ 5, 10, 15, 20, 25],
[ 5, 10, 15, 20, 25]
])
ridership = np.array([
[ 0, 0, 2, 5, 0],
[1478, 3877, 3674, 2328, 2539],
[1613, 4088, 3991, 6461, 2691],
[1560, 3392, 3826, 4787, 2613],
[1608, 4802, 3932, 4477, 2705],
[1576, 3933, 3909, 4979, 2685],
[ 95, 229, 255, 496, 201],
[ 2, 0, 1, 27, 0],
[1438, 3785, 3589, 4174, 2215],
[1342, 4043, 4009, 4665, 3033]
])
# Change False to True for each block of code to see what it does
# Accessing elements
if False:
print ridership[1, 3]
print ridership[1:3, 3:5]
print ridership[1, :]
# Vectorized operations on rows or columns
if False:
print ridership[0, :] + ridership[1, :]
print ridership[:, 0] + ridership[:, 1]
# Vectorized operations on entire arrays
if False:
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
print a + b
def mean_riders_for_max_station(ridership):
    '''
    Fill in this function to find the station with the maximum riders on the
    first day, then return the mean riders per day for that station. Also
    return the mean ridership overall for comparison.
    Hint: NumPy's argmax() function might be useful:
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html
    '''
    max_station = ridership[0].argmax()
    overall_mean = np.mean(ridership)
    mean_for_max = np.mean(ridership[:, max_station])
    return (overall_mean, mean_for_max)
mean_riders_for_max_station(ridership)
Out[784]:
In [788]:
import numpy as np
# Change False to True for this block of code to see what it does
# NumPy axis argument
if False:
a = np.array([
[1, 2, 3],
[4, 5, 6],
[7, 8, 9]
])
print a.sum()
print a.sum(axis=0)
print a.sum(axis=1)
# Subway ridership for 5 stations on 10 different days
ridership = np.array([
[ 0, 0, 2, 5, 0],
[1478, 3877, 3674, 2328, 2539],
[1613, 4088, 3991, 6461, 2691],
[1560, 3392, 3826, 4787, 2613],
[1608, 4802, 3932, 4477, 2705],
[1576, 3933, 3909, 4979, 2685],
[ 95, 229, 255, 496, 201],
[ 2, 0, 1, 27, 0],
[1438, 3785, 3589, 4174, 2215],
[1342, 4043, 4009, 4665, 3033]
])
def min_and_max_riders_per_day(ridership):
'''
Fill in this function. First, for each subway station, calculate the
mean ridership per day. Then, out of all the subway stations, return the
maximum and minimum of these values. That is, find the maximum
mean-ridership-per-day and the minimum mean-ridership-per-day for any
subway station.
'''
max_daily_ridership = np.max(np.mean(ridership, axis=0))
min_daily_ridership = np.min(np.mean(ridership, axis=0))
return (max_daily_ridership, min_daily_ridership)
min_and_max_riders_per_day(ridership)
Out[788]:
In [875]:
import pandas as pd
# Subway ridership for 5 stations on 10 different days
ridership_df = pd.DataFrame(
data=[[ 0, 0, 2, 5, 0],
[1478, 3877, 3674, 2328, 2539],
[1613, 4088, 3991, 6461, 2691],
[1560, 3392, 3826, 4787, 2613],
[1608, 4802, 3932, 4477, 2705],
[1576, 3933, 3909, 4979, 2685],
[ 95, 229, 255, 496, 201],
[ 2, 0, 1, 27, 0],
[1438, 3785, 3589, 4174, 2215],
[1342, 4043, 4009, 4665, 3033]],
index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
'05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
columns=['R003', 'R004', 'R005', 'R006', 'R007']
)
# Change False to True for each block of code to see what it does
# DataFrame creation
if False:
# You can create a DataFrame out of a dictionary mapping column names to values
df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print df_1
# You can also use a list of lists or a 2D NumPy array
df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
print df_2
# Accessing elements
if False:
print ridership_df.iloc[0]
print ridership_df.loc['05-05-11']
print ridership_df['R003']
print ridership_df.iloc[1, 3]
# Accessing multiple rows
if False:
print ridership_df.iloc[1:4]
# Accessing multiple columns
if False:
print ridership_df[['R003', 'R005']]
# Pandas axis
if False:
df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print df.sum()
print df.sum(axis=1)
print df.values.sum()
def mean_riders_for_max_station(ridership):
    '''
    Fill in this function to find the station with the maximum riders on the
    first day, then return the mean riders per day for that station. Also
    return the mean ridership overall for comparison.
    This is the same as a previous exercise, but this time the
    input is a Pandas DataFrame rather than a 2D NumPy array.
    '''
    overall_mean = ridership.values.mean()
    mean_for_max = ridership[ridership.iloc[0].idxmax()].mean()
    return (overall_mean, mean_for_max)
mean_riders_for_max_station(ridership_df)
Out[875]:
In [890]:
import pandas as pd
filename = '/Users/Vinay/Documents/Code/Udacity/numpy_pandas_learning/nyc_subway_weather.csv'
subway_df = pd.read_csv(filename)
def correlation(x, y):
'''
Fill in this function to compute the correlation between the two
input variables. Each input is either a NumPy array or a Pandas
Series.
correlation = average of (x in standard units) times (y in standard units)
Remember to pass the argument "ddof=0" to the Pandas std() function!
'''
val_x = (x - x.mean())/x.std(ddof=0)
val_y = (y - y.mean())/y.std(ddof=0)
prod = val_x * val_y
return prod.mean()
entries = subway_df['ENTRIESn_hourly']
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']
print correlation(entries, rain)
print correlation(entries, temp)
print correlation(rain, temp)
print correlation(entries, cum_entries)
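# These values can be cross-checked against pandas' built-in Pearson correlation, which
# should match since the ddof choice cancels out in the ratio:
# print entries.corr(rain)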
In [894]:
import pandas as pd
# Examples of vectorized operations on DataFrames:
# Change False to True for each block of code to see what it does
# Adding DataFrames with the column names
if False:
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]})
print df1 + df2
In [896]:
# Adding DataFrames with overlapping column names
if True:
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'d': [10, 20, 30], 'c': [40, 50, 60], 'b': [70, 80, 90]})
print df1 + df2
In [897]:
# Adding DataFrames with overlapping row indexes
if True:
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
index=['row1', 'row2', 'row3'])
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]},
index=['row4', 'row3', 'row2'])
print df1 + df2
In [1283]:
# --- Quiz ---
# Cumulative entries and exits for one station for a few hours.
# Dataset 1
entries_and_exits = pd.DataFrame({
'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
3144808, 3144895, 3144905, 3144941, 3145094],
'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
1088317, 1088328, 1088331, 1088420, 1088753]
})
# Dataset 2
# entries_and_exits = pd.DataFrame(
# {'ENTRIESn': [0.0, 30.0, 20.0, 5.0, 20.0],
# 'EXITSn': [0.0, 10.0, 10.0, 40.0, 0.0]},
# index=[0, 1, 2, 3, 4]
# )
def get_hourly_entries_and_exits(entries_and_exits):
'''
Fill in this function to take a DataFrame with cumulative entries
and exits (entries in the first column, exits in the second) and
return a DataFrame with hourly entries and exits (entries in the
first column, exits in the second).
'''
# Instructor's solution
# return (entries_and_exits - entries_and_exits.shift(1)).fillna(0)
arr = entries_and_exits.copy()
arr['EntryDelta'] = arr['ENTRIESn'].shift(1)
arr['ExitDelta'] = arr['EXITSn'].shift(1)
arr['ENTRIES1n'] = arr['ENTRIESn'] - arr['EntryDelta']
arr['EXITS1n'] = arr['EXITSn'] - arr['ExitDelta']
arr = arr.fillna(0)
entries_and_exits['ENTRIESn'] = arr['ENTRIES1n']
entries_and_exits['EXITSn'] = arr['EXITS1n']
return entries_and_exits
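# pandas also has a built-in diff() that does the same shift-and-subtract in one step:
# return entries_and_exits.diff().fillna(0)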
get_hourly_entries_and_exits(entries_and_exits)
Out[1283]:
In [974]:
import pandas as pd
# Change False to True for this block of code to see what it does
# DataFrame applymap()
if False:
df = pd.DataFrame({
'a': [1, 2, 3],
'b': [10, 20, 30],
'c': [5, 10, 15]
})
def add_one(x):
return x + 1
print df.applymap(add_one)
grades_df = pd.DataFrame(
data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio',
'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)
def convert_grades(grades):
'''
Fill in this function to convert the given DataFrame of numerical
grades to letter grades. Return a new DataFrame with the converted
grade.
The conversion rule is:
90-100 -> A
80-89 -> B
70-79 -> C
60-69 -> D
0-59 -> F
'''
grades_ret = ''
if grades >= 90:
grades_ret = 'A'
elif grades >= 80:
grades_ret = 'B'
elif grades >= 70:
grades_ret = 'C'
elif grades >= 60:
grades_ret = 'D'
else:
grades_ret = 'F'
return grades_ret
grades_df.applymap(convert_grades)
Out[974]:
In [990]:
import pandas as pd
grades_df = pd.DataFrame(
data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio',
'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)
test1_df = pd.DataFrame(
{0: [95, 85, 75, 65, 55], 1: [95, 85, 75, 65, 55]},
index=[0, 1, 2, 3, 4]
)
# Change False to True for this block of code to see what it does
# DataFrame apply()
if False:
def convert_grades_curve(exam_grades):
# Pandas has a built-in function that will perform this calculation
# This will give the bottom 0% to 10% of students the grade 'F',
# 10% to 20% the grade 'D', and so on. You can read more about
# the qcut() function here:
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.qcut.html
return pd.qcut(exam_grades,
[0, 0.1, 0.2, 0.5, 0.8, 1],
labels=['F', 'D', 'C', 'B', 'A'])
# qcut() operates on a list, array, or Series. This is the
# result of running the function on a single column of the
# DataFrame.
print convert_grades_curve(grades_df['exam1'])
# qcut() does not work on DataFrames, but we can use apply()
# to call the function on each column separately
print grades_df.apply(convert_grades_curve)
def standardize_col(col):
return (col - col.mean())/ col.std()
def standardize(df):
'''
Fill in this function to standardize each column of the given
DataFrame. To standardize a variable, convert each value to the
number of standard deviations it is above or below the mean.
'''
return df.apply(standardize_col)
standardize(grades_df)
Out[990]:
In [1000]:
import numpy as np
import pandas as pd
df = pd.DataFrame({
'a': [4, 5, 3, 1, 2],
'b': [20, 10, 40, 50, 30],
'c': [25, 20, 5, 15, 10]
})
# Change False to True for this block of code to see what it does
# DataFrame apply() - use case 2
if False:
print df.apply(np.mean)
print df.apply(np.max)
def second_largest_in_col(col):
    return col.drop(col.idxmax()).max()
def second_largest(df):
'''
Fill in this function to return the second-largest value of each
column of the input DataFrame.
'''
return df.apply(second_largest_in_col)
second_largest(df)
Out[1000]:
In [1002]:
import pandas as pd
# Change False to True for each block of code to see what it does
# Adding a Series to a square DataFrame
if True:
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
0: [10, 20, 30, 40],
1: [50, 60, 70, 80],
2: [90, 100, 110, 120],
3: [130, 140, 150, 160]
})
print df
print '' # Create a blank line between outputs
print df + s
In [1003]:
# Adding a Series to a one-row DataFrame
if True:
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})
print df
print '' # Create a blank line between outputs
print df + s
In [1004]:
# Adding a Series to a one-column DataFrame
if True:
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({0: [10, 20, 30, 40]})
print df
print '' # Create a blank line between outputs
print df + s
In [1005]:
# Adding when DataFrame column names match Series index
if True:
s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
df = pd.DataFrame({
'a': [10, 20, 30, 40],
'b': [50, 60, 70, 80],
'c': [90, 100, 110, 120],
'd': [130, 140, 150, 160]
})
print df
print '' # Create a blank line between outputs
print df + s
In [1006]:
# Adding when DataFrame column names don't match Series index
if True:
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
'a': [10, 20, 30, 40],
'b': [50, 60, 70, 80],
'c': [90, 100, 110, 120],
'd': [130, 140, 150, 160]
})
print df
print '' # Create a blank line between outputs
print df + s
In [1007]:
import pandas as pd
# Adding using +
if False:
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
0: [10, 20, 30, 40],
1: [50, 60, 70, 80],
2: [90, 100, 110, 120],
3: [130, 140, 150, 160]
})
print df
print '' # Create a blank line between outputs
print df + s
# Adding with axis='index'
if False:
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
0: [10, 20, 30, 40],
1: [50, 60, 70, 80],
2: [90, 100, 110, 120],
3: [130, 140, 150, 160]
})
print df
print '' # Create a blank line between outputs
print df.add(s, axis='index')
# The functions sub(), mul(), and div() work similarly to add()
# Adding with axis='columns'
if True:
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
0: [10, 20, 30, 40],
1: [50, 60, 70, 80],
2: [90, 100, 110, 120],
3: [130, 140, 150, 160]
})
print df
print '' # Create a blank line between outputs
print df.add(s, axis='columns')
# The functions sub(), mul(), and div() work similarly to add()
In [1043]:
grades_df = pd.DataFrame(
data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio',
'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)
def standardize(df):
'''
Fill in this function to standardize each column of the given
DataFrame. To standardize a variable, convert each value to the
number of standard deviations it is above or below the mean.
This time, try to use vectorized operations instead of apply().
You should get the same results as you did before.
'''
for exam in df.columns:
df[exam] = (df[exam] - df[exam].mean())/df[exam].std(ddof=0)
return df
# Instructor solution
# return (df - df.mean())/df.std(ddof=0)
standardize(grades_df)
Out[1043]:
In [1076]:
grades_df = pd.DataFrame(
data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio',
'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)
def standardize_rows(df):
'''
Optional: Fill in this function to standardize each row of the given
DataFrame. Again, try not to use apply().
This one is more challenging than standardizing each column!
'''
last = len(df.index)
for index in range(0,last):
df.iloc[index] = (df.iloc[index] - df.iloc[index].mean())/df.iloc[index].std(ddof=0)
return df
# Instructor's Solution
# return df.sub(df.mean(axis='columns'), axis='index').div(df.std(axis='columns', ddof=0), axis='index')
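    # A step-by-step sketch (not part of the quiz) of what the instructor's
    # one-liner does, assuming the same grades_df defined above:
    #   row_means = df.mean(axis='columns')          # one mean per student
    #   row_stds = df.std(axis='columns', ddof=0)    # one std per student
    #   df.sub(row_means, axis='index').div(row_stds, axis='index')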
standardize_rows(grades_df)
Out[1076]:
In [1085]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame({
'value': values,
'even': values % 2 == 0,
'above_three': values > 3
}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])
# Change False to True for each block of code to see what it does
# Examine DataFrame
if False:
print example_df
# Examine groups
if True:
grouped_data = example_df.groupby('even')
# The groups attribute is a dictionary mapping keys to lists of row indexes
print grouped_data.groups
# Group by multiple columns
if False:
grouped_data = example_df.groupby(['even', 'above_three'])
print grouped_data.groups
# Get sum of each group
if False:
grouped_data = example_df.groupby('even')
print grouped_data.sum()
# Limit columns in result
if True:
grouped_data = example_df.groupby('even')
# You can take one or more columns from the result DataFrame
print grouped_data.sum()['value']
print '\n' # Blank line to separate results
# You can also take a subset of columns from the grouped data before
# collapsing to a DataFrame. In this case, the result is the same.
print grouped_data['value'].sum()
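# (Not in the original lesson) Several aggregates can be computed at once
# with agg(); a minimal sketch on the same grouped data:
if False:
    grouped_data = example_df.groupby('even')
    print grouped_data['value'].agg(['sum', 'mean', 'max'])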
In [1272]:
filename = 'nyc_subway_weather.csv'
subway_df = pd.read_csv(filename)
### Write code here to group the subway data by a variable of your choice, then
### either print out the mean ridership within each group or create a plot.
# Extracting some interesting columns
summary = subway_df.groupby('UNIT').mean()
mean_entries = summary['ENTRIESn']
mean_entries_hourly = summary['ENTRIESn_hourly']
mean_exits = summary['EXITSn']
mean_exits_hourly = summary['EXITSn_hourly']
mean_rain = summary['rain']
mean_temperature = summary['tempi']
mean_fog = summary['fog']
mean_throughput = mean_entries.add(mean_exits) / 2
throughput_dev = mean_throughput.std(ddof=0)
mean_throughput_std = (mean_throughput - mean_throughput.mean()) / throughput_dev
# Getting some high traffic stations.
# Here a station is considered high traffic if its mean throughput is
# more than 2 standard deviations above the overall mean.
high_traffic_stations = mean_throughput_std.loc[mean_throughput_std > 2].index
high_traffic_throughput = mean_throughput[mean_throughput.index.isin(high_traffic_stations)]
high_traffic_rain = mean_rain[mean_rain.index.isin(high_traffic_stations)]
high_traffic_temperature = mean_temperature[mean_temperature.index.isin(high_traffic_stations)]
high_traffic_fog = mean_fog[mean_fog.index.isin(high_traffic_stations)]
print len(high_traffic_stations)
dataframe = pd.concat([high_traffic_throughput, high_traffic_rain, high_traffic_temperature, high_traffic_fog], axis=1).reset_index()
In [1273]:
# Here I'm going to show how correlated some factors are
# for high-traffic subway stations.
sns.set(style="dark")
corr = dataframe.corr()
corr.columns = ['Traffic', 'Rain', 'Temperature', 'Fog']
corr.index = ['Traffic', 'Rain', 'Temperature', 'Fog']
cmap = sns.diverging_palette(9, 145, as_cmap=True)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_title('NYC Subway Correlation Matrix', fontsize=16)
sns.heatmap(corr, cmap=cmap, vmax=.3, square=True, ax=ax)
Out[1273]:
In [1274]:
import numpy as np
import pandas as pd
values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame({
'value': values,
'even': values % 2 == 0,
'above_three': values > 3
}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])
# Change False to True for each block of code to see what it does
# Standardize each group
if False:
def standardize(xs):
return (xs - xs.mean()) / xs.std()
grouped_data = example_df.groupby('even')
print grouped_data['value'].apply(standardize)
# Find second largest value in each group
if False:
def second_largest(xs):
        # sort_values() returns a new, descending-sorted copy (Series.sort() is deprecated)
        sorted_xs = xs.sort_values(ascending=False)
return sorted_xs.iloc[1]
grouped_data = example_df.groupby('even')
print grouped_data['value'].apply(second_largest)
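# An alternative sketch (not part of the lesson) using nlargest(), which
# avoids sorting each whole group; assumes every group has at least two rows.
if False:
    grouped_data = example_df.groupby('even')
    print grouped_data['value'].apply(lambda xs: xs.nlargest(2).iloc[-1])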
In [1292]:
# --- Quiz ---
# DataFrame with cumulative entries and exits for multiple stations
ridership_df = pd.DataFrame({
'UNIT': ['R051', 'R079', 'R051', 'R079', 'R051', 'R079', 'R051', 'R079', 'R051'],
'TIMEn': ['00:00:00', '02:00:00', '04:00:00', '06:00:00', '08:00:00', '10:00:00', '12:00:00', '14:00:00', '16:00:00'],
'ENTRIESn': [3144312, 8936644, 3144335, 8936658, 3144353, 8936687, 3144424, 8936819, 3144594],
'EXITSn': [1088151, 13755385, 1088159, 13755393, 1088177, 13755598, 1088231, 13756191, 1088275]
})
# Helper modeled on the get_hourly_entries_and_exits() function from the
# earlier "DataFrame Vectorized Operations" quiz: hourly counts are the
# differences between consecutive cumulative readings.
def get_hourly_entries_and_exits(entries_and_exits):
    return entries_and_exits - entries_and_exits.shift(1)

def hourly_for_group(entries_and_exits):
    '''
    Fill in this function to take a DataFrame with cumulative entries
    and exits and return a DataFrame with hourly entries and exits.
    The hourly entries and exits should be calculated separately for
    each station (the 'UNIT' column).
    Hint: Take a look at the `get_hourly_entries_and_exits()` function
    you wrote in a previous quiz, DataFrame Vectorized Operations. If
    you copy it here and rename it, you can use it and the `.apply()`
    function to help solve this problem.
    '''
    # Apply the cumulative-to-hourly conversion to each station separately.
    return entries_and_exits.groupby('UNIT')[['ENTRIESn', 'EXITSn']].apply(get_hourly_entries_and_exits)
hourly_for_group(ridership_df)
In [1309]:
import pandas as pd
subway_df = pd.DataFrame({
'UNIT': ['R003', 'R003', 'R003', 'R003', 'R003', 'R004', 'R004', 'R004',
'R004', 'R004'],
'DATEn': ['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
'05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11'],
'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'ENTRIESn': [ 4388333, 4388348, 4389885, 4391507, 4393043, 14656120,
14656174, 14660126, 14664247, 14668301],
'EXITSn': [ 2911002, 2911036, 2912127, 2913223, 2914284, 14451774,
14451851, 14454734, 14457780, 14460818],
'latitude': [ 40.689945, 40.689945, 40.689945, 40.689945, 40.689945,
40.69132 , 40.69132 , 40.69132 , 40.69132 , 40.69132 ],
'longitude': [-73.872564, -73.872564, -73.872564, -73.872564, -73.872564,
-73.867135, -73.867135, -73.867135, -73.867135, -73.867135]
})
weather_df = pd.DataFrame({
'DATEn': ['05-01-11', '05-01-11', '05-02-11', '05-02-11', '05-03-11',
'05-03-11', '05-04-11', '05-04-11', '05-05-11', '05-05-11'],
'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'latitude': [ 40.689945, 40.69132 , 40.689945, 40.69132 , 40.689945,
40.69132 , 40.689945, 40.69132 , 40.689945, 40.69132 ],
'longitude': [-73.872564, -73.867135, -73.872564, -73.867135, -73.872564,
-73.867135, -73.872564, -73.867135, -73.872564, -73.867135],
'pressurei': [ 30.24, 30.24, 30.32, 30.32, 30.14, 30.14, 29.98, 29.98,
30.01, 30.01],
'fog': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'rain': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'tempi': [ 52. , 52. , 48.9, 48.9, 54. , 54. , 57.2, 57.2, 48.9, 48.9],
'wspdi': [ 8.1, 8.1, 6.9, 6.9, 3.5, 3.5, 15. , 15. , 15. , 15. ]
})
def combine_dfs(subway_df, weather_df):
'''
Fill in this function to take 2 DataFrames, one with subway data and one with weather data,
and return a single dataframe with one row for each date, hour, and location. Only include
times and locations that have both subway data and weather data available.
'''
    # Merge on date, hour, and location; an inner join keeps only rows that
    # have both subway and weather data, as the docstring asks.
    return subway_df.merge(weather_df,
                           on=['DATEn', 'hour', 'latitude', 'longitude'],
                           how='inner')
combine_dfs(subway_df, weather_df)
Out[1309]:
In [1317]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
filename = 'nyc_subway_weather.csv'
subway_df = pd.read_csv(filename)
values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame({
'value': values,
'even': values % 2 == 0,
'above_three': values > 3
}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])
# Change False to True for this block of code to see what it does
# groupby() without as_index
if False:
first_even = example_df.groupby('even').first()
print first_even
print first_even['even'] # Causes an error. 'even' is no longer a column in the DataFrame
# groupby() with as_index=False
if True:
first_even = example_df.groupby('even', as_index=False).first()
print first_even
print first_even['even'] # Now 'even' is still a column in the DataFrame
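# A hypothetical follow-up sketch showing why as_index=False is handy: the
# group key stays a regular column, so the grouped means can be merged
# straight back onto the original rows (all column names are from example_df).
if False:
    mean_by_even = example_df.groupby('even', as_index=False).mean()
    print example_df.merge(mean_by_even, on='even', suffixes=('', '_group_mean'))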
In [1318]:
filename = 'nyc_subway_weather.csv'
subway_df = pd.read_csv(filename)
## Make a plot of your choice here showing something interesting about the subway data.
## Matplotlib documentation here: http://matplotlib.org/api/pyplot_api.html
## Once you've got something you're happy with, share it on the forums!
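# A minimal sketch of one possible plot, using only columns referenced earlier
# in this notebook ('rain' and 'ENTRIESn_hourly'); not a definitive answer to
# the quiz, just a starting point.
if False:
    import matplotlib.pyplot as plt
    ridership_by_rain = subway_df.groupby('rain')['ENTRIESn_hourly'].mean()
    ridership_by_rain.plot(kind='bar')
    plt.ylabel('Mean hourly entries')
    plt.title('Mean hourly subway entries: dry (0) vs. rainy (1) readings')
    plt.show()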
In [ ]: