Before we get started, a couple of reminders to keep in mind when using iPython notebooks:

  • Remember that you can see from the left side of a code cell when it was last run if there is a number within the brackets.
  • When you start a new notebook session, make sure you run all of the cells up to the point where you last left off. Even if the output is still visible from when you ran the cells in your previous session, the kernel starts in a fresh state so you'll need to reload the data, etc. on a new session.
  • The previous point is useful to keep in mind if your answers do not match what is expected in the lesson's quizzes. Try reloading the data and run all of the processing steps one by one in order to make sure that you are working with the same variables and data that are at each quiz stage.

Load Data from CSVs


In [2]:
import unicodecsv

## Longer version of code (replaced with shorter, equivalent version below)

# enrollments = []
# f = open('enrollments.csv', 'rb')
# reader = unicodecsv.DictReader(f)
# for row in reader:
#     enrollments.append(row)
# f.close()

with open('enrollments.csv', 'rb') as f:
    reader = unicodecsv.DictReader(f)
    enrollments = list(reader)

In [3]:
#####################################
#                 1                 #
#####################################

## Read in the data from daily_engagement.csv and project_submissions.csv 
## and store the results in the below variables.
## Then look at the first row of each table.
import unicodecsv

with open('daily_engagement.csv', 'rb') as f:
    reader1 = unicodecsv.DictReader(f)
    daily_engagement = list(reader1)
print daily_engagement[0]
    
with open('project_submissions.csv', 'rb') as f:
    reader2 = unicodecsv.DictReader(f)
    project_submissions = list(reader2)
print project_submissions[0]


{u'lessons_completed': u'0.0', u'num_courses_visited': u'1.0', u'total_minutes_visited': u'11.6793745', u'projects_completed': u'0.0', u'acct': u'0', u'utc_date': u'2015-01-09'}
{u'lesson_key': u'3176718735', u'processing_state': u'EVALUATED', u'account_key': u'256', u'assigned_rating': u'UNGRADED', u'completion_date': u'2015-01-16', u'creation_date': u'2015-01-14'}

Fixing Data Types


In [4]:
from datetime import datetime as dt

# Takes a date as a string, and returns a Python datetime object. 
# If there is no date given, returns None
def parse_date(date):
    if date == '':
        return None
    else:
        return dt.strptime(date, '%Y-%m-%d')
    
# Takes a string which is either an empty string or represents an integer,
# and returns an int or None.
def parse_maybe_int(i):
    if i == '':
        return None
    else:
        return int(i)

# Clean up the data types in the enrollments table
for enrollment in enrollments:
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'
    enrollment['join_date'] = parse_date(enrollment['join_date'])
    
enrollments[0]


Out[4]:
{u'account_key': u'448',
 u'cancel_date': datetime.datetime(2015, 1, 14, 0, 0),
 u'days_to_cancel': 65,
 u'is_canceled': True,
 u'is_udacity': True,
 u'join_date': datetime.datetime(2014, 11, 10, 0, 0),
 u'status': u'canceled'}

In [5]:
# Clean up the data types in the engagement table
for engagement_record in daily_engagement:
    engagement_record['lessons_completed'] = int(float(engagement_record['lessons_completed']))
    engagement_record['num_courses_visited'] = int(float(engagement_record['num_courses_visited']))
    engagement_record['projects_completed'] = int(float(engagement_record['projects_completed']))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])
    
daily_engagement[0]


Out[5]:
{u'acct': u'0',
 u'lessons_completed': 0,
 u'num_courses_visited': 1,
 u'projects_completed': 0,
 u'total_minutes_visited': 11.6793745,
 u'utc_date': datetime.datetime(2015, 1, 9, 0, 0)}

In [6]:
# Clean up the data types in the submissions table
for submission in project_submissions:
    submission['completion_date'] = parse_date(submission['completion_date'])
    submission['creation_date'] = parse_date(submission['creation_date'])

project_submissions[0]


Out[6]:
{u'account_key': u'256',
 u'assigned_rating': u'UNGRADED',
 u'completion_date': datetime.datetime(2015, 1, 16, 0, 0),
 u'creation_date': datetime.datetime(2015, 1, 14, 0, 0),
 u'lesson_key': u'3176718735',
 u'processing_state': u'EVALUATED'}

Note when running the above cells that we are actively changing the contents of our data variables. If you try to run these cells multiple times in the same session, an error will occur.

Investigating the Data


In [7]:
#####################################
#                 2                 #
#####################################

## Find the total number of rows and the number of unique students (account keys)
## in each table.
enrollment_num_rows = len(enrollments) 
list_of_unique_enrl_keys = []
for item in enrollments:
    if(int(item.get('account_key')) not in list_of_unique_enrl_keys):
        list_of_unique_enrl_keys.append(int(item.get('account_key')))
enrollment_num_unique_students = len(list_of_unique_enrl_keys)

engagement_num_rows = len(daily_engagement)
list_of_unique_eng_keys = []
for item in daily_engagement:
    if(int(item.get('acct')) not in list_of_unique_eng_keys):
        list_of_unique_eng_keys.append(int(item.get('acct')))
engagement_num_unique_students = len(list_of_unique_eng_keys)

submission_num_rows = len(project_submissions)             
list_of_unique_keys = []
for item in project_submissions:
    if(int(item.get('account_key')) not in list_of_unique_keys):
        list_of_unique_keys.append(int(item.get('account_key')))
submission_num_unique_students = len(list_of_unique_keys)

print enrollment_num_rows
print engagement_num_rows
print submission_num_rows

print enrollment_num_unique_students
print engagement_num_unique_students
print submission_num_unique_students


1640
136240
3642
1302
1237
743

Problems in the Data


In [8]:
#####################################
#                 3                 #
#####################################

## Rename the "acct" column in the daily_engagement table to "account_key".
for item in daily_engagement:
    item['account_key'] = item.pop('acct')

Missing Engagement Records


In [9]:
#####################################
#                 4                 #
#####################################

## Find any one student enrollments where the student is missing from the daily engagement table.
## Output that enrollment.

# Here the goal is to get the account ids of accounts in the enrollment list where the student is 
# enrolled greater than 1 day and none of the times are 'None' and where the enrolled student is 
# not in the list of accounts in the engagement table 

good_enrollments = []

for item in enrollments:    
    if item.get('join_date') != item.get('cancel_date') and item.get('days_to_cancel') != 'None':
        good_enrollments.append(item.get('account_key'))

bad_ids = []
for pid in good_enrollments:
    if int(pid) not in list_of_unique_eng_keys:
        bad_ids.append(int(pid))

print len(bad_ids)


3

Checking for More Problem Records


In [10]:
#####################################
#                 5                 #
#####################################

## Find the number of surprising data points (enrollments missing from
## the engagement table) that remain, if any.

test_enrolment_ids = []
for item in enrollments:
    if item.get('is_udacity') and int(item.get('account_key')) not in test_enrolment_ids:
        test_enrolment_ids.append(int(item.get('account_key')))

bad_ids = []
num_problem_students = 0
for item in daily_engagement:
    if int(item.get('account_key')) in test_enrolment_ids and int(item.get('account_key')) not in bad_ids:
        bad_ids.append(int(item.get('account_key')))

num_problem_students = len(bad_ids)
print num_problem_students


3

Tracking Down the Remaining Problems


In [11]:
# Create a set of the account keys for all Udacity test accounts
udacity_test_accounts = set()
for enrollment in enrollments:
    if enrollment['is_udacity']:
        udacity_test_accounts.add(enrollment['account_key'])
len(udacity_test_accounts)


Out[11]:
6

In [12]:
# Given some data with an account_key field, removes any records corresponding to Udacity test accounts
def remove_udacity_accounts(data):
    non_udacity_data = []
    for data_point in data:
        if data_point['account_key'] not in udacity_test_accounts:
            non_udacity_data.append(data_point)
    return non_udacity_data

In [13]:
# Remove Udacity test accounts from all three tables
non_udacity_enrollments = remove_udacity_accounts(enrollments)
non_udacity_engagement = remove_udacity_accounts(daily_engagement)
non_udacity_submissions = remove_udacity_accounts(project_submissions)

print len(non_udacity_enrollments)
print len(non_udacity_engagement)
print len(non_udacity_submissions)


1622
135656
3634

Refining the Question


In [14]:
#####################################
#                 6                 #
#####################################

## Create a dictionary named paid_students containing all students who either
## haven't canceled yet or who remained enrolled for more than 7 days. The keys
## should be account keys, and the values should be the date the student enrolled.

paid_students = {}

for enrollment in non_udacity_enrollments:
    if not enrollment['is_canceled'] or enrollment['days_to_cancel'] > 7:
        account_key = enrollment['account_key']
        enrollment_date = enrollment['join_date']
        
        if account_key not in paid_students or enrollment_date > paid_students[account_key]:
            paid_students[account_key] = enrollment_date
                
len(paid_students)


Out[14]:
995

Getting Data from First Week


In [15]:
# Takes a student's join date and the date of a specific engagement record,
# and returns True if that engagement record happened within one week
# of the student joining.
def within_one_week(join_date, engagement_date):
    time_delta = engagement_date - join_date 
    return time_delta.days < 7 and time_delta.days >= 0

In [16]:
# Function to remove trials
def remove_free_trial_cancels(data):
    new_data = []
    for data_point in data:    
        if data_point.get('account_key') in paid_students:
            new_data.append(data_point)  
    return new_data

In [17]:
paid_enrollments = remove_free_trial_cancels(non_udacity_enrollments)
print len(paid_enrollments)

paid_engagement = remove_free_trial_cancels(non_udacity_engagement)
print len(paid_engagement)

paid_submissions = remove_free_trial_cancels(non_udacity_submissions)
print len(paid_submissions)


1293
134549
3618

In [18]:
#####################################
#                 7                 #
#####################################

## Create a list of rows from the engagement table including only rows where
## the student is one of the paid students you just found, and the date is within
## one week of the student's join date.
# paid_students is list of paid account keys

# need engagement utc_dates where the id is in the paid_students array
paid_engagement_in_first_week = []

for engagement_record in paid_engagement:
    account_key = engagement_record.get('account_key')
    join_date = paid_students[account_key]
    engagement_record_date = engagement_record.get('utc_date')
    
    if within_one_week(join_date,engagement_record_date):
        paid_engagement_in_first_week.append(engagement_record)

len(paid_engagement_in_first_week)


Out[18]:
6919

Exploring Student Engagement


In [19]:
from collections import defaultdict

# Create a dictionary of engagement grouped by student.
# The keys are account keys, and the values are lists of engagement records.
engagement_by_account = defaultdict(list)
for engagement_record in paid_engagement_in_first_week:
    account_key = engagement_record['account_key']
    engagement_by_account[account_key].append(engagement_record)

In [20]:
# Create a dictionary with the total minutes each student spent in the classroom during the first week.
# The keys are account keys, and the values are numbers (total minutes)
total_minutes_by_account = {}
for account_key, engagement_for_student in engagement_by_account.items():
    total_minutes = 0
    for engagement_record in engagement_for_student:
        total_minutes += engagement_record['total_minutes_visited']
    total_minutes_by_account[account_key] = total_minutes

In [21]:
import numpy as np

# Summarize the data about minutes spent in the classroom
total_minutes = total_minutes_by_account.values()
print 'Mean:', np.mean(total_minutes)
print 'Standard deviation:', np.std(total_minutes)
print 'Minimum:', np.min(total_minutes)
print 'Maximum:', np.max(total_minutes)


Mean: 306.708326753
Standard deviation: 412.996933409
Minimum: 0.0
Maximum: 3564.7332645

Debugging Data Analysis Code


In [22]:
#####################################
#                 8                 #
#####################################

## Go through a similar process as before to see if there is a problem.
## Locate at least one surprising piece of data, output it, and take a look at it.

max_time_spent = 0
bad_account = 0
for account_key, time_spent in total_minutes_by_account.items():
    if time_spent > max_time_spent:
        max_time_spent = time_spent
        bad_account = account_key

print "Bad Account = " + str(bad_account) + " Time Spent = " + str(max_time_spent)


Bad Account = 163 Time Spent = 3564.7332645

In [23]:
bad_engagements = []

for engagement in paid_engagement_in_first_week:
    if engagement.get('account_key') == bad_account:
        bad_engagements.append(engagement)
print bad_engagements


[{u'lessons_completed': 4, u'num_courses_visited': 4, u'total_minutes_visited': 850.519339666, u'projects_completed': 0, 'account_key': u'163', u'utc_date': datetime.datetime(2015, 7, 9, 0, 0)}, {u'lessons_completed': 6, u'num_courses_visited': 6, u'total_minutes_visited': 872.633923334, u'projects_completed': 0, 'account_key': u'163', u'utc_date': datetime.datetime(2015, 7, 10, 0, 0)}, {u'lessons_completed': 6, u'num_courses_visited': 2, u'total_minutes_visited': 777.018903666, u'projects_completed': 0, 'account_key': u'163', u'utc_date': datetime.datetime(2015, 7, 11, 0, 0)}, {u'lessons_completed': 2, u'num_courses_visited': 1, u'total_minutes_visited': 294.568774, u'projects_completed': 0, 'account_key': u'163', u'utc_date': datetime.datetime(2015, 7, 12, 0, 0)}, {u'lessons_completed': 1, u'num_courses_visited': 3, u'total_minutes_visited': 471.2139785, u'projects_completed': 0, 'account_key': u'163', u'utc_date': datetime.datetime(2015, 7, 13, 0, 0)}, {u'lessons_completed': 1, u'num_courses_visited': 2, u'total_minutes_visited': 298.778345333, u'projects_completed': 0, 'account_key': u'163', u'utc_date': datetime.datetime(2015, 7, 14, 0, 0)}, {u'lessons_completed': 0, u'num_courses_visited': 0, u'total_minutes_visited': 0.0, u'projects_completed': 0, 'account_key': u'163', u'utc_date': datetime.datetime(2015, 7, 15, 0, 0)}]

Lessons Completed in First Week


In [24]:
#####################################
#                 9                 #
#####################################

## Adapt the code above to find the mean, standard deviation, minimum, and maximum for
## the number of lessons completed by each student during the first week. Try creating
## one or more functions to re-use the code above.

lessons_completed_by_student={}
lesson_metrics=[]

for account_key, engagement_for_student in engagement_by_account.items():
    
    lessons_completed = 0
    for record in engagement_for_student:
        lessons_completed += int(record.get('lessons_completed'))
    lesson_metrics.append(lessons_completed)
    lessons_completed_by_student[account_key] = lessons_completed
    

# Summarize the data about lessons completed
total_minutes = total_minutes_by_account.values()
print 'Mean:', np.mean(lesson_metrics)
print 'Standard deviation:', np.std(lesson_metrics)
print 'Minimum:', np.min(lesson_metrics)
print 'Maximum:', np.max(lesson_metrics)


Mean: 1.63618090452
Standard deviation: 3.00256129983
Minimum: 0
Maximum: 36

Number of Visits in First Week


In [25]:
######################################
#                 10                 #
######################################

## Find the mean, standard deviation, minimum, and maximum for the number of
## days each student visits the classroom during the first week.
days_visited_by_student={}
days_visited=[]

for account_key, engagement_for_student in engagement_by_account.items():
    days = 0
    for item in engagement_for_student:
        if item['num_courses_visited'] > 0:
            days += 1  
    days_visited.append(days)
    days_visited_by_student[account_key]=days

    
# Summarize the data about lessons completed
total_minutes = total_minutes_by_account.values()

print 'Mean:', np.mean(days_visited)
print 'Standard deviation:', np.std(days_visited)
print 'Minimum:', np.min(days_visited)
print 'Maximum:', np.max(days_visited)


Mean: 2.86733668342
Standard deviation: 2.25519800292
Minimum: 0
Maximum: 7

Splitting out Passing Students


In [63]:
######################################
#                 11                 #
######################################

## Create two lists of engagement data for paid students in the first week.
## The first list should contain data for students who eventually pass the
## subway project, and the second list should contain data for students
## who do not.

subway_project_lesson_keys = ['746169184', '3176718735']

passing_subway = []
non_passing_subway_temp = []
non_passing_subway = []
lesson_ids = []


for submission in paid_submissions:
    pass_fail = str(submission.get('assigned_rating'))
    lesson_key = str(submission.get('lesson_key'))
    account_key = int(submission.get('account_key'))
    
    if lesson_key not in lesson_ids:
        lesson_ids.append(str(submission.get('lesson_key')))
    
    if lesson_key in subway_project_lesson_keys and \
        (pass_fail == 'PASSED' or pass_fail == 'DISTINCTION') and \
        account_key not in passing_subway:
        passing_subway.append(account_key)
        
    elif pass_fail != 'PASSED' and pass_fail != 'DISTINCTION' and\
        str(submission.get('assigned_rating')) != 'UNGRADED':
        non_passing_subway_temp.append(account_key)
    
for submission in non_passing_subway_temp:
    if submission not in passing_subway:
        non_passing_subway.append(submission)

        
        
print "Students Passed Subway = " + str(len(passing_subway))
print "Students Failed Subway = " + str(len(non_passing_subway))
print "Overlap = " + str(len([i for i in passing_subway if i in non_passing_subway]))
    
passing_engagement = []
non_passing_engagement = []

for engagement in paid_engagement_in_first_week:
    acc_key = int(engagement.get('account_key'))
    if acc_key in passing_subway:
        passing_engagement.append(acc_key)
    else:
        non_passing_engagement.append(acc_key)

print len(passing_engagement)
print len(non_passing_engagement)
print len([i for i in passing_engagement if i in non_passing_engagement])


Students Passed Subway = 647
Students Failed Subway = 98
Overlap = 0
4527
2392
0
## Comparing the Two Student Groups

In [104]:
######################################
#                 12                 #
######################################

## Compute some metrics you're interested in and see how they differ for
## students who pass the subway project vs. students who don't. A good
## starting point would be the metrics we looked at earlier (minutes spent
## in the classroom, lessons completed, and days visited).

minutes_passing = []
minutes_non_passing = []

lesson_count_passing = []
lesson_count_non_passing = []

days_visited_passing = []
days_visited_non_passing = []

for account_key, engagement_for_student in engagement_by_account.items():
    account_key = int(account_key)
    minutes = 0
    lesson_count = 0
    days_visited = 0

    if account_key in passing_subway:
        for engagement in engagement_for_student:
            minutes += float(engagement.get('total_minutes_visited'))
            lesson_count += int(engagement.get('lessons_completed'))
            if float(engagement.get('total_minutes_visited')) > 0:
                days_visited += 1
        minutes_passing.append(minutes)
        lesson_count_passing.append(lesson_count)
        days_visited_passing.append(days_visited)

    else:
        for engagement in engagement_for_student:
            minutes += float(engagement.get('total_minutes_visited'))
            lesson_count += int(engagement.get('lessons_completed'))
            if float(engagement.get('total_minutes_visited')) > 0:
                days_visited += 1
        minutes_non_passing.append(minutes)
        lesson_count_non_passing.append(lesson_count)
        days_visited_non_passing.append(days_visited)
        
print 'Minutes Spent in Classrom:'
print ''
print '------------------------------------------------'
print 'Passing Metrics:'
print 'Mean Passing:', np.mean(minutes_passing)
print 'Standard deviation:', np.std(minutes_passing)
print 'Minimum:', np.min(minutes_passing)
print 'Maximum:', np.max(minutes_passing)

print '------------------------------------------------'
print 'Non-Passing Metrics:'
print 'Mean Non-Passing:', np.mean(minutes_non_passing)
print 'Standard deviation:', np.std(minutes_non_passing)
print 'Minimum:', np.min(minutes_non_passing)
print 'Maximum:', np.max(minutes_non_passing)

print ''
print ''
print 'Student Lesson Count:'
print ''
print '------------------------------------------------'
print 'Passing Metrics:'
print 'Mean Passing:', np.mean(lesson_count_passing)
print 'Standard deviation:', np.std(lesson_count_passing)
print 'Minimum:', np.min(lesson_count_passing)
print 'Maximum:', np.max(lesson_count_passing)

print '------------------------------------------------'
print 'Non-Passing Metrics:'
print 'Mean Non-Passing:', np.mean(lesson_count_non_passing)
print 'Standard deviation:', np.std(lesson_count_non_passing)
print 'Minimum:', np.min(lesson_count_non_passing)
print 'Maximum:', np.max(lesson_count_non_passing)

print ''
print ''
print 'Student Days Visited:'
print ''
print '------------------------------------------------'
print 'Passing Metrics:'
print 'Mean Passing:', np.mean(days_visited_passing)
print 'Standard deviation:', np.std(days_visited_passing)
print 'Minimum:', np.min(days_visited_passing)
print 'Maximum:', np.max(days_visited_passing)

print '------------------------------------------------'
print 'Non-Passing Metrics:'
print 'Mean Non-Passing:', np.mean(days_visited_non_passing)
print 'Standard deviation:', np.std(days_visited_non_passing)
print 'Minimum:', np.min(days_visited_non_passing)
print 'Maximum:', np.max(days_visited_non_passing)


Minutes Spent in Classrom:

------------------------------------------------
Passing Metrics:
Mean Passing: 394.586046484
Standard deviation: 448.499519327
Minimum: 0.0
Maximum: 3564.7332645
------------------------------------------------
Non-Passing Metrics:
Mean Non-Passing: 143.326474267
Standard deviation: 269.538619011
Minimum: 0.0
Maximum: 1768.52274933


Student Lesson Count:

------------------------------------------------
Passing Metrics:
Mean Passing: 2.05255023184
Standard deviation: 3.14222705558
Minimum: 0
Maximum: 36
------------------------------------------------
Non-Passing Metrics:
Mean Non-Passing: 0.862068965517
Standard deviation: 2.54915994183
Minimum: 0
Maximum: 27


Student Days Visited:

------------------------------------------------
Passing Metrics:
Mean Passing: 3.38485316847
Standard deviation: 2.25882147092
Minimum: 0
Maximum: 7
------------------------------------------------
Non-Passing Metrics:
Mean Non-Passing: 1.90517241379
Standard deviation: 1.90573144136
Minimum: 0
Maximum: 7

Making Histograms


In [206]:
######################################
#                 13                 #
######################################

## Make histograms of the three metrics we looked at earlier for both
## students who passed the subway project and students who didn't. You
## might also want to make histograms of any other metrics you examined.

%matplotlib inline 
# Above is for jupyter to render in the notebook

import matplotlib.pyplot as plt

fig = plt.figure()
fig = plt.figure(figsize=(20, 20))
fig.subplots_adjust(hspace=0.5, wspace=0.15)

ax1 = fig.add_subplot(3, 2, 1)
ax1.set
ax1.set_xlabel('Minutes Spent')
ax1.set_ylabel('Student Count')
ax1.set_title('Minutes Spent-Passing Students')

ax2 = fig.add_subplot(3, 2, 2)
ax2.set_xlabel('Minutes Spent')
ax2.set_ylabel('Student Count')
ax2.set_title('Minutes Spent-Failing Students')

ax3 = fig.add_subplot(3, 2, 3)
ax3.set_xlabel('Lessons Completed')
ax3.set_ylabel('Students')
ax3.set_title('Lessons Completed-Passing Students')

ax4 = fig.add_subplot(3, 2, 4)
ax4.set_xlabel('Lessons Completed')
ax4.set_ylabel('Students')
ax4.set_title('Lessons Completed-Failing Students')

ax5 = fig.add_subplot(3, 2, 5)
ax5.set_xlabel('Days Visited')
ax5.set_ylabel('Students')
ax5.set_title('Days Spent-Passing Students')

ax6 = fig.add_subplot(3, 2, 6)
ax6.set_xlabel('Days Visited')
ax6.set_ylabel('Students')
ax6.set_title('Days Spent-Failing Students')

ax1 = ax1.hist(minutes_passing, bins=30)
ax2 = ax2.hist(minutes_non_passing, bins=30)
ax3 = ax3.hist(lesson_count_passing)
ax4 = ax4.hist(lesson_count_non_passing)
ax5 = ax5.hist(days_visited_passing)
ax6 = ax6.hist(days_visited_non_passing)


<matplotlib.figure.Figure at 0x1160f7090>

In [228]:
## Comparison for description on git

%matplotlib inline 
# Above is for jupyter to render in the notebook

import matplotlib.pyplot as plt

plt.hist(minutes_passing, bins=30, color=['lightgreen'], label=['Passing Students'])
plt.hist(minutes_non_passing, bins=30, color=['pink'], label=['Failing Students'])
plt.xlabel('Time Spent (Minutes)')
plt.ylabel('# of Students')
plt.title('Student Success - Time Investment', fontsize=16)
plt.grid(True)
plt.legend()


Out[228]:
<matplotlib.legend.Legend at 0x119c29090>

In [229]:
## Comparison for description on git

%matplotlib inline 
# Above is for jupyter to render in the notebook

import matplotlib.pyplot as plt


plt.hist(days_visited_passing, color=['lightgreen'], label=['Passing Students'])
plt.hist(days_visited_non_passing, color=['pink'], label=['Failing Students'])
plt.xlabel('Course Usage - Logins by Day')
plt.ylabel('# of Students')
plt.title('Student Success - Course Usage', fontsize=16)
plt.grid(True)
plt.legend()


Out[229]:
<matplotlib.legend.Legend at 0x11951db90>

Improving Plots and Sharing Findings


In [282]:
######################################
#                 14                 #
######################################

## Make a more polished version of at least one of your visualizations
## from earlier. Try importing the seaborn library to make the visualization
## look better, adding axis labels and a title, and changing one or more
## arguments to the hist() function.
import seaborn as sns

sns.distplot(minutes_passing, kde=False, bins=10, label="Passing Students");
sns.distplot(minutes_non_passing, kde=False, bins=10, color='green', label="Failing Students");
plt.legend()
plt.xlabel('Time Spent (Minutes)')
plt.ylabel('# of Students')
plt.title('Student Success - Time Investment', fontsize=16)


Out[282]:
<matplotlib.text.Text at 0x114304b90>

In [283]:
sns.distplot(days_visited_passing, kde=False, bins=10, label="Passing Students");
sns.distplot(days_visited_non_passing, kde=False, bins=10, color='green', label="Failing Students");
plt.legend()
plt.xlabel('Course Usage - Logins by Day')
plt.ylabel('# of Students')
plt.title('Student Success - Course Usage', fontsize=16)


Out[283]:
<matplotlib.text.Text at 0x11b93c3d0>

In [285]:
# Setup pandas to read in the full engagement table
import pandas as pd

daily_engagement = pd.read_csv('daily_engagement_full.csv')

len(daily_engagement.get('acct').unique())


Out[285]:
1237

In [286]:
import numpy as np

# First 20 countries with employment data
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
])

In [329]:
# Change False to True for each block of code to see what it does

# Accessing elements
if True:
    print countries[0]
    print countries[3]

# Slicing
if False:
    print countries[0:3]
    print countries[:3]
    print countries[17:]
    print countries[:]

# Element types
if False:
    print countries.dtype
    print employment.dtype
    print np.array([0, 1, 2, 3]).dtype
    print np.array([1.0, 1.5, 2.0, 2.5]).dtype
    print np.array([True, False, True]).dtype
    print np.array(['AL', 'AK', 'AZ', 'AR', 'CA']).dtype

# Looping
if False:
    for country in countries:
        print 'Examining country {}'.format(country)

    for i in range(len(countries)):
        country = countries[i]
        country_employment = employment[i]
        print 'Country {} has employment {}'.format(country,
                country_employment)

# Numpy functions
if False:
    print employment.mean()
    print employment.std()
    print employment.max()
    print employment.sum()


Afghanistan
Angola

In [340]:
def max_employment(countries, employment):
    '''
    Fill in this function to return the name of the country
    with the highest employment in the given employment
    data, and the employment in that country.
    '''
    max_country = countries[int(tuple(np.where(employment==employment.max())[0])[0])]     # Replace this with your code
    max_value = employment.max()   # Replace this with your code

    return (max_country, max_value)

    # Alternate Instructor solution
#     return (countries[employment.argmax()], employment.max())

max_employment(countries, employment)


Out[340]:
('Angola', 75.699996949999999)

In [342]:
import numpy as np

# Change False to True for each block of code to see what it does

# Arithmetic operations between 2 NumPy arrays
if False:
    a = np.array([1, 2, 3, 4])
    b = np.array([1, 2, 1, 2])
    
    print a + b
    print a - b
    print a * b
    print a / b
    print a ** b
    
# Arithmetic operations between a NumPy array and a single number
if True:
    a = np.array([1, 2, 3, 4])
    b = 2
    
    print a + b
    print a - b
    print a * b
    print a / b
    print a ** b
    
# Logical operations with NumPy arrays
if False:
    a = np.array([True, True, False, False])
    b = np.array([True, False, True, False])
    
    print a & b
    print a | b
    print ~a
    
    print a & True
    print a & False
    
    print a | True
    print a | False
    
# Comparison operations between 2 NumPy Arrays
if False:
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([5, 4, 3, 2, 1])
    
    print a > b
    print a >= b
    print a < b
    print a <= b
    print a == b
    print a != b
    
# Comparison operations between a NumPy array and a single number
if False:
    a = np.array([1, 2, 3, 4])
    b = 2
    
    print a > b
    print a >= b
    print a < b
    print a <= b
    print a == b
    print a != b


[3 4 5 6]
[-1  0  1  2]
[2 4 6 8]
[0 1 1 2]
[ 1  4  9 16]

In [362]:
# First 20 countries with school completion data
countries = np.array([
       'Algeria', 'Argentina', 'Armenia', 'Aruba', 'Austria','Azerbaijan',
       'Bahamas', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Bolivia',
       'Botswana', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi',
       'Cambodia', 'Cameroon', 'Cape Verde'
])

# Female school completion rate in 2007 for those 20 countries
female_completion = np.array([
    97.35583,  104.62379,  103.02998,   95.14321,  103.69019,
    98.49185,  100.88828,   95.43974,   92.11484,   91.54804,
    95.98029,   98.22902,   96.12179,  119.28105,   97.84627,
    29.07386,   38.41644,   90.70509,   51.7478 ,   95.45072
])

# Male school completion rate in 2007 for those 20 countries
male_completion = np.array([
     95.47622,  100.66476,   99.7926 ,   91.48936,  103.22096,
     97.80458,  103.81398,   88.11736,   93.55611,   87.76347,
    102.45714,   98.73953,   92.22388,  115.3892 ,   98.70502,
     37.00692,   45.39401,   91.22084,   62.42028,   90.66958
])

def overall_completion_rate(female_completion, male_completion):
    '''
    Fill in this function to return a NumPy array containing the overall
    school completion rate for each country. The arguments are NumPy
    arrays giving the female and male completion of each country in
    the same order.
    '''
    
    return  (female_completion + male_completion)/2

overall_completion_rate(female_completion, male_completion)


Out[362]:
array([  96.416025,  102.644275,  101.41129 ,   93.316285,  103.455575,
         98.148215,  102.35113 ,   91.77855 ,   92.835475,   89.655755,
         99.218715,   98.484275,   94.172835,  117.335125,   98.275645,
         33.04039 ,   41.905225,   90.962965,   57.08404 ,   93.06015 ])

In [365]:
import numpy as np

# First 20 countries with employment data
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
])

# Change this country name to change what country will be printed when you
# click "Test Run". Your function will be called to determine the standardized
# score for this country for each of the given 5 Gapminder variables in 2007.
# The possible country names are available in the Downloadables section.

country_name = 'United States'

def standardize_data(values):
    '''
    Fill in this function to return a standardized version of the given values,
    which will be in a NumPy array. Each value should be translated into the
    number of standard deviations that value is away from the mean of the data.
    (A positive number indicates a value higher than the mean, and a negative
    number indicates a value lower than the mean.)
    
    The formula is effectively the the distance between the vector and the mean divided 
    by the deviation.
    '''

    return (values-np.mean(values))/np.std(values)

standardize_data(employment)


Out[365]:
array([-0.31965231, -0.780123  , -0.87650077,  1.82207181, -0.03051941,
       -1.99019768,  0.30144772, -0.16973184,  0.23719615,  0.84758731,
        0.18365304,  1.00821665,  0.87971351, -0.56595055, -1.07996476,
       -0.20185762,  1.38301845, -0.03051941,  1.2545153 , -1.87240259])

In [367]:
import numpy as np

# Change False to True for each block of code to see what it does

# Using index arrays
if False:
    a = np.array([1, 2, 3, 4])
    b = np.array([True, True, False, False])
    
    print a[b]
    print a[np.array([True, False, True, False])]
    
# Creating the index array using vectorized operations
if False:
    a = np.array([1, 2, 3, 2, 1])
    b = (a >= 2)
    
    print a[b]
    print a[a >= 2]
    
# Creating the index array using vectorized operations on another array
if False:
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([1, 2, 3, 2, 1])
    
    print b == 2
    print a[b == 2]

def mean_time_for_paid_students(time_spent, days_to_cancel):
    '''
    Fill in this function to calculate the mean time spent in the classroom
    for students who stayed enrolled at least (greater than or equal to) 7 days.
    Unlike in Lesson 1, you can assume that days_to_cancel will contain only
    integers (there are no students who have not canceled yet).
    
    The arguments are NumPy arrays. time_spent contains the amount of time spent
    in the classroom for each student, and days_to_cancel contains the number
    of days until each student cancel. The data is given in the same order
    in both arrays.
    '''
    return np.mean(time_spent[days_to_cancel>=7])

# Time spent in the classroom in the first week for 20 students
time_spent = np.array([
       12.89697233,    0.        ,   64.55043217,    0.        ,
       24.2315615 ,   39.991625  ,    0.        ,    0.        ,
      147.20683783,    0.        ,    0.        ,    0.        ,
       45.18261617,  157.60454283,  133.2434615 ,   52.85000767,
        0.        ,   54.9204785 ,   26.78142417,    0.
])

# Days to cancel for 20 students
days_to_cancel = np.array([
      4,   5,  37,   3,  12,   4,  35,  38,   5,  37,   3,   3,  68,
     38,  98,   2, 249,   2, 127,  35
])

mean_time_for_paid_students(time_spent, days_to_cancel)


Out[367]:
41.054003485454537

In [468]:
import pandas as pd

countries = ['Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda',
             'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
             'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
             'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia']

life_expectancy_values = [74.7,  75. ,  83.4,  57.6,  74.6,  75.4,  72.3,  81.5,  80.2,
                          70.3,  72.1,  76.4,  68.1,  75.2,  69.8,  79.4,  70.8,  62.7,
                          67.3,  70.6]

gdp_values = [ 1681.61390973,   2155.48523109,  21495.80508273,    562.98768478,
              13495.1274663 ,   9388.68852258,   1424.19056199,  24765.54890176,
              27036.48733192,   1945.63754911,  21721.61840978,  13373.21993972,
                483.97086804,   9783.98417323,   2253.46411147,  25034.66692293,
               3680.91642923,    366.04496652,   1175.92638695,   1132.21387981]

# Life expectancy and gdp data in 2007 for 20 countries
life_expectancy = pd.Series(life_expectancy_values)
gdp = pd.Series(gdp_values)

# Change False to True for each block of code to see what it does

# Accessing elements and slicing
if False:
    print life_expectancy[0]
    print gdp[3:6]
    
# Looping
if False:
    for country_life_expectancy in life_expectancy:
        print 'Examining life expectancy {}'.format(country_life_expectancy)
        
# Pandas functions
if False:
    print life_expectancy.mean()
    print life_expectancy.std()
    print gdp.max()
    print gdp.sum()

# Vectorized operations and index arrays
if False:
    a = pd.Series([1, 2, 3, 4])
    b = pd.Series([1, 2, 1, 2])
  
    print a + b
    print a * 2
    print a >= 3
    print a[a >= 3]
   
def variable_correlation(variable1, variable2):
    
    both_up = (variable1 > variable1.mean()) & (variable2 > variable2.mean())
    up = len(both_up[both_up==True])
    both_down = (variable1 < variable1.mean()) & (variable2 < variable2.mean())
    down = len(both_down[both_down==True])
    
    num_same_direction = up + down
    num_different_direction = len(variable1) -  num_same_direction
    
    return (num_same_direction, num_different_direction)

variable_correlation(life_expectancy, gdp)


17
20
Out[468]:
(17, 3)

In [472]:
import pandas as pd

countries = [
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
]


employment_values = [
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
]

# Employment data in 2007 for 20 countries
employment = pd.Series(employment_values, index=countries)

def max_employment(employment):
    '''
    Fill in this function to return the name of the country
    with the highest employment in the given employment
    data, and the employment in that country.
    
    The input will be a Pandas series where the values
    are employment and the index is country names.
    
    Try using the Pandas argmax() function. Documention is
    here: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.argmax.html
    '''
    max_country = employment.argmax()      # Replace this with your code
    max_value = employment.max()   # Replace this with your code

    return (max_country, max_value)

max_employment(employment)


Out[472]:
('Angola', 75.69999695)

In [479]:
import pandas as pd

# Change False to True for each block of code to see what it does

# Addition when indexes are the same
if False:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
    print s1 + s2

# Indexes have same elements in a different order
if False:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['b', 'd', 'a', 'c'])
    print s1 + s2

# Indexes overlap, but do not have exactly the same elements
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])
    print s1 + s2

# Indexes do not overlap
if False:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['e', 'f', 'g', 'h'])
    print s1 + s2


a   NaN
b   NaN
c    13
d    24
e   NaN
f   NaN
dtype: float64

In [519]:
import pandas as pd

s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])

# Try to write code that will add the 2 previous series together,
# but treating missing values from either series as 0. The result
# when printed out should be similar to the following line:
# print pd.Series([1, 2, 13, 24, 30, 40], index=['a', 'b', 'c', 'd', 'e', 'f'])


print s2.add(s1, fill_value=0)


a     1
b     2
c    13
d    24
e    30
f    40
dtype: float64

In [555]:
import pandas as pd

# Change False to True to see what the following block of code does

# Example pandas apply() usage (although this could have been done
# without apply() using vectorized operations)
if False:
    s = pd.Series([1, 2, 3, 4, 5])
    def add_one(x):
        return x + 1
    print s.apply(add_one)

names = pd.Series([
    'Andre Agassi',
    'Barry Bonds',
    'Christopher Columbus',
    'Daniel Defoe',
    'Emilio Estevez',
    'Fred Flintstone',
    'Greta Garbo',
    'Humbert Humbert',
    'Ivan Ilych',
    'James Joyce',
    'Keira Knightley',
    'Lois Lane',
    'Mike Myers',
    'Nick Nolte',
    'Ozzy Osbourne',
    'Pablo Picasso',
    'Quirinus Quirrell',
    'Rachael Ray',
    'Susan Sarandon',
    'Tina Turner',
    'Ugueth Urbina',
    'Vince Vaughn',
    'Woodrow Wilson',
    'Yoji Yamada',
    'Zinedine Zidane'
])

def reverse_names(names):
    '''
    Fill in this function to return a new series where each name
    in the input series has been transformed from the format
    "Firstname Lastname" to "Lastname, FirstName".
    
    Try to use the Pandas apply() function rather than a loop.
    '''
    temp = names.split(' ')
    return temp[1] +', '+ temp[0]

names.apply(reverse_names)


Out[555]:
0             Agassi, Andre
1              Bonds, Barry
2     Columbus, Christopher
3             Defoe, Daniel
4           Estevez, Emilio
5          Flintstone, Fred
6              Garbo, Greta
7          Humbert, Humbert
8               Ilych, Ivan
9              Joyce, James
10         Knightley, Keira
11               Lane, Lois
12              Myers, Mike
13              Nolte, Nick
14           Osbourne, Ozzy
15           Picasso, Pablo
16       Quirrell, Quirinus
17             Ray, Rachael
18          Sarandon, Susan
19             Turner, Tina
20           Urbina, Ugueth
21            Vaughn, Vince
22          Wilson, Woodrow
23             Yamada, Yoji
24         Zidane, Zinedine
dtype: object

In [734]:
import pandas as pd
import seaborn as sns

# The following code reads all the Gapminder data into Pandas DataFrames. You'll
# learn about DataFrames next lesson.

path = '/Users/Vinay/Documents/Code/Udacity/numpy_pandas_learning/'
employment = pd.read_csv(path + 'employment_above_15.csv', index_col='Country')
female_completion = pd.read_csv(path + 'female_completion_rate.csv', index_col='Country')
male_completion = pd.read_csv(path + 'male_completion_rate.csv', index_col='Country')
life_expectancy = pd.read_csv(path + 'life_expectancy.csv', index_col='Country')
gdp = pd.read_csv(path + 'gdp_per_capita.csv', index_col='Country')

# The following code creates a Pandas Series for each variable for the United States.
# You can change the string 'United States' to a country of your choice.

employment_us = employment.loc['United States']
female_completion_us = female_completion.loc['United States']
male_completion_us = male_completion.loc['United States']
life_expectancy_us = life_expectancy.loc['United States']
gdp_us = gdp.loc['United States']

# Use the Series defined above to create a plot of each variable over time for
# the country of your choice. You will only be able to display one plot at a time
# with each "Test Run".
year_list = employment_us.index.values
life = life_expectancy_us.get(year_list)
gdp = gdp_us.get(year_list)
m_ed = male_completion_us.get(year_list)
f_ed = female_completion_us.get(year_list)
ed = (f_ed.add(m_ed, fill_value=0).fillna(0))/2

# Here, we're only plotting Employment vs Life Expectancy as the completion stat seems 
# > 100% and the gdp chart didn't add much to the story (other than being correlated
# to Life Expectancy)

fig = plt.figure() # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.

employment_us.plot(ax=ax, label='Employment - USA')
life.plot(ax=ax2, color='green', label='Life Expectancy - USA')
ax.set_xlabel('Time - (Years)')
ax.set_ylabel('Employment (%)')
ax2.set_ylabel('Life Expectancy (Years)')
ax.legend(loc='upper left')
ax2.legend(loc='upper right')
ax.set_title('Demographic Statistics - USA', fontsize=16)

plt.show()



In [735]:
import pandas as pd
import seaborn as sns

# The following code reads all the Gapminder data into Pandas DataFrames. You'll
# learn about DataFrames next lesson.

path = '/Users/Vinay/Documents/Code/Udacity/numpy_pandas_learning/'
employment = pd.read_csv(path + 'employment_above_15.csv', index_col='Country')
female_completion = pd.read_csv(path + 'female_completion_rate.csv', index_col='Country')
male_completion = pd.read_csv(path + 'male_completion_rate.csv', index_col='Country')
life_expectancy = pd.read_csv(path + 'life_expectancy.csv', index_col='Country')
gdp = pd.read_csv(path + 'gdp_per_capita.csv', index_col='Country')

# The following code creates a Pandas Series for each variable for the United States.
# You can change the string 'United States' to a country of your choice.

employment_us = employment.loc['Spain']
female_completion_us = female_completion.loc['Spain']
male_completion_us = male_completion.loc['Spain']
life_expectancy_us = life_expectancy.loc['Spain']
gdp_us = gdp.loc['Spain']

# Use the Series defined above to create a plot of each variable over time for
# the country of your choice. You will only be able to display one plot at a time
# with each "Test Run".
year_list = employment_us.index.values
life = life_expectancy_us.get(year_list)
gdp = gdp_us.get(year_list)
m_ed = male_completion_us.get(year_list)
f_ed = female_completion_us.get(year_list)
ed = (f_ed.add(m_ed, fill_value=0).fillna(0))/2

# Here, we're only plotting Employment vs Life Expectancy as the completion stat seems 
# > 100% and the gdp chart didn't add much to the story (other than being correlated
# to Life Expectancy)

fig = plt.figure() # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.

employment_us.plot(ax=ax, label='Employment - Spain')
life.plot(ax=ax2, color='green', label='Life Expectancy - Spain')
ax.set_xlabel('Time - (Years)')
ax.set_ylabel('Employment (%)')
ax2.set_ylabel('Life Expectancy (Years)')
ax.legend(loc='upper left')
ax2.legend(loc='upper right')
ax.set_title('Demographic Statistics - Spain', fontsize=16)

plt.show()



In [736]:
import pandas as pd
import seaborn as sns

# The following code reads all the Gapminder data into Pandas DataFrames. You'll
# learn about DataFrames next lesson.

path = '/Users/Vinay/Documents/Code/Udacity/numpy_pandas_learning/'
employment = pd.read_csv(path + 'employment_above_15.csv', index_col='Country')
female_completion = pd.read_csv(path + 'female_completion_rate.csv', index_col='Country')
male_completion = pd.read_csv(path + 'male_completion_rate.csv', index_col='Country')
life_expectancy = pd.read_csv(path + 'life_expectancy.csv', index_col='Country')
gdp = pd.read_csv(path + 'gdp_per_capita.csv', index_col='Country')

# The following code creates a Pandas Series for each variable for the United States.
# You can change the string 'United States' to a country of your choice.

employment_us = employment.loc['Pakistan']
female_completion_us = female_completion.loc['Pakistan']
male_completion_us = male_completion.loc['Pakistan']
life_expectancy_us = life_expectancy.loc['Pakistan']
gdp_us = gdp.loc['Pakistan']

# Use the Series defined above to create a plot of each variable over time for
# the country of your choice. You will only be able to display one plot at a time
# with each "Test Run".
year_list = employment_us.index.values
life = life_expectancy_us.get(year_list)
gdp = gdp_us.get(year_list)
m_ed = male_completion_us.get(year_list)
f_ed = female_completion_us.get(year_list)
ed = (f_ed.add(m_ed, fill_value=0).fillna(0))/2

# Here, we're only plotting Employment vs Life Expectancy as the completion stat seems 
# > 100% and the gdp chart didn't add much to the story (other than being correlated
# to Life Expectancy)

fig = plt.figure() # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.

employment_us.plot(ax=ax, label='Employment - Pakistan')
life.plot(ax=ax2, color='green', label='Life Expectancy - Pakistan')
ax.set_xlabel('Time - (Years)')
ax.set_ylabel('Employment (%)')
ax2.set_ylabel('Life Expectancy (Years)')
ax.legend(loc='upper left')
ax2.legend(loc='upper right')
ax.set_title('Demographic Statistics - Pakistan', fontsize=16)

plt.show()



In [738]:
import pandas as pd
import seaborn as sns

# The following code reads all the Gapminder data into Pandas DataFrames. You'll
# learn about DataFrames next lesson.

path = '/Users/Vinay/Documents/Code/Udacity/numpy_pandas_learning/'
employment = pd.read_csv(path + 'employment_above_15.csv', index_col='Country')
female_completion = pd.read_csv(path + 'female_completion_rate.csv', index_col='Country')
male_completion = pd.read_csv(path + 'male_completion_rate.csv', index_col='Country')
life_expectancy = pd.read_csv(path + 'life_expectancy.csv', index_col='Country')
gdp = pd.read_csv(path + 'gdp_per_capita.csv', index_col='Country')

# The following code creates a Pandas Series for each variable for the United States.
# You can change the string 'United States' to a country of your choice.

employment_us = employment.loc['India']
female_completion_us = female_completion.loc['India']
male_completion_us = male_completion.loc['India']
life_expectancy_us = life_expectancy.loc['India']
gdp_us = gdp.loc['India']

# Use the Series defined above to create a plot of each variable over time for
# the country of your choice. You will only be able to display one plot at a time
# with each "Test Run".
year_list = employment_us.index.values
life = life_expectancy_us.get(year_list)
gdp = gdp_us.get(year_list)
m_ed = male_completion_us.get(year_list)
f_ed = female_completion_us.get(year_list)
ed = (f_ed.add(m_ed, fill_value=0).fillna(0))/2

# Here, we're only plotting Employment vs Life Expectancy as the completion stat seems 
# > 100% and the gdp chart didn't add much to the story (other than being correlated
# to Life Expectancy)

fig = plt.figure() # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.

employment_us.plot(ax=ax, label='Employment - India')
life.plot(ax=ax2, color='green', label='Life Expectancy - India')
ax.set_xlabel('Time - (Years)')
ax.set_ylabel('Employment (%)')
ax2.set_ylabel('Life Expectancy (Years)')
ax.legend(loc='upper left')
ax2.legend(loc='upper right')
ax.set_title('Demographic Statistics - India', fontsize=16)

plt.show()



In [784]:
import numpy as np

# Subway ridership for 5 stations on 10 different days
ridership2 = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

ridership1 = np.array([
    [ 5, 10, 15, 20, 25],
    [ 5, 10, 15, 20, 25],
    [ 5, 10, 15, 20, 25]
])

ridership = np.array([
       [   0,    0,    2,    5,    0],
       [1478, 3877, 3674, 2328, 2539],
       [1613, 4088, 3991, 6461, 2691],
       [1560, 3392, 3826, 4787, 2613],
       [1608, 4802, 3932, 4477, 2705],
       [1576, 3933, 3909, 4979, 2685],
       [  95,  229,  255,  496,  201],
       [   2,    0,    1,   27,    0],
       [1438, 3785, 3589, 4174, 2215],
       [1342, 4043, 4009, 4665, 3033]
])

# Change False to True for each block of code to see what it does

# Accessing elements
if False:
    print ridership[1, 3]
    print ridership[1:3, 3:5]
    print ridership[1, :]
    
# Vectorized operations on rows or columns
if False:
    print ridership[0, :] + ridership[1, :]
    print ridership[:, 0] + ridership[:, 1]
    
# Vectorized operations on entire arrays
if False:
    a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
    print a + b

def mean_riders_for_max_station(ridership):
    '''
    Fill in this function to find the station with the maximum riders on the
    first day, then return the mean riders per day for that station. Also
    return the mean ridership overall for comparsion.
    
    Hint: NumPy's argmax() function might be useful:
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html
    '''
    print np.max(ridership)
    print np.argwhere(ridership==np.max(ridership))[0][1]
    
    overall_mean = np.mean(ridership) 
    mean_for_max = np.mean(ridership[:, np.argwhere(ridership==np.max(ridership))[0][1]])
    
    return (overall_mean, mean_for_max)

mean_riders_for_max_station(ridership)


6461
3
Out[784]:
(2342.5999999999999, 3239.9000000000001)

In [788]:
import numpy as np

# Change False to True for this block of code to see what it does

# NumPy axis argument
if False:
    a = np.array([
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]
    ])
    
    print a.sum()
    print a.sum(axis=0)
    print a.sum(axis=1)
    
# Subway ridership for 5 stations on 10 different days
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

def min_and_max_riders_per_day(ridership):
    '''
    Fill in this function. First, for each subway station, calculate the
    mean ridership per day. Then, out of all the subway stations, return the
    maximum and minimum of these values. That is, find the maximum
    mean-ridership-per-day and the minimum mean-ridership-per-day for any
    subway station.
    '''
    
    max_daily_ridership = np.max(np.mean(ridership, axis=0))
    min_daily_ridership = np.min(np.mean(ridership, axis=0))
    
    return (max_daily_ridership, min_daily_ridership)

min_and_max_riders_per_day(ridership)


Out[788]:
(3239.9000000000001, 1071.2)

In [875]:
import pandas as pd

# Subway ridership for 5 stations on 10 different days
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)

# Change False to True for each block of code to see what it does

# DataFrame creation
if False:
    # You can create a DataFrame out of a dictionary mapping column names to values
    df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print df_1

    # You can also use a list of lists or a 2D NumPy array
    df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
    print df_2
   

# Accessing elements
if False:
    print ridership_df.iloc[0]
    print ridership_df.loc['05-05-11']
    print ridership_df['R003']
    print ridership_df.iloc[1, 3]
    
# Accessing multiple rows
if False:
    print ridership_df.iloc[1:4]
    
# Accessing multiple columns
if False:
    print ridership_df[['R003', 'R005']]
    
# Pandas axis
if False:
    df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print df.sum()
    print df.sum(axis=1)
    print df.values.sum()
    
def mean_riders_for_max_station(ridership):
    '''
    Fill in this function to find the station with the maximum riders on the
    first day, then return the mean riders per day for that station. Also
    return the mean ridership overall for comparsion.
    
    This is the same as a previous exercise, but this time the
    input is a Pandas DataFrame rather than a 2D NumPy array.
    '''    
    overall_mean = ridership.values.mean() 
    mean_for_max = ridership.loc[:, ridership.max().argmax()].mean()
    
    return (overall_mean, mean_for_max)

mean_riders_for_max_station(ridership_df)


Out[875]:
(2342.5999999999999, 3239.9)

In [890]:
import pandas as pd

filename = '/Users/Vinay/Documents/Code/Udacity/numpy_pandas_learning/nyc_subway_weather.csv'
subway_df = pd.read_csv(filename)

def correlation(x, y):
    '''
    Fill in this function to compute the correlation between the two
    input variables. Each input is either a NumPy array or a Pandas
    Series.
    
    correlation = average of (x in standard units) times (y in standard units)
    
    Remember to pass the argument "ddof=0" to the Pandas std() function!
    '''
    val_x = (x - x.mean())/x.std(ddof=0)
    val_y = (y - y.mean())/y.std(ddof=0)
    prod = val_x * val_y
    
    return prod.mean()

entries = subway_df['ENTRIESn_hourly']
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']

print correlation(entries, rain)
print correlation(entries, temp)
print correlation(rain, temp)
print correlation(entries, cum_entries)


0.0356485157722
-0.0266933483216
-0.229034323408
0.585895470766

In [894]:
import pandas as pd

# Examples of vectorized operations on DataFrames:
# Change False to True for each block of code to see what it does

# Adding DataFrames with the column names
if False:
    df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
    df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]})
    print df1 + df2


    a   b   c
0  11  44  77
1  22  55  88
2  33  66  99

In [896]:
# Adding DataFrames with overlapping column names 
if True:
    df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
    df2 = pd.DataFrame({'d': [10, 20, 30], 'c': [40, 50, 60], 'b': [70, 80, 90]})
    print df1 + df2


    a   b   c   d
0 NaN  74  47 NaN
1 NaN  85  58 NaN
2 NaN  96  69 NaN

In [897]:
# Adding DataFrames with overlapping row indexes
if True:
    df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
                       index=['row1', 'row2', 'row3'])
    df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]},
                       index=['row4', 'row3', 'row2'])
    print df1 + df2


       a   b   c
row1 NaN NaN NaN
row2  32  65  98
row3  23  56  89
row4 NaN NaN NaN

In [1283]:
# --- Quiz ---
# Cumulative entries and exits for one station for a few hours.
# Dataset 1
entries_and_exits = pd.DataFrame({
    'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
                 3144808, 3144895, 3144905, 3144941, 3145094],
    'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
               1088317, 1088328, 1088331, 1088420, 1088753]
})

# Dataset 2
# entries_and_exits = pd.DataFrame(
#     {'ENTRIESn': [0.0, 30.0, 20.0, 5.0, 20.0],
#      'EXITSn': [0.0, 10.0, 10.0, 40.0, 0.0]},
#     index=[0, 1, 2, 3, 4]
# )

def get_hourly_entries_and_exits(entries_and_exits):
    '''
    Fill in this function to take a DataFrame with cumulative entries
    and exits (entries in the first column, exits in the second) and
    return a DataFrame with hourly entries and exits (entries in the
    first column, exits in the second).
    '''
#     Instructor's solution
#     return (entries_and_exits - entries_and_exits.shift(1)).fillna(0)

    arr = entries_and_exits.copy()
    arr['EntryDelta'] = arr['ENTRIESn'].shift(1)
    arr['ExitDelta'] = arr['EXITSn'].shift(1)
    arr['ENTRIES1n'] = arr['ENTRIESn'] - arr['EntryDelta']
    arr['EXITS1n'] = arr['EXITSn'] - arr['ExitDelta']
    arr = arr.fillna(0)
    entries_and_exits['ENTRIESn'] =  arr['ENTRIES1n']
    entries_and_exits['EXITSn'] =  arr['EXITS1n']
    
    return entries_and_exits

get_hourly_entries_and_exits(entries_and_exits)


Out[1283]:
ENTRIESn EXITSn
0 0 0
1 23 8
2 18 18
3 71 54
4 170 44
5 214 42
6 87 11
7 10 3
8 36 89
9 153 333

In [974]:
import pandas as pd

# Change False to True for this block of code to see what it does

# DataFrame applymap()
if False:
    df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [10, 20, 30],
        'c': [5, 10, 15]
    })
    
    def add_one(x):
        return x + 1
        
    print df.applymap(add_one)
    
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)
    
def convert_grades(grades):
    '''
    Fill in this function to convert the given DataFrame of numerical
    grades to letter grades. Return a new DataFrame with the converted
    grade.
    
    The conversion rule is:
        90-100 -> A
        80-89  -> B
        70-79  -> C
        60-69  -> D
        0-59   -> F
    '''
    grades_ret = ''
    if grades >= 90:
        grades_ret = 'A'
    elif grades >= 80:
        grades_ret = 'B'
    elif grades >= 70:
        grades_ret = 'C'  
    elif grades >= 60:
        grades_ret = 'D'
    else:
        grades_ret = 'F'
        
    return grades_ret

grades_df.applymap(convert_grades)


Out[974]:
exam1 exam2
Andre F F
Barry B D
Chris C F
Dan C F
Emilio B D
Fred C F
Greta A C
Humbert D F
Ivan A C
James B D

In [990]:
import pandas as pd

grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

test1_df = pd.DataFrame(
    {0: [95, 85, 75, 65, 55], 1: [95, 85, 75, 65, 55]},
    index=[0, 1, 2, 3, 4]
)

# Change False to True for this block of code to see what it does

# DataFrame apply()
if False:
    def convert_grades_curve(exam_grades):
        # Pandas has a bult-in function that will perform this calculation
        # This will give the bottom 0% to 10% of students the grade 'F',
        # 10% to 20% the grade 'D', and so on. You can read more about
        # the qcut() function here:
        # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.qcut.html
        return pd.qcut(exam_grades,
                       [0, 0.1, 0.2, 0.5, 0.8, 1],
                       labels=['F', 'D', 'C', 'B', 'A'])
        
    # qcut() operates on a list, array, or Series. This is the
    # result of running the function on a single column of the
    # DataFrame.
    print convert_grades_curve(grades_df['exam1'])
    
    # qcut() does not work on DataFrames, but we can use apply()
    # to call the function on each column separately
    print grades_df.apply(convert_grades_curve)


def standardize_col(col):    
    return (col - col.mean())/ col.std()

def standardize(df):
    '''
    Fill in this function to standardize each column of the given
    DataFrame. To standardize a variable, convert each value to the
    number of standard deviations it is above or below the mean.
    '''
    return df.apply(standardize_col)

standardize(grades_df)


Out[990]:
exam1 exam2
Andre -2.196525 -2.186335
Barry 0.208891 0.366571
Chris 0.018990 -0.091643
Dan -0.170911 -0.091643
Emilio 0.715295 0.628408
Fred -0.487413 -0.418938
Greta 0.841896 1.413917
Humbert -0.803916 -0.746234
Ivan 1.284999 0.955703
James 0.588694 0.170194

In [1000]:
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'a': [4, 5, 3, 1, 2],
    'b': [20, 10, 40, 50, 30],
    'c': [25, 20, 5, 15, 10]
})

# Change False to True for this block of code to see what it does

# DataFrame apply() - use case 2
if False:   
    print df.apply(np.mean)
    print df.apply(np.max)

def second_largest_in_col(col):
    return col.drop(col.argmax()).max()

def second_largest(df):
    '''
    Fill in this function to return the second-largest value of each 
    column of the input DataFrame.
    '''
    return df.apply(second_largest_in_col)

second_largest(df)


Out[1000]:
a     4
b    40
c    20
dtype: int64

In [1002]:
import pandas as pd

# Change False to True for each block of code to see what it does

# Adding a Series to a square DataFrame
if True:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        0: [10, 20, 30, 40],
        1: [50, 60, 70, 80],
        2: [90, 100, 110, 120],
        3: [130, 140, 150, 160]
    })
    
    print df
    print '' # Create a blank line between outputs
    print df + s


    0   1    2    3
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160

    0   1    2    3
0  11  52   93  134
1  21  62  103  144
2  31  72  113  154
3  41  82  123  164

In [1003]:
# Adding a Series to a one-row DataFrame 
if True:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})
    
    print df
    print '' # Create a blank line between outputs
    print df + s


    0   1   2   3
0  10  20  30  40

    0   1   2   3
0  11  22  33  44

In [1004]:
# Adding a Series to a one-column DataFrame
if True:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({0: [10, 20, 30, 40]})
    
    print df
    print '' # Create a blank line between outputs
    print df + s


    0
0  10
1  20
2  30
3  40

    0   1   2   3
0  11 NaN NaN NaN
1  21 NaN NaN NaN
2  31 NaN NaN NaN
3  41 NaN NaN NaN

In [1005]:
# Adding when DataFrame column names match Series index
if True:
    s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    df = pd.DataFrame({
        'a': [10, 20, 30, 40],
        'b': [50, 60, 70, 80],
        'c': [90, 100, 110, 120],
        'd': [130, 140, 150, 160]
    })
    
    print df
    print '' # Create a blank line between outputs
    print df + s


    a   b    c    d
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160

    a   b    c    d
0  11  52   93  134
1  21  62  103  144
2  31  72  113  154
3  41  82  123  164

In [1006]:
# Adding when DataFrame column names don't match Series index
if True:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        'a': [10, 20, 30, 40],
        'b': [50, 60, 70, 80],
        'c': [90, 100, 110, 120],
        'd': [130, 140, 150, 160]
    })
    
    print df
    print '' # Create a blank line between outputs
    print df + s


    a   b    c    d
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160

    0   1   2   3   a   b   c   d
0 NaN NaN NaN NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN NaN NaN

In [1007]:
import pandas as pd

# Adding using +
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        0: [10, 20, 30, 40],
        1: [50, 60, 70, 80],
        2: [90, 100, 110, 120],
        3: [130, 140, 150, 160]
    })
    
    print df
    print '' # Create a blank line between outputs
    print df + s
    
# Adding with axis='index'
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        0: [10, 20, 30, 40],
        1: [50, 60, 70, 80],
        2: [90, 100, 110, 120],
        3: [130, 140, 150, 160]
    })
    
    print df
    print '' # Create a blank line between outputs
    print df.add(s, axis='index')
    # The functions sub(), mul(), and div() work similarly to add()
    
# Adding with axis='columns'
if True:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        0: [10, 20, 30, 40],
        1: [50, 60, 70, 80],
        2: [90, 100, 110, 120],
        3: [130, 140, 150, 160]
    })
    
    print df
    print '' # Create a blank line between outputs
    print df.add(s, axis='columns')
    # The functions sub(), mul(), and div() work similarly to add()


    0   1    2    3
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160

    0   1    2    3
0  11  52   93  134
1  21  62  103  144
2  31  72  113  154
3  41  82  123  164

In [1043]:
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

def standardize(df):
    '''
    Fill in this function to standardize each column of the given
    DataFrame. To standardize a variable, convert each value to the
    number of standard deviations it is above or below the mean.
    
    This time, try to use vectorized operations instead of apply().
    You should get the same results as you did before.
    '''
    for exam in df.columns:
          df[exam] = (df[exam] - df[exam].mean())/df[exam].std(ddof=0)
    return df

#     Instructor solution
#     return (df - df.mean())/df.std(ddof=0)

standardize(grades_df)


Out[1043]:
exam1 exam2
Andre -2.315341 -2.304599
Barry 0.220191 0.386400
Chris 0.020017 -0.096600
Dan -0.180156 -0.096600
Emilio 0.753987 0.662400
Fred -0.513779 -0.441600
Greta 0.887436 1.490400
Humbert -0.847401 -0.786600
Ivan 1.354508 1.007400
James 0.620538 0.179400

In [1076]:
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

def standardize_rows(df):
    '''
    Optional: Fill in this function to standardize each row of the given
    DataFrame. Again, try not to use apply().
    
    This one is more challenging than standardizing each column!
    '''
    last = len(df.index)
    for index in range(0,last):
        df.iloc[index] = (df.iloc[index] - df.iloc[index].mean())/df.iloc[index].std(ddof=0)
    return df

#     Instructor's Solution
#     return df.sub(df.mean(axis='columns'), axis='index').div(df.std(axis='columns', ddof=0), axis='index')

standardize_rows(grades_df)


Out[1076]:
exam1 exam2
Andre 1 -1
Barry 1 -1
Chris 1 -1
Dan 1 -1
Emilio 1 -1
Fred 1 -1
Greta 1 -1
Humbert 1 -1
Ivan 1 -1
James 1 -1

In [1085]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame({
    'value': values,
    'even': values % 2 == 0,
    'above_three': values > 3 
}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])

# Change False to True for each block of code to see what it does

# Examine DataFrame
if False:
    print example_df
    
# Examine groups
if True:
    grouped_data = example_df.groupby('even')
    # The groups attribute is a dictionary mapping keys to lists of row indexes
    print grouped_data.groups
    
# Group by multiple columns
if False:
    grouped_data = example_df.groupby(['even', 'above_three'])
    print grouped_data.groups
    
# Get sum of each group
if False:
    grouped_data = example_df.groupby('even')
    print grouped_data.sum()
    
# Limit columns in result
if True:
    grouped_data = example_df.groupby('even')
    
    # You can take one or more columns from the result DataFrame
    print grouped_data.sum()['value']
    
    print '\n' # Blank line to separate results
    
    # You can also take a subset of columns from the grouped data before 
    # collapsing to a DataFrame. In this case, the result is the same.
    print grouped_data['value'].sum()


{False: ['a', 'b', 'e'], True: ['c', 'd', 'f', 'g']}
even
False     5
True     16
Name: value, dtype: int64


even
False     5
True     16
Name: value, dtype: int64

In [1272]:
filename = 'nyc_subway_weather.csv'
subway_df = pd.read_csv(filename)

### Write code here to group the subway data by a variable of your choice, then
### either print out the mean ridership within each group or create a plot.

# Extracting some interesting columns
summary = subway_df.groupby('UNIT').mean()
mean_entries = summary['ENTRIESn']
mean_entries_hourly = summary['ENTRIESn_hourly']
mean_exits = summary['EXITSn']
mean_exits_hourly = summary['EXITSn_hourly']
mean_rain = summary['rain']
mean_temperature = summary['tempi']
mean_fog = summary['fog']


mean_throughput = entries.add(exits)/2
throughput_dev = throughput.std(ddof=0)
mean_throughput_std = (throughput - throughput.mean())/ throughput_dev 

# Getting some high traffic stations
# Here a station is considered high traffic if \
# throughput is > 2 devs away
high_traffic_stations = mean_throughput.loc[mean_throughput > 2*throughput_dev].index
high_traffic_throughput = mean_entries[mean_entries.index.isin(high_traffic_stations)]
high_traffic_rain = mean_rain[mean_rain.index.isin(high_traffic_stations)]
high_traffic_temperature = mean_temperature[mean_temperature.index.isin(high_traffic_stations)]
high_traffic_fog = mean_fog[mean_fog.index.isin(high_traffic_stations)]
print len(high_traffic_stations)
dataframe = pd.concat([high_traffic_throughput, high_traffic_rain, high_traffic_temperature, high_traffic_fog], axis=1).reset_index()


26

In [ ]:


In [1273]:
# Here I'm going to try and show how correlated some\
# factors are for high traffic subway stations.
sns.set(style="dark")
corr = dataframe.corr()
corr.columns = ['Traffic', 'Rain', 'Temperature', 'Fog']
corr.index = ['Traffic', 'Rain', 'Temperature', 'Fog']
cmap = sns.diverging_palette(9, 145, as_cmap=True)

fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_title('NYC Subway Correlation Matrix', fontsize=16)

sns.heatmap(corr, cmap=cmap, vmax=.3, square=True, ax=ax)


Out[1273]:
<matplotlib.axes._subplots.AxesSubplot at 0x12a1c4dd0>

In [1274]:
import numpy as np
import pandas as pd

values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame({
    'value': values,
    'even': values % 2 == 0,
    'above_three': values > 3 
}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])

# Change False to True for each block of code to see what it does

# Standardize each group
if False:
    def standardize(xs):
        return (xs - xs.mean()) / xs.std()
    grouped_data = example_df.groupby('even')
    print grouped_data['value'].apply(standardize)
    
# Find second largest value in each group
if False:
    def second_largest(xs):
        sorted_xs = xs.sort(inplace=False, ascending=False)
        return sorted_xs.iloc[1]
    grouped_data = example_df.groupby('even')
    print grouped_data['value'].apply(second_largest)

In [1292]:
# --- Quiz ---
# DataFrame with cumulative entries and exits for multiple stations
ridership_df = pd.DataFrame({
    'UNIT': ['R051', 'R079', 'R051', 'R079', 'R051', 'R079', 'R051', 'R079', 'R051'],
    'TIMEn': ['00:00:00', '02:00:00', '04:00:00', '06:00:00', '08:00:00', '10:00:00', '12:00:00', '14:00:00', '16:00:00'],
    'ENTRIESn': [3144312, 8936644, 3144335, 8936658, 3144353, 8936687, 3144424, 8936819, 3144594],
    'EXITSn': [1088151, 13755385,  1088159, 13755393,  1088177, 13755598, 1088231, 13756191,  1088275]
})

def hourly_for_group(entries_and_exits):
    '''
    Fill in this function to take a DataFrame with cumulative entries
    and exits and return a DataFrame with hourly entries and exits.
    The hourly entries and exits should be calculated separately for
    each station (the 'UNIT' column).
    
    Hint: Take a look at the `get_hourly_entries_and_exits()` function
    you wrote in a previous quiz, DataFrame Vectorized Operations. If
    you copy it here and rename it, you can use it and the `.apply()`
    function to help solve this problem.
    '''
    print entries_and_exits
    print entries_and_exits.groupby('UNIT')[['ENTRIESn','EXITSn']].apply(get_hourly_entries_and_exits)
    return None

hourly_for_group(ridership_df)


   ENTRIESn    EXITSn     TIMEn  UNIT
0   3144312   1088151  00:00:00  R051
1   8936644  13755385  02:00:00  R079
2   3144335   1088159  04:00:00  R051
3   8936658  13755393  06:00:00  R079
4   3144353   1088177  08:00:00  R051
5   8936687  13755598  10:00:00  R079
6   3144424   1088231  12:00:00  R051
7   8936819  13756191  14:00:00  R079
8   3144594   1088275  16:00:00  R051
   ENTRIESn  EXITSn
0         0       0
1         0       0
2        23       8
3        14       8
4        18      18
5        29     205
6        71      54
7       132     593
8       170      44

In [1309]:
import pandas as pd

subway_df = pd.DataFrame({
    'UNIT': ['R003', 'R003', 'R003', 'R003', 'R003', 'R004', 'R004', 'R004',
             'R004', 'R004'],
    'DATEn': ['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
              '05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11'],
    'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'ENTRIESn': [ 4388333,  4388348,  4389885,  4391507,  4393043, 14656120,
                 14656174, 14660126, 14664247, 14668301],
    'EXITSn': [ 2911002,  2911036,  2912127,  2913223,  2914284, 14451774,
               14451851, 14454734, 14457780, 14460818],
    'latitude': [ 40.689945,  40.689945,  40.689945,  40.689945,  40.689945,
                  40.69132 ,  40.69132 ,  40.69132 ,  40.69132 ,  40.69132 ],
    'longitude': [-73.872564, -73.872564, -73.872564, -73.872564, -73.872564,
                  -73.867135, -73.867135, -73.867135, -73.867135, -73.867135]
})

weather_df = pd.DataFrame({
    'DATEn': ['05-01-11', '05-01-11', '05-02-11', '05-02-11', '05-03-11',
              '05-03-11', '05-04-11', '05-04-11', '05-05-11', '05-05-11'],
    'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'latitude': [ 40.689945,  40.69132 ,  40.689945,  40.69132 ,  40.689945,
                  40.69132 ,  40.689945,  40.69132 ,  40.689945,  40.69132 ],
    'longitude': [-73.872564, -73.867135, -73.872564, -73.867135, -73.872564,
                  -73.867135, -73.872564, -73.867135, -73.872564, -73.867135],
    'pressurei': [ 30.24,  30.24,  30.32,  30.32,  30.14,  30.14,  29.98,  29.98,
                   30.01,  30.01],
    'fog': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'rain': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'tempi': [ 52. ,  52. ,  48.9,  48.9,  54. ,  54. ,  57.2,  57.2,  48.9,  48.9],
    'wspdi': [  8.1,   8.1,   6.9,   6.9,   3.5,   3.5,  15. ,  15. ,  15. ,  15. ]
})

def combine_dfs(subway_df, weather_df):
    '''
    Fill in this function to take 2 DataFrames, one with subway data and one with weather data,
    and return a single dataframe with one row for each date, hour, and location. Only include
    times and locations that have both subway data and weather data available.
    '''
    return subway_df.merge(weather_df, on=['DATEn','latitude','longitude'], how='left')

combine_dfs(subway_df, weather_df)


Out[1309]:
DATEn ENTRIESn EXITSn UNIT hour_x latitude longitude fog hour_y pressurei rain tempi wspdi
0 05-01-11 4388333 2911002 R003 0 40.689945 -73.872564 0 0 30.24 0 52.0 8.1
1 05-02-11 4388348 2911036 R003 0 40.689945 -73.872564 0 0 30.32 0 48.9 6.9
2 05-03-11 4389885 2912127 R003 0 40.689945 -73.872564 0 0 30.14 0 54.0 3.5
3 05-04-11 4391507 2913223 R003 0 40.689945 -73.872564 0 0 29.98 0 57.2 15.0
4 05-05-11 4393043 2914284 R003 0 40.689945 -73.872564 0 0 30.01 0 48.9 15.0
5 05-01-11 14656120 14451774 R004 0 40.691320 -73.867135 0 0 30.24 0 52.0 8.1
6 05-02-11 14656174 14451851 R004 0 40.691320 -73.867135 0 0 30.32 0 48.9 6.9
7 05-03-11 14660126 14454734 R004 0 40.691320 -73.867135 0 0 30.14 0 54.0 3.5
8 05-04-11 14664247 14457780 R004 0 40.691320 -73.867135 0 0 29.98 0 57.2 15.0
9 05-05-11 14668301 14460818 R004 0 40.691320 -73.867135 0 0 30.01 0 48.9 15.0

In [1317]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

filename = 'nyc_subway_weather.csv'
subway_df = pd.read_csv(filename)

values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame({
    'value': values,
    'even': values % 2 == 0,
    'above_three': values > 3 
}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])

# Change False to True for this block of code to see what it does

# groupby() without as_index
if False:
    first_even = example_df.groupby('even').first()
    print first_even
    print first_even['even'] # Causes an error. 'even' is no longer a column in the DataFrame
    
# groupby() with as_index=False
if True:
    first_even = example_df.groupby('even', as_index=False).first()
    print first_even
    print first_even['even'] # Now 'even' is still a column in the DataFrame


    even above_three  value
0  False       False      1
1   True       False      2
0    False
1     True
Name: even, dtype: bool

In [1318]:
filename = 'nyc_subway_weather.csv'
subway_df = pd.read_csv(filename)

## Make a plot of your choice here showing something interesting about the subway data.
## Matplotlib documentation here: http://matplotlib.org/api/pyplot_api.html
## Once you've got something you're happy with, share it on the forums!


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-1318-a74f3f860ef0> in <module>()
      1 filename = '/datasets/ud170/subway/nyc_subway_weather.csv'
----> 2 subway_df = pd.read_csv(filename)
      3 
      4 ## Make a plot of your choice here showing something interesting about the subway data.
      5 ## Matplotlib documentation here: http://matplotlib.org/api/pyplot_api.html

/Users/Vinay/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, dialect, compression, doublequote, escapechar, quotechar, quoting, skipinitialspace, lineterminator, header, index_col, names, prefix, skiprows, skipfooter, skip_footer, na_values, true_values, false_values, delimiter, converters, dtype, usecols, engine, delim_whitespace, as_recarray, na_filter, compact_ints, use_unsigned, low_memory, buffer_lines, warn_bad_lines, error_bad_lines, keep_default_na, thousands, comment, decimal, parse_dates, keep_date_col, dayfirst, date_parser, memory_map, float_precision, nrows, iterator, chunksize, verbose, encoding, squeeze, mangle_dupe_cols, tupleize_cols, infer_datetime_format, skip_blank_lines)
    489                     skip_blank_lines=skip_blank_lines)
    490 
--> 491         return _read(filepath_or_buffer, kwds)
    492 
    493     parser_f.__name__ = name

/Users/Vinay/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
    266 
    267     # Create the parser.
--> 268     parser = TextFileReader(filepath_or_buffer, **kwds)
    269 
    270     if (nrows is not None) and (chunksize is not None):

/Users/Vinay/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in __init__(self, f, engine, **kwds)
    581             self.options['has_index_names'] = kwds['has_index_names']
    582 
--> 583         self._make_engine(self.engine)
    584 
    585     def _get_options_with_defaults(self, engine):

/Users/Vinay/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in _make_engine(self, engine)
    722     def _make_engine(self, engine='c'):
    723         if engine == 'c':
--> 724             self._engine = CParserWrapper(self.f, **self.options)
    725         else:
    726             if engine == 'python':

/Users/Vinay/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in __init__(self, src, **kwds)
   1091         kwds['allow_leading_cols'] = self.index_col is not False
   1092 
-> 1093         self._reader = _parser.TextReader(src, **kwds)
   1094 
   1095         # XXX

pandas/parser.pyx in pandas.parser.TextReader.__cinit__ (pandas/parser.c:3229)()

pandas/parser.pyx in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:6042)()

IOError: File /datasets/ud170/subway/nyc_subway_weather.csv does not exist

In [ ]: