Load Data from CSVs



In [1]:

    
import unicodecsv

# The file is opened in binary mode and handed to unicodecsv for parsing.
with open('enrollments.csv', 'rb') as f:
    reader = unicodecsv.DictReader(f)
    # Materialise every row (one dict per CSV line) while the file is still open.
    enrollments = [row for row in reader]



In [2]:

    
#####################################
#                 1                 #
#####################################

## Read in the data from daily_engagement.csv and project_submissions.csv 
## and store the results in the below variables.
## Then look at the first row of each table.

def read_csv(filename):
    """Read the CSV file at `filename` and return its rows as a list.

    Each row is whatever unicodecsv.DictReader yields for a line (a dict keyed
    by the header row). The file is opened in binary mode and closed on return.
    """
    with open(filename, 'rb') as csv_file:
        rows = list(unicodecsv.DictReader(csv_file))
    return rows
    
enrollments = read_csv('enrollments.csv')
daily_engagement = read_csv('daily_engagement.csv')
project_submissions = read_csv('project_submissions.csv')

print enrollments[0]
print daily_engagement[0]
print project_submissions[0]









    



{u'status': u'canceled', u'is_udacity': u'True', u'is_canceled': u'True', u'join_date': u'2014-11-10', u'account_key': u'448', u'cancel_date': u'2015-01-14', u'days_to_cancel': u'65'}
{u'lessons_completed': u'0.0', u'num_courses_visited': u'1.0', u'total_minutes_visited': u'11.6793745', u'projects_completed': u'0.0', u'acct': u'0', u'utc_date': u'2015-01-09'}
{u'lesson_key': u'3176718735', u'processing_state': u'EVALUATED', u'account_key': u'256', u'assigned_rating': u'UNGRADED', u'completion_date': u'2015-01-16', u'creation_date': u'2015-01-14'}

Problems in the Data



In [3]:

    
#####################################
#                 3                 #
#####################################

## Rename the "acct" column in the daily_engagement table to "account_key".
for engagement in daily_engagement:
    engagement['account_key'] = engagement['acct']
    del engagement['acct']
    
print daily_engagement[0]['account_key']

Fixing Data Types



In [4]:

    
from datetime import datetime as dt

def parse_date(date):
    """Convert a 'YYYY-MM-DD' string into a datetime object.

    The empty string (no date given) maps to None.
    """
    return dt.strptime(date, '%Y-%m-%d') if date != '' else None
    
def parse_maybe_int(i):
    """Convert a string holding an integer into an int.

    The empty string maps to None.
    """
    if i == '':
        return None
    return int(i)



In [5]:

    
# Convert the string fields of every enrollment row to proper Python types.
# All new values are computed from the raw strings first, then written back together.
for enrollment in enrollments:
    enrollment.update({
        'cancel_date': parse_date(enrollment['cancel_date']),
        'days_to_cancel': parse_maybe_int(enrollment['days_to_cancel']),
        'is_canceled': enrollment['is_canceled'] == 'True',
        'is_udacity': enrollment['is_udacity'] == 'True',
        'join_date': parse_date(enrollment['join_date']),
    })

enrollments[0]









    Out[5]:





{u'account_key': u'448',
 u'cancel_date': datetime.datetime(2015, 1, 14, 0, 0),
 u'days_to_cancel': 65,
 u'is_canceled': True,
 u'is_udacity': True,
 u'join_date': datetime.datetime(2014, 11, 10, 0, 0),
 u'status': u'canceled'}



In [6]:

    
# Convert the string fields of every engagement row to proper Python types.
for engagement_record in daily_engagement:
    # The count columns arrive as float-formatted text (e.g. '1.0'), so they
    # are parsed as float first and then truncated to int.
    for count_field in ('lessons_completed', 'num_courses_visited', 'projects_completed'):
        engagement_record[count_field] = int(float(engagement_record[count_field]))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])

daily_engagement[0]









    Out[6]:





{'account_key': u'0',
 u'lessons_completed': 0,
 u'num_courses_visited': 1,
 u'projects_completed': 0,
 u'total_minutes_visited': 11.6793745,
 u'utc_date': datetime.datetime(2015, 1, 9, 0, 0)}



In [7]:

    
# Parse both date columns of every submission row into datetime objects (or None).
for submission in project_submissions:
    for date_field in ('completion_date', 'creation_date'):
        submission[date_field] = parse_date(submission[date_field])

project_submissions[0]









    Out[7]:





{u'account_key': u'256',
 u'assigned_rating': u'UNGRADED',
 u'completion_date': datetime.datetime(2015, 1, 16, 0, 0),
 u'creation_date': datetime.datetime(2015, 1, 14, 0, 0),
 u'lesson_key': u'3176718735',
 u'processing_state': u'EVALUATED'}

Investigating the Data



In [8]:

    
#####################################
#                 2                 #
#####################################

## number of rows and the number of unique students (account keys)
enrollment_keys = [enrollment['account_key'] for enrollment in enrollments]
enrollment_num_rows = len(enrollments)             
enrollment_num_unique_students = len(set(enrollment_keys))  

engagement_keys = [engagement['account_key'] for engagement in daily_engagement]
engagement_num_rows = len(daily_engagement)             
engagement_num_unique_students = len(set(engagement_keys))  

submission_keys = [submission['account_key'] for submission in project_submissions]
submission_num_rows = len(project_submissions)             
submission_num_unique_students = len(set(submission_keys))

print enrollment_num_rows, enrollment_num_unique_students
print engagement_num_rows, engagement_num_unique_students
print submission_num_rows, submission_num_unique_students









    



1640 1302
136240 1237
3642 743



In [9]:

    
# alternate solution is to write a function to get unique students in each case
def get_unique_students(data):
    """Return the set of distinct 'account_key' values found in `data`.

    `data` is an iterable of dict-like records, each with an 'account_key' entry.
    """
    return set(record['account_key'] for record in data)



In [10]:

    
enrollment_num_rows = len(enrollments)             
enrollment_num_unique_students = len(get_unique_students(enrollments))  

engagement_num_rows = len(daily_engagement)             
engagement_num_unique_students = len(get_unique_students(daily_engagement))  

submission_num_rows = len(project_submissions)             
submission_num_unique_students = len(get_unique_students(project_submissions))

print enrollment_num_rows, enrollment_num_unique_students
print engagement_num_rows, engagement_num_unique_students
print submission_num_rows, submission_num_unique_students









    



1640 1302
136240 1237
3642 743

Missing Engagement Records



In [11]:

    
#####################################
#                 4                 #
#####################################

## Find any one student enrollment where the student is missing from the daily engagement table.
## Output that enrollment.
enrollment_keys = get_unique_students(enrollments)
engagement_keys = get_unique_students(daily_engagement)

# these are the account keys not appearing in engagement
faulty_keys = enrollment_keys^engagement_keys

# print enrollment data of student not in engagement

# first analysis
bad_enrollments = []
for enrollment in enrollments:
    if enrollment['account_key'] in faulty_keys:
        bad_enrollments.append(enrollment)
print len(bad_enrollments)

Checking for More Problem Records



In [12]:

    
#####################################
#                 5                 #
#####################################

## Find the number of surprising data points (enrollments missing from
## the engagement table) that remain, if any.
# check for number of days enrolled
bad_enrollments = []
for enrollment in enrollments:
    if enrollment['account_key'] in faulty_keys and enrollment['days_to_cancel'] != 0:
        bad_enrollments.append(enrollment)
print bad_enrollments









    



[{u'status': u'canceled', u'is_udacity': True, u'is_canceled': True, u'join_date': datetime.datetime(2015, 1, 10, 0, 0), u'account_key': u'1304', u'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), u'days_to_cancel': 59}, {u'status': u'canceled', u'is_udacity': True, u'is_canceled': True, u'join_date': datetime.datetime(2015, 3, 10, 0, 0), u'account_key': u'1304', u'cancel_date': datetime.datetime(2015, 6, 17, 0, 0), u'days_to_cancel': 99}, {u'status': u'current', u'is_udacity': True, u'is_canceled': False, u'join_date': datetime.datetime(2015, 2, 25, 0, 0), u'account_key': u'1101', u'cancel_date': None, u'days_to_cancel': None}]

Tracking Down the Remaining Problems



In [13]:

    
# Account keys of every enrollment flagged as a Udacity test account.
udacity_test_accounts = set(
    enrollment['account_key'] for enrollment in enrollments if enrollment['is_udacity']
)
len(udacity_test_accounts)









    Out[13]:





6



In [14]:

    
def remove_udacity_accounts(data):
    """Return the records of `data` that do not belong to a Udacity test account.

    A record is dropped when its 'account_key' is in the module-level
    `udacity_test_accounts` set. `data` is not modified; a new list is returned.
    """
    return [record for record in data
            if record['account_key'] not in udacity_test_accounts]



In [15]:

    
# Remove Udacity test accounts from all three tables
non_udacity_enrollments = remove_udacity_accounts(enrollments)
non_udacity_engagement = remove_udacity_accounts(daily_engagement)
non_udacity_submissions = remove_udacity_accounts(project_submissions)

print len(non_udacity_enrollments)
print len(non_udacity_engagement)
print len(non_udacity_submissions)

Refining the Question



In [16]:

    
#####################################
#                 6                 #
#####################################

## Create a dictionary named paid_students containing all students who either
## haven't canceled yet or who remained enrolled for more than 7 days. The keys
## should be account keys, and the values should be the date the student enrolled.

paid_students = {}

for enrollment in non_udacity_enrollments:
    if enrollment['days_to_cancel'] == None or enrollment['days_to_cancel'] > 7:
        join_date = enrollment['join_date']
        account_key = enrollment['account_key']
        if account_key not in paid_students or paid_students[account_key] < join_date:            
            paid_students[account_key] = join_date
        
print len(paid_students.keys())

Getting Data from First Week



In [17]:

    
def within_one_week(join_date, engagement_date):
    """Return True if `engagement_date` is within one week after `join_date`.

    Only the whole-day part of the difference is considered: the result is True
    when it lies between 0 and 6 inclusive, so dates before `join_date` are
    rejected.
    """
    days_since_join = (engagement_date - join_date).days
    return 0 <= days_since_join < 7



In [47]:

    
#####################################
#                 7                 #
#####################################

## Create a list of rows from the engagement table including only rows where
## the student is one of the paid students you just found, and the date is within
## one week of the student's join date.

paid_engagement_in_first_week = []

for engagement in non_udacity_engagement:
    account_key = engagement['account_key']
    engagement_date = engagement['utc_date']
    if account_key in paid_students and within_one_week(paid_students[account_key], engagement_date):
        paid_engagement_in_first_week.append(engagement)
        
        
paid_submissions_in_first_week = []

for submission in non_udacity_submissions:
    account_key = submission['account_key']
    completion_date = submission['completion_date']
    if account_key in paid_students and completion_date != None:
        paid_submissions_in_first_week.append(submission)
        
print len(paid_submissions_in_first_week)

Exploring Student Engagement



In [32]:

    
from collections import defaultdict

# Bucket the first-week engagement records per student:
# account key -> list of that student's engagement records.
engagement_by_account = defaultdict(list)
for engagement_record in paid_engagement_in_first_week:
    engagement_by_account[engagement_record['account_key']].append(engagement_record)



In [33]:

    
# Total minutes each student spent in the classroom during the first week:
# account key -> sum of 'total_minutes_visited' over that student's records.
total_minutes_by_account = {
    account_key: sum(record['total_minutes_visited'] for record in engagement_for_student)
    for account_key, engagement_for_student in engagement_by_account.items()
}



In [34]:

    
import numpy as np

# Summarize the data about minutes spent in the classroom
total_minutes = total_minutes_by_account.values()
print 'Mean:', np.mean(total_minutes)
print 'Standard deviation:', np.std(total_minutes)
print 'Minimum:', np.min(total_minutes)
print 'Maximum:', np.max(total_minutes)









    



Mean: 306.708326753
Standard deviation: 412.996933409
Minimum: 0.0
Maximum: 3564.7332645

Debugging Data Analysis Code



In [35]:

    
#####################################
#                 8                 #
#####################################

## Go through a similar process as before to see if there is a problem.
## Locate at least one surprising piece of data, output it, and take a look at it.

minutes_in_week = 7*24*60

faulty_accounts = {}
for account_key, total_minutes in total_minutes_by_account.items():
    if total_minutes > minutes_in_week:
        faulty_accounts[account_key] = engagement_by_account[account_key]
        print total_minutes

Lessons Completed in First Week



In [36]:

    
#####################################
#                 9                 #
#####################################

## Adapt the code above to find the mean, standard deviation, minimum, and maximum for
## the number of lessons completed by each student during the first week. Try creating
## one or more functions to re-use the code above.
total_lessons_by_student = {}
for account_key, engagement_for_student in engagement_by_account.items():
    total_lessons = 0
    for engagement_record in engagement_for_student:
        total_lessons += engagement_record['lessons_completed']
    total_lessons_by_student[account_key] = total_lessons
    
total_lessons = total_lessons_by_student.values()
print 'Mean:', np.mean(total_lessons)
print 'Standard deviation:', np.std(total_lessons)
print 'Minimum:', np.min(total_lessons)
print 'Maximum:', np.max(total_lessons)









    



Mean: 1.63618090452
Standard deviation: 3.00256129983
Minimum: 0
Maximum: 36



In [79]:

    
# Course solution to the problem
from collections import defaultdict

def group_data(data, key_name):
    """Group the records in `data` by the value each one holds under `key_name`.

    Returns a defaultdict(list) mapping every distinct value to the list of
    records carrying it, in the order they appear in `data`.
    """
    grouped_data = defaultdict(list)
    for record in data:
        grouped_data[record[key_name]].append(record)
    return grouped_data

def sum_grouped_items(grouped_data, field_name):
    """Sum `field_name` over the records of every group.

    `grouped_data` maps a key to a list of records. The result is a plain dict
    mapping each key to the sum of record[field_name] over its records; a
    group with no records sums to 0.
    """
    return {
        key: sum(record[field_name] for record in records)
        for key, records in grouped_data.items()
    }

import numpy as np
import matplotlib.pyplot as plt

def describe_data(data,bins=7):
    print 'Mean:', np.mean(data)
    print 'Standard deviation:', np.std(data)
    print 'Minimum:', np.min(data)
    print 'Maximum:', np.max(data)
    plt.hist(data, bins=bins)
    plt.xlabel("Metric of evaluation")
    plt.ylabel("Number of students")
    plt.title("Histogram")
    plt.show()

Number of Visits in First Week



In [38]:

    
######################################
#                 10                 #
######################################

## Find the mean, standard deviation, minimum, and maximum for the number of
## days each student visits the classroom during the first week.
# Regroup the first-week engagement rows per student with the shared helper.
engagement_by_account = group_data(paid_engagement_in_first_week, 'account_key')
# Empty placeholder kept from the original cell; it is not read anywhere in
# the visible notebook.
num_courses_visited = dict()

def sum_days_visited(grouped_data):
    """Count, per group, the records with at least one course visited.

    `grouped_data` maps a key to a list of records. The result is a plain dict
    mapping each key to the number of its records whose 'num_courses_visited'
    is greater than 0; a group with no such records counts as 0.
    """
    field_name = 'num_courses_visited'
    return {
        key: sum(1 for record in records if record[field_name] > 0)
        for key, records in grouped_data.items()
    }

# Days visited per student in the first week, then the summary and histogram.
activity_by_account = sum_days_visited(engagement_by_account)
days_visited_per_student = activity_by_account.values()
describe_data(days_visited_per_student)









    



Mean: 2.86733668342
Standard deviation: 2.25519800292
Minimum: 0
Maximum: 7

Splitting out Passing Students



In [53]:

    
######################################
#                 11                 #
######################################

## Create two lists of engagement data for paid students in the first week.
## The first list should contain data for students who eventually pass the
## subway project, and the second list should contain data for students
## who do not.
subway_project_lesson_keys = ['746169184', '3176718735']

passing_students = defaultdict(list)
for submission in paid_submissions_in_first_week:
    if submission['lesson_key'] in subway_project_lesson_keys and submission['assigned_rating'] in ['DISTINCTION','PASSED']:
            passing_students[submission['account_key']].append(submission)

passing_engagement = []
non_passing_engagement = []

for engagement in paid_engagement_in_first_week:
    if engagement['account_key'] in passing_students.keys():
        passing_engagement.append(engagement)
    else:
        non_passing_engagement.append(engagement)
            
print len(passing_engagement), len(non_passing_engagement)

Comparing the Two Student Groups



In [72]:

    
######################################
#                 12                 #
######################################

## Compute some metrics you're interested in and see how they differ for
## students who pass the subway project vs. students who don't. A good
## starting point would be the metrics we looked at earlier (minutes spent
## in the classroom, lessons completed, and days visited).
interest_key = 'days_visited'

passing_students = group_data(passing_engagement, 'account_key')
if interest_key == 'days_visited':
    passing_students_minutes = sum_days_visited(passing_students)
else:
    passing_students_minutes = sum_grouped_items(passing_students, interest_key)
print 'Passing: '
describe_data(passing_students_minutes.values())

non_passing_students = group_data(non_passing_engagement, 'account_key')
if interest_key == 'days_visited':
    non_passing_students_minutes = sum_days_visited(non_passing_students)
else:
    non_passing_students_minutes = sum_grouped_items(non_passing_students, interest_key)
print 'Non passing: '
describe_data(non_passing_students_minutes.values())









    



Passing: 
Mean: 3.38485316847
Standard deviation: 2.25882147092
Minimum: 0
Maximum: 7
Non passing: 
Mean: 1.90517241379
Standard deviation: 1.90573144136
Minimum: 0
Maximum: 7

Making Histograms



In [81]:

    
######################################
#                 13                 #
######################################

## Make histograms of the three metrics we looked at earlier for both
## students who passed the subway project and students who didn't. You
## might also want to make histograms of any other metrics you examined.
# %pylab inline
import seaborn as sns
# import matplotlib.pyplot as plt

# Same summary and histogram for both groups (passing first), using 8 bins.
for metric_totals in (passing_students_minutes, non_passing_students_minutes):
    describe_data(metric_totals.values(), bins=8)









    



Mean: 3.38485316847
Standard deviation: 2.25882147092
Minimum: 0
Maximum: 7






    












    



Mean: 1.90517241379
Standard deviation: 1.90573144136
Minimum: 0
Maximum: 7



In [ ]:

    
######################################
#                 14                 #
######################################

## Make a more polished version of at least one of your visualizations
## from earlier. Try importing the seaborn library to make the visualization
## look better, adding axis labels and a title, and changing one or more
## arguments to the hist() function.

Load Data from CSVs

Problems in the Data

Fixing Data Types

Investigating the Data

Missing Engagement Records

Checking for More Problem Records

Tracking Down the Remaining Problems

Refining the Question

Getting Data from First Week

Exploring Student Engagement

Debugging Data Analysis Code

Lessons Completed in First Week

Number of Visits in First Week

Splitting out Passing Students

Comparing the Two Student Groups

Making Histograms

Improving Plots and Sharing Findings