In [1]:
import unicodecsv
# Read every row of enrollments.csv into a list; each row is the mapping
# produced by unicodecsv.DictReader. The file is opened in binary mode.
with open('enrollments.csv', 'rb') as enrollments_file:
    enrollments = list(unicodecsv.DictReader(enrollments_file))
In [2]:
#####################################
# 1 #
#####################################
## Read in the data from daily_engagement.csv and project_submissions.csv
## and store the results in the below variables.
## Then look at the first row of each table.
def read_csv(filename):
    """Read the CSV file at `filename` and return its rows as a list.

    The file is opened in binary mode and handed to unicodecsv.DictReader;
    every row the reader yields is collected before the file is closed.
    """
    with open(filename, 'rb') as csv_file:
        rows = list(unicodecsv.DictReader(csv_file))
    return rows
enrollments = read_csv('enrollments.csv')
daily_engagement = read_csv('daily_engagement.csv')
project_submissions = read_csv('project_submissions.csv')
print enrollments[0]
print daily_engagement[0]
print project_submissions[0]
In [3]:
#####################################
# 3 #
#####################################
## Rename the "acct" column in the daily_engagement table to "account_key".
for engagement in daily_engagement:
engagement['account_key'] = engagement['acct']
del engagement['acct']
print daily_engagement[0]['account_key']
In [4]:
from datetime import datetime as dt
def parse_date(date):
    """Convert a 'YYYY-MM-DD' string into a datetime object.

    An empty string stands for "no date given" and yields None.
    """
    return None if date == '' else dt.strptime(date, '%Y-%m-%d')
def parse_maybe_int(i):
    """Convert a string that represents an integer into an int.

    An empty string stands for a missing value and yields None.
    """
    return None if i == '' else int(i)
In [5]:
# Convert the string fields of every enrollment row in place:
#   - cancel_date / join_date go through parse_date
#   - days_to_cancel goes through parse_maybe_int
#   - is_canceled / is_udacity become booleans (True only for the text 'True')
for row in enrollments:
    for date_field in ('cancel_date', 'join_date'):
        row[date_field] = parse_date(row[date_field])
    row['days_to_cancel'] = parse_maybe_int(row['days_to_cancel'])
    for flag_field in ('is_canceled', 'is_udacity'):
        row[flag_field] = row[flag_field] == 'True'
enrollments[0]
Out[5]:
In [6]:
# Convert the string fields of every engagement row in place.
# The three count columns are parsed with float() first and then converted
# with int(); total_minutes_visited stays a float and utc_date goes through
# parse_date.
for row in daily_engagement:
    for count_field in ('lessons_completed', 'num_courses_visited', 'projects_completed'):
        row[count_field] = int(float(row[count_field]))
    row['total_minutes_visited'] = float(row['total_minutes_visited'])
    row['utc_date'] = parse_date(row['utc_date'])
daily_engagement[0]
Out[6]:
In [7]:
# Convert both date columns of every submission row in place via parse_date.
for row in project_submissions:
    for date_field in ('completion_date', 'creation_date'):
        row[date_field] = parse_date(row[date_field])
project_submissions[0]
Out[7]:
In [8]:
#####################################
# 2 #
#####################################
## number of rows and the number of unique students (account keys)
enrollment_keys = [enrollment['account_key'] for enrollment in enrollments]
enrollment_num_rows = len(enrollments)
enrollment_num_unique_students = len(set(enrollment_keys))
engagement_keys = [engagement['account_key'] for engagement in daily_engagement]
engagement_num_rows = len(daily_engagement)
engagement_num_unique_students = len(set(engagement_keys))
submission_keys = [submission['account_key'] for submission in project_submissions]
submission_num_rows = len(project_submissions)
submission_num_unique_students = len(set(submission_keys))
print enrollment_num_rows, enrollment_num_unique_students
print engagement_num_rows, engagement_num_unique_students
print submission_num_rows, submission_num_unique_students
In [9]:
# Alternative solution: a reusable helper that collects the unique students of a table.
def get_unique_students(data):
    """Return the set of distinct 'account_key' values found in `data`.

    `data` is an iterable of records that can be indexed with 'account_key'.
    """
    return set(record['account_key'] for record in data)
In [10]:
enrollment_num_rows = len(enrollments)
enrollment_num_unique_students = len(get_unique_students(enrollments))
engagement_num_rows = len(daily_engagement)
engagement_num_unique_students = len(get_unique_students(daily_engagement))
submission_num_rows = len(project_submissions)
submission_num_unique_students = len(get_unique_students(project_submissions))
print enrollment_num_rows, enrollment_num_unique_students
print engagement_num_rows, engagement_num_unique_students
print submission_num_rows, submission_num_unique_students
In [11]:
#####################################
# 4 #
#####################################
## Find any one student enrollment where the student is missing from the daily engagement table.
## Output that enrollment.
enrollment_keys = get_unique_students(enrollments)
engagement_keys = get_unique_students(daily_engagement)
# these are the account keys not appearing in engagement
faulty_keys = enrollment_keys^engagement_keys
# print enrollment data of student not in engagement
# first analysis
bad_enrollments = []
for enrollment in enrollments:
if enrollment['account_key'] in faulty_keys:
bad_enrollments.append(enrollment)
print len(bad_enrollments)
In [12]:
#####################################
# 5 #
#####################################
## Find the number of surprising data points (enrollments missing from
## the engagement table) that remain, if any.
# check for number of days enrolled
bad_enrollments = []
for enrollment in enrollments:
if enrollment['account_key'] in faulty_keys and enrollment['days_to_cancel'] != 0:
bad_enrollments.append(enrollment)
print bad_enrollments
In [13]:
# Account keys of every enrollment flagged as a Udacity test account
# (rows whose is_udacity value is truthy).
udacity_test_accounts = set(
    row['account_key'] for row in enrollments if row['is_udacity']
)
len(udacity_test_accounts)
Out[13]:
In [14]:
def remove_udacity_accounts(data, test_accounts=None):
    """Return the records of `data` that do not belong to a test account.

    Parameters:
        data: iterable of records that can be indexed with 'account_key'.
        test_accounts: optional collection of account keys to exclude. When
            omitted (None), the module-level `udacity_test_accounts` is used,
            which is the original behavior; passing it explicitly removes
            the dependency on that global.

    Returns:
        A new list with the kept records in their original order. `data`
        itself is not modified.
    """
    if test_accounts is None:
        test_accounts = udacity_test_accounts
    return [data_point for data_point in data
            if data_point['account_key'] not in test_accounts]
In [15]:
# Remove Udacity test accounts from all three tables
non_udacity_enrollments = remove_udacity_accounts(enrollments)
non_udacity_engagement = remove_udacity_accounts(daily_engagement)
non_udacity_submissions = remove_udacity_accounts(project_submissions)
print len(non_udacity_enrollments)
print len(non_udacity_engagement)
print len(non_udacity_submissions)
In [16]:
#####################################
# 6 #
#####################################
## Create a dictionary named paid_students containing all students who either
## haven't canceled yet or who remained enrolled for more than 7 days. The keys
## should be account keys, and the values should be the date the student enrolled.
paid_students = {}
for enrollment in non_udacity_enrollments:
if enrollment['days_to_cancel'] == None or enrollment['days_to_cancel'] > 7:
join_date = enrollment['join_date']
account_key = enrollment['account_key']
if account_key not in paid_students or paid_students[account_key] < join_date:
paid_students[account_key] = join_date
print len(paid_students.keys())
In [17]:
def within_one_week(join_date, engagement_date):
    """Tell whether an engagement happened within one week of joining.

    Returns True when the whole-day difference `engagement_date - join_date`
    is at least 0 and less than 7, and False otherwise.
    """
    days_since_join = (engagement_date - join_date).days
    return 0 <= days_since_join < 7
In [47]:
#####################################
# 7 #
#####################################
## Create a list of rows from the engagement table including only rows where
## the student is one of the paid students you just found, and the date is within
## one week of the student's join date.
# Engagement rows of paid students whose utc_date falls within one week of
# the join date recorded for that student in paid_students.
paid_engagement_in_first_week = [
    record for record in non_udacity_engagement
    if record['account_key'] in paid_students
    and within_one_week(paid_students[record['account_key']], record['utc_date'])
]
paid_submissions_in_first_week = []
for submission in non_udacity_submissions:
account_key = submission['account_key']
completion_date = submission['completion_date']
if account_key in paid_students and completion_date != None:
paid_submissions_in_first_week.append(submission)
print len(paid_submissions_in_first_week)
In [32]:
from collections import defaultdict
# Group the first-week engagement records by student:
# account key -> list of that student's engagement records.
engagement_by_account = defaultdict(list)
for record in paid_engagement_in_first_week:
    engagement_by_account[record['account_key']].append(record)
In [33]:
# Total first-week classroom minutes per student:
# account key -> sum of total_minutes_visited over that student's records.
total_minutes_by_account = {}
for student_key, student_records in engagement_by_account.items():
    # Start each student at 0 and accumulate record by record, in order.
    total_minutes_by_account[student_key] = 0
    for record in student_records:
        total_minutes_by_account[student_key] += record['total_minutes_visited']
In [34]:
import numpy as np
# Summarize the data about minutes spent in the classroom
total_minutes = total_minutes_by_account.values()
print 'Mean:', np.mean(total_minutes)
print 'Standard deviation:', np.std(total_minutes)
print 'Minimum:', np.min(total_minutes)
print 'Maximum:', np.max(total_minutes)
In [35]:
#####################################
# 8 #
#####################################
## Go through a similar process as before to see if there is a problem.
## Locate at least one surprising piece of data, output it, and take a look at it.
# Number of minutes in seven days; a first-week total above this is flagged.
minutes_in_week = 7*24*60
# Maps each flagged account key to that student's first-week engagement records.
faulty_accounts = {}
# Note: the loop rebinds `total_minutes`, the name an earlier cell used for
# the list of all totals.
for account_key, total_minutes in total_minutes_by_account.items():
    if total_minutes > minutes_in_week:
        faulty_accounts[account_key] = engagement_by_account[account_key]
        # NOTE(review): the notebook export lost indentation; this print is
        # assumed to sit inside the `if` so that only flagged totals are
        # shown -- confirm against the original notebook.
        print total_minutes
In [36]:
#####################################
# 9 #
#####################################
## Adapt the code above to find the mean, standard deviation, minimum, and maximum for
## the number of lessons completed by each student during the first week. Try creating
## one or more functions to re-use the code above.
total_lessons_by_student = {}
for account_key, engagement_for_student in engagement_by_account.items():
total_lessons = 0
for engagement_record in engagement_for_student:
total_lessons += engagement_record['lessons_completed']
total_lessons_by_student[account_key] = total_lessons
total_lessons = total_lessons_by_student.values()
print 'Mean:', np.mean(total_lessons)
print 'Standard deviation:', np.std(total_lessons)
print 'Minimum:', np.min(total_lessons)
print 'Maximum:', np.max(total_lessons)
In [79]:
# Course solution to the problem
from collections import defaultdict
def group_data(data, key_name):
    """Group the records of `data` by the value stored under `key_name`.

    Returns a defaultdict(list) mapping each distinct value to the list of
    records that carry it, in the order they were encountered.
    """
    groups = defaultdict(list)
    for record in data:
        groups[record[key_name]].append(record)
    return groups
def sum_grouped_items(grouped_data, field_name):
    """Sum one field over every group of records.

    `grouped_data` maps a key to a list of records; the result maps the same
    key to the sum of `record[field_name]` over that list. An empty list
    gives 0.
    """
    totals = {}
    for group_key, records in grouped_data.items():
        # Start at 0 and accumulate record by record, in order.
        totals[group_key] = 0
        for record in records:
            totals[group_key] += record[field_name]
    return totals
import numpy as np
import matplotlib.pyplot as plt
def describe_data(data,bins=7):
print 'Mean:', np.mean(data)
print 'Standard deviation:', np.std(data)
print 'Minimum:', np.min(data)
print 'Maximum:', np.max(data)
plt.hist(data, bins=bins)
plt.xlabel("Metric of evaluation")
plt.ylabel("Number of students")
plt.title("Histogram")
plt.show()
In [38]:
######################################
# 10 #
######################################
## Find the mean, standard deviation, minimum, and maximum for the number of
## days each student visits the classroom during the first week.
# Group the first-week engagement of paid students by account key.
# (The previously assigned `num_courses_visited = {}` was never read and has
# been dropped.)
engagement_by_account = group_data(paid_engagement_in_first_week, 'account_key')
def sum_days_visited(grouped_data):
    """Count, per group, the records with a positive 'num_courses_visited'.

    `grouped_data` maps a key to a list of records; the result maps the same
    key to the number of records in that list whose 'num_courses_visited'
    value is greater than 0. An empty list gives 0.
    """
    field_name = 'num_courses_visited'
    return dict(
        (key, sum(1 for data_point in data_points if data_point[field_name] > 0))
        for key, data_points in grouped_data.items()
    )
# Per-student count of first-week records with at least one course visited,
# followed by its summary statistics and histogram.
activity_by_account = sum_days_visited(engagement_by_account)
days_visited_counts = activity_by_account.values()
describe_data(days_visited_counts)
In [53]:
######################################
# 11 #
######################################
## Create two lists of engagement data for paid students in the first week.
## The first list should contain data for students who eventually pass the
## subway project, and the second list should contain data for students
## who do not.
subway_project_lesson_keys = ['746169184', '3176718735']
passing_students = defaultdict(list)
for submission in paid_submissions_in_first_week:
if submission['lesson_key'] in subway_project_lesson_keys and submission['assigned_rating'] in ['DISTINCTION','PASSED']:
passing_students[submission['account_key']].append(submission)
passing_engagement = []
non_passing_engagement = []
for engagement in paid_engagement_in_first_week:
if engagement['account_key'] in passing_students.keys():
passing_engagement.append(engagement)
else:
non_passing_engagement.append(engagement)
print len(passing_engagement), len(non_passing_engagement)
In [72]:
######################################
# 12 #
######################################
## Compute some metrics you're interested in and see how they differ for
## students who pass the subway project vs. students who don't. A good
## starting point would be the metrics we looked at earlier (minutes spent
## in the classroom, lessons completed, and days visited).
interest_key = 'days_visited'
passing_students = group_data(passing_engagement, 'account_key')
if interest_key == 'days_visited':
passing_students_minutes = sum_days_visited(passing_students)
else:
passing_students_minutes = sum_grouped_items(passing_students, interest_key)
print 'Passing: '
describe_data(passing_students_minutes.values())
non_passing_students = group_data(non_passing_engagement, 'account_key')
if interest_key == 'days_visited':
non_passing_students_minutes = sum_days_visited(non_passing_students)
else:
non_passing_students_minutes = sum_grouped_items(non_passing_students, interest_key)
print 'Non passing: '
describe_data(non_passing_students_minutes.values())
In [81]:
######################################
# 13 #
######################################
## Make histograms of the three metrics we looked at earlier for both
## students who passed the subway project and students who didn't. You
## might also want to make histograms of any other metrics you examined.
# %pylab inline
import seaborn as sns
# import matplotlib.pyplot as plt
# Statistics and an 8-bin histogram for the passing group, then for the
# non-passing group.
for metric_by_student in (passing_students_minutes, non_passing_students_minutes):
    describe_data(metric_by_student.values(), bins=8)
In [ ]:
######################################
# 14 #
######################################
## Make a more polished version of at least one of your visualizations
## from earlier. Try importing the seaborn library to make the visualization
## look better, adding axis labels and a title, and changing one or more
## arguments to the hist() function.