Before we get started, a couple of reminders to keep in mind when using IPython notebooks:
In [1]:
import unicodecsv
## Longer version of code (replaced with shorter, equivalent version below)
# enrollments = []
# f = open('enrollments.csv', 'rb')
# reader = unicodecsv.DictReader(f)
# for row in reader:
# enrollments.append(row)
# f.close()
def read_csv(filename):
    """Read a CSV file and return its rows as a list.

    The file is opened in binary mode and handed to unicodecsv.DictReader;
    every row the reader yields is collected into the returned list.
    """
    with open(filename, 'rb') as csv_file:
        return list(unicodecsv.DictReader(csv_file))
#with open('enrollments.csv', 'rb') as f:
# reader = unicodecsv.DictReader(f)
# enrollments = list(reader)
# Load the enrollments table, then display the row at index 15 as the cell output.
enrollments = read_csv('enrollments.csv')
enrollments[15]
Out[1]:
In [2]:
#####################################
# 1 #
#####################################
## Read in the data from daily_engagement.csv and project_submissions.csv
## and store the results in the below variables.
## Then look at the first row of each table.
# Load the engagement and submission tables with read_csv.
daily_engagement = read_csv('daily_engagement.csv')
project_submissions = read_csv('project_submissions.csv')
# Print the first row of each table, with blank lines in between.
print (daily_engagement[0])
print ('\n')
print (project_submissions[0])
In [3]:
from datetime import datetime as dt
def parse_date(date):
    """Convert a 'YYYY-MM-DD' string into a datetime object.

    An empty string means no date was given and yields None.
    """
    return None if date == '' else dt.strptime(date, '%Y-%m-%d')
def parse_maybe_int(i):
    """Convert a string holding an integer into an int.

    An empty string yields None instead of a number.
    """
    if i != '':
        return int(i)
    return None
# Clean up the data types in the enrollments table.
# Each row is converted in place, so this cell is meant to run once per load.
for enrollment in enrollments:
    # Date columns: string -> datetime (or None when empty).
    for date_field in ('cancel_date', 'join_date'):
        enrollment[date_field] = parse_date(enrollment[date_field])
    # Flag columns: only the exact text 'True' becomes True.
    for flag_field in ('is_canceled', 'is_udacity'):
        enrollment[flag_field] = (enrollment[flag_field] == 'True')
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])

enrollments[0]
Out[3]:
In [4]:
# Clean up the data types in the engagement table (rows are converted in place).
for engagement_record in daily_engagement:
    # Count columns go through float() before int();
    # presumably the CSV stores them as float-formatted text - confirm against the file.
    for count_field in ('lessons_completed', 'num_courses_visited', 'projects_completed'):
        engagement_record[count_field] = int(float(engagement_record[count_field]))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])

daily_engagement[0]
Out[4]:
In [5]:
# Clean up the data types in the submissions table (rows are converted in place).
for submission in project_submissions:
    for date_field in ('completion_date', 'creation_date'):
        submission[date_field] = parse_date(submission[date_field])

project_submissions[0]
Out[5]:
In [6]:
#####################################
# 2 #
#####################################
## Find the total number of rows and the number of unique students (account keys)
## in each table.
def get_unique_keys(a_list, the_key):
    """Collect the distinct values stored under `the_key` across `a_list`.

    a_list: iterable of records that can be indexed with `the_key`.
    the_key: the field whose values are collected.

    Returns a set of the values found. A record that lacks the field (or whose
    value cannot be placed in a set) is skipped on its own, so one bad record
    no longer discards every record that follows it. The number of skipped
    records is reported once at the end instead of an anonymous 'some error'.
    """
    a_set = set()
    skipped = 0
    for item in a_list:
        try:
            a_set.add(item[the_key])
        except (KeyError, IndexError, TypeError):
            # Missing field, wrong record shape, or unhashable value.
            skipped += 1
    if skipped:
        print('get_unique_keys: skipped %d record(s) without a usable %r field'
              % (skipped, the_key))
    return a_set
# Total rows and number of distinct students (account keys) in each table.
enrollment_num_rows = len(enrollments)
unique_enrollment_students = get_unique_keys(enrollments, 'account_key')
enrollment_num_unique_students = len(unique_enrollment_students)
print('enrollments: %d' % enrollment_num_rows)
print('unique enrollments: %d' % enrollment_num_unique_students)

engagement_num_rows = len(daily_engagement)
unique_engagement_students = get_unique_keys(daily_engagement, 'account_key')
engagement_num_unique_students = len(unique_engagement_students)
print('engagement: %d' % engagement_num_rows)
print('unique engagement: %d' % engagement_num_unique_students)

submission_num_rows = len(project_submissions)
submission_unique_students = get_unique_keys(project_submissions, 'account_key')
submission_num_unique_students = len(submission_unique_students)
print('submissions: %d' % submission_num_rows)
print('unique submissions: %d' % submission_num_unique_students)

# Show the account key of the first engagement row.
print(daily_engagement[0]['account_key'])
In [7]:
#####################################
# 3 #
#####################################
## Rename the "acct" column in the daily_engagement table to "account_key".
# No code needed here: I renamed the column directly in the daily_engagement.csv file instead.
In [8]:
#####################################
# 4 #
#####################################
## Find any one student enrollments where the student is missing from the daily engagement table.
## Output that enrollment.
# Count the enrollment rows whose student has no entry in the engagement table.
notEngCount = 0
for enrollment in enrollments:
    student = enrollment['account_key']
    if student in unique_engagement_students:
        continue  # this student does have engagement data
    notEngCount += 1
print ('Not engagement count %d' % notEngCount)
In [9]:
#####################################
# 5 #
#####################################
## Find the number of surprising data points (enrollments missing from
## the engagement table) that remain, if any.
# Print and count enrollments that are missing from the engagement table
# even though the join date and cancel date differ.
num_problem_students = 0
for enrollment in enrollments:
    student = enrollment['account_key']
    if student in unique_engagement_students:
        continue
    if enrollment['join_date'] == enrollment['cancel_date']:
        continue  # joined and cancelled on the same date: not counted
    print (enrollment)
    num_problem_students += 1

num_problem_students
Out[9]:
In [10]:
# Account keys of every enrollment flagged as a Udacity test account.
udacity_test_accounts = {
    enrollment['account_key']
    for enrollment in enrollments
    if enrollment['is_udacity']
}

len(udacity_test_accounts)
Out[10]:
In [11]:
def remove_udacity_accounts(data, test_accounts=None):
    """Return the records in `data` that do not belong to Udacity test accounts.

    data: iterable of records that have an 'account_key' entry.
    test_accounts: optional collection of account keys to exclude. When it is
        omitted, the notebook-level `udacity_test_accounts` set is used, which
        is what the original one-argument version always did.

    Returns a new list; `data` itself is left untouched.
    """
    if test_accounts is None:
        test_accounts = udacity_test_accounts
    return [data_point for data_point in data
            if data_point['account_key'] not in test_accounts]
In [12]:
# Remove Udacity test accounts from all three tables
non_udacity_enrollments = remove_udacity_accounts(enrollments)
non_udacity_engagement = remove_udacity_accounts(daily_engagement)
non_udacity_submissions = remove_udacity_accounts(project_submissions)
# Print how many rows remain in each filtered table.
print (len(non_udacity_enrollments))
print (len(non_udacity_engagement))
print (len(non_udacity_submissions))
In [13]:
#####################################
# 6 #
#####################################
## Create a dictionary named paid_students containing all students who either
## haven't canceled yet or who remained enrolled for more than 7 days. The keys
## should be account keys, and the values should be the date the student enrolled.
# Map each paid student's account key to their most recent join date.
# A student counts as paid when an enrollment was never cancelled or was
# cancelled after more than 7 days.
paid_students = {}
for enrollment in non_udacity_enrollments:
    days_to_cancel = enrollment['days_to_cancel']
    # Guard against a missing day count: Python 2 treats `None > 7` as False,
    # while Python 3 raises TypeError. The explicit check gives the Python 2
    # result on both.
    stayed_enrolled = (not enrollment['is_canceled'] or
                       (days_to_cancel is not None and days_to_cancel > 7))
    if not stayed_enrolled:
        continue
    account_key = enrollment['account_key']
    enrollment_date = enrollment['join_date']
    # Keep the latest join date when a student has several qualifying enrollments.
    if (account_key not in paid_students or
            enrollment_date > paid_students[account_key]):
        paid_students[account_key] = enrollment_date

len(paid_students)
Out[13]:
In [14]:
def within_one_week(join_date, engagement_date):
    """Tell whether an engagement record falls in the student's first week.

    Returns True when `engagement_date` is on or after `join_date` and fewer
    than 7 whole days later, otherwise False.
    """
    days_since_join = (engagement_date - join_date).days
    return 0 <= days_since_join < 7
def remove_free_trial_cancels(data, paid=None):
    """Return only the records in `data` that belong to paid students.

    data: iterable of records that have an 'account_key' entry.
    paid: optional collection of paid account keys. When it is omitted, the
        notebook-level `paid_students` dictionary is used, which is what the
        original one-argument version always did.

    Returns a new list; `data` itself is left untouched.
    """
    if paid is None:
        paid = paid_students
    return [data_point for data_point in data
            if data_point['account_key'] in paid]
# Keep only rows belonging to paid students in each non-Udacity table.
paid_enrollments = remove_free_trial_cancels(non_udacity_enrollments)
paid_engagement = remove_free_trial_cancels(non_udacity_engagement)
paid_submissions = remove_free_trial_cancels(non_udacity_submissions)
# Print how many rows remain in each table.
print (len(paid_enrollments))
print (len(paid_engagement))
print (len(paid_submissions))
In [15]:
# Add a 1/0 'has_visited' field: 1 when at least one course was visited in that record.
for engagement_record in paid_engagement:
    engagement_record['has_visited'] = (
        1 if engagement_record['num_courses_visited'] > 0 else 0
    )
In [16]:
#####################################
# 7 #
#####################################
## Create a list of rows from the engagement table including only rows where
## the student is one of the paid students you just found, and the date is within
## one week of the student's join date.
# Engagement rows of paid students dated within one week of the join date
# recorded for that student in paid_students.
paid_engagement_in_first_week = [
    eng_entry
    for eng_entry in paid_engagement
    if within_one_week(paid_students[eng_entry['account_key']], eng_entry['utc_date'])
]

len(paid_engagement_in_first_week)
Out[16]:
In [39]:
from collections import defaultdict
def group_data(data, key_name):
    """Bucket records by the value they hold under `key_name`.

    Returns a defaultdict(list) that maps each distinct value to the list of
    records carrying it, in the order they appear in `data`.
    """
    buckets = defaultdict(list)
    for entry in data:
        buckets[entry[key_name]].append(entry)
    return buckets
def sum_grouped_data(grouped_data, field_name):
    """Total one field per group.

    grouped_data: mapping from a group key to a list of records.
    field_name: the field to add up within each group.

    Returns a plain dict mapping each group key to the sum of `field_name`
    over that group's records (0 for an empty group).
    """
    def _field_total(records):
        # Plain left-to-right accumulation starting from integer 0.
        running = 0
        for rec in records:
            running += rec[field_name]
        return running

    return {group_key: _field_total(records)
            for group_key, records in grouped_data.items()}
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
def describe_results(total_grouped_values):
    """Print summary statistics for a per-student totals dict and plot a histogram.

    total_grouped_values: mapping whose values are the numbers to summarise
        (the keys are ignored).

    Prints the mean, standard deviation, minimum and maximum, then draws a
    histogram of the values with plt.hist.
    """
    # Materialise the values as a list. On Python 3, dict.values() returns a
    # view that NumPy does not treat as a sequence of numbers, so np.mean and
    # the other calls would fail. On Python 2 this is just a copy.
    total_values = list(total_grouped_values.values())
    print ('Mean: %f' % np.mean(total_values))
    print ('Standard deviation: %f' % np.std(total_values))
    print ('Minimum: %f' % np.min(total_values))
    print ('Maximum: %f' % np.max(total_values))
    plt.hist(total_values)
# Group the first-week engagement rows by student, then total each student's minutes.
engagement_by_account = group_data(paid_engagement_in_first_week,'account_key')
total_minutes_by_account = sum_grouped_data(engagement_by_account,'total_minutes_visited')
In [40]:
# Summary statistics and histogram of total first-week minutes per student.
describe_results(total_minutes_by_account)
In [41]:
#####################################
# 8 #
#####################################
## Go through a similar process as before to see if there is a problem.
## Locate at least one surprising piece of data, output it, and take a look at it.
# Find the student with the largest first-week minute total.
# Only a total strictly greater than the running maximum replaces it, so the
# first student to reach the maximum is the one kept.
student_with_max_minutes, max_minutes = None, 0
c = 0  # not used in the cells shown here
for student in total_minutes_by_account:
    total_minutes = total_minutes_by_account[student]
    if total_minutes > max_minutes:
        student_with_max_minutes, max_minutes = student, total_minutes

max_minutes
Out[41]:
In [42]:
# Print every first-week engagement record of the student with the most minutes.
# print(...) is written as a call, like the other cells, so it works on both
# Python 2 and Python 3.
for engagement_record in paid_engagement_in_first_week:
    if engagement_record['account_key'] == student_with_max_minutes:
        print(engagement_record)
In [43]:
#####################################
# 9 #
#####################################
## Adapt the code above to find the mean, standard deviation, minimum, and maximum for
## the number of lessons completed by each student during the first week. Try creating
## one or more functions to re-use the code above.
# Total lessons completed per student in the first week, then summarise and plot.
total_lessons_completed_by_account = sum_grouped_data(engagement_by_account,'lessons_completed')
describe_results(total_lessons_completed_by_account)
In [44]:
######################################
# 10 #
######################################
## Find the mean, standard deviation, minimum, and maximum for the number of
## days each student visits the classroom during the first week.
# Sum the 1/0 'has_visited' flag per student over the first-week records,
# then summarise and plot.
total_first_week = sum_grouped_data(engagement_by_account,'has_visited')
describe_results(total_first_week)
In [45]:
######################################
# 11 #
######################################
## Create two lists of engagement data for paid students in the first week.
## The first list should contain data for students who eventually pass the
## subway project, and the second list should contain data for students
## who do not.
# Lesson keys that identify the subway project.
subway_project_lesson_keys = ['746169184', '3176718735']
subway_submissions = group_data(paid_submissions,'lesson_key')
passing_engagement = []
non_passing_engagement = []

# Account keys of students with a subway-project submission rated
# 'PASSED' or 'DISTINCTION'.
pass_subway_project = set()
for submission in paid_submissions:
    project = submission['lesson_key']
    rating = submission['assigned_rating']
    if project in subway_project_lesson_keys and \
            (rating == 'PASSED' or rating == 'DISTINCTION'):
        pass_subway_project.add(submission['account_key'])
# print(...) is written as a call so the cell runs on both Python 2 and 3.
print(len(pass_subway_project))

# Split the first-week engagement rows by whether the student is in that set.
for engagement_record in paid_engagement_in_first_week:
    if engagement_record['account_key'] in pass_subway_project:
        passing_engagement.append(engagement_record)
    else:
        non_passing_engagement.append(engagement_record)
print(len(passing_engagement))
print(len(non_passing_engagement))
In [130]:
######################################
# 12 #
######################################
## Compute some metrics you're interested in and see how they differ for
## students who pass the subway project vs. students who don't. A good
## starting point would be the metrics we looked at earlier (minutes spent
## in the classroom, lessons completed, and days visited).
# Compare first-week metrics for students who passed the subway project with
# those who did not. print(...) is written as a call throughout so the cell
# runs on both Python 2 and Python 3; the printed text is unchanged.
passing_engagement_by_account = group_data(passing_engagement,'account_key')
non_passing_engagement_by_account = group_data(non_passing_engagement,'account_key')

# Minutes spent
total_minutes_by_pass_account = sum_grouped_data(passing_engagement_by_account,'total_minutes_visited')
print('minutes for Passing students')
describe_results(total_minutes_by_pass_account)
print('\n')
total_minutes_by_non_pass_account = sum_grouped_data(non_passing_engagement_by_account,'total_minutes_visited')
print('minutes for NON Passing students')
describe_results(total_minutes_by_non_pass_account)
print('\n')
print('\n')

# Lessons completed
lessons_completed_by_pass_account = sum_grouped_data(passing_engagement_by_account,'lessons_completed')
print('lessons_completed for Passing students')
describe_results(lessons_completed_by_pass_account)
print('\n')
lessons_completed_by_non_pass_account = sum_grouped_data(non_passing_engagement_by_account,'lessons_completed')
print('lessons_completed for NON Passing students')
describe_results(lessons_completed_by_non_pass_account)
print('\n')
print('\n')

# Days visited (sum of the 1/0 'has_visited' flag)
days_visited_by_pass_account = sum_grouped_data(passing_engagement_by_account,'has_visited')
print('days_visited for Passing students')
describe_results(days_visited_by_pass_account)
print('\n')
days_visited_by_non_pass_account = sum_grouped_data(non_passing_engagement_by_account,'has_visited')
print('days_visited for NON Passing students')
describe_results(days_visited_by_non_pass_account)
In [133]:
######################################
# 13 #
######################################
## Make histograms of the three metrics we looked at earlier for both
## students who passed the subway project and students who didn't. You
## might also want to make histograms of any other metrics you examined.
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# 2x2 grid: left column = non-passing students, right column = passing students;
# top row = total minutes, bottom row = days visited.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(7, 3), sharey=False,sharex=False)
fig.subplots_adjust(hspace=0.6)  # leave room for the per-panel titles
axes[0,0].set_title('Minutes (non-passing)')
axes[1,0].set_title('Days visited (non-passing)')
axes[0,1].set_title('Minutes (passing)')
axes[1,1].set_title('Days visited (passing)')
# list(...) turns the dict views into real sequences so hist() also works on Python 3.
axes[0,0].hist(list(total_minutes_by_non_pass_account.values()),bins=20)
axes[1,0].hist(list(days_visited_by_non_pass_account.values()))
axes[0,1].hist(list(total_minutes_by_pass_account.values()))
axes[1,1].hist(list(days_visited_by_pass_account.values()))
Out[133]:
In [ ]:
######################################
# 14 #
######################################
## Make a more polished version of at least one of your visualizations
## from earlier. Try importing the seaborn library to make the visualization
## look better, adding axis labels and a title, and changing one or more
## arguments to the hist() function.