In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
# Plain white background with the larger "talk" scaling for every figure below.
sns.set(context="talk", style="white")
In [2]:
# Parse "," as the decimal mark and decode the file as ISO-8859-1.
df = pd.read_csv(
    'raw/2016-17-ClassCentral-Survey-data-noUserText.csv',
    decimal=',',
    encoding="ISO-8859-1",
)
In [3]:
# Frequency of each answer to the willingness-to-pay question.
df.loc[:, 'How willing are you to pay for a certificate for a MOOC?'].value_counts()
Out[3]:
In [4]:
target_name = 'Willingness to pay'
# Answers that cannot be parsed as numbers become NaN, and NaN > 3 is False,
# so those respondents end up on the False side of the mask.
willing = pd.to_numeric(
    df['How willing are you to pay for a certificate for a MOOC?'],
    errors='coerce',
).gt(3)
In [5]:
def binary_compare_categorical_barh(mask, feature, df=df,
                                    target_name='target', nontarget_name='Other',
                                    split_name='visitor', answer='answer'):
    """Compare the answer distribution of one categorical column between two groups.

    The rows of ``df`` are split by ``mask``. For each group, the share of rows
    giving each answer in ``feature`` is computed as a percentage of all rows
    in that group (rows with a missing answer still count in the denominator).
    The long-format table is displayed and drawn as a horizontal grouped bar
    chart titled with ``feature``.

    Parameters
    ----------
    mask : boolean Series aligned with ``df``
        True selects the target group; the remaining rows form the other group.
    feature : str
        Name of the column holding the categorical answers.
    df : DataFrame, optional
        Data to analyse. The default is bound when this cell is executed, so
        it refers to the global ``df`` as it existed at definition time.
    target_name, nontarget_name : str
        Legend labels for the masked and unmasked group.
    split_name, answer : str
        Column names used in the melted table for the group label and the answer.

    Returns
    -------
    None
    """
    target = df[mask]
    nontarget = df[~mask]
    res_target = target[feature].value_counts() / len(target) * 100
    res_nontarget = nontarget[feature].value_counts() / len(nontarget) * 100
    # An answer that nobody in one group picked is 0 % of that group, not missing data.
    result = pd.DataFrame({target_name: res_target, nontarget_name: res_nontarget}).fillna(0)
    result[answer] = result.index
    res_df = pd.melt(result, id_vars=answer, var_name=split_name, value_name='percentage')
    display(res_df)
    plot_args = dict(x='percentage', y=answer, hue=split_name, data=res_df,
                     kind='bar', orient='h', aspect=2)
    # seaborn renamed factorplot -> catplot and size -> height; support both generations.
    if hasattr(sns, 'catplot'):
        sns.catplot(height=6, **plot_args)
    else:
        sns.factorplot(size=6, **plot_args)
    plt.title(feature)
    sns.despine(left=True, bottom=True)
    plt.show()
In [6]:
def binary_compare_multi_select_categorical_barh(df, target, target_name, question, selectors, nontarget_name='Others'):
    """Draw a grouped bar chart for a multiple-choice survey question.

    The rows of ``df`` are split by ``target``. For every selector column the
    values are summed per group and divided by the number of rows in that
    group, giving the percentage of the group that ticked the option. The
    resulting table is displayed and plotted as horizontal bars.

    Parameters
    ----------
    df : DataFrame
        Data to analyse.
    target : boolean Series aligned with ``df``
        True selects the target group; the remaining rows form the other group.
    target_name : str
        Legend label for the target group.
    question : str
        Text of the question; used as the plot title.
    selectors : list of str
        Columns of ``df`` holding the options (values 0/1). A leading
        ``"<prefix>: "`` is stripped from each name for display.
    nontarget_name : str, optional
        Legend label for the rows not selected by ``target``.

    Returns
    -------
    None
    """
    target_df = df[target]
    nontarget_df = df[~target]
    size = {target_name: len(target_df), nontarget_name: len(nontarget_df)}
    print(size)
    # .assign returns a new frame, so no column is written into a slice of df.
    graph_targetdata = target_df.loc[:, selectors].assign(target=target_name)
    graph_nontargetdata = nontarget_df.loc[:, selectors].assign(target=nontarget_name)
    graph_data = pd.concat([graph_targetdata, graph_nontargetdata])
    melted = pd.melt(graph_data, id_vars='target', var_name='select', value_name='percentage')
    grouped = melted.groupby(['target', 'select'], as_index=False).sum()
    # Turn the per-group tick counts into a percentage of that group's size.
    grouped['percentage'] = grouped['percentage'] / grouped['target'].map(size) * 100
    # Strip only the leading "<prefix>: "; labels without a prefix are kept as they are.
    grouped['select'] = grouped['select'].apply(lambda x: x.split(": ", 1)[-1])
    display(grouped)
    plot_args = dict(x='percentage', y='select', hue='target', data=grouped,
                     kind='bar', orient='h', aspect=2)
    # seaborn renamed factorplot -> catplot and size -> height; support both generations.
    if hasattr(sns, 'catplot'):
        sns.catplot(height=6, **plot_args)
    else:
        sns.factorplot(size=6, **plot_args)
    # Use the imported pyplot module directly; newer seaborn has no `sns.plt` alias.
    plt.title(question)
    sns.despine(left=True, bottom=True)
    plt.show()
In [7]:
# Familiarity with MOOCs, split by the `willing` mask.
binary_compare_categorical_barh(
    mask=willing,
    feature='How familiar are you with MOOCs?',
    target_name='Willing to pay',
)
In [8]:
# Region of the world, split by the `willing` mask.
binary_compare_categorical_barh(
    mask=willing,
    feature='Which region of the world are you in?',
    target_name='Willing to pay',
)
Africa is the only region where far more people are willing to pay for a certificate than are not. The higher the quality of education available at low cost in a region, the less willing people are to pay for a MOOC certificate.
In [9]:
# Importance of earning a certificate, split by the `willing` mask.
# The legend label is spelled the same way as in every other cell.
binary_compare_categorical_barh(
    mask=willing,
    feature='How important is the ability to earn a certificate when you complete a MOOC?',
    target_name='Willing to pay',
)
Those who find the ability to earn a certificate important are more willing to pay than the others. Discontinuing free certificates was one of the first actions the platform providers took to increase their business.
In [10]:
# Option columns of the multiple-choice "reasons" question.
reasons = [
    'Reasons: Learning skills for current career',
    'Reasons: Learning skills for new career',
    'Reasons: School credit',
    'Reasons: Personal interest',
    'Reasons: Access to reference materials',
]
binary_compare_multi_select_categorical_barh(
    df,
    target=willing,
    target_name='Willing to pay',
    question='Which of the following are important reasons for you to take MOOCs?',
    selectors=reasons,
)
There is only a slight difference in the reasons to follow MOOCs between those who are willing to pay and those who are not. When the reasons are career-related, respondents are willing to pay for a certificate.
In [11]:
# Option columns of the multiple-choice "deciding factors" question.
decisions = [
    'Decide: Topic/Subject',
    'Decide: Instructor',
    'Decide: Institution/university',
    'Decide: Platform',
    'Decide: Ratings',
    'Decide: Others recommendations',
]
binary_compare_multi_select_categorical_barh(
    df,
    target=willing,
    target_name='Willing to pay',
    question='Which are the most important factors in deciding which MOOC to take?',
    selectors=decisions,
)
The institution offering the course is a more appealing reason to follow a MOOC for those who are willing to pay than for those who are not.
In [12]:
# Option columns of the multiple-choice "aspects of the experience" question.
aspects = [
    'Aspects: Browsing discussion forums',
    'Aspects: Actively contributing to discussion forums',
    'Aspects: Connecting with other learners in the course environment',
    'Aspects: Connecting with learners outside the course environment',
    'Aspects: Taking the course with other people you know (friends, colleagues, etc.)',
]
binary_compare_multi_select_categorical_barh(
    df,
    target=willing,
    target_name='Willing to pay',
    question='Which of the following are important aspects of the MOOC experience to you?',
    selectors=aspects,
)
Connecting with other students is more important for those who are willing to pay. Is this an opportunity for the platforms to increase their revenue by improving forum features and quality?
In [13]:
# Option columns of the multiple-choice "tangible benefits" question.
benefits = [
    'Benefit: Have not taken MOOCs',
    'Benefit: Not Really',
    'Benefit: School credit towards a degree',
    'Benefit: Promotion at current organization',
    'Benefit: Higher performance evaluation at current job',
    'Benefit: Helped me get a new job in the same field',
    'Benefit: Helped me get a new job in a different field',
]
binary_compare_multi_select_categorical_barh(
    df,
    target=willing,
    target_name='Willing to pay',
    question='Have you received any tangible benefits from taking MOOCs?',
    selectors=benefits,
)
People who are willing to pay report more tangible benefits from MOOCs.
In [14]:
# Option columns of the multiple-choice "impact on willingness to pay" question.
pays = [
    'Pay: The topic/subject',
    'Pay: The institution/university offering the MOOC',
    'Pay: The instructor/professor',
    'Pay: The MOOC platform being used',
    'Pay: A multi-course certification that the MOOC is a part of',
]
binary_compare_multi_select_categorical_barh(
    df,
    target=willing,
    target_name='Willing to pay',
    question='Which of the following have a strong impact on your willingness to pay for a MOOC certificate?',
    selectors=pays,
)
In [15]:
# Number of MOOCs started, split by the `willing` mask.
binary_compare_categorical_barh(
    mask=willing,
    feature='# MOOCs Started',
    target_name='Willing to pay',
)
The willingness to pay drops after starting about 7 courses.
In [16]:
# Number of MOOCs finished, split by the `willing` mask.
binary_compare_categorical_barh(
    mask=willing,
    feature='# MOOCs Finished',
    target_name='Willing to pay',
)
The more MOOCs people finish, the less willing they are to pay. Is this the reason why Coursera is switching to a subscription model?
In [17]:
# When respondents first started taking MOOCs, split by the `willing` mask.
binary_compare_categorical_barh(
    mask=willing,
    feature='When did you first start taking MOOCs?',
    target_name='Willing to pay',
)
People who started taking MOOCs more recently are more willing to pay.
In [18]:
# Perceived employer value of certificates, split by the `willing` mask.
binary_compare_categorical_barh(
    mask=willing,
    feature='How much do you think employers value MOOC certificates?',
    target_name='Willing to pay',
)
People who are more willing to pay for a certificate think employers value the certificates about twice as much as people who are less willing to pay for the certificate.
In [19]:
# Level of formal education, split by the `willing` mask.
binary_compare_categorical_barh(
    mask=willing,
    feature='What is your level of formal education?',
    target_name='Willing to pay',
)
There is very little correlation between willingness to pay for certificates and the education level of the respondent
In [20]:
# Age range, split by the `willing` mask.
binary_compare_categorical_barh(
    mask=willing,
    feature='What is your age range?',
    target_name='Willing to pay',
)
In the age range 36-45 there is significantly more willingness to pay; is this the age range with the biggest need to upgrade their skills? In the age range 56+ the willingness to pay drops.