Newbees (i.e. newbies) = respondents who haven't finished a MOOC yet
Experienced = respondents having finished at least 2 MOOCs
In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Notebook-wide plot styling: seaborn "white" style with the "talk" context.
sns.set(style='white')
sns.set_context(context='talk')
In [2]:
# Load the Class Central survey data; the file is Latin-1 encoded and uses a
# decimal comma for numbers.
df = pd.read_csv(
    'raw/2016-17-ClassCentral-Survey-data-noUserText.csv',
    encoding='ISO-8859-1',
    decimal=',',
)
In [3]:
# Keep only the two groups being compared. Respondents with exactly 1 finished
# MOOC are neither newbees nor experienced. Respondents who left the question
# blank are dropped too: a missing value is != '1', so such rows would survive
# the filter and later be counted as "Experienced" (everything that is not '0').
df = df[df['# MOOCs Finished'].notnull() & (df['# MOOCs Finished'] != '1')]
df['# MOOCs Finished'].value_counts()
Out[3]:
In [4]:
target_name = 'Newbees'
# Boolean mask over df: True for respondents who have finished no MOOC at all.
newbees = df['# MOOCs Finished'].eq('0')
In [5]:
def binary_compare_categorical_barh(mask, feature, df=df,
                                    target_name='target', nontarget_name='Other',
                                    split_name='visitor', answer='answer'):
    """Compare the answers to one categorical question between two groups.

    The rows of ``df`` are split by ``mask``. For each group the share of
    respondents (in percent of the group size) giving each answer to
    ``feature`` is computed, printed as a long-format table and drawn as a
    horizontal grouped bar chart titled with ``feature``.

    mask: boolean Series aligned with ``df``; True marks the target group
    feature: name of the categorical column to analyse
    df: dataframe to use; the default is the module-level ``df`` as it was
        bound at the moment this function was defined
    target_name: label for the rows where ``mask`` is True
    nontarget_name: label for all remaining rows
    split_name: name of the column (and legend) holding the group label
    answer: name of the column (and axis) holding the answer categories

    Returns None; the function only prints and plots.
    """
    target = df[mask]
    nontarget = df[~mask]
    # Percentages are relative to the whole group, so respondents who skipped
    # the question still count in the denominator.
    res_target = target[feature].value_counts() / len(target) * 100
    res_nontarget = nontarget[feature].value_counts() / len(nontarget) * 100
    # An answer that occurs in only one group is missing from the other
    # group's counts and turns into NaN when both series are aligned on the
    # answer index. Nobody choosing an answer means 0 %, not "unknown".
    result = pd.DataFrame({target_name: res_target,
                           nontarget_name: res_nontarget}).fillna(0)
    result[answer] = result.index
    res_df = pd.melt(result, id_vars=answer, var_name=split_name,
                     value_name='percentage')
    print(res_df)
    plot_args = dict(x='percentage', y=answer, hue=split_name, data=res_df,
                     kind='bar', orient='h', aspect=2)
    # seaborn 0.9 renamed factorplot -> catplot and size -> height; use the
    # new name when available and fall back for older installations.
    if hasattr(sns, 'catplot'):
        sns.catplot(height=6, **plot_args)
    else:
        sns.factorplot(size=6, **plot_args)
    plt.title(feature)
    sns.despine(left=True, bottom=True)
    plt.show()
In [6]:
def binary_compare_multi_select_categorical_barh(df, target, target_name, question, selectors, nontarget_name='Others'):
    """Draw a bar chart for a survey question that allows selecting multiple categories.

    The rows of ``df`` are split by ``target``. For each group and each
    selector column the share of respondents (in percent of the group size)
    that ticked the option is computed, printed and drawn as a horizontal
    grouped bar chart titled with ``question``.

    df: dataframe to use
    target: boolean Series aligned with ``df``; True marks the target group
    target_name: label for the rows where ``target`` is True
    question: the question being analysed, used as plot title
    selectors: list of df columns containing the selectors (values 0/1),
        named like '<prefix>: <option>'; the prefix is stripped for display
    nontarget_name: label for all remaining rows

    Returns None; the function only prints and plots.
    """
    target_df = df[target]
    nontarget_df = df[~target]
    size = {target_name: len(target_df), nontarget_name: len(nontarget_df)}
    print(size)
    # assign() builds new frames, so the group label is never written into a
    # slice of df (which triggers pandas' SettingWithCopyWarning).
    graph_data = pd.concat([
        target_df.loc[:, selectors].assign(target=target_name),
        nontarget_df.loc[:, selectors].assign(target=nontarget_name),
    ])
    melted = pd.melt(graph_data, id_vars='target', var_name='select',
                     value_name='percentage')
    # Summing the 0/1 flags gives the number of respondents per option.
    grouped = melted.groupby(['target', 'select'], as_index=False).sum()
    # Turn the counts into a percentage of the respective group size.
    grouped['percentage'] = grouped['percentage'] / grouped['target'].map(size) * 100
    # Strip the '<prefix>: ' part. Splitting once keeps options that contain
    # ': ' themselves intact, and [-1] leaves prefix-less names unchanged
    # instead of raising IndexError.
    grouped['select'] = grouped['select'].apply(lambda name: name.split(': ', 1)[-1])
    print(grouped)
    plot_args = dict(x='percentage', y='select', hue='target', data=grouped,
                     kind='bar', orient='h', aspect=2)
    # seaborn 0.9 renamed factorplot -> catplot and size -> height; use the
    # new name when available and fall back for older installations.
    if hasattr(sns, 'catplot'):
        sns.catplot(height=6, **plot_args)
    else:
        sns.factorplot(size=6, **plot_args)
    # Use matplotlib's pyplot directly: the sns.plt alias is not available in
    # later seaborn releases, and the sibling plotting function uses plt too.
    plt.title(question)
    sns.despine(left=True, bottom=True)
    plt.show()
In [7]:
# Self-reported familiarity with MOOCs, newbees vs. experienced respondents.
binary_compare_categorical_barh(
    newbees,
    'How familiar are you with MOOCs?',
    target_name='Newbees',
    nontarget_name='Experienced',
)
In [8]:
# Region of the world, newbees vs. experienced respondents.
binary_compare_categorical_barh(
    newbees,
    'Which region of the world are you in?',
    target_name='Newbees',
    nontarget_name='Experienced',
)
Africa and India are the regions with significantly more newbees than experienced MOOCers
In [9]:
# Importance of being able to earn a certificate, newbees vs. experienced respondents.
binary_compare_categorical_barh(
    newbees,
    'How important is the ability to earn a certificate when you complete a MOOC?',
    target_name='Newbees',
    nontarget_name='Experienced',
)
Newbees find it more important to be able to earn a certificate than experienced respondents
In [10]:
# Multi-select question: one 0/1 column per possible reason.
reasons = [
    'Reasons: Learning skills for current career',
    'Reasons: Learning skills for new career',
    'Reasons: School credit',
    'Reasons: Personal interest',
    'Reasons: Access to reference materials',
]
binary_compare_multi_select_categorical_barh(
    df,
    target=newbees,
    target_name='Newbees',
    question='Which of the following are important reasons for you to take MOOCs?',
    selectors=reasons,
    nontarget_name='Experienced',
)
There is only a slight difference in the reasons to follow MOOCs between newbees and experienced users. Personal interest and access to reference material are somewhat more important reasons to take MOOCs for experienced respondents than for newbees.
In [11]:
# Multi-select question: one 0/1 column per decision factor.
decisions = [
    'Decide: Topic/Subject',
    'Decide: Instructor',
    'Decide: Institution/university',
    'Decide: Platform',
    'Decide: Ratings',
    'Decide: Others recommendations',
]
binary_compare_multi_select_categorical_barh(
    df,
    target=newbees,
    target_name='Newbees',
    question='Which are the most important factors in deciding which MOOC to take?',
    selectors=decisions,
    nontarget_name='Experienced',
)
In [12]:
# Multi-select question: one 0/1 column per aspect of the MOOC experience.
aspects = [
    'Aspects: Browsing discussion forums',
    'Aspects: Actively contributing to discussion forums',
    'Aspects: Connecting with other learners in the course environment',
    'Aspects: Connecting with learners outside the course environment',
    'Aspects: Taking the course with other people you know (friends, colleagues, etc.)',
]
binary_compare_multi_select_categorical_barh(
    df,
    target=newbees,
    target_name='Newbees',
    question='Which of the following are important aspects of the MOOC experience to you?',
    selectors=aspects,
    nontarget_name='Experienced',
)
Newbee MOOCers find it more important to connect with other learners
In [13]:
# Multi-select question: one 0/1 column per tangible benefit.
benefits = [
    'Benefit: Have not taken MOOCs',
    'Benefit: Not Really',
    'Benefit: School credit towards a degree',
    'Benefit: Promotion at current organization',
    'Benefit: Higher performance evaluation at current job',
    'Benefit: Helped me get a new job in the same field',
    'Benefit: Helped me get a new job in a different field',
]
binary_compare_multi_select_categorical_barh(
    df,
    target=newbees,
    target_name='Newbees',
    question='Have you received any tangible benefits from taking MOOCs?',
    selectors=benefits,
    nontarget_name='Experienced',
)
Experienced MOOCers received more tangible benefits from MOOCs
In [14]:
# Multi-select question: one 0/1 column per factor influencing willingness to pay.
pays = [
    'Pay: The topic/subject',
    'Pay: The institution/university offering the MOOC',
    'Pay: The instructor/professor',
    'Pay: The MOOC platform being used',
    'Pay: A multi-course certification that the MOOC is a part of',
]
binary_compare_multi_select_categorical_barh(
    df,
    target=newbees,
    target_name='Newbees',
    question='Which of the following have a strong impact on your willingness to pay for a MOOC certificate?',
    selectors=pays,
    nontarget_name='Experienced',
)
In [15]:
# Number of MOOCs started, newbees vs. experienced respondents.
binary_compare_categorical_barh(
    newbees,
    '# MOOCs Started',
    target_name='Newbees',
    nontarget_name='Experienced',
)
The willingness to pay drops after starting about 7 courses
In [16]:
# Number of MOOCs finished; this is the very column the two groups are split on.
binary_compare_categorical_barh(
    newbees,
    '# MOOCs Finished',
    target_name='Newbees',
    nontarget_name='Experienced',
)
The more MOOCs people finish, the lower their willingness to pay. Is this the reason why Coursera is switching to a subscription model?
In [17]:
# When respondents first started taking MOOCs, newbees vs. experienced respondents.
binary_compare_categorical_barh(
    newbees,
    'When did you first start taking MOOCs?',
    target_name='Newbees',
    nontarget_name='Experienced',
)
In [18]:
# Perceived employer value of MOOC certificates, newbees vs. experienced respondents.
binary_compare_categorical_barh(
    newbees,
    'How much do you think employers value MOOC certificates?',
    target_name='Newbees',
    nontarget_name='Experienced',
)
Newbee MOOCers perceive the employment value of a MOOC certificate as higher than experienced respondents do
In [19]:
# Level of formal education, newbees vs. experienced respondents.
binary_compare_categorical_barh(
    newbees,
    'What is your level of formal education?',
    target_name='Newbees',
    nontarget_name='Experienced',
)
People with a graduate degree also have more experience with MOOCs than people with a lower education level
In [20]:
# Age range, newbees vs. experienced respondents.
binary_compare_categorical_barh(
    newbees,
    'What is your age range?',
    target_name='Newbees',
    nontarget_name='Experienced',
)
In the age range 46+ there are significantly more experienced MOOCers than in the younger age ranges.