In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
Importing data and previewing
In [2]:
# Load the survey export and keep everything from the second data row onward.
# NOTE(review): the discarded first row is presumably a secondary header line
# in the export -- confirm against the CSV.
df = pd.read_csv("thwfall2017-survey.csv").iloc[1:]
Oh no, these are some messy column names! Let's clean them up by stripping the first 308 characters from each of the first 43 column names.
In [3]:
# Peek at the first three column labels as they currently stand.
df.columns[:3]
Out[3]:
In [4]:
# The first N_PREFIXED_COLUMNS headers each start with PREFIX_LENGTH characters
# that this cell cuts off; columns after that are left untouched.
N_PREFIXED_COLUMNS = 43
PREFIX_LENGTH = 308

# Collect the complete old-name -> new-name mapping and rename once, rather
# than rebuilding the DataFrame for every single column.  Headers that are no
# longer than the prefix are skipped, so re-running this cell does not reduce
# already-shortened names to empty strings.
shortened_names = {}
for column in df.columns[:N_PREFIXED_COLUMNS]:
    if len(column) <= PREFIX_LENGTH:
        continue
    shortened_names[column] = column[PREFIX_LENGTH:]
    # Show the original header length next to the name that will be kept.
    print(len(column), shortened_names[column])

df = df.rename(columns=shortened_names)
In [5]:
# The leading 37 column labels are the ones used as topics further down.
topic_list = df.columns[:37]
topic_list
Out[5]:
Creating two dataframes: df_topics for interest/experience about topics and df_meta for questions about THW
In [6]:
# All rows, restricted to the topic columns selected in topic_list.
df_topics = df.loc[:, topic_list]
In [7]:
# Keep only the three columns that df_meta is built from.  (The earlier
# `df_meta = df` assignment was overwritten immediately and has been dropped.)
df_meta = df[['Skill level', 'Personal experience', 'Presentation style']]
Each topic (e.g. Python, R, GitHub) has one cell per response, holding a list of the options that were checked.
The options are not mutually exclusive -- if someone clicked all of them, the value would be "1, 2, 3" and so on.
Assumptions for calculating interest: If someone clicked that they just wanted a topic, add 1 to the topic's score. If someone clicked that they really wanted it, add 3 to the topic's score. If they clicked both, just add 3, not 4.
In [8]:
# Tally two scores for every topic column:
#   topic_interest -- 3 points for a response whose text contains '2',
#                     otherwise 1 point if it contains '1'
#   topic_teaching -- 1 point for every response whose text contains '3'
topic_interest = {}
topic_teaching = {}
for topic in df_topics:
    interest_points = 0
    teaching_points = 0
    for response in df_topics[topic]:
        # The checks are plain substring tests on the text form of the cell.
        answer = str(response)
        if '2' in answer:
            interest_points += 3
        elif '1' in answer:
            interest_points += 1
        if '3' in answer:
            teaching_points += 1
    topic_interest[topic] = interest_points
    topic_teaching[topic] = teaching_points
In [9]:
# One row per topic; the interest score sits in the single column labelled 0.
topic_interest_df = pd.DataFrame.from_dict(topic_interest, orient="index")
# Display the scores from highest to lowest (the sorted copy is not stored).
topic_interest_df.sort_values(by=0, ascending=False)
Out[9]:
In [10]:
# Store the scores in ascending order, then draw them as horizontal bars.
topic_interest_df = topic_interest_df.sort_values(by=0)
topic_interest_df.plot.barh(figsize=(8, 14), fontsize=20)
Out[10]:
In [11]:
# One row per topic with its teaching score in column 0; topics whose score
# is exactly zero are filtered out.
topic_teaching_df = (
    pd.DataFrame.from_dict(topic_teaching, orient="index")
    .loc[lambda scores: scores[0] != 0]
)
# Display the remaining scores from highest to lowest (not stored).
topic_teaching_df.sort_values(by=0, ascending=False)
Out[11]:
In [12]:
# Store the scores in ascending order, then draw them as horizontal bars.
topic_teaching_df = topic_teaching_df.sort_values(by=0)
topic_teaching_df.plot.barh(figsize=(8, 10), fontsize=20)
Out[12]:
In [13]:
# Discard every response that left any of the df_meta columns empty, then
# preview the first four remaining rows.
df_meta = df_meta.dropna(how="any")
df_meta.head(4)
Out[13]:
In [14]:
# Count the answers to 'Personal experience', ordered by answer value, and
# show them as a horizontal bar chart.
fig, ax = plt.subplots()
pe_df = (
    df_meta['Personal experience']
    .value_counts(sort=False)
    .sort_index()
)
pe_plot = pe_df.plot.barh(ax=ax, figsize=(8, 4), fontsize=20)
ax.set_title("What is your personal experience with scientific computing?", size=20)
# NOTE(review): the fixed labels assume exactly three answer values are
# present and that they sort in this order -- confirm against the data.
ax.set_yticklabels(["Beginner", "Intermediate", "Advanced"])
Out[14]:
In [15]:
# Count the answers to 'Skill level', ordered by answer value, and show them
# as a horizontal bar chart.
fig, ax = plt.subplots()
skill_df = (
    df_meta['Skill level']
    .value_counts(sort=False)
    .sort_index()
)
skill_plot = skill_df.plot.barh(ax=ax, figsize=(8, 4), fontsize=20)
ax.set_title("What skill level should we aim for?", size=20)
# NOTE(review): the fixed labels assume exactly three answer values are
# present and that they sort in this order -- confirm against the data.
ax.set_yticklabels(["Beginner", "Intermediate", "Advanced"])
Out[15]:
In [17]:
# Count the answers to 'Presentation style', ordered by answer value, and
# show them as a horizontal bar chart.
fig, ax = plt.subplots()
style_df = (
    df_meta['Presentation style']
    .value_counts(sort=False)
    .sort_index()
)
style_plot = style_df.plot.barh(ax=ax, figsize=(8, 4), fontsize=20)
ax.set_title("Session format", size=20)
# NOTE(review): the fixed labels assume exactly five answer values are
# present and that they sort in this order -- confirm against the data.
ax.set_yticklabels([
    "100% presentation / 0% hackathon",
    "75% presentation / 25% hackathon",
    "50% presentation / 50% hackathon",
    "25% presentation / 75% hackathon",
    "0% presentation / 100% hackathon",
])
Out[17]:
In [ ]: