In [1]:
%matplotlib inline
import requests
from collections import Counter, OrderedDict, defaultdict
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.stats as ss
import matplotlib_venn as venn
from altair import Chart, Color, Column, Row, Text
from IPython.display import Markdown, display
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
import itertools
import datetime
plt.rcParams["figure.figsize"] = (15, 8)
nltk.download('punkt')
nltk.download('stopwords')
Out[1]:
In [2]:
def printmd(text):
    """Render a Markdown-formatted string in the notebook output.

    Uses IPython's ``display`` so headings/bullets render as rich
    Markdown instead of plain ``print`` text.

    The parameter was renamed from ``string`` to ``text`` because it
    shadowed the stdlib ``string`` module imported at the top of this
    notebook; every call site passes the argument positionally, so the
    rename is backward-compatible in practice.
    """
    display(Markdown(text))
In [3]:
"""Get the survey responses as a simple csv and load it into a
pandas dataframe
"""
file_id = '19WATPacwNw4yJjTgWH7bsUNmst5fmnbADc1VvPx6cNQ'
url = "https://docs.google.com/spreadsheets/d/{file_id}/export?format=csv".format(**locals())
columns = ['Timestamp'
, 'EveningMeetups'
, 'LunchtimeMeetups'
, 'BreakfastMeetups'
, 'LocalConferences'
, 'NatIntConferences'
, 'MOOCSOnlineCourses'
, 'NumEventsAttended'
, 'ReasonForParticipation'
, 'LanguageFrameworkFocused'
, 'TechnologyFocused'
, 'DisciplineJobFocused'
, 'CompanyFocused'
, 'LearningGroupFocused'
, 'LiveTechTalksFocused'
, 'WatchRecordedTalks'
, 'MarketIndustryFocused'
, 'ExpertInterviewsFocused'
, 'NetworkingFocused'
, 'CareerAdviceFocused'
, 'MagicWand']
survey_df = pd.read_csv(url, names=columns, header=0)
In [4]:
printmd("# Responses to date *{}*\n\n # {}".format(datetime.date.today().strftime("%B %d, %Y"), survey_df.shape[0]))
In [5]:
"""Here's what our dataframe looks like
"""
survey_df.head()
Out[5]:
In [6]:
"""Let's group the columns that go to each answer
"""
preferred_events = ['BreakfastMeetups'
, 'LocalConferences'
, 'MOOCSOnlineCourses'
, 'EveningMeetups'
, 'LunchtimeMeetups'
, 'NatIntConferences']
preferred_focus = ['LanguageFrameworkFocused'
, 'TechnologyFocused'
, 'DisciplineJobFocused'
, 'CompanyFocused'
, 'LearningGroupFocused'
, 'LiveTechTalksFocused'
, 'WatchRecordedTalks'
, 'MarketIndustryFocused'
, 'ExpertInterviewsFocused'
, 'NetworkingFocused'
, 'CareerAdviceFocused']
In [7]:
printmd("# Preferred Event Types (lower better)")
print(survey_df[preferred_events].median().sort_values())
In [8]:
# The "reason" question allowed multiple comma-separated answers; split
# each response into one column per selected reason.
reasons_split = survey_df['ReasonForParticipation'].apply(
    lambda x: pd.Series([part.strip() for part in x.split(',')])
)
# Melt every split column — not the original hard-coded [0, 1, 2, 3],
# which silently dropped answers whenever a respondent picked more than
# four reasons — then tally how often each distinct reason appears.
reasons = pd.DataFrame(
    Counter(pd.melt(reasons_split, value_vars=list(reasons_split.columns))['value']).most_common(),
    columns=['Reason', 'Counts']
).dropna()
In [42]:
printmd("""# Reasons for Attending Events
Skill development is the most popular reason to attend an event""")
reasons
Out[42]:
In [70]:
# Horizontal bar chart of reason counts. Uses the explicit
# fig/ax interface instead of the state-machine plt.subplot(111),
# and drops the dead commented-out `ax.axis('off')` line.
fig, ax = plt.subplots()
reasons['Counts'].plot(ax=ax, kind='barh',
                       title='What are you hoping to get out of participating in events?')
# most_common() order: put the largest bar at the top.
ax.invert_yaxis()
# Label each bar with its reason text, right-aligned against the axis.
# NOTE(review): the +.5 offset places labels between integer bar centers —
# confirm this is the intended vertical placement.
for i, reason in enumerate(reasons['Reason']):
    ax.text(0, i + .5, reason, ha='right', fontsize='large')
In [75]:
survey_df['NumEventsAttended'].value_counts().plot.pie(figsize=(5,5))
Out[75]:
In [10]:
survey_df[preferred_events].describe()
Out[10]:
In [11]:
survey_df[preferred_focus].describe(include='all').T.sort_values(by=['top', 'freq'], ascending=[True, False])
Out[11]:
In [12]:
cats = ["Enjoy", "Neutral", "Don't Enjoy"]
focus_rank = pd.melt(survey_df[preferred_focus]).pivot_table(
index=['variable'],
columns=['value'],
values=['value'],
aggfunc=lambda x: x.count()
)['variable'][cats].sort_values(by=cats).fillna(0)
In [13]:
focus_rank
Out[13]:
In [14]:
# Heatmap-style text chart: rows are focus areas, columns the ranking
# value, cell background colour encodes the response count.
# NOTE(review): this uses the legacy Altair v1 Row/Column/Text encodings
# and applyColorToBackground; porting to modern Altair would require
# mark_rect with x/y/color encodings instead.
hm = Chart(pd.melt(survey_df[preferred_focus]).dropna()).mark_text(applyColorToBackground=True).encode(
    Row('variable:O', title="Event Focus"),
    Column('value:O', title="Ranking"),
    Color('count(value):O', ),
    Text(' ')
)
In [15]:
printmd("""# Event Focus Prefrences
So it comes as no surprise that, other than coming to see a celebrity,
the most preferred focuses relate to learning skills""")
hm
In [60]:
# One ranking histogram per event type on a 2x3 grid; the trailing `;`
# suppresses the Axes-array repr.
survey_df[preferred_events].hist(figsize=(10, 5), layout=(2, 3))
plt.suptitle("Event Enjoyment by Total for each Ranking low number equals most preferred");
In [27]:
# Side-by-side boxplots of every ranking column, with callouts for the
# widest and narrowest spreads.
# NOTE(review): the annotate xy/xytext coordinates are hand-tuned to the
# current responses; new submissions may shift the boxes so the
# "most/least polarizing" arrows point at the wrong columns — re-check.
plt.subplots(figsize=(15, 8))
plt.suptitle("""Summary Boxplots for Events by Enjoyment""")
plt.annotate('most\npolarizing', xy=(3,5), xytext=(2,4.1),
             arrowprops=dict(facecolor='blue', shrink=0.05))
plt.annotate('least\npolarizing', xy=(5,3), xytext=(5.5,2.2),
             arrowprops=dict(facecolor='blue', shrink=0.05))
survey_df.boxplot(rot=20);
In [18]:
# Histogram of BreakfastMeetups rankings with a frequency-polygon overlay.
breakfast = survey_df['BreakfastMeetups'].dropna()
ax = breakfast.hist()
counts, bin_edges = np.histogram(
    breakfast, bins=6, range=(breakfast.min(), breakfast.max())
)
# Midpoint of each bin, so the red line passes through the bar centres.
midpoints = (bin_edges[:-1] + bin_edges[1:]) / 2
ax.plot(midpoints, counts, 'r-')
plt.suptitle("""Breakfast Meetups Enjoyed \n(1 high - 6 low)""");
In [19]:
def get_event_count_by_scores(df: pd.DataFrame, cols: list, scores: list) -> Counter:
    """Count, per column, how many responses fall within the given scores.

    Bug fix: the original ignored its ``cols`` argument and always
    iterated the global ``preferred_events``; it also obscured the logic
    with side-effect-only nested list comprehensions and the pointless
    ``(event, score)[0]`` tuple indexing.

    Parameters
    ----------
    df : pd.DataFrame
        Survey responses with one numeric ranking column per event type.
    cols : list
        Names of the columns to tally.
    scores : list
        Ranking values to count, e.g. ``[1, 2]`` for "liked".

    Returns
    -------
    Counter
        Maps column name -> number of responses whose value is in
        ``scores``. Columns with zero matches get no key, matching the
        original behavior.
    """
    counter = Counter()
    for col in cols:
        # update() with a generator adds one occurrence of `col` per
        # matching response; no matches -> the key is never created.
        counter.update(col for score in df[col] if score in scores)
    return counter
In [20]:
# NOTE(review): these initializations are dead code — the next cell
# immediately rebinds all three names from get_event_count_by_scores.
# Presumably kept so later cells don't NameError if run out of order;
# confirm and consider deleting this cell.
liked = Counter()
neutral = Counter()
unliked = Counter()
In [21]:
# Bucket responses by sentiment on the ranking scale:
# 1-2 = liked, 3-4 = neutral, 5-6 = disliked.
liked = get_event_count_by_scores(survey_df, preferred_events, [1,2])
adjliked = liked.copy()
neutral = get_event_count_by_scores(survey_df, preferred_events, [3,4])
unliked = get_event_count_by_scores(survey_df, preferred_events, [5,6])
# In-place Counter arithmetic: adjusted score = high count minus low count.
adjliked.subtract(unliked)
In [22]:
printmd("# Ranking of Events by Total High Scores")
[printmd("* {} {}".format(k,v)) for k,v in liked.most_common()]
printmd("# Ranking of Events by Total Neutral Scores")
[printmd("* {} {}".format(k,v)) for k,v in neutral.most_common()]
printmd("# Ranking of Events by Total Low Scores")
[printmd("* {} {}".format(k,v)) for k,v in unliked.most_common()];
In [34]:
# Bar chart of net enjoyment per event: high-score count minus low-score
# count, sorted so the best-liked event types stand out.
adj_highs = pd.DataFrame.from_dict(dict(adjliked), orient='index')
adj_highs.index.name = 'event'
adj_highs.columns = ['score']
title = """Events ranked by how much people like to attend
(score is high scores adjusted by low scores)
"""
ax = adj_highs.sort_values(by='score').T.plot.barh(title=title, figsize=(15, 10))
ax.set_yticklabels([])  # single transposed row, so the tick label is noise
ax.legend(title='Event Type', loc='upper left')
ax.set_ylabel("Ranked Enjoyment")
ax.set_xlabel("Adjusted Score");
In [32]:
# Normalize the free-text "magic wand" answers: drop blanks, trim
# whitespace, lower-case. Vectorized .str chain instead of a Python
# list comprehension over the Series.
responses = survey_df['MagicWand'].dropna().str.strip().str.lower().tolist()
responses
Out[32]:
In [33]:
"""Working with strings in vanilla python"""
words = Counter()
[words.update(e.split()) for e in responses];
[word for word in words.most_common() if word[1] > 1]
Out[33]:
In [26]:
"""Using some of NLTK to process strings
"""
stop = stopwords.words('english') + [ p for p in string.punctuation]
filtered = [[word for word in e.split() if word not in stop] for e in responses]
words = list(itertools.chain.from_iterable(filtered))
[word for word in Counter(words).most_common() if word[1] > 1]
Out[26]:
In [78]:
from wordcloud import WordCloud
In [79]:
WordCloud().generate(words)
In [88]:
# Build the cloud from all filtered tokens and show it without axes; join
# accepts any iterable, so the intermediate list() is unnecessary.
text = " ".join(itertools.chain.from_iterable(filtered))
wordcloud = WordCloud(background_color='white').generate(text)
plt.imshow(wordcloud)
plt.axis("off");
In [ ]: