In [1]:
%matplotlib inline

import requests
from collections import Counter, OrderedDict, defaultdict
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.stats as ss
import matplotlib_venn as venn
from altair import Chart, Color, Column, Row, Text
from IPython.display import Markdown, display
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
import itertools
import datetime

plt.rcParams["figure.figsize"] = (15, 8)
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[1]:
True

In [2]:
def printmd(string):
    """This leverages Jupyter's display to print out Markdown print statements"""
    display(Markdown(string))

In [3]:
"""Get the survey responses as a simple csv and load it into a 
pandas dataframe
"""
file_id = '19WATPacwNw4yJjTgWH7bsUNmst5fmnbADc1VvPx6cNQ'
url = "https://docs.google.com/spreadsheets/d/{file_id}/export?format=csv".format(**locals())
columns = ['Timestamp' 
           , 'EveningMeetups'
           , 'LunchtimeMeetups'
           , 'BreakfastMeetups'
           , 'LocalConferences'
           , 'NatIntConferences'
           , 'MOOCSOnlineCourses'
           , 'NumEventsAttended'
           , 'ReasonForParticipation'
           , 'LanguageFrameworkFocused'
           , 'TechnologyFocused'
           , 'DisciplineJobFocused'
           , 'CompanyFocused'
           , 'LearningGroupFocused'
           , 'LiveTechTalksFocused'
           , 'WatchRecordedTalks'
           , 'MarketIndustryFocused'
           , 'ExpertInterviewsFocused'
           , 'NetworkingFocused'
           , 'CareerAdviceFocused'
           , 'MagicWand']
survey_df = pd.read_csv(url, names=columns, header=0)

In [4]:
printmd("# Responses to date *{}*\n\n # {}".format(datetime.date.today().strftime("%B %d, %Y"), survey_df.shape[0]))


Responses to date December 12, 2016

47


In [5]:
"""Here's what our dataframe looks like
"""
survey_df.head()


Out[5]:
Timestamp EveningMeetups LunchtimeMeetups BreakfastMeetups LocalConferences NatIntConferences MOOCSOnlineCourses NumEventsAttended ReasonForParticipation LanguageFrameworkFocused ... DisciplineJobFocused CompanyFocused LearningGroupFocused LiveTechTalksFocused WatchRecordedTalks MarketIndustryFocused ExpertInterviewsFocused NetworkingFocused CareerAdviceFocused MagicWand
0 11/19/2016 15:03:49 4 2 5.0 6 3.0 1 2 to 5 Develop/Improve technical skills, Understand i... Enjoy ... Enjoy Don't Enjoy Neutral Neutral Enjoy Neutral Enjoy Enjoy Neutral Make people less scared to show up and start t...
1 11/19/2016 15:42:58 3 4 5.0 2 1.0 6 10+ Develop/Improve technical skills, Understand i... Don't Enjoy ... Enjoy Don't Enjoy Enjoy Enjoy Neutral Enjoy Enjoy Enjoy Enjoy More diversity
2 11/19/2016 16:03:33 2 3 4.0 1 5.0 6 2 to 5 Develop/Improve technical skills, Network & Jo... Neutral ... Enjoy Neutral Enjoy Enjoy Neutral Enjoy Enjoy Enjoy Enjoy More inviting to people with less prior knowledge
3 11/19/2016 16:35:21 4 2 3.0 6 1.0 5 2 to 5 Develop/Improve technical skills, Understand i... Enjoy ... Enjoy Enjoy Neutral Enjoy Neutral Enjoy Enjoy Neutral Neutral Local c# sharp community
4 11/20/2016 9:19:32 5 1 4.0 2 6.0 3 2 to 5 Develop/Improve technical skills, Understand i... Neutral ... Neutral Neutral Enjoy NaN Neutral Enjoy Enjoy Neutral Neutral More collaboration.

5 rows × 21 columns


In [6]:
"""Let's group the columns that go to each answer
"""
preferred_events = ['BreakfastMeetups'
                    , 'LocalConferences'
                    , 'MOOCSOnlineCourses'
                    , 'EveningMeetups'
                    , 'LunchtimeMeetups'
                    , 'NatIntConferences']
preferred_focus = ['LanguageFrameworkFocused'
                   , 'TechnologyFocused'
                   , 'DisciplineJobFocused'
                   , 'CompanyFocused'
                   , 'LearningGroupFocused'
                   , 'LiveTechTalksFocused'
                   , 'WatchRecordedTalks'
                   , 'MarketIndustryFocused'
                   , 'ExpertInterviewsFocused'
                   , 'NetworkingFocused'
                   , 'CareerAdviceFocused']

In [7]:
printmd("# Preferred Event Types (lower better)")
print(survey_df[preferred_events].median().sort_values())


Preferred Event Types (lower better)

LocalConferences      3.0
EveningMeetups        3.0
NatIntConferences     3.0
MOOCSOnlineCourses    4.0
LunchtimeMeetups      4.0
BreakfastMeetups      5.0
dtype: float64

In [8]:
reasons_split = survey_df['ReasonForParticipation'].apply(
    lambda x: pd.Series([i.strip() for i in x.split(',')])
)
reasons = pd.DataFrame(Counter(pd.melt(reasons_split, value_vars=[0,1,2,3])['value']).most_common(),
             columns=['Reason', 'Counts']).dropna()

In [42]:
printmd("""# Reasons for Attending Events
Skill development is the most popular reason to attend an event""")
reasons


Reasons for Attending Events

Skill development is the most popular reason to attend an event

Out[42]:
Reason Counts
1 Develop/Improve technical skills 43
2 Connection to my community 32
3 Understand industry trends 30
4 Network & Job hunt 25
5 Test 1

In [70]:
ax = plt.subplot(111)
reasons['Counts'].plot(ax=ax, kind='barh', title='What are you hoping to get out of participating in events?')
# ax.axis('off')
ax.invert_yaxis()
for i, x in enumerate(reasons['Reason']):
    ax.text(0, i + .5, x, ha='right', fontsize='large')



In [75]:
survey_df['NumEventsAttended'].value_counts().plot.pie(figsize=(5,5))


Out[75]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc312b919b0>

In [10]:
survey_df[preferred_events].describe()


Out[10]:
BreakfastMeetups LocalConferences MOOCSOnlineCourses EveningMeetups LunchtimeMeetups NatIntConferences
count 45.000000 47.000000 47.000000 47.000000 47.000000 46.000000
mean 3.844444 3.085106 3.893617 3.510638 3.553191 3.173913
std 1.976631 1.779470 1.658103 1.599896 1.585375 1.553553
min 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
25% 2.000000 2.000000 3.000000 2.000000 2.000000 2.000000
50% 5.000000 3.000000 4.000000 3.000000 4.000000 3.000000
75% 6.000000 4.500000 5.000000 5.000000 5.000000 4.000000
max 6.000000 6.000000 6.000000 6.000000 6.000000 6.000000

In [11]:
survey_df[preferred_focus].describe(include='all').T.sort_values(by=['top', 'freq'], ascending=[True, False])


Out[11]:
count unique top freq
DisciplineJobFocused 47 2 Enjoy 41
TechnologyFocused 46 2 Enjoy 40
LiveTechTalksFocused 46 2 Enjoy 40
ExpertInterviewsFocused 47 2 Enjoy 39
LanguageFrameworkFocused 47 3 Enjoy 31
NetworkingFocused 47 3 Enjoy 30
MarketIndustryFocused 47 3 Enjoy 26
LearningGroupFocused 46 3 Enjoy 22
CareerAdviceFocused 47 3 Enjoy 19
CompanyFocused 45 3 Neutral 25
WatchRecordedTalks 46 3 Neutral 19

In [12]:
cats = ["Enjoy", "Neutral", "Don't Enjoy"]
focus_rank = pd.melt(survey_df[preferred_focus]).pivot_table(
    index=['variable'], 
    columns=['value'], 
    values=['value'],
    aggfunc=lambda x: x.count()
)['variable'][cats].sort_values(by=cats).fillna(0)

In [13]:
focus_rank


Out[13]:
value Enjoy Neutral Don't Enjoy
variable
CompanyFocused 13.0 25.0 7.0
WatchRecordedTalks 16.0 19.0 11.0
CareerAdviceFocused 19.0 18.0 10.0
LearningGroupFocused 22.0 16.0 8.0
MarketIndustryFocused 26.0 18.0 3.0
NetworkingFocused 30.0 14.0 3.0
LanguageFrameworkFocused 31.0 12.0 4.0
ExpertInterviewsFocused 39.0 8.0 0.0
LiveTechTalksFocused 40.0 6.0 0.0
TechnologyFocused 40.0 6.0 0.0
DisciplineJobFocused 41.0 6.0 0.0

In [14]:
hm = Chart(pd.melt(survey_df[preferred_focus]).dropna()).mark_text(applyColorToBackground=True).encode(
    Row('variable:O', title="Event Focus"),
    Column('value:O', title="Ranking"),
    Color('count(value):O', ),
    Text(' ')
)

In [15]:
printmd("""# Event Focus Prefrences
So it comes as no surprise that, other than coming to see a celebrity,
the most preferred focuses relate to learning skills""")
hm


Event Focus Prefrences

So it comes as no surprise that, other than coming to see a celebrity, the most preferred focuses relate to learning skills


In [60]:
survey_df[preferred_events].hist(figsize=(10, 5), layout=(2, 3))
plt.suptitle("Event Enjoyment by Total for each Ranking low number equals most preferred");



In [27]:
plt.subplots(figsize=(15, 8))
plt.suptitle("""Summary Boxplots for Events by Enjoyment""")
plt.annotate('most\npolarizing', xy=(3,5), xytext=(2,4.1), 
             arrowprops=dict(facecolor='blue', shrink=0.05))
plt.annotate('least\npolarizing', xy=(5,3), xytext=(5.5,2.2),
             arrowprops=dict(facecolor='blue', shrink=0.05))
survey_df.boxplot(rot=20);



In [18]:
breakfast = survey_df['BreakfastMeetups'].dropna()
plot = breakfast.hist()
counts, bins = np.histogram(breakfast, bins=6,
                            range=(breakfast.min(),breakfast.max()))
bincenters = 0.5*(bins[1:]+bins[:-1])
plot.plot(bincenters, counts, 'r-', )
plt.suptitle("""Breakfast Meetups Enjoyed \n(1 high - 6 low)""");



In [19]:
def get_event_count_by_scores(df: pd.DataFrame, cols: list, scores: list):
    counter = Counter()
    [
        [
            counter.update([(event, score)[0]]) 
            for score in df[event] if score in scores
        ] 
        for event in preferred_events
    ]
    return counter

In [20]:
liked = Counter()
neutral = Counter()
unliked = Counter()

In [21]:
liked = get_event_count_by_scores(survey_df, preferred_events, [1,2])
adjliked = liked.copy()
neutral = get_event_count_by_scores(survey_df, preferred_events, [3,4])
unliked = get_event_count_by_scores(survey_df, preferred_events, [5,6])
adjliked.subtract(unliked)

In [22]:
printmd("# Ranking of Events by Total High Scores")
[printmd("*  {} {}".format(k,v)) for k,v in liked.most_common()]
printmd("# Ranking of Events by Total Neutral Scores")
[printmd("*  {} {}".format(k,v)) for k,v in neutral.most_common()]
printmd("# Ranking of Events by Total Low Scores")
[printmd("*  {} {}".format(k,v)) for k,v in unliked.most_common()];


Ranking of Events by Total High Scores

  • LocalConferences 23
  • NatIntConferences 16
  • EveningMeetups 16
  • LunchtimeMeetups 14
  • BreakfastMeetups 14
  • MOOCSOnlineCourses 9

Ranking of Events by Total Neutral Scores

  • MOOCSOnlineCourses 20
  • LunchtimeMeetups 19
  • NatIntConferences 19
  • EveningMeetups 16
  • LocalConferences 12
  • BreakfastMeetups 8

Ranking of Events by Total Low Scores

  • BreakfastMeetups 23
  • MOOCSOnlineCourses 18
  • EveningMeetups 15
  • LunchtimeMeetups 14
  • LocalConferences 12
  • NatIntConferences 11

In [34]:
adj_highs = pd.DataFrame.from_dict(dict(adjliked), orient='index')
adj_highs.index.name = 'event'
adj_highs.columns = ['score']
title = """Events ranked by how much people like to attend
(score is high scores adjusted by low scores)
"""
p = adj_highs.sort_values(by='score').T.plot.barh(title=title, figsize=(15,10))
p.set_yticklabels([])
p.legend(title='Event Type', loc='upper left')
p.set_ylabel("Ranked Enjoyment")
p.set_xlabel("Adjusted Score");



In [32]:
responses = [e.strip().lower() for e in survey_df['MagicWand'].dropna()]
responses


Out[32]:
['make people less scared to show up and start talking to the person next to them.',
 'more diversity',
 'more inviting to people with less prior knowledge',
 'local c# sharp community',
 'more collaboration.',
 'more community involvement - both from individuals and organizations',
 'have more open discussions between differing factions',
 'consolidate overall performance and make it more effective and sufficient.',
 'need more beginner level events',
 'educate everyone in utah',
 'test',
 'have more lightning talks. do more hands on learning. more hackathons.',
 'increase sharing',
 'raise the salary of utah tech employees.',
 'more national-level conferences in the state',
 'have a lot of local, focused conferences during regular business hours (m-f, 8:00am-5:00pm).',
 'get rid of provo/orem/ogden user groups and just have slc be the hub',
 "get them educated in fundamentals of math and cs. make it so an lds membership wasn't required for a job or networking with colleagues.",
 'free training on topics relevant to data analysis and data science.',
 'more events north of slc',
 'more tech related meetups in utah valley.',
 "i think we're generally behind in moving technology forward. i mean this specifically for data science and engineering. there are a few companies that are employing lambda / kappa architectures, and doing impressive things with distributed data. because of the dearth of complex data architecture there ends up being more beginner level talks than advanced. i'm happy to give a beginner level talk and attend some, but the lack of more complex talks makes it hard to support the community as much as i'd like to.",
 'more real world case study meetups',
 'more statistics related topics',
 'more hackathons',
 'close study groups on shared technical topics of interest.',
 'with magic? be able to travel faster or have all meetups be closer to my house. other than that? as a meetup organizer, i ask for ideas and nobody provides any, so few people attend the events i plan. i wish i knew what people would show up for.',
 "stop pretending we are silicon valley. we're not, and that's okay. we can be a different good.",
 'more launch events for tech products',
 'more hackathons/hacknights.',
 'more participation - of which i am guilty of not participating enough :-(']

In [33]:
"""Working with strings in vanilla python"""
words = Counter()
[words.update(e.split()) for e in responses];
[word for word in words.most_common() if word[1] > 1]


Out[33]:
[('more', 21),
 ('and', 12),
 ('of', 11),
 ('to', 9),
 ('the', 8),
 ('i', 7),
 ('a', 6),
 ('have', 5),
 ('in', 5),
 ('data', 4),
 ('be', 4),
 ('for', 4),
 ('with', 4),
 ('people', 4),
 ('events', 4),
 ('community', 3),
 ('on', 3),
 ('as', 3),
 ('tech', 3),
 ('up', 3),
 ('it', 3),
 ('meetups', 3),
 ('topics', 3),
 ('are', 3),
 ('utah', 3),
 ('beginner', 3),
 ('level', 3),
 ('make', 3),
 ('valley.', 2),
 ('slc', 2),
 ('so', 2),
 ('complex', 2),
 ('groups', 2),
 ('related', 2),
 ("we're", 2),
 ('or', 2),
 ('talks', 2),
 ('there', 2),
 ('show', 2),
 ('conferences', 2),
 ('study', 2),
 ('few', 2),
 ('-', 2),
 ('get', 2),
 ('than', 2),
 ('we', 2),
 ('less', 2),
 ('attend', 2)]

In [26]:
"""Using some of NLTK to process strings
"""
stop = stopwords.words('english') + [ p for p in string.punctuation]
filtered = [[word for word in e.split() if word not in stop] for e in responses]
words = list(itertools.chain.from_iterable(filtered))
[word for word in Counter(words).most_common() if word[1] > 1]


Out[26]:
[('data', 4),
 ('people', 4),
 ('events', 4),
 ('community', 3),
 ('beginner', 3),
 ('meetups', 3),
 ('topics', 3),
 ('utah', 3),
 ('level', 3),
 ('make', 3),
 ('tech', 3),
 ('valley.', 2),
 ('complex', 2),
 ('conferences', 2),
 ('groups', 2),
 ('related', 2),
 ('slc', 2),
 ("we're", 2),
 ('talks', 2),
 ('show', 2),
 ('study', 2),
 ('get', 2),
 ('less', 2),
 ('attend', 2)]

In [78]:
from wordcloud import WordCloud

In [79]:
WordCloud().generate(words)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-79-66ac5a6cd39b> in <module>()
----> 1 WordCloud().generate(words)

/opt/conda/lib/python3.5/site-packages/wordcloud/wordcloud.py in generate(self, text)
    448         self
    449         """
--> 450         return self.generate_from_text(text)
    451 
    452     def _check_generated(self):

/opt/conda/lib/python3.5/site-packages/wordcloud/wordcloud.py in generate_from_text(self, text)
    433         self
    434         """
--> 435         words = self.process_text(text)
    436         self.generate_from_frequencies(words)
    437         return self

/opt/conda/lib/python3.5/site-packages/wordcloud/wordcloud.py in process_text(self, text)
    386         flags = (re.UNICODE if sys.version < '3' and type(text) is unicode
    387                  else 0)
--> 388         for word in re.findall(r"\w[\w']+", text, flags=flags):
    389             if word.isdigit():
    390                 continue

/opt/conda/lib/python3.5/re.py in findall(pattern, string, flags)
    211 
    212     Empty matches are included in the result."""
--> 213     return _compile(pattern, flags).findall(string)
    214 
    215 def finditer(pattern, string, flags=0):

TypeError: expected string or bytes-like object

In [88]:
wordcloud = WordCloud(background_color='white').generate(" ".join(list(itertools.chain.from_iterable(filtered))))
plt.imshow(wordcloud)
plt.axis("off");