In [1]:
%matplotlib inline
import requests
from collections import Counter, OrderedDict, defaultdict
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.stats as ss
import matplotlib_venn as venn
from altair import Chart, Color, Column, Row, Text
from IPython.display import Markdown, display
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
import itertools
import datetime
plt.rcParams["figure.figsize"] = (15, 8)
nltk.download('punkt')
nltk.download('stopwords')
Out[1]:
In [2]:
def printmd(text):
    """Render a Markdown-formatted string in the notebook output.

    Uses IPython's ``display`` so headings/bullets render as rich
    Markdown instead of plain ``print`` text.

    The parameter was renamed from ``string`` to ``text`` because it
    shadowed the stdlib ``string`` module imported at the top of this
    notebook; every call site passes the argument positionally, so the
    rename is backward-compatible in practice.
    """
    display(Markdown(text))
In [3]:
"""Get the survey responses as a simple csv and load it into a
pandas dataframe
"""
file_id = '19WATPacwNw4yJjTgWH7bsUNmst5fmnbADc1VvPx6cNQ'
url = "https://docs.google.com/spreadsheets/d/{file_id}/export?format=csv".format(**locals())
columns = ['Timestamp'
, 'EveningMeetups'
, 'LunchtimeMeetups'
, 'BreakfastMeetups'
, 'LocalConferences'
, 'NatIntConferences'
, 'MOOCSOnlineCourses'
, 'NumEventsAttended'
, 'ReasonForParticipation'
, 'LanguageFrameworkFocused'
, 'TechnologyFocused'
, 'DisciplineJobFocused'
, 'CompanyFocused'
, 'LearningGroupFocused'
, 'LiveTechTalksFocused'
, 'WatchRecordedTalks'
, 'MarketIndustryFocused'
, 'ExpertInterviewsFocused'
, 'NetworkingFocused'
, 'CareerAdviceFocused'
, 'MagicWand']
survey_df = pd.read_csv(url, names=columns, header=0)
In [4]:
printmd("# Responses to date *{}*\n\n # {}".format(datetime.date.today().strftime("%B %d, %Y"), survey_df.shape[0]))
In [5]:
"""Here's what our dataframe looks like
"""
survey_df.head()
Out[5]:
In [6]:
"""Let's group the columns that go to each answer
"""
preferred_events = ['BreakfastMeetups'
, 'LocalConferences'
, 'MOOCSOnlineCourses'
, 'EveningMeetups'
, 'LunchtimeMeetups'
, 'NatIntConferences']
preferred_focus = ['LanguageFrameworkFocused'
, 'TechnologyFocused'
, 'DisciplineJobFocused'
, 'CompanyFocused'
, 'LearningGroupFocused'
, 'LiveTechTalksFocused'
, 'WatchRecordedTalks'
, 'MarketIndustryFocused'
, 'ExpertInterviewsFocused'
, 'NetworkingFocused'
, 'CareerAdviceFocused']
In [7]:
printmd("# Preferred Event Types (lower better)")
print(survey_df[preferred_events].median().sort_values())
In [8]:
# The "reason" question allowed multiple comma-separated answers; split
# each response into one column per selected reason.
reasons_split = survey_df['ReasonForParticipation'].apply(
    lambda x: pd.Series([part.strip() for part in x.split(',')])
)
# Melt every split column — not the original hard-coded [0, 1, 2, 3],
# which silently dropped answers whenever a respondent picked more than
# four reasons — then tally how often each distinct reason appears.
reasons = pd.DataFrame(
    Counter(pd.melt(reasons_split, value_vars=list(reasons_split.columns))['value']).most_common(),
    columns=['Reason', 'Counts']
).dropna()
In [42]:
printmd("""# Reasons for Attending Events
Skill development is the most popular reason to attend an event""")
reasons
Out[42]:
In [70]:
# Horizontal bar chart of reason counts. Uses the explicit
# fig/ax interface instead of the state-machine plt.subplot(111),
# and drops the dead commented-out `ax.axis('off')` line.
fig, ax = plt.subplots()
reasons['Counts'].plot(ax=ax, kind='barh',
                       title='What are you hoping to get out of participating in events?')
# most_common() order: put the largest bar at the top.
ax.invert_yaxis()
# Label each bar with its reason text, right-aligned against the axis.
# NOTE(review): the +.5 offset places labels between integer bar centers —
# confirm this is the intended vertical placement.
for i, reason in enumerate(reasons['Reason']):
    ax.text(0, i + .5, reason, ha='right', fontsize='large')
In [75]:
survey_df['NumEventsAttended'].value_counts().plot.pie(figsize=(5,5))
Out[75]:
In [10]:
survey_df[preferred_events].describe()
Out[10]:
In [11]:
survey_df[preferred_focus].describe(include='all').T.sort_values(by=['top', 'freq'], ascending=[True, False])
Out[11]:
In [12]:
cats = ["Enjoy", "Neutral", "Don't Enjoy"]
focus_rank = pd.melt(survey_df[preferred_focus]).pivot_table(
index=['variable'],
columns=['value'],
values=['value'],
aggfunc=lambda x: x.count()
)['variable'][cats].sort_values(by=cats).fillna(0)
In [13]:
focus_rank
Out[13]:
In [14]:
# Heatmap-style text chart: rows are focus areas, columns the ranking
# value, cell background colour encodes the response count.
# NOTE(review): this uses the legacy Altair v1 Row/Column/Text encodings
# and applyColorToBackground; porting to modern Altair would require
# mark_rect with x/y/color encodings instead.
hm = Chart(pd.melt(survey_df[preferred_focus]).dropna()).mark_text(applyColorToBackground=True).encode(
    Row('variable:O', title="Event Focus"),
    Column('value:O', title="Ranking"),
    Color('count(value):O', ),
    Text(' ')
)
In [15]:
printmd("""# Event Focus Prefrences
So it comes as no surprise that, other than coming to see a celebrity,
the most preferred focuses relate to learning skills""")
hm
In [60]:
# One ranking histogram per event type on a 2x3 grid; the trailing `;`
# suppresses the Axes-array repr.
survey_df[preferred_events].hist(figsize=(10, 5), layout=(2, 3))
plt.suptitle("Event Enjoyment by Total for each Ranking low number equals most preferred");
In [27]:
# Side-by-side boxplots of every ranking column, with callouts for the
# widest and narrowest spreads.
# NOTE(review): the annotate xy/xytext coordinates are hand-tuned to the
# current responses; new submissions may shift the boxes so the
# "most/least polarizing" arrows point at the wrong columns — re-check.
plt.subplots(figsize=(15, 8))
plt.suptitle("""Summary Boxplots for Events by Enjoyment""")
plt.annotate('most\npolarizing', xy=(3,5), xytext=(2,4.1),
             arrowprops=dict(facecolor='blue', shrink=0.05))
plt.annotate('least\npolarizing', xy=(5,3), xytext=(5.5,2.2),
             arrowprops=dict(facecolor='blue', shrink=0.05))
survey_df.boxplot(rot=20);
In [18]:
# Histogram of BreakfastMeetups rankings with a frequency-polygon overlay.
breakfast = survey_df['BreakfastMeetups'].dropna()
ax = breakfast.hist()
counts, bin_edges = np.histogram(
    breakfast, bins=6, range=(breakfast.min(), breakfast.max())
)
# Midpoint of each bin, so the red line passes through the bar centres.
midpoints = (bin_edges[:-1] + bin_edges[1:]) / 2
ax.plot(midpoints, counts, 'r-')
plt.suptitle("""Breakfast Meetups Enjoyed \n(1 high - 6 low)""");
In [19]:
def get_event_count_by_scores(df: pd.DataFrame, cols: list, scores: list) -> Counter:
    """Count, per column, how many responses fall within the given scores.

    Bug fix: the original ignored its ``cols`` argument and always
    iterated the global ``preferred_events``; it also obscured the logic
    with side-effect-only nested list comprehensions and the pointless
    ``(event, score)[0]`` tuple indexing.

    Parameters
    ----------
    df : pd.DataFrame
        Survey responses with one numeric ranking column per event type.
    cols : list
        Names of the columns to tally.
    scores : list
        Ranking values to count, e.g. ``[1, 2]`` for "liked".

    Returns
    -------
    Counter
        Maps column name -> number of responses whose value is in
        ``scores``. Columns with zero matches get no key, matching the
        original behavior.
    """
    counter = Counter()
    for col in cols:
        # update() with a generator adds one occurrence of `col` per
        # matching response; no matches -> the key is never created.
        counter.update(col for score in df[col] if score in scores)
    return counter
In [20]:
# NOTE(review): these initializations are dead code — the next cell
# immediately rebinds all three names from get_event_count_by_scores.
# Presumably kept so later cells don't NameError if run out of order;
# confirm and consider deleting this cell.
liked = Counter()
neutral = Counter()
unliked = Counter()
In [21]:
# Bucket responses by sentiment on the ranking scale:
# 1-2 = liked, 3-4 = neutral, 5-6 = disliked.
liked = get_event_count_by_scores(survey_df, preferred_events, [1,2])
adjliked = liked.copy()
neutral = get_event_count_by_scores(survey_df, preferred_events, [3,4])
unliked = get_event_count_by_scores(survey_df, preferred_events, [5,6])
# In-place Counter arithmetic: adjusted score = high count minus low count.
adjliked.subtract(unliked)
In [22]:
printmd("# Ranking of Events by Total High Scores")
[printmd("* {} {}".format(k,v)) for k,v in liked.most_common()]
printmd("# Ranking of Events by Total Neutral Scores")
[printmd("* {} {}".format(k,v)) for k,v in neutral.most_common()]
printmd("# Ranking of Events by Total Low Scores")
[printmd("* {} {}".format(k,v)) for k,v in unliked.most_common()];
In [34]:
# Bar chart of net enjoyment per event: high-score count minus low-score
# count, sorted so the best-liked event types stand out.
adj_highs = pd.DataFrame.from_dict(dict(adjliked), orient='index')
adj_highs.index.name = 'event'
adj_highs.columns = ['score']
title = """Events ranked by how much people like to attend
(score is high scores adjusted by low scores)
"""
ax = adj_highs.sort_values(by='score').T.plot.barh(title=title, figsize=(15, 10))
ax.set_yticklabels([])  # single transposed row, so the tick label is noise
ax.legend(title='Event Type', loc='upper left')
ax.set_ylabel("Ranked Enjoyment")
ax.set_xlabel("Adjusted Score");
In [32]:
# Normalize the free-text "magic wand" answers: drop blanks, trim
# whitespace, lower-case. Vectorized .str chain instead of a Python
# list comprehension over the Series.
responses = survey_df['MagicWand'].dropna().str.strip().str.lower().tolist()
responses
Out[32]:
In [33]:
"""Working with strings in vanilla python"""
words = Counter()
[words.update(e.split()) for e in responses];
[word for word in words.most_common() if word[1] > 1]
Out[33]:
In [26]:
"""Using some of NLTK to process strings
"""
stop = stopwords.words('english') + [ p for p in string.punctuation]
filtered = [[word for word in e.split() if word not in stop] for e in responses]
words = list(itertools.chain.from_iterable(filtered))
[word for word in Counter(words).most_common() if word[1] > 1]
Out[26]:
In [78]:
from wordcloud import WordCloud
In [79]:
WordCloud().generate(words)
In [88]:
# Build the cloud from all filtered tokens and show it without axes; join
# accepts any iterable, so the intermediate list() is unnecessary.
text = " ".join(itertools.chain.from_iterable(filtered))
wordcloud = WordCloud(background_color='white').generate(text)
plt.imshow(wordcloud)
plt.axis("off");
In [ ]: