In [178]:
import json
import codecs
import math
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#from pylab import *
import seaborn as sns
from IPython.display import display, HTML 
%matplotlib inline

#from mpltools import style
#from mpltools import layout
#style.use('ggplot')

#load proposals
proposals = json.loads(codecs.open(os.path.join("data","proposals.json"),"r").read())

In [18]:
sessions_df = pd.DataFrame(proposals['sessions'])
sessions_df.head(2)


Out[18]:
agenda facilitators goals organization outcomes scale theme themeSlug timestamp title
0 #Hackers techniques: Social Engineering , Tool... [{u'twitter': u'@BIG_MIGGY', u'name': u'Mohamm... They will be well known about the latest hacks... Mozilla Jordan community People will be aware of what happens around th... I will give them everything i have, everyone m... Policy & Advocacy policy 2014-09-11T12:16:43.978Z Don't Hack Me
1 I'd showcase an application built using the st... [{u'twitter': u'@tpiros', u'name': u'Tamas Pir... The session would discuss the usefulness of th... I am the owner of the MEAN Stack meetup group ... The biggest challenge would be to make sure th... Build and Teach the Web teach 2014-09-11T12:16:43.978Z Be Smarter, Get MEAN

In [19]:
themes_df = pd.DataFrame(proposals['themes'])
themes_df.head()


Out[19]:
description name slug totalProposals
0 Keep the web wild through hands-on making with... Build and Teach the Web teach 135
1 Escape the limitations of your computer and bu... Open Web With Things physical 44
2 Explore opportunities in the booming world of ... Web in Your Pocket mobile 26
3 Design next-generation web solutions to solve ... Source Code for Journalism journalism 42
4 Examine the potential of the open web to re-de... Open Science and the Web science 45

How Many Proposals Did MozFest 2014 Receive?


In [8]:
display(HTML("<p>This year, the Mozilla Festival received {0} proposals in {1} areas.</p>".format(len(proposals['sessions']), len(proposals['themes']))))


This year, the Mozilla Festival received 578 proposals in 11 areas.


In [58]:
themes_df.sort('totalProposals',ascending=True).plot('name','totalProposals',
                                                     kind='barh',
                                                     title='Number of proposals per Topic, #MozFest 2014')


Out[58]:
<matplotlib.axes.AxesSubplot at 0x10eecfe50>

How Many Organizations Proposed Sessions at MozFest?


In [68]:
sessions_df.head()
sessions_df = sessions_df.replace(to_replace="Open Knowledge/ School of Data",value="Open Knowledge")
sessions_df[sessions_df['organization'] == "Open Knowledge"]


Out[68]:
agenda facilitators goals organization outcomes scale theme themeSlug timestamp title
481 We need 2 hours to ensure people have sufficie... [{u'twitter': u'@milena_iul', u'name': u'Milen... Spreadsheets can be a your best friend: they c... Open Knowledge We are looking to document the session in the ... We will break people down in small groups by t... Open Science and the Web science 2014-09-11T12:16:44.039Z Become a spreadsheet pro
483 We’ll split the crowd in small groups that wil... [{u'twitter': u'@beatricemartini', u'name': u'... Digital and open communities have taken multip... Open Knowledge After the session, we will collect all the ide... The overall idea is to have small groups of pe... (Community) Policy & Advocacy NaN 2014-09-11T12:16:44.039Z Join! Types of diversity and inclusion
493 We need 2 hours to ensure participants have su... [{u'twitter': u'@miena_iul', u'name': u'Milena... We’ve all heard of “horror stories” where data... Open Knowledge We are looking to document the session in the ... We plan to work in small groups of 5-6 partici... Open Science and the Web science 2014-09-11T12:16:44.040Z Dealing with messy data
554 Active session - \nA station of ideas to prom... [{u'twitter': u'@morchickit', u'name': u'Mor R... This is the third year we run the Global Open ... Open Knowledge We will post blog post about the session. Also... With 5-10 participants we can create a more in... Open Data data 2014-09-11T12:16:44.048Z Help us measure Open Gov Data

In [88]:
sessions_gb = sessions_df.groupby('organization')
org_count = sessions_gb.aggregate({'organization':len}).sort('organization',ascending=False)
org_count.head()


Out[88]:
organization
organization
102
Mozilla 28
Northwestern University Knight Lab 13
mozillian 11
Mozilla Reps 8

In [78]:
display(HTML("<p>This year, {0} organizations proposed sessions, with {1} orgs proposing more than one session.</p>".format(len(org_count)-1,len(org_count[org_count['organization'] > 1]))))


This year, 335 organizations proposed sessions, with 62 orgs proposing more than one session.


In [85]:
display(HTML("<p>{0} sessions, or {1:.2f}% of all sessions had no organization listed. </p>".format(org_count.ix['']['organization'],100*org_count.ix['']['organization']/float(len(sessions_df)))))


102 sessions, or 17.65% of all sessions had no organization listed.


In [104]:
org_count[org_count['organization'] > 2].sort('organization',ascending=True).plot(kind='barh',figsize=(3,6),title='Proposals from orgs with >2 proposals, #MozFest 2014')
plt.legend('')
plt.ylabel('')
plt.xlabel('Submissions')


Out[104]:
<matplotlib.text.Text at 0x1151d9790>

In [159]:
def _uniques(Series):
    return len(set(Series))

def _avg_count(Series):
    return np.mean(Series.apply(len))

sessions_gb_theme = sessions_df.groupby('themeSlug')
theme_count_df = sessions_gb_theme.aggregate({'organization':_uniques,'title':len,'facilitators':_avg_count})
theme_count_df


Out[159]:
organization facilitators title
themeSlug
art 38 1.773585 53
badges 17 2.833333 24
data 30 2.175000 40
hive 10 2.000000 14
journalism 33 1.880952 42
mobile 17 1.923077 26
music 13 2.117647 17
physical 37 1.954545 44
policy 42 2.491228 57
science 36 2.755556 45
teach 80 2.059259 135

In [176]:
# For some reason the colormap isn't working
ax = theme_count_df.plot(x='title',y='organization',
                    kind='scatter',
                    colormap='autumn_r',
                    c=theme_count_df['facilitators'],
                    s=20*np.e**theme_count_df['facilitators'],
                    figsize=(8,4))

for i in theme_count_df.index:
    x = theme_count_df.ix[i]['title']
    y = theme_count_df.ix[i]['organization']
    plt.annotate(
        i, 
        xy = (x, y), xytext = (-20, 20),
        textcoords = 'offset points', ha = 'right', va = 'bottom',
        bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
        arrowprops = dict(arrowstyle = '->',color='black', connectionstyle = 'arc3,rad=0'))
    
plt.xlabel('Number of Sessions Proposed per Theme')
plt.ylabel('Number of Proposing Organizations per Theme')
title('Number of Session Proposals and Proposing Organizations by Theme, #MozFest 2014\n (the color of dots and number on labels is the avg session facilitator count)')


Out[176]:
<matplotlib.text.Text at 0x1185b60d0>

In [179]:
from byline_gender import BylineGender

bg = BylineGender()
people = {}
for session in proposals['sessions']:
    org = session['organization'] #not necessarily membership, maybe collaboration
    for facilitator in session['facilitators']:
        name = facilitator['name']
        twitter = facilitator['twitter']
        if(len(name)==0 and len(twitter)>0):
            name = twitter
        if name not in people.keys():
            people[name]={"sessions":[],"orgs":[],"twitter":[],"gender":None}
        people[name]['sessions'].append(session)
        if len(org)>0 and org not in people[name]['orgs']:
            people[name]['orgs'].append(org)
        if len(twitter)>0 and twitter not in people[name]['twitter']:
            people[name]['twitter'].append(twitter)


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-179-7d1baed1357e> in <module>()
----> 1 from byline_gender import BylineGender
      2 
      3 bg = BylineGender()
      4 people = {}
      5 for session in proposals['sessions']:

/Users/brianckeegan/Dropbox/Hacking/conference-proposers/byline_gender.py in <module>()
----> 1 from gender_detector import GenderDetector
      2 import nltk
      3 import re
      4 import requests
      5 import StringIO

ImportError: No module named gender_detector

In [5]:
import unicodedata
import string

def get_org_name(org,name):
    asciiname = unicodedata.normalize('NFKD', name).encode('ascii','ignore')
    asciiname = ''.join(ch for ch in asciiname if ch not in string.punctuation)
    org = ''.join(ch for ch in org if ch not in string.punctuation)
    return (org,asciiname)


from byline_gender import BylineGender
bg = BylineGender()
people = {}
for session in proposals['sessions']:
    org = session['organization'] #not necessarily membership, maybe collaboration
    for facilitator in session['facilitators']:
        name = facilitator['name']
        twitter = facilitator['twitter']
        if(len(name)==0 and len(twitter)>0):
            name = twitter
        if name not in people.keys():
            people[name]={"sessions":[],"orgs":[],"twitter":[],"gender":None}
        people[name]['sessions'].append(session)
        if len(org)>0 and org not in people[name]['orgs']:
            people[name]['orgs'].append(org)
        if len(twitter)>0 and twitter not in people[name]['twitter']:
            people[name]['twitter'].append(twitter) 

#ONE TIME ONLY: Generate Name CSV to import to Google Spreadsheets
#TODO: SAVE ACTUAL ASCII NAMES AND ORG NAMES
#orgnames = {}
#for name in people.keys():
#    if len(name)==0:
#        continue
#    person = people[name]
#    if(len(person['orgs'])>0):
#        org = person['orgs'][0]
#    elif(len(person['twitter'])>0):
#        org = person['twitter'][0]
#    #alas, the python version of Open Gender Tracker is not unicode safe :p
#    #asciiname = unicodedata.normalize('NFKD', name).encode('ascii','ignore')
#    #asciiname = ''.join(ch for ch in asciiname if ch not in string.punctuation)
#    #org = ''.join(ch for ch in org if ch not in string.punctuation)
#    org,asciiname = get_org_name(org,name)
#    if org not in orgnames.keys():
#        orgnames[org]={}
#    if asciiname not in orgnames[org].keys():
#        orgnames[org][asciiname] = len(person['sessions'])

#f = codecs.open("mozfest_org_names.csv","w", "utf-8")
#bg.export_org_names(orgnames,f)
#f.close()

In [35]:
#GENERATE A DATASET OF GENDER PER THEME

theme_people = {}

for theme in proposals['themes']:
    slug = theme['slug']
    #filter by sessions that have the current theme
    sessions = [x for x in proposals['sessions'] if 'themeSlug' in x.keys() and x['themeSlug'] == slug]
    #initialize hash
    if slug not in theme_people.keys():
        theme_people[slug] ={"facilitators":[],
                             "inclusive":{"female":0,"male":0,"unknown":0,"total":0},
                             "unique":{"female":0,"male":0,"unknown":0,"total":0},
                             }    
    for session in sessions:
        org = ""
        if(len(session['organization'])>0):
            org = session['organization']
        for person in session['facilitators']:
            if(len(org)==0 and len(person['twitter'])>0):
                org = person['twitter']
            name = person['name']
            
            asciiname = ''.join(ch for ch in name if ch not in string.punctuation)
            org = ''.join(ch for ch in org if ch not in string.punctuation)
            
            inferred_gender = bg.org_name_gender(org,asciiname)
            if(inferred_gender != "ignore"):
                theme_people[slug]['inclusive'][inferred_gender] += 1
                theme_people[slug]['inclusive']['total']+=1
                if name not in theme_people[slug]['facilitators']:
                    theme_people[slug]['unique'][inferred_gender]+=1
                    theme_people[slug]['unique']['total']+=1
                    theme_people[slug]['facilitators'].append(name)

#generate gender specific series

def pct(a,b):
    return 100.*(float(a)/float(b))

themes = [x[0] for x in sorted([(x,theme_people[x]['inclusive']) for x in theme_people.keys()],key=lambda x: x[1],reverse=True)]
female = {"unique":[],"inclusive":[],'unique_pct':[],'inclusive_pct':[]}
male = {"unique":[],"inclusive":[],'unique_pct':[],'inclusive_pct':[]}
unknown = {"unique":[],"inclusive":[],'unique_pct':[],'inclusive_pct':[]}
unknown_bottom = {'unique':[], 'inclusive':[],'unique_pct':[],'inclusive_pct':[]}
for theme in themes:

    for k in ['unique','inclusive']:
        female[k].append(theme_people[theme][k]['female'])
        male[k].append(theme_people[theme][k]['male'])
        unknown[k].append(theme_people[theme][k]['unknown'])
        unknown_bottom[k].append(female[k][-1] + male[k][-1])

        
        female[k+"_pct"].append(pct(female[k][-1],theme_people[theme][k]['total']))
        male[k+"_pct"].append(pct(male[k][-1],theme_people[theme][k]['total']))
        unknown[k+"_pct"].append(pct(unknown[k][-1],theme_people[theme][k]['total']))
        unknown_bottom[k+'_pct'].append(female[k+"_pct"][-1] + male[k+"_pct"][-1])

In [7]:
sum([len(theme_people[x]['facilitators']) for x in theme_people.keys()])


Out[7]:
924

In [8]:
ind = np.arange(len(themes))
width = 0.4       # the width of the bars: can also be len(x) sequence  

for j in ['unique','inclusive']:
    for a in ['','_pct']:
        k = j+a
        fig = plt.figure(figsize=(14, 8))   
        ax = fig.add_subplot(111)
        ax = fig.add_subplot(111)

        #print "{0},{1},{2}".format(len(female[k]),len(male[k]),len(unknown_bottom[k]))

        p1 = ax.bar(ind, female[k],   width, color='#48C8B8')
        p2 = ax.bar(ind, male[k], width, color='#E8CA33',
                     bottom=female[k])
        p3 = ax.bar(ind, unknown[k], width, color='#cccccc',
                     bottom=unknown_bottom[k])
        plt.xticks(ind+width/2.)
        if(a =="_pct"):
            plt.yticks(np.arange(0,101,10))
            plt.ylim(ymax = 100, ymin = 0)
        ax.set_xticklabels(themes, rotation=45, fontsize=18,ha='center')

        plt.ylabel("Number of {0} session facilitators".format(j))
        plt.title("Inferred Sex of #MozFest 2014 proposed session facilitators ({0})\n".format(j), fontsize=16)
        plt.legend( (p1[0], p2[0],p3[0]), ('Female', 'Male',"Unknown"), bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0. )

        plt.show()



In [ ]:
# ADD GENDER TO SESSION HASH
for i in np.arange(0,len(proposals['sessions'])):
    proposals['sessions'][i][u'gender']={u'female':0,u'male':0,u'unknown':0,u'total':0}
    for person in proposals['sessions'][i]['facilitators']:
        name = person['name']
        org = proposals['sessions'][i]['organization']
        asciiname = ''.join(ch for ch in name if ch not in string.punctuation)
        org = ''.join(ch for ch in org if ch not in string.punctuation)
        inferred_gender = bg.org_name_gender(org,asciiname)
        if(inferred_gender != "ignore"):
            proposals['sessions'][i][u'gender'][inferred_gender]+=1
            proposals['sessions'][i][u'gender'][u'total']+=1

In [37]:
f = codecs.open("mozilla_session_gender.csv","w","utf8")
f.write(','.join(["slug","org","title","female","male","unknown","total"])+"\n")
for session in proposals['sessions']:
    if 'themeSlug' in session.keys() and 'organization' in session.keys():
        org = ''.join(ch for ch in session['organization'] if ch not in string.punctuation)
        title = ''.join(ch for ch in session['title'] if ch not in string.punctuation)
        gender = session['gender']
        f.write(','.join([session['themeSlug'],org,title,str(gender[u'female']),str(gender[u'male']),str(gender[u'unknown']),str(gender[u'total'])]) + "\n")
f.close()

In [ ]: