notebook.community

Edit and run



In [1]:

    
from __future__ import division
import pymongo, pandas, random
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt



In [2]:

    
# Connect to the local MongoDB server; every database that is not listed in
# `non_community_dbs` below is treated as one community.
connection = pymongo.MongoClient('localhost', 27017)

# `database_names()` was deprecated in PyMongo 3.6 and removed in 4.0, so
# prefer `list_database_names()` and fall back only on older drivers.
if hasattr(connection, 'list_database_names'):
    all_databases = connection.list_database_names()
else:
    all_databases = connection.database_names()

# Databases that do not hold community data. "config" is added because recent
# MongoDB servers expose it as a system database alongside "admin" and "local"
# -- NOTE(review): confirm no real community database is named "config".
non_community_dbs = {"gender", "admin", "local", "config",
                     "visualizations", "results"}

# Filter instead of calling list.remove() in a loop; order is preserved.
communities = [db for db in all_databases if db not in non_community_dbs]



In [3]:

    
# Layout of the per-community summary table: the community label, the raw
# counts, then the derived proportions.
columns = [
    'community',
    'males', 'females', 'total_ident', 'total',
    'prop_males', 'prop_females', 'prop_ident',
]

# Empty frame that already carries the columns above; rows are added later.
results = pandas.DataFrame(data=None, columns=columns)



In [4]:

    
def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN if denominator is 0."""
    if not denominator:
        return float('nan')
    return float(numerator) / denominator


# Build one summary record per community and create the frame once at the end.
# `DataFrame.append` was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration. Rebuilding `results` from scratch also makes
# this cell safe to re-run (no duplicated rows).
records = []
for community in communities:
    community_db = connection[community]['statistics']

    # Only users with at least one contribution; fetch just the needed fields.
    cursor = community_db.find({'contributions_total': {'$gt': 0}},
                               {u'_id': False, u'user_id': True, u'gender': True})

    # Naming the columns keeps 'gender' present even when the query returns no
    # documents (or documents without that field), so the comparisons below
    # cannot fail on a missing column.
    df = pandas.DataFrame(list(cursor), columns=['user_id', 'gender'])

    # Count rows, not the non-null values of whichever column happens to come
    # first (which is what `df.count()[0]` did).
    total = len(df)
    males_count = int((df['gender'] == "Male").sum())
    females_count = int((df['gender'] == "Female").sum())
    total_ident = males_count + females_count

    records.append({
        'community': community,
        'males': males_count,
        'females': females_count,
        'total': total,
        'total_ident': total_ident,
        # Proportions are NaN for communities with no identified / no users
        # instead of raising ZeroDivisionError.
        'prop_males': _ratio(males_count, total_ident),
        'prop_females': _ratio(females_count, total_ident),
        'prop_ident': _ratio(total_ident, total),
    })

# `columns` fixes the column order of the summary table.
results = pandas.DataFrame(records, columns=columns)



In [34]:

    
# Write the per-community summary to disk as CSV (pandas' default options, so
# the row index is written as the first column).
# NOTE(review): this is an absolute, user-specific path; the cell fails on any
# machine where that directory does not exist.
output_path = "/Users/milena/Desktop/proportions.csv"
results.to_csv(output_path)



In [6]:

    
# Median of each measurement across communities. The string 'community' column
# is excluded explicitly: since pandas 2.0 `DataFrame.median()` no longer drops
# non-numeric columns silently and raises TypeError instead. The float cast
# keeps this working even if the columns were stored with object dtype.
numeric_columns = [name for name in results.columns if name != 'community']
results[numeric_columns].astype(float).median()









    Out[6]:





males            499.000000
females           35.000000
total_ident      536.000000
total           1485.000000
prop_males         0.936200
prop_females       0.063800
prop_ident         0.366874
dtype: float64



In [14]:

    
# Hard-coded community names and, in `category`, one category label per
# community. The two lists are paired by position when the lookup frame is
# built below, so they must stay the same length and in the same order.
# NOTE: this rebinds `communities`, which an earlier cell filled from the
# MongoDB database list; re-running earlier cells afterwards will use this list.
communities = ["academia", "android", "anime", "apple", "askubuntu", "bicycles", "biology", "bitcoin", "chemistry", "chinese", "christianity", "codegolf", "codereview", "cogsci", "cooking", "crypto", "cs", "cstheory", "dba", "diy", "drupal", "dsp", "electronics", "ell", "english", "expressionengine", "fitness", "freelancing", "french", "gamedev", "gaming", "gardening", "genealogy", "german", "gis", "graphicdesign", "hermeneutics", "history", "islam", "japanese", "judaism", "linguistics", "magento", "math", "mathematica", "mathoverflow", "mechanics", "money", "movies", "music", "outdoors", "parenting", "philosophy", "photo", "physics", "pm", "productivity", "programmers", "quant", "raspberrypi", "rpg", "russian", "salesforce", "scicomp", "scifi", "security", "serverfault", "sharepoint", "skeptics", "sound", "spanish", "sqa", "stackapps", "stackoverflow", "stats", "superuser", "tex", "travel", "unix", "ux", "webapps", "webmasters", "wordpress", "workplace", "writers"]
category = ["life-arts", "technology", "culture-recreation", "technology", "technology", "culture-recreation", "science", "business", "science", "culture-recreation", "culture-recreation", "technology", "technology", "science", "life-arts", "technology", "science", "science", "technology", "life-arts", "technology", "technology", "technology", "culture-recreation", "culture-recreation", "technology", "life-arts", "professional", "culture-recreation", "technology", "culture-recreation", "life-arts", "life-arts", "culture-recreation", "technology", "life-arts", "culture-recreation", "culture-recreation", "culture-recreation", "culture-recreation", "culture-recreation", "science", "technology", "science", "technology", "science", "culture-recreation", "life-arts", "life-arts", "life-arts", "culture-recreation", "life-arts", "science", "life-arts", "science", "business", "life-arts", "technology", "business", "technology", "culture-recreation", "culture-recreation", "technology", "science", "life-arts", "technology", "technology", "technology", "culture-recreation", "technology", "culture-recreation", "technology", "technology", "technology", "science", "technology", "technology", "culture-recreation", "technology", "technology", "technology", "technology", "technology", "professional", "life-arts"]

# Lookup table pairing each community with its category (by list position).
cat = pandas.DataFrame({'community': communities, 'category': category})

# Inner join (pandas' default): communities present in only one of the two
# frames are dropped from `temp`.
temp = pandas.merge(cat, results, on='community')

# Per-category median of the measurement columns only. `temp` still holds the
# string 'community' column, and since pandas 2.0 a groupby median over a
# string column raises TypeError instead of silently skipping it. The float
# cast keeps this working even if the columns were stored with object dtype.
value_columns = [name for name in results.columns if name != 'community']
temp[value_columns].astype(float).groupby(temp['category']).median()









    Out[14]:






  
    
      
      males
      females
      total_ident
      total
      prop_males
      prop_females
      prop_ident
    
    
      category
      
      
      
      
      
      
      
    
  
  
    
      business
      609.0
      29.0
      649.0
      1548.0
      0.943662
      0.056338
      0.356268
    
    
      culture-recreation
      495.0
      38.0
      536.0
      1399.0
      0.918089
      0.081911
      0.366874
    
    
      life-arts
      788.0
      70.0
      840.0
      2330.0
      0.919052
      0.080948
      0.387854
    
    
      professional
      800.0
      80.5
      880.5
      2382.0
      0.897391
      0.102609
      0.379702
    
    
      science
      796.0
      52.0
      854.5
      2253.5
      0.931179
      0.068821
      0.368173
    
    
      technology
      2751.5
      127.5
      2907.5
      7894.0
      0.946982
      0.053018
      0.366328



In [ ]:

# Pasted copy of the per-category median table shown in Out[14], kept for reference:
# 	males	females	total_ident	total	prop_males	prop_females	prop_ident
# category
# business	609.0	29.0	649.0	1548.0	0.943662	0.056338	0.356268
# culture-recreation	495.0	38.0	536.0	1399.0	0.918089	0.081911	0.366874
# life-arts	788.0	70.0	840.0	2330.0	0.919052	0.080948	0.387854
# professional	800.0	80.5	880.5	2382.0	0.897391	0.102609	0.379702
# science	796.0	52.0	854.5	2253.5	0.931179	0.068821	0.368173
# technology	2751.5	127.5	2907.5	7894.0	0.946982	0.053018	0.366328