In [1]:
from __future__ import division

import os
import random

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas
import pymongo

In [2]:
# Connect to the local MongoDB server; every remaining database name is treated
# as a community and queried for its 'statistics' collection later on.
connection = pymongo.MongoClient('localhost', 27017)

# Databases on this server that do not hold community data.
non_community_dbs = {"gender", "admin", "local", "visualizations", "results"}

# list_database_names() replaces database_names(), which was deprecated in
# PyMongo 3.7 and removed in PyMongo 4.0 (requires PyMongo >= 3.6).
communities = [name for name in connection.list_database_names()
               if name not in non_community_dbs]

In [3]:
# Schema of the per-community summary table: the community name, raw counts,
# then the derived proportions.
columns = [
    'community',
    'males',
    'females',
    'total_ident',
    'total',
    'prop_males',
    'prop_females',
    'prop_ident',
]

# Start from an empty frame that already carries that schema.
results = pandas.DataFrame(data=None, columns=columns)

In [4]:
def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN when the denominator is 0."""
    if not denominator:
        return np.nan
    return numerator / float(denominator)


# Collect one summary record per community and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, grew the frame quadratically, and
# made every re-run of this cell duplicate the rows already in `results`.
records = []
for community in communities:
    community_db = connection[community]['statistics']

    # Users with at least one contribution; only user_id and gender are fetched.
    cursor = community_db.find({'contributions_total': {'$gt': 0}},
                               {u'_id': False, u'user_id': True, u'gender': True})

    # Naming the columns keeps both present even when no document matches or
    # none of the matching documents carries a 'gender' field.
    df = pandas.DataFrame(list(cursor), columns=['user_id', 'gender'])

    # Count rows rather than df.count()[0]: the latter counts non-null values of
    # whichever column happens to come first and raises IndexError on an empty frame.
    total = len(df)
    if total == 0:
        # Nothing to summarise for this database; skip it instead of crashing.
        print("Skipping %s: no contributors found" % community)
        continue

    males_count = int((df['gender'] == "Male").sum())
    females_count = int((df['gender'] == "Female").sum())
    total_ident = males_count + females_count

    records.append({
        'community': community,
        'males': males_count,
        'females': females_count,
        'total': total,
        'total_ident': total_ident,
        # Proportions are NaN (not a division error) when nobody is identified.
        'prop_males': _ratio(males_count, total_ident),
        'prop_females': _ratio(females_count, total_ident),
        'prop_ident': _ratio(total_ident, total),
    })

# Rebuild `results` from scratch so the cell is idempotent; `columns` fixes the column order.
results = pandas.DataFrame(records, columns=columns)

In [34]:
# Write the per-community table to the current user's Desktop. The location is
# derived from the home directory instead of hard-coding one account's path, so
# the notebook also runs for other users (the Desktop folder must exist).
output_path = os.path.join(os.path.expanduser("~"), "Desktop", "proportions.csv")
results.to_csv(output_path)

In [6]:
# Median of every numeric column across communities. numeric_only=True skips the
# string 'community' column, which pandas >= 2.0 would otherwise reject with a TypeError.
results.median(numeric_only=True)


Out[6]:
males            499.000000
females           35.000000
total_ident      536.000000
total           1485.000000
prop_males         0.936200
prop_females       0.063800
prop_ident         0.366874
dtype: float64

In [14]:
# Community names and, position by position, the category label of each one.
# The two lists must stay the same length and in the same order.
# NOTE: this rebinds `communities`, which earlier held the database names read from MongoDB.
communities = ["academia", "android", "anime", "apple", "askubuntu", "bicycles", "biology", "bitcoin", "chemistry", "chinese", "christianity", "codegolf", "codereview", "cogsci", "cooking", "crypto", "cs", "cstheory", "dba", "diy", "drupal", "dsp", "electronics", "ell", "english", "expressionengine", "fitness", "freelancing", "french", "gamedev", "gaming", "gardening", "genealogy", "german", "gis", "graphicdesign", "hermeneutics", "history", "islam", "japanese", "judaism", "linguistics", "magento", "math", "mathematica", "mathoverflow", "mechanics", "money", "movies", "music", "outdoors", "parenting", "philosophy", "photo", "physics", "pm", "productivity", "programmers", "quant", "raspberrypi", "rpg", "russian", "salesforce", "scicomp", "scifi", "security", "serverfault", "sharepoint", "skeptics", "sound", "spanish", "sqa", "stackapps", "stackoverflow", "stats", "superuser", "tex", "travel", "unix", "ux", "webapps", "webmasters", "wordpress", "workplace", "writers"]
category = ["life-arts", "technology", "culture-recreation", "technology", "technology", "culture-recreation", "science", "business", "science", "culture-recreation", "culture-recreation", "technology", "technology", "science", "life-arts", "technology", "science", "science", "technology", "life-arts", "technology", "technology", "technology", "culture-recreation", "culture-recreation", "technology", "life-arts", "professional", "culture-recreation", "technology", "culture-recreation", "life-arts", "life-arts", "culture-recreation", "technology", "life-arts", "culture-recreation", "culture-recreation", "culture-recreation", "culture-recreation", "culture-recreation", "science", "technology", "science", "technology", "science", "culture-recreation", "life-arts", "life-arts", "life-arts", "culture-recreation", "life-arts", "science", "life-arts", "science", "business", "life-arts", "technology", "business", "technology", "culture-recreation", "culture-recreation", "technology", "science", "life-arts", "technology", "technology", "technology", "culture-recreation", "technology", "culture-recreation", "technology", "technology", "technology", "science", "technology", "technology", "culture-recreation", "technology", "technology", "technology", "technology", "technology", "professional", "life-arts"]

# One row per community with its category. The merge is an inner join on
# 'community', so only communities present in both tables are kept.
cat = pandas.DataFrame({'community':communities, 'category':category})
temp = pandas.merge(cat, results, on='community')

# Per-category medians. numeric_only=True skips the string 'community' column,
# which pandas >= 2.0 would otherwise reject with a TypeError.
temp.groupby(['category']).median(numeric_only=True)


Out[14]:
males females total_ident total prop_males prop_females prop_ident
category
business 609.0 29.0 649.0 1548.0 0.943662 0.056338 0.356268
culture-recreation 495.0 38.0 536.0 1399.0 0.918089 0.081911 0.366874
life-arts 788.0 70.0 840.0 2330.0 0.919052 0.080948 0.387854
professional 800.0 80.5 880.5 2382.0 0.897391 0.102609 0.379702
science 796.0 52.0 854.5 2253.5 0.931179 0.068821 0.368173
technology 2751.5 127.5 2907.5 7894.0 0.946982 0.053018 0.366328

In [ ]: