In [1]:
from __future__ import division
import pymongo, pandas, random
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
In [2]:
# Connect to the local MongoDB server and collect the names of the
# per-community databases (every database except the bookkeeping ones below).
connection = pymongo.MongoClient('localhost', 27017)

# Databases on this server that are skipped when listing communities.
NON_COMMUNITY_DBS = {"gender", "admin", "local", "visualizations", "results"}

# `database_names()` was deprecated in PyMongo 3.6 and removed in 4.0 in favour
# of `list_database_names()`.  Feature-detect on the class, not the instance:
# MongoClient turns unknown attribute lookups into Database objects, so
# hasattr() on the instance would always be true.
if hasattr(pymongo.MongoClient, 'list_database_names'):
    all_db_names = connection.list_database_names()
else:
    all_db_names = connection.database_names()

# Keep the server's ordering; just filter out the non-community databases.
communities = [name for name in all_db_names if name not in NON_COMMUNITY_DBS]
In [3]:
# Column layout of the per-community summary table.
columns = [
    'community',      # database / community name
    'males',          # count of rows with gender == "Male"
    'females',        # count of rows with gender == "Female"
    'total_ident',    # males + females
    'total',          # all rows returned for the community
    'prop_males',     # males / total_ident
    'prop_females',   # females / total_ident
    'prop_ident',     # total_ident / total
]

# Start from an empty frame that only carries the column names.
results = pandas.DataFrame(data=None, columns=columns)
In [4]:
# Build one summary row per community from its `statistics` collection,
# restricted to documents whose `contributions_total` is greater than 0.
#
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: growing a frame with DataFrame.append inside the loop is quadratic, was
# removed in pandas 2.0, and duplicated every row when the cell was re-run.
records = []
for community in communities:
    community_db = connection[community]['statistics']
    cursor = community_db.find({'contributions_total': {'$gt': 0}},
                               {u'_id': False, u'user_id': True, u'gender': True})
    df = pandas.DataFrame(list(cursor))

    # An empty result set produces a frame without a 'gender' column, which
    # would make any lookup on it fail; treat that as zero identified users.
    if 'gender' in df.columns:
        males_count = int((df['gender'] == 'Male').sum())
        females_count = int((df['gender'] == 'Female').sum())
    else:
        males_count = females_count = 0

    # len() counts rows.  The previous `df.count()[0]` counted the non-null
    # cells of whichever column happened to come first.
    total = len(df)
    total_ident = males_count + females_count

    # Proportions are undefined (NaN) when the denominator is zero.
    nan = float('nan')
    records.append({
        'community': community,
        'males': males_count,
        'females': females_count,
        'total': total,
        'total_ident': total_ident,
        'prop_males': males_count / total_ident if total_ident else nan,
        'prop_females': females_count / total_ident if total_ident else nan,
        'prop_ident': total_ident / total if total else nan,
    })

# `columns` fixes the column order (and keeps the layout when there are no rows).
results = pandas.DataFrame(records, columns=columns)
In [34]:
# Persist the summary table as CSV.
# NOTE(review): this is an absolute, machine-specific path; the cell only
# works where this directory exists.
PROPORTIONS_CSV = "/Users/milena/Desktop/proportions.csv"
results.to_csv(PROPORTIONS_CSV)
In [6]:
# Median of every numeric summary column across all communities.
# The string-valued 'community' label is excluded explicitly: relying on
# median() to silently skip it raises TypeError on pandas >= 2.0.  The cast to
# float keeps the reduction numeric even if the columns were stored as object.
results.drop('community', axis=1).astype(float).median()
Out[6]:
In [14]:
# (community, category) pairs, kept side by side so the two derived lists
# below cannot drift out of alignment.
_COMMUNITY_CATEGORY_PAIRS = [
    ("academia", "life-arts"),
    ("android", "technology"),
    ("anime", "culture-recreation"),
    ("apple", "technology"),
    ("askubuntu", "technology"),
    ("bicycles", "culture-recreation"),
    ("biology", "science"),
    ("bitcoin", "business"),
    ("chemistry", "science"),
    ("chinese", "culture-recreation"),
    ("christianity", "culture-recreation"),
    ("codegolf", "technology"),
    ("codereview", "technology"),
    ("cogsci", "science"),
    ("cooking", "life-arts"),
    ("crypto", "technology"),
    ("cs", "science"),
    ("cstheory", "science"),
    ("dba", "technology"),
    ("diy", "life-arts"),
    ("drupal", "technology"),
    ("dsp", "technology"),
    ("electronics", "technology"),
    ("ell", "culture-recreation"),
    ("english", "culture-recreation"),
    ("expressionengine", "technology"),
    ("fitness", "life-arts"),
    ("freelancing", "professional"),
    ("french", "culture-recreation"),
    ("gamedev", "technology"),
    ("gaming", "culture-recreation"),
    ("gardening", "life-arts"),
    ("genealogy", "life-arts"),
    ("german", "culture-recreation"),
    ("gis", "technology"),
    ("graphicdesign", "life-arts"),
    ("hermeneutics", "culture-recreation"),
    ("history", "culture-recreation"),
    ("islam", "culture-recreation"),
    ("japanese", "culture-recreation"),
    ("judaism", "culture-recreation"),
    ("linguistics", "science"),
    ("magento", "technology"),
    ("math", "science"),
    ("mathematica", "technology"),
    ("mathoverflow", "science"),
    ("mechanics", "culture-recreation"),
    ("money", "life-arts"),
    ("movies", "life-arts"),
    ("music", "life-arts"),
    ("outdoors", "culture-recreation"),
    ("parenting", "life-arts"),
    ("philosophy", "science"),
    ("photo", "life-arts"),
    ("physics", "science"),
    ("pm", "business"),
    ("productivity", "life-arts"),
    ("programmers", "technology"),
    ("quant", "business"),
    ("raspberrypi", "technology"),
    ("rpg", "culture-recreation"),
    ("russian", "culture-recreation"),
    ("salesforce", "technology"),
    ("scicomp", "science"),
    ("scifi", "life-arts"),
    ("security", "technology"),
    ("serverfault", "technology"),
    ("sharepoint", "technology"),
    ("skeptics", "culture-recreation"),
    ("sound", "technology"),
    ("spanish", "culture-recreation"),
    ("sqa", "technology"),
    ("stackapps", "technology"),
    ("stackoverflow", "technology"),
    ("stats", "science"),
    ("superuser", "technology"),
    ("tex", "technology"),
    ("travel", "culture-recreation"),
    ("unix", "technology"),
    ("ux", "technology"),
    ("webapps", "technology"),
    ("webmasters", "technology"),
    ("wordpress", "technology"),
    ("workplace", "professional"),
    ("writers", "life-arts"),
]

# Parallel lists, in the same order as the pairs above.
# Note: `communities` rebinds the name that earlier held the database listing.
communities = [pair[0] for pair in _COMMUNITY_CATEGORY_PAIRS]
category = [pair[1] for pair in _COMMUNITY_CATEGORY_PAIRS]
# Attach a category to each community and report per-category medians.
cat = pandas.DataFrame({'community': communities, 'category': category})

# pandas.merge defaults to an inner join, so communities present in only one
# of the two frames are left out of `temp`.
temp = pandas.merge(cat, results, on='community')

# Aggregate only the numeric summary columns.  Leaving the string-valued
# 'community' column in the grouped frame raises TypeError on pandas >= 2.0;
# the cast to float keeps the reduction numeric even for object-typed columns.
value_columns = [name for name in results.columns if name != 'community']
temp[value_columns].astype(float).groupby(temp['category']).median()
Out[14]:
In [ ]: