In [1]:
from __future__ import division
import pymongo, time, pylab, numpy, pandas, vincent
from scipy import stats
from matplotlib import pyplot
# %matplotlib inline
vincent.core.initialize_notebook()

# Connect to the MongoDB server on localhost and pick the `statistics`
# collection of the community being analysed.
community = 'stackoverflow'
client = pymongo.MongoClient(host='localhost', port=27017)
stats_db = client[community]['statistics']

# Select documents with at least one question, answer or comment, and fetch
# only the fields that are used below.
activity_filter = {'$or': [
    {'questions_total': {'$gt': 0}},
    {'answers_total': {'$gt': 0}},
    {'comments_total': {'$gt': 0}},
]}
wanted_fields = {u'_id': True, u'user_id': True, u'gender': True}
cursor = stats_db.find(activity_filter, wanted_fields)

# Materialise the whole cursor into a DataFrame, then split it on the
# value stored in the 'gender' column.
df = pandas.DataFrame(list(cursor))
gender_column = df['gender']
males = df[gender_column == 'Male']
females = df[gender_column == 'Female']
In [4]:
import random

# Draw a reproducible random sample of the users whose gender is 'Unknown',
# attach a profile link to each sampled row, and export the result to CSV.
SAMPLE_SEED = 500  # fixed seed so every run draws the same rows
SAMPLE_SIZE = 500  # upper bound on the number of rows exported

unknown = df[df['gender'] == 'Unknown']

random.seed(SAMPLE_SEED)
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the number of available rows.
# A plain list of index labels is passed instead of the pandas Index so the
# call does not depend on how random.sample treats non-list containers.
unknown_labels = list(unknown.index)
rows = random.sample(unknown_labels, min(SAMPLE_SIZE, len(unknown_labels)))

# .loc is the label-based replacement for the deprecated .ix indexer.
# .copy() makes the sample an independent frame, so adding the 'link' column
# below cannot trigger SettingWithCopyWarning.
unknown_df = unknown.loc[rows].copy()
# NOTE(review): the concatenation assumes user_id values are strings -- confirm.
unknown_df['link'] = "http://stackoverflow.com/users/" + unknown_df['user_id']

# Single-argument print(...) produces the same output under Python 2 and 3.
print(unknown_df.tail())
unknown_df.to_csv("stackoverflow-unknown.csv")
In [ ]: