In [1]:
from __future__ import division
import pymongo, time, pylab, numpy, pandas, vincent
from scipy import stats
from matplotlib import pyplot

# %matplotlib inline
# Run vincent's notebook initialisation hook before any charts are drawn.
vincent.core.initialize_notebook()

# Connect to the MongoDB server on localhost and pick the `statistics`
# collection inside the database named after the community.
client = pymongo.MongoClient('localhost', 27017)
community = 'stackoverflow'
stats_db = client[community].statistics

# Keep only users with at least one question, answer or comment.
active_user_filter = {
    '$or': [
        {'questions_total': {'$gt': 0}},
        {'answers_total': {'$gt': 0}},
        {'comments_total': {'$gt': 0}},
    ]
}
# Only these fields are returned by the query.
wanted_fields = {u'_id': True, u'user_id': True, u'gender': True}

cursor = stats_db.find(active_user_filter, wanted_fields)

# Drain the cursor into memory and build one frame with a row per user.
df = pandas.DataFrame(list(cursor))

# Split the users by the value stored in the `gender` column.
males = df.loc[df['gender'] == 'Male']
females = df.loc[df['gender'] == 'Female']



In [4]:
import random

# Knobs for the reproducible sample of unknown-gender users drawn below.
SAMPLE_SEED = 500
SAMPLE_SIZE = 500

# Users whose `gender` field holds the literal value 'Unknown'.
unknown = df[df['gender'] == 'Unknown']

# Seed the stdlib RNG so the same rows are drawn on every run.
random.seed(SAMPLE_SEED)

# Draw row labels without replacement.
# - The index is converted to a plain list because random.sample() requires a
#   real sequence on recent Python versions.
# - The sample size is capped at the number of available rows so a small
#   `unknown` frame yields all of its rows instead of raising ValueError.
rows = random.sample(unknown.index.tolist(), min(SAMPLE_SIZE, len(unknown)))

# `rows` holds index labels, so select with label-based .loc (the deprecated
# .ix indexer is gone from current pandas). Copy explicitly so that adding a
# column below never writes into a slice of `unknown`.
unknown_df = unknown.loc[rows].copy()

# Profile URL for each sampled user.
# NOTE(review): string concatenation assumes user_id is stored as text -- confirm.
unknown_df['link'] = "http://stackoverflow.com/users/" + unknown_df['user_id']

# Single-argument call form prints identically on Python 2 and Python 3.
print(unknown_df.tail())

# Persist the sample (including the original row labels as the first column).
unknown_df.to_csv("stackoverflow-unknown.csv")


                             _id   gender  user_id  \
123604  53b4056cf64481447134cf14  Unknown  1625713   
76633   53b3ff47f6448144713415de  Unknown   508427   
237352  53b413a4f644814471369023  Unknown  1954584   
148901  53b4088bf6448144713532e8  Unknown   167806   
30414   53b3f951f644814471335f75  Unknown   621465   

                                          link  
123604  http://stackoverflow.com/users/1625713  
76633    http://stackoverflow.com/users/508427  
237352  http://stackoverflow.com/users/1954584  
148901   http://stackoverflow.com/users/167806  
30414    http://stackoverflow.com/users/621465  

In [ ]: