In [1]:
from __future__ import division
import pymongo, pandas, random

# Connect to the local MongoDB instance and pick the `statistics` collection
# of the community database (note: despite its name, `stats_db` is a
# collection, not a database).
client = pymongo.MongoClient('localhost', 27017)

community = 'stackoverflow'
stats_db = client[community].statistics

# Select users with at least one question, answer or comment.
active_user_filter = {'$or': [{'questions_total': {'$gt': 0}},
                              {'answers_total': {'$gt': 0}},
                              {'comments_total': {'$gt': 0}}]}

# Fields to fetch. The original projection listed `gender` twice; a dict
# literal silently keeps only one entry per key, so the duplicate is dropped.
projection = {u'_id': True, u'user_id': True, u'gender': True, u'reputation': True}

cursor = stats_db.find(active_user_filter, projection)

# Materialise the whole cursor into one DataFrame (one row per document).
df = pandas.DataFrame(list(cursor))

In [2]:
# Work on an independent copy so later column additions do not touch `df`.
# Disabled alternative: keep only the interquartile reputation range
# (25% = 83, 75% = 524 in the summary below):
#   middle = df.query('reputation >= 83 & reputation <= 524').copy()
middle = df.copy(deep=True)

# Summary statistics of the full frame (displayed as the cell output).
df.describe()


Out[2]:
reputation
count 448267.000000
mean 956.283057
std 5579.309965
min 50.000000
25% 83.000000
50% 161.000000
75% 524.000000
max 709269.000000

In [3]:
# Build a profile URL per row by prefixing the user id with the site's users path.
profile_url_prefix = "http://stackoverflow.com/users/"
user_ids = middle['user_id']
middle['link'] = profile_url_prefix + user_ids

In [4]:
# Fix the RNG state so the samples drawn in the next cell are reproducible.
random.seed(500)

# Split `middle` into three frames according to the `gender` label.
gender_labels = middle['gender']
males = middle[gender_labels == 'Male']
females = middle[gender_labels == 'Female']
unknowns = middle[gender_labels == 'Unknown']

In [5]:
# Draw 50 rows without replacement from each gender group.
#
# * `random.sample` is given a plain list of index labels: newer Python
#   versions insist on a real sequence and reject a pandas Index. The draw
#   depends only on the population's length and order, so the selected labels
#   are unchanged for a given seed.
# * `.loc` replaces `.ix`, which was deprecated and later removed from pandas;
#   the labels come from the frame's own index, so label-based lookup is the
#   intended behaviour.
#
# The three draws are kept in the original order (males, females, unknowns)
# so they consume the seeded random stream identically.
rows_males = random.sample(list(males.index), 50)
sample_males = males.loc[rows_males]

rows_females = random.sample(list(females.index), 50)
sample_females = females.loc[rows_females]

rows_unknowns = random.sample(list(unknowns.index), 50)
sample_unknowns = unknowns.loc[rows_unknowns]

In [6]:
# Stack the three per-gender samples (females first, then males, then
# unknowns) into a single frame and export it.
gender_samples = [sample_females, sample_males, sample_unknowns]
sample_all = pandas.concat(gender_samples)

# NOTE(review): absolute, user-specific output path; it will not exist on other machines.
sample_all.to_csv("/Users/milena/Desktop/dummy-sample2.csv")


In [39]:
sample_df['link'] = "http://stackoverflow.com/users/" + sample_df['user_id']

sample_df.to_csv("/Users/milena/Desktop/stackoverflow-sample.csv")

In [38]:
# test = df.query('reputation > 900 & reputation < 1100')
# males.shape
random.seed(2000)
rows = random.sample(middle.index, 2000)
sample_df = middle.ix[rows]


sample_df.count()


Out[38]:
_id           2000
gender        2000
reputation    2000
user_id       2000
dtype: int64


In [ ]:
from __future__ import division
import pymongo, pandas, random

# Standalone version of the sampling pipeline: load active users from MongoDB,
# draw 2000 of them at random and write the sample to a CSV file.
client = pymongo.MongoClient('localhost', 27017)

community = 'stackoverflow'
stats_db = client[community].statistics

# Users with at least one question, answer or comment. The projection fetches
# `_id`, `user_id` and `gender`; the original listed `gender` twice, which a
# dict literal collapses to a single entry anyway.
cursor = stats_db.find({'$or': [{'questions_total': {'$gt': 0}},
                                {'answers_total': {'$gt': 0}},
                                {'comments_total': {'$gt': 0}}]},
                       {u'_id': True, u'user_id': True, u'gender': True})

df = pandas.DataFrame(list(cursor))

# Seeded draw without replacement. The index is passed as a plain list because
# newer Python versions reject a pandas Index as the population; the selected
# labels are unchanged for a given seed.
random.seed(500)
rows = random.sample(list(df.index), 2000)
# `.loc` replaces the removed `.ix`; `.copy()` detaches the sample from `df`
# so the column assignment below does not raise SettingWithCopyWarning.
sample_df = df.loc[rows].copy()

# Per-gender views of the sample (taken before the `link` column is added).
males = sample_df.query("gender == 'Male'")
females = sample_df.query("gender == 'Female'")

# Profile URL for each sampled user.
sample_df['link'] = "http://stackoverflow.com/users/" + sample_df['user_id']

sample_df.to_csv("stackoverflow-sample.csv")