In [1]:
from __future__ import division
import pymongo, pandas, random
# Connect to the MongoDB server on localhost and pick the `statistics`
# collection of the database named after the community.
client = pymongo.MongoClient('localhost', 27017)
community = 'stackoverflow'
stats_db = client[community].statistics

# Filter: keep documents where at least one of the three *_total counters is > 0.
# Projection: only the fields needed below. The original dict literal listed
# u'gender' twice; a dict keeps a single entry per key, so the duplicate was
# redundant and has been removed.
cursor = stats_db.find(
    {'$or': [
        {'questions_total': {'$gt': 0}},
        {'answers_total': {'$gt': 0}},
        {'comments_total': {'$gt': 0}},
    ]},
    {u'_id': True, u'user_id': True, u'gender': True, u'reputation': True},
)

# Materialise the whole cursor into a DataFrame (one row per returned document).
df = pandas.DataFrame(list(cursor))
In [2]:
# Optional reputation-band filter, currently disabled:
# middle = df.query('reputation >= 83 & reputation <= 524').copy()
# Work on an independent copy of the full frame so that columns added to
# `middle` later do not modify `df`.
middle = df.copy()
# Last expression of the cell: shows summary statistics of `df` (not `middle`).
df.describe()
Out[2]:
In [3]:
# Add a `link` column by prefixing each user_id with the site's /users/ URL.
# NOTE(review): plain string concatenation assumes `user_id` holds strings —
# confirm against the collection; numeric ids would need an explicit cast.
middle['link'] = "http://stackoverflow.com/users/" + middle['user_id']
In [4]:
# Fix the seed of the stdlib RNG so later random.sample calls are repeatable.
random.seed(500)

# Split `middle` into one frame per gender label; the three query
# expressions are evaluated in the order Male, Female, Unknown.
gender_filters = ("gender == 'Male'", "gender == 'Female'", "gender == 'Unknown'")
males, females, unknowns = (middle.query(flt) for flt in gender_filters)
In [5]:
# Draw 50 random rows from each gender group.
#
# * The index is materialised with list(...) because random.sample on
#   Python 3 only accepts real sequences and rejects a pandas Index with
#   TypeError; the elements picked for a given seed are unchanged.
# * Rows are selected with the label-based .loc instead of .ix, which is
#   deprecated and no longer exists in pandas >= 1.0.
#
# random.sample raises ValueError if a group has fewer than 50 rows.
rows_males = random.sample(list(males.index), 50)
sample_males = males.loc[rows_males]

rows_females = random.sample(list(females.index), 50)
sample_females = females.loc[rows_females]

rows_unknowns = random.sample(list(unknowns.index), 50)
sample_unknowns = unknowns.loc[rows_unknowns]
In [6]:
# Stack the three per-gender samples (females first, then males, then
# unknowns) into one frame and write it out as CSV.
gender_samples = [sample_females, sample_males, sample_unknowns]
sample_all = pandas.concat(gender_samples)
sample_all.to_csv("/Users/milena/Desktop/dummy-sample2.csv")
In [39]:
# Add a `link` column (site /users/ URL + user_id) to the sample and export
# the sample as CSV.
# NOTE(review): `sample_df` is not assigned anywhere above this cell; the
# assignment lives in the cell labelled In [38] that follows. On a
# top-to-bottom run this cell therefore has to be executed after that one.
sample_df['link'] = "http://stackoverflow.com/users/" + sample_df['user_id']
sample_df.to_csv("/Users/milena/Desktop/stackoverflow-sample.csv")
In [38]:
# Scratch checks, left disabled:
# test = df.query('reputation > 900 & reputation < 1100')
# males.shape

# Draw a reproducible random sample of 2000 rows from `middle`.
random.seed(2000)
# list(...) is required because random.sample on Python 3 rejects a pandas
# Index (TypeError); the rows chosen for a given seed stay the same.
rows = random.sample(list(middle.index), 2000)
# .loc replaces the deprecated/removed .ix (label-based selection in both
# cases); .copy() makes the sample an independent frame so that adding
# columns to it later cannot trigger a SettingWithCopyWarning.
sample_df = middle.loc[rows].copy()
# Last expression of the cell: non-null count per column of the sample.
sample_df.count()
Out[38]:
In [ ]:
from __future__ import division
import pymongo, pandas, random
# Stand-alone version of the sampling pipeline: query MongoDB, sample 2000
# rows, add profile links and write the result to a CSV in the working
# directory.

# Connect to the local MongoDB server and select <community>.statistics.
client = pymongo.MongoClient('localhost', 27017)
community = 'stackoverflow'
stats_db = client[community].statistics

# Filter: documents where at least one of the three *_total counters is > 0.
# Projection: the original dict literal repeated u'gender'; the redundant
# duplicate key has been removed.
cursor = stats_db.find(
    {'$or': [
        {'questions_total': {'$gt': 0}},
        {'answers_total': {'$gt': 0}},
        {'comments_total': {'$gt': 0}},
    ]},
    {u'_id': True, u'user_id': True, u'gender': True},
)
df = pandas.DataFrame(list(cursor))

# Reproducible random sample of 2000 rows.
random.seed(500)
# list(...) because random.sample on Python 3 rejects a pandas Index with
# TypeError; the rows selected for a given seed are unchanged.
rows = random.sample(list(df.index), 2000)
# .loc replaces the deprecated/removed .ix; .copy() gives an independent
# frame so the column assignment below does not act on a slice of `df`.
sample_df = df.loc[rows].copy()

# Per-gender views of the sample, taken before the `link` column is added.
# NOTE(review): neither frame is used again in the visible code.
males = sample_df.query("gender == 'Male'")
females = sample_df.query("gender == 'Female'")

# NOTE(review): concatenation assumes `user_id` holds strings — confirm.
sample_df['link'] = "http://stackoverflow.com/users/" + sample_df['user_id']
sample_df.to_csv("stackoverflow-sample.csv")