In [ ]:
from __future__ import division
from PIL import Image
import pymongo, pandas, random, xmldict, urllib2, io
# Connect to the MongoDB server on localhost and select the per-user
# statistics collection of the chosen community database.
client = pymongo.MongoClient('localhost', 27017)
community = 'stackoverflow'
stats_db = client[community].statistics

# Only users with at least one contribution are of interest.
active_users_filter = {'contributions_total': {'$gt': 0}}
# Projection: fetch just the fields used below and drop Mongo's _id.
wanted_fields = {
    u'_id': False,
    u'user_id': True,
    u'gender': True,
    'profileImageUrl': True,
}
cursor = stats_db.find(active_users_filter, wanted_fields)

# Materialize the whole cursor into a DataFrame, one row per document.
df = pandas.DataFrame(list(cursor))
In [ ]:
# df = df.set_index('user_id')
# Keep only the rows that have a profile image URL recorded.
has_image_url = df['profileImageUrl'].notnull()
users = df[has_image_url]
# Non-null count per column (shown as the cell output in a notebook).
users.count()
In [ ]:
def count_colors(row):
    """Count the colors PIL reports for a user's profile image.

    Downloads the image at ``row["profileImageUrl"]`` and inspects it with
    ``Image.getcolors()``.

    Args:
        row: Mapping-like record (for example a DataFrame row) whose
            ``"profileImageUrl"`` entry holds the image URL.

    Returns:
        The number of entries returned by ``Image.getcolors()``, or ``-1``
        when that call yields nothing. PIL returns ``None`` once an image
        holds more colors than its default ``maxcolors`` limit (256), so
        ``-1`` marks images too colorful to be counted.

    Raises:
        Errors from ``urllib2.urlopen`` (unreachable URL) or ``Image.open``
        (data that is not a readable image) are not caught here.
    """
    response = urllib2.urlopen(row["profileImageUrl"])
    try:
        payload = response.read()
    finally:
        # Always release the connection; this function is applied to
        # thousands of rows and would otherwise leak one socket per call.
        response.close()

    image = Image.open(io.BytesIO(payload))
    # Scan the image once and reuse the result for both the check and the count.
    colors = image.getcolors()
    if colors:
        return len(colors)
    return -1
# users["colors"] = users.apply(count_colors, axis=1)
# users.head()
In [ ]:
# Draw a reproducible random sample of 4000 users: the fixed seed makes
# reruns pick the same rows. random.sample raises ValueError if fewer than
# 4000 users are available.
random.seed(456)
rows = random.sample(users.index, 4000)
# Select by index label with .loc (the .ix indexer is deprecated in pandas)
# and take an explicit copy so the column added below is written to an
# independent frame rather than to a slice of `users`.
sample_df = users.loc[rows].copy()
# NOTE(review): assumes user_id is stored as a string; concatenating the
# URL prefix with numeric ids would fail -- confirm against the collection.
sample_df['link'] = "http://stackoverflow.com/users/" + sample_df['user_id']
# Gender-specific views of the sample.
males = sample_df.query("gender == 'Male'")
females = sample_df.query("gender == 'Female'")
# sample_df.to_csv("/Users/milena/Desktop/stackoverflow-sample.csv")
In [ ]:
# Run count_colors on every sampled row (axis=1 hands the function one row
# at a time) and store the per-user result in a new "colors" column.
color_counts = sample_df.apply(count_colors, axis=1)
sample_df["colors"] = color_counts
# Preview the first rows (shown as the cell output in a notebook).
sample_df.head()
In [ ]:
# Rows whose "colors" value is negative, i.e. where count_colors returned
# its -1 sentinel instead of an actual count.
# NOTE(review): "com_foto" is Portuguese for "with photo"; presumably the
# sentinel is used as a proxy for real photographs -- confirm.
negative_count_expr = "colors < 0"
com_foto = sample_df.query(negative_count_expr)
# Non-null count per column (shown as the cell output in a notebook).
com_foto.count()
# com_foto.to_csv("/Users/milena/Desktop/stackoverflow-sample-pic.csv")
In [ ]:
# Pick a handful of the selected profiles and list their links, e.g. for
# manual inspection. Works on a copy so com_foto itself stays untouched.
un = com_foto.copy()
# .query("gender == 'Unknown'")
# Ask for at most as many rows as exist: random.sample raises ValueError
# when the requested size exceeds the population.
rows_less = random.sample(un.index, min(20, len(un)))
# Label-based .loc replaces the deprecated .ix indexer.
little_sample = un.loc[rows_less]
little_sample["link"]
In [ ]:
# Restrict the selected profiles to users whose gender is 'Male' and count
# how many of them have a link.
male_expr = "gender == 'Male'"
un = com_foto.query(male_expr)
un['link'].count()
In [ ]: