In [ ]:
from __future__ import division
from PIL import Image
import pymongo, pandas, random, xmldict, urllib2, io

# Connect to the MongoDB server on this machine, port 27017.
client = pymongo.MongoClient('localhost', 27017)

# The database is named after the community; the per-user records are read
# from its `statistics` collection.
community = 'stackoverflow'
stats_db = client[community].statistics

# Filter: documents whose `contributions_total` is greater than 0.
active_users_filter = {'contributions_total': {'$gt': 0}}

# Projection: leave out `_id`, keep only the three fields listed as True.
wanted_fields = {
    u'_id': False,
    u'user_id': True,
    u'gender': True,
    'profileImageUrl': True,
}

cursor = stats_db.find(active_users_filter, wanted_fields)

# Drain the cursor into a list and build one DataFrame row per document.
df = pandas.DataFrame(list(cursor))

In [ ]:
# df = df.set_index('user_id')
# Keep only the rows whose `profileImageUrl` is not null.
has_image_url = df['profileImageUrl'].notnull()
users = df[has_image_url]

# Show the number of non-null values in each remaining column.
users.count()

In [ ]:
def count_colors(row):
    """Download the image at ``row["profileImageUrl"]`` and count its colors.

    Parameters
    ----------
    row : mapping (for example a DataFrame row)
        Must provide the image address under the ``"profileImageUrl"`` key.

    Returns
    -------
    int
        The number of entries returned by ``Image.getcolors()``, or ``-1``
        when that call returns nothing. Per the Pillow documentation,
        ``getcolors()`` returns ``None`` once an image exceeds its default
        limit of 256 colors, so ``-1`` marks images with more colors than that.

    Raises
    ------
    Any exception raised by ``urllib2.urlopen`` or ``Image.open``; nothing is
    caught here.
    """
    response = urllib2.urlopen(row["profileImageUrl"])
    try:
        payload = response.read()
    finally:
        # Close explicitly so the connection is released even if read() fails;
        # this function runs once per row, so an unclosed response per call
        # adds up quickly.
        response.close()

    image = Image.open(io.BytesIO(payload))

    # Call getcolors() once and reuse the result instead of computing it twice.
    colors = image.getcolors()
    if colors:
        return len(colors)
    return -1

# users["colors"] = users.apply(count_colors, axis=1)

# users.head()

In [ ]:
# Fix the seed so the same users are drawn on every run.
random.seed(456)

# Draw up to 4000 index labels. The index is turned into a list because
# random.sample expects a sequence, and the size is capped so that a
# population smaller than 4000 does not raise ValueError.
candidate_labels = list(users.index)
rows = random.sample(candidate_labels, min(4000, len(candidate_labels)))

# .loc replaces the deprecated .ix indexer (the sampled values are index
# labels). .copy() gives sample_df its own data, so the column assignment
# below does not write into a slice of `users` (SettingWithCopyWarning).
sample_df = users.loc[rows].copy()

# NOTE(review): the concatenation assumes user_id is stored as text - confirm.
sample_df['link'] = "http://stackoverflow.com/users/" + sample_df['user_id']

# Split the sample by the recorded gender value.
males = sample_df.query("gender == 'Male'")
females = sample_df.query("gender == 'Female'")

# sample_df.to_csv("/Users/milena/Desktop/stackoverflow-sample.csv")

In [ ]:
# Run count_colors on every sampled row (axis=1 passes one row at a time) and
# store the per-row result in a new "colors" column.
color_counts = sample_df.apply(count_colors, axis=1)
sample_df["colors"] = color_counts

# Preview the first rows, including the new column.
sample_df.head()

In [ ]:
# Select the rows whose "colors" value is negative.
negative_colors = "colors < 0"
com_foto = sample_df.query(negative_colors)

# Show the number of non-null values per column in that subset.
com_foto.count()
# com_foto.to_csv("/Users/milena/Desktop/stackoverflow-sample-pic.csv")

In [ ]:
# Work on an independent copy of com_foto.
un = com_foto.copy()
# Narrowing step that was left disabled: .query("gender == 'Unknown'")

# Draw up to 20 index labels. The index is turned into a list because
# random.sample expects a sequence, and the size is capped so that fewer than
# 20 available rows does not raise ValueError.
available_labels = list(un.index)
rows_less = random.sample(available_labels, min(20, len(available_labels)))

# .loc replaces the deprecated .ix indexer; the sampled values are index labels.
little_sample = un.loc[rows_less]

# Display the links of the drawn rows.
little_sample["link"]

In [ ]:
# Rebind `un` to the rows of com_foto whose gender is recorded as 'Male'.
male_only = "gender == 'Male'"
un = com_foto.query(male_only)

# Show how many of those rows have a non-null link.
un['link'].count()

In [ ]: