Initialization and data import are at the end of this notebook. They were placed at the bottom to keep the analysis easier to read, but they must be run first for the analysis to work as expected. Click here to go there.
In [3]:
# Row masks for the two gender labels present in df.
is_male = df['gender'] == 'Male'
is_female = df['gender'] == 'Female'

males = df[is_male]
females = df[is_female]
# Independent copy of the full frame, used as the "everyone" group.
all_users = df.copy()
In [4]:
# Per-column medians are printed; the describe() table is the cell's output.
print(females.median())
females.describe()
Out[4]:
In [5]:
# Per-column medians are printed; the describe() table is the cell's output.
print(males.median())
males.describe()
Out[5]:
In [6]:
# Per-column medians are printed; the describe() table is the cell's output.
print(all_users.median())
all_users.describe()
Out[6]:
In [7]:
# "Top" groups: rows whose reputation is strictly above 300, taken from each
# of the three base frames in turn.
top_females, top_males, top_users = (
    group[group["reputation"] > 300] for group in (females, males, all_users)
)
In [8]:
# Per-column medians are printed; the describe() table is the cell's output.
print(top_females.median())
top_females.describe()
Out[8]:
In [9]:
# Per-column medians are printed; the describe() table is the cell's output.
print(top_males.median())
top_males.describe()
Out[9]:
In [10]:
# Per-column medians are printed; the describe() table is the cell's output.
print(top_users.median())
top_users.describe()
Out[10]:
In [11]:
# "Common" groups: rows whose reputation is 300 or less, taken from each of
# the three base frames in turn.
common_females, common_males, common_users = (
    group[group["reputation"] <= 300] for group in (females, males, all_users)
)
In [12]:
# Per-column medians are printed; the describe() table is the cell's output.
print(common_females.median())
common_females.describe()
Out[12]:
In [13]:
# Per-column medians are printed; the describe() table is the cell's output.
print(common_males.median())
common_males.describe()
Out[13]:
In [14]:
# Per-column medians are printed; the describe() table is the cell's output.
print(common_users.median())
common_users.describe()
Out[14]:
In [15]:
from lifelines import KaplanMeierFitter

# Boolean masks picking each gender out of all_users.
males_ = all_users["gender"] == "Male"
females_ = all_users["gender"] == "Female"

# "dead" flags are passed as the event indicator; "lifetime" supplies the
# durations (measured in days, per the original author's note).
C = all_users["dead"]
T = all_users["lifetime"]

kmf = KaplanMeierFitter()
In [16]:
fig = pyplot.figure(figsize=(12, 6))
ax = pyplot.subplot(111)
kmf.fit(T[females_], event_observed=C[females_], label="Female")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_
kmf.fit(T[males_], event_observed=C[males_], label="Male")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_
pyplot.ylim(0,1)
pyplot.title("Lifespans of users from different genders")
Out[16]:
In [17]:
from lifelines.statistics import logrank_test

# Compare the female and male samples: durations first, then the matching
# event flags, in the same female/male order.
summary, p_value, test_results = logrank_test(
    T[females_], T[males_],
    C[females_], C[males_],
    alpha=.95
)
In [18]:
from lifelines import KaplanMeierFitter

# Boolean masks picking each gender out of top_users.
males_ = top_users["gender"] == "Male"
females_ = top_users["gender"] == "Female"

# "dead" flags are passed as the event indicator; "lifetime" supplies the
# durations (measured in days, per the original author's note).
C = top_users["dead"]
T = top_users["lifetime"]

kmf = KaplanMeierFitter()
In [19]:
fig = pyplot.figure(figsize=(12, 6))
ax = pyplot.subplot(111)
kmf.fit(T[females_], event_observed=C[females_], label="Female")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_
kmf.fit(T[males_], event_observed=C[males_], label="Male")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_
pyplot.ylim(0,1)
pyplot.title("Lifespans of users from different genders")
Out[19]:
In [20]:
from lifelines.statistics import logrank_test

# Compare the female and male samples: durations first, then the matching
# event flags, in the same female/male order.
summary, p_value, test_results = logrank_test(
    T[females_], T[males_],
    C[females_], C[males_],
    alpha=.95
)
In [21]:
from lifelines import KaplanMeierFitter

# Boolean masks picking each gender out of common_users.
males_ = common_users["gender"] == "Male"
females_ = common_users["gender"] == "Female"

# "dead" flags are passed as the event indicator; "lifetime" supplies the
# durations (measured in days, per the original author's note).
C = common_users["dead"]
T = common_users["lifetime"]

kmf = KaplanMeierFitter()
In [22]:
fig = pyplot.figure(figsize=(12, 6))
ax = pyplot.subplot(111)
kmf.fit(T[females_], event_observed=C[females_], label="Female")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_
kmf.fit(T[males_], event_observed=C[males_], label="Male")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_
pyplot.ylim(0,1)
pyplot.title("Lifespans of users from different genders")
Out[22]:
In [23]:
from lifelines.statistics import logrank_test

# Compare the female and male samples: durations first, then the matching
# event flags, in the same female/male order.
summary, p_value, test_results = logrank_test(
    T[females_], T[males_],
    C[females_], C[males_],
    alpha=.95
)
In [28]:
# Pairwise Spearman correlation between the columns of all_users.
all_users.corr("spearman")
Out[28]:
In [29]:
# Response: the "days_active" column of all_users.
endog = all_users["days_active"]
# Predictors: the "gender_cat" column with a constant column prepended.
exog = sm.add_constant(all_users["gender_cat"], prepend=True)

# GLM with the negative-binomial family; the summary table is the cell output.
mod_nbin = sm.GLM(endog, exog, family=sm.families.NegativeBinomial())
res_nbin = mod_nbin.fit()
res_nbin.summary()
Out[29]:
In [30]:
# Observation count reported by the fit (not used further in this cell).
nobs = res_nbin.nobs

# endog is a one-dimensional Series, so there is no axis 1 to sum over:
# recent pandas raises ValueError for endog.sum(1), while older releases
# ignored the argument. Summing without an axis gives the intended grand
# total on every version.
y = endog / endog.sum()

# Fitted values from the GLM results, plotted against the rescaled response.
yhat = res_nbin.mu
pyplot.scatter(yhat, y)
res_nbin.params
Out[30]:
In [31]:
# Pairwise Spearman correlation between the columns of top_users.
top_users.corr("spearman")
Out[31]:
In [32]:
# Response: the "days_active" column of top_users.
endog = top_users["days_active"]
# Predictors: the "gender_cat" column with a constant column prepended.
exog = sm.add_constant(top_users["gender_cat"], prepend=True)

# GLM with the negative-binomial family; the summary table is the cell output.
mod_nbin = sm.GLM(endog, exog, family=sm.families.NegativeBinomial())
res_nbin = mod_nbin.fit()
res_nbin.summary()
Out[32]:
In [33]:
# Observation count reported by the fit (not used further in this cell).
nobs = res_nbin.nobs

# endog is a one-dimensional Series, so there is no axis 1 to sum over:
# recent pandas raises ValueError for endog.sum(1), while older releases
# ignored the argument. Summing without an axis gives the intended grand
# total on every version.
y = endog / endog.sum()

# Fitted values from the GLM results, plotted against the rescaled response.
yhat = res_nbin.mu
pyplot.scatter(yhat, y)
res_nbin.params
Out[33]:
In [34]:
# Pairwise Spearman correlation between the columns of common_users.
common_users.corr("spearman")
Out[34]:
In [35]:
# Response: the "days_active" column of common_users.
endog = common_users["days_active"]
# Predictors: the "gender_cat" column with a constant column prepended.
exog = sm.add_constant(common_users["gender_cat"], prepend=True)

# GLM with the negative-binomial family; the summary table is the cell output.
mod_nbin = sm.GLM(endog, exog, family=sm.families.NegativeBinomial())
res_nbin = mod_nbin.fit()
res_nbin.summary()
Out[35]:
In [36]:
# Observation count reported by the fit (not used further in this cell).
nobs = res_nbin.nobs

# endog is a one-dimensional Series, so there is no axis 1 to sum over:
# recent pandas raises ValueError for endog.sum(1), while older releases
# ignored the argument. Summing without an axis gives the intended grand
# total on every version.
y = endog / endog.sum()

# Fitted values from the GLM results, plotted against the rescaled response.
yhat = res_nbin.mu
pyplot.scatter(yhat, y)
res_nbin.params
Out[36]:
In [37]:
# Pull the 'activity_freq' column out of each gender frame (female, male).
females_frequency, males_frequency = (
    group['activity_freq'] for group in (females, males)
)
In [38]:
print "Female:"
print females_frequency.describe()
print "Median: ", females_frequency.median()
print
print "Male:"
print males_frequency.describe()
print "Median: ", males_frequency.median()
In [39]:
# Side-by-side panels: females on the left, males on the right.
fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
left_ax, right_ax = axes[0], axes[1]

females_frequency.hist(ax=left_ax)
males_frequency.hist(ax=right_ax)

left_ax.set_title("Post Frequency by Females - Histogram")
right_ax.set_title("Post Frequency by Males - Histogram")
Out[39]:
In [40]:
print "Two-sample Chi-Square test: ", stats.chi2_contingency( females_frequency, males_frequency )[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_frequency, males_frequency)[1]
In [41]:
# Pull the 'activity_freq' column out of each top-reputation frame.
top_females_frequency, top_males_frequency = (
    group['activity_freq'] for group in (top_females, top_males)
)
In [42]:
print "Female:"
print top_females_frequency.describe()
print "Median: ", top_females_frequency.median()
print
print "Male:"
print top_males_frequency.describe()
print "Median: ", top_males_frequency.median()
In [43]:
# Side-by-side panels: females on the left, males on the right.
fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
left_ax, right_ax = axes[0], axes[1]

top_females_frequency.hist(ax=left_ax)
top_males_frequency.hist(ax=right_ax)

left_ax.set_title("Post Frequency by Females - Histogram")
right_ax.set_title("Post Frequency by Males - Histogram")
Out[43]:
In [44]:
print "Two-sample Chi-Square test: ", stats.chi2_contingency( top_females_frequency, top_males_frequency )[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_frequency, top_males_frequency)[1]
In [45]:
# Pull the 'activity_freq' column out of each common-reputation frame.
common_females_frequency, common_males_frequency = (
    group['activity_freq'] for group in (common_females, common_males)
)
In [46]:
print "Female:"
print common_females_frequency.describe()
print "Median: ", common_females_frequency.median()
print
print "Male:"
print common_males_frequency.describe()
print "Median: ", common_males_frequency.median()
In [47]:
# Side-by-side panels: females on the left, males on the right.
fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
left_ax, right_ax = axes[0], axes[1]

common_females_frequency.hist(ax=left_ax)
common_males_frequency.hist(ax=right_ax)

left_ax.set_title("Post Frequency by Females - Histogram")
right_ax.set_title("Post Frequency by Males - Histogram")
Out[47]:
In [48]:
print "Two-sample Chi-Square test: ", stats.chi2_contingency( common_females_frequency, common_males_frequency )[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_frequency, common_males_frequency)[1]
In [1]:
from __future__ import division
import pymongo, time, pylab, numpy, pandas, math
from scipy import stats
import matplotlib as mpl
from matplotlib import pyplot
import statsmodels.api as sm
%matplotlib inline
mpl.style.use('ggplot')
# pyplot.rcdefaults()
client = pymongo.MongoClient('localhost', 27017)
community = 'math'
stats_db = client[community].statistics
cursor = stats_db.find({'$or': [{'questions_total':{'$gt':0}}, {'answers_total':{'$gt':0}},
{'comments_total':{'$gt':0}}],
'gender': {'$ne': "Unknown"} },
{u'_id': False, u'dates': True, u'reputation': True,
u'joined': True, u'gender':True,
'lifetime': True, 'max_interval': True, 'days_active': True,
'gender_cat': True, 'activity_freq': True})
df = pandas.DataFrame(list(cursor))
In [2]:
import datetime

# Date the recency window is measured back from.
# NOTE(review): presumably the day the data was collected - confirm.
REFERENCE_DATE = datetime.datetime(2014, 1, 20)

# Hoisted out of seen_death: the mean of "max_interval" is the same for every
# row, so computing it once avoids rescanning the whole column per row.
recent_cutoff = REFERENCE_DATE - datetime.timedelta(days=int(df["max_interval"].mean()))

def seen_death(row):
    """Return True when the last entry of row["dates"] is before the cutoff.

    The cutoff is REFERENCE_DATE minus the mean of df["max_interval"]
    (truncated to whole days).
    """
    return row["dates"][-1] < recent_cutoff

# One boolean per row, stored in the "dead" column.
df["dead"] = df.apply(seen_death, axis=1)
In [ ]: