In [1]:
import io
import os

import numpy as np
import pandas as pd
In [2]:
# Location of the raw input CSV.  Set the EDX_DATA_PATH environment variable to
# run the notebook on another machine; the default is the originally hard-coded path.
DATA_PATH = os.environ.get(
    "EDX_DATA_PATH",
    '/home/rmyeid/notebooks/compsocial/online_learning/data/HMXPC13_DI_v2_5-14-14.csv'
)
df = pd.read_csv(DATA_PATH)
In [3]:
# Display the first five rows of the loaded frame.
df.head(n=5)
Out[3]:
In [4]:
# Build the "engagement" level from the 0/1 status flags.  The column starts as a
# copy of `registered`; the rules are then applied in order, so a later rule
# overrides an earlier one for rows that match both.
# NOTE(review): "viewed" maps to 1.0, i.e. below registered-only rows (2.0) —
# confirm this ordering is intended before reading the level as ordinal.
df["engagement"] = df["registered"]
for flag_column, level in (
    ("registered", 2.0),
    ("viewed", 1.0),
    ("explored", 3.0),
    ("certified", 4.0),
):
    df.loc[df[flag_column] == 1, "engagement"] = level
In [5]:
import matplotlib.pyplot as plt
%matplotlib inline
In [6]:
# Treat missing activity counts as zero, then cap each count at a fixed ceiling.
# The columns are reassigned explicitly: calling fillna(..., inplace=True) on an
# attribute-accessed column is not guaranteed to write back into `df` (it is a
# no-op under pandas Copy-on-Write).
MAX_CHAPTERS = 20
MAX_VIDEO_PLAYS = 200
df["nchapters"] = df["nchapters"].fillna(0).clip(upper=MAX_CHAPTERS)
df["nplay_video"] = df["nplay_video"].fillna(0).clip(upper=MAX_VIDEO_PLAYS)
In [7]:
# Combined material-usage score: chapters visited plus video plays, floored at 0.
df["course_material_engagement"] = (
    df["nchapters"].add(df["nplay_video"]).clip(lower=0)
)
In [8]:
# Normalise `grade` to a percentage in [.., 100]:
#   * missing values and the blank " " placeholder both become 0,
#   * the column is cast to float and scaled by 100,
#   * anything above 100 is capped at 100.
# The builtin `float` replaces the `np.float` alias (removed from current NumPy),
# and the column is reassigned explicitly instead of relying on inplace fillna
# through attribute access.
# NOTE: this cell is not idempotent — re-running it scales the grades by 100 again.
grade = df["grade"].fillna(0).replace(" ", 0).astype(float) * 100
df["grade"] = grade.clip(upper=100)
In [9]:
# Start every row as "unlabelled" (NaN).  The original seeded this column with
# random normal noise, so any course missing from the label lists would have
# silently carried noise into the correlations and regressions; NaN rows are
# instead excluded by dropna() / missing='drop'.
df["course_category"] = np.nan
In [10]:
try:
    from StringIO import StringIO  # Python 2 location
except ImportError:
    from io import StringIO  # Python 3: the class moved into the io module
import requests
In [11]:
# Download the per-course spreadsheet as CSV.  A timeout keeps the cell from
# hanging forever on a stalled connection.
r = requests.get(
    'https://docs.google.com/spreadsheets/d/1lXgYlyLlMZs8huZKpymfu7LUhw0dGcRcFzmwtbICNtY/export?format=csv&id=1lXgYlyLlMZs8huZKpymfu7LUhw0dGcRcFzmwtbICNtY',
    timeout=60
)
# Fail here on an HTTP error instead of later parsing an error page as CSV.
r.raise_for_status()
courses_data = r.content
In [12]:
# `courses_data` holds the raw response bytes, so wrap it in a bytes buffer.
# This parses the same way on Python 2 and 3, whereas a text StringIO rejects
# bytes on Python 3.  The first CSV column becomes the index.
courses_df = pd.read_csv(io.BytesIO(courses_data), index_col=0)
courses_df.head()
Out[12]:
In [13]:
# Course ids labelled as STEM (course_category = 1).
STEM_courses = [
    "MITx/3.091x/2012_Fall",
    "MITx/6.002x/2013_Spring",
    "MITx/8.MReV/2013_Summer",
    "MITx/6.00x/2012_Fall",
    "MITx/6.002x/2012_Fall",
    "MITx/6.00x/2013_Spring",
    "MITx/7.00x/2013_Spring",
    "MITx/3.091x/2013_Spring",
    "MITx/2.01x/2013_Spring",
    "HarvardX/CS50x/2012",
    "HarvardX/PH207x/2012_Fall",
    "MITx/8.02x/2013_Spring",
]
# Course ids labelled as non-STEM (course_category = 0).
Non_STEM_courses = [
    "MITx/14.73x/2013_Spring",
    "HarvardX/PH278x/2013_Spring",
    "HarvardX/CB22x/2013_Spring",
    "HarvardX/ER22x/2013_Spring",
]
# Rows whose course_id is in neither list keep their existing value.
for category, course_ids in ((1, STEM_courses), (0, Non_STEM_courses)):
    df.loc[df["course_id"].isin(course_ids), "course_category"] = category
In [14]:
# Create the "university" column with 0 for every row.
df["university"] = 0
In [15]:
# Course ids offered by MITx (university = 1).
MIT = [
    "MITx/3.091x/2012_Fall",
    "MITx/6.002x/2013_Spring",
    "MITx/8.MReV/2013_Summer",
    "MITx/6.00x/2012_Fall",
    "MITx/6.002x/2012_Fall",
    "MITx/6.00x/2013_Spring",
    "MITx/7.00x/2013_Spring",
    "MITx/3.091x/2013_Spring",
    "MITx/2.01x/2013_Spring",
    "MITx/8.02x/2013_Spring",
    "MITx/14.73x/2013_Spring",
]
# Course ids offered by HarvardX (university = 0).
Harvard = [
    "HarvardX/CS50x/2012",
    "HarvardX/PH207x/2012_Fall",
    "HarvardX/PH278x/2013_Spring",
    "HarvardX/CB22x/2013_Spring",
    "HarvardX/ER22x/2013_Spring",
]
# Rows whose course_id is in neither list keep their existing value.
for code, course_ids in ((1, MIT), (0, Harvard)):
    df.loc[df["course_id"].isin(course_ids), "university"] = code
In [16]:
# days_ratio = active days as a percentage of the course's `Days` value from
# courses_df.  Rows whose course_id is absent from courses_df keep the 0.0 default.
df["days_ratio"] = 0.0
for course_id_, span in zip(courses_df.index, courses_df.Days.values):
    in_course = df["course_id"] == course_id_
    # Restrict the right-hand side to the same rows that are being written.  The
    # original assigned the full-length ndays_act array to the masked subset,
    # which is a length mismatch rather than a row-for-row assignment.
    df.loc[in_course, "days_ratio"] = df.loc[in_course, "ndays_act"] / span * 100
In [17]:
# Cap the ratio at 100 percent.
df["days_ratio"] = df["days_ratio"].clip(upper=100)
In [18]:
# Age is measured relative to a fixed reference year.
REFERENCE_YEAR = 2014
df["age"] = REFERENCE_YEAR - df["YoB"]
In [19]:
# Numeric gender code: "f" -> 1, "m" -> 0; "o" and any unmapped value become NaN.
GENDER_CODES = {"f": 1, "m": 0, "o": np.nan}
df["gender2"] = df["gender"].map(GENDER_CODES)
In [20]:
# Ordinal education level (1 = lowest); values not listed here map to NaN.
EDUCATION_LEVELS = {
    "Less than Secondary": 1,
    "Secondary": 2,
    "Bachelor's": 3,
    "Master's": 4,
    "Doctorate": 5,
}
df["Education"] = df["LoE_DI"].map(EDUCATION_LEVELS)
In [21]:
# Histogram of the combined material-usage score; binding the return value to
# `_` keeps the Axes repr out of the cell output.
_ = df["course_material_engagement"].hist()
In [22]:
# Persist the cleaned frame (index included) next to the notebook.
CLEAN_CSV_PATH = "Edx_clean.csv"
df.to_csv(CLEAN_CSV_PATH)
In [23]:
# Pairwise correlation matrix of the engagement level and the activity-day measures.
df.loc[:, ["engagement", "days_ratio", "ndays_act"]].corr()
Out[23]:
In [24]:
# Pairwise correlation matrix of gender, course category, university and engagement.
df.loc[:, ["gender2", "course_category", "university", "engagement"]].corr()
Out[24]:
In [25]:
def fisher(r):
    """Return the Fisher z-transform of a correlation coefficient ``r``.

    Computes 0.5 * (ln(1 + r) - ln(1 - r)); works on scalars and NumPy arrays.
    """
    return .5 * (np.log(1 + r) - np.log(1 - r))


# Pearson correlation between active days and engagement, plus its z-transform.
r = df["ndays_act"].corr(df["engagement"], method='pearson')
# Single-argument print(...) is valid on both Python 2 and Python 3.
print("{} {}".format(r, fisher(r)))
In [26]:
# Import from the public namespace; `scipy.stats.stats` is a deprecated private module.
from scipy.stats import pearsonr

# Pearson r and p-value for active days vs. engagement on complete rows only.
tmp = df[["ndays_act", "engagement"]].dropna()
r, p = pearsonr(tmp["ndays_act"].values, tmp["engagement"].values)
# Single-argument print(...) is valid on both Python 2 and Python 3.
print("{} {} {}".format(r, p, fisher(r)))
In [27]:
# Pearson r and p-value for grade vs. engagement on complete rows only.
tmp = df[["grade", "engagement"]].dropna()
r, p = pearsonr(tmp["grade"].values, tmp["engagement"].values)
# Single-argument print(...) is valid on both Python 2 and Python 3.
print("{} {} {}".format(r, p, fisher(r)))
In [28]:
# Pearson r and p-value for gender vs. course category on complete rows only.
tmp = df[["gender2", "course_category"]].dropna()
r, p = pearsonr(tmp["gender2"].values, tmp["course_category"].values)
# Single-argument print(...) is valid on both Python 2 and Python 3.
print("{} {} {}".format(r, p, fisher(r)))
In [29]:
# Pearson r and p-value for gender vs. engagement on complete rows only.
tmp = df[["gender2", "engagement"]].dropna()
r, p = pearsonr(tmp["gender2"].values, tmp["engagement"].values)
# Single-argument print(...) is valid on both Python 2 and Python 3.
print("{} {} {}".format(r, p, fisher(r)))
In [30]:
# Gender vs. engagement, restricted to rows with course_category == 1.
tmp = df.loc[df["course_category"] == 1.0, ["gender2", "engagement"]].dropna()
r, p = pearsonr(tmp["gender2"].values, tmp["engagement"].values)
# Single-argument print(...) is valid on both Python 2 and Python 3.
print("{} {} {}".format(r, p, fisher(r)))
In [31]:
# Gender vs. engagement, restricted to rows with university == 1 and
# course_category == 0.  The two conditions are combined with the boolean `&`
# operator instead of multiplying the raw boolean arrays.
selected = (df["university"] == 1.0) & (df["course_category"] == 0)
tmp = df.loc[selected, ["gender2", "engagement"]].dropna()
r, p = pearsonr(tmp["gender2"].values, tmp["engagement"].values)
# Single-argument print(...) is valid on both Python 2 and Python 3.
print("{} {} {}".format(r, p, fisher(r)))
In [32]:
# Summary statistics of engagement for rows with university == 0.
tmp = df.loc[df["university"] == 0]
tmp.engagement.describe()
Out[32]:
In [39]:
# Number of rows per university code.
df["university"].value_counts()
Out[39]:
In [33]:
import statsmodels.api as sm
In [34]:
# Predictor columns for the regressions fitted on `data`.
cols_to_keep = ["gender2", "Education", "course_category", "university", "age"]
# Select the predictors and let statsmodels add the constant column.
data = sm.add_constant(df[cols_to_keep])
In [35]:
# Multinomial logit of the engagement level on `data`; rows with missing values
# are dropped by statsmodels, and the fit uses the Newton-CG optimiser.
model = sm.MNLogit(endog=df["engagement"], exog=data, missing='drop')
results = model.fit(method='ncg')
In [36]:
# Single-argument print(...) is valid on both Python 2 and Python 3.
print(results.summary())
In [34]:
# Display the p-values of the current `results` object.
results.pvalues
Out[34]:
In [35]:
# Ordinary least squares of grade on `data`; rows with missing values are dropped.
model = sm.OLS(endog=df["grade"], exog=data, missing='drop')
results = model.fit()
In [36]:
# Single-argument print(...) is valid on both Python 2 and Python 3.
print(results.summary())
In [37]:
# Display the p-values of the current `results` object.
results.pvalues
Out[37]:
In [38]:
# Ordinary least squares of grade on the engagement and activity measures.
cols_to_keep = ["engagement", "course_material_engagement", "ndays_act"]
data = sm.add_constant(df[cols_to_keep])
# Rows with missing values are dropped by statsmodels.
model = sm.OLS(endog=df["grade"], exog=data, missing='drop')
results = model.fit()
In [39]:
# Single-argument print(...) is valid on both Python 2 and Python 3.
print(results.summary())
In [40]:
# Display the p-values of the current `results` object.
results.pvalues
Out[40]:
In [60]:
# Mixed linear model of grade on the predictors, grouped by university, fitted
# on the complete rows among the first 100000 rows of the frame.
cols_to_keep = ["grade", "gender2", "Education", "course_category", "university", "age"]
data = sm.add_constant(df[cols_to_keep])
# `.ix` was removed from pandas.  A positional slice selects the same rows as the
# labels 0..99999 did, assuming `df` still has its default integer index, and it
# does not fail when the frame has fewer than 100000 rows.
data = data.iloc[:100000].dropna()
# NOTE(review): the constant column added above is not in cols_to_keep[1:], so it
# is not passed as a regressor; `university` is both a regressor and the grouping
# variable — confirm both are intended.
model = sm.MixedLM(data["grade"], data[cols_to_keep[1:]], groups=data["university"])
results = model.fit(do_cg=False)
In [145]:
# NOTE(review): `row_indices` is not defined in the visible portion of this
# notebook; confirm where it comes from, or drop this cell if it is leftover
# debugging state.
row_indices.keys()
Out[145]:
In [131]:
# Show the group labels stored on the current `model` object.
model.group_labels
In [124]:
# Single-argument print(...) is valid on both Python 2 and Python 3.
print(results.summary())