In [2]:
# Set up paths/ os
import os
import sys
# Remember the notebook's own directory before leaving it, so local
# modules (e.g. `lognormal` imported later) stay importable.
this_path=os.getcwd()
# NOTE(review): chdir is relative, so re-running this cell walks further
# up the tree — run once per kernel session.
os.chdir("../data")
sys.path.insert(0, this_path)
In [3]:
import pandas as pd
In [4]:
import numpy as np
def cohen_d(X1, X2):
    """Absolute Cohen's d effect size between two samples.

    Parameters
    ----------
    X1, X2 : array-likes exposing .mean(), .std() and len()
        (e.g. numpy arrays of per-post readability scores).

    Returns
    -------
    float
        |mean(X2) - mean(X1)| divided by the pooled standard deviation.
    """
    std1 = X1.std()
    mean1 = X1.mean()
    N1 = len(X1)
    std2 = X2.std()
    mean2 = X2.mean()
    N2 = len(X2)
    # Pooled *variance* must weight the squared stds; the previous version
    # averaged the stds themselves, which misestimates the pooled spread
    # whenever the two stds differ.
    s2 = (float(N1 - 1) * std1 ** 2 + float(N2 - 1) * std2 ** 2) / float(N1 + N2 - 2)
    s = np.sqrt(s2)
    cohend = abs(mean2 - mean1) / s
    return cohend
In [5]:
# Get 5% percentile:
def get_fifth_p(X):
    """Value at (approximately) the 5th percentile of X.

    Returns the element at index int(N*0.05) - 1 of the ascending-sorted
    array, clamped to index 0.  Without the clamp, samples with N < 20
    produce index -1 and silently return the *maximum* instead of a low
    percentile.
    """
    N = len(X)
    Xsorted = np.sort(X)
    # max(..., 0) guards the small-sample case where int(N*0.05) - 1 == -1.
    fifth = max(int(N * 0.05) - 1, 0)
    return Xsorted[fifth]
def get_fraction_under_value(X, value):
    """Percentage (0-100) of elements of X at or below `value`.

    Scans the ascending-sorted array for the first element >= value and
    reports (index+1)/N * 100.  If every element is below `value`, the
    whole sample is under it, so 100.0 is returned (the previous version
    hit an UnboundLocalError in that branch).
    """
    N = len(X)
    Xsorted = np.sort(X)
    for ii in range(N):
        if value <= Xsorted[ii]:
            return float((ii + 1) / N) * 100
    # value exceeds every element of X.
    return 100.0
In [6]:
# Load all MedHelp posts (cwd is ../data); first CSV column is the index.
infile="MedHelp-posts.csv"
df=pd.read_csv(infile,index_col=0)
# Peek at one row — downstream cells rely on 'user id' and 'text' columns.
df.head(1)
Out[6]:
In [7]:
# Load user classifications; downstream cells rely on the 'is expert' column.
infile="MedHelp-users-class.csv"
df_users=pd.read_csv(infile,index_col=0)
df_users.head(1)
Out[7]:
In [8]:
from textstat.textstat import textstat
def get_grade(a, b, text):
    """Parameterized Flesch-Kincaid-style grade: a*ASL + b*ASW.

    ASL = average sentence length, ASW = average syllables per word
    (both computed by textstat).  The classic FK grade uses a=0.39,
    b=11.8 with a -15.59 offset; here the offset is dropped and the two
    weights are free parameters to be tuned.
    """
    avg_sentence_len = textstat.avg_sentence_length(text)
    avg_syllables = textstat.avg_syllables_per_word(text)
    return float(a * avg_sentence_len) + float(b * avg_syllables)
In [9]:
#def coleman_liau_index(n_chars, n_words, n_sents):
# """https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index"""
# return (5.879851 * n_chars / n_words) - (29.587280 * n_sents / n_words) - 15.800804
def mod_cli(a, b, text):
    """Parameterized Coleman-Liau-style index: a*L - b*S.

    L = letters per 100 words and S = sentences per 100 words, each
    rounded to 2 decimals before weighting (matching textstat's own CLI
    implementation).  The standard index uses a=0.058, b=0.296 with a
    -15.8 offset; here the offset is dropped and a, b are tunable.
    """
    letters_per_100 = round(textstat.avg_letter_per_word(text) * 100, 2)
    sentences_per_100 = round(textstat.avg_sentence_per_word(text) * 100, 2)
    score = float(a * letters_per_100) - (b * sentences_per_100)
    return round(score, 2)
In [10]:
# Sanity check of get_grade on one sample reply with unit weights.
text="Whether or not your son has Chiari malformation is a question for your medical doctor. If you have concerns, absolutely take them to your doctor and make sure they are addressed."
grade=get_grade(1,1,text)
print(grade)
In [11]:
# Calculate all smog scores for texts from professional and non-professional
# User ids for experts ('is expert' == 1) and non-experts (== 0).
expert_uids=df_users.loc[df_users['is expert']==1].index
non_expert_uids=df_users.loc[df_users['is expert']==0].index
In [12]:
# Split posts by author class; the two printed subset counts should sum
# to the total printed first.
print(len(df))
df_experts=df.loc[df['user id'].isin(expert_uids)]
print(len(df_experts))
df_non_experts=df.loc[df['user id'].isin(non_expert_uids)]
print(len(df_non_experts))
In [13]:
# Raw post text per group, used by every readability sweep below.
text_experts=df_experts['text']
text_non_experts=df_non_experts['text']
In [14]:
# Flesch-Kincaid grade per post (smog_index was tried first, kept for reference).
#readability_experts=text_experts.apply(textstat.smog_index)
#readability_non_experts=text_non_experts.apply(textstat.smog_index)
readability_experts=text_experts.apply(textstat.flesch_kincaid_grade)
readability_non_experts=text_non_experts.apply(textstat.flesch_kincaid_grade)
In [15]:
# Effect size between the expert and non-expert readability distributions.
print(cohen_d(readability_experts.values,readability_non_experts.values))
In [14]:
# Smoke test on 1..100: the 5th percentile should be 5, and 5% of values are <= 5.
X = np.arange(100,0,-1)
print(get_fifth_p(X))
print(get_fraction_under_value(X,5))
In [26]:
# Fraction of non-expert posts scoring below the experts' 5th-percentile
# Flesch-Kincaid grade (the separability metric used throughout).
fifth_p = get_fifth_p(readability_experts.values)
print(fifth_p)
print(get_fraction_under_value(readability_non_experts.values,fifth_p))
In [28]:
# Smog:
# Same separability check using the SMOG index alone.
X1=text_experts.apply(textstat.smog_index).values
X2=text_non_experts.apply(textstat.smog_index).values
fifth_p = get_fifth_p(X1)
print(fifth_p)
print(get_fraction_under_value(X2,fifth_p))
In [31]:
# Smog + Flesch Kincaid grade
# Unweighted sum of the two indices per post.
X1 = text_experts.apply(textstat.smog_index).values
X1 = X1 + text_experts.apply(textstat.flesch_kincaid_grade).values
X2 = text_non_experts.apply(textstat.smog_index).values
X2 = X2 + text_non_experts.apply(textstat.flesch_kincaid_grade).values
fifth_p = get_fifth_p(X1)
print(fifth_p)
print(get_fraction_under_value(X2,fifth_p))
In [32]:
# Coleman liau
# Same separability check using the Coleman-Liau index alone.
X1=text_experts.apply(textstat.coleman_liau_index).values
X2=text_non_experts.apply(textstat.coleman_liau_index).values
fifth_p = get_fifth_p(X1)
print(fifth_p)
print(get_fraction_under_value(X2,fifth_p))
In [13]:
# Coleman Liau + Flesch Kincaid grade
#automated_readability_index
def readability_cli(X):
    """Coleman-Liau index for each text in Series X, as a numpy array."""
    scores = X.apply(textstat.coleman_liau_index)
    return scores.values


def readability_fcg(X):
    """Flesch-Kincaid grade for each text in Series X, as a numpy array."""
    scores = X.apply(textstat.flesch_kincaid_grade)
    return scores.values
In [14]:
# Coleman-Liau + Flesch-Kincaid grade: sweep the FK weight `a` and report
# the experts' 5th percentile and the fraction of non-experts under it.
X1a = readability_cli(text_experts)
X1b = readability_fcg(text_experts)
X2a = readability_cli(text_non_experts)
X2b = readability_fcg(text_non_experts)
for a in range(10):
    combined_experts = X1a + a * X1b
    combined_non_experts = X2a + a * X2b
    fifth_p = get_fifth_p(combined_experts)
    fraction = get_fraction_under_value(combined_non_experts, fifth_p)
    print(a, fifth_p, fraction)
In [16]:
# Automated Readability Index on its own.
def readability_ari(X):
    """ARI score for each text in Series X, as a numpy array."""
    scores = X.apply(textstat.automated_readability_index)
    return scores.values

X1 = readability_ari(text_experts)
X2 = readability_ari(text_non_experts)
fifth_p = get_fifth_p(X1)
print(fifth_p)
print(get_fraction_under_value(X2, fifth_p))
In [19]:
# Coleman-Liau + 6*Flesch-Kincaid (best weight from the previous sweep),
# then sweep the ARI weight `a` on top.  X1ab/X2ab are reused by later cells.
X1a = readability_cli(text_experts)
X1b = readability_fcg(text_experts)
X1c = readability_ari(text_experts)
X2a = readability_cli(text_non_experts)
X2b = readability_fcg(text_non_experts)
X2c = readability_ari(text_non_experts)
X1ab = X1a + 6 * X1b
X2ab = X2a + 6 * X2b
for a in range(10):
    combined_experts = X1ab + a * X1c
    combined_non_experts = X2ab + a * X2c
    fifth_p = get_fifth_p(combined_experts)
    print(a, fifth_p, get_fraction_under_value(combined_non_experts, fifth_p))
In [23]:
def readability_dcr(X):
    """Dale-Chall readability score for each text in Series X, as ndarray."""
    scores = X.apply(textstat.dale_chall_readability_score)
    return scores.values

X1_dcr = readability_dcr(text_experts)
X2_dcr = readability_dcr(text_non_experts)
fifth_p = get_fifth_p(X1_dcr)
print(fifth_p)
print(get_fraction_under_value(X2_dcr, fifth_p))
In [24]:
def readability_dw(X):
    """Difficult-word count for each text in Series X, as a numpy array."""
    counts = X.apply(textstat.difficult_words)
    return counts.values

X1_dw = readability_dw(text_experts)
X2_dw = readability_dw(text_non_experts)
fifth_p = get_fifth_p(X1_dw)
print(fifth_p)
print(get_fraction_under_value(X2_dw, fifth_p))
In [25]:
# Add Dale-Chall on top of the CLI + 6*FK combination
# (X1ab/X2ab were computed in the earlier sweep cell).
for a in range(2):
    combined_experts = X1ab + a * X1_dcr
    combined_non_experts = X2ab + a * X2_dcr
    fifth_p = get_fifth_p(combined_experts)
    print(a, fifth_p, get_fraction_under_value(combined_non_experts, fifth_p))
In [28]:
# Add the difficult-word count on top of CLI + 6*FK, sweeping its weight.
for a in range(5):
    combined_experts = X1ab + a * X1_dw
    combined_non_experts = X2ab + a * X2_dw
    fifth_p = get_fifth_p(combined_experts)
    print(a, fifth_p, get_fraction_under_value(combined_non_experts, fifth_p))
In [29]:
def readability_lwf(X):
    """Linsear Write formula score for each text in Series X, as ndarray."""
    scores = X.apply(textstat.linsear_write_formula)
    return scores.values

X1_lwf = readability_lwf(text_experts)
X2_lwf = readability_lwf(text_non_experts)
fifth_p = get_fifth_p(X1_lwf)
print(fifth_p)
print(get_fraction_under_value(X2_lwf, fifth_p))
In [30]:
# Add Linsear Write on top of CLI + 6*FK, sweeping its weight.
for a in range(5):
    combined_experts = X1ab + a * X1_lwf
    combined_non_experts = X2ab + a * X2_lwf
    fifth_p = get_fifth_p(combined_experts)
    print(a, fifth_p, get_fraction_under_value(combined_non_experts, fifth_p))
In [31]:
def readability_gf(X):
    """Gunning fog score for each text in Series X, as a numpy array."""
    scores = X.apply(textstat.gunning_fog)
    return scores.values

X1_gf = readability_gf(text_experts)
X2_gf = readability_gf(text_non_experts)
fifth_p = get_fifth_p(X1_gf)
print(fifth_p)
print(get_fraction_under_value(X2_gf, fifth_p))
In [33]:
# Add Gunning fog on top of CLI + 6*FK, sweeping its weight.
for a in range(5):
    combined_experts = X1ab + a * X1_gf
    combined_non_experts = X2ab + a * X2_gf
    fifth_p = get_fifth_p(combined_experts)
    print(a, fifth_p, get_fraction_under_value(combined_non_experts, fifth_p))
In [53]:
# Grid search over the custom grade formula: score = a*ASL + b*ASW.
# For each (a, b) pair, record what fraction of non-expert posts falls
# below the experts' 5th-percentile score.
step_b = 5
step_a = 0.05
results = []
for aa in range(1, 20):
    a = aa * step_a
    for bb in range(1, 20):
        b = bb * step_b
        r1 = text_experts.apply(lambda x: get_grade(a, b, x)).values
        r2 = text_non_experts.apply(lambda x: get_grade(a, b, x)).values
        fifth_p = get_fifth_p(r1)
        fraction = get_fraction_under_value(r2, fifth_p)
        results.append(dict(a=a, b=b, fraction=fraction))
In [54]:
import pickle
# Persist the (a, b, fraction) sweep so it can be reloaded without
# recomputing (reload snippet kept commented below).
file_pi = open('readability_test.pi', 'wb')
pickle.dump(results, file_pi)
file_pi.close()
#del results
#file_pi = open('readability_test.pi', 'rb')
#results = pickle.load(file_pi)
#print(results)
In [55]:
# NOTE(review): this rebinds `df` (previously the posts frame) to the
# sweep results — the posts frame is no longer reachable under that name.
df = pd.DataFrame.from_dict(results)
df.head(2)
Out[55]:
In [56]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set()
# Reshape the sweep results into an (a x b) grid of fractions.
# NOTE: DataFrame.pivot's positional arguments were removed in pandas 2.0;
# the keyword form below works on both old and new pandas.
data = df.pivot(index="a", columns="b", values="fraction")
# Heatmap: fraction of non-expert posts below the experts' 5th-percentile
# score for each (a, b) weighting of the custom grade formula.
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(data, linewidths=.5, ax=ax)
plt.show()
In [42]:
# Grid search over the modified Coleman-Liau formula: score = c*L - d*S.
a = 7
b = 30.0
step_c = 1
step_d = 5
del results
results = []
for cc in range(1, 10):
    c = cc * step_c
    for dd in range(1, 10):
        d = dd * step_d
        r1 = text_experts.apply(lambda x: mod_cli(c, d, x)).values
        r2 = text_non_experts.apply(lambda x: mod_cli(c, d, x)).values
        fifth_p = get_fifth_p(r1)
        fraction = get_fraction_under_value(r2, fifth_p)
        results.append(dict(c=c, d=d, fraction=fraction))
        # progress log (indentation was lost in export; assumed in-loop)
        print(c, d, fraction)
In [46]:
# Rebind `df` to the (c, d, fraction) sweep results for plotting below.
df = pd.DataFrame.from_dict(results)
In [44]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set()
# Reshape the (c, d) sweep into a grid of fractions.
# NOTE: DataFrame.pivot's positional arguments were removed in pandas 2.0;
# the keyword form below works on both old and new pandas.
data = df.pivot(index="c", columns="d", values="fraction")
# Heatmap of separability for each (c, d) weighting of the modified CLI.
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(data, linewidths=.5, ax=ax)
plt.show()
In [45]:
# Largest fractions first: the best-separating (c, d) weightings on top.
df_sorted=df.sort_values(['fraction'], ascending=[False])
df_sorted.head()
Out[45]:
In [49]:
# Combine both tuned formulas, sweeping the mixing factor:
# score = fact * mod_cli(c, d, text) + get_grade(a, b, text)
a = 0.7
b = 30.0
c = 1
d = 20
for fact in range(3, 10):
    r1 = text_experts.apply(lambda x: fact * mod_cli(c, d, x) + get_grade(a, b, x)).values
    r2 = text_non_experts.apply(lambda x: fact * mod_cli(c, d, x) + get_grade(a, b, x)).values
    fifth_p = get_fifth_p(r1)
    fraction = get_fraction_under_value(r2, fifth_p)
    print(fact, fifth_p, fraction)
In [54]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.mlab as mlab
from matplotlib import gridspec
from scipy.stats import norm
from scipy.optimize import curve_fit
from lognormal import lognormal, lognormal_stats
import numpy as np
In [62]:
# Score both groups with the chosen modified CLI (c=1, d=20) and inspect
# the resulting distributions before plotting.
r1 = text_experts.apply(lambda x: mod_cli(1,20,x))
r2 = text_non_experts.apply(lambda x: mod_cli(1,20,x))
fifth_p = get_fifth_p(r1)
fraction = get_fraction_under_value(r2,fifth_p)
print('fifth',fifth_p,' fraction',fraction)
xmax =int(r1.values.max())
xmin = int(r2.values.min())
# Data-driven limits above are immediately overridden with fixed plot bounds.
xmin = 100
xmax = 600
print(type(r1.values),type(r2.values))
print(len(r1),len(r2))
print(r1.values.mean())
print(r2.values.mean())
print(r1.values.std())
print(r2.values.std())
In [59]:
# Global styling for the publication figure.
plt.rcParams['text.usetex'] = True  # requires a working LaTeX install
# NOTE: the 'text.latex.unicode' rcParam was deprecated in matplotlib 2.2
# and removed in 3.0 (unicode is always enabled); setting it raises
# KeyError on modern matplotlib, so that line is dropped.
plt.rcParams.update({'font.size': 24})
label_size = 18
plt.rcParams['xtick.labelsize'] = label_size
plt.rcParams['ytick.labelsize'] = label_size
plt.rcParams['axes.linewidth'] = 5  # set the value globally
plt.rc('axes', linewidth=2)  # overrides the global 5 set just above
fig = plt.figure()
# Two stacked panels: experts (top) and non-experts (bottom).
gs = gridspec.GridSpec(2, 1)
plt.subplots_adjust(left=0.2, right=1.0, bottom=0.17, top=0.9)
fig.set_size_inches(6, 6)
# Shared axis labels for both panels.
fig.text(0.04, 0.5, 'Distribution', va='center', rotation='vertical')
fig.text(0.4, 0.04, 'Readability score', va='center')
Out[59]:
In [60]:
# Two-panel normalized histogram of readability scores (xmin/xmax, r1/r2,
# fig/gs come from the preceding cells).
xsteps = int(xmax / 10)
xsteps_bins = int(xsteps / 4)
x = np.arange(xmin, xmax, xsteps)  # shared tick positions

# Panel 1: experts
ax1 = plt.subplot(gs[0], facecolor='w')
ax1.set_xlim([xmin, xmax])
ax1.set_xticks(x)
# Boolean form: labelbottom='off' string support was removed in matplotlib 3.x.
ax1.tick_params(labelbottom=False)
ax1.grid(linestyle='--', linewidth=0.5, color='black', which='both')
ax1.axhline(linewidth=4, color="black")  # thicken the x-axis line
ax1.axvline(linewidth=4, color="black")  # thicken the y-axis line
X = r1.values
# density=True replaces `normed=1`, which was removed in matplotlib 3.1.
n, bins, patches = plt.hist(X, bins=np.arange(0, xmax, xsteps_bins),
                            density=True, facecolor='blue', align='mid',
                            label='Experts')
mu = X.mean()
var = X.var()
print("Experts: Mean,variance: ({},{})".format(mu, var))

# Panel 2: non-experts
ax2 = plt.subplot(gs[1], facecolor='w')
ax2.set_xlim([xmin, xmax])
ax2.set_xticks(x)
ax2.grid(linestyle='--', linewidth=0.5, color='black', which='both')
ax2.axhline(linewidth=4, color="black")
ax2.axvline(linewidth=4, color="black")
X = r2.values
n, bins, patches = plt.hist(X, bins=np.arange(0, xmax, xsteps_bins),
                            density=True, facecolor='orange', align='mid',
                            label='Non experts')
mu = X.mean()
var = X.var()
print("Non experts: Mean,variance:", mu, var)
plt.show()
In [61]:
#fig.facecolor="white"
# Save the two-panel histogram at print resolution.
fig.savefig('Readability-v2.jpeg', dpi=300)
In [ ]: