In [12]:
%load_ext autoreload
%autoreload 2
# Standard library
import sys
import statistics

# Third-party
from scipy import stats as spStats
from textstat.textstat import textstat

# Project-local modules (the repository must be on sys.path before importing them)
sys.path.append('/Users/arimorcos/Github/getRedditDataset/')
from celebReddit import countMisspellings, countWords
import redditDB
import redUserComment
In [22]:
"""
Process previous to default
"""
# Open the pre-default snapshot and list the subreddits it contains.
prevDefaultDB = redditDB.RedditDB('newDefaults_preDefault')
subList = prevDefaultDB.getSubreddits()

# Per-subreddit results, each dict keyed by the entries of subList.
wordCountPre, misSpellCountPre, fracMisspellPre = {}, {}, {}

for sub in subList:
    # Fetch the comment text for this subreddit.
    subComments = prevDefaultDB.getSubredditCommentText(sub)

    # Word total, misspelling total, and misspellings per word.
    wordCountPre[sub] = countWords(subComments)
    misSpellCountPre[sub] = countMisspellings(subComments)
    # Promote to float so the ratio is not truncated by integer division.
    fracMisspellPre[sub] = misSpellCountPre[sub] / float(wordCountPre[sub])
In [23]:
"""
Process post default
"""
# Open the post-default snapshot and list the subreddits it contains.
postDefaultDB = redditDB.RedditDB('newDefaults_postDefault')
subList = postDefaultDB.getSubreddits()

# Per-subreddit results, each dict keyed by the entries of subList.
wordCountPost, misSpellCountPost, fracMisspellPost = {}, {}, {}

for sub in subList:
    # Fetch the comment text for this subreddit.
    subComments = postDefaultDB.getSubredditCommentText(sub)

    # Word total, misspelling total, and misspellings per word.
    wordCountPost[sub] = countWords(subComments)
    misSpellCountPost[sub] = countMisspellings(subComments)
    # Promote to float so the ratio is not truncated by integer division.
    fracMisspellPost[sub] = misSpellCountPost[sub] / float(wordCountPost[sub])
In [2]:
def getReadabilityStats(text):
    """Compute four textstat readability metrics for ``text``.

    Args:
        text: The text to score; it is passed unchanged to each textstat
            scorer.

    Returns:
        A dict with the keys 'fleschGrade', 'fleschScore', 'coleman' and
        'ari', holding the results of ``flesch_kincaid_grade``,
        ``flesch_reading_ease``, ``coleman_liau_index`` and
        ``automated_readability_index`` respectively.
    """
    # (result key, textstat scorer) pairs, evaluated in this order.
    scorers = (
        ('fleschGrade', textstat.flesch_kincaid_grade),
        ('fleschScore', textstat.flesch_reading_ease),
        ('coleman', textstat.coleman_liau_index),
        ('ari', textstat.automated_readability_index),
    )
    return dict((key, score(text)) for key, score in scorers)
In [3]:
"""
Process previous to default for text statistics
"""
# Open the pre-default snapshot and list the subreddits it contains.
prevDefaultDB = redditDB.RedditDB('newDefaults_preDefault')
subList = prevDefaultDB.getSubreddits()

# Readability metrics per subreddit (one getReadabilityStats dict each).
readScoresPre = {}

for sub in subList:
    # Fetch the comment text for this subreddit.
    subComments = prevDefaultDB.getSubredditCommentText(sub)

    # Score the subreddit as a single document: comments joined by spaces.
    comments = " ".join(subComments)
    readScoresPre[sub] = getReadabilityStats(comments)

    # Progress indicator: show which subreddit was just processed.
    print(sub)
In [4]:
"""
Process post default
"""
# Open the post-default snapshot and list the subreddits it contains.
postDefaultDB = redditDB.RedditDB('newDefaults_postDefault')
subList = postDefaultDB.getSubreddits()

# Readability metrics per subreddit (one getReadabilityStats dict each).
readScoresPost = {}

for sub in subList:
    # Fetch the comment text for this subreddit.
    subComments = postDefaultDB.getSubredditCommentText(sub)

    # Score the subreddit as a single document: comments joined by spaces.
    comments = " ".join(subComments)
    readScoresPost[sub] = getReadabilityStats(comments)

    # Progress indicator: show which subreddit was just processed.
    print(sub)
In [6]:
# Flesch-Kincaid grade per subreddit, shown as "pre-->post".
print(['%s-->%s' % (readScoresPre[sub]['fleschGrade'],
                    readScoresPost[sub]['fleschGrade'])
       for sub in subList])
In [53]:
# Flesch reading-ease score per subreddit, before and after, in subList order.
fleschPre = [readScoresPre[sub]['fleschScore'] for sub in subList]
fleschPost = [readScoresPost[sub]['fleschScore'] for sub in subList]

# Report the mean score of each group.
print(['Mean pre: ' + str(statistics.mean(fleschPre)) + ' --> Mean post: ' + str(statistics.mean(fleschPost))])

# Two-sample t-test on the two groups (spStats is imported in the notebook's
# import cell). Left as the last expression so the cell displays the result.
# NOTE(review): both lists are indexed by the same subreddits, so the samples
# look paired; spStats.ttest_rel may be the more appropriate test -- confirm.
spStats.ttest_ind(fleschPre, fleschPost)
Out[53]:
In [49]:
# Display the readability metrics stored under the 'Art' key of the
# pre-default results.
readScoresPre['Art']
Out[49]:
In [20]:
# Display the full pre-default word-count dict (one entry per subreddit).
wordCountPre
Out[20]:
In [24]:
# Word count per subreddit, shown as "<subreddit> <pre> --> <post>".
print(['%s %s --> %s' % (sub, wordCountPre[sub], wordCountPost[sub])
       for sub in subList])
In [25]:
# Misspelling fraction per subreddit, shown as "<subreddit> <pre> --> <post>".
print(['%s %s --> %s' % (sub, fracMisspellPre[sub], fracMisspellPost[sub])
       for sub in subList])
In [26]:
# Mean pre-default misspelling fraction, averaged over the subreddits in subList.
statistics.mean([fracMisspellPre[sub] for sub in subList])
Out[26]:
In [27]:
# Mean post-default misspelling fraction, averaged over the subreddits in subList.
statistics.mean([fracMisspellPost[sub] for sub in subList])
Out[27]:
In [30]:
# Misspelling fraction per subreddit, before and after, in subList order.
pre = [fracMisspellPre[sub] for sub in subList]
post = [fracMisspellPost[sub] for sub in subList]

# Two-sample t-test on the two groups (spStats is imported in the notebook's
# import cell). Left as the last expression so the cell displays the result.
# NOTE(review): both lists are indexed by the same subreddits, so the samples
# look paired; spStats.ttest_rel may be the more appropriate test -- confirm.
spStats.ttest_ind(pre, post)
Out[30]:
In [ ]: