notebook.community

Edit and run



In [12]:

    
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('/Users/arimorcos/Github/getRedditDataset/')
from celebReddit import countMisspellings, countWords
import redditDB
import redUserComment
from textstat.textstat import textstat
import statistics









    



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



In [22]:

    
"""
Process previous to default 
"""

# initialize database and get subreddits
prevDefaultDB = redditDB.RedditDB('newDefaults_preDefault')
subList = prevDefaultDB.getSubreddits()

# count each in each subreddit
wordCountPre = {}
misSpellCountPre = {}
fracMisspellPre = {}
for sub in subList:
    
    # get comment text
    subComments = prevDefaultDB.getSubredditCommentText(sub)
    
    # count words and misspellings
    wordCountPre[sub] = countWords(subComments)
    misSpellCountPre[sub] = countMisspellings(subComments)
    fracMisspellPre[sub] = float(misSpellCountPre[sub])/wordCountPre[sub]



In [23]:

    
"""
Process post default 
"""

# initialize database and get subreddits
postDefaultDB = redditDB.RedditDB('newDefaults_postDefault')
subList = postDefaultDB.getSubreddits()

# count each in each subreddit
wordCountPost = {}
misSpellCountPost = {}
fracMisspellPost = {}
for sub in subList:
    
    # get comment text
    subComments = postDefaultDB.getSubredditCommentText(sub)
    
    # count words and misspellings
    wordCountPost[sub] = countWords(subComments)
    misSpellCountPost[sub] = countMisspellings(subComments)
    fracMisspellPost[sub] = float(misSpellCountPost[sub])/wordCountPost[sub]



In [2]:

    
def getReadabilityStats(text):

    # get scores
    fleschGrade = textstat.flesch_kincaid_grade(text)
    fleschScore = textstat.flesch_reading_ease(text)
    coleman = textstat.coleman_liau_index(text)
    ari = textstat.automated_readability_index(text)

    # store
    return {'fleschGrade': fleschGrade, 'fleschScore': fleschScore,
           'coleman': coleman, 'ari': ari}



In [3]:

    
"""
Process previous to default for text statistics 
"""

# initialize database and get subreddits
prevDefaultDB = redditDB.RedditDB('newDefaults_preDefault')
subList = prevDefaultDB.getSubreddits()

# count each in each subreddit
readScoresPre = {}
for sub in subList:
    
    # get comment text
    subComments = prevDefaultDB.getSubredditCommentText(sub)
    
    # get text stats 
    comments = " ".join(subComments)
    readScoresPre[sub] = getReadabilityStats(comments)
    
    print sub









    



Art
askscience
creepy
dataisbeautiful
DIY
Documentaries
Fitness
food
Futurology
gadgets
GetMotivated
history
InternetIsBeautiful
Jokes
LifeProTips
listentothis
mildlyinteresting
nosleep
nottheonion
OldSchoolCool
personalfinance
philosophy
photoshopbattles
Showerthoughts
space
sports
tifu
TwoXChromosomes
UpliftingNews
WritingPrompts
blog



In [4]:

    
"""
Process post default 
"""

# initialize database and get subreddits
postDefaultDB = redditDB.RedditDB('newDefaults_postDefault')
subList = postDefaultDB.getSubreddits()

# count each in each subreddit
readScoresPost = {}
for sub in subList:
    
    # get comment text
    subComments = postDefaultDB.getSubredditCommentText(sub)
    
    # get text stats 
    comments = " ".join(subComments)
    readScoresPost[sub] = getReadabilityStats(comments)
    
    print sub









    



Art
askscience
creepy
dataisbeautiful
DIY
Documentaries
Fitness
food
Futurology
gadgets
GetMotivated
history
InternetIsBeautiful
Jokes
LifeProTips
listentothis
mildlyinteresting
nosleep
nottheonion
OldSchoolCool
personalfinance
philosophy
photoshopbattles
Showerthoughts
space
sports
tifu
TwoXChromosomes
UpliftingNews
WritingPrompts
blog



In [6]:

    
print [str(readScoresPre[sub]['fleschGrade']) + '-->' + str(readScoresPost[sub]['fleschGrade']) for sub in subList]









    



['7.2-->7.2', '9.1-->9.1', '7.2-->6.0', '8.8-->9.1', '6.4-->6.4', '8.8-->8.0', '5.6-->5.6', '6.0-->6.0', '8.4-->8.4', '7.6-->6.8', '6.4-->6.8', '8.7-->8.7', '7.2-->7.2', '5.6-->4.4', '6.4-->6.8', '15.4-->13.1', '6.8-->6.8', '4.8-->4.8', '6.4-->8.0', '6.4-->6.8', '6.8-->7.2', '8.7-->9.1', '20.6-->18.6', '6.4-->6.4', '8.4-->8.4', '6.4-->6.4', '4.8-->5.2', '7.2-->7.2', '7.2-->6.4', '4.8-->4.8', '7.2-->6.0']



In [53]:

    
fleschPre = [readScoresPre[sub]['fleschScore'] for sub in subList]
fleschPost = [readScoresPost[sub]['fleschScore'] for sub in subList]
print ['Mean pre: ' + str(statistics.mean(fleschPre)) + ' --> Mean post: ' + str(statistics.mean(fleschPost))]
from scipy import stats as spStats
spStats.ttest_ind(fleschPre, fleschPost)









    



['Mean pre: 64.7996774194 --> Mean post: 66.6880645161']






    Out[53]:





(-0.34325952855960573, 0.73260307960633608)



In [49]:

    
readScoresPre['Art']









    Out[49]:





{'ari': 9.1, 'coleman': 11.3, 'fleschGrade': 7.2, 'fleschScore': 66.74}



In [20]:

    
wordCountPre









    Out[20]:





{u'Art': 16249}



In [24]:

    
print [sub + '  ' + str(wordCountPre[sub]) + ' --> ' + str(wordCountPost[sub]) for sub in subList]









    



[u'Art  16249 --> 34914', u'askscience  308265 --> 425537', u'creepy  33629 --> 54419', u'dataisbeautiful  52821 --> 80220', u'DIY  103779 --> 162395', u'Documentaries  25712 --> 67538', u'Fitness  488732 --> 583715', u'food  59981 --> 95909', u'Futurology  138376 --> 223272', u'gadgets  20629 --> 76492', u'GetMotivated  36249 --> 71254', u'history  53435 --> 70717', u'InternetIsBeautiful  17744 --> 26205', u'Jokes  37457 --> 54874', u'LifeProTips  67614 --> 118430', u'listentothis  64782 --> 130759', u'mildlyinteresting  110571 --> 123197', u'nosleep  68448 --> 104943', u'nottheonion  70426 --> 84588', u'OldSchoolCool  20527 --> 52942', u'personalfinance  425975 --> 820857', u'philosophy  115963 --> 198325', u'photoshopbattles  34821 --> 53153', u'Showerthoughts  98465 --> 111597', u'space  75997 --> 79490', u'sports  47096 --> 55257', u'tifu  67434 --> 142732', u'TwoXChromosomes  288026 --> 774924', u'UpliftingNews  11413 --> 32498', u'WritingPrompts  624512 --> 1670233', u'blog  1144 --> 770']



In [25]:

    
print [sub + '  ' + str(fracMisspellPre[sub]) + ' --> ' + str(fracMisspellPost[sub]) for sub in subList]









    



[u'Art  0.177857098898 --> 0.172939222089', u'askscience  0.150831265307 --> 0.15250377758', u'creepy  0.179487941955 --> 0.176923500983', u'dataisbeautiful  0.188958936787 --> 0.174569932685', u'DIY  0.150849401131 --> 0.145583299978', u'Documentaries  0.180732731798 --> 0.155941840149', u'Fitness  0.160453991144 --> 0.156658643345', u'food  0.179890298595 --> 0.173946136442', u'Futurology  0.14798809042 --> 0.143838009244', u'gadgets  0.164913471327 --> 0.159912147676', u'GetMotivated  0.167039090734 --> 0.157759564375', u'history  0.15884719753 --> 0.147955937045', u'InternetIsBeautiful  0.201589269612 --> 0.204579278764', u'Jokes  0.190725365085 --> 0.188085432081', u'LifeProTips  0.157526547756 --> 0.154572321202', u'listentothis  0.392269457565 --> 0.341307290512', u'mildlyinteresting  0.199564080998 --> 0.193584259357', u'nosleep  0.160954301075 --> 0.148947523894', u'nottheonion  0.156064521626 --> 0.15730363645', u'OldSchoolCool  0.215082574171 --> 0.199652449851', u'personalfinance  0.149389048653 --> 0.146682065207', u'philosophy  0.148029975078 --> 0.146159082314', u'photoshopbattles  0.38956376899 --> 0.367204108893', u'Showerthoughts  0.168577667191 --> 0.167334247336', u'space  0.148637446215 --> 0.147251226569', u'sports  0.162455410226 --> 0.16148180321', u'tifu  0.15898508171 --> 0.158787097497', u'TwoXChromosomes  0.140476901391 --> 0.137712859584', u'UpliftingNews  0.173135897661 --> 0.157332758939', u'WritingPrompts  0.177082265833 --> 0.178906176563', u'blog  0.203671328671 --> 0.272727272727']



In [26]:

    
statistics.mean([fracMisspellPre[sub] for sub in subList])









    Out[26]:





0.18392356210108524



In [27]:

    
statistics.mean([fracMisspellPost[sub] for sub in subList])









    Out[27]:





0.17897235169492368



In [30]:

    
from scipy import stats as spStats
pre = [fracMisspellPre[sub] for sub in subList]
post = [fracMisspellPost[sub] for sub in subList]
spStats.ttest_ind(pre,post)









    Out[30]:





(0.34807514480107871, 0.72900202386053103)



In [ ]: