In [1]:
%matplotlib inline
# import libraries
import collections
import hashlib
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tag import pos_tag, map_tag
from os import path
import pandas as pd
import string
import random
import re
from wordcloud import WordCloud, STOPWORDS
# load data
anorexiaSubreddits = pd.read_csv("data/subreddits_anorexia.csv", encoding='ISO-8859-1')
obesitySubreddits = pd.read_csv("data/subreddits_obesity.csv", encoding='ISO-8859-1')
bothSubreddits = pd.read_csv("data/subreddits_both.csv", encoding='ISO-8859-1')
In [2]:
# apply a hash function to the author column in each dataset
# (hash every row; md5 is deterministic, so repeat authors get identical digests)
anorexia_authors = anorexiaSubreddits['author'].apply(lambda a: hashlib.md5(a.encode()).hexdigest()).to_frame()
obesity_authors = obesitySubreddits['author'].apply(lambda a: hashlib.md5(a.encode()).hexdigest()).to_frame()
both_authors = bothSubreddits['author'].apply(lambda a: hashlib.md5(a.encode()).hexdigest()).to_frame()
In [3]:
anorexiaSubreddits['hashedAuthors'] = anorexia_authors['author']
obesitySubreddits['hashedAuthors'] = obesity_authors['author']
bothSubreddits['hashedAuthors'] = both_authors['author']
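The md5 digests above give each author a stable pseudonym, but Reddit usernames are public, so an unsalted hash can be reversed by simply hashing a list of known usernames. A minimal sketch of a salted variant, assuming a SALT value that is generated and stored outside any published artifact:

In [ ]:
# sketch: salted hashing for stronger anonymization
# (SALT is hypothetical and must never be published with the data)
import os
SALT = os.urandom(16)

def anonymize(username):
    return hashlib.sha256(SALT + username.encode()).hexdigest()

# usage: anorexiaSubreddits['hashedAuthors'] = anorexiaSubreddits['author'].apply(anonymize)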
In [4]:
# print example of anorexia_authors (hashed)
anorexia_authors.head()
Out[4]:
In [5]:
# print first 10 rows of anorexia-related subreddits
# (minus original author column)
anorexiaSubreddits[["hashedAuthors", "body", "subreddit", "subreddit_id", "score"]].head()
Out[5]:
In [6]:
# most common subreddits posted to
anorexiaSubreddits["subreddit"].value_counts()[:10]
Out[6]:
In [8]:
labels = "fatpeoplehate", "AskReddit", "fatlogic", "relationships", "TwoXChromosomes", "WTF", "raisedbynarcissists", "Fitness", "AskWomen", "loseit"
sizes = [251, 214, 207, 74, 32, 30, 26, 25, 24, 24]
colors = ["navajowhite", "aliceblue", "lavenderblush", "honeydew","blanchedalmond", "lemonchiffon","linen", "azure", "thistle", "beige"]
plt.pie(sizes, labels=labels, colors=colors,
autopct='%1.1f%%', shadow=False, startangle=90, pctdistance=1.13, labeldistance=1.4)
plt.axis('equal')
plt.show()
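The labels and sizes above were transcribed by hand from the value_counts output two cells earlier; a small sketch that derives them directly, so the chart cannot drift out of sync with the data:

In [ ]:
# derive the pie-chart inputs from value_counts instead of hardcoding them
top10 = anorexiaSubreddits["subreddit"].value_counts()[:10]
plt.pie(top10.values, labels=top10.index,
        autopct='%1.1f%%', startangle=90, pctdistance=1.13, labeldistance=1.4)
plt.axis('equal')
plt.show()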
In [9]:
# most frequent authors
#anorexiaSubreddits["author"].value_counts()[:10] --> commented out to anonymize data
In [10]:
# map each author to a comment body (who wrote what?)
authorToBody = anorexiaSubreddits.set_index('author')['body'].to_dict()
In [11]:
#nltk.help.upenn_tagset()
In [12]:
# strip punctuation from all body comments
bodyList = []
for val in anorexiaSubreddits["body"]:
    val = re.sub("[^a-zA-Z]+", " ", val)
    bodyList.append(val)
# tokenize each comment with nltk.word_tokenize
tokenList = []
for sentence in bodyList:
    tokenList.append(nltk.word_tokenize(sentence))
# add POS tags to words
taggedList = []
for item in tokenList:
    taggedList.append(pos_tag(item))
# keep only adjectives (JJ, JJR, JJS) and adverbs (RB, RBR, RBS);
# nouns and verbs were tried as well, but adjectives and adverbs proved most telling
relevantList = []
for tagged in taggedList:
    for word, tag in tagged:
        if tag in ("JJ", "JJR", "JJS", "RB", "RBR", "RBS"):
            relevantList.append(word.lower())
# remove stopwords
stops = set(stopwords.words('english'))
finalList = [word for word in relevantList if word not in stops]
fdist = FreqDist(finalList)
common = fdist.most_common()[:101]
uncommon = list(reversed(fdist.most_common()))[:50]
print("These are the most common words:", common, "\n")
print("These are the least common words:", uncommon, "\n")
In [15]:
newCommon = [('anorexia', 336), ('fat', 323), ('mental', 278), ('healthy', 253), \
('bad', 142), ('normal', 139), ('enough', 136), ('first', 130), ('little', 125), ('pretty', 122), ('hard', 118), ('thin', 117), ('different', 116), ('less', 115), ('eating', 112), ('best', 111), ('anorexic', 110), ('high', 99), ('weight', 96), ('maybe', 93), ('least', 92), ('skinny', 91), ('great', 90), ('unhealthy', 85), ('old', 85), ('real', 81), ('physical', 80), ('medical', 79), ('underweight', 78), ('away', 77), ('serious', 75), ('wrong', 74), ('obese', 73), ('far', 73), ('big', 71), ('diet', 68), ('happy', 68), ('definitely', 67), ('overweight', 65), ('due', 64), ('similar', 63), ('low', 62), ('full', 62), ('whole', 62), ('social', 60), ('especially', 60), ('small', 58), ('possible', 58), ('completely', 57), ('seriously', 55), ('however', 55), ('already', 55), ('certain', 54), ('important', 52), ('rather', 52), ('worse', 52), ('finally', 50), ('literally', 49), ('severe', 49), ('obviously', 49), ('common', 48), ('psychological', 48), ('new', 47), ('human', 47), ('absolutely', 45), ('yet', 45), ('true', 45), ('nice', 45), ('quite', 44), ('strong', 44), ('honestly', 44), ('gt', 43), ('self', 43), ('later', 42)]
completeText = ""
for key, val in newCommon:
completeText += (key + " ") * val
In [16]:
text = completeText
# font_path assumes a macOS system font; point it at any local .ttf elsewhere
wordcloud = WordCloud(font_path='/Library/Fonts/Verdana.ttf',
                      relative_scaling=0.5,
                      stopwords=STOPWORDS  # must be a collection of words, not a string
                      ).generate(text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
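Repeating every word val times only so WordCloud can count it again is roundabout; WordCloud can consume the counts directly via generate_from_frequencies:

In [ ]:
# feed the counts straight to WordCloud; no giant intermediate string needed
wordcloud = WordCloud(relative_scaling=0.5).generate_from_frequencies(dict(newCommon))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()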
In [17]:
# anecdotes containing some of the most common words
listOfWords = ["eating", "weight", "pretty"]
count = 0
for sentence in bodyList:
    if all(word in sentence for word in listOfWords):
        print(sentence + "\n")
        count += 1
        if count > 5:
            break
In [18]:
# five highest- and five lowest-scoring comments
# (keying the dict on score keeps only one body per distinct score value)
scoreToBody = anorexiaSubreddits.set_index('score')['body'].to_dict()
for score, body in sorted(scoreToBody.items(), reverse=True)[:5]:
    print(score, ":", body, "\n")
print("-" * 117)
for score, body in sorted(scoreToBody.items())[:5]:
    print(score, ":", body, "\n")
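Because the dict above is keyed on score, comments that tie on score overwrite one another and at most one body per score survives. A sketch that ranks the rows themselves, so tied scores are all kept:

In [ ]:
# rank rows rather than dict keys, preserving ties on score
for _, row in anorexiaSubreddits.nlargest(5, 'score').iterrows():
    print(row['score'], ":", row['body'], "\n")
print("-" * 117)
for _, row in anorexiaSubreddits.nsmallest(5, 'score').iterrows():
    print(row['score'], ":", row['body'], "\n")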
In [19]:
# print first five values of dataset
obesitySubreddits[["hashedAuthors", "body", "subreddit", "subreddit_id", "score"]].head(5)
Out[19]:
In [20]:
# most common subreddits posted to
obesitySubreddits["subreddit"].value_counts()[:10]
Out[20]:
In [22]:
labels = "fatpeoplehate", "AskReddit", "fatlogic", "funny", "todayilearned", "science", "WTF", "worldnews", "Fitness", "TumblrinAction"
sizes = [1055, 1041, 953, 191, 165, 152, 142, 130, 129, 128]
colors = ["navajowhite", "lavenderblush", "aliceblue", "honeydew", "blanchedalmond", "lemonchiffon","linen", "azure", "thistle", "beige"]
plt.pie(sizes, labels=labels, colors=colors,
autopct='%1.1f%%', shadow=False, startangle=90, pctdistance=1.1, labeldistance=1.4)
plt.axis('equal')
plt.show()
In [23]:
# most frequent authors
# obesitySubreddits["author"].value_counts()[:10]
# (hidden to protect user data)
In [24]:
# create a dictionary matching author to comment body (aka: who wrote what?)
authorToBody = obesitySubreddits.set_index('author')['body'].to_dict()
In [25]:
# remove punctuation from body
bodyList = []
for val in obesitySubreddits["body"]:
    val = re.sub("[^a-zA-Z]+", " ", val)
    bodyList.append(val)
# tokenize each comment with nltk.word_tokenize
tokenList = []
for sentence in bodyList:
    tokenList.append(nltk.word_tokenize(sentence))
# add POS tags to words
taggedList = []
for item in tokenList:
    taggedList.append(pos_tag(item))
# keep only adjectives (JJ, JJR, JJS) and adverbs (RB, RBR, RBS)
relevantList = []
for tagged in taggedList:
    for word, tag in tagged:
        if tag in ("JJ", "JJR", "JJS", "RB", "RBR", "RBS"):
            relevantList.append(word.lower())
# remove stopwords
stops = set(stopwords.words('english'))
finalList = [word for word in relevantList if word not in stops]
fdist = FreqDist(finalList)
common = fdist.most_common()[:101]
uncommon = list(reversed(fdist.most_common()))[:50]
print("These are the most common words:", common, "\n")
print("These are the least common words:", uncommon, "\n")
In [29]:
newCommon = [('fat', 2980), ('obese', 1831), ('healthy', 1412), \
('unhealthy', 621), ('diet', 557), ('weight', 543),\
('overweight', 539), ('pretty', 490), ('medical', 486), ('little', 484), \
('poor', 448), ('different', 447), ('normal', 410), ('far', 400), ('wrong', 396), ('due', 393), ('hard', 376), ('lower', 365), ('ever', 355), ('big', 353), ('higher', 347), ('instead', 347), ('least', 346), ('often', 336), ('great', 331), ('real', 322), ('whole', 315), ('else', 313), ('www', 310), ('best', 304), ('new', 301), ('maybe', 300), ('full', 300), ('true', 296), ('physical', 296), ('last', 291), ('able', 288), ('especially', 286), ('human', 281), ('old', 275), ('rather', 273), ('already', 273), ('mental', 272), ('thin', 269), ('social', 268), ('large', 265), ('almost', 263), ('certain', 262), ('huge', 261), ('simply', 256), ('http', 256), ('average', 255), ('important', 250), ('general', 243), ('american', 237), ('free', 237), ('public', 230), ('possible', 228), ('worse', 227), ('gt', 227), ('completely', 225), ('fast', 224), ('usually', 223), ('u', 222), ('common', 221), ('small', 219), ('attractive', 218), ('obesity', 216), ('serious', 216), ('yet', 213), ('skinny', 209), ('easy', 206), ('personal', 204), ('happy', 203), ('quite', 198), ('extra', 198)]
completeText = ""
for key, val in newCommon:
completeText += (key + " ") * val
In [30]:
text = completeText
# same macOS font caveat as above
wordcloud = WordCloud(font_path='/Library/Fonts/Verdana.ttf',
                      relative_scaling=0.5,
                      stopwords=STOPWORDS
                      ).generate(text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
In [14]:
# anecdotes containing some of the most common words
listOfWords = ["obese", "fat", "unhealthy", "skinny"]
count = 0
for sentence in bodyList:
    if all(word in sentence for word in listOfWords):
        print(sentence + "\n")
        count += 1
        if count > 5:
            break
In [15]:
# five highest- and five lowest-scoring comments (same caveat: one body per distinct score)
scoreToBody = obesitySubreddits.set_index('score')['body'].to_dict()
for score, body in sorted(scoreToBody.items(), reverse=True)[:5]:
    print(score, ":", body, "\n")
print("-" * 117)
for score, body in sorted(scoreToBody.items())[:5]:
    print(score, ":", body, "\n")
In [72]:
# print first five values of dataset
bothSubreddits[["hashedAuthors", "body", "subreddit", "subreddit_id", "score"]].head(5)
Out[72]:
In [73]:
# most common subreddits posted to
bothSubreddits["subreddit"].value_counts()[:10]
Out[73]:
In [75]:
labels = "AskReddit", "fatpeoplehate", "fatlogic", "Fitness", "funnt", "relationships", "WTF", "todayilearned", "loseit", "TumblrInAction"
sizes = [2119, 1801, 1410, 361, 321, 298, 268, 229, 221, 186]
colors = ["aliceblue", "lavenderblush", "honeydew","blanchedalmond", "navajowhite", "lemonchiffon","linen", "azure", "thistle", "beige"]
plt.pie(sizes, labels=labels, colors=colors,
autopct='%1.1f%%', shadow=False, startangle=90, pctdistance=1.13, labeldistance=1.3)
plt.axis('equal')
plt.show()
In [76]:
# most frequent authors
#bothSubreddits["author"].value_counts()[:10] --> commented out, anonymizing data
In [77]:
# map each author to a comment body
authorToBody = bothSubreddits.set_index('author')['body'].to_dict()
In [79]:
# remove punctuation from body
bodyList = []
for val in bothSubreddits["body"]:
    val = re.sub("[^a-zA-Z]+", " ", val)
    bodyList.append(val)
# tokenize each comment with nltk.word_tokenize
tokenList = []
for sentence in bodyList:
    tokenList.append(nltk.word_tokenize(sentence))
# add POS tags to words
taggedList = []
for item in tokenList:
    taggedList.append(pos_tag(item))
# keep only adjectives (JJ, JJR, JJS) and adverbs (RB, RBR, RBS)
relevantList = []
for tagged in taggedList:
    for word, tag in tagged:
        if tag in ("JJ", "JJR", "JJS", "RB", "RBR", "RBS"):
            relevantList.append(word.lower())
# remove stopwords
stops = set(stopwords.words('english'))
finalList = [word for word in relevantList if word not in stops]
fdist = FreqDist(finalList)
common = fdist.most_common()[:151]
uncommon = list(reversed(fdist.most_common()))[:50]
print("These are the most common words:", common, "\n")
print("These are the least common words:", uncommon, "\n")
In [84]:
newCommon = [('obese', 8172), ('fat', 4486), ('healthy', 2316), ('still', 2060), ('good', 1999), ('never', 1558), ('morbidly', 1266), ('better', 1245), ('bad', 1164), ('less', 1121), ('overweight', 1108), ('probably', 1072), ('first', 1027), ('long', 1024), ('high', 1021), ('back', 1011), ('little', 1007), ('pretty', 1006), ('always', 997), ('right', 971), ('unhealthy', 891), ('sure', 886), ('normal', 845), ('enough', 835), ('different', 800), ('ever', 796), ('hard', 791), ('weight', 784), ('big', 745), ('diet', 717), ('least', 669), ('maybe', 657), ('best', 652), ('wrong', 645), ('old', 645), ('thin', 640), ('far', 625), ('however', 621), ('u', 620), ('able', 619), ('last', 615), ('low', 612), ('anorexic', 603), ('real', 600), ('skinny', 597), ('great', 594), ('likely', 588), ('eating', 576), ('else', 571), ('often', 571), ('mental', 561), ('already', 552), ('whole', 540), ('medical', 538), ('instead', 515), ('almost', 515), ('due', 510), ('poor', 507), ('new', 500), ('full', 492), ('true', 484), ('definitely', 474), ('especially', 465), ('rather', 460), ('small', 456), ('physical', 444), ('usually', 437), ('average', 423), ('attractive', 415), ('huge', 413), ('completely', 413), ('away', 411), ('happy', 406), ('large', 400), ('yet', 395), ('http', 386), ('possible', 386), ('important', 385), ('free', 370), ('easy', 369), ('quite', 367), ('sometimes', 364), ('serious', 359), ('simply', 356), ('human', 352), ('lbs', 351), ('literally', 343), ('social', 342), ('lower', 341), ('extremely', 340), ('higher', 339), ('certain', 331), ('worse', 328), ('general', 328), ('seriously', 325), ('gt', 319), ('next', 317), ('short', 312), ('absolutely', 310), ('fine', 309), ('obviously', 306), ('young', 300), ('extra', 300), ('fit', 298), ('fast', 298), ('www', 298), ('similar', 297), ('difficult', 293), ('super', 293), ('personal', 291), ('common', 284), ('american', 281), ('basically', 280), ('longer', 278), ('beautiful', 276), ('stupid', 276), ('single', 275), ('exactly', 272), ('ago', 271), ('later', 266), ('honestly', 266), ('generally', 263), ('easier', 263), ('active', 262), ('entire', 261), ('clearly', 257), ('public', 255), ('actual', 255), ('white', 252), ('black', 249), ('nice', 248), ('early', 241), ('physically', 241), ('certainly', 241), ('simple', 240), ('strong', 239), ('together', 239), ('underweight', 237), ('mostly', 236), ('totally', 231), ('daily', 231), ('alone', 230), ('healthier', 223)]
completeText = ""
for key, val in newCommon:
completeText += (key + " ") * val
In [81]:
text = completeText
# same macOS font caveat as above
wordcloud = WordCloud(font_path='/Library/Fonts/Verdana.ttf',
                      relative_scaling=0.5,
                      stopwords=STOPWORDS
                      ).generate(text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
In [85]:
# anecdotes containing some of the most common words
listOfWords = ["fat", "healthy", "morbidly", "overweight"]
count = 0
for sentence in bodyList:
    if all(word in sentence for word in listOfWords):
        print(sentence + "\n")
        count += 1
        if count > 5:
            break
In [86]:
# five highest- and five lowest-scoring comments (same caveat: one body per distinct score)
scoreToBody = bothSubreddits.set_index('score')['body'].to_dict()
for score, body in sorted(scoreToBody.items(), reverse=True)[:5]:
    print(score, ":", body, "\n")
print("-" * 117)
for score, body in sorted(scoreToBody.items())[:5]:
    print(score, ":", body, "\n")
In [ ]: