In [246]:
from functional import seq
from bs4 import BeautifulSoup
from collections import namedtuple
import re
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import WordNetLemmatizer
import numpy as np
wnl = WordNetLemmatizer()


def sentiment_symbol(word, positive_words, negative_words):
    """Classify a word as positive ('+'), negative ('-'), or neutral ('n').

    The word is lemmatized first (WordNet default noun POS) so that simple
    inflected forms can still match entries in the sentiment lexicons.
    """
    lemma = wnl.lemmatize(word)
    if lemma in positive_words:
        return '+'
    elif lemma in negative_words:
        return '-'
    return 'n'
def print_top_words(model, feature_names, n_top_words, pos_words, neg_words):
    """Print the top-weighted terms of each fitted LDA topic.

    Each term is printed as "term:S" where S is its sentiment symbol
    ('+', '-', or 'n') from sentiment_symbol().
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending, so take the last n_top_words in reverse
        # to get the highest-weighted terms first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        tagged_terms = []
        for idx in top_indices:
            term = feature_names[idx]
            symbol = sentiment_symbol(term, pos_words, neg_words)
            tagged_terms.append(term + ":{0}".format(symbol))
        print(" ".join(tagged_terms))
        print()
In [247]:
def load_lexicon(path, errors='strict'):
    """Load a Hu & Liu opinion-lexicon file into a set of words.

    The first 35 lines of each lexicon file are a license/comment header
    and are skipped.  `errors` is forwarded to bytes.decode(): the
    negative-word list contains at least one non-UTF-8 byte, so its
    caller passes errors='ignore'.

    (Refactor: the positive and negative lists were previously loaded by
    two near-identical copy-pasted pipelines that also decoded with
    inconsistent error handling.)
    """
    return seq.open(path, mode='rb')\
        .drop(35)\
        .map(lambda line: line.decode("utf-8", errors).strip())\
        .to_set()


# First whitespace-separated token of each line, lowercased — used later
# as extra stop words so personal names don't become topic terms.
names = seq.open('names.txt').map(lambda x: x.split()[0].lower()).to_set()
positive_words = load_lexicon('positive-words.txt')
negative_words = load_lexicon('negative-words.txt', errors='ignore')
In [248]:
# One labelled Vine media session: the poster, the video link, the two
# annotation labels, and the parsed comment thread.
Session = namedtuple('Session', ['username',
                                 'link',
                                 'is_aggression',
                                 'is_bullying',
                                 'creation_time',
                                 'comments'])
Comment = namedtuple('Comment', ['user', 'text', 'time'])


def parse_row(row):
    """Parse one CSV data row (aligned with the module-level `header`)
    into a Session.

    Comment cells live in columns named 'column1' .. 'column660'; a cell
    is either the literal string 'empty' or an HTML fragment whose <font>
    tag holds the commenting user and whose text matches
    '...::<comment text> (created_at:<timestamp>)'.  Cells that fail
    either check are silently skipped, matching the original behavior.
    """
    # header and row come from the same CSV, so zip aligns them 1:1.
    # (Previously built with an index loop whose `col` variable was unused.)
    raw_d = dict(zip(header, row))
    comments = []
    # The export format allots up to 660 comment columns per session.
    for i in range(1, 661):
        cell = raw_d['column{0}'.format(i)]
        if cell == 'empty':
            continue
        parsed = BeautifulSoup(cell, 'html.parser')
        if not parsed.font:
            # No <font> tag means no identifiable commenter; skip.
            continue
        c_user = parsed.font.text
        c_match = re.search('.+::(.+) \(created_at:(.+)\)', parsed.text)
        if not c_match:
            continue
        c_text, c_time = c_match.groups()
        comments.append(Comment(c_user, c_text, c_time))
    return Session(
        raw_d['userName'],
        raw_d['videolink'],
        raw_d['question1'] == 'aggression',
        raw_d['question2'] == 'bullying',
        raw_d['postCreatedTime'],
        seq(comments))
In [249]:
# Load the Vine session metadata.  The first CSV row is the column header
# (consumed by parse_row via the module-level `header` name); the rest are
# data rows.  cache() materializes the parsed sessions so the downstream
# passes don't re-read and re-parse the CSV.
csv_data = seq.csv('vine_meta_data.csv')
header = csv_data.first()
data = csv_data.drop(1).map(parse_row).cache()
In [250]:
# Usernames of all session owners (video posters) across the dataset.
# NOTE(review): computed but not used by the visible cells below — the
# cleaning step uses per-session commenter names instead; confirm intent.
global_usernames = data.map(lambda s: s.username).to_set()
In [251]:
def clean_comment(usernames, comment):
    """Return a copy of `comment` with every name in `usernames` removed
    from its text (case-insensitive), so usernames don't leak into the
    topic vocabulary."""
    cleaned = comment.text
    for name in usernames:
        cleaned = re.sub(re.escape(name), '', cleaned, flags=re.IGNORECASE)
    return Comment(comment.user, cleaned, comment.time)
def clean_session(session):
    """Return `session` with all participant usernames stripped out of
    every comment's text (de-identification before topic modelling)."""
    participants = session.comments.map(lambda c: c.user).to_set()
    scrubbed = session.comments.map(lambda c: clean_comment(participants, c))
    # _replace keeps every other Session field untouched.
    return session._replace(comments=scrubbed)


clean_data = data.map(clean_session)
In [252]:
def session_document(session):
    """Join a session's distinct comment texts into a single document."""
    return session.comments.map(lambda c: c.text).distinct().make_string(' ')


def is_flagged(session):
    """True when annotators marked the session as aggression or bullying."""
    return session.is_aggression or session.is_bullying


bully_documents = clean_data.filter(is_flagged).map(session_document)
non_bully_documents = clean_data.filter_not(is_flagged).map(session_document)
In [259]:
# Drop common English words plus first names so individual users don't
# surface as topic terms.
stop_words = ENGLISH_STOP_WORDS | names


def make_count_vectorizer():
    """Binary bag-of-words vectorizer shared by both document groups:
    terms in >65% of documents or <2 documents are discarded."""
    return CountVectorizer(max_df=.65, min_df=2, stop_words=stop_words, binary=True)


bully_count_vectorizer = make_count_vectorizer()
bully_counts = bully_count_vectorizer.fit_transform(bully_documents)
non_bully_count_vectorizer = make_count_vectorizer()
non_bully_counts = non_bully_count_vectorizer.fit_transform(non_bully_documents)
In [260]:
def topic_model(counts, count_vectorizer, n_topics):
    """Fit a batch LDA model with `n_topics` topics, print each topic's
    top 15 sentiment-tagged terms, and return the model's approximate
    log-likelihood score on `counts`.

    NOTE(review): `n_topics` was renamed `n_components` in scikit-learn
    0.19 and removed in 0.21 — update the keyword when upgrading sklearn.
    No random_state is set, so scores vary between runs; TODO confirm
    whether reproducibility matters here before pinning one.
    """
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_method='batch')
    lda.fit(counts)
    feature_names = count_vectorizer.get_feature_names()
    print_top_words(lda, feature_names, 15, positive_words, negative_words)
    return lda.score(counts)


# Search over topic counts, keeping the count with the best combined
# (bully + non-bully) log-likelihood.  (The original recomputed the sum
# four times and double-assigned the maximum on the first iteration.)
max_i = None
max_p = None
for i in range(2, 5):
    total = topic_model(bully_counts, bully_count_vectorizer, i) \
        + topic_model(non_bully_counts, non_bully_count_vectorizer, i)
    print(total)
    if max_p is None or total > max_p:
        max_i = i
        max_p = total
print(max_i, max_p)
In [258]:
-761126.463018
Out[258]:
In [188]:
# Scratch cell: quick check that str.strip() removes the trailing newline.
# Safe to delete from the final notebook.
"test\n".strip()
Out[188]:
In [232]:
Out[232]:
In [ ]:
In [ ]: