In [246]:
from functional import seq
from bs4 import BeautifulSoup
from collections import namedtuple
import re
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import WordNetLemmatizer
import numpy as np

wnl = WordNetLemmatizer()

def sentiment_symbol(word, positive_words, negative_words):
    # Lemmatize so inflected forms (e.g. plurals) match the lexicon entries.
    word = wnl.lemmatize(word)
    if word in positive_words:
        return '+'
    if word in negative_words:
        return '-'
    return 'n'

def print_top_words(model, feature_names, n_top_words, pos_words, neg_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort()[:-n_top_words - 1:-1] gives the indices of the
        # n_top_words highest-weighted terms, in descending order.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join(
            "{0}:{1}".format(feature_names[i],
                             sentiment_symbol(feature_names[i], pos_words, neg_words))
            for i in top_indices))
    print()

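A quick sanity check of sentiment_symbol with toy word sets (the real lexicons are loaded in the next cell); this assumes the WordNet corpus has been fetched once via nltk.download('wordnet').

In [ ]:
# Toy positive/negative sets, not the real lexicons.
# Expected: ['+', '-', 'n'] -- 'wins' lemmatizes to 'win', 'losses' to 'loss'.
[sentiment_symbol(w, {'win'}, {'loss'}) for w in ['wins', 'losses', 'table']]
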
In [247]:
# names.txt: first token of each line is a name; lowercased for use as extra
# stop words below. The Hu & Liu opinion lexicon files open with a ~35-line
# header, hence drop(35); negative-words.txt contains some non-UTF-8 bytes,
# hence the 'ignore' decode.
names = seq.open('names.txt').map(lambda x: x.split()[0].lower()).to_set()
positive_words = seq.open('positive-words.txt', mode='rb')\
    .drop(35)\
    .map(lambda x: x.decode("utf-8").strip())\
    .to_set()
negative_words = seq.open('negative-words.txt', mode='rb')\
    .drop(35)\
    .map(lambda x: x.decode("utf-8", 'ignore').strip())\
    .to_set()
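
A spot check that the lexicons loaded cleanly; 'love' and 'hate' should be entries in the Hu & Liu lists, though the exact set sizes depend on the files used.

In [ ]:
'love' in positive_words, 'hate' in negative_words, len(positive_words), len(negative_words)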

In [248]:
Session = namedtuple('Session', ['username',
                                 'link',
                                 'is_aggression',
                                 'is_bullying',
                                 'creation_time',
                                 'comments'])

Comment = namedtuple('Comment', ['user', 'text', 'time'])

def parse_row(row):
    # Map CSV values onto their header names.
    raw_d = dict(zip(header, row))
    comments = []
    # Comments live in columns named column1..column660; unused slots hold 'empty'.
    for i in range(1, 661):
        name = 'column{0}'.format(i)
        if raw_d[name] != 'empty':
            comment = BeautifulSoup(raw_d[name], 'html.parser')
            if not comment.font:
                continue
            c_user = comment.font.text
            c_match = re.search(r'.+::(.+) \(created_at:(.+)\)', comment.text)
            if not c_match:
                continue
            c_text, c_time = c_match.groups()
            comments.append(Comment(c_user, c_text, c_time))
    is_aggression = raw_d['question1'] == 'aggression'
    is_bullying = raw_d['question2'] == 'bullying'
    return Session(
        raw_d['userName'],
        raw_d['videolink'],
        is_aggression,
        is_bullying,
        raw_d['postCreatedTime'],
        seq(comments))
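
To see what parse_row pulls out of each comment cell, here is the regex applied to a synthetic string; the '::'/'(created_at: ...)' layout is inferred from the pattern itself, not taken from the raw CSV.

In [ ]:
# Synthetic example only; real rows embed this text inside HTML.
m = re.search(r'.+::(.+) \(created_at:(.+)\)',
              'someuser::nice vine! (created_at:2013-05-01T12:00:00)')
m.groups()  # ('nice vine!', '2013-05-01T12:00:00')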

In [249]:
csv_data = seq.csv('vine_meta_data.csv')
header = csv_data.first()
data = csv_data.drop(1).map(parse_row).cache()

In [250]:
global_usernames = data.map(lambda s: s.username).to_set()

In [251]:
def clean_comment(usernames, comment):
    text = comment.text
    for u in usernames:
        text = re.sub(re.escape(u), '', text, flags=re.IGNORECASE)
    return Comment(comment.user, text, comment.time)

def clean_session(session):
    usernames = session.comments.map(lambda c: c.user).to_set()
    comments = session.comments.map(lambda c: clean_comment(usernames, c))
    return Session(
        session.username,
        session.link,
        session.is_aggression,
        session.is_bullying,
        session.creation_time,
        comments)

clean_data = data.map(clean_session)
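
The effect of the scrubbing on a made-up comment: every participant's username is removed case-insensitively so that handles do not surface as topic terms.

In [ ]:
# Made-up comment; note the double space left where the mention was removed.
clean_comment({'coolkid99'}, Comment('other_user', 'lol CoolKid99 nice one', 't0'))
# -> Comment(user='other_user', text='lol  nice one', time='t0')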

In [252]:
bully_documents = clean_data\
    .filter(lambda x: x.is_aggression or x.is_bullying)\
    .map(lambda s: s.comments.map(lambda c: c.text).distinct().make_string(' '))
non_bully_documents = clean_data\
    .filter_not(lambda x: x.is_aggression or x.is_bullying)\
    .map(lambda s: s.comments.map(lambda c: c.text).distinct().make_string(' '))
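
Each session becomes one document: its de-duplicated comment texts joined into a single string. A minimal illustration (element order after distinct() is not guaranteed):

In [ ]:
seq(['nice', 'nice', 'wow']).distinct().make_string(' ')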

In [259]:
stop_words = ENGLISH_STOP_WORDS | names
bully_count_vectorizer = CountVectorizer(max_df=.65, min_df=2, stop_words=stop_words, binary=True)
bully_counts = bully_count_vectorizer.fit_transform(bully_documents)

non_bully_count_vectorizer = CountVectorizer(max_df=.65, min_df=2, stop_words=stop_words, binary=True)
non_bully_counts = non_bully_count_vectorizer.fit_transform(non_bully_documents)
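
How the vectorizer settings interact, shown on a four-document toy corpus: max_df=.65 drops terms appearing in more than 65% of documents, min_df=2 drops terms appearing in fewer than two, and binary=True records presence rather than raw counts.

In [ ]:
# 'common' is in all 4 docs (df > .65, dropped), 'solo' in 1 (df < 2, dropped),
# 'pair' is in 2 and survives; binary=True caps its count in the first doc at 1.
toy = CountVectorizer(max_df=.65, min_df=2, binary=True)
mat = toy.fit_transform(['common pair pair', 'common pair', 'common solo', 'common'])
toy.get_feature_names(), mat.toarray()  # (['pair'], array([[1], [1], [0], [0]]))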

In [260]:
def topic_model(counts, count_vectorizer, n_topics):
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_method='batch')
    lda.fit(counts)

    feature_names = count_vectorizer.get_feature_names()
    print_top_words(lda, feature_names, 15, positive_words, negative_words)
    # score() returns the approximate log likelihood of the data; higher is better.
    return lda.score(counts)

max_i = None
max_p = None
for i in range(2, 5):
    bp = topic_model(bully_counts, bully_count_vectorizer, i)
    nbp = topic_model(non_bully_counts, non_bully_count_vectorizer, i)
    print(bp + nbp)
    # Keep the topic count with the highest combined score.
    if max_p is None or bp + nbp > max_p:
        max_i = i
        max_p = bp + nbp
print(max_i, max_p)


Topic #0:
people:n fucking:- know:n did:n bitch:- right:+ look:n say:n got:n stop:n hate:- stupid:- make:n vine:n oh:n
Topic #1:
bitch:- know:n fucking:- look:n ur:n vine:n got:n damn:- really:n people:n yo:n say:n nigga:n funny:- vines:n

Topic #0:
lol:n just:n know:n fuck:- oh:n shit:- look:n omg:n vine:n did:n good:+ fucking:- really:n got:n im:n
Topic #1:
lol:n shit:- just:n lmao:n ass:n damn:- fuck:- omg:n did:n nigga:n know:n vine:n ya:n hell:- got:n

-850720.748504
Topic #0:
good:+ oh:n bitch:- wtf:n lmao:n want:n did:n know:n omg:n say:n tho:n look:n said:n got:n make:n
Topic #1:
bitch:- fucking:- know:n look:n vine:n got:n people:n ur:n really:n say:n think:n stop:n stupid:- vines:n funny:-
Topic #2:
know:n people:n right:+ bitch:- fucking:- good:+ ur:n look:n vine:n make:n damn:- say:n suck:- got:n time:n

Topic #0:
just:n lol:n oh:n omg:n fuck:- shit:- vine:n know:n im:n good:+ yes:n look:n fucking:- wtf:n did:n
Topic #1:
lol:n just:n shit:- ass:n fuck:- got:n look:n know:n lmao:n did:n nigga:n tho:n hell:- funny:- vine:n
Topic #2:
omg:n lol:n just:n good:+ really:n vines:n know:n did:n look:n haha:n yes:n vine:n best:+ follow:n app:n

-856018.374873
Topic #0:
look:n know:n bitch:- yo:n got:n lmao:n damn:- really:n fucking:- tho:n im:n ur:n nigga:n follow:n right:+
Topic #1:
bitch:- fucking:- know:n vine:n ur:n people:n look:n really:n stop:n got:n damn:- shut:n make:n say:n nigga:n
Topic #2:
fucking:- did:n know:n vine:n people:n look:n good:+ said:n right:+ bitch:- think:n say:n better:+ oh:n really:n
Topic #3:
people:n know:n got:n right:+ fucking:- bitch:- think:n ll:n look:n did:n say:n tho:n make:n want:n damn:-

Topic #0:
lol:n just:n shit:- look:n fuck:- know:n ass:n omg:n did:n vine:n damn:- tho:n lmao:n got:n funny:-
Topic #1:
vine:n lol:n ve:n ya:n yes:n hahaha:n watch:n takip:n looks:n follow:n did:n com:n great:+ job:n ederim:n
Topic #2:
lol:n just:n omg:n oh:n shit:- im:n fuck:- lmao:n know:n vine:n did:n good:+ edits:n fucking:- look:n
Topic #3:
just:n know:n lol:n think:n shit:- fuck:- ass:n right:+ look:n did:n yes:n really:n say:n people:n fucking:-

-859372.304105
2 -850720.748504
