In [1]:
import gensim
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import cPickle as pickle
from scipy.spatial.distance import cosine
from nltk.tokenize import PunktSentenceTokenizer

In [2]:
def separatePunct(incomingString):
    '''
    INPUT: string
    OUTPUT: string with a space inserted around each punctuation character
    '''
    outstr = ''
    characters = set(['!','@','#','$',"%","^","&","*",":","\\",
                      "(",")","+","=","?","\'","\"",";","/",
                      "{","}","[","]","<",">","~","`","|"])

    for char in incomingString:
        if char in characters:
            outstr = outstr + ' ' + char + ' '
        else:
            outstr = outstr + char

    return outstr

def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

def text_cleaner(wordList):
    '''
    INPUT: list of words to be tokenized
    OUTPUT: list of tokenized words
    '''

    tokenizedList = []

    for word in wordList:

        # remove these substrings from the word
        word = word.replace('[deleted]','')
        word = word.replace('&gt','')

        # if link, replace with link tag
        if 'http' in word:
            tokenizedList.append('LINK_TAG')
            continue

        # if reference to a subreddit, replace with subreddit tag
        if '/r/' in word:
            tokenizedList.append('SUBREDDIT_TAG')
            continue

        # if reference to a reddit user, replace with user tag
        if '/u/' in word:
            tokenizedList.append('USER_TAG')
            continue

        # if number, replace with number tag
        # m8 is a word; 5'10", 54-59, and 56:48 are numbers
        if hasNumbers(word) and not any(char.isalpha() for char in word):
            tokenizedList.append('NUM_TAG')
            continue

        # separate punctuation and add the pieces to tokenizedList
        newwords = separatePunct(word).split(" ")
        tokenizedList.extend(newwords)

    return tokenizedList

def mytokenizer(comment):
    '''
    INPUT: a reddit comment or tweet (str or unicode)
    OUTPUT: a list of tokens

    Relies on the module-level `tokenizer` (a PunktSentenceTokenizer)
    being instantiated before this is called.
    '''

    sentenceList = tokenizer.tokenize(comment)
    wordList = []
    for sentence in sentenceList:
        wordList.extend(sentence.split(" "))

    return text_cleaner(wordList)

In [2]:
path1 = '../../data/labeledRedditComments2.p'
path2 = '../../data/twitter_cross_val.csv'

In [3]:
df = pd.read_csv(path2)
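
Only the Twitter cross-validation CSV is read here; path1 is defined but never loaded in this section. As a hedged sketch (assuming labeledRedditComments2.p is a pandas DataFrame pickled with cPickle/pickle, and using the hypothetical name df_reddit), the reddit data could be loaded like this:

In [ ]:
# hypothetical: load the pickled reddit DataFrame referenced by path1
# (assumes the .p file was written with pickle.dump of a DataFrame)
with open(path1, 'rb') as f:
    df_reddit = pickle.load(f)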

In [5]:
tokenizer = PunktSentenceTokenizer()
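
As a quick sanity check of the tokenizer pipeline (the example comment below is made up for illustration), the tag substitutions and punctuation splitting can be inspected directly:

In [ ]:
# hypothetical example comment; the URL, subreddit/user mentions, and bare
# number should come back as LINK_TAG, SUBREDDIT_TAG, USER_TAG, NUM_TAG
sample = "Check /r/python and http://example.com, /u/someone posted 42 things!"
print(mytokenizer(sample))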

In [6]:
df['tokenize'] = df.tweet_text.map(mytokenizer)

In [4]:
# for row in xrange(len(df.index)):
#     print df['tweet_text'].loc[row]
# #     print df['tokenize'].loc[row]
#     print ""

In [16]:
def has_hashtag(x):
    '''Return 1 if the text contains a hashtag character, else 0.'''
    if '#' in x:
        return 1
    else:
        return 0

In [17]:
df['hashtag'] = df['tweet_text'].map(has_hashtag)
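
The same flag can also be computed without an explicit Python-level function call per row; a vectorized alternative sketch (the column name hashtag_vec is hypothetical, and this assumes tweet_text has no missing values) is:

In [ ]:
# vectorized alternative: pandas string containment check, cast to 0/1
df['hashtag_vec'] = df['tweet_text'].str.contains('#', regex=False).astype(int)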

In [5]:
# for row in xrange(len(df.index)):
#     if df['hashtag'].loc[row]:
#         print df['tweet_text'].loc[row]
#         print df['tokenize'].loc[row]
#         print ""

In [20]:
df['hashtag'].sum()


Out[20]:
956
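
So 956 tweets contain at least one '#' character. The corresponding fraction of the dataset (output not shown) follows directly from the column mean:

In [ ]:
# fraction of tweets flagged as containing a hashtag
df['hashtag'].mean()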

In [6]:
df.head()


Out[6]:
   Unnamed: 0                                         tweet_text  label
0       12168  I love how Jayden acts like we have no right t...      1
1       13497  I been POPPIN since kindergarten nigga you a l...      1
2       10058  Like a real life mean person could make you cr...      0
3        4559  Which fags are getting down and tributing Xmas...      1
4        5514  From last night...my thoughts on J-Up and what...      0
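
The head shows the two columns used downstream (tweet_text and the binary label). A quick look at the class balance (output omitted here) could be taken with:

In [ ]:
# distribution of the binary label column in the cross-validation set
df['label'].value_counts()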

In [ ]: