In [1]:
import gensim
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import cPickle as pickle
from scipy.spatial.distance import cosine
from nltk.tokenize import PunktSentenceTokenizer

In [2]:
def separatePunct(incomingString):
    '''
    INPUT: string
    OUTPUT: string with a space inserted around each punctuation character
    '''
    outstr = ''
    characters = set(['!','@','#','$',"%","^","&","*",":","\\",
                      "(",")","+","=","?","\'","\"",";","/",
                      "{","}","[","]","<",">","~","`","|"])

    for char in incomingString:
        if char in characters:
            outstr = outstr + ' ' + char + ' '
        else:
            outstr = outstr + char

    return outstr

def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

def text_cleaner(wordList):
    '''
    INPUT: list of words to be tokenized
    OUTPUT: list of tokenized words
    '''

    tokenizedList = []

    for word in wordList:

        # remove these substrings from the word
        word = word.replace('[deleted]','')
        word = word.replace('&gt','')

        # if link, replace with link tag
        if 'http' in word:
            tokenizedList.append('LINK_TAG')
            continue

        # if reference to a subreddit, replace with subreddit tag
        if '/r/' in word:
            tokenizedList.append('SUBREDDIT_TAG')
            continue

        # if reference to a reddit user, replace with user tag
        if '/u/' in word:
            tokenizedList.append('USER_TAG')
            continue

        # if number, replace with number tag
        # m8 is a word; 5'10", 54-59, and 56:48 are numbers
        if hasNumbers(word) and not any(char.isalpha() for char in word):
            tokenizedList.append('NUM_TAG')
            continue

        # separate punctuation and add the pieces to tokenizedList
        newwords = separatePunct(word).split(" ")
        tokenizedList.extend(newwords)

    return tokenizedList

def mytokenizer(comment):
    '''
    INPUT: a reddit comment or tweet (str or unicode)
    OUTPUT: a list of tokens

    Relies on the module-level `tokenizer` (a PunktSentenceTokenizer)
    being instantiated before this is called.
    '''

    sentenceList = tokenizer.tokenize(comment)
    wordList = []
    for sentence in sentenceList:
        wordList.extend(sentence.split(" "))

    return text_cleaner(wordList)

In [2]:
path1 = '../../data/labeledRedditComments2.p'
path2 = '../../data/twitter_cross_val.csv'

In [3]:
df = pd.read_csv(path2)
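
Only the Twitter cross-validation CSV is read here; path1 is defined but never loaded in this section. As a hedged sketch (assuming labeledRedditComments2.p is a pandas DataFrame pickled with cPickle/pickle, and using the hypothetical name df_reddit), the reddit data could be loaded like this:

In [ ]:
# hypothetical: load the pickled reddit DataFrame referenced by path1
# (assumes the .p file was written with pickle.dump of a DataFrame)
with open(path1, 'rb') as f:
    df_reddit = pickle.load(f)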

In [5]:
tokenizer = PunktSentenceTokenizer()
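
As a quick sanity check of the tokenizer pipeline (the example comment below is made up for illustration), the tag substitutions and punctuation splitting can be inspected directly:

In [ ]:
# hypothetical example comment; the URL, subreddit/user mentions, and bare
# number should come back as LINK_TAG, SUBREDDIT_TAG, USER_TAG, NUM_TAG
sample = "Check /r/python and http://example.com, /u/someone posted 42 things!"
print(mytokenizer(sample))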

In [6]:
df['tokenize'] = df.tweet_text.map(mytokenizer)

In [4]:
# for row in xrange(len(df.index)):
#     print df['tweet_text'].loc[row]
# #     print df['tokenize'].loc[row]
#     print ""

In [16]:
def has_hashtag(x):
    '''Return 1 if the text contains a hashtag character, else 0.'''
    if '#' in x:
        return 1
    else:
        return 0

In [17]:
df['hashtag'] = df['tweet_text'].map(has_hashtag)
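
The same flag can also be computed without an explicit Python-level function call per row; a vectorized alternative sketch (the column name hashtag_vec is hypothetical, and this assumes tweet_text has no missing values) is:

In [ ]:
# vectorized alternative: pandas string containment check, cast to 0/1
df['hashtag_vec'] = df['tweet_text'].str.contains('#', regex=False).astype(int)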

In [5]:
# for row in xrange(len(df.index)):
#     if df['hashtag'].loc[row]:
#         print df['tweet_text'].loc[row]
#         print df['tokenize'].loc[row]
#         print ""

In [20]:
df['hashtag'].sum()


Out[20]:
956
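
So 956 tweets contain at least one '#' character. The corresponding fraction of the dataset (output not shown) follows directly from the column mean:

In [ ]:
# fraction of tweets flagged as containing a hashtag
df['hashtag'].mean()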

In [6]:
df.head()


Out[6]:
   Unnamed: 0                                         tweet_text  label
0       12168  I love how Jayden acts like we have no right t...      1
1       13497  I been POPPIN since kindergarten nigga you a l...      1
2       10058  Like a real life mean person could make you cr...      0
3        4559  Which fags are getting down and tributing Xmas...      1
4        5514  From last night...my thoughts on J-Up and what...      0
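
The head shows the two columns used downstream (tweet_text and the binary label). A quick look at the class balance (output omitted here) could be taken with:

In [ ]:
# distribution of the binary label column in the cross-validation set
df['label'].value_counts()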

In [ ]: