In [1]:
import gensim
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import cPickle as pickle
from scipy.spatial.distance import cosine
from nltk.tokenize import PunktSentenceTokenizer
In [2]:
def separatePunct(incomingString):
    '''Pad punctuation with spaces so each mark splits into its own token.'''
    outstr = ''
    characters = set(['!', '@', '#', '$', '%', '^', '&', '*', ':', '\\',
                      '(', ')', '+', '=', '?', '\'', '"', ';', '/',
                      '{', '}', '[', ']', '<', '>', '~', '`', '|'])
    for char in incomingString:
        if char in characters:
            outstr = outstr + ' ' + char + ' '
        else:
            outstr = outstr + char
    return outstr

def hasNumbers(inputString):
    '''Return True if the string contains at least one digit.'''
    return any(char.isdigit() for char in inputString)
def text_cleaner(wordList):
    '''
    INPUT: list of raw words
    OUTPUT: list of cleaned tokens, with links, subreddit/user references,
            and bare numbers replaced by placeholder tags
    '''
    tokenizedList = []
    for word in wordList:
        # remove these substrings from the word
        word = word.replace('[deleted]', '')
        word = word.replace('>', '')
        # if link, replace with link tag
        if 'http' in word:
            tokenizedList.append('LINK_TAG')
            continue
        # if reference to a subreddit, replace with subreddit tag
        if '/r/' in word:
            tokenizedList.append('SUBREDDIT_TAG')
            continue
        # if reference to a reddit user, replace with user tag
        if '/u/' in word:
            tokenizedList.append('USER_TAG')
            continue
        # if number, replace with number tag
        # "m8" is a word; 5'10", 54-59, and 56:48 are numbers
        if hasNumbers(word) and not any(char.isalpha() for char in word):
            tokenizedList.append('NUM_TAG')
            continue
        # separate punctuation and add the pieces to the token list;
        # split() without an argument drops the empty strings that the
        # padded punctuation would otherwise produce
        newwords = separatePunct(word).split()
        tokenizedList.extend(newwords)
    return tokenizedList
def mytokenizer(comment):
    '''
    INPUT: a reddit comment as a str or unicode
    OUTPUT: a list of cleaned tokens
    Relies on the module-level Punkt `tokenizer` defined in a later cell.
    '''
    sentenceList = tokenizer.tokenize(comment)
    wordList = []
    for sentence in sentenceList:
        wordList.extend(sentence.split())
    return text_cleaner(wordList)
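A quick, illustrative check of the cleaning helpers (the sample words below are invented, not from the dataset): links, subreddit references, and bare numbers should be replaced by tags, with punctuation split off.
In [ ]:
# Illustrative example only; the sample words are made up.
sample = ['check', 'http://example.com', 'on', '/r/python', '5\'10"', 'wow!']
print(text_cleaner(sample))
# expected: ['check', 'LINK_TAG', 'on', 'SUBREDDIT_TAG', 'NUM_TAG', 'wow', '!']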
In [2]:
path1 = '../../data/labeledRedditComments2.p'
path2 = '../../data/twitter_cross_val.csv'
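path1 points at a pickled set of labeled Reddit comments that is never loaded in this section. As a minimal sketch (assuming the pickle holds a pandas DataFrame, which this section does not confirm), it could be read with the pickle module imported above:
In [ ]:
# Sketch only: assumes labeledRedditComments2.p is a pickled DataFrame.
with open(path1, 'rb') as f:
    reddit_df = pickle.load(f)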
In [3]:
df = pd.read_csv(path2)
In [5]:
# Punkt sentence splitter, instantiated with default parameters (no training text)
tokenizer = PunktSentenceTokenizer()
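A quick sanity check of the full tokenizer on a made-up comment (illustrative only): Punkt splits the text into sentences, then each word is cleaned and tagged.
In [ ]:
# Illustrative only; the comment text is invented.
print(mytokenizer('Nice post! See http://example.com or ask /u/someone.'))
# expected (roughly): ['Nice', 'post', '!', 'See', 'LINK_TAG', 'or', 'ask', 'USER_TAG']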
In [6]:
df['tokenize'] = df.tweet_text.map(mytokenizer)
In [4]:
# for row in xrange(len(df.index)):
#     print df['tweet_text'].loc[row]
#     # print df['tokenize'].loc[row]
#     print ""
In [16]:
def has_hashtag(text):
    '''Return 1 if the text contains a '#' character, else 0.'''
    return int('#' in text)
In [17]:
df['hashtag'] = df['tweet_text'].map(has_hashtag)
In [5]:
# for row in xrange(len(df.index)):
#     if df['hashtag'].loc[row]:
#         print df['tweet_text'].loc[row]
#         print df['tokenize'].loc[row]
#         print ""
In [20]:
df['hashtag'].sum()
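Beyond the raw count, the share of tweets flagged with a hashtag is a quick follow-up check (a sketch using the same column):
In [ ]:
# Fraction of tweets containing a '#' character.
print(df['hashtag'].mean())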
In [6]:
df.head()