In [1]:
import pandas as pd
import os
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import re
import enchant
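In [ ]:
# One-time setup (assumption: a fresh environment without NLTK data).
# word_tokenize and stopwords.words need these corpora downloaded once;
# 'punkt_tab' is only required on newer NLTK releases. Safe to re-run.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')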
In [2]:
def readData(filename):
    """Read a CSV from the current working directory into a DataFrame."""
    path = os.path.join(os.getcwd(), filename)
    return pd.read_csv(path)
In [4]:
def tokenize_and_stopwords(data_sample):
    """Tokenize each tweet, spell-correct tokens, and drop English stopwords."""
    # Use a set for O(1) stopword lookups
    stop = set(stopwords.words('english'))
    # Keep only letters and spaces (regex=True is required in recent pandas)
    data_sample = data_sample.str.replace("[^a-zA-Z ]", " ", regex=True)
    # Tokenize each tweet, spell-check the remaining tokens, and filter stopwords
    # (lowercase before comparing, since the stopword list is lowercase)
    return [[spellCheck(i) for i in word_tokenize(sentence) if i.lower() not in stop]
            for sentence in data_sample]
In [5]:
def cleanhtml(tweet):
    """Strip HTML tags."""
    return re.sub(re.compile('<.*?>'), '', tweet)

def cleanUrl(tweet):
    """Strip URLs."""
    return re.sub(r"http\S+", "", tweet)

def removeMention(tweet):
    """Strip retweet markers and '@' symbols (the handle text itself is kept)."""
    tweet = tweet.replace("rt@", "").rstrip()
    return tweet.replace("@", "").rstrip()
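In [ ]:
# Illustrative sanity check of the cleaning helpers on a made-up tweet
# (the string below is hypothetical, not drawn from Homework2_data.csv).
# It should print the text with the HTML tags, the URL, and the "rt@"
# marker stripped out.
sample = "rt@user check this <b>deal</b> http://example.com now"
print(removeMention(cleanUrl(cleanhtml(sample))))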
In [6]:
# Build the dictionary once; constructing enchant.Dict() per word is expensive
d = enchant.Dict("en_US")

def spellCheck(word):
    """Replace a misspelled word with enchant's top suggestion (or drop it)."""
    # Guard against empty strings: d.check("") raises an error
    if word and not d.check(word):
        suggestions = d.suggest(word)
        word = suggestions[0] if suggestions else ""
    return word
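In [ ]:
# Illustrative check of spellCheck and the full tokenizing pipeline on a toy
# Series (hypothetical input, assuming pyenchant's en_US dictionary is
# installed). "smalll" should come back as a suggestion such as "small",
# and the stopwords "this", "is", "a" should be filtered out.
print(spellCheck("smalll"))
print(tokenize_and_stopwords(pd.Series(["this is a smalll test!"])))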
In [ ]:
filename = "Homework2_data.csv"
df = readData(filename)
# Clean the raw tweet text: strip HTML tags, URLs, and mention markers
df['text'] = df['Tweet_text'].apply(cleanhtml).apply(cleanUrl).apply(removeMention)
# Tokenize, spell-correct, and remove stopwords
df['text'] = tokenize_and_stopwords(df['text'])
In [ ]:
df