In [2]:
import pandas as pd
#from gensim.models.wrappers import FastText
import gensim
import csv
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('punkt')
nltk.download("stopwords")
from nltk.corpus import stopwords
import re
import string
import time
In [3]:
pd.options.mode.chained_assignment = None  # suppress SettingWithCopyWarning on chained indexing
In [4]:
URL = 'train_data.csv'
def load_data(url=URL):
    return pd.read_csv(url)
In [5]:
df = load_data()
In [6]:
df.tail()
Out[6]:
In [7]:
# Collect stopwords from several sources and merge them, removing duplicates
import requests

# Remove duplicates while preserving the original order
def del_dup(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]

# NLTK stopwords (the corpus fileid is lower-case 'english'),
# the SMART stop list, and a few Twitter-specific tokens
sw = list(stopwords.words("english"))
response = requests.get('http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a11-smart-stop-list/english.stop')
sw.extend(response.text.split())
sw.extend(["rt", "retweet"])
# Strip non-word characters so the entries match the cleaned tweet tokens
sw = [re.sub(r'[\W_]+', '', x) for x in sw]
sw = del_dup(sw)
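A quick sanity check on the merged list (counts vary with the NLTK version and the downloaded SMART list):
print(len(sw))    # total number of unique stopwords after merging
print(sw[:10])    # a few sample entries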
In [94]:
def cleantweet1(s):
    '''
    :s : string; a tweet
    :return : string; lower-cased tweet with URLs, @mentions, punctuation and stopwords removed
    '''
    # Read match pattern from function attribute (assigned later)
    s = re.sub(r"http\S+", '', s)
    # Strip leftover shortened-link fragments; the dot is escaped so that
    # words like "taco" are not accidentally matched
    s = re.sub(r"t\.co\s+", '', s)
    s = re.sub(cleantweet1.pattern, '', s)
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    # actually remove the punctuation
    s = s.translate(remove_punctuation_map)
    s = [word.lower() for word in s.split() if word.lower() not in sw]
    s = " ".join(s)
    return s
def cleantweet2(s):
    '''
    :s : string; a tweet
    :return : list; lower-cased hashtag words (without the '#')
    '''
    s = re.findall(r'#(\w+)', s)
    s = " ".join([word.lower() for word in s])
    s = re.sub(r'[^a-zA-Z\s]', '', s)
    s = s.split()
    return s
def cleantweet3(s):
    '''
    :s : string; a tweet
    :return : list; lower-cased words with URLs, @mentions, hashtags, punctuation and stopwords removed
    '''
    # Read match pattern from function attribute (assigned later)
    s = re.sub(cleantweet3.pattern, '', s)
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    # actually remove the punctuation
    s = s.translate(remove_punctuation_map)
    s = [word.lower() for word in s.split() if word.lower() not in sw]
    return s
def cleantweet4(s):
    '''
    :s : string; a tweet
    :return : list; stemmed, lower-cased words with URLs, @mentions and punctuation removed
    '''
    s = re.sub(cleantweet4.pattern, '', s)
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    # actually remove the punctuation
    s = s.translate(remove_punctuation_map)
    #s = " ".join([word.lower() for word in s.split() if word.lower() not in sw])
    #s = [word.lower() for word in s.split() if word.lower() not in sw]
    # tokenize tweet into sentences
    sents = sent_tokenize(s)
    # tokenize each sentence into a list of words
    words = [word_tokenize(sent) for sent in sents]
    # flatten the per-sentence token lists into one list of words
    words = [e for sent in words for e in sent]
    return [cleantweet4.stemmer.stem(e.lower()) for e in words]
In [95]:
cleantweet1.pattern = re.compile(r'(@\w+)|[^a-zA-Z\s]', flags=re.IGNORECASE)
cleantweet3.pattern = re.compile(r'(@\w+)|(#\w+)|[^a-zA-Z\s]|(\w+:\/\/\S+)', flags=re.IGNORECASE)
# Timing comparison: mapping over the Series is consistently faster than .apply
#start = time.perf_counter()
#df["text_clean1"] = df.iloc[0:].text.apply(cleantweet1)
df.loc[:,'text_clean1'] = df.loc[:,'text'].map(cleantweet1)
#end = time.perf_counter()
#print(end - start)
df.loc[:,'hashtags'] = df.loc[:,'text'].map(cleantweet2)
# PorterStemmer strips morphological and inflectional endings from English words.
# cleantweet4 needs a stemmer and a match pattern set before it can be called;
# reusing cleantweet3's pattern (which also strips URLs and @mentions) is an assumption
cleantweet4.stemmer = PorterStemmer()
cleantweet4.pattern = cleantweet3.pattern
df.loc[:,'wordlist'] = df.loc[:,'text'].map(cleantweet3)
df.loc[:,'no_names_hashtags'] = df.loc[:,'wordlist'].map(lambda slocal: ' '.join(slocal))
#df.loc[:,'wordstring'] = df.loc[:,'wordlist'].map(lambda slocal: ' '.join(slocal))
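To illustrate what each cleaner produces, here is a quick check on a made-up tweet (the sample text and the expected outputs are illustrative, not taken from the dataset):
sample = "RT @user: Flood warning in #California! https://t.co/abc123"
print(cleantweet1(sample))   # roughly: 'flood warning california'
print(cleantweet2(sample))   # ['california']
print(cleantweet3(sample))   # ['flood', 'warning']
print(cleantweet4(sample))   # stemmed tokens, stopwords kept: ['rt', 'flood', 'warn', 'in']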
In [96]:
df.head()
Out[96]:
In [113]:
# Check for leftover shortened-URL fragments: once punctuation is removed,
# any surviving "t.co" remnant shows up as the token "tco"
#print(len(df[df['text_clean1'].str.contains("t.co")].index.values[:]))
print(len(df[df['text_clean1'].str.contains(re.escape('tco'))].index.values[:]))
print(df[df['text_clean1'].str.contains(re.escape('tco'))].index.values[:])
print(df[df['text_clean1'].str.contains(re.escape('tco'))]['text_clean1'][191])
In [126]:
# Write the DataFrame to Parquet; Snappy compression appears unavailable on Win10, so use GZIP
import fastparquet
fastparquet.write('5col_DFrame.parq', df, compression='GZIP')
#df.to_csv('5col_DFrame.csv')  # alternative output, kept for file-size comparison
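To verify the round trip (reading back the file written above):
# Read the Parquet file back and confirm the DataFrame survived the round trip
pf = fastparquet.ParquetFile('5col_DFrame.parq')
print(pf.to_pandas().shape)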
In [127]:
### Continuation in .* FastText_vectors OR .*BuildModel file ###