In [159]:
import pandas as pd
#from gensim.models.wrappers import FastText
import gensim
import fastparquet
import csv
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('punkt')
nltk.download("stopwords")
from nltk.corpus import stopwords
import re
import string
import time
In [160]:
pd.options.mode.chained_assignment = None
In [161]:
URL = 'train_data.csv'
def load_data(url=URL):
    return pd.read_csv(url)
In [162]:
df = load_data()
In [163]:
df.tail()
Out[163]:
In [321]:
# Collect stopwords from different sources and merge them, deleting duplicates
nltk.download("stopwords")
from nltk.corpus import stopwords
import requests
# function to delete duplicates while preserving order
def del_dup(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]
stopword_removed_sentences = []
sw = list(stopwords.words("english"))
sw.extend(["rt", "retweet"])
# # Additional stopwords from MIT
# response = requests.get('http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a11-smart-stop-list/english.stop')
# sw.extend(response.text.split())
# # Strip non-word characters and delete duplicates
# sw = [re.sub(r'[\W_]+', '', x) for x in sw]
# sw = del_dup(sw)
sw.remove("not")
print(sorted(sw))
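# A minimal sketch (made-up values, not from the corpus) of what del_dup does:
# it keeps only the first occurrence of each element and, unlike set(), preserves
# the original ordering of the stopword list.
demo = ["not", "rt", "the", "rt", "a", "the"]
print(del_dup(demo))   # -> ['not', 'rt', 'the', 'a']
print("not" in sw)     # -> False, since "not" was removed from the stopword list above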
In [322]:
def cleantweet1(s):
    '''
    :s : string; a tweet
    :return : string; lower-cased tweet with URLs, punctuation and stopwords removed
    '''
    # Read match pattern from function attribute (attached in the next cell)
    s = re.sub(cleantweet1.pattern, '', s)
    #words = word_tokenize(s)
    #s = " ".join(words)
    s = re.sub(r'[^a-zA-Z\s]', '', s)
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    # actually remove the punctuation
    s = s.translate(remove_punctuation_map)
    s = [word.lower() for word in s.split() if word.lower() not in sw]
    s = " ".join(s)
    return s

def cleantweet2(s):
    '''
    :s : string; a tweet
    :return : list; hashtag words, lower-cased'''
    s = re.findall(r'#(\w+)', s)
    s = " ".join([word.lower() for word in s])
    s = re.sub(r'[^a-zA-Z\s]', '', s)
    s = s.split()
    return s

def cleantweet3(s):
    '''
    :s : string; a tweet
    :return : list; lower-cased words with @mentions, hashtags, URLs and stopwords removed'''
    # Read match pattern from function attribute (attached in the next cell)
    s = re.sub(cleantweet3.pattern, '', s)
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    # actually remove the punctuation
    s = s.translate(remove_punctuation_map)
    s = [word.lower() for word in s.split() if word.lower() not in sw]
    return s

def cleantweet4(s):
    '''
    :s : string; a tweet
    :return : list; stemmed, lower-cased tokens with the match pattern and punctuation removed
    '''
    s = re.sub(cleantweet4.pattern, '', s)
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    # actually remove the punctuation
    s = s.translate(remove_punctuation_map)
    #s = " ".join([word.lower() for word in s.split() if word.lower() not in sw])
    #s = [word.lower() for word in s.split() if word.lower() not in sw]
    # tokenize tweet into sentences
    sents = sent_tokenize(s)
    # tokenize sentences into lists of words
    words = [word_tokenize(s) for s in sents]
    # flatten the per-sentence word lists into one list of tokens
    words = [e for sent in words for e in sent]
    return [cleantweet4.stemmer.stem(e.lower()) for e in words]
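# A small illustrative check (hypothetical words, not from the corpus) of the Porter
# stemmer that cleantweet4 expects as a function attribute (see the commented-out
# assignment in the next cell): it strips morphological endings rather than producing
# dictionary lemmas.
stem_demo = PorterStemmer()
print([stem_demo.stem(w) for w in ["running", "flies", "nations", "studies"]])
# roughly -> ['run', 'fli', 'nation', 'studi']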
In [323]:
cleantweet1.pattern = re.compile(r'(http\S+)')
cleantweet3.pattern = re.compile(r'(@\w+)|(#\w+)|[^a-zA-Z\s]|(\w+:\/\/\S+)', flags=re.IGNORECASE)
#start = time.perf_counter()
#df["text_clean1"] = df.iloc[0:].text.apply(cleantweet1)
#following is consistently faster
df.loc[:,'text_clean1'] = df.loc[:,'text'].map(cleantweet1)
#end = time.perf_counter()
df.loc[:,'hashtags'] = df.loc[:,'text'].map(cleantweet2)
#print(end - start)
# Removing the morphological and inflexional endings from words in English.
#cleantweet4.stemmer = PorterStemmer()
df.loc[:,'wordlist'] = df.loc[:,'text'].map(cleantweet3)
df.loc[:,'no_names_hashtags'] = df.loc[:,'wordlist'].map(lambda slocal:' '.join(slocal))
#df.loc[:,'wordstring'] = df.loc[:,'wordlist'].map(lambda slocal:' '.join(slocal))
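# Hedged sanity check on a made-up tweet (not a row from the dataset), runnable only
# after the patterns above are attached: cleantweet1 returns a lower-cased string with
# URLs, punctuation and stopwords stripped, cleantweet2 returns just the hashtag words.
sample = "RT @user: Not a fan of #Mondays!! http://t.co/abc"
print(cleantweet1(sample))   # expected roughly: 'user not fan mondays'
print(cleantweet2(sample))   # expected: ['mondays']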
In [331]:
df.head()
Out[331]:
In [364]:
# write data, SNAPPY not available on win10 ?
# import fastparquet
# fastparquet.write('5col_DFrame.parq', df, compression='GZIP')
#df.to_csv('5col_DFrame.csv') file size comparison
# write data to txt
dftrain = pd.DataFrame()
dfmda = pd.DataFrame()
dftrain['data'] = "__label__" + df.loc[:,'sentiment'] + " " + df.loc[:,'text_clean1']
import random
import numpy as np
# def some(x, n):
#     return x.ix[random.sample(x.index, n)]
# reproducible 90/10 train/test split
np.random.seed(20)
msk = np.random.rand(len(dftrain)) < 0.9
train = dftrain[msk]
test = dftrain[~msk]
test.head()
# with open('train.txt', 'w') as f:
#     dftrain.iloc[0:10000].to_csv(f, header=None, index=None, sep='\n', quoting=csv.QUOTE_NONE, quotechar='')
# with open('test.txt', 'w') as f:
#     dftrain.iloc[10000:].to_csv(f, header=None, index=None, sep='\n', quoting=csv.QUOTE_NONE, quotechar='')
with open('train.txt', 'w') as f:
    train.to_csv(f, header=None, index=None, sep='\n', quoting=csv.QUOTE_NONE, quotechar='')
with open('test.txt', 'w') as f:
    test.to_csv(f, header=None, index=None, sep='\n', quoting=csv.QUOTE_NONE, quotechar='')
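# Hedged sanity check of the training file: each line should follow the fastText
# supervised format built above, i.e. '__label__<sentiment> <cleaned tweet text>'.
with open('train.txt') as f:
    print(f.readline().strip())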
In [365]:
###FAST TEXT###
In [366]:
import fasttext
# train a supervised classifier on train.txt with the (pre-0.9) fasttext Python wrapper;
# the trained model is saved under the 'model' prefix
classifier = fasttext.supervised('train.txt', 'model')
In [367]:
result = classifier.test('test.txt')
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)
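# Hedged cross-check of P@1 with classifier.predict: for single-label data, precision
# at 1 is just accuracy. This assumes the old fasttext wrapper used above, whose
# predict() returns one list of labels per input text, without the '__label__' prefix.
with open('test.txt') as f:
    test_lines = [line.strip() for line in f if line.strip()]
gold = [line.split(' ', 1)[0].replace('__label__', '') for line in test_lines]
texts_only = [line.split(' ', 1)[1] if ' ' in line else '' for line in test_lines]
preds = [p[0] if p else '' for p in classifier.predict(texts_only)]
print('accuracy:', sum(g == p for g, p in zip(gold, preds)) / len(gold))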
In [107]:
texts = ['motherfucking', 'I really like what senator mccain said about the future of our great nation']
labels = classifier.predict(texts)
print(labels)