In [159]:
import pandas as pd

#from gensim.models.wrappers import FastText
import gensim
import fastparquet
import csv

import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('punkt')
nltk.download("stopwords") 
from nltk.corpus import stopwords

import re
import string
import time


[nltk_data] Downloading package punkt to /Users/TIE/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/TIE/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

In [160]:
pd.options.mode.chained_assignment = None

In [161]:
URL = 'train_data.csv'
def load_data(url=URL):
    return pd.read_csv(url)

In [162]:
df = load_data()

In [163]:
df.tail()


Out[163]:
sentiment text
10494 Negative The question about God and the Veterans. What ...
10495 Negative I thought #LastComicStanding airs on Wednesday...
10496 Negative Bingo! Put that in your article!!! #GOPDebates...
10497 Negative RT @RWSurferGirl: Fox is cherry picking the ca...
10498 Neutral Waiting on Trumps answer about God #GOPDebates...
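
A quick look at the label distribution is a useful sanity check at this point (plain pandas, shown as a sketch only):

# rough check for class imbalance across the three sentiment labels
print(df['sentiment'].value_counts())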

In [321]:
# Collect stopwords from several sources, merge them, and drop duplicates

nltk.download("stopwords") 
from nltk.corpus import stopwords
import requests

# function to delete duplicates
def del_dup(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]

stopword_removed_sentences = []
sw = list(stopwords.words("english"))  # NLTK corpus fileids are lower-case
sw.extend(["rt", "retweet"])

# # Additional stopwords from MIT
# response = requests.get('http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a11-smart-stop-list/english.stop')
# sw.extend(response.text.split())

# # Deleting duplicates
# for i in sw:
#     sw = [ re.sub('[\W_]+', '', x) for x in sw ]
# sw = del_dup(sw)
sw.remove("not")
print(sorted(sw))


[nltk_data] Downloading package stopwords to /Users/TIE/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', 'd', 'did', 'didn', 'do', 'does', 'doesn', 'doing', 'don', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', 'has', 'hasn', 'have', 'haven', 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', 'more', 'most', 'mustn', 'my', 'myself', 'needn', 'no', 'nor', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 'retweet', 'rt', 's', 'same', 'shan', 'she', 'should', 'shouldn', 'so', 'some', 'such', 't', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was', 'wasn', 'we', 'were', 'weren', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won', 'wouldn', 'y', 'you', 'your', 'yours', 'yourself', 'yourselves']
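
If the extra SMART stop list from MIT were merged in (the block commented out above), the cleanup would look roughly like the sketch below; it needs network access and reuses del_dup to keep the first occurrence of each word:

# sketch of the commented-out merge: fetch the SMART stop list, normalise entries, drop duplicates
response = requests.get('http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a11-smart-stop-list/english.stop')
sw.extend(response.text.split())
sw = [re.sub(r'[\W_]+', '', x) for x in sw]  # strip apostrophes and other non-word characters
sw = del_dup(sw)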

In [322]:
def cleantweet1(s):
    '''
    :s : string; a tweet
    :return : string; lower-cased tweet with URLs, digits, punctuation and stopwords removed
    '''
    # the compiled URL regex is attached as a function attribute in a later cell
    s = re.sub(cleantweet1.pattern, '', s)
    #words = word_tokenize(s)
    #s = " ".join(words)
    s = re.sub(r'[^a-zA-Z\s]', '', s)
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    # actually remove the punctuation
    s = s.translate(remove_punctuation_map)
    s = [word.lower() for word in s.split() if word.lower() not in sw]
    s = " ".join(s)
    return s

def cleantweet2(s):
    '''
    :s : string; a tweet
    :return : list; lower-cased hashtag words (without the leading '#')
    '''
    s = re.findall(r'#(\w+)', s)
    s = " ".join([word.lower() for word in s])
    s = re.sub(r'[^a-zA-Z\s]', '', s)
    s = s.split()
    return s

def cleantweet3(s):
    '''
    :s : string; a tweet
    :return : list; lower-cased words with mentions, hashtags, URLs, punctuation and stopwords removed
    '''
    # the compiled regex is attached as a function attribute in a later cell
    s = re.sub(cleantweet3.pattern, '', s)
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    # actually remove the punctuation
    s = s.translate(remove_punctuation_map)
    s = [word.lower() for word in s.split() if word.lower() not in sw]
    return s

def cleantweet4(s):
    '''
    :s : string; a tweet
    :return : list; stemmed, lower-cased tokens with the match pattern and punctuation removed
    '''
    # note: cleantweet4.pattern / cleantweet4.stemmer are never assigned below, so this helper stays unused
    s = re.sub(cleantweet4.pattern, '', s)
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    # actually remove the punctuation
    s = s.translate(remove_punctuation_map)
    #s = " ".join([word.lower() for word in s.split() if word.lower() not in sw])
    #s = [word.lower() for word in s.split() if word.lower() not in sw]


    # tokenize tweet into sentences
    sents = sent_tokenize(s)
    # tokenize sentences into list of words
    words = [word_tokenize(s) for s in sents]
    # flatten the per-sentence token lists into a single flat list of words
    words = [e for sent in words for e in sent]
    return [cleantweet4.stemmer.stem(e.lower()) for e in words]

In [323]:
# attach the compiled regexes as function attributes:
# URLs for cleantweet1; mentions, hashtags, non-letters and URLs for cleantweet3
cleantweet1.pattern = re.compile(r'(http\S+)')
cleantweet3.pattern = re.compile(r'(@\w+)|(#\w+)|[^a-zA-Z\s]|(\w+:\/\/\S+)', flags=re.IGNORECASE)
#start = time.perf_counter()
#df["text_clean1"] = df.iloc[0:].text.apply(cleantweet1)
# the .map() calls below are consistently faster than the .apply() variant above
df.loc[:,'text_clean1'] = df.loc[:,'text'].map(cleantweet1)
#end = time.perf_counter()
df.loc[:,'hashtags'] = df.loc[:,'text'].map(cleantweet2)
#print(end - start)
# The Porter stemmer would strip morphological and inflectional endings from English words
# (left disabled, so the stemming variant cleantweet4 is not applied below)
#cleantweet4.stemmer = PorterStemmer()
df.loc[:,'wordlist'] = df.loc[:,'text'].map(cleantweet3)
df.loc[:,'no_names_hashtags'] = df.loc[:,'wordlist'].map(lambda slocal:' '.join(slocal))
#df.loc[:,'wordstring'] = df.loc[:,'wordlist'].map(lambda slocal:' '.join(slocal))
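
As a quick sanity check, the three cleaners can be run on a single made-up tweet; the string and the expected outputs in the comments are illustrative, not rows from the dataset:

sample = "RT @kwrcrow: Dr. Carson remark on DC having half a brain was the best line #GOPdebates http://t.co/abc"
print(cleantweet1(sample))  # e.g. 'kwrcrow dr carson remark dc half brain best line gopdebates'
print(cleantweet2(sample))  # e.g. ['gopdebates']
print(cleantweet3(sample))  # e.g. ['dr', 'carson', 'remark', 'dc', 'half', 'brain', 'best', 'line']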

In [331]:
df.head()


Out[331]:
sentiment text text_clean1 hashtags wordlist no_names_hashtags
10000 Positive RT @kwrcrow: Dr. Carson remark on DC having ha... kwrcrow dr carson remark dc half brain best li... [gopdebates] [dr, carson, remark, dc, half, brain, best, line] dr carson remark dc half brain best line
10001 Neutral #GOPdebates bash Hillary night slick Willy is ... gopdebates bash hillary night slick willy lovin [gopdebates] [bash, hillary, night, slick, willy, lovin] bash hillary night slick willy lovin
10002 Negative Cancel the primaries. Fox Party set up & a... cancel primaries fox party set amp anointed ma... [gopdebates, canttrustabush, morningjoe] [cancel, primaries, fox, party, set, amp, anoi... cancel primaries fox party set amp anointed ma...
10003 Neutral RT @IanGaryTweets: This is real life. These pe... iangarytweets real life people running powerfu... [gopdebates] [real, life, people, running, powerful, office... real life people running powerful office world
10004 Negative RT @SupermanHotMale: Dear Jeb Bush, Your Recor... supermanhotmale dear jeb bush record sir clear... [gopdebates] [dear, jeb, bush, record, sir, clear, fucking,... dear jeb bush record sir clear fucking ememy v...

In [364]:
# write data to parquet; SNAPPY compression not available on Win10?
# import fastparquet
# fastparquet.write('5col_DFrame.parq', df, compression='GZIP')
# df.to_csv('5col_DFrame.csv')  # kept for file size comparison

# write data to txt
dftrain = pd.DataFrame()

dfmda = pd.DataFrame()

# prepend the fastText label prefix ("__label__<sentiment>") to each cleaned tweet
dftrain.loc[:, 'data'] = "__label__" + df.loc[:,'sentiment'] + " " + df.loc[:,'text_clean1']

import random
import numpy as np

# def some(x, n):
#     return x.ix[random.sample(x.index, n)]


np.random.seed(20)

# random ~90/10 train/test split via a boolean mask
msk = np.random.rand(len(dftrain)) < 0.9

train = dftrain[msk]

test = dftrain[~msk]

test.head()

# with open('train.txt', 'w') as f:
#     dftrain.iloc[0:10000].to_csv(f, header=None, index=None, sep='\n', quoting=csv.QUOTE_NONE, quotechar='')


# with open('test.txt', 'w') as f:
#     dftrain.iloc[10000:].to_csv(f, header=None, index=None, sep='\n', quoting=csv.QUOTE_NONE, quotechar='')

with open('train.txt', 'w') as f:
    train.to_csv(f, header=None, index=None, sep='\n', quoting=csv.QUOTE_NONE, quotechar='')


with open('test.txt', 'w') as f:
    test.to_csv(f, header=None, index=None, sep='\n', quoting=csv.QUOTE_NONE, quotechar='')
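
To confirm the files match the "__label__<class> <text>" format fastText expects, one can peek at the first few lines (sketch only; the exact rows depend on the random mask):

# each line should read "__label__<sentiment> <cleaned tweet>"
with open('train.txt') as f:
    for _ in range(3):
        print(f.readline().strip())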

In [365]:
### fastText ###

In [366]:
import fasttext

classifier = fasttext.supervised('train.txt', 'model')
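
The fasttext.supervised call above is from the older unofficial Python wrapper; with the current official fasttext binding the equivalent training step would look roughly like this (API names assumed, not run here):

# rough equivalent using the official fastText Python binding
import fasttext
model = fasttext.train_supervised(input='train.txt')
model.save_model('model.bin')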

In [367]:
result = classifier.test('test.txt')


print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)


P@1: 0.6666666666666666
R@1: 0.6666666666666666
Number of examples: 1065
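
For reference, the official binding reports the same metrics through model.test, which returns the number of examples, precision at one and recall at one (continuing the sketch above, API assumed):

# sketch: evaluation with the official binding
n_examples, p_at_1, r_at_1 = model.test('test.txt')
print('P@1:', p_at_1, 'R@1:', r_at_1, 'Number of examples:', n_examples)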

In [107]:
texts = ['motherfucking', 'I really like what senator mccain said about the future of our great nation']
labels = classifier.predict(texts)
print(labels)


[['Negative'], ['Positive']]
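
With the official binding, predictions also come back with probabilities and the labels keep their "__label__" prefix; a rough equivalent of the cell above (continuing the earlier sketch, API assumed):

# sketch: k=1 returns the single best label per input text, plus its probability
labels, probs = model.predict(texts, k=1)
print(labels, probs)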