In [159]:
import pandas as pd
#from gensim.models.wrappers import FastText
import gensim
import fastparquet
import csv
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('punkt')
nltk.download("stopwords")
from nltk.corpus import stopwords
import re
import string
import time
In [160]:
pd.options.mode.chained_assignment = None
In [161]:
URL = 'train_data.csv'
def load_data(url=URL):
    return pd.read_csv(url)
In [162]:
df = load_data()
In [163]:
df.tail()
Out[163]:
In [321]:
# Collect stopwords from different sources and merge them, deleting duplicates
nltk.download("stopwords")
from nltk.corpus import stopwords
import requests
# function to delete duplicates while preserving order
def del_dup(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]
stopword_removed_sentences = []
sw = list(stopwords.words("english"))
sw.extend(["rt", "retweet"])
# # Additional stopwords from MIT
# response = requests.get('http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a11-smart-stop-list/english.stop')
# sw.extend(response.text.split())
# # Strip non-word characters and delete duplicates
# sw = [re.sub(r'[\W_]+', '', x) for x in sw]
# sw = del_dup(sw)
sw.remove("not")
print(sorted(sw))
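# A minimal sketch (made-up values, not from the corpus) of what del_dup does:
# it keeps only the first occurrence of each element and, unlike set(), preserves
# the original ordering of the stopword list.
demo = ["not", "rt", "the", "rt", "a", "the"]
print(del_dup(demo))   # -> ['not', 'rt', 'the', 'a']
print("not" in sw)     # -> False, since "not" was removed from the stopword list above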
In [322]:
def cleantweet1(s):
    '''
    :s : string; a tweet
    :return : string; lower-cased tweet with URLs, punctuation and stopwords removed
    '''
    # Read match pattern from function attribute (attached in the next cell)
    s = re.sub(cleantweet1.pattern, '', s)
    #words = word_tokenize(s)
    #s = " ".join(words)
    s = re.sub(r'[^a-zA-Z\s]', '', s)
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    # actually remove the punctuation
    s = s.translate(remove_punctuation_map)
    s = [word.lower() for word in s.split() if word.lower() not in sw]
    s = " ".join(s)
    return s

def cleantweet2(s):
    '''
    :s : string; a tweet
    :return : list; hashtag words, lower-cased'''
    s = re.findall(r'#(\w+)', s)
    s = " ".join([word.lower() for word in s])
    s = re.sub(r'[^a-zA-Z\s]', '', s)
    s = s.split()
    return s

def cleantweet3(s):
    '''
    :s : string; a tweet
    :return : list; lower-cased words with @mentions, hashtags, URLs and stopwords removed'''
    # Read match pattern from function attribute (attached in the next cell)
    s = re.sub(cleantweet3.pattern, '', s)
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    # actually remove the punctuation
    s = s.translate(remove_punctuation_map)
    s = [word.lower() for word in s.split() if word.lower() not in sw]
    return s

def cleantweet4(s):
    '''
    :s : string; a tweet
    :return : list; stemmed, lower-cased tokens with the match pattern and punctuation removed
    '''
    s = re.sub(cleantweet4.pattern, '', s)
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    # actually remove the punctuation
    s = s.translate(remove_punctuation_map)
    #s = " ".join([word.lower() for word in s.split() if word.lower() not in sw])
    #s = [word.lower() for word in s.split() if word.lower() not in sw]
    # tokenize tweet into sentences
    sents = sent_tokenize(s)
    # tokenize sentences into lists of words
    words = [word_tokenize(s) for s in sents]
    # flatten the per-sentence word lists into one list of tokens
    words = [e for sent in words for e in sent]
    return [cleantweet4.stemmer.stem(e.lower()) for e in words]
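# A small illustrative check (hypothetical words, not from the corpus) of the Porter
# stemmer that cleantweet4 expects as a function attribute (see the commented-out
# assignment in the next cell): it strips morphological endings rather than producing
# dictionary lemmas.
stem_demo = PorterStemmer()
print([stem_demo.stem(w) for w in ["running", "flies", "nations", "studies"]])
# roughly -> ['run', 'fli', 'nation', 'studi']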
In [323]:
cleantweet1.pattern = re.compile(r'(http\S+)')
cleantweet3.pattern = re.compile(r'(@\w+)|(#\w+)|[^a-zA-Z\s]|(\w+:\/\/\S+)', flags=re.IGNORECASE)
#start = time.perf_counter()
#df["text_clean1"] = df.iloc[0:].text.apply(cleantweet1)
#following is consistently faster
df.loc[:,'text_clean1'] = df.loc[:,'text'].map(cleantweet1)
#end = time.perf_counter()
df.loc[:,'hashtags'] = df.loc[:,'text'].map(cleantweet2)
#print(end - start)
# Removing the morphological and inflexional endings from words in English.
#cleantweet4.stemmer = PorterStemmer()
df.loc[:,'wordlist'] = df.loc[:,'text'].map(cleantweet3)
df.loc[:,'no_names_hashtags'] = df.loc[:,'wordlist'].map(lambda slocal:' '.join(slocal))
#df.loc[:,'wordstring'] = df.loc[:,'wordlist'].map(lambda slocal:' '.join(slocal))
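# Hedged sanity check on a made-up tweet (not a row from the dataset), runnable only
# after the patterns above are attached: cleantweet1 returns a lower-cased string with
# URLs, punctuation and stopwords stripped, cleantweet2 returns just the hashtag words.
sample = "RT @user: Not a fan of #Mondays!! http://t.co/abc"
print(cleantweet1(sample))   # expected roughly: 'user not fan mondays'
print(cleantweet2(sample))   # expected: ['mondays']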
In [331]:
df.head()
Out[331]:
In [364]:
# write data, SNAPPY not available on win10 ?
# import fastparquet
# fastparquet.write('5col_DFrame.parq', df, compression='GZIP')
#df.to_csv('5col_DFrame.csv') file size comparison
# write data to txt
dftrain = pd.DataFrame()
dfmda = pd.DataFrame()
dftrain['data'] = "__label__" + df.loc[:,'sentiment'] + " " + df.loc[:,'text_clean1']
import random
import numpy as np
# def some(x, n):
#     return x.ix[random.sample(x.index, n)]
# reproducible 90/10 train/test split
np.random.seed(20)
msk = np.random.rand(len(dftrain)) < 0.9
train = dftrain[msk]
test = dftrain[~msk]
test.head()
# with open('train.txt', 'w') as f:
#     dftrain.iloc[0:10000].to_csv(f, header=None, index=None, sep='\n', quoting=csv.QUOTE_NONE, quotechar='')
# with open('test.txt', 'w') as f:
#     dftrain.iloc[10000:].to_csv(f, header=None, index=None, sep='\n', quoting=csv.QUOTE_NONE, quotechar='')
with open('train.txt', 'w') as f:
    train.to_csv(f, header=None, index=None, sep='\n', quoting=csv.QUOTE_NONE, quotechar='')
with open('test.txt', 'w') as f:
    test.to_csv(f, header=None, index=None, sep='\n', quoting=csv.QUOTE_NONE, quotechar='')
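# Hedged sanity check of the training file: each line should follow the fastText
# supervised format built above, i.e. '__label__<sentiment> <cleaned tweet text>'.
with open('train.txt') as f:
    print(f.readline().strip())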
In [365]:
###FAST TEXT###
In [366]:
import fasttext
# train a supervised classifier on train.txt with the (pre-0.9) fasttext Python wrapper;
# the trained model is saved under the 'model' prefix
classifier = fasttext.supervised('train.txt', 'model')
In [367]:
result = classifier.test('test.txt')
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)
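# Hedged cross-check of P@1 with classifier.predict: for single-label data, precision
# at 1 is just accuracy. This assumes the old fasttext wrapper used above, whose
# predict() returns one list of labels per input text, without the '__label__' prefix.
with open('test.txt') as f:
    test_lines = [line.strip() for line in f if line.strip()]
gold = [line.split(' ', 1)[0].replace('__label__', '') for line in test_lines]
texts_only = [line.split(' ', 1)[1] if ' ' in line else '' for line in test_lines]
preds = [p[0] if p else '' for p in classifier.predict(texts_only)]
print('accuracy:', sum(g == p for g, p in zip(gold, preds)) / len(gold))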
In [107]:
texts = ['motherfucking', 'I really like what senator mccain said about the future of our great nation']
labels = classifier.predict(texts)
print(labels)