In [2]:
import pandas as pd

#from gensim.models.wrappers import FastText
import gensim
import csv

import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('punkt')
nltk.download("stopwords") 
from nltk.corpus import stopwords

import re
import string
import time


C:\Users\goldaw\AppData\Local\Continuum\Anaconda3\lib\site-packages\gensim\utils.py:855: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\goldaw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\goldaw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

In [3]:
# silence pandas' SettingWithCopyWarning for the column assignments done later
pd.options.mode.chained_assignment = None

In [4]:
# the training data (sentiment label + tweet text) is read from a local CSV file
URL = 'train_data.csv'
def load_data(url=URL):
    return pd.read_csv(url)

In [5]:
df = load_data()

In [6]:
df.tail()


Out[6]:
sentiment text
10494 Negative The question about God and the Veterans. What ...
10495 Negative I thought #LastComicStanding airs on Wednesday...
10496 Negative Bingo! Put that in your article!!! #GOPDebates...
10497 Negative RT @RWSurferGirl: Fox is cherry picking the ca...
10498 Neutral Waiting on Trumps answer about God #GOPDebates...

In [7]:
# Collect stopwords from several sources, merge them, and drop duplicates

nltk.download("stopwords")
from nltk.corpus import stopwords
import requests

# helper that removes duplicates while preserving the original order
def del_dup(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]

sw = list(stopwords.words("english"))

# extend with the SMART stop list and a few Twitter-specific tokens
response = requests.get('http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a11-smart-stop-list/english.stop')
sw.extend(response.text.split())
sw.extend(["rt", "retweet"])
# strip punctuation and underscores from every stopword (a single pass is enough)
sw = [re.sub(r'[\W_]+', '', x) for x in sw]
sw = del_dup(sw)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\goldaw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
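
As an aside, the order-preserving de-duplication that del_dup implements can also be written with dict.fromkeys on Python 3.7+ (where dicts keep insertion order); a minimal sketch, not used in this notebook:

# order-preserving dedup via dict keys; equivalent to del_dup for hashable items
sw_dedup = list(dict.fromkeys(sw))
assert sw_dedup == del_dup(sw)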

In [94]:
def cleantweet1(s):
    '''
    :s : string; a tweet
    :return : string; lower-cased tweet with URLs, @mentions, non-letters,
              punctuation and stopwords removed
    '''
    # remove full URLs and leftover shortened-link fragments
    s = re.sub(r"http\S+", '', s)
    s = re.sub(r"t\.co\s+", '', s)
    # match pattern (@mentions and non-letters) is read from a function attribute set later
    s = re.sub(cleantweet1.pattern, '', s)
    # build a translation table mapping every punctuation character to None
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    # actually remove the punctuation
    s = s.translate(remove_punctuation_map)
    # lower-case and drop stopwords
    s = [word.lower() for word in s.split() if word.lower() not in sw]
    s = " ".join(s)
    return s

def cleantweet2(s):
    '''
    :s : string; a tweet
    :return : list; lower-cased hashtags with non-letter characters stripped
    '''
    s = re.findall(r'#(\w+)', s)
    s = " ".join([word.lower() for word in s])
    s = re.sub(r'[^a-zA-Z\s]', '', s)
    s = s.split()
    return s

def cleantweet3(s):
    '''
    :s : string; a tweet
    :return : list; lower-cased words with @mentions, hashtags, URLs, punctuation and stopwords removed
    '''
    #Read matchpattern from function property (introduced later)
    s = re.sub(cleantweet3.pattern, '', s)
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    # actually remove the punctuation
    s = s.translate(remove_punctuation_map)
    s = [word.lower() for word in s.split() if word.lower() not in sw]
    return s

def cleantweet4(s):
    '''
    :s : string; a tweet
    :return : list; Porter-stemmed, lower-cased word tokens
    '''
    # match pattern and stemmer are read from function attributes (set elsewhere)
    s = re.sub(cleantweet4.pattern, '', s)
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    # actually remove the punctuation
    s = s.translate(remove_punctuation_map)
    #s = " ".join([word.lower() for word in s.split() if word.lower() not in sw])
    #s = [word.lower() for word in s.split() if word.lower() not in sw]


    # tokenize tweet into sentences
    sents = sent_tokenize(s)
    # tokenize sentences into list of words
    words = [word_tokenize(s) for s in sents]
    # flatten the per-sentence token lists into a single flat list of words
    words = [e for sent in words for e in sent]
    return [cleantweet4.stemmer.stem(e.lower()) for e in words]
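
The cleaners read their regexes (and, for cleantweet4, the stemmer) from attributes attached to the function objects, which are assigned in the next cell; a tiny self-contained illustration of that idiom, with purely hypothetical names:

# configuration stored on the function object itself
def shout(s):
    return s.upper() + shout.suffix

shout.suffix = "!!!"
print(shout("debate"))   # -> DEBATE!!!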

In [95]:
# regex patterns are attached to the cleaner functions as attributes:
# cleantweet1 strips @mentions and non-letters; cleantweet3 additionally strips hashtags and URLs
cleantweet1.pattern = re.compile(r'(@\w+)|[^a-zA-Z\s]', flags=re.IGNORECASE)
cleantweet3.pattern = re.compile(r'(@\w+)|(#\w+)|[^a-zA-Z\s]|(\w+:\/\/\S+)', flags=re.IGNORECASE)

#start = time.perf_counter()
#df["text_clean1"] = df.iloc[0:].text.apply(cleantweet1)
#end = time.perf_counter()
#print(end - start)
# the .loc/.map form below was consistently faster than .apply in timing tests
df.loc[:,'text_clean1'] = df.loc[:,'text'].map(cleantweet1)
df.loc[:,'hashtags'] = df.loc[:,'text'].map(cleantweet2)
# PorterStemmer (used by cleantweet4) would strip morphological and inflexional
# endings from English words; the stemmed variant is left unused here
#cleantweet4.stemmer = PorterStemmer()
df.loc[:,'wordlist'] = df.loc[:,'text'].map(cleantweet3)
df.loc[:,'no_names_hashtags'] = df.loc[:,'wordlist'].map(lambda slocal: ' '.join(slocal))
#df.loc[:,'wordstring'] = df.loc[:,'wordlist'].map(lambda slocal: ' '.join(slocal))

In [96]:
df.head()


Out[96]:
sentiment text text_clean1 hashtags wordlist no_names_hashtags
0 Neutral RT @NancyLeeGrahn: How did everyone feel about... feel climate change question night gopdebate [gopdebate] [feel, climate, change, question, night] feel climate change question night
1 Positive RT @ScottWalker: Didn't catch the full #GOPdeb... catch full gopdebate night scotts lines second... [gopdebate, walker] [catch, full, night, scotts, lines, seconds] catch full night scotts lines seconds
2 Neutral RT @TJMShow: No mention of Tamir Rice and the ... mention tamir rice gopdebate held cleveland wow [gopdebate] [mention, tamir, rice, held, cleveland, wow] mention tamir rice held cleveland wow
3 Positive RT @RobGeorge: That Carly Fiorina is trending ... carly fiorina trending hours debate men justco... [gopdebate] [carly, fiorina, trending, hours, debate, men,... carly fiorina trending hours debate men justco...
4 Positive RT @DanScavino: #GOPDebate w/ @realDonaldTrump... gopdebate delivered highest ratings history pr... [gopdebate, trump] [delivered, highest, ratings, history, preside... delivered highest ratings history presidential...
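
For reference, a quick sanity check of the three cleaners on a single made-up tweet (the text below is a hypothetical example, not taken from the dataset); the exact output depends on the merged stopword list sw:

sample = "RT @SomeUser: Great #GOPDebate tonight! https://t.co/abc123"
print(cleantweet1(sample))   # cleaned, lower-cased string with URL, @mention and stopwords removed
print(cleantweet2(sample))   # list of lower-cased hashtags, e.g. ['gopdebate']
print(cleantweet3(sample))   # list of word tokens with @mentions, #hashtags and URLs stripped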

In [113]:
# Inspect how many cleaned tweets still contain the substring 'tco'
# (residue of truncated t.co links once punctuation is stripped; the hashtag #tcot also matches)
#print(len(df[df['text_clean1'].str.contains("t.co")].index.values[:]))
print(len(df[df['text_clean1'].str.contains(re.escape('tco'))].index.values[:]))
print(df[df['text_clean1'].str.contains(re.escape('tco'))].index.values[:])
print(df[df['text_clean1'].str.contains(re.escape('tco'))]['text_clean1'][191])


223
[    3    18   124   191   193   207   209   218   340   377   472   478
   485   521   554   577   599   608   624   645   657   673   681   688
   749   751   753   767   794   836   843   847   956   964   971   994
  1000  1041  1057  1127  1133  1181  1200  1224  1266  1328  1460  1475
  1478  1481  1552  1561  1574  1587  1621  1638  1660  1693  1713  1725
  1741  1745  1847  1971  1984  1988  2008  2033  2102  2157  2202  2207
  2210  2218  2247  2353  2406  2431  2470  2529  2544  2551  2651  2794
  2807  2851  2891  2908  2937  2946  2967  2989  2998  3068  3084  3189
  3256  3275  3307  3317  3325  3326  3417  3445  3475  3595  3631  3662
  3678  3693  3735  3874  3939  3996  4005  4007  4024  4075  4081  4107
  4129  4147  4181  4194  4195  4245  4251  4265  4297  4333  4335  4388
  4415  4443  4447  4454  4658  4709  4724  4744  4747  4852  4859  4915
  4992  4997  4999  5008  5026  5050  5063  5089  5215  5240  5294  5299
  5368  5429  5467  5563  5580  5631  5636  5715  5743  5799  6090  6191
  6202  6242  6273  6303  6386  6393  6413  6492  6561  6578  6645  6690
  6768  6818  6883  6899  6954  7056  7061  7070  7097  7132  7264  7490
  7512  7652  7673  7760  7771  7815  7866  7895  7896  7897  7900  7903
  7911  7915  7953  7975  8086  8280  8626  8677  8685  8803  8987  9038
  9387  9590  9598  9600 10361 10372 10495]
fire debate performance gopdebate perry tcot
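
If these leftover link fragments mattered downstream, one option (not applied here) would be to strip both full and truncated t.co links from the raw text before the punctuation-removal step, e.g. with a broader URL regex; a sketch under that assumption:

# hypothetical extra cleaning pass for shortened-link residue
url_residue = re.compile(r'(https?://\S+)|(\bt\.co/\S+)')
def strip_links(s):
    return url_residue.sub('', s)

# e.g. strip_links("see https://t.co/abc and t.co/xyz") -> "see  and "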

In [126]:
# Write the DataFrame to Parquet; Snappy compression did not appear to be available
# on this Windows 10 setup, so GZIP is used instead
import fastparquet
fastparquet.write('5col_DFrame.parq', df, compression='GZIP')
#df.to_csv('5col_DFrame.csv') file size comparison
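
To verify the written file, it can be read back with fastparquet (a quick check, not executed in this run):

pf = fastparquet.ParquetFile('5col_DFrame.parq')
df_check = pf.to_pandas()
df_check.head()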

In [127]:
### Continuation in the .*FastText_vectors or .*BuildModel notebook ###