In [1]:
# Here are the imports we'll use
import pandas as pd
import re
import nltk
from nltk.stem.snowball import SnowballStemmer

In [2]:
# Let's grab all the tweets from https://data.world/data4democracy/far-right/file/sample_tweets.json. Here is the URL:
df = pd.read_json('https://query.data.world/s/bsbt4eb4g8sm4dsgi7w2ecbkt')

In [3]:
# Let's take a look at it
df.head()
# Does anyone know the difference between message and text?


Out[3]:
created description followers friends_count hashtags id_str loc message name original_id original_name retweet retweet_count text topics user_created
0 2017-01-30 02:00:02 @mashable entertainment reporter. Proud native... 7912 6742 [] 825886254981976064 Los Angeles Was curious what was leading Breitbart. In cas... saba_h NaN None N 0 Was curious what was leading Breitbart. In cas... [cucks, breitbart, Oath Keeper, III%, MAGA, WA... 2010-08-22 04:16:16
1 2017-01-30 02:00:02 Novelist/freelance editor; Producer and Host ... 2576 2808 [] 825886256806690816 Colorado RT @RaheemKassam: College Lecturer Punched In ... LichenCraig 1.251287e+08 Raheem Kassam Y 0 RT @RaheemKassam: College Lecturer Punched In ... [cucks, breitbart, Oath Keeper, III%, MAGA, WA... 2012-01-31 03:42:35
2 2017-01-30 02:00:02 NJ conservative.Christian.God Bless the U.S. D... 1731 2408 ["CrybabiesCantLead"] 825886258283085824 None RT @Darren32895836: Head Clown @chuckschumer p... RostaMan09 4.761065e+09 STOCK MONSTER Y 0 RT @Darren32895836: Head Clown @chuckschumer p... [cucks, breitbart, Oath Keeper, III%, MAGA, WA... 2009-07-28 01:27:11
3 2017-01-30 02:00:02 Mother/wife~ Constitutionalist ~ conservative ... 1231 1461 [] 825886258798936064 United States RT @Timtravels007: On 12 September 2001, Ameri... roadtosingapore 7.275292e+17 Deplorable Tim 🇺🇸 Y 0 RT @Timtravels007: On 12 September 2001, Ameri... [cucks, breitbart, Oath Keeper, III%, MAGA, WA... 2011-01-27 03:06:46
4 2017-01-30 02:00:02 #MAGA. Owner of guns and partially blind so I ... 266 343 ["AmericaFirst", "TrumpCabinet", "PresidentTru... 825886257746149376 State Of Confusion RT @bfraser747: 🇺🇸🇺🇸 #AmericaFirst \n\nStop de... pilikianocan 2.748912e+08 Brian Fraser Y 0 RT @bfraser747: 🇺🇸🇺🇸 #AmericaFirst \n\nStop de... [cucks, breitbart, Oath Keeper, III%, MAGA, WA... 2016-06-21 19:29:25

In [4]:
print('Now we have all the tweets inside a {}'.format(type(df)))
print('There are a total of {} tweets in our dataset'.format(len(df)))


Now we have all the tweets inside a <class 'pandas.core.frame.DataFrame'>
There are a total of 5000 tweets in our dataset

In [5]:
# Let's see what different columns we have
print('Here are the columns we have: \n   {}'.format(df.columns))


Here are the columns we have: 
   Index(['created', 'description', 'followers', 'friends_count', 'hashtags',
       'id_str', 'loc', 'message', 'name', 'original_id', 'original_name',
       'retweet', 'retweet_count', 'text', 'topics', 'user_created'],
      dtype='object')

It looks like the topics section is the same for all tweets. These were the search terms used to collect the data


In [6]:
# Let's start by tokenizing all the words
df['tokenized'] = df['text'].apply (lambda row: nltk.word_tokenize(row))

In [7]:
# Let's add part of speech tags. This function can take a bit of time if it's a large dataset
df['tags']=df['tokenized'].apply(lambda row: nltk.pos_tag(row))

In [8]:
# Now let's remove stop words (e.g. and, to, an, etc.)
# We'll build a little function for that
def remove_stop_words(text):
    filtered = [word for word in text if word not in nltk.corpus.stopwords.words('english')]
    return filtered

df['no_stop'] = df['tokenized'].apply(lambda row: remove_stop_words(row))

In [9]:
# Now we can stem the remaining words
stemmer = SnowballStemmer("english")
df['stems'] = df['no_stop'].apply(lambda words: 
                                    [stemmer.stem(word) for word in words])

In [10]:
# OK, let's take another look at the dataframe
df.head()


Out[10]:
created description followers friends_count hashtags id_str loc message name original_id original_name retweet retweet_count text topics user_created tokenized tags no_stop stems
0 2017-01-30 02:00:02 @mashable entertainment reporter. Proud native... 7912 6742 [] 825886254981976064 Los Angeles Was curious what was leading Breitbart. In cas... saba_h NaN None N 0 Was curious what was leading Breitbart. In cas... [cucks, breitbart, Oath Keeper, III%, MAGA, WA... 2010-08-22 04:16:16 [Was, curious, what, was, leading, Breitbart, ... [(Was, NNP), (curious, JJ), (what, WP), (was, ... [Was, curious, leading, Breitbart, ., In, case... [was, curious, lead, breitbart, ., in, case, ,...
1 2017-01-30 02:00:02 Novelist/freelance editor; Producer and Host ... 2576 2808 [] 825886256806690816 Colorado RT @RaheemKassam: College Lecturer Punched In ... LichenCraig 1.251287e+08 Raheem Kassam Y 0 RT @RaheemKassam: College Lecturer Punched In ... [cucks, breitbart, Oath Keeper, III%, MAGA, WA... 2012-01-31 03:42:35 [RT, @, RaheemKassam, :, College, Lecturer, Pu... [(RT, NNP), (@, NNP), (RaheemKassam, NNP), (:,... [RT, @, RaheemKassam, :, College, Lecturer, Pu... [rt, @, raheemkassam, :, colleg, lectur, punch...
2 2017-01-30 02:00:02 NJ conservative.Christian.God Bless the U.S. D... 1731 2408 ["CrybabiesCantLead"] 825886258283085824 None RT @Darren32895836: Head Clown @chuckschumer p... RostaMan09 4.761065e+09 STOCK MONSTER Y 0 RT @Darren32895836: Head Clown @chuckschumer p... [cucks, breitbart, Oath Keeper, III%, MAGA, WA... 2009-07-28 01:27:11 [RT, @, Darren32895836, :, Head, Clown, @, chu... [(RT, NNP), (@, NNP), (Darren32895836, NNP), (... [RT, @, Darren32895836, :, Head, Clown, @, chu... [rt, @, darren32895836, :, head, clown, @, chu...
3 2017-01-30 02:00:02 Mother/wife~ Constitutionalist ~ conservative ... 1231 1461 [] 825886258798936064 United States RT @Timtravels007: On 12 September 2001, Ameri... roadtosingapore 7.275292e+17 Deplorable Tim 🇺🇸 Y 0 RT @Timtravels007: On 12 September 2001, Ameri... [cucks, breitbart, Oath Keeper, III%, MAGA, WA... 2011-01-27 03:06:46 [RT, @, Timtravels007, :, On, 12, September, 2... [(RT, NNP), (@, NNP), (Timtravels007, NNP), (:... [RT, @, Timtravels007, :, On, 12, September, 2... [rt, @, timtravels007, :, on, 12, septemb, 200...
4 2017-01-30 02:00:02 #MAGA. Owner of guns and partially blind so I ... 266 343 ["AmericaFirst", "TrumpCabinet", "PresidentTru... 825886257746149376 State Of Confusion RT @bfraser747: 🇺🇸🇺🇸 #AmericaFirst \n\nStop de... pilikianocan 2.748912e+08 Brian Fraser Y 0 RT @bfraser747: 🇺🇸🇺🇸 #AmericaFirst \n\nStop de... [cucks, breitbart, Oath Keeper, III%, MAGA, WA... 2016-06-21 19:29:25 [RT, @, bfraser747, :, 🇺🇸🇺🇸, #, AmericaFirst, ... [(RT, NNP), (@, NNP), (bfraser747, NN), (:, :)... [RT, @, bfraser747, :, 🇺🇸🇺🇸, #, AmericaFirst, ... [rt, @, bfraser747, :, 🇺🇸🇺🇸, #, americafirst, ...

In [11]:
# Let's see what variables we have so we know what to store for other datasets
%who


SnowballStemmer	 df	 nltk	 pd	 re	 remove_stop_words	 stemmer	 

In [12]:
# Since we add everything to the dataframe that's the only thing that appears to be worth storing
%store df


Stored 'df' (DataFrame)

To access this dataframe from another notebook, simply run this notebook from your other notebook with this command: %run ./CleanText.ipynb

Then, to load a variable use this command: %store -r df

You can look at the Hashtag Analysis notebook for an example.