In [1]:
# Here are the imports we'll use
import pandas as pd
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
In [2]:
# Load the sample tweets dataset directly from data.world.
# Source page: https://data.world/data4democracy/far-right/file/sample_tweets.json
# NOTE(review): this fetches over the network on every run — consider caching locally.
df = pd.read_json('https://query.data.world/s/bsbt4eb4g8sm4dsgi7w2ecbkt')
In [3]:
# Peek at the first few rows to see what columns and values we have
df.head()
# TODO: clarify the difference between the `message` and `text` columns
Out[3]:
In [4]:
# Report the container type and the number of tweets loaded
container_type = type(df)
n_tweets = len(df)
print('Now we have all the tweets inside a {}'.format(container_type))
print('There are a total of {} tweets in our dataset'.format(n_tweets))
In [5]:
# List the column names so we know what fields are available downstream
print('Here are the columns we have: \n {}'.format(df.columns))
It looks like the `topics` column is identical for every tweet — these were the search terms used to collect the data.
In [6]:
# Tokenize each tweet's text into a list of word/punctuation tokens.
# `nltk.word_tokenize` already takes a single string, so we can pass it to
# `apply` directly — wrapping it in a lambda added nothing.
df['tokenized'] = df['text'].apply(nltk.word_tokenize)
In [7]:
# Add part-of-speech tags for each token list.
# `nltk.pos_tag` takes a token list directly, so no lambda wrapper is needed.
# This can take a while on a large dataset.
df['tags'] = df['tokenized'].apply(nltk.pos_tag)
In [8]:
# Remove stop words (e.g. and, to, an, etc.) from each token list.
def remove_stop_words(text):
    """Return the tokens in `text` that are not English stop words.

    Parameters
    ----------
    text : list of str
        Tokenized words from a single tweet.

    Returns
    -------
    list of str
        The tokens with English stop words removed.
    """
    # Build the stop-word set once per call. The original evaluated
    # nltk.corpus.stopwords.words('english') inside the comprehension's
    # `if` clause, re-reading the corpus and doing a linear list scan for
    # EVERY token — O(tokens x stopwords). A set lookup is O(1).
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # NOTE(review): the stop-word list is lowercase, so capitalized tokens
    # like "The" pass through — consider lowercasing first if that matters.
    filtered = [word for word in text if word not in stop_words]
    return filtered

df['no_stop'] = df['tokenized'].apply(remove_stop_words)
In [9]:
# Reduce each remaining token to its stem with the English Snowball stemmer
stemmer = SnowballStemmer("english")

def _stem_all(tokens):
    """Stem every token in a list, preserving order."""
    return [stemmer.stem(tok) for tok in tokens]

df['stems'] = df['no_stop'].apply(_stem_all)
In [10]:
# Re-inspect the dataframe to confirm the new columns
# (tokenized, tags, no_stop, stems) were added as expected
df.head()
Out[10]:
In [11]:
# List the interactive variables in the kernel namespace,
# so we know which ones are worth persisting for other notebooks
%who
In [12]:
# Everything we computed lives in the dataframe, so persist only `df`.
# %store saves it in IPython's database so other notebooks can reload it.
%store df
To access this dataframe from another notebook, first execute this notebook from the other one with: %run ./CleanText.ipynb
Then restore the stored variable with: %store -r df
See the Hashtag Analysis notebook for a worked example.