In [7]:
import csv
import nltk
from nltk.probability import FreqDist

In [23]:
tweets = []

# Load the tweet text (column 1) from the CSV export.
# - open with "r" (read-only): the original "r+" needlessly requested
#   write access to a file we never modify.
# - newline="" per the csv module docs: tweets can contain embedded
#   newlines inside quoted fields, and without newline="" the reader
#   may split such records incorrectly.
with open("kanye.csv", "r", encoding="utf-8", newline="") as raw:
    data = csv.reader(raw, delimiter=",")
    next(data)  # skip header row
    for row in data:
        tweets.append(row[1])  # column 1 holds the tweet body

In [24]:
# Collapse every tweet into one normalized string: joined on spaces,
# lowercased for case-insensitive counting, outer whitespace trimmed.
collection = " ".join(tweets).lower().strip()

# Split the combined text into word/punctuation tokens for frequency analysis.
tokenized = nltk.tokenize.word_tokenize(collection)

In [25]:
# Count how often each token occurs across the whole corpus.
fdist = FreqDist(tokenized)
# Prints a one-line summary: number of distinct tokens ("samples")
# and total token count ("outcomes").
print(fdist)
# Bare last expression -> rendered as the cell's Out[] value:
# the 50 most frequent (token, count) pairs, most frequent first.
fdist.most_common(50)


<FreqDist with 2819 samples and 12449 outcomes>
Out[25]:
[('the', 397),
 (':', 356),
 ('.', 352),
 ('i', 319),
 ('and', 297),
 ('!', 279),
 ('to', 273),
 ('https', 182),
 ('of', 164),
 ('http', 162),
 ('a', 152),
 ('my', 150),
 ('you', 147),
 ('in', 126),
 ('@', 124),
 (',', 117),
 ('for', 116),
 ('is', 112),
 ('on', 93),
 ('me', 85),
 ('that', 82),
 ('...', 80),
 ('all', 75),
 ('this', 69),
 ('so', 68),
 ('street', 65),
 ('with', 64),
 ('#', 61),
 ('was', 61),
 ('we', 59),
 ('at', 56),
 ('be', 56),
 ('have', 53),
 ('it', 52),
 ('love', 52),
 ('…', 48),
 ('people', 48),
 ('will', 47),
 ('thank', 43),
 ('one', 43),
 ('not', 43),
 ('new', 42),
 ('by', 42),
 ('just', 41),
 ('album', 41),
 ('do', 40),
 ('now', 38),
 ('like', 37),
 ('but', 36),
 ('i’m', 35)]