In [23]:
from setup_notebooks import *
%matplotlib inline
In [24]:
# Widen the notebook to the full browser width and tune pandas table rendering.
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 12)  # truncate long frames to 12 rows
pd.set_option('display.max_columns', 200)  # but allow wide frames to show fully
In [25]:
from gensim.models import TfidfModel, LsiModel
from gensim.corpora import Dictionary
In [26]:
# Load the pre-split tweet data: timestamps, numeric features, and text.
# NOTE(review): engine='python' is presumably for gzip handling on older pandas — confirm.
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
df = pd.read_csv(os.path.join(DATA_PATH, 'text.csv.gz'))
So the new things are LsiModel and scatmat
In [27]:
# Peek at the saved tokens column.
df.tokens
Out[27]:
Load cleaned tweet data
Don't forget to fix up the tokens!
Can you think of a better way to save a list of lists of strings?
What about the raw, unprocessed unicode tweet text itself?
In [28]:
# Build a gensim Dictionary (token <-> integer id map) from whitespace-split tweet text.
vocab = Dictionary(df.txt.str.split())
print(vocab)
In [29]:
# Fit TFIDF weights from the Dictionary's document-frequency statistics.
tfidf = TfidfModel(id2word=vocab, dictionary=vocab)
tfidf.num_docs  # number of documents the model was fit on
Out[29]:
In [30]:
# One sparse bag-of-words -- a list of (token_id, count) pairs -- per tweet.
token_lists = df.txt.str.split()
bows = pd.Series([vocab.doc2bow(token_list) for token_list in token_lists])
bows
Out[30]:
This would make a nice, compact sparse matrix representation of our entire corpus...
Which would mean we could do more in RAM at once.
Left as an exercise (check out scipy.sparse.coo_matrix).
In [14]:
# Forward lookup: integer id assigned to a token.
vocab.token2id['publishes']
Out[14]:
In [16]:
# Reverse lookup: token for id 0.
vocab[0]
Out[16]:
In [17]:
# Id for the lowercase token.
vocab.token2id['python']
Out[17]:
In [18]:
# Case matters: 'Python' gets a different id than 'python'.
vocab.token2id['Python']
Out[18]:
In [20]:
# Human-readable TFIDF vector for the first tweet: {token: weight}, rounded to 2 places.
{vocab[term_id]: round(weight, 2) for term_id, weight in tfidf[bows[0]]}
Out[20]:
Notice how "you" didn't get as much weight as "enjoy"
Let's look at some other tweets
In [31]:
# NOTE(review): LsiModel was already imported at the top of the notebook — this re-import is redundant.
from gensim.models import LsiModel
# Truncated SVD over the TFIDF-weighted corpus: compress the ~100k term
# dimensions down to 100 latent topics.
lsi = LsiModel(tfidf[bows], num_topics=100, id2word=vocab, extra_samples=100, power_iters=2)
# lsi = LsiModel.load(os.path.join(DATA_PATH, 'lsi'))
len(lsi.id2word)  # size of the vocabulary the model maps over
Out[31]:
This is starting to look a lot like a set of vectors that we could use as features
But wait, if I used the IDs as the vector index (column) numbers, how many features or "columns" would I have?
In [32]:
# Number of distinct tokens — the dimensionality of a raw TFIDF vector.
len(vocab)
Out[32]:
100k dimensions isn't a good idea
Even for a massively parallel deep learning project this would be big
Like the cat/dog picture classification on 256x256 images
What about PCA (Principal Component Analysis) like is used on images?
In NLP this flavor of PCA is called LSI (Latent Semantic Indexing), also known as LSA (Latent Semantic Analysis)
That sounds cool!
I want me some latent semantics (hidden meaning)
In [36]:
# Persist the 100-topic model so later sessions can LsiModel.load() it.
lsi.save(os.path.join(DATA_PATH, 'lsi'))
In [69]:
# A tiny 4-topic model, small enough to visualize in a scatter matrix.
# NOTE(review): this is trained on raw bag-of-words counts (`bows`), not
# `tfidf[bows]` like the 100-topic model above — confirm that's intentional.
lsi4 = LsiModel(bows, num_topics=4, id2word=vocab, extra_samples=100, power_iters=2)
lsi4.save(os.path.join(DATA_PATH, 'lsi4'))
lsi4
Out[69]:
In [41]:
# Project every tweet into the 4-topic space and tabulate it.
topics = lsi4[bows]
# Fix: lsi4 has num_topics=4, so topic ids run 0..3 — columns=range(4).
# The original range(5) silently added a spurious all-NaN 5th column.
df_topics = pd.DataFrame([dict(d) for d in topics], index=df.index, columns=range(4))
In [ ]:
# Fix: the topic frame built above is `df_topics`; `df_topics4` is never
# defined anywhere in this notebook, so the original line raised NameError
# on a fresh kernel. Label each tweet 1 if it got any favorites, else 0.
df_topics['favorites'] = (nums.favorite_count > 0).astype(int)
In [60]:
# Fix: use `df_topics` — `df_topics4` was never created (NameError on a fresh run).
# The 0.13th power compresses the heavy-tailed favorite counts into a few small
# integer bins suitable for coloring a scatter matrix.
df_topics['favorites'] = np.ceil(nums.favorite_count ** .13).astype(int)
df_topics
Out[60]:
In [68]:
# Scatter matrix of the 4 topic dimensions.
# NOTE(review): `scatmat` presumably comes from `setup_notebooks` — confirm.
scatmat(df_topics, num_topics=4)
In [1]:
# Rebuild the whole pipeline from disk (useful after a kernel restart).
# Fix: `pd.DataFrame.from_csv` was deprecated in pandas 0.21 and removed in
# 1.0; `pd.read_csv(..., index_col=0)` is the supported equivalent.
df = pd.read_csv(os.path.join(DATA_PATH, 'text.csv.gz'), index_col=0)
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), compression='gzip', engine='python')
vocab = Dictionary.from_documents(([str(s) for s in row] for row in df.txt.str.split()))
tfidf = TfidfModel(id2word=vocab, dictionary=vocab)
lsi = LsiModel.load(os.path.join(DATA_PATH, 'lsi'))
bows = pd.Series(vocab.doc2bow(toks) for toks in df.txt.str.split())
topics = lsi[bows]
In [71]:
# Peek at the topic-vector frame.
df_topics
Out[71]:
In [73]:
# Scatter matrix over ALL topic dimensions — this is heavy; see the fan warning below.
scatmat(df_topics)
What's that sound I hear?
That's the sound of your fans blowing hot air out of those tweets!
(check out your system monitor or htop)
In [28]:
# Topic vectors for the first 6 tweets: one row per tweet, one column per topic.
tweetids = pd.Series(range(6), name='tweet')
topicids = pd.Series(range(lsi.num_topics), name='topic')
# x is a (topic_id, weight) pair; x[1] keeps just the weight for each topic.
pd.DataFrame([pd.Series([x[1] for x in lsi[bows[i]]], index=topicids,
name='tweet') for i in tweetids],
index=tweetids)
Out[28]:
In [30]:
lsi.save(os.path.join(DATA_PATH, 'lsi'))
# NOTE(review): `lsi5` is not defined in any earlier cell of this notebook —
# this line only works on leftover kernel state (it is *loaded* in the next
# cell, after this save). It will fail under Restart & Run All.
lsi5.save(os.path.join(DATA_PATH, 'lsi5'))
In [5]:
# Reload the saved model (presumably 5 topics, per the filename — confirm).
lsi5 = LsiModel.load(os.path.join(DATA_PATH, 'lsi5'))
In [9]:
# for topic in lsi.show_topics():
# print(topic)
# The 8 highest-weighted terms in topic 0.
lsi5.show_topic(0, 8)
Out[9]:
In [10]:
# NOTE(review): exact duplicate of the previous cell — one of the two can be deleted.
lsi5.show_topic(0, 8)
Out[10]:
In [31]:
# Build the full tweets-by-topics matrix for the entire corpus.
tweetids = pd.Series(range(len(bows)), name='tweet')
topicids = pd.Series(range(lsi.num_topics), name='topic')
# `dict()` keeps track of the columns for each topic, in case the lsi model shuffles or skips topics for odd tweets
# NOTE(review): this rebinds `df` from the raw text frame to the topic matrix;
# later cells expecting text columns on `df` will break.
df = pd.DataFrame([pd.Series(dict(lsi[bows[i]]), name='tweet') for i in tweetids],
columns=topicids,
index=tweetids)
In [32]:
# `df` now holds topic vectors (rebound in the previous cell), not tweet text.
df
Out[32]:
What's with the 1.43?
Aren't they normalized?
... Nope
In [12]:
# First 5 topic columns, every 100th tweet — keeps the scatter matrix fast.
scatmat(df[df.columns[:5]][::100])
In [ ]:
# Fix: `num` is not defined anywhere in this notebook — the numeric frame
# loaded earlier is `nums`; the bare `num` raised NameError.
nums
In [ ]:
# Persist the tweet-by-topic matrix.
# Fix: `pd.io.common.csv` is a private alias removed from modern pandas — use
# the stdlib csv module's quoting constant. Passing the path (not a binary
# handle) lets pandas infer gzip compression from the .gz suffix and avoids
# the Python 3 text/bytes mismatch of to_csv on a 'wb' file object.
import csv
df.to_csv(os.path.join(DATA_PATH, 'tweet_topic_vectors.csv.gz'),
          encoding='utf8', quoting=csv.QUOTE_NONNUMERIC)
We built LSI topic vectors for 200k tweets in a few minutes!
Let's look at the TFIDF vectors for the top 6 tweets
In [10]:
# TFIDF vectors for the first 6 tweets: one row per tweet, words as columns.
# Blank out missing entries for readability.
weight_rows = [{vocab[term_id]: weight for term_id, weight in tfidf[bows[doc]]}
               for doc in range(6)]
tfidf6 = pd.DataFrame(weight_rows).fillna('')
tfidf6
Notice the small weights on the word "Python"? Why do you think that is? (Think back to the definitions of TF and DF and TFIDF.)
Now let's see how far apart they are based only on word frequency (TFIDF)
We'll "project" the first tweet onto the second with a dot product
to see how much of a "shadow" they make on each other
In [ ]:
# Rebuild the 6-tweet TFIDF table with numeric zeros (not '') so dot products
# work, then transpose so each tweet is a column vector.
weight_rows = [{vocab[term_id]: weight for term_id, weight in tfidf[bows[doc]]}
               for doc in range(6)]
tfidf6 = pd.DataFrame(weight_rows).fillna(0).T
In [ ]:
# Overlap ("shadow") of tweet 0 projected onto tweet 1.
np.dot(tfidf6[0], tfidf6[1])
In [ ]:
# Tweets 1 and 2 share the token "Python" (see the note below).
np.dot(tfidf6[1], tfidf6[2])
That looks about right.
The first 2 share no words.
The second 2 share only "Python".
But lets do the cosine similarity correctly by normalizing for length.
In [ ]:
# Proper cosine similarity: normalize the dot product by both vector lengths.
np.dot(tfidf6[1], tfidf6[2]) / np.linalg.norm(tfidf6[1]) / np.linalg.norm(tfidf6[2])
Hmmm, nothing changed
Can you guess why?
In [ ]:
# TFIDF overlap for each adjacent pair among the 6 tweets.
[round(np.dot(tfidf6[i], tfidf6[i+1]), 4) for i in range(5)]
In [ ]:
Now let's look at the topic vectors.
In [125]:
# Topic vectors for the same 6 tweets, for side-by-side comparison with TFIDF.
df.iloc[:6]
Out[125]:
In [122]:
# Raw (unnormalized) topic-vector dot products for adjacent tweet pairs.
print([round(np.dot(df.T[i], df.T[i+1]), 4) for i in range(5)])
Better normalize these...
In [123]:
# Cosine similarity of adjacent tweet pairs in LSI topic space.
print([round(np.dot(df.T[i], df.T[i+1]) / np.linalg.norm(df.T[i]) / np.linalg.norm(df.T[i+1]), 4) for i in range(5)])
# for comparison the TFIDF scores right below
print([round(np.dot(tfidf6[i], tfidf6[i+1]), 4) for i in range(5)])
So the really chummy neighbors are 1 & 2 and 3 & 4
Surprisingly 2 & 3 didn't hit it off, and no pairing got a zero!
And the last 2 seem to share a "latent" similarity that TFIDF missed entirely!!!
And LSI picked up on the python<->Python similarity (tweets 0 and 1)
In [133]:
# Reload the raw tweet text.
# Fix: `pd.DataFrame.from_csv` was removed from pandas; `read_csv` with
# index_col=0 is the supported equivalent, and it decompresses .gz paths
# itself — no manual gzip.open needed.
text = pd.read_csv(os.path.join(DATA_PATH, 'text.csv.gz'), index_col=0, encoding='utf8')
In [188]:
# Show the cleaned tokens alongside the original raw text for the first 6 tweets.
for cleaned_tokens, raw_tweet in zip(text.txt.iloc[:6], text.text.iloc[:6]):
    print(cleaned_tokens)
    print(raw_tweet)
    print('-' * 10)
What about a new tweet you are considering?
Notice how I changed the token spelling (BOW),
but not the "semantics" of the tweet.
In [169]:
# Project a brand-new tweet into TFIDF space and then into LSI topic space.
tweet = 'I want to help build django with a job in Chicago'
# Sparse (token_id, count) pairs using the existing vocabulary.
tweet_bow = vocab.doc2bow(tweet.split())
# TFIDF-weighted sparse vector.
tweet_tfidf = tfidf[tweet_bow]
# Dense topic vector, indexed by topic id.
tweet_topics = pd.Series(dict(lsi[tweet_tfidf]))
# Now that the math is done let's convert to a friendlier format with words as the keys/index
tweet_tfidf = pd.Series(dict([(vocab[i], x) for (i, x) in tweet_tfidf]))
print('\nLSI Topic Vector')
tweet_topics
Out[169]:
Compare the topic vector above to the TFIDF vector below.
What's better about TFIDF compared to topic vectors?
What can we do about it?
In [170]:
# The same tweet's word-frequency (TFIDF) representation, for comparison.
print('TFIDF Frequency Vector')
print(tweet_tfidf)
Which one is it closest to?
Can you guess?
Does LSI understand the words as well as you do?
In [167]:
# Cosine similarity between the new tweet's topic vector and each of the first 6 tweets.
print('LSI Topic Similarity')
print([round(np.dot(df.T[i], tweet_topics) / np.linalg.norm(df.T[i]) / np.linalg.norm(tweet_topics), 4) for i in range(6)])
In [184]:
# Append the new tweet as column 6 of the TFIDF table (copy first so tfidf6 is untouched).
tfidf7 = tfidf6.copy()
tfidf7[6] = tweet_tfidf
tfidf7 = tfidf7.fillna(0)
tfidf7
Out[184]:
In [ ]:
In [186]:
# TFIDF overlap of the new tweet (column 6) with each of the original 6 tweets.
print([round(np.dot(tfidf7[i], tfidf7[6]), 4) for i in range(6)])
In [187]:
# The raw text of the new tweet, for reference while answering the questions below.
tweet
Out[187]:
Can you find the one word I accidentally share with the other tweets?
Hint: use the TFIDF matrix (Dataframe)
Play around with the tweet text to make its topic vector more "orthogonal"
Or make it closer in cosine distance.