Machine Learning types:
In this notebook we:
In [9]:
import time
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from gensim.models import word2vec
from scipy.cluster.hierarchy import ward, dendrogram
from scipy.spatial.distance import squareform
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
In [2]:
def get_quotes(url, timeout=10, delay=3):
    """Return the text of every quote found on a single page.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.
    delay : float, optional
        Seconds to sleep after the download, to throttle successive requests.

    Returns
    -------
    list of str
        Text content of each ``<span class="text">`` element on the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of silently returning no quotes.
    response.raise_for_status()
    page = BeautifulSoup(response.content, "html.parser")
    quotes = [i.get_text() for i in page.find_all("span", class_="text")]
    time.sleep(delay)
    return quotes
# Quotes from the landing page only.
quotes = get_quotes("http://quotes.toscrape.com/")

# Quotes from pages 1-10, flattened into one list.
urls = ["http://quotes.toscrape.com/page/"+str(i)+"/" for i in range(1,11)]
# Materialise the result: a bare chain object is a one-shot iterator that is
# empty after its first use, which breaks any later cell that is re-run.
quotes_all = list(chain.from_iterable(get_quotes(url) for url in urls))
In [3]:
# Fit a TF-IDF model on `quotes` and encode those same quotes as a
# document-by-term matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=quotes)

# Report the matrix dimensions as (documents, vocabulary terms).
n_documents, n_terms = tfidf_matrix.shape
print((n_documents, n_terms))
In [4]:
# Column labels for the TF-IDF matrix, in column order.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when present and fall back to the old method on older installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    features = list(tfidf_vectorizer.get_feature_names_out())
else:
    features = tfidf_vectorizer.get_feature_names()

# Dense copy of the sparse matrix, wrapped in a labelled DataFrame.
data = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(data, columns=features)
In [5]:
# Number of clusters to fit.
k=5

# Fix the seed so cluster labels are reproducible across runs, and set n_init
# explicitly so the result does not depend on the installed scikit-learn
# version's default.
k5 = KMeans(n_clusters=k, n_init=10, random_state=42)
k5.fit(tfidf_matrix)

# One cluster label per quote, in the same order as `quotes`.
clusters = k5.labels_.tolist()
my_dict = {'quotes': quotes, 'cluster': clusters}
df = pd.DataFrame(my_dict)
print(df)

# Cluster sizes (displayed as the cell output).
df.cluster.value_counts()
Out[5]:
In [6]:
# For each cluster, column indices sorted from largest to smallest centroid
# weight.
important_terms = k5.cluster_centers_.argsort()[:, ::-1]

# vocabulary_ maps term -> column index; invert it once so each lookup is a
# dict access instead of a linear search through the vocabulary.
index_to_term = {index: term for term, index in tfidf_vectorizer.vocabulary_.items()}

# Print the five highest-weighted terms of every cluster.
for i in range(k):
    for j in important_terms[i, :5]:
        print("Cluster: ", i, index_to_term[int(j)])
In [10]:
# Square, symmetric matrix of pairwise cosine distances between quotes.
dist = 1 - cosine_similarity(tfidf_matrix)

# ward() treats a 2-D array as raw observation vectors, not as distances, so
# hand it the condensed (upper-triangle) form. checks=False tolerates the tiny
# floating-point noise that can appear on the diagonal.
# NOTE(review): Ward linkage is formulated for Euclidean distances; cosine
# distance is kept here to match the original analysis - confirm it is intended.
condensed_dist = squareform(dist, checks=False)
linkage_matrix = ward(condensed_dist)

# Horizontal dendrogram with one leaf per quote, saved to disk.
plt.subplots(figsize=(15, 20))
dendrogram(linkage_matrix, orientation="right", labels=quotes)
plt.savefig('clusters.png')
In [11]:
# Whitespace-tokenise every scraped quote.
tokenized_sentences = [sentence.split() for sentence in quotes_all]

# quotes_all may be a one-shot iterator; if it has already been consumed the
# corpus is empty, so stop with a clear message before training.
if not tokenized_sentences:
    raise ValueError(
        "quotes_all produced no sentences; re-run the scraping cell before training"
    )

# A fixed seed plus a single worker thread removes the run-to-run jitter of
# multi-threaded training.
model = word2vec.Word2Vec(tokenized_sentences, min_count=1, seed=1, workers=1)
In [12]:
# Probe words for the embedding queries: w1 and w2 are compared with each
# other, and w3 (the same word as w1) is used for the nearest-neighbour lookup.
w1, w2 = "world", "man"
w3 = w1
In [13]:
# Similarity score between the two probe words.
pair_similarity = model.wv.similarity(w1, w2)
print(pair_similarity)
print("\n")

# Nearest neighbours of w3 in the embedding space (displayed as the cell output).
model.wv.most_similar(w3)
Out[13]: