Project: Fletcher
Date: 03/10/2017
Name: Prashant Tatineni
In this project, I use Twitter data to highlight the key issues being discussed around the Demonetization event in India, announced on November 8, 2016. Specifically, I use an existing dataset of tweets tagged #demonetization, downloaded from Kaggle and collected in late November 2016, shortly after Demonetization was announced. For comparison, I also use tweets with the same tag downloaded via the Twitter API in March 2017.
In [63]:
# imports
import collections
import pickle
import re
import unicodedata

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import requests
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from requests_oauthlib import OAuth1
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from textblob import TextBlob

%matplotlib inline
In [ ]:
import cnfg
config = cnfg.load(".twitter_config")
oauth = OAuth1(config["consumer_key"],
               config["consumer_secret"],
               config["access_token"],
               config["access_token_secret"])
In [ ]:
df = pd.read_csv('data/raw/demonetization-tweets.csv')
In [65]:
df.head()
Out[65]:
In [ ]:
# RESTful Search API: collect tweets tagged #demonetisation
all_tweets = []
search_url = "https://api.twitter.com/1.1/search/tweets.json"
parameters = {"q": "#demonetisation", "count": 100, "lang": "en"}
response = requests.get(search_url,
                        params=parameters,
                        auth=oauth)
for tweet in response.json()['statuses']:
    all_tweets.append(tweet['text'])

# Page through up to 99 more result pages via the 'next_results' cursor
for _ in range(99):
    if 'next_results' in response.json()['search_metadata']:
        next_page_url = search_url + response.json()['search_metadata']['next_results']
        response = requests.get(next_page_url, auth=oauth)
        for tweet in response.json()['statuses']:
            all_tweets.append(tweet['text'])
    else:
        break
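The standard Search API is rate-limited (roughly 180 requests per 15-minute window with user auth), so a long crawl like this can start failing mid-loop. A minimal sketch of a wait-and-retry wrapper, reusing the `oauth` object above; the `get_with_backoff` helper name is hypothetical:
In [ ]:
import time

def get_with_backoff(url, **kwargs):
    """GET with a simple wait-and-retry on HTTP 429 (rate limit exceeded)."""
    while True:
        response = requests.get(url, **kwargs)
        if response.status_code != 429:
            return response
        # Sleep until the window resets (header value is a Unix timestamp)
        reset_at = int(response.headers.get('x-rate-limit-reset', time.time() + 60))
        time.sleep(max(reset_at - time.time(), 1))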
In [ ]:
X = pd.DataFrame(all_tweets)
In [ ]:
with open('data/new_tweets.pkl', 'wb') as picklefile:
    pickle.dump(X, picklefile)
In [67]:
def tokenize_func(text):
    # Strip the leading "RT @user:" prefix from retweets
    if text[:2] == 'RT':
        text = text.partition(':')[2]
    tokens = nltk.word_tokenize(text)
    filtered_tokens = []
    for token in tokens:
        # Keep alphabetic characters only, then drop tag words and URL debris
        token = re.sub('[^A-Za-z]', '', token).strip()
        if token not in ['demonetization', 'demonetisation', 'https', 'amp', 'rt', '']:
            filtered_tokens.append(token)
    return filtered_tokens
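A quick sanity check of the tokenizer on a made-up retweet. (Called directly here; inside the vectorizer the text is lowercased before tokenizing, which is why the filter list is all lowercase.)
In [ ]:
tokenize_func('RT @someuser: Long queues outside banks again #demonetisation https://t.co/abc')
# -> roughly ['Long', 'queues', 'outside', 'banks', 'again', 'tcoabc']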
In [68]:
# Keep terms appearing in between 3% and 97% of tweets; unigrams through trigrams
tfidf_vectorizer = TfidfVectorizer(max_df=0.97, min_df=0.03, max_features=200000,
                                   stop_words='english', decode_error='ignore',
                                   tokenizer=tokenize_func, ngram_range=(1,3))
In [69]:
%time tfidf_matrix = tfidf_vectorizer.fit_transform(df['text']) #fit the vectorizer to tweets
In [70]:
print(tfidf_matrix.shape)
In [71]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)
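TF-IDF rows are nonnegative and L2-normalized, so cosine similarity lands in [0, 1] and `dist` is a dense pairwise-distance matrix with zeros on the diagonal. A toy check:
In [ ]:
# Identical rows -> distance 0; orthogonal rows -> distance 1
toy = np.array([[1., 0.], [1., 0.], [0., 1.]])
1 - cosine_similarity(toy)
# -> [[0, 0, 1], [0, 0, 1], [1, 1, 0]]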
In [12]:
Inertia = []
Sil_coefs = []
for k in range(2,10):
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(tfidf_matrix)
    labels = km.labels_
    Sil_coefs.append(silhouette_score(tfidf_matrix, labels, metric='euclidean'))
    Inertia.append(km.inertia_)
In [17]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,5), sharex=True)
k_clusters = range(2,10)
# Silhouette score by number of clusters
ax1.plot(k_clusters, Sil_coefs)
ax1.set_xlabel('number of clusters')
ax1.set_ylabel('silhouette score')
# Inertia (elbow plot) by number of clusters
ax2.plot(k_clusters, Inertia)
ax2.set_xlabel('number of clusters')
ax2.set_ylabel('inertia');
In [72]:
km = KMeans(n_clusters=3)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()
In [74]:
collections.Counter(km.labels_)
Out[74]:
In [75]:
pca = PCA(n_components=2)
km_pca = pca.fit_transform(dist)
xs, ys = km_pca[:,0], km_pca[:,1]
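PCA on the dense cosine-distance matrix gives a quick 2-D layout (it behaves much like classical multidimensional scaling), but it requires materializing an n-by-n matrix. A sketch of an alternative that stays sparse, using TruncatedSVD (the usual LSA route in scikit-learn) on the TF-IDF matrix directly; `xs_alt`/`ys_alt` are hypothetical names for the alternative coordinates:
In [ ]:
from sklearn.decomposition import TruncatedSVD

# Project the sparse TF-IDF matrix straight down to 2 components (no densifying)
svd = TruncatedSVD(n_components=2, random_state=42)
svd_coords = svd.fit_transform(tfidf_matrix)
xs_alt, ys_alt = svd_coords[:, 0], svd_coords[:, 1]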
In [95]:
fig, ax = plt.subplots()
plt.scatter(xs,ys,c=clusters, cmap='Accent')
plt.colorbar(ticks=[0,1,2])
ax.tick_params(axis='x',bottom='off',labelbottom='off')
ax.tick_params(axis='y',left='off',labelleft='off')
In [77]:
# Sort each centroid's term weights in descending order to get top terms per cluster
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
In [78]:
terms = tfidf_vectorizer.get_feature_names()
In [98]:
for i in range(3):
    print('Cluster----------', i)
    for x in order_centroids[i, :10]:
        print(terms[x])
In [81]:
df['clusters'] = clusters
In [104]:
df[df.clusters == 1]['screenName'][4340]
Out[104]:
In [106]:
# Polarity in [-1, 1]; unicode(..., errors='ignore') guards against bad bytes (Python 2)
df['sentiment'] = df['text'].apply(lambda x: TextBlob(unicode(x, errors='ignore')).sentiment[0])
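TextBlob's `.sentiment` returns a (polarity, subjectivity) pair from its pattern-based lexicon; a made-up example:
In [ ]:
TextBlob('The queues at the bank were terrible today').sentiment
# -> roughly Sentiment(polarity=-1.0, subjectivity=1.0)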
In [108]:
fig, ax = plt.subplots()
plt.scatter(xs,ys, c=df['sentiment'], cmap='cool')
plt.colorbar()
ax.tick_params(axis='x',bottom='off',labelbottom='off')
ax.tick_params(axis='y',left='off',labelleft='off')
In [117]:
df.sort_values('sentiment')['screenName'][6402]
Out[117]:
In [120]:
with open("data/new_tweets.pkl", 'rb') as picklefile:
new_tweets = pickle.load(picklefile)
In [3]:
new_tweets
Out[3]:
In [121]:
new_tweets.columns = ['text']
In [122]:
%time tfidf2 = tfidf_vectorizer.fit_transform(new_tweets['text']) # refit the vectorizer on the March 2017 tweets
In [123]:
print(tfidf2.shape)
In [124]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf2)
In [125]:
km = KMeans(n_clusters=3)
km.fit(tfidf2)
clusters = km.labels_.tolist()
In [126]:
pca = PCA(n_components=2)
km_pca = pca.fit_transform(dist)
xs, ys = km_pca[:,0], km_pca[:,1]
In [127]:
fig, ax = plt.subplots()
plt.scatter(xs,ys,c=clusters, cmap='Accent')
plt.colorbar(ticks=[0,1,2])
ax.tick_params(axis='x',bottom='off',labelbottom='off')
ax.tick_params(axis='y',left='off',labelleft='off')
In [128]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
In [129]:
terms = tfidf_vectorizer.get_feature_names()
In [130]:
for i in range(3):
    print('Cluster----------', i)
    for x in order_centroids[i, :10]:
        print(terms[x])
In [26]:
collections.Counter(km.labels_)
Out[26]:
In [132]:
new_tweets['cluster'] = clusters
In [133]:
new_tweets[new_tweets.cluster == 1]
Out[133]:
In [134]:
new_tweets['text'][1137]
Out[134]:
In [144]:
new_tweets['sentiment'] = new_tweets['text'].apply(lambda x: TextBlob(x).sentiment[0])
In [145]:
fig, ax = plt.subplots()
plt.scatter(xs,ys, c=new_tweets['sentiment'], cmap='cool')
plt.colorbar()
ax.tick_params(axis='x',bottom='off',labelbottom='off')
ax.tick_params(axis='y',left='off',labelleft='off')