In [142]:
# -*- coding: utf-8 -*-
from pymongo import MongoClient
import numpy as np, os
import matplotlib.pyplot as plt
# MongoDB connection settings used to open the client below.
hostname = "localhost"
port = 27017
database_name = "twitter-data"
collection_name = "rg-retweeters"
# Directory that holds the data files read by this notebook. The default is
# the original machine-specific path; set the RG_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'RG_DATA_DIR',
    '/home/nipg1/Documents/summer_project/data/roland_garros_2017',
)
In [130]:
# Create the MongoDB client from the configured host/port, then resolve the
# database and collection handles by name.
client = MongoClient(host=hostname, port=port)
db = client.get_database(database_name)
collection = db.get_collection(collection_name)
In [131]:
# Collect the 'retweet_count' of every document in the collection, sorted
# ascending. Only that field is requested (projection), so MongoDB does not
# have to ship whole documents for a single value.
X = np.sort([
    retweeter['retweet_count']
    for retweeter in collection.find({}, {'retweet_count': 1, '_id': 0})
])
# Number of collected counts; get_stat() uses it as the percentage base.
N = len(X)
In [132]:
def get_stat(X, text, p_size=3, grid=True, total=None):
    """Print summary statistics for ``X`` and scatter-plot its values by index.

    The printed line contains the label, the share of ``total`` that
    ``len(X)`` represents, the element count, the mean and the standard
    deviation. A 20x10 figure with one point per element is then shown.

    Parameters
    ----------
    X : sequence or iterable of numbers
        Values to summarise. Objects without ``__len__`` (e.g. a Python 3
        ``filter`` object) are materialised into a list first.
    text : str
        Label printed in front of the statistics.
    p_size : number, optional
        Marker size forwarded to ``plt.scatter``.
    grid : bool, optional
        Forwarded to ``plt.grid``.
    total : int, optional
        Denominator of the printed percentage. When omitted, the
        module-level ``N`` is used, as before.

    Returns
    -------
    None
    """
    # One-shot iterables have no len(); turn them into a list so they can be
    # counted, averaged and plotted.
    if not hasattr(X, '__len__'):
        X = list(X)
    if total is None:
        total = N  # module-level count set where the data is loaded
    count = len(X)
    # Avoid ZeroDivisionError when the reference population is empty.
    percent = float(count) / total * 100 if total else float('nan')
    # Single pre-formatted string: prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print('%s ---> percent: %s%% , count: %s , mean: %s , std: %s'
          % (text, percent, count, np.mean(X), np.std(X)))
    plt.figure(figsize=(20, 10))
    plt.scatter(range(count), X, s=p_size)
    plt.grid(grid)
    plt.show()
In [133]:
# Statistics and scatter plot over the complete, sorted array of counts.
get_stat(X, text='All retweeters')
In [134]:
# Entries of X that are at most 10 (boolean-mask selection on the array).
get_stat(X[X <= 10], 'x<=10')
In [135]:
# Entries of X that are at most 4. The label now matches the threshold
# actually applied (it previously read 'x<=10').
get_stat(X[X <= 4], 'x<=4')
In [136]:
# Entries of X strictly above 10 and at most 100.
get_stat(X[(X > 10) & (X <= 100)], '10<x<=100')
In [137]:
# Entries of X strictly above 100 and at most 600.
get_stat(X[(X > 100) & (X <= 600)], '100<x<=600')
In [138]:
# Entries of X strictly above 600, drawn with a larger marker size.
get_stat(X[X > 600], '600<x', p_size=15)
In [195]:
# Load the array stored in feature.npy under DATA_DIR.
feature_path = os.path.join(DATA_DIR, 'feature.npy')
feature = np.load(feature_path)
In [196]:
# Sum `feature` along axis 1, then order the resulting sums ascending.
word2vec_sum = np.sort(feature.sum(axis=1))
In [171]:
# Statistics and plot of the element-wise absolute value of the sorted sums.
# NOTE(review): get_stat reports its percentage relative to the global N,
# which is set from the retweet-count array — confirm that base is
# meaningful for this array.
get_stat(np.absolute(word2vec_sum), text='Absolute sum of word2vec vectors')
In [170]:
# Statistics and plot of the element-wise absolute value of word2vec_sum.
abs_word2vec_sum = np.absolute(word2vec_sum)
get_stat(abs_word2vec_sum, 'Absolute sum of word2vec vectors')
In [ ]:
In [175]:
# Statistics and plot of |word2vec_sum| (element-wise absolute value).
get_stat(X=np.absolute(word2vec_sum), text='Absolute sum of word2vec vectors')
In [176]:
# Statistics and plot of the absolute values of word2vec_sum.
get_stat(
    np.absolute(word2vec_sum),
    'Absolute sum of word2vec vectors',
)
In [178]:
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
In [191]:
# Fit a two-component PCA on the transpose of `feature`.
# NOTE(review): fitting on the transpose presumably treats each original row
# as a PCA feature, and later cells scatter pca.components_ — confirm this is
# intended rather than projecting the rows with fit_transform(feature).
pca = PCA(n_components=2)
pca.fit(feature.T)
Out[191]:
In [190]:
# Size of `feature` along its first axis.
feature.shape[0]
Out[190]:
In [194]:
# Scatter the first fitted PCA component against the second on a 15x15
# figure, using marker size 1.
plt.figure(figsize=(15, 15))
plt.scatter(x=pca.components_[0], y=pca.components_[1], s=1)
plt.grid()
plt.show()
In [197]:
# Scatter pca.components_[0] against pca.components_[1] (marker size 1) on a
# 15x15 figure with the grid toggled.
component_x, component_y = pca.components_[0], pca.components_[1]
plt.figure(figsize=(15, 15))
plt.scatter(component_x, component_y, s=1)
plt.grid()
plt.show()
In [ ]: