In [142]:
# -*- coding: utf-8 -*-
# Exploratory analysis of retweet-count distributions for the Roland Garros
# 2017 Twitter dataset stored in MongoDB, plus a look at word2vec features.
from pymongo import MongoClient
import numpy as np, os
import matplotlib.pyplot as plt

# MongoDB connection settings for the scraped retweeter documents.
hostname = "localhost"; port = 27017
database_name = "twitter-data"
collection_name = "rg-retweeters"

# NOTE(review): machine-specific absolute path — consider reading from an
# environment variable or a relative, configurable location.
DATA_DIR = '/home/nipg1/Documents/summer_project/data/roland_garros_2017'

In [130]:
# Open the MongoDB connection and bind the collection of retweeter documents.
client = MongoClient(hostname, port)
db = client[database_name]
collection = db[collection_name]

In [131]:
# Gather every retweeter's retweet count from the collection and sort
# ascending; N is the total number of retweeters, used as the denominator
# for the percentage printed by get_stat below.
X = np.sort([doc['retweet_count'] for doc in collection.find()])
N = len(X)

In [132]:
def get_stat(X,text, p_size=3, grid=True):
    global N
    print text, '---> percent:', str(float(len(X))/N*100) + '%', ', count:', len(X), ', mean:', np.mean(X), ', std:', np.std(X)
    plt.figure(figsize=(20,10))
    plt.scatter(range(0,len(X)), X,p_size)
    plt.grid(grid)
    plt.show()

In [133]:
# Baseline: the full distribution (percent is 100% by construction).
get_stat(X,'All retweeters')


All retweeters ---> percent: 100.0% , count: 129370 , mean: 2.98549895648 , std: 14.559314276

In [134]:
# ~96% of retweeters retweeted 10 times or fewer.
get_stat(filter(lambda x: x<=10, X), 'x<=10')


x<=10 ---> percent: 96.1041972637% , count: 124330 , mean: 1.69596235824 , std: 1.51864487545

In [135]:
# Fix: the label said 'x<=10' but the filter keeps x<=4 (the printed output
# below this cell shows the mismatch).
get_stat(filter(lambda x: x<=4, X), 'x<=4')


x<=10 ---> percent: 90.1816495323% , count: 116668 , mean: 1.36926149415 , std: 0.745121880509

In [136]:
# Mid-range retweeters: between 11 and 100 retweets (~3.7% of users).
get_stat(filter(lambda x: 10<x<=100, X), '10<x<=100')


10<x<=100 ---> percent: 3.69250985545% , count: 4777 , mean: 25.1155536948 , std: 17.5775217941

In [137]:
# Heavy retweeters: between 101 and 600 retweets (a few hundred users).
get_stat(filter(lambda x: 100<x<=600, X), '100<x<=600')


100<x<=600 ---> percent: 0.198655020484% , count: 257 , mean: 186.338521401 , std: 92.1122126665

In [138]:
# Extreme outliers (>600 retweets); larger marker size since only a handful.
get_stat(filter(lambda x: 600<x, X), '600<x', p_size=15)


600<x ---> percent: 0.0046378604004% , count: 6 , mean: 1251.5 , std: 700.108265437

In [195]:
# Load the precomputed feature matrix from disk.
# NOTE(review): shape/meaning not visible here — presumably one word2vec-style
# embedding per row (len(feature) is 908416 below); confirm against producer.
feature = np.load(os.path.join(DATA_DIR, 'feature.npy'))

In [196]:
# Per-row sum of the feature matrix, sorted ascending. Equivalent to the
# previous sum-then-in-place-.sort(), expressed as a single pure expression.
word2vec_sum = np.sort(np.sum(feature, axis=1))

In [171]:
# NOTE(review): the printed percent (702%) is an artifact — get_stat divides
# by the global N (retweeter count, 129370), not by len of this array (908416).
get_stat(np.abs(word2vec_sum), 'Absolute sum of word2vec vectors')


Absolute sum of word2vec vectors ---> percent: 702.184432249% , count: 908416 , mean: 0.343918 , std: 0.302017

In [170]:
# NOTE(review): duplicate of the previous cell (its stale output below still
# shows an older 'Sum of word2vec vectors' label); safe to delete one copy.
get_stat(np.abs(word2vec_sum), 'Absolute sum of word2vec vectors')


Sum of word2vec vectors ---> percent: 702.184432249% , count: 908416 , mean: 0.343918 , std: 0.302017

In [ ]:


In [175]:
# NOTE(review): third identical call; the mean/std differ from earlier runs
# (1.74e-06 vs 0.34) — `feature` was evidently reloaded between runs (hidden
# kernel state). Re-run top-to-bottom to get consistent outputs.
get_stat(np.abs(word2vec_sum), 'Absolute sum of word2vec vectors')


Absolute sum of word2vec vectors ---> percent: 702.184432249% , count: 908416 , mean: 1.74039e-06 , std: 1.31653e-06

In [176]:
# NOTE(review): fourth identical call — duplicates add no information; keep one.
get_stat(np.abs(word2vec_sum), 'Absolute sum of word2vec vectors')


Absolute sum of word2vec vectors ---> percent: 702.184432249% , count: 908416 , mean: 1.74039e-06 , std: 1.31653e-06

In [178]:
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

In [191]:
pca = PCA(n_components=2)
# NOTE(review): fitting on the TRANSPOSE treats each of the 908416 rows as a
# variable, so pca.components_ has shape (2, n_rows) and is plotted below as
# a 2-D embedding of the rows. Confirm this is intended — the conventional
# embedding would be pca.fit_transform(feature).
pca.fit(np.transpose(feature))


Out[191]:
PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [190]:
# Sanity check: number of rows in the feature matrix.
len(feature)


Out[190]:
908416

In [194]:
# Scatter the two principal directions against each other as a 2-D view of
# the data, using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(pca.components_[0], pca.components_[1], 1)
ax.grid(True)
plt.show()



In [197]:
# NOTE(review): exact duplicate of the previous plotting cell — likely a
# re-run after `feature` was reloaded (execution counts 194 vs 197); delete
# one copy when tidying the notebook.
plt.figure(figsize=(15,15))
plt.scatter(pca.components_[0],pca.components_[1],1)
plt.grid()
plt.show()



In [ ]: