In [142]:
# -*- coding: utf-8 -*-
# Exploratory analysis of retweet-count distributions for the Roland Garros
# 2017 Twitter dataset stored in MongoDB, plus a look at word2vec features.
from pymongo import MongoClient
import numpy as np, os
import matplotlib.pyplot as plt

# MongoDB connection settings for the scraped retweeter documents.
hostname = "localhost"; port = 27017
database_name = "twitter-data"
collection_name = "rg-retweeters"

# NOTE(review): machine-specific absolute path — consider reading from an
# environment variable or a relative, configurable location.
DATA_DIR = '/home/nipg1/Documents/summer_project/data/roland_garros_2017'

In [130]:
# Open the MongoDB connection and bind the collection of retweeter documents.
client = MongoClient(hostname, port)
db = client[database_name]
collection = db[collection_name]

In [131]:
# Gather every retweeter's retweet count from the collection and sort
# ascending; N is the total number of retweeters, used as the denominator
# for the percentage printed by get_stat below.
X = np.sort([doc['retweet_count'] for doc in collection.find()])
N = len(X)

In [132]:
def get_stat(X,text, p_size=3, grid=True):
    global N
    print text, '---> percent:', str(float(len(X))/N*100) + '%', ', count:', len(X), ', mean:', np.mean(X), ', std:', np.std(X)
    plt.figure(figsize=(20,10))
    plt.scatter(range(0,len(X)), X,p_size)
    plt.grid(grid)
    plt.show()

In [133]:
# Baseline: the full distribution (percent is 100% by construction).
get_stat(X,'All retweeters')


All retweeters ---> percent: 100.0% , count: 129370 , mean: 2.98549895648 , std: 14.559314276

In [134]:
# ~96% of retweeters retweeted 10 times or fewer.
get_stat(filter(lambda x: x<=10, X), 'x<=10')


x<=10 ---> percent: 96.1041972637% , count: 124330 , mean: 1.69596235824 , std: 1.51864487545

In [135]:
# Fix: the label said 'x<=10' but the filter keeps x<=4 (the printed output
# below this cell shows the mismatch).
get_stat(filter(lambda x: x<=4, X), 'x<=4')


x<=10 ---> percent: 90.1816495323% , count: 116668 , mean: 1.36926149415 , std: 0.745121880509

In [136]:
# Mid-range retweeters: between 11 and 100 retweets (~3.7% of users).
get_stat(filter(lambda x: 10<x<=100, X), '10<x<=100')


10<x<=100 ---> percent: 3.69250985545% , count: 4777 , mean: 25.1155536948 , std: 17.5775217941

In [137]:
# Heavy retweeters: between 101 and 600 retweets (a few hundred users).
get_stat(filter(lambda x: 100<x<=600, X), '100<x<=600')


100<x<=600 ---> percent: 0.198655020484% , count: 257 , mean: 186.338521401 , std: 92.1122126665

In [138]:
# Extreme outliers (>600 retweets); larger marker size since only a handful.
get_stat(filter(lambda x: 600<x, X), '600<x', p_size=15)


600<x ---> percent: 0.0046378604004% , count: 6 , mean: 1251.5 , std: 700.108265437

In [195]:
# Load the precomputed feature matrix from disk.
# NOTE(review): shape/meaning not visible here — presumably one word2vec-style
# embedding per row (len(feature) is 908416 below); confirm against producer.
feature = np.load(os.path.join(DATA_DIR, 'feature.npy'))

In [196]:
# Per-row sum of the feature matrix, sorted ascending. Equivalent to the
# previous sum-then-in-place-.sort(), expressed as a single pure expression.
word2vec_sum = np.sort(np.sum(feature, axis=1))

In [171]:
# NOTE(review): the printed percent (702%) is an artifact — get_stat divides
# by the global N (retweeter count, 129370), not by len of this array (908416).
get_stat(np.abs(word2vec_sum), 'Absolute sum of word2vec vectors')


Absolute sum of word2vec vectors ---> percent: 702.184432249% , count: 908416 , mean: 0.343918 , std: 0.302017

In [170]:
# NOTE(review): duplicate of the previous cell (its stale output below still
# shows an older 'Sum of word2vec vectors' label); safe to delete one copy.
get_stat(np.abs(word2vec_sum), 'Absolute sum of word2vec vectors')


Sum of word2vec vectors ---> percent: 702.184432249% , count: 908416 , mean: 0.343918 , std: 0.302017

In [ ]:


In [175]:
# NOTE(review): third identical call; the mean/std differ from earlier runs
# (1.74e-06 vs 0.34) — `feature` was evidently reloaded between runs (hidden
# kernel state). Re-run top-to-bottom to get consistent outputs.
get_stat(np.abs(word2vec_sum), 'Absolute sum of word2vec vectors')


Absolute sum of word2vec vectors ---> percent: 702.184432249% , count: 908416 , mean: 1.74039e-06 , std: 1.31653e-06

In [176]:
# NOTE(review): fourth identical call — duplicates add no information; keep one.
get_stat(np.abs(word2vec_sum), 'Absolute sum of word2vec vectors')


Absolute sum of word2vec vectors ---> percent: 702.184432249% , count: 908416 , mean: 1.74039e-06 , std: 1.31653e-06

In [178]:
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

In [191]:
pca = PCA(n_components=2)
# NOTE(review): fitting on the TRANSPOSE treats each of the 908416 rows as a
# variable, so pca.components_ has shape (2, n_rows) and is plotted below as
# a 2-D embedding of the rows. Confirm this is intended — the conventional
# embedding would be pca.fit_transform(feature).
pca.fit(np.transpose(feature))


Out[191]:
PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [190]:
# Sanity check: number of rows in the feature matrix.
len(feature)


Out[190]:
908416

In [194]:
# Scatter the two principal directions against each other as a 2-D view of
# the data, using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(pca.components_[0], pca.components_[1], 1)
ax.grid(True)
plt.show()



In [197]:
# NOTE(review): exact duplicate of the previous plotting cell — likely a
# re-run after `feature` was reloaded (execution counts 194 vs 197); delete
# one copy when tidying the notebook.
plt.figure(figsize=(15,15))
plt.scatter(pca.components_[0],pca.components_[1],1)
plt.grid()
plt.show()



In [ ]: