In [1]:
# -*- coding: utf-8 -*-
import os
import logging

import numpy as np
import gensim
from gensim import corpora, models

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

database_name = "twitter-data"
collection_name = "roland_garros_2017"

DATA_DIR = os.path.join('/home/nipg1/Documents/summer_project/data', collection_name)
LOG_DIR = os.path.join(DATA_DIR, 'embedding_logs')

In [2]:
feature = np.load(os.path.join(DATA_DIR, 'feature.npy'))  # one 300-d vector per tweet
print(len(feature))


908416
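
Side note (mine, not part of the original run): at 908416 × 300 the array is on the order of 1 GB in float32 and 2 GB in float64. If memory is tight, np.load can memory-map the file so only the slices actually used are read from disk. A minimal sketch (feature_mm is a name introduced here):

In [ ]:
# Memory-map instead of loading everything into RAM.
# mmap_mode='r' is a standard np.load option; slicing then reads lazily.
feature_mm = np.load(os.path.join(DATA_DIR, 'feature.npy'), mmap_mode='r')
X_sub = np.asarray(feature_mm[:100000])  # materialize only the slice we need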

In [3]:
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

In [4]:
# Small hand-made toy matrix from the original run; it is never used below.
test = [[1, 1, 1, 6, 89, 84, 64, 23, 321],
        [1, 2, 56, 64, 96, 100, 56, 31, 150],
        [1, -1, 1, 1, 100, 80, 13, 30, 200]]

In [5]:
X = feature[:100000]  # first 100k vectors only, to keep DBSCAN tractable
print(X.shape)


(100000, 300)

In [ ]:
# fit_predict returns one integer label per row of X; noise points get -1.
# eps=1 and min_samples=10 are guesses; see the k-distance sketch below.
db = DBSCAN(eps=1, min_samples=10).fit_predict(X)
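
A common heuristic for picking eps (my addition, not part of the original notebook) is the k-distance "knee" plot: sort every point's distance to its min_samples-th nearest neighbor and look for the elbow; eps should sit near it. A minimal sketch, assuming X from above:

In [ ]:
# k-distance plot for choosing DBSCAN's eps.
# NearestNeighbors and kneighbors are standard scikit-learn API.
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=10).fit(X)  # 10 matches min_samples
dists, _ = nn.kneighbors(X)                   # shape (100000, 10)
plt.plot(np.sort(dists[:, -1]))               # sorted distance to 10th neighbor
plt.xlabel('points, sorted by distance')
plt.ylabel('distance to 10th nearest neighbor')
plt.show()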

In [ ]:
print(len(db))  # one label per row of X, so 100000
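
More informative than the label count (again my addition) is how many clusters DBSCAN found and how much it wrote off as noise:

In [ ]:
# DBSCAN marks noise with label -1, so exclude it when counting clusters.
n_clusters = len(set(db)) - (1 if -1 in db else 0)
n_noise = int(np.sum(db == -1))
print('clusters: %d, noise points: %d' % (n_clusters, n_noise))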

In [ ]:
pca = PCA(n_components=2)  # 2-D projection, purely for visualisation
pca.fit(X)
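
Worth checking (not in the original run) how much variance two components actually keep; for 300-d embeddings it is usually small, so the scatter plot below is only a rough picture:

In [ ]:
# explained_variance_ratio_ is a standard attribute of a fitted PCA.
print(pca.explained_variance_ratio_)        # per-component share
print(pca.explained_variance_ratio_.sum())  # total kept by the 2 components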

In [ ]:
import matplotlib.pyplot as plt

In [ ]:
# pca.components_ holds the principal axes (shape (2, 300)), not the projected
# data, so its 300 columns could never match the 100000 labels in db.
# Project X with transform() before plotting.
X_2d = pca.transform(X)
plt.figure(figsize=(15, 15))
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=db, s=2)  # small markers for 100k points
plt.show()
