In [1]:
# -*- coding: utf-8 -*-
import os
import logging

import numpy as np
import gensim
from gensim import corpora, models

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

database_name = "twitter-data"
collection_name = "roland_garros_2017"

DATA_DIR = os.path.join('/home/nipg1/Documents/summer_project/data', collection_name)
LOG_DIR = os.path.join(DATA_DIR, 'embedding_logs')

In [2]:
feature = np.load(os.path.join(DATA_DIR, 'feature.npy'))  # one 300-d vector per tweet
print(len(feature))


908416
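
Side note (mine, not part of the original run): at 908416 × 300 the array is on the order of 1 GB in float32 and 2 GB in float64. If memory is tight, np.load can memory-map the file so only the slices actually used are read from disk. A minimal sketch (feature_mm is a name introduced here):

In [ ]:
# Memory-map instead of loading everything into RAM.
# mmap_mode='r' is a standard np.load option; slicing then reads lazily.
feature_mm = np.load(os.path.join(DATA_DIR, 'feature.npy'), mmap_mode='r')
X_sub = np.asarray(feature_mm[:100000])  # materialize only the slice we need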

In [3]:
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

In [4]:
# Small hand-made toy matrix from the original run; it is never used below.
test = [[1, 1, 1, 6, 89, 84, 64, 23, 321],
        [1, 2, 56, 64, 96, 100, 56, 31, 150],
        [1, -1, 1, 1, 100, 80, 13, 30, 200]]

In [5]:
X = feature[:100000]  # first 100k vectors only, to keep DBSCAN tractable
print(X.shape)


(100000, 300)

In [ ]:
# fit_predict returns one integer label per row of X; noise points get -1.
# eps=1 and min_samples=10 are guesses; see the k-distance sketch below.
db = DBSCAN(eps=1, min_samples=10).fit_predict(X)
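
A common heuristic for picking eps (my addition, not part of the original notebook) is the k-distance "knee" plot: sort every point's distance to its min_samples-th nearest neighbor and look for the elbow; eps should sit near it. A minimal sketch, assuming X from above:

In [ ]:
# k-distance plot for choosing DBSCAN's eps.
# NearestNeighbors and kneighbors are standard scikit-learn API.
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=10).fit(X)  # 10 matches min_samples
dists, _ = nn.kneighbors(X)                   # shape (100000, 10)
plt.plot(np.sort(dists[:, -1]))               # sorted distance to 10th neighbor
plt.xlabel('points, sorted by distance')
plt.ylabel('distance to 10th nearest neighbor')
plt.show()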

In [ ]:
print(len(db))  # one label per row of X, so 100000
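
More informative than the label count (again my addition) is how many clusters DBSCAN found and how much it wrote off as noise:

In [ ]:
# DBSCAN marks noise with label -1, so exclude it when counting clusters.
n_clusters = len(set(db)) - (1 if -1 in db else 0)
n_noise = int(np.sum(db == -1))
print('clusters: %d, noise points: %d' % (n_clusters, n_noise))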

In [ ]:
pca = PCA(n_components=2)  # 2-D projection, purely for visualisation
pca.fit(X)
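
Worth checking (not in the original run) how much variance two components actually keep; for 300-d embeddings it is usually small, so the scatter plot below is only a rough picture:

In [ ]:
# explained_variance_ratio_ is a standard attribute of a fitted PCA.
print(pca.explained_variance_ratio_)        # per-component share
print(pca.explained_variance_ratio_.sum())  # total kept by the 2 components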

In [ ]:
import matplotlib.pyplot as plt

In [ ]:
# pca.components_ holds the principal axes (shape (2, 300)), not the projected
# data, so its 300 columns could never match the 100000 labels in db.
# Project X with transform() before plotting.
X_2d = pca.transform(X)
plt.figure(figsize=(15, 15))
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=db, s=2)  # small markers for 100k points
plt.show()
