In [1]:
# -*- coding: utf-8 -*-
import os
import logging

import numpy as np
import gensim
from gensim import corpora, models

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

database_name = "twitter-data"
collection_name = "roland_garros_2017"
DATA_DIR = '/home/nipg1/Documents/summer_project/data/' + collection_name
LOG_DIR = os.path.join(DATA_DIR, 'embedding_logs')
In [2]:
# Load the precomputed feature vectors for the collection.
feature = np.load(os.path.join(DATA_DIR, 'feature.npy'))
print(len(feature))
In [3]:
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
In [4]:
# Toy feature vectors for a quick sanity check (not used on the real data below).
test = [[1,  1,  1,  6,  89,  84, 64, 23, 321],
        [1,  2, 56, 64,  96, 100, 56, 31, 150],
        [1, -1,  1,  1, 100,  80, 13, 30, 200]]
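The toy vectors can be used to exercise the clustering call before running it on the full data; a minimal sketch (the eps value here is an arbitrary illustration, not a tuned parameter):
In [ ]:
# fit_predict on the toy rows returns one label per row; -1 would mark noise.
print(DBSCAN(eps=150, min_samples=2).fit_predict(test))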
In [5]:
# Cluster only the first 100,000 vectors to keep the run tractable.
X = feature[:100000]
print(X.shape)
In [ ]:
# fit_predict returns a cluster label for every sample; -1 marks noise points.
db = DBSCAN(eps=1, min_samples=10).fit_predict(X)
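A quick summary of the labelling helps judge the eps / min_samples choice; a small sketch, assuming db holds the DBSCAN labels with -1 marking noise:
In [ ]:
# Number of clusters found (excluding the noise label) and the noise count.
n_clusters = len(set(db)) - (1 if -1 in db else 0)
n_noise = int((db == -1).sum())
print('clusters: %d, noise points: %d' % (n_clusters, n_noise))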
In [ ]:
print(len(db))
In [ ]:
pca = PCA(n_components=2)
# Project the samples into 2-D; components_ are the principal axes,
# not the projected points, so keep the transformed coordinates instead.
X_pca = pca.fit_transform(X)
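It is worth checking how much of the variance the two components actually retain; a quick look via scikit-learn's explained_variance_ratio_:
In [ ]:
# Fraction of the total variance captured by each principal component.
print(pca.explained_variance_ratio_)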
In [ ]:
import matplotlib.pyplot as plt
In [ ]:
plt.figure(figsize=(15, 15))
# Plot the 2-D PCA projection of the samples, coloured by DBSCAN label.
# (Scattering pca.components_ would plot the axes rather than the data
# points, and its length would not match the label array.)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=db)
plt.show()