In [71]:
from pyspark.mllib.clustering import KMeans
from pyspark.mllib.clustering import GaussianMixture
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark import SparkContext
from numpy import array
from math import sqrt
from pyspark.ml.pipeline import Pipeline
import pandas as pd
import sys
import pyspark
import glob
# Cluster every ego network found under data/ with k-means and report, for
# each one, the clustering error and the cluster the ego node falls into.
#
# NOTE(review): `sc` is never created in this notebook; presumably the PySpark
# kernel injects the SparkContext - confirm before a fresh-kernel run.
NUM_CLUSTERS = 8
KMEANS_SEED = 42  # fixed seed so re-running the cell reproduces the same clusters
FEAT_SUFFIX = ".feat"

# sorted(): glob returns paths in arbitrary order, and the plotting cell reuses
# the `model`, `parsedData` and `file` left behind by the final iteration.
for feat_path in sorted(glob.glob("data/*" + FEAT_SUFFIX)):
    # Strip only the trailing extension; split(".")[0] would cut at the first
    # dot anywhere in the path (e.g. "./data/3980.feat" -> "").
    selectedFile = feat_path[:-len(FEAT_SUFFIX)]
    file = selectedFile + FEAT_SUFFIX
    egofile = selectedFile + ".egofeat"

    data = sc.textFile(file)
    # split() without an argument tolerates repeated/trailing whitespace, and
    # blank lines are dropped so they do not become zero-length vectors.
    # NOTE(review): every column is used as a feature. The ego vector below is
    # given one extra leading 0, so .feat rows presumably start with a column
    # (a node id?) that .egofeat lacks - confirm it should be clustered on.
    parsedData = data.filter(lambda line: line.strip()).map(
        lambda line: array([float(x) for x in line.split()]))

    # NOTE(review): `runs` is kept to preserve behaviour on the Spark version
    # this was written for; it is deprecated/ignored in newer releases and
    # rejected by the newest ones - drop it when upgrading.
    model = KMeans.train(parsedData, NUM_CLUSTERS, maxIterations=20, runs=30,
                         initializationMode="k-means||", seed=KMEANS_SEED)

    # NOTE(review): the three objects below are not used by any visible cell;
    # the pipeline also mixes a pyspark.ml stage with the mllib model above.
    vecAssembler = VectorAssembler(inputCols=[], outputCol="features")
    scaler = StandardScaler(withMean=True, withStd=True, inputCol="features", outputCol="scaled_features")
    firstPipeline = Pipeline(stages=[vecAssembler, model])

    centroids = model.centers
    weights = model.k  # the number of clusters, despite the variable name
    cost = model.computeCost(parsedData)

    def error(point):
        """Return the Euclidean distance from `point` to its assigned cluster centre."""
        center = model.centers[model.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    # Square each distance before summing: adding the raw distances yields a
    # sum of errors, not the sum of SQUARED errors that the label promises.
    WSSSE = parsedData.map(lambda point: error(point) ** 2).reduce(lambda x, y: x + y)
    print("Sum of Squared Error = " + str(WSSSE))

    egodata = sc.textFile(egofile)
    # The leading 0 pads the ego vector to the width of the .feat rows.
    egoparsedData = egodata.map(lambda line: array([0] + [float(x) for x in line.split()]))
    print(selectedFile + ": " + str(model.predict(egoparsedData).first()))
In [ ]:
In [75]:
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt
import colorsys
import matplotlib.pylab as pylab
import glob
# Make every figure in this notebook 12 x 12 by default.
pylab.rcParams['figure.figsize'] = (12, 12)
def rgb_to_hex(rgb):
    """Format a sequence of colour channels as a '#'-prefixed hex string.

    Each channel is multiplied by 255, truncated to an int and written as at
    least two lowercase hexadecimal digits, in the order given.
    """
    hex_digits = ''
    for channel in rgb:
        hex_digits += '%02x' % int(channel * 255)
    return '#' + hex_digits
def get_colours(count):
    """Return ``count + 1`` hex colour strings spread over the hue range.

    Colour number ``h`` (0 .. count) has hue ``h / (count + 1)`` with
    saturation and value both 1, and is formatted by ``rgb_to_hex``.
    """
    total = count + 1
    palette = []
    for step in range(total):
        rgb = colorsys.hsv_to_rgb(step / total, 1, 1)
        palette.append(rgb_to_hex(rgb))
    return palette
def draw_graph(graph, clusters):
    """Draw an undirected graph with each node coloured by its cluster.

    Parameters
    ----------
    graph : list of 2-item rows
        Edge list; every row is unpacked as ``(n1, n2)``. It is iterated more
        than once, so it must be a re-iterable container, not a generator.
    clusters : dict or sequence of int
        Cluster index per node.
        * dict ``{node: cluster}`` - looked up by node id (preferred: the
          colouring cannot be misaligned).
        * sequence - ``clusters[i]`` colours the i-th node in the graph's
          internal node order. That order comes from a set and is NOT the
          order of any input file, so only use this form if the caller has
          verified the alignment.

    Raises
    ------
    KeyError / IndexError
        If a node has no entry in ``clusters``.
    """
    nodes = set([n1 for n1, n2 in graph] + [n2 for n1, n2 in graph])
    G = nx.Graph()
    G.add_nodes_from(nodes)
    G.add_edges_from((n1, n2) for n1, n2 in graph)
    pos = nx.spring_layout(G, k=0.04, iterations=10)

    if isinstance(clusters, dict):
        labels = list(clusters.values())
        node_labels = [clusters[node] for node in G.nodes()]
    else:
        labels = list(clusters)
        node_labels = [labels[i] for i in range(len(G.nodes()))]

    # One palette entry per cluster index up to the largest one present;
    # an empty `clusters` falls back to a single colour instead of failing in max().
    rainbow = get_colours(max(labels) if labels else 0)
    colours = [rainbow[label] for label in node_labels]

    nx.draw_networkx(G,
                     pos,
                     node_size=100,
                     linewidths=0.5,
                     with_labels=False,
                     node_color=colours)
    plt.axis("off")
    plt.show()
def read_file(name):
    """Parse a text file of whitespace-separated integers into a list of rows.

    Parameters
    ----------
    name : str
        Path of the file to read.

    Returns
    -------
    list of list of int
        One list per non-blank line, in file order. Blank or whitespace-only
        lines are skipped so they do not turn into empty rows.

    Raises
    ------
    ValueError
        If a token cannot be converted with ``int``.
    """
    rows = []
    with open(name, "r") as handle:
        for line in handle:
            fields = line.split()
            if fields:
                rows.append([int(field) for field in fields])
    return rows
# `model`, `parsedData` and `file` are whatever the training cell left behind
# after its final loop iteration, so this cell describes that last network only.
all_clusters = model.predict(parsedData).collect()

# draw_graph() unpacks every row as an (n1, n2) edge, so it needs an edge list;
# `file` is the .feat feature table, whose rows are feature vectors.
# NOTE(review): assumes an edge list named "<network>.edges" sits next to the
# .feat file - confirm the file name against the data directory.
edges_file = file.rsplit(".", 1)[0] + ".edges"
# NOTE(review): all_clusters is ordered like the rows of the .feat file, while
# draw_graph pairs a list positionally with its own node order - verify that
# the colours land on the intended nodes.
draw_graph(read_file(edges_file), all_clusters)

# Number of points assigned to each cluster.
cluster_counts = {}
for label in all_clusters:
    cluster_counts[label] = cluster_counts.get(label, 0) + 1
print(cluster_counts)
In [ ]: