import glob
import sys
from math import sqrt

import pandas as pd
import pyspark
from numpy import array
from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.mllib.clustering import GaussianMixture
from pyspark.mllib.clustering import KMeans

# Fallback only: the loop below rebinds selectedFile for every match, so this
# value survives only when glob finds no "data/*.feat" file.
selectedFile = "data/3980"
for selectedFile in glob.glob("data/*.feat"):
    # Strip just the trailing extension; split(".")[0] would cut the path at
    # the first dot anywhere in it.
    selectedFile = selectedFile.rsplit(".", 1)[0]
    file = selectedFile + ".feat"
    egofile = selectedFile + ".egofeat"

    # Every line of the feature file becomes one numeric vector (all
    # space-separated columns are kept, including the first one).
    data = sc.textFile(file)
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # NOTE(review): `runs` is deprecated/ignored by newer Spark releases --
    # confirm against the installed version before relying on it.
    model = KMeans.train(parsedData, 8, maxIterations=20, runs=30, initializationMode="k-means||")

    # NOTE(review): the assembler, scaler and pipeline are built but never
    # fitted or applied anywhere in this section; `model` is an mllib model,
    # not a pipeline stage -- confirm whether they are still needed.
    vecAssembler = VectorAssembler(inputCols=[], outputCol="features")
    scaler = StandardScaler(withMean=True, withStd=True, inputCol="features", outputCol="scaled_features")
    firstPipeline = Pipeline(stages=[vecAssembler, model])

    centroids = model.centers
    weights = model.k
    cost = model.computeCost(parsedData)

    def error(point):
        """Return the Euclidean distance from `point` to its assigned centre."""
        center = model.centers[model.predict(point)]
        return sqrt(sum(x**2 for x in (point - center)))

    # The printed label says "squared", but error() takes the square root, so
    # this is the sum of plain distances. NOTE(review): `cost` above is
    # presumably the true sum of squared distances -- confirm which is wanted.
    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Sum of Squared Error = " + str(WSSSE))

    # The ego feature line gets a leading 0 prepended so its length matches
    # the vectors the model was trained on.
    egodata = sc.textFile(egofile)
    egoparsedData = egodata.map(lambda line: array([0] + [float(x) for x in line.split(' ')]))
    print(selectedFile + ": "+ str(model.predict(egoparsedData).collect()[0]))

Sum of Squared Error = 3935.4869040143935
data/0: 4
Sum of Squared Error = 31820.11610744439
data/1684: 6
Sum of Squared Error = 12808.256427565573
data/3437: 2
Sum of Squared Error = 24568.497488494475
data/1912: 1
Sum of Squared Error = 991.1722789500066
data/686: 2
Sum of Squared Error = 303.90428695811994
data/698: 7
Sum of Squared Error = 40870.97590689243
data/107: 3
Sum of Squared Error = 1703.8838775274414
data/414: 5
Sum of Squared Error = 163.69480150175343
data/3980: 1
Sum of Squared Error = 2250.6126863962145
data/348: 3

%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt
import colorsys
import matplotlib.pylab as pylab
import glob

# Default (width, height) applied to the figures created below.
pylab.rcParams['figure.figsize'] = (12, 12)

def rgb_to_hex(rgb):
    """Turn a sequence of colour channels into a '#'-prefixed hex string.

    Each channel is multiplied by 255, truncated with int() and rendered as
    at least two lowercase hexadecimal digits; the pieces are concatenated in
    the order given.
    """
    hex_parts = []
    for channel in rgb:
        hex_parts.append(format(int(channel * 255), '02x'))
    return '#' + ''.join(hex_parts)

def get_colours(count):
    """Return ``count + 1`` hex colour strings with evenly spaced hues.

    Hue ``step / (count + 1)`` is used for each ``step`` in
    ``range(count + 1)``, at full saturation and value, and each colour is
    converted with rgb_to_hex.
    """
    total = count + 1
    palette = []
    for step in range(total):
        rgb = colorsys.hsv_to_rgb(step / total, 1, 1)
        palette.append(rgb_to_hex(rgb))
    return palette

def draw_graph(graph, clusters):
    """Draw an undirected graph with its nodes coloured by cluster index.

    Args:
        graph: iterable of two-item edges ``(n1, n2)``.
        clusters: sequence of integer cluster indices. Entry ``i`` colours
            the ``i``-th node of the graph; nodes are added in sorted order
            so that this positional mapping is deterministic. Nodes without
            a matching entry keep the default colour ``'r'``.

    Raises:
        ValueError: if any row of ``graph`` does not hold exactly two values.
    """
    # Validate up front so a wrong input (e.g. rows of a feature table) fails
    # with an actionable message instead of an opaque unpacking error.
    edges = [tuple(edge) for edge in graph]
    for edge in edges:
        if len(edge) != 2:
            raise ValueError(
                "draw_graph expects (n1, n2) pairs but got a row with "
                "%d values; was a feature file passed instead of an "
                "edge list?" % len(edge))

    nodes = {n for edge in edges for n in edge}
    G = nx.Graph()
    # A set has no defined order; sorting fixes which node is "i-th".
    for node in sorted(nodes):
        G.add_node(node)
    for n1, n2 in edges:
        G.add_edge(n1, n2)
    pos = nx.spring_layout(G, k=0.04, iterations=10)
    # default=0 keeps an empty cluster list from raising inside max().
    rainbow = get_colours(max(clusters, default=0))
    colours = ['r' for n in G.nodes()]
    # zip stops at the shorter sequence, so a short cluster list cannot
    # index past its end.
    for i, cluster in zip(range(len(colours)), clusters):
        colours[i] = rainbow[cluster]
    # NOTE(review): the visible original computed `pos` and `colours` but
    # never rendered them; drawing here is the presumed intent -- confirm.
    nx.draw(G, pos, node_color=colours)
    plt.show()

def read_file(name):
    """Read a text file of whitespace-separated integers.

    Returns a list with one entry per line of the file; each entry is the
    list of ints parsed from that line (an empty list for a blank line).
    """
    rows = []
    with open(name, "r") as handle:
        for line in handle:
            rows.append([int(token) for token in line.split()])
    return rows

# Cluster index for every vector of the most recently processed feature file.
all_clusters = model.predict(parsedData).collect()
# draw_graph unpacks each row as an (n1, n2) pair. The ".feat" rows each hold
# a full feature vector, which is what raised "too many values to unpack".
# NOTE(review): assumes an edge list named "<id>.edges" sits next to the
# ".feat" file, and that cluster order lines up with the graph's node order
# -- confirm both against the dataset.
draw_graph(read_file(selectedFile + ".edges"), all_clusters)
# Tally how many points were assigned to each cluster.
cluster_counts = {}
for a in all_clusters:
    cluster_counts[a] = cluster_counts.get(a, 0) + 1

ValueError                                Traceback (most recent call last)
<ipython-input-75-80a0015932c6> in <module>()
     47 all_clusters = model.predict(parsedData).collect()
---> 48 draw_graph(read_file(file), all_clusters)
     49 cluster_counts = {}
     50 for a in all_clusters:

<ipython-input-75-80a0015932c6> in draw_graph(graph, clusters)
     18 def draw_graph(graph, clusters):
---> 19     nodes = set([n1 for n1, n2 in graph] + [n2 for n1, n2 in graph])
     20     G=nx.Graph()
     21     for node in nodes:

<ipython-input-75-80a0015932c6> in <listcomp>(.0)
     18 def draw_graph(graph, clusters):
---> 19     nodes = set([n1 for n1, n2 in graph] + [n2 for n1, n2 in graph])
     20     G=nx.Graph()
     21     for node in nodes:

ValueError: too many values to unpack (expected 2)

