cluster



In [71]:
from pyspark.mllib.clustering import KMeans
from pyspark.mllib.clustering import GaussianMixture
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark import SparkContext
from numpy import array
from math import sqrt
from pyspark.ml.pipeline import Pipeline
import pandas as pd
import sys
import pyspark
import glob


selectedFile = "data/3980" 
for selectedFile in glob.glob("data/*.feat"):
    selectedFile = selectedFile.split(".")[0]
    file = selectedFile + ".feat"
    egofile = selectedFile + ".egofeat"

    data = sc.textFile(file)
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    model = KMeans.train(parsedData, 8, maxIterations=20, runs=30,  initializationMode="k-means||")

    vecAssembler = VectorAssembler(inputCols=[], outputCol="features")
    scaler = StandardScaler(withMean=True, withStd=True, inputCol="features", outputCol="scaled_features")
    firstPipeline = Pipeline(stages=[vecAssembler, model])

    centroids = model.centers
    weights = model.k
    cost = model.computeCost(parsedData)

    def error(point):
        center = model.centers[model.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Sum of Squared Error = " + str(WSSSE))    

    egodata = sc.textFile(egofile)
    egoparsedData = egodata.map(lambda line: array([0] + [float(x) for x in line.split(' ')]))
    print(selectedFile + ": "+ str(model.predict(egoparsedData).collect()[0]))


Sum of Squared Error = 3935.4869040143935
data/0: 4
Sum of Squared Error = 31820.11610744439
data/1684: 6
Sum of Squared Error = 12808.256427565573
data/3437: 2
Sum of Squared Error = 24568.497488494475
data/1912: 1
Sum of Squared Error = 991.1722789500066
data/686: 2
Sum of Squared Error = 303.90428695811994
data/698: 7
Sum of Squared Error = 40870.97590689243
data/107: 3
Sum of Squared Error = 1703.8838775274414
data/414: 5
Sum of Squared Error = 163.69480150175343
data/3980: 1
Sum of Squared Error = 2250.6126863962145
data/348: 3

In [ ]:


In [75]:
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt
import colorsys
import matplotlib.pylab as pylab
import glob

pylab.rcParams['figure.figsize'] = 12, 12  

def rgb_to_hex(rgb):
    return '#' + ''.join(['%02x' % int(p * 255) for p in rgb])

def get_colours(count):
    count = count + 1
    colours = [colorsys.hsv_to_rgb(h / count, 1, 1) for h in range(count)]
    return [rgb_to_hex(c) for c in colours]

def draw_graph(graph, clusters):
    nodes = set([n1 for n1, n2 in graph] + [n2 for n1, n2 in graph])
    G=nx.Graph()
    for node in nodes:
        G.add_node(node)
    for edge in graph:
        G.add_edge(edge[0], edge[1])
    pos = nx.spring_layout(G, k=0.04, iterations=10)
    
    rainbow = get_colours(max(clusters))
    colours = ['r' for n in G.nodes()]
    for i in range(len(G.nodes())):
        colours[i] = rainbow[clusters[i]]
        
    nx.draw_networkx(G,
                     pos,
                     node_size=100,
                     linewidths=0.5,
                     with_labels=False,
                     node_color=colours) 
    
    plt.axis("off")
    plt.show()

def read_file(name):
    with open(name, "r") as f:
        return[list(map(int, line.split())) for line in f.readlines()]


all_clusters = model.predict(parsedData).collect() 
draw_graph(read_file(file), all_clusters) 
cluster_counts = {}
for a in all_clusters:
    if a in cluster_counts:
        cluster_counts[a] = cluster_counts[a] + 1
    else:
        cluster_counts[a] = 1
print(cluster_counts)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-75-80a0015932c6> in <module>()
     46 
     47 all_clusters = model.predict(parsedData).collect()
---> 48 draw_graph(read_file(file), all_clusters)
     49 cluster_counts = {}
     50 for a in all_clusters:

<ipython-input-75-80a0015932c6> in draw_graph(graph, clusters)
     17 
     18 def draw_graph(graph, clusters):
---> 19     nodes = set([n1 for n1, n2 in graph] + [n2 for n1, n2 in graph])
     20     G=nx.Graph()
     21     for node in nodes:

<ipython-input-75-80a0015932c6> in <listcomp>(.0)
     17 
     18 def draw_graph(graph, clusters):
---> 19     nodes = set([n1 for n1, n2 in graph] + [n2 for n1, n2 in graph])
     20     G=nx.Graph()
     21     for node in nodes:

ValueError: too many values to unpack (expected 2)

In [ ]: