mldeveloper01 · July 18, 2018 07:26 · Mar 30, 2018 · Jan 29, 2017 · Jan 23, 2017 · Jan 22, 2017
diff --git a/kmeansExample.py b/kmeansExample.py
@@ -1,11 +1,12 @@
 #############################################################################
 # Full Imports
 
+from __future__ import division
 import math
 import random
 
 """
-This is a pure Python implementation of the K-Means Clustering algorithmn. The
+This is a pure Python implementation of the K-means Clustering algorithmn. The
 original can be found here:
 http://pandoricweb.tumblr.com/post/8646701677/python-implementation-of-the-k-means-clustering
 
@@ -49,28 +50,134 @@ def main():
     upper = 200
 
     # The K in k-means. How many clusters do we assume exist?
+    #   - Must be less than num_points
     num_clusters = 3
 
-    # When do we say the optimization has 'converged' and stop updating clusters
+    # When do we say the process has 'converged' and stop updating clusters?
     cutoff = 0.2
 
     # Generate some points to cluster
+    # Note: If you want to use your own data, set points equal to it here.
     points = [
         makeRandomPoint(dimensions, lower, upper) for i in xrange(num_points)
     ]
 
     # Cluster those data!
-    clusters = kmeans(points, num_clusters, cutoff)
-
-    # Print our clusters
-    for i, c in enumerate(clusters):
+    iteration_count = 20
+    best_clusters = iterative_kmeans(
+        points,
+        num_clusters,
+        cutoff,
+        iteration_count
+    )
+
+    # Print our best clusters
+    for i, c in enumerate(best_clusters):
         for p in c.points:
             print " Cluster: ", i, "\t Point :", p
 
     # Display clusters using plotly for 2d data
     if dimensions in [2, 3] and plotly:
         print "Plotting points, launching browser ..."
-        plotClusters(clusters, dimensions)
+        plotClusters(best_clusters, dimensions)
+
+
+#############################################################################
+# K-means Methods
+
+def iterative_kmeans(points, num_clusters, cutoff, iteration_count):
+    """
+    K-means isn't guaranteed to get the best answer the first time. It might
+    get stuck in a "local minimum."
+
+    Here we run kmeans() *iteration_count* times to increase the chance of
+    getting a good answer.
+
+    Returns the best set of clusters found.
+    """
+    print "Running K-means %d times to find best clusters ..." % iteration_count
+    candidate_clusters = []
+    errors = []
+    for _ in range(iteration_count):
+        clusters = kmeans(points, num_clusters, cutoff)
+        error = calculateError(clusters)
+        candidate_clusters.append(clusters)
+        errors.append(error)
+
+    highest_error = max(errors)
+    lowest_error = min(errors)
+    print "Lowest error found: %.2f (Highest: %.2f)" % (
+        lowest_error,
+        highest_error
+    )
+    ind_of_lowest_error = errors.index(lowest_error)
+    best_clusters = candidate_clusters[ind_of_lowest_error]
+
+    return best_clusters
+
+def kmeans(points, k, cutoff):
+
+    # Pick out k random points to use as our initial centroids
+    initial_centroids = random.sample(points, k)
+
+    # Create k clusters using those centroids
+    # Note: Cluster takes lists, so we wrap each point in a list here.
+    clusters = [Cluster([p]) for p in initial_centroids]
+
+    # Loop through the dataset until the clusters stabilize
+    loopCounter = 0
+    while True:
+        # Create a list of lists to hold the points in each cluster
+        lists = [[] for _ in clusters]
+        clusterCount = len(clusters)
+
+        # Start counting loops
+        loopCounter += 1
+        # For every point in the dataset ...
+        for p in points:
+            # Get the distance between that point and the centroid of the first
+            # cluster.
+            smallest_distance = getDistance(p, clusters[0].centroid)
+
+            # Set the cluster this point belongs to
+            clusterIndex = 0
+
+            # For the remainder of the clusters ...
+            for i in range(1, clusterCount):
+                # calculate the distance of that point to each other cluster's
+                # centroid.
+                distance = getDistance(p, clusters[i].centroid)
+                # If it's closer to that cluster's centroid update what we
+                # think the smallest distance is
+                if distance < smallest_distance:
+                    smallest_distance = distance
+                    clusterIndex = i
+            # After finding the cluster the smallest distance away
+            # set the point to belong to that cluster
+            lists[clusterIndex].append(p)
+
+        # Set our biggest_shift to zero for this iteration
+        biggest_shift = 0.0
+
+        # For each cluster ...
+        for i in range(clusterCount):
+            # Calculate how far the centroid moved in this iteration
+            shift = clusters[i].update(lists[i])
+            # Keep track of the largest move from all cluster centroid updates
+            biggest_shift = max(biggest_shift, shift)
+
+        # Remove empty clusters
+        clusters = [c for c in clusters if len(c.points) != 0]
+
+        # If the centroids have stopped moving much, say we're done!
+        if biggest_shift < cutoff:
+            print "Converged after %s iterations" % loopCounter
+            break
+    return clusters
+
+
+#############################################################################
+# Classes
 
 class Point(object):
     '''
@@ -130,6 +237,11 @@ def update(self, points):
         '''
         old_centroid = self.centroid
         self.points = points
+        # Return early if we have no points, this cluster will get
+        # cleaned up (removed) in the outer loop.
+        if len(self.points) == 0:
+            return 0
+
         self.centroid = self.calculateCentroid()
         shift = getDistance(old_centroid, self.centroid)
         return shift
@@ -148,66 +260,23 @@ def calculateCentroid(self):
 
         return Point(centroid_coords)
 
-def kmeans(points, k, cutoff):
-
-    # Pick out k random points to use as our initial centroids
-    initial = random.sample(points, k)
-
-    # Create k clusters using those centroids
-    # Note: Cluster takes lists, so we wrap each point in a list here.
-    clusters = [Cluster([p]) for p in initial]
-
-    # Loop through the dataset until the clusters stabilize
-    loopCounter = 0
-    while True:
-        # Create a list of lists to hold the points in each cluster
-        lists = [[] for _ in clusters]
-        clusterCount = len(clusters)
-
-        # Start counting loops
-        loopCounter += 1
-        # For every point in the dataset ...
-        for p in points:
-            # Get the distance between that point and the centroid of the first
-            # cluster.
-            smallest_distance = getDistance(p, clusters[0].centroid)
-
-            # Set the cluster this point belongs to
-            clusterIndex = 0
-
-            # For the remainder of the clusters ...
-            for i in range(clusterCount - 1):
-                # calculate the distance of that point to each other cluster's
-                # centroid.
-                distance = getDistance(p, clusters[i+1].centroid)
-                # If it's closer to that cluster's centroid update what we
-                # think the smallest distance is
-                if distance < smallest_distance:
-                    smallest_distance = distance
-                    clusterIndex = i+1
-            # After finding the cluster the smallest distance away
-            # set the point to belong to that cluster
-            lists[clusterIndex].append(p)
-
-        # Set our biggest_shift to zero for this iteration
-        biggest_shift = 0.0
+    def getTotalDistance(self):
+        '''
+        Return the sum of all squared Euclidean distances between each point in 
+        the cluster and the cluster's centroid.
+        '''
+        sumOfDistances = 0.0
+        for p in self.points:
+            sumOfDistances += getDistance(p, self.centroid)
 
-        # For each cluster ...
-        for i in range(clusterCount):
-            # Calculate how far the centroid moved in this iteration
-            shift = clusters[i].update(lists[i])
-            # Keep track of the largest move from all cluster centroid updates
-            biggest_shift = max(biggest_shift, shift)
+        return sumOfDistances
 
-        # If the centroids have stopped moving much, say we're done!
-        if biggest_shift < cutoff:
-            print "Converged after %s iterations" % loopCounter
-            break
-    return clusters
+#############################################################################
+# Helper Methods
 
 def getDistance(a, b):
     '''
-    Euclidean distance between two n-dimensional points.
+    Squared Euclidean distance between two n-dimensional points.
     https://en.wikipedia.org/wiki/Euclidean_distance#n_dimensions
     Note: This can be very slow and does not scale well
     '''
@@ -218,9 +287,8 @@ def getDistance(a, b):
     for i in range(a.n):
         squareDifference = pow((a.coords[i]-b.coords[i]), 2)
         accumulatedDifference += squareDifference
-    distance = math.sqrt(accumulatedDifference)
 
-    return distance
+    return accumulatedDifference
 
 def makeRandomPoint(n, lower, upper):
     '''
@@ -230,6 +298,22 @@ def makeRandomPoint(n, lower, upper):
     p = Point([random.uniform(lower, upper) for _ in range(n)])
     return p
 
+def calculateError(clusters):
+    '''
+    Return the average squared distance between each point and its cluster
+    centroid.
+
+    This is also known as the "distortion cost."
+    '''
+    accumulatedDistances = 0
+    num_points = 0
+    for cluster in clusters:
+        num_points += len(cluster.points)
+        accumulatedDistances += cluster.getTotalDistance()
+
+    error = accumulatedDistances / num_points
+    return error
+
 def plotClusters(data, dimensions):
     '''
     This uses the plotly offline mode to create a local HTML file.

diff --git a/kmeansExample.py b/kmeansExample.py
@@ -1,7 +1,6 @@
 #############################################################################
 # Full Imports
 
-import sys
 import math
 import random
 
@@ -27,7 +26,6 @@
 
 This script uses an offline plotting mode and will store and open plots locally.
 To store and share plots online sign up for a plotly API key at https://plot.ly.
-
 """
 
 plotly = False
@@ -38,102 +36,104 @@
     print "INFO: Plotly is not installed, plots will not be generated."
 
 def main():
-    
+
     # How many points are in our dataset?
     num_points = 20
-    
+
     # For each of those points how many dimensions do they have?
     # Note: Plotting will only work in two or three dimensions
     dimensions = 2
-    
+
     # Bounds for the values of those points in each dimension
     lower = 0
     upper = 200
-    
+
     # The K in k-means. How many clusters do we assume exist?
     num_clusters = 3
-    
+
     # When do we say the optimization has 'converged' and stop updating clusters
     cutoff = 0.2
-    
+
     # Generate some points to cluster
-    points = [makeRandomPoint(dimensions, lower, upper) for i in xrange(num_points)]
-
+    points = [
+        makeRandomPoint(dimensions, lower, upper) for i in xrange(num_points)
+    ]
+
     # Cluster those data!
     clusters = kmeans(points, num_clusters, cutoff)
 
     # Print our clusters
     for i, c in enumerate(clusters):
         for p in c.points:
             print " Cluster: ", i, "\t Point :", p
-    
+
     # Display clusters using plotly for 2d data
     if dimensions in [2, 3] and plotly:
         print "Plotting points, launching browser ..."
         plotClusters(clusters, dimensions)
 
-class Point:
+class Point(object):
     '''
     A point in n dimensional space
     '''
     def __init__(self, coords):
         '''
         coords - A list of values, one per dimension
         '''
-        
+
         self.coords = coords
         self.n = len(coords)
-        
+
     def __repr__(self):
         return str(self.coords)
 
-class Cluster:
+class Cluster(object):
     '''
     A set of points and their centroid
     '''
-    
+
     def __init__(self, points):
         '''
         points - A list of point objects
         '''
-        
-        if len(points) == 0: 
+
+        if len(points) == 0:
             raise Exception("ERROR: empty cluster")
-        
+
         # The points that belong to this cluster
         self.points = points
-        
+
         # The dimensionality of the points in this cluster
         self.n = points[0].n
-        
+
         # Assert that all points are of the same dimensionality
         for p in points:
-            if p.n != self.n: 
+            if p.n != self.n:
                 raise Exception("ERROR: inconsistent dimensions")
-            
+
         # Set up the initial centroid (this is usually based off one point)
         self.centroid = self.calculateCentroid()
-        
+
     def __repr__(self):
         '''
         String representation of this object
         '''
         return str(self.points)
-    
+
     def update(self, points):
         '''
         Returns the distance between the previous centroid and the new after
         recalculating and storing the new centroid.
 
-        Note: Initially we expect centroids to shift around a lot and then gradually
-        settle down.
+        Note: Initially we expect centroids to shift around a lot and then
+        gradually settle down.
         '''
         old_centroid = self.centroid
         self.points = points
         self.centroid = self.calculateCentroid()
-        shift = getDistance(old_centroid, self.centroid) 
+        shift = getDistance(old_centroid, self.centroid)
         return shift
-    
+
     def calculateCentroid(self):
         '''
         Finds a virtual center point for a group of n-dimensional points
@@ -145,36 +145,36 @@ def calculateCentroid(self):
         unzipped = zip(*coords)
         # Calculate the mean for each dimension
         centroid_coords = [math.fsum(dList)/numPoints for dList in unzipped]
-        
+
         return Point(centroid_coords)
 
 def kmeans(points, k, cutoff):
-    
+
     # Pick out k random points to use as our initial centroids
     initial = random.sample(points, k)
-    
+
     # Create k clusters using those centroids
     # Note: Cluster takes lists, so we wrap each point in a list here.
     clusters = [Cluster([p]) for p in initial]
-    
+
     # Loop through the dataset until the clusters stabilize
     loopCounter = 0
     while True:
         # Create a list of lists to hold the points in each cluster
-        lists = [[] for c in clusters]
+        lists = [[] for _ in clusters]
         clusterCount = len(clusters)
-        
+
         # Start counting loops
         loopCounter += 1
         # For every point in the dataset ...
         for p in points:
             # Get the distance between that point and the centroid of the first
             # cluster.
             smallest_distance = getDistance(p, clusters[0].centroid)
-        
+
             # Set the cluster this point belongs to
             clusterIndex = 0
-        
+
             # For the remainder of the clusters ...
             for i in range(clusterCount - 1):
                 # calculate the distance of that point to each other cluster's
@@ -188,17 +188,17 @@ def kmeans(points, k, cutoff):
             # After finding the cluster the smallest distance away
             # set the point to belong to that cluster
             lists[clusterIndex].append(p)
-        
+
         # Set our biggest_shift to zero for this iteration
         biggest_shift = 0.0
-        
+
         # For each cluster ...
         for i in range(clusterCount):
             # Calculate how far the centroid moved in this iteration
             shift = clusters[i].update(lists[i])
             # Keep track of the largest move from all cluster centroid updates
             biggest_shift = max(biggest_shift, shift)
-        
+
         # If the centroids have stopped moving much, say we're done!
         if biggest_shift < cutoff:
             print "Converged after %s iterations" % loopCounter
@@ -208,25 +208,31 @@ def kmeans(points, k, cutoff):
 def getDistance(a, b):
     '''
     Euclidean distance between two n-dimensional points.
+    https://en.wikipedia.org/wiki/Euclidean_distance#n_dimensions
     Note: This can be very slow and does not scale well
     '''
     if a.n != b.n:
         raise Exception("ERROR: non comparable points")
-
-    ret = reduce(lambda x,y: x + pow((a.coords[y]-b.coords[y]), 2), range(a.n), 0.0)
-    return math.sqrt(ret)
+
+    accumulatedDifference = 0.0
+    for i in range(a.n):
+        squareDifference = pow((a.coords[i]-b.coords[i]), 2)
+        accumulatedDifference += squareDifference
+    distance = math.sqrt(accumulatedDifference)
+
+    return distance
 
 def makeRandomPoint(n, lower, upper):
     '''
     Returns a Point object with n dimensions and values between lower and
     upper in each of those dimensions
     '''
-    p = Point([random.uniform(lower, upper) for i in range(n)])
+    p = Point([random.uniform(lower, upper) for _ in range(n)])
     return p
 
 def plotClusters(data, dimensions):
-    '''    
-    This uses the plotly offline mode to create a local HTML file. 
+    '''
+    This uses the plotly offline mode to create a local HTML file.
     This should open your default web browser.
     '''
     if dimensions not in [2, 3]:
@@ -261,12 +267,19 @@ def plotClusters(data, dimensions):
             centroid['name'] = "Centroid " + str(i)
             traceList.append(Scatter(**centroid))
         else:
-            symbols = ["circle", "square", "diamond", "circle-open", "square-open", "diamond-open", "cross", "x"]
+            symbols = [
+                "circle",
+                "square",
+                "diamond",
+                "circle-open",
+                "square-open",
+                "diamond-open",
+                "cross", "x"
+            ]
             symbol_count = len(symbols)
-            symbol_index = i % len(symbols)
             if i > symbol_count:
-                print "Warning: Not enough marker symbols to go around, expect repeats"
-            # Convert our list of x,y,z's into an x list, a y list, and a z list.
+                print "Warning: Not enough marker symbols to go around"
+            # Convert our list of x,y,z's separate lists.
             trace['x'], trace['y'], trace['z'] = zip(*cluster_data)
             trace['mode'] = 'markers'
             trace['marker'] = {}
@@ -291,5 +304,5 @@ def plotClusters(data, dimensions):
         "layout": Layout(title=title)
     })
 
-if __name__ == "__main__": 
-    main()
+if __name__ == "__main__":
+    main()
diff --git a/kmeansExample.py b/kmeansExample.py
@@ -4,7 +4,6 @@
 import sys
 import math
 import random
-import subprocess
 
 """
 This is a pure Python implementation of the K-Means Clustering algorithmn. The
@@ -44,7 +43,7 @@ def main():
     num_points = 20
 
     # For each of those points how many dimensions do they have?
-    # Note: Plotting will only work in two dimensions
+    # Note: Plotting will only work in two or three dimensions
     dimensions = 2
 
     # Bounds for the values of those points in each dimension

diff --git a/kmeansExample.py b/kmeansExample.py
@@ -19,29 +19,32 @@
 This script specifically avoids using numpy or other more obscure libraries. It
 is meant to be *clear* not fast.
 
-I have also added integration with the plot.ly plotting service. If you put in
-your (free) plot.ly credentials below, it will automatically plot the discovered
-clusters and their centroids.
+I have also added integration with the plot.ly plotting library. So you can see
+the clusters found by this algorithm. To install run:
 
-To use plotly integration you will need to:
+```
+pip install plotly
+```
 
-1. Get a username/key from www.plot.ly/api and enter them below
-2. Install the plotly module: pip install plotly
+This script uses an offline plotting mode and will store and open plots locally.
+To store and share plots online sign up for a plotly API key at https://plot.ly.
 
 """
 
-PLOTLY_USERNAME = None
-PLOTLY_KEY = None
-
-if PLOTLY_USERNAME:
-    from plotly import plotly
+plotly = False
+try:
+    import plotly
+    from plotly.graph_objs import Scatter, Scatter3d, Layout
+except ImportError:
+    print "INFO: Plotly is not installed, plots will not be generated."
 
 def main():
 
     # How many points are in our dataset?
-    num_points = 10
+    num_points = 20
 
     # For each of those points how many dimensions do they have?
+    # Note: Plotting will only work in two dimensions
     dimensions = 2
 
     # Bounds for the values of those points in each dimension
@@ -52,28 +55,27 @@ def main():
     num_clusters = 3
 
     # When do we say the optimization has 'converged' and stop updating clusters
-    opt_cutoff = 0.5
+    cutoff = 0.2
 
-    # Generate some points
+    # Generate some points to cluster
     points = [makeRandomPoint(dimensions, lower, upper) for i in xrange(num_points)]
 
     # Cluster those data!
-    clusters = kmeans(points, num_clusters, opt_cutoff)
+    clusters = kmeans(points, num_clusters, cutoff)
 
     # Print our clusters
-    for i,c in enumerate(clusters):
+    for i, c in enumerate(clusters):
         for p in c.points:
             print " Cluster: ", i, "\t Point :", p
 
     # Display clusters using plotly for 2d data
-    # This uses the 'open' command on a URL and may only work on OSX.
-    if dimensions == 2 and PLOTLY_USERNAME:
+    if dimensions in [2, 3] and plotly:
         print "Plotting points, launching browser ..."
-        plotClusters(clusters)
+        plotClusters(clusters, dimensions)
 
 class Point:
     '''
-    An point in n dimensional space
+    A point in n dimensional space
     '''
     def __init__(self, coords):
         '''
@@ -96,7 +98,9 @@ def __init__(self, points):
         points - A list of point objects
         '''
 
-        if len(points) == 0: raise Exception("ILLEGAL: empty cluster")
+        if len(points) == 0: 
+            raise Exception("ERROR: empty cluster")
+
         # The points that belong to this cluster
         self.points = points
 
@@ -105,7 +109,8 @@ def __init__(self, points):
 
         # Assert that all points are of the same dimensionality
         for p in points:
-            if p.n != self.n: raise Exception("ILLEGAL: wrong dimensions")
+            if p.n != self.n: 
+                raise Exception("ERROR: inconsistent dimensions")
 
         # Set up the initial centroid (this is usually based off one point)
         self.centroid = self.calculateCentroid()
@@ -120,6 +125,9 @@ def update(self, points):
         '''
         Returns the distance between the previous centroid and the new after
         recalculating and storing the new centroid.
+
+        Note: Initially we expect centroids to shift around a lot and then gradually
+        settle down.
         '''
         old_centroid = self.centroid
         self.points = points
@@ -147,13 +155,14 @@ def kmeans(points, k, cutoff):
     initial = random.sample(points, k)
 
     # Create k clusters using those centroids
+    # Note: Cluster takes lists, so we wrap each point in a list here.
     clusters = [Cluster([p]) for p in initial]
 
     # Loop through the dataset until the clusters stabilize
     loopCounter = 0
     while True:
         # Create a list of lists to hold the points in each cluster
-        lists = [ [] for c in clusters]
+        lists = [[] for c in clusters]
         clusterCount = len(clusters)
 
         # Start counting loops
@@ -173,17 +182,18 @@ def kmeans(points, k, cutoff):
                 # centroid.
                 distance = getDistance(p, clusters[i+1].centroid)
                 # If it's closer to that cluster's centroid update what we
-                # think the smallest distance is, and set the point to belong
-                # to that cluster
+                # think the smallest distance is
                 if distance < smallest_distance:
                     smallest_distance = distance
                     clusterIndex = i+1
+            # After finding the cluster the smallest distance away
+            # set the point to belong to that cluster
             lists[clusterIndex].append(p)
 
         # Set our biggest_shift to zero for this iteration
         biggest_shift = 0.0
 
-        # As many times as there are clusters ...
+        # For each cluster ...
         for i in range(clusterCount):
             # Calculate how far the centroid moved in this iteration
             shift = clusters[i].update(lists[i])
@@ -202,9 +212,9 @@ def getDistance(a, b):
     Note: This can be very slow and does not scale well
     '''
     if a.n != b.n:
-        raise Exception("ILLEGAL: non comparable points")
+        raise Exception("ERROR: non comparable points")
 
-    ret = reduce(lambda x,y: x + pow((a.coords[y]-b.coords[y]), 2),range(a.n),0.0)
+    ret = reduce(lambda x,y: x + pow((a.coords[y]-b.coords[y]), 2), range(a.n), 0.0)
     return math.sqrt(ret)
 
 def makeRandomPoint(n, lower, upper):
@@ -215,56 +225,72 @@ def makeRandomPoint(n, lower, upper):
     p = Point([random.uniform(lower, upper) for i in range(n)])
     return p
 
-def plotClusters(data):
+def plotClusters(data, dimensions):
+    '''    
+    This uses the plotly offline mode to create a local HTML file. 
+    This should open your default web browser.
     '''
-    Use the plotly API to plot data from clusters.
-    
-    Gets a plot URL from plotly and then uses subprocess to 'open' that URL
-    from the command line. This should open your default web browser.
-    '''
-
-    # List of symbols each cluster will be displayed using    
-    symbols = ['circle', 'cross', 'triangle-up', 'square']
+    if dimensions not in [2, 3]:
+        raise Exception("Plots are only available for 2 and 3 dimensional data")
 
     # Convert data into plotly format.
     traceList = []
     for i, c in enumerate(data):
-        data = []
-        for p in c.points:
-            data.append(p.coords)
-        # Data
+        # Get a list of x,y coordinates for the points in this cluster.
+        cluster_data = []
+        for point in c.points:
+            cluster_data.append(point.coords)
+
         trace = {}
-        trace['x'], trace['y'] = zip(*data)
-        trace['marker'] = {}
-        trace['marker']['symbol'] = symbols[i]
-        trace['name'] = "Cluster " + str(i)
-        traceList.append(trace)
-        # Centroid (A trace of length 1)
         centroid = {}
-        centroid['x'] = [c.centroid.coords[0]]
-        centroid['y'] = [c.centroid.coords[1]]
-        centroid['marker'] = {}
-        centroid['marker']['symbol'] = symbols[i]
-        centroid['marker']['color'] = 'rgb(200,10,10)'
-        centroid['name'] = "Centroid " + str(i)
-        traceList.append(centroid)
-
-    # Log in to plotly
-    py = plotly(username=PLOTLY_USERNAME, key=PLOTLY_KEY)
+        if dimensions == 2:
+            # Convert our list of x,y's into an x list and a y list.
+            trace['x'], trace['y'] = zip(*cluster_data)
+            trace['mode'] = 'markers'
+            trace['marker'] = {}
+            trace['marker']['symbol'] = i
+            trace['marker']['size'] = 12
+            trace['name'] = "Cluster " + str(i)
+            traceList.append(Scatter(**trace))
+            # Centroid (A trace of length 1)
+            centroid['x'] = [c.centroid.coords[0]]
+            centroid['y'] = [c.centroid.coords[1]]
+            centroid['mode'] = 'markers'
+            centroid['marker'] = {}
+            centroid['marker']['symbol'] = i
+            centroid['marker']['color'] = 'rgb(200,10,10)'
+            centroid['name'] = "Centroid " + str(i)
+            traceList.append(Scatter(**centroid))
+        else:
+            symbols = ["circle", "square", "diamond", "circle-open", "square-open", "diamond-open", "cross", "x"]
+            symbol_count = len(symbols)
+            symbol_index = i % len(symbols)
+            if i > symbol_count:
+                print "Warning: Not enough marker symbols to go around, expect repeats"
+            # Convert our list of x,y,z's into an x list, a y list, and a z list.
+            trace['x'], trace['y'], trace['z'] = zip(*cluster_data)
+            trace['mode'] = 'markers'
+            trace['marker'] = {}
+            trace['marker']['symbol'] = symbols[i]
+            trace['marker']['size'] = 12
+            trace['name'] = "Cluster " + str(i)
+            traceList.append(Scatter3d(**trace))
+            # Centroid (A trace of length 1)
+            centroid['x'] = [c.centroid.coords[0]]
+            centroid['y'] = [c.centroid.coords[1]]
+            centroid['z'] = [c.centroid.coords[2]]
+            centroid['mode'] = 'markers'
+            centroid['marker'] = {}
+            centroid['marker']['symbol'] = symbols[i]
+            centroid['marker']['color'] = 'rgb(200,10,10)'
+            centroid['name'] = "Centroid " + str(i)
+            traceList.append(Scatter3d(**centroid))
 
-    # Style the chart
-    datastyle = {'mode':'markers',
-             'type':'scatter',
-             'marker':{'line':{'width':0},
-                       'size':12,
-                       'opacity':0.6,
-                       'color':'rgb(74, 134, 232)'}}
-
-    resp = py.plot(*traceList, style = datastyle)
-
-    # Display that plot in a browser
-    cmd = "open " + resp['url']
-    subprocess.call(cmd, shell=True)
+    title = "K-means clustering with %s clusters" % str(len(data))
+    plotly.offline.plot({
+        "data": traceList,
+        "layout": Layout(title=title)
+    })
 
 if __name__ == "__main__": 
     main()
diff --git a/kmeansExample.py b/kmeansExample.py
@@ -68,7 +68,7 @@ def main():
     # Display clusters using plotly for 2d data
     # This uses the 'open' command on a URL and may only work on OSX.
     if dimensions == 2 and PLOTLY_USERNAME:
-        "Plotting points, launching browser ..."
+        print "Plotting points, launching browser ..."
         plotClusters(clusters)
 
 class Point:

diff --git a/kmeansExample.py b/kmeansExample.py
@@ -0,0 +1,270 @@
+#############################################################################
+# Full Imports
+
+import sys
+import math
+import random
+import subprocess
+
+"""
+This is a pure Python implementation of the K-Means Clustering algorithmn. The
+original can be found here:
+http://pandoricweb.tumblr.com/post/8646701677/python-implementation-of-the-k-means-clustering
+
+I have refactored the code and added comments to aid in readability.
+After reading through this code you should understand clearly how K-means works.
+If not, feel free to email me with questions and suggestions. (iandanforth at
+gmail)
+
+This script specifically avoids using numpy or other more obscure libraries. It
+is meant to be *clear* not fast.
+
+I have also added integration with the plot.ly plotting service. If you put in
+your (free) plot.ly credentials below, it will automatically plot the discovered
+clusters and their centroids.
+
+To use plotly integration you will need to:
+
+1. Get a username/key from www.plot.ly/api and enter them below
+2. Install the plotly module: pip install plotly
+
+"""
+
+PLOTLY_USERNAME = None
+PLOTLY_KEY = None
+
+if PLOTLY_USERNAME:
+    from plotly import plotly
+
+def main():
+
+    # How many points are in our dataset?
+    num_points = 10
+
+    # For each of those points how many dimensions do they have?
+    dimensions = 2
+
+    # Bounds for the values of those points in each dimension
+    lower = 0
+    upper = 200
+
+    # The K in k-means. How many clusters do we assume exist?
+    num_clusters = 3
+
+    # When do we say the optimization has 'converged' and stop updating clusters
+    opt_cutoff = 0.5
+
+    # Generate some points
+    points = [makeRandomPoint(dimensions, lower, upper) for i in xrange(num_points)]
+
+    # Cluster those data!
+    clusters = kmeans(points, num_clusters, opt_cutoff)
+
+    # Print our clusters
+    for i,c in enumerate(clusters):
+        for p in c.points:
+            print " Cluster: ", i, "\t Point :", p
+
+    # Display clusters using plotly for 2d data
+    # This uses the 'open' command on a URL and may only work on OSX.
+    if dimensions == 2 and PLOTLY_USERNAME:
+        "Plotting points, launching browser ..."
+        plotClusters(clusters)
+
+class Point:
+    '''
+    An point in n dimensional space
+    '''
+    def __init__(self, coords):
+        '''
+        coords - A list of values, one per dimension
+        '''
+
+        self.coords = coords
+        self.n = len(coords)
+
+    def __repr__(self):
+        return str(self.coords)
+
+class Cluster:
+    '''
+    A set of points and their centroid
+    '''
+
+    def __init__(self, points):
+        '''
+        points - A list of point objects
+        '''
+
+        if len(points) == 0: raise Exception("ILLEGAL: empty cluster")
+        # The points that belong to this cluster
+        self.points = points
+
+        # The dimensionality of the points in this cluster
+        self.n = points[0].n
+
+        # Assert that all points are of the same dimensionality
+        for p in points:
+            if p.n != self.n: raise Exception("ILLEGAL: wrong dimensions")
+
+        # Set up the initial centroid (this is usually based off one point)
+        self.centroid = self.calculateCentroid()
+
+    def __repr__(self):
+        '''
+        String representation of this object
+        '''
+        return str(self.points)
+
+    def update(self, points):
+        '''
+        Returns the distance between the previous centroid and the new after
+        recalculating and storing the new centroid.
+        '''
+        old_centroid = self.centroid
+        self.points = points
+        self.centroid = self.calculateCentroid()
+        shift = getDistance(old_centroid, self.centroid) 
+        return shift
+
+    def calculateCentroid(self):
+        '''
+        Finds a virtual center point for a group of n-dimensional points
+        '''
+        numPoints = len(self.points)
+        # Get a list of all coordinates in this cluster
+        coords = [p.coords for p in self.points]
+        # Reformat that so all x's are together, all y'z etc.
+        unzipped = zip(*coords)
+        # Calculate the mean for each dimension
+        centroid_coords = [math.fsum(dList)/numPoints for dList in unzipped]
+
+        return Point(centroid_coords)
+
+def kmeans(points, k, cutoff):
+
+    # Pick out k random points to use as our initial centroids
+    initial = random.sample(points, k)
+
+    # Create k clusters using those centroids
+    clusters = [Cluster([p]) for p in initial]
+
+    # Loop through the dataset until the clusters stabilize
+    loopCounter = 0
+    while True:
+        # Create a list of lists to hold the points in each cluster
+        lists = [ [] for c in clusters]
+        clusterCount = len(clusters)
+
+        # Start counting loops
+        loopCounter += 1
+        # For every point in the dataset ...
+        for p in points:
+            # Get the distance between that point and the centroid of the first
+            # cluster.
+            smallest_distance = getDistance(p, clusters[0].centroid)
+
+            # Set the cluster this point belongs to
+            clusterIndex = 0
+
+            # For the remainder of the clusters ...
+            for i in range(clusterCount - 1):
+                # calculate the distance of that point to each other cluster's
+                # centroid.
+                distance = getDistance(p, clusters[i+1].centroid)
+                # If it's closer to that cluster's centroid update what we
+                # think the smallest distance is, and set the point to belong
+                # to that cluster
+                if distance < smallest_distance:
+                    smallest_distance = distance
+                    clusterIndex = i+1
+            lists[clusterIndex].append(p)
+
+        # Set our biggest_shift to zero for this iteration
+        biggest_shift = 0.0
+
+        # As many times as there are clusters ...
+        for i in range(clusterCount):
+            # Calculate how far the centroid moved in this iteration
+            shift = clusters[i].update(lists[i])
+            # Keep track of the largest move from all cluster centroid updates
+            biggest_shift = max(biggest_shift, shift)
+
+        # If the centroids have stopped moving much, say we're done!
+        if biggest_shift < cutoff:
+            print "Converged after %s iterations" % loopCounter
+            break
+    return clusters
+
+def getDistance(a, b):
+    '''
+    Euclidean distance between two n-dimensional points.
+    Note: This can be very slow and does not scale well
+    '''
+    if a.n != b.n:
+        raise Exception("ILLEGAL: non comparable points")
+
+    ret = reduce(lambda x,y: x + pow((a.coords[y]-b.coords[y]), 2),range(a.n),0.0)
+    return math.sqrt(ret)
+
+def makeRandomPoint(n, lower, upper):
+    '''
+    Returns a Point object with n dimensions and values between lower and
+    upper in each of those dimensions
+    '''
+    p = Point([random.uniform(lower, upper) for i in range(n)])
+    return p
+
+def plotClusters(data):
+    '''
+    Use the plotly API to plot data from clusters.
+    
+    Gets a plot URL from plotly and then uses subprocess to 'open' that URL
+    from the command line. This should open your default web browser.
+    '''
+
+    # List of symbols each cluster will be displayed using    
+    symbols = ['circle', 'cross', 'triangle-up', 'square']
+
+    # Convert data into plotly format.
+    traceList = []
+    for i, c in enumerate(data):
+        data = []
+        for p in c.points:
+            data.append(p.coords)
+        # Data
+        trace = {}
+        trace['x'], trace['y'] = zip(*data)
+        trace['marker'] = {}
+        trace['marker']['symbol'] = symbols[i]
+        trace['name'] = "Cluster " + str(i)
+        traceList.append(trace)
+        # Centroid (A trace of length 1)
+        centroid = {}
+        centroid['x'] = [c.centroid.coords[0]]
+        centroid['y'] = [c.centroid.coords[1]]
+        centroid['marker'] = {}
+        centroid['marker']['symbol'] = symbols[i]
+        centroid['marker']['color'] = 'rgb(200,10,10)'
+        centroid['name'] = "Centroid " + str(i)
+        traceList.append(centroid)
+
+    # Log in to plotly
+    py = plotly(username=PLOTLY_USERNAME, key=PLOTLY_KEY)
+
+    # Style the chart
+    datastyle = {'mode':'markers',
+             'type':'scatter',
+             'marker':{'line':{'width':0},
+                       'size':12,
+                       'opacity':0.6,
+                       'color':'rgb(74, 134, 232)'}}
+
+    resp = py.plot(*traceList, style = datastyle)
+
+    # Display that plot in a browser
+    cmd = "open " + resp['url']
+    subprocess.call(cmd, shell=True)
+
+if __name__ == "__main__": 
+    main()