Skip to content

Instantly share code, notes, and snippets.

@mldeveloper01
Forked from iandanforth/kmeansExample.py
Created July 18, 2018 07:26
Show Gist options
  • Save mldeveloper01/032a94c6dc59b7b2d0689a9cb7cbbfda to your computer and use it in GitHub Desktop.
Save mldeveloper01/032a94c6dc59b7b2d0689a9cb7cbbfda to your computer and use it in GitHub Desktop.

Revisions

  1. @iandanforth iandanforth revised this gist Mar 30, 2018. 1 changed file with 148 additions and 64 deletions.
    212 changes: 148 additions & 64 deletions kmeansExample.py
    Original file line number Diff line number Diff line change
    @@ -1,11 +1,12 @@
    #############################################################################
    # Full Imports

    from __future__ import division
    import math
    import random

    """
    This is a pure Python implementation of the K-Means Clustering algorithmn. The
    This is a pure Python implementation of the K-means Clustering algorithmn. The
    original can be found here:
    http://pandoricweb.tumblr.com/post/8646701677/python-implementation-of-the-k-means-clustering
    @@ -49,28 +50,134 @@ def main():
    upper = 200

    # The K in k-means. How many clusters do we assume exist?
    # - Must be less than num_points
    num_clusters = 3

    # When do we say the optimization has 'converged' and stop updating clusters
    # When do we say the process has 'converged' and stop updating clusters?
    cutoff = 0.2

    # Generate some points to cluster
    # Note: If you want to use your own data, set points equal to it here.
    points = [
    makeRandomPoint(dimensions, lower, upper) for i in xrange(num_points)
    ]

    # Cluster those data!
    clusters = kmeans(points, num_clusters, cutoff)

    # Print our clusters
    for i, c in enumerate(clusters):
    iteration_count = 20
    best_clusters = iterative_kmeans(
    points,
    num_clusters,
    cutoff,
    iteration_count
    )

    # Print our best clusters
    for i, c in enumerate(best_clusters):
    for p in c.points:
    print " Cluster: ", i, "\t Point :", p

    # Display clusters using plotly for 2d data
    if dimensions in [2, 3] and plotly:
    print "Plotting points, launching browser ..."
    plotClusters(clusters, dimensions)
    plotClusters(best_clusters, dimensions)


    #############################################################################
    # K-means Methods

    def iterative_kmeans(points, num_clusters, cutoff, iteration_count):
    """
    K-means isn't guaranteed to get the best answer the first time. It might
    get stuck in a "local minimum."
    Here we run kmeans() *iteration_count* times to increase the chance of
    getting a good answer.
    Returns the best set of clusters found.
    """
    print "Running K-means %d times to find best clusters ..." % iteration_count
    candidate_clusters = []
    errors = []
    for _ in range(iteration_count):
    clusters = kmeans(points, num_clusters, cutoff)
    error = calculateError(clusters)
    candidate_clusters.append(clusters)
    errors.append(error)

    highest_error = max(errors)
    lowest_error = min(errors)
    print "Lowest error found: %.2f (Highest: %.2f)" % (
    lowest_error,
    highest_error
    )
    ind_of_lowest_error = errors.index(lowest_error)
    best_clusters = candidate_clusters[ind_of_lowest_error]

    return best_clusters

    def kmeans(points, k, cutoff):

    # Pick out k random points to use as our initial centroids
    initial_centroids = random.sample(points, k)

    # Create k clusters using those centroids
    # Note: Cluster takes lists, so we wrap each point in a list here.
    clusters = [Cluster([p]) for p in initial_centroids]

    # Loop through the dataset until the clusters stabilize
    loopCounter = 0
    while True:
    # Create a list of lists to hold the points in each cluster
    lists = [[] for _ in clusters]
    clusterCount = len(clusters)

    # Start counting loops
    loopCounter += 1
    # For every point in the dataset ...
    for p in points:
    # Get the distance between that point and the centroid of the first
    # cluster.
    smallest_distance = getDistance(p, clusters[0].centroid)

    # Set the cluster this point belongs to
    clusterIndex = 0

    # For the remainder of the clusters ...
    for i in range(1, clusterCount):
    # calculate the distance of that point to each other cluster's
    # centroid.
    distance = getDistance(p, clusters[i].centroid)
    # If it's closer to that cluster's centroid update what we
    # think the smallest distance is
    if distance < smallest_distance:
    smallest_distance = distance
    clusterIndex = i
    # After finding the cluster the smallest distance away
    # set the point to belong to that cluster
    lists[clusterIndex].append(p)

    # Set our biggest_shift to zero for this iteration
    biggest_shift = 0.0

    # For each cluster ...
    for i in range(clusterCount):
    # Calculate how far the centroid moved in this iteration
    shift = clusters[i].update(lists[i])
    # Keep track of the largest move from all cluster centroid updates
    biggest_shift = max(biggest_shift, shift)

    # Remove empty clusters
    clusters = [c for c in clusters if len(c.points) != 0]

    # If the centroids have stopped moving much, say we're done!
    if biggest_shift < cutoff:
    print "Converged after %s iterations" % loopCounter
    break
    return clusters


    #############################################################################
    # Classes

    class Point(object):
    '''
    @@ -130,6 +237,11 @@ def update(self, points):
    '''
    old_centroid = self.centroid
    self.points = points
    # Return early if we have no points, this cluster will get
    # cleaned up (removed) in the outer loop.
    if len(self.points) == 0:
    return 0

    self.centroid = self.calculateCentroid()
    shift = getDistance(old_centroid, self.centroid)
    return shift
    @@ -148,66 +260,23 @@ def calculateCentroid(self):

    return Point(centroid_coords)

    def kmeans(points, k, cutoff):

    # Pick out k random points to use as our initial centroids
    initial = random.sample(points, k)

    # Create k clusters using those centroids
    # Note: Cluster takes lists, so we wrap each point in a list here.
    clusters = [Cluster([p]) for p in initial]

    # Loop through the dataset until the clusters stabilize
    loopCounter = 0
    while True:
    # Create a list of lists to hold the points in each cluster
    lists = [[] for _ in clusters]
    clusterCount = len(clusters)

    # Start counting loops
    loopCounter += 1
    # For every point in the dataset ...
    for p in points:
    # Get the distance between that point and the centroid of the first
    # cluster.
    smallest_distance = getDistance(p, clusters[0].centroid)

    # Set the cluster this point belongs to
    clusterIndex = 0

    # For the remainder of the clusters ...
    for i in range(clusterCount - 1):
    # calculate the distance of that point to each other cluster's
    # centroid.
    distance = getDistance(p, clusters[i+1].centroid)
    # If it's closer to that cluster's centroid update what we
    # think the smallest distance is
    if distance < smallest_distance:
    smallest_distance = distance
    clusterIndex = i+1
    # After finding the cluster the smallest distance away
    # set the point to belong to that cluster
    lists[clusterIndex].append(p)

    # Set our biggest_shift to zero for this iteration
    biggest_shift = 0.0
    def getTotalDistance(self):
    '''
    Return the sum of all squared Euclidean distances between each point in
    the cluster and the cluster's centroid.
    '''
    sumOfDistances = 0.0
    for p in self.points:
    sumOfDistances += getDistance(p, self.centroid)

    # For each cluster ...
    for i in range(clusterCount):
    # Calculate how far the centroid moved in this iteration
    shift = clusters[i].update(lists[i])
    # Keep track of the largest move from all cluster centroid updates
    biggest_shift = max(biggest_shift, shift)
    return sumOfDistances

    # If the centroids have stopped moving much, say we're done!
    if biggest_shift < cutoff:
    print "Converged after %s iterations" % loopCounter
    break
    return clusters
    #############################################################################
    # Helper Methods

    def getDistance(a, b):
    '''
    Euclidean distance between two n-dimensional points.
    Squared Euclidean distance between two n-dimensional points.
    https://en.wikipedia.org/wiki/Euclidean_distance#n_dimensions
    Note: This can be very slow and does not scale well
    '''
    @@ -218,9 +287,8 @@ def getDistance(a, b):
    for i in range(a.n):
    squareDifference = pow((a.coords[i]-b.coords[i]), 2)
    accumulatedDifference += squareDifference
    distance = math.sqrt(accumulatedDifference)

    return distance
    return accumulatedDifference

    def makeRandomPoint(n, lower, upper):
    '''
    @@ -230,6 +298,22 @@ def makeRandomPoint(n, lower, upper):
    p = Point([random.uniform(lower, upper) for _ in range(n)])
    return p

    def calculateError(clusters):
    '''
    Return the average squared distance between each point and its cluster
    centroid.
    This is also known as the "distortion cost."
    '''
    accumulatedDistances = 0
    num_points = 0
    for cluster in clusters:
    num_points += len(cluster.points)
    accumulatedDistances += cluster.getTotalDistance()

    error = accumulatedDistances / num_points
    return error

    def plotClusters(data, dimensions):
    '''
    This uses the plotly offline mode to create a local HTML file.
  2. @iandanforth iandanforth revised this gist Jan 29, 2017. 1 changed file with 65 additions and 52 deletions.
    117 changes: 65 additions & 52 deletions kmeansExample.py
    Original file line number Diff line number Diff line change
    @@ -1,7 +1,6 @@
    #############################################################################
    # Full Imports

    import sys
    import math
    import random

    @@ -27,7 +26,6 @@
    This script uses an offline plotting mode and will store and open plots locally.
    To store and share plots online sign up for a plotly API key at https://plot.ly.
    """

    plotly = False
    @@ -38,102 +36,104 @@
    print "INFO: Plotly is not installed, plots will not be generated."

    def main():

    # How many points are in our dataset?
    num_points = 20

    # For each of those points how many dimensions do they have?
    # Note: Plotting will only work in two or three dimensions
    dimensions = 2

    # Bounds for the values of those points in each dimension
    lower = 0
    upper = 200

    # The K in k-means. How many clusters do we assume exist?
    num_clusters = 3

    # When do we say the optimization has 'converged' and stop updating clusters
    cutoff = 0.2

    # Generate some points to cluster
    points = [makeRandomPoint(dimensions, lower, upper) for i in xrange(num_points)]

    points = [
    makeRandomPoint(dimensions, lower, upper) for i in xrange(num_points)
    ]

    # Cluster those data!
    clusters = kmeans(points, num_clusters, cutoff)

    # Print our clusters
    for i, c in enumerate(clusters):
    for p in c.points:
    print " Cluster: ", i, "\t Point :", p

    # Display clusters using plotly for 2d data
    if dimensions in [2, 3] and plotly:
    print "Plotting points, launching browser ..."
    plotClusters(clusters, dimensions)

    class Point:
    class Point(object):
    '''
    A point in n dimensional space
    '''
    def __init__(self, coords):
    '''
    coords - A list of values, one per dimension
    '''

    self.coords = coords
    self.n = len(coords)

    def __repr__(self):
    return str(self.coords)

    class Cluster:
    class Cluster(object):
    '''
    A set of points and their centroid
    '''

    def __init__(self, points):
    '''
    points - A list of point objects
    '''
    if len(points) == 0:

    if len(points) == 0:
    raise Exception("ERROR: empty cluster")

    # The points that belong to this cluster
    self.points = points

    # The dimensionality of the points in this cluster
    self.n = points[0].n

    # Assert that all points are of the same dimensionality
    for p in points:
    if p.n != self.n:
    if p.n != self.n:
    raise Exception("ERROR: inconsistent dimensions")

    # Set up the initial centroid (this is usually based off one point)
    self.centroid = self.calculateCentroid()

    def __repr__(self):
    '''
    String representation of this object
    '''
    return str(self.points)

    def update(self, points):
    '''
    Returns the distance between the previous centroid and the new after
    recalculating and storing the new centroid.
    Note: Initially we expect centroids to shift around a lot and then gradually
    settle down.
    Note: Initially we expect centroids to shift around a lot and then
    gradually settle down.
    '''
    old_centroid = self.centroid
    self.points = points
    self.centroid = self.calculateCentroid()
    shift = getDistance(old_centroid, self.centroid)
    shift = getDistance(old_centroid, self.centroid)
    return shift

    def calculateCentroid(self):
    '''
    Finds a virtual center point for a group of n-dimensional points
    @@ -145,36 +145,36 @@ def calculateCentroid(self):
    unzipped = zip(*coords)
    # Calculate the mean for each dimension
    centroid_coords = [math.fsum(dList)/numPoints for dList in unzipped]

    return Point(centroid_coords)

    def kmeans(points, k, cutoff):

    # Pick out k random points to use as our initial centroids
    initial = random.sample(points, k)

    # Create k clusters using those centroids
    # Note: Cluster takes lists, so we wrap each point in a list here.
    clusters = [Cluster([p]) for p in initial]

    # Loop through the dataset until the clusters stabilize
    loopCounter = 0
    while True:
    # Create a list of lists to hold the points in each cluster
    lists = [[] for c in clusters]
    lists = [[] for _ in clusters]
    clusterCount = len(clusters)

    # Start counting loops
    loopCounter += 1
    # For every point in the dataset ...
    for p in points:
    # Get the distance between that point and the centroid of the first
    # cluster.
    smallest_distance = getDistance(p, clusters[0].centroid)

    # Set the cluster this point belongs to
    clusterIndex = 0

    # For the remainder of the clusters ...
    for i in range(clusterCount - 1):
    # calculate the distance of that point to each other cluster's
    @@ -188,17 +188,17 @@ def kmeans(points, k, cutoff):
    # After finding the cluster the smallest distance away
    # set the point to belong to that cluster
    lists[clusterIndex].append(p)

    # Set our biggest_shift to zero for this iteration
    biggest_shift = 0.0

    # For each cluster ...
    for i in range(clusterCount):
    # Calculate how far the centroid moved in this iteration
    shift = clusters[i].update(lists[i])
    # Keep track of the largest move from all cluster centroid updates
    biggest_shift = max(biggest_shift, shift)

    # If the centroids have stopped moving much, say we're done!
    if biggest_shift < cutoff:
    print "Converged after %s iterations" % loopCounter
    @@ -208,25 +208,31 @@ def kmeans(points, k, cutoff):
    def getDistance(a, b):
    '''
    Euclidean distance between two n-dimensional points.
    https://en.wikipedia.org/wiki/Euclidean_distance#n_dimensions
    Note: This can be very slow and does not scale well
    '''
    if a.n != b.n:
    raise Exception("ERROR: non comparable points")

    ret = reduce(lambda x,y: x + pow((a.coords[y]-b.coords[y]), 2), range(a.n), 0.0)
    return math.sqrt(ret)

    accumulatedDifference = 0.0
    for i in range(a.n):
    squareDifference = pow((a.coords[i]-b.coords[i]), 2)
    accumulatedDifference += squareDifference
    distance = math.sqrt(accumulatedDifference)

    return distance

    def makeRandomPoint(n, lower, upper):
    '''
    Returns a Point object with n dimensions and values between lower and
    upper in each of those dimensions
    '''
    p = Point([random.uniform(lower, upper) for i in range(n)])
    p = Point([random.uniform(lower, upper) for _ in range(n)])
    return p

    def plotClusters(data, dimensions):
    '''
    This uses the plotly offline mode to create a local HTML file.
    '''
    This uses the plotly offline mode to create a local HTML file.
    This should open your default web browser.
    '''
    if dimensions not in [2, 3]:
    @@ -261,12 +267,19 @@ def plotClusters(data, dimensions):
    centroid['name'] = "Centroid " + str(i)
    traceList.append(Scatter(**centroid))
    else:
    symbols = ["circle", "square", "diamond", "circle-open", "square-open", "diamond-open", "cross", "x"]
    symbols = [
    "circle",
    "square",
    "diamond",
    "circle-open",
    "square-open",
    "diamond-open",
    "cross", "x"
    ]
    symbol_count = len(symbols)
    symbol_index = i % len(symbols)
    if i > symbol_count:
    print "Warning: Not enough marker symbols to go around, expect repeats"
    # Convert our list of x,y,z's into an x list, a y list, and a z list.
    print "Warning: Not enough marker symbols to go around"
    # Convert our list of x,y,z's separate lists.
    trace['x'], trace['y'], trace['z'] = zip(*cluster_data)
    trace['mode'] = 'markers'
    trace['marker'] = {}
    @@ -291,5 +304,5 @@ def plotClusters(data, dimensions):
    "layout": Layout(title=title)
    })

    if __name__ == "__main__":
    main()
    if __name__ == "__main__":
    main()
  3. @iandanforth iandanforth revised this gist Jan 23, 2017. 1 changed file with 1 addition and 2 deletions.
    3 changes: 1 addition & 2 deletions kmeansExample.py
    Original file line number Diff line number Diff line change
    @@ -4,7 +4,6 @@
    import sys
    import math
    import random
    import subprocess

    """
    This is a pure Python implementation of the K-Means Clustering algorithmn. The
    @@ -44,7 +43,7 @@ def main():
    num_points = 20

    # For each of those points how many dimensions do they have?
    # Note: Plotting will only work in two dimensions
    # Note: Plotting will only work in two or three dimensions
    dimensions = 2

    # Bounds for the values of those points in each dimension
  4. @iandanforth iandanforth revised this gist Jan 22, 2017. 1 changed file with 96 additions and 70 deletions.
    166 changes: 96 additions & 70 deletions kmeansExample.py
    Original file line number Diff line number Diff line change
    @@ -19,29 +19,32 @@
    This script specifically avoids using numpy or other more obscure libraries. It
    is meant to be *clear* not fast.
    I have also added integration with the plot.ly plotting service. If you put in
    your (free) plot.ly credentials below, it will automatically plot the discovered
    clusters and their centroids.
    I have also added integration with the plot.ly plotting library. So you can see
    the clusters found by this algorithm. To install run:
    To use plotly integration you will need to:
    ```
    pip install plotly
    ```
    1. Get a username/key from www.plot.ly/api and enter them below
    2. Install the plotly module: pip install plotly
    This script uses an offline plotting mode and will store and open plots locally.
    To store and share plots online sign up for a plotly API key at https://plot.ly.
    """

    PLOTLY_USERNAME = None
    PLOTLY_KEY = None

    if PLOTLY_USERNAME:
    from plotly import plotly
    plotly = False
    try:
    import plotly
    from plotly.graph_objs import Scatter, Scatter3d, Layout
    except ImportError:
    print "INFO: Plotly is not installed, plots will not be generated."

    def main():

    # How many points are in our dataset?
    num_points = 10
    num_points = 20

    # For each of those points how many dimensions do they have?
    # Note: Plotting will only work in two dimensions
    dimensions = 2

    # Bounds for the values of those points in each dimension
    @@ -52,28 +55,27 @@ def main():
    num_clusters = 3

    # When do we say the optimization has 'converged' and stop updating clusters
    opt_cutoff = 0.5
    cutoff = 0.2

    # Generate some points
    # Generate some points to cluster
    points = [makeRandomPoint(dimensions, lower, upper) for i in xrange(num_points)]

    # Cluster those data!
    clusters = kmeans(points, num_clusters, opt_cutoff)
    clusters = kmeans(points, num_clusters, cutoff)

    # Print our clusters
    for i,c in enumerate(clusters):
    for i, c in enumerate(clusters):
    for p in c.points:
    print " Cluster: ", i, "\t Point :", p

    # Display clusters using plotly for 2d data
    # This uses the 'open' command on a URL and may only work on OSX.
    if dimensions == 2 and PLOTLY_USERNAME:
    if dimensions in [2, 3] and plotly:
    print "Plotting points, launching browser ..."
    plotClusters(clusters)
    plotClusters(clusters, dimensions)

    class Point:
    '''
    An point in n dimensional space
    A point in n dimensional space
    '''
    def __init__(self, coords):
    '''
    @@ -96,7 +98,9 @@ def __init__(self, points):
    points - A list of point objects
    '''

    if len(points) == 0: raise Exception("ILLEGAL: empty cluster")
    if len(points) == 0:
    raise Exception("ERROR: empty cluster")

    # The points that belong to this cluster
    self.points = points

    @@ -105,7 +109,8 @@ def __init__(self, points):

    # Assert that all points are of the same dimensionality
    for p in points:
    if p.n != self.n: raise Exception("ILLEGAL: wrong dimensions")
    if p.n != self.n:
    raise Exception("ERROR: inconsistent dimensions")

    # Set up the initial centroid (this is usually based off one point)
    self.centroid = self.calculateCentroid()
    @@ -120,6 +125,9 @@ def update(self, points):
    '''
    Returns the distance between the previous centroid and the new after
    recalculating and storing the new centroid.
    Note: Initially we expect centroids to shift around a lot and then gradually
    settle down.
    '''
    old_centroid = self.centroid
    self.points = points
    @@ -147,13 +155,14 @@ def kmeans(points, k, cutoff):
    initial = random.sample(points, k)

    # Create k clusters using those centroids
    # Note: Cluster takes lists, so we wrap each point in a list here.
    clusters = [Cluster([p]) for p in initial]

    # Loop through the dataset until the clusters stabilize
    loopCounter = 0
    while True:
    # Create a list of lists to hold the points in each cluster
    lists = [ [] for c in clusters]
    lists = [[] for c in clusters]
    clusterCount = len(clusters)

    # Start counting loops
    @@ -173,17 +182,18 @@ def kmeans(points, k, cutoff):
    # centroid.
    distance = getDistance(p, clusters[i+1].centroid)
    # If it's closer to that cluster's centroid update what we
    # think the smallest distance is, and set the point to belong
    # to that cluster
    # think the smallest distance is
    if distance < smallest_distance:
    smallest_distance = distance
    clusterIndex = i+1
    # After finding the cluster the smallest distance away
    # set the point to belong to that cluster
    lists[clusterIndex].append(p)

    # Set our biggest_shift to zero for this iteration
    biggest_shift = 0.0

    # As many times as there are clusters ...
    # For each cluster ...
    for i in range(clusterCount):
    # Calculate how far the centroid moved in this iteration
    shift = clusters[i].update(lists[i])
    @@ -202,9 +212,9 @@ def getDistance(a, b):
    Note: This can be very slow and does not scale well
    '''
    if a.n != b.n:
    raise Exception("ILLEGAL: non comparable points")
    raise Exception("ERROR: non comparable points")

    ret = reduce(lambda x,y: x + pow((a.coords[y]-b.coords[y]), 2),range(a.n),0.0)
    ret = reduce(lambda x,y: x + pow((a.coords[y]-b.coords[y]), 2), range(a.n), 0.0)
    return math.sqrt(ret)

    def makeRandomPoint(n, lower, upper):
    @@ -215,56 +225,72 @@ def makeRandomPoint(n, lower, upper):
    p = Point([random.uniform(lower, upper) for i in range(n)])
    return p

    def plotClusters(data):
    def plotClusters(data, dimensions):
    '''
    This uses the plotly offline mode to create a local HTML file.
    This should open your default web browser.
    '''
    Use the plotly API to plot data from clusters.
    Gets a plot URL from plotly and then uses subprocess to 'open' that URL
    from the command line. This should open your default web browser.
    '''

    # List of symbols each cluster will be displayed using
    symbols = ['circle', 'cross', 'triangle-up', 'square']
    if dimensions not in [2, 3]:
    raise Exception("Plots are only available for 2 and 3 dimensional data")

    # Convert data into plotly format.
    traceList = []
    for i, c in enumerate(data):
    data = []
    for p in c.points:
    data.append(p.coords)
    # Data
    # Get a list of x,y coordinates for the points in this cluster.
    cluster_data = []
    for point in c.points:
    cluster_data.append(point.coords)

    trace = {}
    trace['x'], trace['y'] = zip(*data)
    trace['marker'] = {}
    trace['marker']['symbol'] = symbols[i]
    trace['name'] = "Cluster " + str(i)
    traceList.append(trace)
    # Centroid (A trace of length 1)
    centroid = {}
    centroid['x'] = [c.centroid.coords[0]]
    centroid['y'] = [c.centroid.coords[1]]
    centroid['marker'] = {}
    centroid['marker']['symbol'] = symbols[i]
    centroid['marker']['color'] = 'rgb(200,10,10)'
    centroid['name'] = "Centroid " + str(i)
    traceList.append(centroid)

    # Log in to plotly
    py = plotly(username=PLOTLY_USERNAME, key=PLOTLY_KEY)
    if dimensions == 2:
    # Convert our list of x,y's into an x list and a y list.
    trace['x'], trace['y'] = zip(*cluster_data)
    trace['mode'] = 'markers'
    trace['marker'] = {}
    trace['marker']['symbol'] = i
    trace['marker']['size'] = 12
    trace['name'] = "Cluster " + str(i)
    traceList.append(Scatter(**trace))
    # Centroid (A trace of length 1)
    centroid['x'] = [c.centroid.coords[0]]
    centroid['y'] = [c.centroid.coords[1]]
    centroid['mode'] = 'markers'
    centroid['marker'] = {}
    centroid['marker']['symbol'] = i
    centroid['marker']['color'] = 'rgb(200,10,10)'
    centroid['name'] = "Centroid " + str(i)
    traceList.append(Scatter(**centroid))
    else:
    symbols = ["circle", "square", "diamond", "circle-open", "square-open", "diamond-open", "cross", "x"]
    symbol_count = len(symbols)
    symbol_index = i % len(symbols)
    if i > symbol_count:
    print "Warning: Not enough marker symbols to go around, expect repeats"
    # Convert our list of x,y,z's into an x list, a y list, and a z list.
    trace['x'], trace['y'], trace['z'] = zip(*cluster_data)
    trace['mode'] = 'markers'
    trace['marker'] = {}
    trace['marker']['symbol'] = symbols[i]
    trace['marker']['size'] = 12
    trace['name'] = "Cluster " + str(i)
    traceList.append(Scatter3d(**trace))
    # Centroid (A trace of length 1)
    centroid['x'] = [c.centroid.coords[0]]
    centroid['y'] = [c.centroid.coords[1]]
    centroid['z'] = [c.centroid.coords[2]]
    centroid['mode'] = 'markers'
    centroid['marker'] = {}
    centroid['marker']['symbol'] = symbols[i]
    centroid['marker']['color'] = 'rgb(200,10,10)'
    centroid['name'] = "Centroid " + str(i)
    traceList.append(Scatter3d(**centroid))

    # Style the chart
    datastyle = {'mode':'markers',
    'type':'scatter',
    'marker':{'line':{'width':0},
    'size':12,
    'opacity':0.6,
    'color':'rgb(74, 134, 232)'}}

    resp = py.plot(*traceList, style = datastyle)

    # Display that plot in a browser
    cmd = "open " + resp['url']
    subprocess.call(cmd, shell=True)
    title = "K-means clustering with %s clusters" % str(len(data))
    plotly.offline.plot({
    "data": traceList,
    "layout": Layout(title=title)
    })

    if __name__ == "__main__":
    main()
  5. @iandanforth iandanforth revised this gist Jun 25, 2013. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion kmeansExample.py
    Original file line number Diff line number Diff line change
    @@ -68,7 +68,7 @@ def main():
    # Display clusters using plotly for 2d data
    # This uses the 'open' command on a URL and may only work on OSX.
    if dimensions == 2 and PLOTLY_USERNAME:
    "Plotting points, launching browser ..."
    print "Plotting points, launching browser ..."
    plotClusters(clusters)

    class Point:
  6. @iandanforth iandanforth created this gist Jun 25, 2013.
    270 changes: 270 additions & 0 deletions kmeansExample.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,270 @@
    #############################################################################
    # Full Imports

    import sys
    import math
    import random
    import subprocess

    """
    This is a pure Python implementation of the K-Means Clustering algorithmn. The
    original can be found here:
    http://pandoricweb.tumblr.com/post/8646701677/python-implementation-of-the-k-means-clustering
    I have refactored the code and added comments to aid in readability.
    After reading through this code you should understand clearly how K-means works.
    If not, feel free to email me with questions and suggestions. (iandanforth at
    gmail)
    This script specifically avoids using numpy or other more obscure libraries. It
    is meant to be *clear* not fast.
    I have also added integration with the plot.ly plotting service. If you put in
    your (free) plot.ly credentials below, it will automatically plot the discovered
    clusters and their centroids.
    To use plotly integration you will need to:
    1. Get a username/key from www.plot.ly/api and enter them below
    2. Install the plotly module: pip install plotly
    """

    PLOTLY_USERNAME = None
    PLOTLY_KEY = None

    if PLOTLY_USERNAME:
    from plotly import plotly

    def main():

    # How many points are in our dataset?
    num_points = 10

    # For each of those points how many dimensions do they have?
    dimensions = 2

    # Bounds for the values of those points in each dimension
    lower = 0
    upper = 200

    # The K in k-means. How many clusters do we assume exist?
    num_clusters = 3

    # When do we say the optimization has 'converged' and stop updating clusters
    opt_cutoff = 0.5

    # Generate some points
    points = [makeRandomPoint(dimensions, lower, upper) for i in xrange(num_points)]

    # Cluster those data!
    clusters = kmeans(points, num_clusters, opt_cutoff)

    # Print our clusters
    for i,c in enumerate(clusters):
    for p in c.points:
    print " Cluster: ", i, "\t Point :", p

    # Display clusters using plotly for 2d data
    # This uses the 'open' command on a URL and may only work on OSX.
    if dimensions == 2 and PLOTLY_USERNAME:
    "Plotting points, launching browser ..."
    plotClusters(clusters)

    class Point:
    '''
    An point in n dimensional space
    '''
    def __init__(self, coords):
    '''
    coords - A list of values, one per dimension
    '''

    self.coords = coords
    self.n = len(coords)

    def __repr__(self):
    return str(self.coords)

    class Cluster:
    '''
    A set of points and their centroid
    '''

    def __init__(self, points):
    '''
    points - A list of point objects
    '''

    if len(points) == 0: raise Exception("ILLEGAL: empty cluster")
    # The points that belong to this cluster
    self.points = points

    # The dimensionality of the points in this cluster
    self.n = points[0].n

    # Assert that all points are of the same dimensionality
    for p in points:
    if p.n != self.n: raise Exception("ILLEGAL: wrong dimensions")

    # Set up the initial centroid (this is usually based off one point)
    self.centroid = self.calculateCentroid()

    def __repr__(self):
    '''
    String representation of this object
    '''
    return str(self.points)

    def update(self, points):
    '''
    Returns the distance between the previous centroid and the new after
    recalculating and storing the new centroid.
    '''
    old_centroid = self.centroid
    self.points = points
    self.centroid = self.calculateCentroid()
    shift = getDistance(old_centroid, self.centroid)
    return shift

    def calculateCentroid(self):
    '''
    Finds a virtual center point for a group of n-dimensional points
    '''
    numPoints = len(self.points)
    # Get a list of all coordinates in this cluster
    coords = [p.coords for p in self.points]
    # Reformat that so all x's are together, all y'z etc.
    unzipped = zip(*coords)
    # Calculate the mean for each dimension
    centroid_coords = [math.fsum(dList)/numPoints for dList in unzipped]

    return Point(centroid_coords)

    def kmeans(points, k, cutoff):

    # Pick out k random points to use as our initial centroids
    initial = random.sample(points, k)

    # Create k clusters using those centroids
    clusters = [Cluster([p]) for p in initial]

    # Loop through the dataset until the clusters stabilize
    loopCounter = 0
    while True:
    # Create a list of lists to hold the points in each cluster
    lists = [ [] for c in clusters]
    clusterCount = len(clusters)

    # Start counting loops
    loopCounter += 1
    # For every point in the dataset ...
    for p in points:
    # Get the distance between that point and the centroid of the first
    # cluster.
    smallest_distance = getDistance(p, clusters[0].centroid)

    # Set the cluster this point belongs to
    clusterIndex = 0

    # For the remainder of the clusters ...
    for i in range(clusterCount - 1):
    # calculate the distance of that point to each other cluster's
    # centroid.
    distance = getDistance(p, clusters[i+1].centroid)
    # If it's closer to that cluster's centroid update what we
    # think the smallest distance is, and set the point to belong
    # to that cluster
    if distance < smallest_distance:
    smallest_distance = distance
    clusterIndex = i+1
    lists[clusterIndex].append(p)

    # Set our biggest_shift to zero for this iteration
    biggest_shift = 0.0

    # As many times as there are clusters ...
    for i in range(clusterCount):
    # Calculate how far the centroid moved in this iteration
    shift = clusters[i].update(lists[i])
    # Keep track of the largest move from all cluster centroid updates
    biggest_shift = max(biggest_shift, shift)

    # If the centroids have stopped moving much, say we're done!
    if biggest_shift < cutoff:
    print "Converged after %s iterations" % loopCounter
    break
    return clusters

    def getDistance(a, b):
    '''
    Euclidean distance between two n-dimensional points.
    Note: This can be very slow and does not scale well
    '''
    if a.n != b.n:
    raise Exception("ILLEGAL: non comparable points")

    ret = reduce(lambda x,y: x + pow((a.coords[y]-b.coords[y]), 2),range(a.n),0.0)
    return math.sqrt(ret)

    def makeRandomPoint(n, lower, upper):
    '''
    Returns a Point object with n dimensions and values between lower and
    upper in each of those dimensions
    '''
    p = Point([random.uniform(lower, upper) for i in range(n)])
    return p

    def plotClusters(data):
    '''
    Use the plotly API to plot data from clusters.
    Gets a plot URL from plotly and then uses subprocess to 'open' that URL
    from the command line. This should open your default web browser.
    '''

    # List of symbols each cluster will be displayed using
    symbols = ['circle', 'cross', 'triangle-up', 'square']

    # Convert data into plotly format.
    traceList = []
    for i, c in enumerate(data):
    data = []
    for p in c.points:
    data.append(p.coords)
    # Data
    trace = {}
    trace['x'], trace['y'] = zip(*data)
    trace['marker'] = {}
    trace['marker']['symbol'] = symbols[i]
    trace['name'] = "Cluster " + str(i)
    traceList.append(trace)
    # Centroid (A trace of length 1)
    centroid = {}
    centroid['x'] = [c.centroid.coords[0]]
    centroid['y'] = [c.centroid.coords[1]]
    centroid['marker'] = {}
    centroid['marker']['symbol'] = symbols[i]
    centroid['marker']['color'] = 'rgb(200,10,10)'
    centroid['name'] = "Centroid " + str(i)
    traceList.append(centroid)

    # Log in to plotly
    py = plotly(username=PLOTLY_USERNAME, key=PLOTLY_KEY)

    # Style the chart
    datastyle = {'mode':'markers',
    'type':'scatter',
    'marker':{'line':{'width':0},
    'size':12,
    'opacity':0.6,
    'color':'rgb(74, 134, 232)'}}

    resp = py.plot(*traceList, style = datastyle)

    # Display that plot in a browser
    cmd = "open " + resp['url']
    subprocess.call(cmd, shell=True)

    if __name__ == "__main__":
    main()