Skip to content

Instantly share code, notes, and snippets.

@dave-andersen
Last active September 1, 2022 11:15
Show Gist options
  • Select an option

  • Save dave-andersen/265e68a5e879b5540ebc to your computer and use it in GitHub Desktop.

Select an option

Save dave-andersen/265e68a5e879b5540ebc to your computer and use it in GitHub Desktop.

Revisions

  1. dave-andersen revised this gist Apr 25, 2016. 1 changed file with 5 additions and 3 deletions.
    8 changes: 5 additions & 3 deletions kmeans.py
    Original file line number Diff line number Diff line change
    @@ -15,9 +15,6 @@
    # centroids. In the real world, do this better.
    centroids = tf.Variable(tf.slice(points.initialized_value(), [0,0], [K,2]))

    sess = tf.Session()
    sess.run(tf.initialize_all_variables())

    # Replicate to N copies of each centroid and K copies of each
    # point, then subtract and compute the sum of squared distances.
    rep_centroids = tf.reshape(tf.tile(centroids, [N, 1]), [N, K, 2])
    @@ -44,6 +41,11 @@ def bucket_mean(data, bucket_ids, num_buckets):
    centroids.assign(means),
    cluster_assignments.assign(best_centroids))

    init = tf.initialize_all_variables()

    sess = tf.Session()
    sess.run(init)

    changed = True
    iters = 0

  2. dave-andersen created this gist Nov 18, 2015.
    59 changes: 59 additions & 0 deletions kmeans.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,59 @@
    import tensorflow as tf
    import numpy as np
    import time

    # Problem size: N random 2-D points grouped into K clusters, with a cap
    # on the number of refinement passes.
    N = 10000
    K = 4
    MAX_ITERS = 1000

    # The timer starts before graph construction, so the reported duration
    # covers building the graph as well as running it.
    start = time.time()

    # N points drawn by tf.random_uniform, plus one int64 cluster id per
    # point (all ids start at zero).
    initial_points = tf.random_uniform([N, 2])
    points = tf.Variable(initial_points)
    initial_assignments = tf.zeros([N], dtype=tf.int64)
    cluster_assignments = tf.Variable(initial_assignments)

    # Naive seeding: the first K points become the starting centroids.
    # Real code should choose the seeds more carefully.
    first_k_points = tf.slice(points.initialized_value(), [0, 0], [K, 2])
    centroids = tf.Variable(first_k_points)

    sess = tf.Session()
    sess.run(tf.initialize_all_variables())

    # Build two [N, K, 2] tensors -- one repeating the centroids for every
    # point, one repeating every point for each centroid -- so that a single
    # subtraction yields every point-to-centroid offset.
    tiled_centroids = tf.tile(centroids, [N, 1])
    rep_centroids = tf.reshape(tiled_centroids, [N, K, 2])
    tiled_points = tf.tile(points, [1, K])
    rep_points = tf.reshape(tiled_points, [N, K, 2])

    # Sum of squared coordinate differences over the last axis: [N, K].
    offsets = rep_points - rep_centroids
    sum_squares = tf.reduce_sum(tf.square(offsets), reduction_indices=2)

    # For every point, the index of the centroid with the smallest distance.
    best_centroids = tf.argmin(sum_squares, 1)

    # True when at least one point's best centroid differs from the value
    # currently stored in cluster_assignments.
    reassigned = tf.not_equal(best_centroids, cluster_assignments)
    did_assignments_change = tf.reduce_any(reassigned)

    def bucket_mean(data, bucket_ids, num_buckets):
        """Return the per-bucket mean of the rows of `data`.

        Args:
            data: tensor whose rows are averaged.
            bucket_ids: tensor giving the bucket index of each row of `data`.
            num_buckets: number of buckets (rows) in the result.

        Returns:
            A tensor with `num_buckets` rows; row i is the mean of the rows of
            `data` whose bucket id is i. A bucket that received no rows yields
            zeros instead of NaN.
        """
        total = tf.unsorted_segment_sum(data, bucket_ids, num_buckets)
        count = tf.unsorted_segment_sum(tf.ones_like(data), bucket_ids, num_buckets)
        # An empty bucket has total == 0 and count == 0; clamp the divisor to
        # 1 so the result is 0 rather than the NaN produced by 0 / 0.
        safe_count = tf.maximum(count, tf.ones_like(count))
        return total / safe_count

    means = bucket_mean(points, best_centroids, K)

    # Do not overwrite the stored cluster assignments until after computing
    # whether they have changed -- hence the control dependency on
    # did_assignments_change.
    with tf.control_dependencies([did_assignments_change]):
        do_updates = tf.group(
            centroids.assign(means),
            cluster_assignments.assign(best_centroids))

    changed = True
    iters = 0

    # Repeat assignment/update passes until no point switches cluster or the
    # iteration cap is reached.
    while changed and iters < MAX_ITERS:
        iters += 1
        [changed, _] = sess.run([did_assignments_change, do_updates])

    [centers, assignments] = sess.run([centroids, cluster_assignments])
    end = time.time()
    # Single-argument print() calls emit the same text under Python 2 and 3,
    # unlike the print statement, which is a syntax error on Python 3.
    print("Found in %.2f seconds %d iterations" % (end - start, iters))
    print("Centroids:")
    print(centers)
    print("Cluster assignments: %s" % (assignments,))