Skip to content

Instantly share code, notes, and snippets.

@swayson
Forked from mblondel/kernel_kmeans.py
Last active August 29, 2015 14:14
Show Gist options
  • Select an option

  • Save swayson/d2cedceef8a85baf859a to your computer and use it in GitHub Desktop.

Select an option

Save swayson/d2cedceef8a85baf859a to your computer and use it in GitHub Desktop.

Revisions

  1. @mblondel mblondel revised this gist Nov 14, 2013. 1 changed file with 4 additions and 0 deletions.
    4 changes: 4 additions & 0 deletions kernel_kmeans.py
    Original file line number Diff line number Diff line change
    @@ -34,6 +34,10 @@ def __init__(self, n_clusters=3, max_iter=50, tol=1e-3, random_state=None,
    self.coef0 = coef0
    self.kernel_params = kernel_params
    self.verbose = verbose

    @property
    def _pairwise(self):
    return self.kernel == "precomputed"

    def _get_kernel(self, X, Y=None):
    if callable(self.kernel):
  2. @mblondel mblondel revised this gist Nov 13, 2013. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions kernel_kmeans.py
    Original file line number Diff line number Diff line change
    @@ -66,10 +66,10 @@ def fit(self, X, y=None, sample_weight=None):
    labels_old = self.labels_
    self.labels_ = dist.argmin(axis=1)

    # Compute the number of samples whose cluster changed
    # Compute the number of samples whose cluster did not change
    # since last iteration.
    n_diff = np.sum((self.labels_ - labels_old) == 0)
    if 1 - float(n_diff) / n_samples < self.tol:
    n_same = np.sum((self.labels_ - labels_old) == 0)
    if 1 - float(n_same) / n_samples < self.tol:
    if self.verbose:
    print "Converged at iteration", it + 1
    break
  3. @mblondel mblondel revised this gist Nov 13, 2013. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions kernel_kmeans.py
    Original file line number Diff line number Diff line change
    @@ -79,6 +79,8 @@ def fit(self, X, y=None, sample_weight=None):
    return self

    def _compute_dist(self, K, dist, within_distances, update_within):
    """Compute a n_samples x n_clusters distance matrix using the
    kernel trick."""
    sw = self.sample_weight_

    for j in xrange(self.n_clusters):
  4. @mblondel mblondel revised this gist Nov 13, 2013. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions kernel_kmeans.py
    Original file line number Diff line number Diff line change
    @@ -66,6 +66,8 @@ def fit(self, X, y=None, sample_weight=None):
    labels_old = self.labels_
    self.labels_ = dist.argmin(axis=1)

    # Compute the number of samples whose cluster changed
    # since last iteration.
    n_diff = np.sum((self.labels_ - labels_old) == 0)
    if 1 - float(n_diff) / n_samples < self.tol:
    if self.verbose:
  5. @mblondel mblondel revised this gist Nov 13, 2013. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion kernel_kmeans.py
    Original file line number Diff line number Diff line change
    @@ -89,7 +89,7 @@ def _compute_dist(self, K, dist, within_distances, update_within):
    denomsq = denom * denom

    if update_within:
    KK = K[mask][:, mask]
    KK = K[mask][:, mask] # K[mask, mask] does not work.
    dist_j = np.sum(np.outer(sw[mask], sw[mask]) * KK / denomsq)
    within_distances[j] = dist_j
    dist[:, j] += dist_j
  6. @mblondel mblondel revised this gist Nov 13, 2013. 1 changed file with 9 additions and 0 deletions.
    9 changes: 9 additions & 0 deletions kernel_kmeans.py
    Original file line number Diff line number Diff line change
    @@ -11,6 +11,15 @@


    class KernelKMeans(BaseEstimator, ClusterMixin):
    """
    Kernel K-means
    Reference
    ---------
    Kernel k-means, Spectral Clustering and Normalized Cuts.
    Inderjit S. Dhillon, Yuqiang Guan, Brian Kulis.
    KDD 2004.
    """

    def __init__(self, n_clusters=3, max_iter=50, tol=1e-3, random_state=None,
    kernel="linear", gamma=None, degree=3, coef0=1,
  7. @mblondel mblondel revised this gist Aug 14, 2013. 1 changed file with 0 additions and 2 deletions.
    2 changes: 0 additions & 2 deletions kernel_kmeans.py
    Original file line number Diff line number Diff line change
    @@ -69,15 +69,13 @@ def fit(self, X, y=None, sample_weight=None):

    def _compute_dist(self, K, dist, within_distances, update_within):
    sw = self.sample_weight_
    ind = np.arange(sw.shape[0])

    for j in xrange(self.n_clusters):
    mask = self.labels_ == j

    if np.sum(mask) == 0:
    raise ValueError("Empty cluster found, try smaller n_cluster.")

    cluster_ind = ind[mask]
    denom = sw[mask].sum()
    denomsq = denom * denom

  8. @mblondel mblondel created this gist Aug 14, 2013.
    108 changes: 108 additions & 0 deletions kernel_kmeans.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,108 @@
    """Kernel K-means"""

    # Author: Mathieu Blondel <[email protected]>
    # License: BSD 3 clause

    import numpy as np

    from sklearn.base import BaseEstimator, ClusterMixin
    from sklearn.metrics.pairwise import pairwise_kernels
    from sklearn.utils import check_random_state


    class KernelKMeans(BaseEstimator, ClusterMixin):

    def __init__(self, n_clusters=3, max_iter=50, tol=1e-3, random_state=None,
    kernel="linear", gamma=None, degree=3, coef0=1,
    kernel_params=None, verbose=0):
    self.n_clusters = n_clusters
    self.max_iter = max_iter
    self.tol = tol
    self.random_state = random_state
    self.kernel = kernel
    self.gamma = gamma
    self.degree = degree
    self.coef0 = coef0
    self.kernel_params = kernel_params
    self.verbose = verbose

    def _get_kernel(self, X, Y=None):
    if callable(self.kernel):
    params = self.kernel_params or {}
    else:
    params = {"gamma": self.gamma,
    "degree": self.degree,
    "coef0": self.coef0}
    return pairwise_kernels(X, Y, metric=self.kernel,
    filter_params=True, **params)

    def fit(self, X, y=None, sample_weight=None):
    n_samples = X.shape[0]

    K = self._get_kernel(X)

    sw = sample_weight if sample_weight else np.ones(n_samples)
    self.sample_weight_ = sw

    rs = check_random_state(self.random_state)
    self.labels_ = rs.randint(self.n_clusters, size=n_samples)

    dist = np.zeros((n_samples, self.n_clusters))
    self.within_distances_ = np.zeros(self.n_clusters)

    for it in xrange(self.max_iter):
    dist.fill(0)
    self._compute_dist(K, dist, self.within_distances_,
    update_within=True)
    labels_old = self.labels_
    self.labels_ = dist.argmin(axis=1)

    n_diff = np.sum((self.labels_ - labels_old) == 0)
    if 1 - float(n_diff) / n_samples < self.tol:
    if self.verbose:
    print "Converged at iteration", it + 1
    break

    self.X_fit_ = X

    return self

    def _compute_dist(self, K, dist, within_distances, update_within):
    sw = self.sample_weight_
    ind = np.arange(sw.shape[0])

    for j in xrange(self.n_clusters):
    mask = self.labels_ == j

    if np.sum(mask) == 0:
    raise ValueError("Empty cluster found, try smaller n_cluster.")

    cluster_ind = ind[mask]
    denom = sw[mask].sum()
    denomsq = denom * denom

    if update_within:
    KK = K[mask][:, mask]
    dist_j = np.sum(np.outer(sw[mask], sw[mask]) * KK / denomsq)
    within_distances[j] = dist_j
    dist[:, j] += dist_j
    else:
    dist[:, j] += within_distances[j]

    dist[:, j] -= 2 * np.sum(sw[mask] * K[:, mask], axis=1) / denom

    def predict(self, X):
    K = self._get_kernel(X, self.X_fit_)
    n_samples = X.shape[0]
    dist = np.zeros((n_samples, self.n_clusters))
    self._compute_dist(K, dist, self.within_distances_,
    update_within=False)
    return dist.argmin(axis=1)

    if __name__ == '__main__':
    from sklearn.datasets import make_blobs
    X, y = make_blobs(n_samples=1000, centers=5, random_state=0)

    km = KernelKMeans(n_clusters=5, max_iter=100, random_state=0, verbose=1)
    print km.fit_predict(X)[:10]
    print km.predict(X[:10])