Skip to content

Instantly share code, notes, and snippets.

@edarin
Forked from afrendeiro/kde_2d_weighted.py
Created December 5, 2016 11:27
Show Gist options
  • Select an option

  • Save edarin/2bd6c63b80cd157f05d99536d8f2cef8 to your computer and use it in GitHub Desktop.

Select an option

Save edarin/2bd6c63b80cd157f05d99536d8f2cef8 to your computer and use it in GitHub Desktop.

Revisions

  1. @afrendeiro afrendeiro revised this gist Jan 13, 2016. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions kde_2d_weighted.py
    Original file line number Diff line number Diff line change
    @@ -2,6 +2,7 @@
    import matplotlib.pyplot as plt


    # class from here: http://nbviewer.ipython.org/gist/tillahoffmann/f844bce2ec264c1c8cb5
    class gaussian_kde(object):
    """Representation of a kernel-density estimate using Gaussian kernels.
  2. @afrendeiro afrendeiro revised this gist Jan 13, 2016. 1 changed file with 2 additions and 4 deletions.
    6 changes: 2 additions & 4 deletions kde_2d_weighted.py
    Original file line number Diff line number Diff line change
    @@ -294,8 +294,7 @@ def _compute_covariance(self):
    obs = np.reshape(np.random.standard_exponential(1000), (100, 10))[:, 0:2]
    xy = np.transpose(obs)

    # Let's add a lot of weight to values with y > 3
    weights = [0.999 if i > 3 else 0.1 for i in obs[:, 1]]
    weights = [0.9 if i > 3 else 0.1 for i in obs[:, 1]]

    # kde space
    xmin, xmax = (xy[0].min(), xy[0].max())
    @@ -318,17 +317,16 @@ def _compute_covariance(self):

    # PLot
    fig, axis = plt.subplots(2)
    bounds = [xy[0].min(), xy[1].min(), xy[0].max(), xy[1].max()]

    # plot unweighted
    # axis[0].scatter(obs[:, 0], obs[:, 1])
    axis[0].scatter(obs[:, 0], obs[:, 1])
    bounds = [xy[0].min(), xy[1].min(), xy[0].max(), xy[1].max()]
    cax = axis[0].imshow(np.rot90(zz1.T), cmap=plt.cm.Reds, extent=[bounds[0], bounds[2], bounds[1], bounds[3]])#, alpha=0.5)
    axis[0].legend()

    # plot weighted
    axis[1].scatter(obs[:, 0], obs[:, 1])
    bounds = axis[1].dataLim.get_points().ravel()
    cax = axis[1].imshow(np.rot90(zz2.T), cmap=plt.cm.Reds, extent=[bounds[0], bounds[2], bounds[1], bounds[3]])#, alpha=0.5)
    axis[1].legend()

  3. @afrendeiro afrendeiro created this gist Jan 13, 2016.
    338 changes: 338 additions & 0 deletions kde_2d_weighted.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,338 @@
    import numpy as np
    import matplotlib.pyplot as plt


    class gaussian_kde(object):
    """Representation of a kernel-density estimate using Gaussian kernels.
    Kernel density estimation is a way to estimate the probability density
    function (PDF) of a random variable in a non-parametric way.
    `gaussian_kde` works for both uni-variate and multi-variate data. It
    includes automatic bandwidth determination. The estimation works best for
    a unimodal distribution; bimodal or multi-modal distributions tend to be
    oversmoothed.
    Parameters
    ----------
    dataset : array_like
    Datapoints to estimate from. In case of univariate data this is a 1-D
    array, otherwise a 2-D array with shape (# of dims, # of data).
    bw_method : str, scalar or callable, optional
    The method used to calculate the estimator bandwidth. This can be
    'scott', 'silverman', a scalar constant or a callable. If a scalar,
    this will be used directly as `kde.factor`. If a callable, it should
    take a `gaussian_kde` instance as only parameter and return a scalar.
    If None (default), 'scott' is used. See Notes for more details.
    weights : array_like, shape (n, ), optional, default: None
    An array of weights, of the same shape as `x`. Each value in `x`
    only contributes its associated weight towards the bin count
    (instead of 1).
    Attributes
    ----------
    dataset : ndarray
    The dataset with which `gaussian_kde` was initialized.
    d : int
    Number of dimensions.
    n : int
    Number of datapoints.
    neff : float
    Effective sample size using Kish's approximation.
    factor : float
    The bandwidth factor, obtained from `kde.covariance_factor`, with which
    the covariance matrix is multiplied.
    covariance : ndarray
    The covariance matrix of `dataset`, scaled by the calculated bandwidth
    (`kde.factor`).
    inv_cov : ndarray
    The inverse of `covariance`.
    Methods
    -------
    kde.evaluate(points) : ndarray
    Evaluate the estimated pdf on a provided set of points.
    kde(points) : ndarray
    Same as kde.evaluate(points)
    kde.pdf(points) : ndarray
    Alias for ``kde.evaluate(points)``.
    kde.set_bandwidth(bw_method='scott') : None
    Computes the bandwidth, i.e. the coefficient that multiplies the data
    covariance matrix to obtain the kernel covariance matrix.
    .. versionadded:: 0.11.0
    kde.covariance_factor : float
    Computes the coefficient (`kde.factor`) that multiplies the data
    covariance matrix to obtain the kernel covariance matrix.
    The default is `scotts_factor`. A subclass can overwrite this method
    to provide a different method, or set it through a call to
    `kde.set_bandwidth`.
    Notes
    -----
    Bandwidth selection strongly influences the estimate obtained from the KDE
    (much more so than the actual shape of the kernel). Bandwidth selection
    can be done by a "rule of thumb", by cross-validation, by "plug-in
    methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
    uses a rule of thumb, the default is Scott's Rule.
    Scott's Rule [1]_, implemented as `scotts_factor`, is::
    n**(-1./(d+4)),
    with ``n`` the number of data points and ``d`` the number of dimensions.
    Silverman's Rule [2]_, implemented as `silverman_factor`, is::
    (n * (d + 2) / 4.)**(-1. / (d + 4)).
    Good general descriptions of kernel density estimation can be found in [1]_
    and [2]_, the mathematics for this multi-dimensional implementation can be
    found in [1]_.
    References
    ----------
    .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
    Visualization", John Wiley & Sons, New York, Chicester, 1992.
    .. [2] B.W. Silverman, "Density Estimation for Statistics and Data
    Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
    Chapman and Hall, London, 1986.
    .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
    Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
    .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
    conditional density estimation", Computational Statistics & Data
    Analysis, Vol. 36, pp. 279-298, 2001.
    Examples
    --------
    Generate some random two-dimensional data:
    >>> from scipy import stats
    >>> def measure(n):
    >>> "Measurement model, return two coupled measurements."
    >>> m1 = np.random.normal(size=n)
    >>> m2 = np.random.normal(scale=0.5, size=n)
    >>> return m1+m2, m1-m2
    >>> m1, m2 = measure(2000)
    >>> xmin = m1.min()
    >>> xmax = m1.max()
    >>> ymin = m2.min()
    >>> ymax = m2.max()
    Perform a kernel density estimate on the data:
    >>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    >>> positions = np.vstack([X.ravel(), Y.ravel()])
    >>> values = np.vstack([m1, m2])
    >>> kernel = stats.gaussian_kde(values)
    >>> Z = np.reshape(kernel(positions).T, X.shape)
    Plot the results:
    >>> import matplotlib.pyplot as plt
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
    ... extent=[xmin, xmax, ymin, ymax])
    >>> ax.plot(m1, m2, 'k.', markersize=2)
    >>> ax.set_xlim([xmin, xmax])
    >>> ax.set_ylim([ymin, ymax])
    >>> plt.show()
    """
    def __init__(self, dataset, bw_method=None, weights=None):
    self.dataset = np.atleast_2d(dataset)
    if not self.dataset.size > 1:
    raise ValueError("`dataset` input should have multiple elements.")
    self.d, self.n = self.dataset.shape

    if weights is not None:
    self.weights = weights / np.sum(weights)
    else:
    self.weights = np.ones(self.n) / self.n

    # Compute the effective sample size
    # http://surveyanalysis.org/wiki/Design_Effects_and_Effective_Sample_Size#Kish.27s_approximate_formula_for_computing_effective_sample_size
    self.neff = 1.0 / np.sum(self.weights ** 2)

    self.set_bandwidth(bw_method=bw_method)

    def evaluate(self, points):
    """Evaluate the estimated pdf on a set of points.
    Parameters
    ----------
    points : (# of dimensions, # of points)-array
    Alternatively, a (# of dimensions,) vector can be passed in and
    treated as a single point.
    Returns
    -------
    values : (# of points,)-array
    The values at each point.
    Raises
    ------
    ValueError : if the dimensionality of the input points is different than
    the dimensionality of the KDE.
    """
    from scipy.spatial.distance import cdist

    points = np.atleast_2d(points)

    d, m = points.shape
    if d != self.d:
    if d == 1 and m == self.d:
    # points was passed in as a row vector
    points = np.reshape(points, (self.d, 1))
    m = 1
    else:
    msg = "points have dimension %s, dataset has dimension %s" % (d, self.d)
    raise ValueError(msg)

    # compute the normalised residuals
    chi2 = cdist(points.T, self.dataset.T, 'mahalanobis', VI=self.inv_cov) ** 2
    # compute the pdf
    result = np.sum(np.exp(-.5 * chi2) * self.weights, axis=1) / self._norm_factor

    return result

    __call__ = evaluate

    def scotts_factor(self):
    return np.power(self.neff, -1. / (self.d + 4))

    def silverman_factor(self):
    return np.power(self.neff * (self.d + 2.0) / 4.0, -1. / (self.d + 4))

    # Default method to calculate bandwidth, can be overwritten by subclass
    covariance_factor = scotts_factor

    def set_bandwidth(self, bw_method=None):
    """Compute the estimator bandwidth with given method.
    The new bandwidth calculated after a call to `set_bandwidth` is used
    for subsequent evaluations of the estimated density.
    Parameters
    ----------
    bw_method : str, scalar or callable, optional
    The method used to calculate the estimator bandwidth. This can be
    'scott', 'silverman', a scalar constant or a callable. If a
    scalar, this will be used directly as `kde.factor`. If a callable,
    it should take a `gaussian_kde` instance as only parameter and
    return a scalar. If None (default), nothing happens; the current
    `kde.covariance_factor` method is kept.
    Notes
    -----
    .. versionadded:: 0.11
    Examples
    --------
    >>> x1 = np.array([-7, -5, 1, 4, 5.])
    >>> kde = stats.gaussian_kde(x1)
    >>> xs = np.linspace(-10, 10, num=50)
    >>> y1 = kde(xs)
    >>> kde.set_bandwidth(bw_method='silverman')
    >>> y2 = kde(xs)
    >>> kde.set_bandwidth(bw_method=kde.factor / 3.)
    >>> y3 = kde(xs)
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.plot(x1, np.ones(x1.shape) / (4. * x1.size), 'bo',
    ... label='Data points (rescaled)')
    >>> ax.plot(xs, y1, label='Scott (default)')
    >>> ax.plot(xs, y2, label='Silverman')
    >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
    >>> ax.legend()
    >>> plt.show()
    """
    from six import string_types
    if bw_method is None:
    pass
    elif bw_method == 'scott':
    self.covariance_factor = self.scotts_factor
    elif bw_method == 'silverman':
    self.covariance_factor = self.silverman_factor
    elif np.isscalar(bw_method) and not isinstance(bw_method, string_types):
    self._bw_method = 'use constant'
    self.covariance_factor = lambda: bw_method
    elif callable(bw_method):
    self._bw_method = bw_method
    self.covariance_factor = lambda: self._bw_method(self)
    else:
    msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
    "or a callable."
    raise ValueError(msg)

    self._compute_covariance()

    def _compute_covariance(self):
    """Computes the covariance matrix for each Gaussian kernel using
    covariance_factor().
    """
    self.factor = self.covariance_factor()
    # Cache covariance and inverse covariance of the data
    if not hasattr(self, '_data_inv_cov'):
    # Compute the mean and residuals
    _mean = np.sum(self.weights * self.dataset, axis=1)
    _residual = (self.dataset - _mean[:, None])
    # Compute the biased covariance
    self._data_covariance = np.atleast_2d(np.dot(_residual * self.weights, _residual.T))
    # Correct for bias (http://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_covariance)
    self._data_covariance /= (1 - np.sum(self.weights ** 2))
    self._data_inv_cov = np.linalg.inv(self._data_covariance)

    self.covariance = self._data_covariance * self.factor ** 2
    self.inv_cov = self._data_inv_cov / self.factor ** 2
    self._norm_factor = np.sqrt(np.linalg.det(2 * np.pi * self.covariance)) # * self.n


    # some random observations in 2D space
    obs = np.reshape(np.random.standard_exponential(1000), (100, 10))[:, 0:2]
    xy = np.transpose(obs)

    # Let's add a lot of weight to values with y > 3
    weights = [0.999 if i > 3 else 0.1 for i in obs[:, 1]]

    # kde space
    xmin, xmax = (xy[0].min(), xy[0].max())
    ymin, ymax = (xy[1].min(), xy[1].max())
    x = np.linspace(xmin, xmax, 100) # kde resolution
    y = np.linspace(ymin, ymax, 100) # kde resolution
    xx, yy = np.meshgrid(x, y)

    # Unweighted KDE
    pdf = gaussian_kde(xy)
    pdf.set_bandwidth(bw_method=pdf.factor / 5.) # kde bandwidth
    zz = pdf((np.ravel(xx), np.ravel(yy)))
    zz1 = np.reshape(zz, xx.shape)

    # weighted KDE
    pdf = gaussian_kde(xy, weights=np.array(weights, np.float))
    pdf.set_bandwidth(bw_method=pdf.factor / 5.) # kde bandwidth
    zz2 = pdf((np.ravel(xx), np.ravel(yy)))
    zz2 = np.reshape(zz2, xx.shape)

    # PLot
    fig, axis = plt.subplots(2)

    # plot unweighted
    # axis[0].scatter(obs[:, 0], obs[:, 1])
    axis[0].scatter(obs[:, 0], obs[:, 1])
    bounds = [xy[0].min(), xy[1].min(), xy[0].max(), xy[1].max()]
    cax = axis[0].imshow(np.rot90(zz1.T), cmap=plt.cm.Reds, extent=[bounds[0], bounds[2], bounds[1], bounds[3]])#, alpha=0.5)
    axis[0].legend()

    # plot weighted
    axis[1].scatter(obs[:, 0], obs[:, 1])
    bounds = axis[1].dataLim.get_points().ravel()
    cax = axis[1].imshow(np.rot90(zz2.T), cmap=plt.cm.Reds, extent=[bounds[0], bounds[2], bounds[1], bounds[3]])#, alpha=0.5)
    axis[1].legend()

    axis[0].set_title("unweighted")
    axis[1].set_title("weighted")

    plt.show()