import numpy as np import matplotlib.pyplot as plt # class from here: http://nbviewer.ipython.org/gist/tillahoffmann/f844bce2ec264c1c8cb5 class gaussian_kde(object): """Representation of a kernel-density estimate using Gaussian kernels. Kernel density estimation is a way to estimate the probability density function (PDF) of a random variable in a non-parametric way. `gaussian_kde` works for both uni-variate and multi-variate data. It includes automatic bandwidth determination. The estimation works best for a unimodal distribution; bimodal or multi-modal distributions tend to be oversmoothed. Parameters ---------- dataset : array_like Datapoints to estimate from. In case of univariate data this is a 1-D array, otherwise a 2-D array with shape (# of dims, # of data). bw_method : str, scalar or callable, optional The method used to calculate the estimator bandwidth. This can be 'scott', 'silverman', a scalar constant or a callable. If a scalar, this will be used directly as `kde.factor`. If a callable, it should take a `gaussian_kde` instance as only parameter and return a scalar. If None (default), 'scott' is used. See Notes for more details. weights : array_like, shape (n, ), optional, default: None An array of weights, of the same shape as `x`. Each value in `x` only contributes its associated weight towards the bin count (instead of 1). Attributes ---------- dataset : ndarray The dataset with which `gaussian_kde` was initialized. d : int Number of dimensions. n : int Number of datapoints. neff : float Effective sample size using Kish's approximation. factor : float The bandwidth factor, obtained from `kde.covariance_factor`, with which the covariance matrix is multiplied. covariance : ndarray The covariance matrix of `dataset`, scaled by the calculated bandwidth (`kde.factor`). inv_cov : ndarray The inverse of `covariance`. Methods ------- kde.evaluate(points) : ndarray Evaluate the estimated pdf on a provided set of points. kde(points) : ndarray Same as kde.evaluate(points) kde.pdf(points) : ndarray Alias for ``kde.evaluate(points)``. kde.set_bandwidth(bw_method='scott') : None Computes the bandwidth, i.e. the coefficient that multiplies the data covariance matrix to obtain the kernel covariance matrix. .. versionadded:: 0.11.0 kde.covariance_factor : float Computes the coefficient (`kde.factor`) that multiplies the data covariance matrix to obtain the kernel covariance matrix. The default is `scotts_factor`. A subclass can overwrite this method to provide a different method, or set it through a call to `kde.set_bandwidth`. Notes ----- Bandwidth selection strongly influences the estimate obtained from the KDE (much more so than the actual shape of the kernel). Bandwidth selection can be done by a "rule of thumb", by cross-validation, by "plug-in methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde` uses a rule of thumb, the default is Scott's Rule. Scott's Rule [1]_, implemented as `scotts_factor`, is:: n**(-1./(d+4)), with ``n`` the number of data points and ``d`` the number of dimensions. Silverman's Rule [2]_, implemented as `silverman_factor`, is:: (n * (d + 2) / 4.)**(-1. / (d + 4)). Good general descriptions of kernel density estimation can be found in [1]_ and [2]_, the mathematics for this multi-dimensional implementation can be found in [1]_. References ---------- .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and Visualization", John Wiley & Sons, New York, Chicester, 1992. .. [2] B.W. Silverman, "Density Estimation for Statistics and Data Analysis", Vol. 26, Monographs on Statistics and Applied Probability, Chapman and Hall, London, 1986. .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993. .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel conditional density estimation", Computational Statistics & Data Analysis, Vol. 36, pp. 279-298, 2001. Examples -------- Generate some random two-dimensional data: >>> from scipy import stats >>> def measure(n): >>> "Measurement model, return two coupled measurements." >>> m1 = np.random.normal(size=n) >>> m2 = np.random.normal(scale=0.5, size=n) >>> return m1+m2, m1-m2 >>> m1, m2 = measure(2000) >>> xmin = m1.min() >>> xmax = m1.max() >>> ymin = m2.min() >>> ymax = m2.max() Perform a kernel density estimate on the data: >>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j] >>> positions = np.vstack([X.ravel(), Y.ravel()]) >>> values = np.vstack([m1, m2]) >>> kernel = stats.gaussian_kde(values) >>> Z = np.reshape(kernel(positions).T, X.shape) Plot the results: >>> import matplotlib.pyplot as plt >>> fig = plt.figure() >>> ax = fig.add_subplot(111) >>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r, ... extent=[xmin, xmax, ymin, ymax]) >>> ax.plot(m1, m2, 'k.', markersize=2) >>> ax.set_xlim([xmin, xmax]) >>> ax.set_ylim([ymin, ymax]) >>> plt.show() """ def __init__(self, dataset, bw_method=None, weights=None): self.dataset = np.atleast_2d(dataset) if not self.dataset.size > 1: raise ValueError("`dataset` input should have multiple elements.") self.d, self.n = self.dataset.shape if weights is not None: self.weights = weights / np.sum(weights) else: self.weights = np.ones(self.n) / self.n # Compute the effective sample size # http://surveyanalysis.org/wiki/Design_Effects_and_Effective_Sample_Size#Kish.27s_approximate_formula_for_computing_effective_sample_size self.neff = 1.0 / np.sum(self.weights ** 2) self.set_bandwidth(bw_method=bw_method) def evaluate(self, points): """Evaluate the estimated pdf on a set of points. Parameters ---------- points : (# of dimensions, # of points)-array Alternatively, a (# of dimensions,) vector can be passed in and treated as a single point. Returns ------- values : (# of points,)-array The values at each point. Raises ------ ValueError : if the dimensionality of the input points is different than the dimensionality of the KDE. """ from scipy.spatial.distance import cdist points = np.atleast_2d(points) d, m = points.shape if d != self.d: if d == 1 and m == self.d: # points was passed in as a row vector points = np.reshape(points, (self.d, 1)) m = 1 else: msg = "points have dimension %s, dataset has dimension %s" % (d, self.d) raise ValueError(msg) # compute the normalised residuals chi2 = cdist(points.T, self.dataset.T, 'mahalanobis', VI=self.inv_cov) ** 2 # compute the pdf result = np.sum(np.exp(-.5 * chi2) * self.weights, axis=1) / self._norm_factor return result __call__ = evaluate def scotts_factor(self): return np.power(self.neff, -1. / (self.d + 4)) def silverman_factor(self): return np.power(self.neff * (self.d + 2.0) / 4.0, -1. / (self.d + 4)) # Default method to calculate bandwidth, can be overwritten by subclass covariance_factor = scotts_factor def set_bandwidth(self, bw_method=None): """Compute the estimator bandwidth with given method. The new bandwidth calculated after a call to `set_bandwidth` is used for subsequent evaluations of the estimated density. Parameters ---------- bw_method : str, scalar or callable, optional The method used to calculate the estimator bandwidth. This can be 'scott', 'silverman', a scalar constant or a callable. If a scalar, this will be used directly as `kde.factor`. If a callable, it should take a `gaussian_kde` instance as only parameter and return a scalar. If None (default), nothing happens; the current `kde.covariance_factor` method is kept. Notes ----- .. versionadded:: 0.11 Examples -------- >>> x1 = np.array([-7, -5, 1, 4, 5.]) >>> kde = stats.gaussian_kde(x1) >>> xs = np.linspace(-10, 10, num=50) >>> y1 = kde(xs) >>> kde.set_bandwidth(bw_method='silverman') >>> y2 = kde(xs) >>> kde.set_bandwidth(bw_method=kde.factor / 3.) >>> y3 = kde(xs) >>> fig = plt.figure() >>> ax = fig.add_subplot(111) >>> ax.plot(x1, np.ones(x1.shape) / (4. * x1.size), 'bo', ... label='Data points (rescaled)') >>> ax.plot(xs, y1, label='Scott (default)') >>> ax.plot(xs, y2, label='Silverman') >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)') >>> ax.legend() >>> plt.show() """ from six import string_types if bw_method is None: pass elif bw_method == 'scott': self.covariance_factor = self.scotts_factor elif bw_method == 'silverman': self.covariance_factor = self.silverman_factor elif np.isscalar(bw_method) and not isinstance(bw_method, string_types): self._bw_method = 'use constant' self.covariance_factor = lambda: bw_method elif callable(bw_method): self._bw_method = bw_method self.covariance_factor = lambda: self._bw_method(self) else: msg = "`bw_method` should be 'scott', 'silverman', a scalar " \ "or a callable." raise ValueError(msg) self._compute_covariance() def _compute_covariance(self): """Computes the covariance matrix for each Gaussian kernel using covariance_factor(). """ self.factor = self.covariance_factor() # Cache covariance and inverse covariance of the data if not hasattr(self, '_data_inv_cov'): # Compute the mean and residuals _mean = np.sum(self.weights * self.dataset, axis=1) _residual = (self.dataset - _mean[:, None]) # Compute the biased covariance self._data_covariance = np.atleast_2d(np.dot(_residual * self.weights, _residual.T)) # Correct for bias (http://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_covariance) self._data_covariance /= (1 - np.sum(self.weights ** 2)) self._data_inv_cov = np.linalg.inv(self._data_covariance) self.covariance = self._data_covariance * self.factor ** 2 self.inv_cov = self._data_inv_cov / self.factor ** 2 self._norm_factor = np.sqrt(np.linalg.det(2 * np.pi * self.covariance)) # * self.n # some random observations in 2D space obs = np.reshape(np.random.standard_exponential(1000), (100, 10))[:, 0:2] xy = np.transpose(obs) weights = [0.9 if i > 3 else 0.1 for i in obs[:, 1]] # kde space xmin, xmax = (xy[0].min(), xy[0].max()) ymin, ymax = (xy[1].min(), xy[1].max()) x = np.linspace(xmin, xmax, 100) # kde resolution y = np.linspace(ymin, ymax, 100) # kde resolution xx, yy = np.meshgrid(x, y) # Unweighted KDE pdf = gaussian_kde(xy) pdf.set_bandwidth(bw_method=pdf.factor / 5.) # kde bandwidth zz = pdf((np.ravel(xx), np.ravel(yy))) zz1 = np.reshape(zz, xx.shape) # weighted KDE pdf = gaussian_kde(xy, weights=np.array(weights, np.float)) pdf.set_bandwidth(bw_method=pdf.factor / 5.) # kde bandwidth zz2 = pdf((np.ravel(xx), np.ravel(yy))) zz2 = np.reshape(zz2, xx.shape) # PLot fig, axis = plt.subplots(2) bounds = [xy[0].min(), xy[1].min(), xy[0].max(), xy[1].max()] # plot unweighted # axis[0].scatter(obs[:, 0], obs[:, 1]) axis[0].scatter(obs[:, 0], obs[:, 1]) cax = axis[0].imshow(np.rot90(zz1.T), cmap=plt.cm.Reds, extent=[bounds[0], bounds[2], bounds[1], bounds[3]])#, alpha=0.5) axis[0].legend() # plot weighted axis[1].scatter(obs[:, 0], obs[:, 1]) cax = axis[1].imshow(np.rot90(zz2.T), cmap=plt.cm.Reds, extent=[bounds[0], bounds[2], bounds[1], bounds[3]])#, alpha=0.5) axis[1].legend() axis[0].set_title("unweighted") axis[1].set_title("weighted") plt.show()