Source code for pysal.explore.esda.geary

"""
Geary's C statistic for spatial autocorrelation
"""
__author__ = "Serge Rey <sjsrey@gmail.com> "

import numpy as np
import scipy.stats as stats
from pysal.lib import weights
from .tabular import _univariate_handler

__all__ = ['Geary']


class Geary(object):
    """
    Global Geary C Autocorrelation statistic

    Parameters
    ----------
    y              : array
                     (n, 1) attribute vector
    w              : W
                     spatial weights
    transformation : {'B', 'R', 'D', 'U', 'V'}
                     weights transformation, default is binary.
                     Other options include "R": row-standardized,
                     "D": doubly-standardized, "U": untransformed
                     (general weights), "V": variance-stabilizing.
    permutations   : int
                     number of random permutations for calculation of
                     pseudo-p_values

    Attributes
    ----------
    y            : array
                   original variable
    w            : W
                   spatial weights
    permutations : int
                   number of permutations
    C            : float
                   value of statistic
    EC           : float
                   expected value
    VC           : float
                   variance of C under normality assumption
    z_norm       : float
                   z-statistic for C under normality assumption
    z_rand       : float
                   z-statistic for C under randomization assumption
    p_norm       : float
                   p-value under normality assumption (one-tailed)
    p_rand       : float
                   p-value under randomization assumption (one-tailed)
    sim          : array
                   (if permutations!=0)
                   vector of C values for permuted samples
    p_sim        : float
                   (if permutations!=0)
                   p-value based on permutations (one-tailed)
                   null: spatial randomness
                   alternative: the observed C is extreme; it is either
                   extremely high or extremely low
    EC_sim       : float
                   (if permutations!=0)
                   average value of C from permutations
    VC_sim       : float
                   (if permutations!=0)
                   variance of C from permutations
    seC_sim      : float
                   (if permutations!=0)
                   standard deviation of C under permutations
    z_sim        : float
                   (if permutations!=0)
                   standardized C based on permutations
    p_z_sim      : float
                   (if permutations!=0)
                   p-value based on standard normal approximation from
                   permutations (one-tailed)

    Examples
    --------
    >>> import numpy as np
    >>> import pysal.lib
    >>> from pysal.explore.esda.geary import Geary
    >>> w = pysal.lib.io.open(pysal.lib.examples.get_path("book.gal")).read()
    >>> f = pysal.lib.io.open(pysal.lib.examples.get_path("book.txt"))
    >>> y = np.array(f.by_col['y'])
    >>> c = Geary(y, w, permutations=0)
    >>> round(c.C, 7)
    0.3330108
    >>> round(c.p_norm, 7)
    9.2e-05

    Notes
    -----
    Technical details and derivations can be found in :cite:`cliff81`.
    """
    def __init__(self, y, w, transformation="r", permutations=999):
        if not isinstance(w, weights.W):
            raise TypeError('w must be a pysal weights object, got {}'
                            ' instead'.format(type(w)))
        y = np.asarray(y).flatten()
        self.n = len(y)
        self.y = y
        w.transform = transformation
        self.w = w
        self.permutations = permutations
        self.__moments()
        xn = range(len(y))
        self.xn = xn
        self.y2 = y * y
        yd = y - y.mean()
        yss = sum(yd * yd)
        self.den = yss * self.w.s0 * 2.0
        self.C = self.__calc(y)
        de = self.C - 1.0
        self.EC = 1.0
        self.z_norm = de / self.seC_norm
        self.z_rand = de / self.seC_rand
        if de > 0:
            self.p_norm = 1 - stats.norm.cdf(self.z_norm)
            self.p_rand = 1 - stats.norm.cdf(self.z_rand)
        else:
            self.p_norm = stats.norm.cdf(self.z_norm)
            self.p_rand = stats.norm.cdf(self.z_rand)

        if permutations:
            sim = [self.__calc(np.random.permutation(self.y))
                   for i in range(permutations)]
            self.sim = sim = np.array(sim)
            above = sim >= self.C
            larger = sum(above)
            if (permutations - larger) < larger:
                larger = permutations - larger
            self.p_sim = (larger + 1.) / (permutations + 1.)
            self.EC_sim = sum(sim) / permutations
            self.seC_sim = np.array(sim).std()
            self.VC_sim = self.seC_sim ** 2
            self.z_sim = (self.C - self.EC_sim) / self.seC_sim
            self.p_z_sim = 1 - stats.norm.cdf(np.abs(self.z_sim))
    @property
    def _statistic(self):
        """a standardized accessor for pysal.explore.esda statistics"""
        return self.C

    def __moments(self):
        y = self.y
        n = self.n
        w = self.w
        s0 = w.s0
        s1 = w.s1
        s2 = w.s2
        s02 = s0 * s0
        yd = y - y.mean()
        yd4 = yd ** 4
        yd2 = yd ** 2
        n2 = n * n
        k = (yd4.sum() / n) / ((yd2.sum() / n) ** 2)
        A = (n - 1) * s1 * (n2 - 3 * n + 3 - (n - 1) * k)
        B = (1. / 4) * ((n - 1) * s2 * (n2 + 3 * n - 6 - (n2 - n + 2) * k))
        C = s02 * (n2 - 3 - (n - 1) ** 2 * k)
        vc_rand = (A - B + C) / (n * (n - 2) * (n - 3) * s02)
        vc_norm = ((1 / (2 * (n + 1) * s02)) *
                   ((2 * s1 + s2) * (n - 1) - 4 * s02))

        self.VC_rand = vc_rand
        self.VC_norm = vc_norm
        self.seC_rand = vc_rand ** (0.5)
        self.seC_norm = vc_norm ** (0.5)

    def __calc(self, y):
        ys = np.zeros(y.shape)
        y2 = y ** 2
        for i, i0 in enumerate(self.w.id_order):
            neighbors = self.w.neighbor_offsets[i0]
            wijs = self.w.weights[i0]
            z = list(zip(neighbors, wijs))
            ys[i] = sum([wij * (y2[i] - 2 * y[i] * y[j] + y2[j])
                         for j, wij in z])
        a = (self.n - 1) * sum(ys)
        return a / self.den

    @classmethod
    def by_col(cls, df, cols, w=None, inplace=False, pvalue='sim',
               outvals=None, **stat_kws):
        """
        Function to compute a Geary statistic on a dataframe

        Arguments
        ---------
        df          : pandas.DataFrame
                      a pandas dataframe with a geometry column
        cols        : string or list of string
                      name or list of names of columns to use to compute
                      the statistic
        w           : pysal weights object
                      a weights object aligned with the dataframe. If not
                      provided, this is searched for in the dataframe's
                      metadata
        inplace     : bool
                      a boolean denoting whether to operate on the dataframe
                      inplace or to return a series containing the results
                      of the computation. If operating inplace, with default
                      configurations, the derived columns will be named
                      like 'column_geary' and 'column_p_sim'
        pvalue      : string
                      a string denoting which pvalue should be returned.
                      Refer to the Geary statistic's documentation for
                      available p-values
        outvals     : list of strings
                      list of arbitrary attributes to return as columns
                      from the Geary statistic
        **stat_kws  : keyword arguments
                      options to pass to the underlying statistic. For this,
                      see the documentation for the Geary statistic.

        Returns
        -------
        If inplace, None, and operation is conducted on dataframe in memory.
        Otherwise, returns a copy of the dataframe with the relevant columns
        attached.

        Notes
        -----
        Technical details and derivations can be found in :cite:`cliff81`.
        """
        return _univariate_handler(df, cols, w=w, inplace=inplace,
                                   pvalue=pvalue, outvals=outvals,
                                   stat=cls, swapname=cls.__name__.lower(),
                                   **stat_kws)
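
The `__calc` method above evaluates Geary's C directly from its definition, C = (n - 1) * sum_ij w_ij * (y_i - y_j)^2 / (2 * S0 * sum_i (y_i - ybar)^2), with the denominator precomputed once in `__init__` as `self.den`. The following is a minimal cross-check sketch, not part of this module: it recomputes the same quantity from the dense form of the weights matrix with plain NumPy. The lattice builder choice (`lat2W`), the helper name `geary_c_dense`, and the random seed are illustrative assumptions.

# Cross-check sketch (illustrative, not part of pysal.explore.esda.geary):
# Geary's C from a dense (n, n) weights matrix, using the same formula
# implemented sparsely by Geary.__calc above.
import numpy as np
from pysal.lib import weights
from pysal.explore.esda.geary import Geary


def geary_c_dense(y, w):
    """Geary's C computed from the dense form of the weights object w."""
    y = np.asarray(y).flatten()
    n = len(y)
    wd, _ = w.full()                       # dense weights matrix and id order
    num = ((y[:, None] - y[None, :]) ** 2 * wd).sum()
    den = 2.0 * w.s0 * ((y - y.mean()) ** 2).sum()
    return (n - 1) * num / den


if __name__ == "__main__":
    w = weights.lat2W(5, 5)                # rook weights on a 5x5 lattice
    w.transform = 'r'                      # match Geary's default transformation
    np.random.seed(12345)
    y = np.random.random(w.n)
    # the two values should agree up to floating-point error
    print(round(geary_c_dense(y, w), 7),
          round(Geary(y, w, permutations=0).C, 7))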