pysal.explore.esda.getisord 源代码

"""
Getis and Ord G statistic for spatial autocorrelation
"""
__author__ = "Sergio J. Rey <srey@asu.edu>, Myunghwa Hwang <mhwang4@gmail.com> "
__all__ = ['G', 'G_Local']

from pysal.lib.common import np, stats
from pysal.lib.weights.spatial_lag import lag_spatial as slag
from .tabular import _univariate_handler

PERMUTATIONS = 999


[文档]class G(object):
    """
    Global G Autocorrelation Statistic

    Parameters
    ----------
    y             : array (n,1)
                    Attribute values
    w             : W
                   DistanceBand W spatial weights based on distance band
    permutations  : int
                    the number of random permutations for calculating pseudo p_values

    Attributes
    ----------
    y             : array
                    original variable
    w             : W
                    DistanceBand W spatial weights based on distance band
    permutation   : int
                    the number of permutations
    G             : float
                    the value of statistic
    EG            : float
                    the expected value of statistic
    VG            : float
                    the variance of G under normality assumption
    z_norm        : float
                    standard normal test statistic
    p_norm        : float
                    p-value under normality assumption (one-sided)
    sim           : array
                    (if permutations > 0)
                    vector of G values for permutated samples
    p_sim         : float
                    p-value based on permutations (one-sided)
                    null: spatial randomness
                    alternative: the observed G is extreme it is either extremely high or extremely low
    EG_sim        : float
                    average value of G from permutations
    VG_sim        : float
                    variance of G from permutations
    seG_sim       : float
                    standard deviation of G under permutations.
    z_sim         : float
                    standardized G based on permutations
    p_z_sim       : float
                    p-value based on standard normal approximation from
                    permutations (one-sided)

    Notes
    -----
    Moments are based on normality assumption.

    For technical details see :cite:`Getis_2010` and :cite:`Ord_2010`.


    Examples
    --------
    >>> import pysal.lib
    >>> import numpy
    >>> numpy.random.seed(10)

    Preparing a point data set
    >>> points = [(10, 10), (20, 10), (40, 10), (15, 20), (30, 20), (30, 30)]

    Creating a weights object from points
    >>> w = pysal.lib.weights.DistanceBand(points,threshold=15)
    >>> w.transform = "B"

    Preparing a variable
    >>> y = numpy.array([2, 3, 3.2, 5, 8, 7])

    Applying Getis and Ord G test
    >>> from pysal.explore.esda.getisord import G
    >>> g = G(y,w)

    Examining the results
    >>> round(g.G, 3)
    0.557

    >>> round(g.p_norm, 3)
    0.173

    """

[文档]    def __init__(self, y, w, permutations=PERMUTATIONS):
        y = np.asarray(y).flatten()
        self.n = len(y)
        self.y = y
        w.transform = "B"
        self.w = w
        self.permutations = permutations
        self.__moments()
        self.y2 = y * y
        y = y.reshape(len(y), 1)  # Ensure that y is an n by 1 vector, otherwise y*y.T == y*y
        self.den_sum = (y * y.T).sum() - (y * y).sum()
        self.G = self.__calc(self.y)
        self.z_norm = (self.G - self.EG) / np.sqrt(self.VG)
        self.p_norm = 1.0 - stats.norm.cdf(np.abs(self.z_norm))

        if permutations:
            sim = [self.__calc(np.random.permutation(self.y))
                   for i in range(permutations)]
            self.sim = sim = np.array(sim)
            above = sim >= self.G
            larger = sum(above)
            if (self.permutations - larger) < larger:
                larger = self.permutations - larger
            self.p_sim = (larger + 1.0) / (permutations + 1.)
            self.EG_sim = sum(sim) / permutations
            self.seG_sim = sim.std()
            self.VG_sim = self.seG_sim ** 2
            self.z_sim = (self.G - self.EG_sim) / self.seG_sim
            self.p_z_sim = 1. - stats.norm.cdf(np.abs(self.z_sim))

    def __moments(self):
        y = self.y
        n = self.n
        w = self.w
        n2 = n * n
        s0 = w.s0
        self.EG = s0 / (n * (n - 1))
        s02 = s0 * s0
        s1 = w.s1
        s2 = w.s2
        b0 = (n2 - 3 * n + 3) * s1 - n * s2 + 3 * s02
        b1 = (-1.) * ((n2 - n) * s1 - 2 * n * s2 + 6 * s02)
        b2 = (-1.) * (2 * n * s1 - (n + 3) * s2 + 6 * s02)
        b3 = 4 * (n - 1) * s1 - 2 * (n + 1) * s2 + 8 * s02
        b4 = s1 - s2 + s02
        self.b0 = b0
        self.b1 = b1
        self.b2 = b2
        self.b3 = b3
        self.b4 = b4
        y2 = y * y
        y3 = y * y2
        y4 = y2 * y2
        EG2 = (b0 * (sum(
            y2) ** 2) + b1 * sum(y4) + b2 * (sum(y) ** 2) * sum(y2))
        EG2 += b3 * sum(y) * sum(y3) + b4 * (sum(y) ** 4)
        EG2NUM = EG2
        EG2DEN = (((sum(y) ** 2 - sum(y2)) ** 2) * n * (n - 1) * (
            n - 2) * (n - 3))
        self.EG2 = EG2NUM / EG2DEN
        self.VG = self.EG2 - self.EG ** 2

    def __calc(self, y):
        yl = slag(self.w, y)
        self.num = y * yl
        return self.num.sum() / self.den_sum

    @property
    def _statistic(self):
        """ Standardized accessor for pysal.explore.esda statistics"""
        return self.G

    @classmethod
    def by_col(cls, df, cols, w=None, inplace=False, pvalue='sim', outvals=None, **stat_kws):
        """
        Function to compute a G statistic on a dataframe

        Arguments
        ---------
        df          :   pandas.DataFrame
                        a pandas dataframe with a geometry column
        cols        :   string or list of string
                        name or list of names of columns to use to compute the statistic
        w           :   pysal weights object
                        a weights object aligned with the dataframe. If not provided, this
                        is searched for in the dataframe's metadata
        inplace     :   bool
                        a boolean denoting whether to operate on the dataframe inplace or to
                        return a series contaning the results of the computation. If
                        operating inplace, the derived columns will be named 'column_g'
        pvalue      :   string
                        a string denoting which pvalue should be returned. Refer to the
                        the G statistic's documentation for available p-values
        outvals     :   list of strings
                        list of arbitrary attributes to return as columns from the
                        G statistic
        **stat_kws  :   keyword arguments
                        options to pass to the underlying statistic. For this, see the
                        documentation for the G statistic.

        Returns
        -------
        If inplace, None, and operation is conducted on dataframe in memory. Otherwise,
        returns a copy of the dataframe with the relevant columns attached.


        """
        return _univariate_handler(df, cols, w=w, inplace=inplace, pvalue=pvalue,
                                   outvals=outvals, stat=cls,
                                   swapname=cls.__name__.lower(), **stat_kws)



[文档]class G_Local(object):
    """
    Generalized Local G Autocorrelation

    Parameters
    ----------
    y : array
        variable
    w : W
        DistanceBand, weights instance that is based on threshold distance
        and is assumed to be aligned with y
    transform : {'R', 'B'}
                the type of w, either 'B' (binary) or 'R' (row-standardized)
    permutations : int
                  the number of random permutations for calculating
                  pseudo p values
    star : boolean
           whether or not to include focal observation in sums (default: False)

    Attributes
    ----------
    y : array
       original variable
    w : DistanceBand W
       original weights object
    permutations : int
                  the number of permutations
    Gs : array
        of floats, the value of the orginal G statistic in Getis & Ord (1992)
    EGs : float
         expected value of Gs under normality assumption
         the values is scalar, since the expectation is identical
         across all observations
    VGs : array
         of floats, variance values of Gs under normality assumption
    Zs : array
        of floats, standardized Gs
    p_norm : array
            of floats, p-value under normality assumption (one-sided)
            for two-sided tests, this value should be multiplied by 2
    sim : array
         of arrays of floats (if permutations>0), vector of I values
         for permutated samples
    p_sim : array
           of floats, p-value based on permutations (one-sided)
           null - spatial randomness
           alternative - the observed G is extreme it is either extremely high or extremely low
    EG_sim : array
            of floats, average value of G from permutations
    VG_sim : array
            of floats, variance of G from permutations
    seG_sim : array
             of floats, standard deviation of G under permutations.
    z_sim : array
           of floats, standardized G based on permutations
    p_z_sim : array
             of floats, p-value based on standard normal approximation from
             permutations (one-sided)

    Notes
    -----
    To compute moments of Gs under normality assumption,
    PySAL considers w is either binary or row-standardized.
    For binary weights object, the weight value for self is 1
    For row-standardized weights object, the weight value for self is
    1/(the number of its neighbors + 1).


    For technical details see :cite:`Getis_2010` and :cite:`Ord_2010`.


    Examples
    --------
    >>> import pysal.lib
    >>> import numpy
    >>> numpy.random.seed(10)

    Preparing a point data set

    >>> points = [(10, 10), (20, 10), (40, 10), (15, 20), (30, 20), (30, 30)]

    Creating a weights object from points

    >>> w = pysal.lib.weights.DistanceBand(points,threshold=15)

    Prepareing a variable

    >>> y = numpy.array([2, 3, 3.2, 5, 8, 7])

    Applying Getis and Ord local G test using a binary weights object

    >>> from pysal.explore.esda.getisord import G_Local
    >>> lg = G_Local(y,w,transform='B')

    Examining the results

    >>> lg.Zs
    array([-1.0136729 , -0.04361589,  1.31558703, -0.31412676,  1.15373986,
            1.77833941])
    >>> round(lg.p_sim[0], 3)
    0.101

    >>> numpy.random.seed(10)

    Applying Getis and Ord local G* test using a binary weights object
    >>> lg_star = G_Local(y,w,transform='B',star=True)

    Examining the results
    >>> lg_star.Zs
    array([-1.39727626, -0.28917762,  0.65064964, -0.28917762,  1.23452088,
            2.02424331])
    >>> round(lg_star.p_sim[0], 3)
    0.101

    >>> numpy.random.seed(12345)

    Applying Getis and Ord local G test using a row-standardized weights object
    >>> lg = G_Local(y,w,transform='R')

    Examining the results
    >>> lg.Zs
    array([-0.62074534, -0.01780611,  1.31558703, -0.12824171,  0.28843496,
            1.77833941])
    >>> round(lg.p_sim[0], 3)
    0.103

    >>> numpy.random.seed(10)

    Applying Getis and Ord local G* test using a row-standardized weights object

    >>> lg_star = G_Local(y,w,transform='R',star=True)

    Examining the results
    >>> lg_star.Zs
    array([-0.62488094, -0.09144599,  0.41150696, -0.09144599,  0.24690418,
            1.28024388])
    >>> round(lg_star.p_sim[0], 3)
    0.101

    """
[文档]    def __init__(self, y, w, transform='R', permutations=PERMUTATIONS, star=False):
        y = np.asarray(y).flatten()
        self.n = len(y)
        self.y = y
        self.w = w
        self.w_original = w.transform
        self.w.transform = self.w_transform = transform.lower()
        self.permutations = permutations
        self.star = star
        self.calc()
        self.p_norm = np.array(
            [1 - stats.norm.cdf(np.abs(i)) for i in self.Zs])
        if permutations:
            self.__crand()
            sim = np.transpose(self.rGs)
            above = sim >= self.Gs
            larger = sum(above)
            low_extreme = (self.permutations - larger) < larger
            larger[low_extreme] = self.permutations - larger[low_extreme]
            self.p_sim = (larger + 1.0) / (permutations + 1)
            self.sim = sim
            self.EG_sim = sim.mean()
            self.seG_sim = sim.std()
            self.VG_sim = self.seG_sim * self.seG_sim
            self.z_sim = (self.Gs - self.EG_sim) / self.seG_sim
            self.p_z_sim = 1 - stats.norm.cdf(np.abs(self.z_sim))

    def __crand(self):
        y = self.y
        rGs = np.zeros((self.n, self.permutations))
        n_1 = self.n - 1
        rid = list(range(n_1))
        prange = list(range(self.permutations))
        k = self.w.max_neighbors + 1
        rids = np.array([np.random.permutation(rid)[0:k] for i in prange])
        ids = np.arange(self.w.n)
        ido = self.w.id_order
        wc = self.__getCardinalities()
        if self.w_transform == 'r':
            den = np.array(wc) + self.star
        else:
            den = np.ones(self.w.n)
        for i in range(self.w.n):
            idsi = ids[ids != i]
            np.random.shuffle(idsi)
            yi_star = y[i] * self.star
            wci = wc[i]
            rGs[i] = (y[idsi[rids[:, 0:wci]]]).sum(1) + yi_star
            rGs[i] = (np.array(rGs[i]) / den[i]) / (
                self.y_sum - (1 - self.star) * y[i])
        self.rGs = rGs

    def __getCardinalities(self):
        ido = self.w.id_order
        self.wc = np.array(
            [self.w.cardinalities[ido[i]] for i in range(self.n)])
        return self.wc

    def calc(self):
        y = self.y
        y2 = y * y
        self.y_sum = y_sum = sum(y)
        y2_sum = sum(y2)

        if not self.star:
            yl = 1.0 * slag(self.w, y)
            ydi = y_sum - y
            self.Gs = yl / ydi
            N = self.n - 1
            yl_mean = ydi / N
            s2 = (y2_sum - y2) / N - (yl_mean) ** 2
        else:
            self.w.transform = 'B'
            yl = 1.0 * slag(self.w, y)
            yl += y
            if self.w_transform == 'r':
                yl = yl / (self.__getCardinalities() + 1.0)
            self.Gs = yl / y_sum
            N = self.n
            yl_mean = y.mean()
            s2 = y.var()

        EGs_num, VGs_num = 1.0, 1.0
        if self.w_transform == 'b':
            W = self.__getCardinalities()
            W += self.star
            EGs_num = W * 1.0
            VGs_num = (W * (1.0 * N - W)) / (1.0 * N - 1)

        self.EGs = (EGs_num * 1.0) / N
        self.VGs = (VGs_num) * (1.0 / (N ** 2)) * ((s2 * 1.0) / (yl_mean ** 2))
        self.Zs = (self.Gs - self.EGs) / np.sqrt(self.VGs)

        self.w.transform = self.w_original

    @property
    def _statistic(self):
        """Standardized accessor for pysal.explore.esda statistics"""
        return self.Gs

    @classmethod
    def by_col(cls, df, cols, w=None, inplace=False, pvalue='sim', outvals=None, **stat_kws):
        """
        Function to compute a G_Local statistic on a dataframe

        Arguments
        ---------
        df          :   pandas.DataFrame
                        a pandas dataframe with a geometry column
        cols        :   string or list of string
                        name or list of names of columns to use to compute the statistic
        w           :   pysal weights object
                        a weights object aligned with the dataframe. If not provided, this
                        is searched for in the dataframe's metadata
        inplace     :   bool
                        a boolean denoting whether to operate on the dataframe inplace or to
                        return a series contaning the results of the computation. If
                        operating inplace, the derived columns will be named 'column_g_local'
        pvalue      :   string
                        a string denoting which pvalue should be returned. Refer to the
                        the G_Local statistic's documentation for available p-values
        outvals     :   list of strings
                        list of arbitrary attributes to return as columns from the
                        G_Local statistic
        **stat_kws  :   keyword arguments
                        options to pass to the underlying statistic. For this, see the
                        documentation for the G_Local statistic.

        Returns
        --------
        If inplace, None, and operation is conducted on dataframe in memory. Otherwise,
        returns a copy of the dataframe with the relevant columns attached.

        See also
        --------

        G_Local

        """
        return _univariate_handler(df, cols, w=w, inplace=inplace, pvalue=pvalue,
                                   outvals=outvals, stat=cls,
                                   swapname=cls.__name__.lower(), **stat_kws)