>>> from env_helper import info; info()
页面更新时间: 2023-12-16 22:02:25
运行环境:
    Linux发行版本: Debian GNU/Linux 12 (bookworm)
    操作系统内核: Linux-6.1.0-15-amd64-x86_64-with-glibc2.36
    Python版本: 3.11.2

8.7. Pandas注意事项&窍门

注意事项(Caveats)和陷阱(Gotchas)指的是那些不易察觉的问题,也就是在使用Pandas过程中需要特别注意的地方。

在Pandas中使用if/真值判断语句

当尝试将某个对象转换为布尔值时,Pandas遵循NumPy的惯例:抛出一个错误。这种情况发生在if语句中,或者使用布尔运算 and、or、not 的时候。下面这段代码的结果应该是什么并不明确:因为它的长度不为零,所以应为True?还是因为其中含有False值,所以应为False?由于无法确定,Pandas会抛出一个ValueError -

import pandas as pd

if pd.Series([False, True, False]):
    print ('I am True')

执行上面示例代码,得到以下结果 -

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

在if条件中,Pandas无法确定应该如何对整个Series求真值。错误信息提示你需要显式地选择一种方式来表明意图,例如使用 .empty、.any() 或 .all()。

>>> import pandas as pd
>>> if pd.Series([False, True, False]).any():
>>>     print("I am any")
I am any

要在布尔上下文中对只含单个元素的Pandas对象求值,请使用方法 .bool()(注意:.bool() 自Pandas 2.1起已被弃用,在新版本中可改用 .item())-

>>> import pandas as pd
>>> print (pd.Series([True]).bool())
True

8.7.1. 按位布尔值

按位(逐元素)布尔运算符(如 == 和 != )将返回一个布尔Series,这几乎总是我们想要的结果。

>>> import pandas as pd
>>>
>>> s = pd.Series(range(5))
>>> print (s==4)
0    False
1    False
2    False
3    False
4     True
dtype: bool

isin操作符

isin将返回一个布尔Series,表示原Series中的每个元素是否存在于传入的值序列中。

>>> import pandas as pd
>>>
>>> s = pd.Series(list('abc'))
>>> s = s.isin(['a', 'c', 'e'])
>>> print (s)
0     True
1    False
2     True
dtype: bool

重构索引与ix陷阱

许多用户习惯使用ix索引功能作为从Pandas对象中选择数据的简洁方法(ix在新版本的Pandas中已被移除,因此下面的示例使用loc代替)-

>>> import pandas as pd
>>> import numpy as np
>>>
>>> df = pd.DataFrame(np.random.randn(6, 4),
>>>                   columns=['one', 'two', 'three','four'],
>>>                   index=list('abcdef'))
>>>
>>> print (df)
>>> print ("=============================================")
>>> print (df.loc[['b', 'c', 'e']])
        one       two     three      four
a -0.109092 -1.049401  1.471157 -1.488999
b -0.620283  0.333117  0.265730  0.756193
c -0.151341 -0.088314 -2.208548 -0.361831
d  0.519641  0.547256 -1.531098  1.087439
e -0.235775 -0.563923 -0.491280  0.634726
f  1.049348  0.243605  0.549927 -1.430398
=============================================
        one       two     three      four
b -0.620283  0.333117  0.265730  0.756193
c -0.151341 -0.088314 -2.208548 -0.361831
e -0.235775 -0.563923 -0.491280  0.634726

当然,在这种情况下,这完全等同于使用reindex方法 -

>>> import pandas as pd
>>> import numpy as np
>>> df = pd.DataFrame(np.random.randn(6, 4),
>>>                   columns=['one', 'two', 'three','four'],
>>>                   index=list('abcdef'))
>>> print (df)
>>> print("=============================================")
>>> print (df.reindex(['b', 'c', 'e']))
        one       two     three      four
a -1.898824 -1.353821 -0.626713  0.189775
b  0.896474 -0.652357 -0.128021  1.419379
c  0.410777  0.730201 -0.270423 -0.548112
d -0.899703  0.866182 -0.676110 -1.597479
e  0.445195  1.646908  1.608295  0.746869
f -1.680827  0.102113  0.478425 -0.172722
=============================================
        one       two     three      four
b  0.896474 -0.652357 -0.128021  1.419379
c  0.410777  0.730201 -0.270423 -0.548112
e  0.445195  1.646908  1.608295  0.746869

有人可能会据此得出结论:ix(loc)和reindex是100%等价的。除了整数索引的情况之外,这确实成立。例如,上述操作也可以改用整数来表示 -

>>> import pandas as pd
>>> import numpy as np
>>>
>>> df = pd.DataFrame(np.random.randn(6, 4),
>>>                   columns=['one', 'two', 'three','four'],
>>>                   index=list('abcdef'))
>>>
>>> print (df)
>>> print("=====================================")
>>> print (df.loc[[1, 2, 3]])
>>> print("=====================================")
>>> print (df.reindex([1, 2, 3]))
        one       two     three      four
a  0.405851  0.612873 -0.392511 -0.579213
b  0.580977 -0.772240  0.470936  0.737463
c -0.169078  1.548112  1.513963  0.235278
d -0.535708 -1.116789  0.833861  0.285282
e -0.494875  0.108672 -0.229865 -0.501433
f -0.522854  0.787164 -0.225666  0.509507
=====================================
---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

Cell In [19], line 10
      8 print (df)
      9 print("=====================================")
---> 10 print (df.loc[[1, 2, 3]])
     11 print("=====================================")
     12 print (df.reindex([1, 2, 3]))


File /usr/lib/python3/dist-packages/pandas/core/indexing.py:1073, in _LocationIndexer.__getitem__(self, key)
   1070 axis = self.axis or 0
   1072 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1073 return self._getitem_axis(maybe_callable, axis=axis)


File /usr/lib/python3/dist-packages/pandas/core/indexing.py:1301, in _LocIndexer._getitem_axis(self, key, axis)
   1298     if hasattr(key, "ndim") and key.ndim > 1:
   1299         raise ValueError("Cannot index with multidimensional key")
-> 1301     return self._getitem_iterable(key, axis=axis)
   1303 # nested tuple slicing
   1304 if is_nested_tuple(key, labels):


File /usr/lib/python3/dist-packages/pandas/core/indexing.py:1239, in _LocIndexer._getitem_iterable(self, key, axis)
   1236 self._validate_key(key, axis)
   1238 # A collection of keys
-> 1239 keyarr, indexer = self._get_listlike_indexer(key, axis)
   1240 return self.obj._reindex_with_indexers(
   1241     {axis: [keyarr, indexer]}, copy=True, allow_dups=True
   1242 )


File /usr/lib/python3/dist-packages/pandas/core/indexing.py:1432, in _LocIndexer._get_listlike_indexer(self, key, axis)
   1429 ax = self.obj._get_axis(axis)
   1430 axis_name = self.obj._get_axis_name(axis)
-> 1432 keyarr, indexer = ax._get_indexer_strict(key, axis_name)
   1434 return keyarr, indexer


File /usr/lib/python3/dist-packages/pandas/core/indexes/base.py:6070, in Index._get_indexer_strict(self, key, axis_name)
   6067 else:
   6068     keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-> 6070 self._raise_if_missing(keyarr, indexer, axis_name)
   6072 keyarr = self.take(indexer)
   6073 if isinstance(key, Index):
   6074     # GH 42790 - Preserve name from an Index


File /usr/lib/python3/dist-packages/pandas/core/indexes/base.py:6130, in Index._raise_if_missing(self, key, indexer, axis_name)
   6128     if use_interval_msg:
   6129         key = list(key)
-> 6130     raise KeyError(f"None of [{key}] are in the [{axis_name}]")
   6132 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
   6133 raise KeyError(f"{not_found} not in index")


KeyError: "None of [Int64Index([1, 2, 3], dtype='int64')] are in the [index]"

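从上面的输出可以看到,由于索引中不存在标签1、2、3,df.loc[[1, 2, 3]] 直接抛出了KeyError,后面的reindex语句因此没有被执行;如果单独执行 df.reindex([1, 2, 3]),则不会报错,而是返回三行全为NaN的结果。

重要的是要记住,reindex只进行严格的标签索引。在某些病态情况下(例如索引同时包含整数和字符串),这可能会导致一些出人意料的结果。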