Current File : //usr/local/lib64/python3.6/site-packages/pandas/core/internals/blocks.py |
from datetime import datetime, timedelta
import inspect
import re
from typing import TYPE_CHECKING, Any, List, Optional
import warnings
import numpy as np
from pandas._libs import NaT, algos as libalgos, lib, writers
import pandas._libs.internals as libinternals
from pandas._libs.internals import BlockPlacement
from pandas._libs.tslibs import conversion
from pandas._libs.tslibs.timezones import tz_compare
from pandas._typing import ArrayLike
from pandas.util._validators import validate_bool_kwarg
from pandas.core.dtypes.cast import (
astype_nansafe,
convert_scalar_for_putitemlike,
find_common_type,
infer_dtype_from,
infer_dtype_from_scalar,
maybe_downcast_numeric,
maybe_downcast_to_dtype,
maybe_infer_dtype_type,
maybe_promote,
maybe_upcast,
soft_convert_objects,
)
from pandas.core.dtypes.common import (
DT64NS_DTYPE,
TD64NS_DTYPE,
is_bool_dtype,
is_categorical_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_dtype_equal,
is_extension_array_dtype,
is_float,
is_float_dtype,
is_integer,
is_integer_dtype,
is_interval_dtype,
is_list_like,
is_object_dtype,
is_period_dtype,
is_re,
is_re_compilable,
is_sparse,
is_timedelta64_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCIndexClass,
ABCPandasArray,
ABCSeries,
)
from pandas.core.dtypes.missing import _isna_compat, is_valid_nat_for_dtype, isna
import pandas.core.algorithms as algos
from pandas.core.array_algos.transforms import shift
from pandas.core.arrays import (
Categorical,
DatetimeArray,
ExtensionArray,
PandasArray,
PandasDtype,
TimedeltaArray,
)
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.indexers import (
check_setitem_lengths,
is_empty_indexer,
is_scalar_indexer,
)
import pandas.core.missing as missing
from pandas.core.nanops import nanpercentile
if TYPE_CHECKING:
from pandas import Index
class Block(PandasObject):
"""
Canonical n-dimensional unit of homogeneous dtype contained in a pandas
data structure
Index-ignorant; let the container take care of that
"""
__slots__ = ["_mgr_locs", "values", "ndim"]
is_numeric = False
is_float = False
is_integer = False
is_complex = False
is_datetime = False
is_datetimetz = False
is_timedelta = False
is_bool = False
is_object = False
is_categorical = False
is_extension = False
_can_hold_na = False
_can_consolidate = True
_validate_ndim = True
@classmethod
def _simple_new(
cls, values: ArrayLike, placement: BlockPlacement, ndim: int
) -> "Block":
"""
Fastpath constructor, does *no* validation
"""
obj = object.__new__(cls)
obj.ndim = ndim
obj.values = values
obj._mgr_locs = placement
return obj
def __init__(self, values, placement, ndim=None):
self.ndim = self._check_ndim(values, ndim)
self.mgr_locs = placement
self.values = values
if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values):
raise ValueError(
f"Wrong number of items passed {len(self.values)}, "
f"placement implies {len(self.mgr_locs)}"
)
def _check_ndim(self, values, ndim):
"""
ndim inference and validation.
Infers ndim from 'values' if not provided to __init__.
Validates that values.ndim and ndim are consistent if and only if
the class variable '_validate_ndim' is True.
Parameters
----------
values : array-like
ndim : int or None
Returns
-------
ndim : int
Raises
------
ValueError : the number of dimensions do not match
"""
if ndim is None:
ndim = values.ndim
if self._validate_ndim and values.ndim != ndim:
raise ValueError(
"Wrong number of dimensions. "
f"values.ndim != ndim [{values.ndim} != {ndim}]"
)
return ndim
@property
def _holder(self):
"""
The array-like that can hold the underlying values.
None for 'Block', overridden by subclasses that don't
use an ndarray.
"""
return None
@property
def _consolidate_key(self):
return (self._can_consolidate, self.dtype.name)
@property
def is_view(self) -> bool:
""" return a boolean if I am possibly a view """
return self.values.base is not None
@property
def is_datelike(self) -> bool:
""" return True if I am a non-datelike """
return self.is_datetime or self.is_timedelta
def external_values(self):
"""
The array that Series.values returns (public attribute).
This has some historical constraints, and is overridden in block
subclasses to return the correct array (e.g. period returns
object ndarray and datetimetz a datetime64[ns] ndarray instead of
proper extension array).
"""
return self.values
def internal_values(self):
"""
The array that Series._values returns (internal values).
"""
return self.values
def array_values(self) -> ExtensionArray:
"""
The array that Series.array returns. Always an ExtensionArray.
"""
return PandasArray(self.values)
def get_values(self, dtype=None):
"""
return an internal format, currently just the ndarray
this is often overridden to handle to_dense like operations
"""
if is_object_dtype(dtype):
return self.values.astype(object)
return self.values
def get_block_values_for_json(self) -> np.ndarray:
"""
This is used in the JSON C code.
"""
# TODO(EA2D): reshape will be unnecessary with 2D EAs
return np.asarray(self.values).reshape(self.shape)
@property
def fill_value(self):
return np.nan
@property
def mgr_locs(self):
return self._mgr_locs
@mgr_locs.setter
def mgr_locs(self, new_mgr_locs):
if not isinstance(new_mgr_locs, libinternals.BlockPlacement):
new_mgr_locs = libinternals.BlockPlacement(new_mgr_locs)
self._mgr_locs = new_mgr_locs
def make_block(self, values, placement=None) -> "Block":
"""
Create a new block, with type inference propagate any values that are
not specified
"""
if placement is None:
placement = self.mgr_locs
if self.is_extension:
values = _block_shape(values, ndim=self.ndim)
return make_block(values, placement=placement, ndim=self.ndim)
def make_block_same_class(self, values, placement=None, ndim=None):
""" Wrap given values in a block of same type as self. """
if placement is None:
placement = self.mgr_locs
if ndim is None:
ndim = self.ndim
return type(self)(values, placement=placement, ndim=ndim)
def __repr__(self) -> str:
# don't want to print out all of the items here
name = type(self).__name__
if self.ndim == 1:
result = f"{name}: {len(self)} dtype: {self.dtype}"
else:
shape = " x ".join(str(s) for s in self.shape)
result = f"{name}: {self.mgr_locs.indexer}, {shape}, dtype: {self.dtype}"
return result
def __len__(self) -> int:
return len(self.values)
def __getstate__(self):
return self.mgr_locs.indexer, self.values
def __setstate__(self, state):
self.mgr_locs = libinternals.BlockPlacement(state[0])
self.values = state[1]
self.ndim = self.values.ndim
def _slice(self, slicer):
""" return a slice of my values """
return self.values[slicer]
def getitem_block(self, slicer, new_mgr_locs=None):
"""
Perform __getitem__-like, return result as block.
As of now, only supports slices that preserve dimensionality.
"""
if new_mgr_locs is None:
axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer
new_mgr_locs = self.mgr_locs[axis0_slicer]
elif not isinstance(new_mgr_locs, BlockPlacement):
new_mgr_locs = BlockPlacement(new_mgr_locs)
new_values = self._slice(slicer)
if self._validate_ndim and new_values.ndim != self.ndim:
raise ValueError("Only same dim slicing is allowed")
return type(self)._simple_new(new_values, new_mgr_locs, self.ndim)
@property
def shape(self):
return self.values.shape
@property
def dtype(self):
return self.values.dtype
def iget(self, i):
return self.values[i]
def set(self, locs, values):
"""
Modify block values in-place with new item value.
Notes
-----
`set` never creates a new array or new Block, whereas `setitem` _may_
create a new array and always creates a new Block.
"""
self.values[locs] = values
def delete(self, loc) -> None:
"""
Delete given loc(-s) from block in-place.
"""
self.values = np.delete(self.values, loc, 0)
self.mgr_locs = self.mgr_locs.delete(loc)
def apply(self, func, **kwargs) -> List["Block"]:
"""
apply the function to my values; return a block if we are not
one
"""
with np.errstate(all="ignore"):
result = func(self.values, **kwargs)
return self._split_op_result(result)
def _split_op_result(self, result) -> List["Block"]:
# See also: split_and_operate
if is_extension_array_dtype(result) and result.ndim > 1:
# TODO(EA2D): unnecessary with 2D EAs
# if we get a 2D ExtensionArray, we need to split it into 1D pieces
nbs = []
for i, loc in enumerate(self.mgr_locs):
vals = result[i]
block = self.make_block(values=vals, placement=[loc])
nbs.append(block)
return nbs
if not isinstance(result, Block):
result = self.make_block(result)
return [result]
def fillna(
self, value, limit=None, inplace: bool = False, downcast=None
) -> List["Block"]:
"""
fillna on the block with the value. If we fail, then convert to
ObjectBlock and try again
"""
inplace = validate_bool_kwarg(inplace, "inplace")
mask = isna(self.values)
if limit is not None:
limit = libalgos._validate_limit(None, limit=limit)
mask[mask.cumsum(self.ndim - 1) > limit] = False
if not self._can_hold_na:
if inplace:
return [self]
else:
return [self.copy()]
if self._can_hold_element(value):
# equivalent: _try_coerce_args(value) would not raise
blocks = self.putmask(mask, value, inplace=inplace)
return self._maybe_downcast(blocks, downcast)
# we can't process the value, but nothing to do
if not mask.any():
return [self] if inplace else [self.copy()]
# operate column-by-column
def f(mask, val, idx):
block = self.coerce_to_target_dtype(value)
# slice out our block
if idx is not None:
# i.e. self.ndim == 2
block = block.getitem_block(slice(idx, idx + 1))
return block.fillna(value, limit=limit, inplace=inplace, downcast=None)
return self.split_and_operate(None, f, inplace)
def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]:
"""
split the block per-column, and apply the callable f
per-column, return a new block for each. Handle
masking which will not change a block unless needed.
Parameters
----------
mask : 2-d boolean mask
f : callable accepting (1d-mask, 1d values, indexer)
inplace : boolean
Returns
-------
list of blocks
"""
if mask is None:
mask = np.broadcast_to(True, shape=self.shape)
new_values = self.values
def make_a_block(nv, ref_loc):
if isinstance(nv, list):
assert len(nv) == 1, nv
assert isinstance(nv[0], Block)
block = nv[0]
else:
# Put back the dimension that was taken from it and make
# a block out of the result.
nv = _block_shape(nv, ndim=self.ndim)
block = self.make_block(values=nv, placement=ref_loc)
return block
# ndim == 1
if self.ndim == 1:
if mask.any():
nv = f(mask, new_values, None)
else:
nv = new_values if inplace else new_values.copy()
block = make_a_block(nv, self.mgr_locs)
return [block]
# ndim > 1
new_blocks = []
for i, ref_loc in enumerate(self.mgr_locs):
m = mask[i]
v = new_values[i]
# need a new block
if m.any():
nv = f(m, v, i)
else:
nv = v if inplace else v.copy()
block = make_a_block(nv, [ref_loc])
new_blocks.append(block)
return new_blocks
def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"]:
# no need to downcast our float
# unless indicated
if downcast is None and (
self.is_float or self.is_timedelta or self.is_datetime
):
return blocks
return _extend_blocks([b.downcast(downcast) for b in blocks])
def downcast(self, dtypes=None):
""" try to downcast each item to the dict of dtypes if present """
# turn it off completely
if dtypes is False:
return self
values = self.values
if self.ndim == 1:
# try to cast all non-floats here
if dtypes is None:
dtypes = "infer"
nv = maybe_downcast_to_dtype(values, dtypes)
return self.make_block(nv)
# ndim > 1
if dtypes is None:
return self
if not (dtypes == "infer" or isinstance(dtypes, dict)):
raise ValueError(
"downcast must have a dictionary or 'infer' as its argument"
)
elif dtypes != "infer":
raise AssertionError("dtypes as dict is not supported yet")
# operate column-by-column
# this is expensive as it splits the blocks items-by-item
def f(mask, val, idx):
val = maybe_downcast_to_dtype(val, dtype="infer")
return val
return self.split_and_operate(None, f, False)
def astype(self, dtype, copy: bool = False, errors: str = "raise"):
"""
Coerce to the new dtype.
Parameters
----------
dtype : str, dtype convertible
copy : bool, default False
copy if indicated
errors : str, {'raise', 'ignore'}, default 'ignore'
- ``raise`` : allow exceptions to be raised
- ``ignore`` : suppress exceptions. On error return original object
Returns
-------
Block
"""
errors_legal_values = ("raise", "ignore")
if errors not in errors_legal_values:
invalid_arg = (
"Expected value of kwarg 'errors' to be one of "
f"{list(errors_legal_values)}. Supplied value is '{errors}'"
)
raise ValueError(invalid_arg)
if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype):
msg = (
f"Expected an instance of {dtype.__name__}, "
"but got the class instead. Try instantiating 'dtype'."
)
raise TypeError(msg)
if dtype is not None:
dtype = pandas_dtype(dtype)
# may need to convert to categorical
if is_categorical_dtype(dtype):
if is_categorical_dtype(self.values.dtype):
# GH 10696/18593: update an existing categorical efficiently
return self.make_block(self.values.astype(dtype, copy=copy))
return self.make_block(Categorical(self.values, dtype=dtype))
dtype = pandas_dtype(dtype)
# astype processing
if is_dtype_equal(self.dtype, dtype):
if copy:
return self.copy()
return self
# force the copy here
if self.is_extension:
try:
values = self.values.astype(dtype)
except (ValueError, TypeError):
if errors == "ignore":
values = self.values
else:
raise
else:
if issubclass(dtype.type, str):
# use native type formatting for datetime/tz/timedelta
if self.is_datelike:
values = self.to_native_types()
# astype formatting
else:
# Because we have neither is_extension nor is_datelike,
# self.values already has the correct shape
values = self.values
else:
values = self.get_values(dtype=dtype)
# _astype_nansafe works fine with 1-d only
vals1d = values.ravel()
try:
values = astype_nansafe(vals1d, dtype, copy=True)
except (ValueError, TypeError):
# e.g. astype_nansafe can fail on object-dtype of strings
# trying to convert to float
if errors == "raise":
raise
newb = self.copy() if copy else self
return newb
# TODO(EA2D): special case not needed with 2D EAs
if isinstance(values, np.ndarray):
values = values.reshape(self.shape)
newb = make_block(values, placement=self.mgr_locs, ndim=self.ndim)
if newb.is_numeric and self.is_numeric:
if newb.shape != self.shape:
raise TypeError(
f"cannot set astype for copy = [{copy}] for dtype "
f"({self.dtype.name} [{self.shape}]) to different shape "
f"({newb.dtype.name} [{newb.shape}])"
)
return newb
def convert(
self,
copy: bool = True,
datetime: bool = True,
numeric: bool = True,
timedelta: bool = True,
coerce: bool = False,
):
"""
attempt to coerce any object types to better types return a copy
of the block (if copy = True) by definition we are not an ObjectBlock
here!
"""
return self.copy() if copy else self
def _can_hold_element(self, element: Any) -> bool:
""" require the same dtype as ourselves """
dtype = self.values.dtype.type
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
return issubclass(tipo.type, dtype)
return isinstance(element, dtype)
def should_store(self, value: ArrayLike) -> bool:
"""
Should we set self.values[indexer] = value inplace or do we need to cast?
Parameters
----------
value : np.ndarray or ExtensionArray
Returns
-------
bool
"""
return is_dtype_equal(value.dtype, self.dtype)
def to_native_types(self, na_rep="nan", quoting=None, **kwargs):
""" convert to our native types format """
values = self.values
mask = isna(values)
itemsize = writers.word_len(na_rep)
if not self.is_object and not quoting and itemsize:
values = values.astype(str)
if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize:
# enlarge for the na_rep
values = values.astype(f"<U{itemsize}")
else:
values = np.array(values, dtype="object")
values[mask] = na_rep
return values
# block actions #
def copy(self, deep: bool = True):
""" copy constructor """
values = self.values
if deep:
values = values.copy()
return self.make_block_same_class(values, ndim=self.ndim)
def replace(
self,
to_replace,
value,
inplace: bool = False,
regex: bool = False,
convert: bool = True,
):
"""
replace the to_replace value with value, possible to create new
blocks here this is just a call to putmask. regex is not used here.
It is used in ObjectBlocks. It is here for API compatibility.
"""
inplace = validate_bool_kwarg(inplace, "inplace")
original_to_replace = to_replace
# If we cannot replace with own dtype, convert to ObjectBlock and
# retry
if not self._can_hold_element(to_replace):
if not isinstance(to_replace, list):
if inplace:
return [self]
return [self.copy()]
to_replace = [x for x in to_replace if self._can_hold_element(x)]
if not len(to_replace):
# GH#28084 avoid costly checks since we can infer
# that there is nothing to replace in this block
if inplace:
return [self]
return [self.copy()]
if len(to_replace) == 1:
# _can_hold_element checks have reduced this back to the
# scalar case and we can avoid a costly object cast
return self.replace(
to_replace[0], value, inplace=inplace, regex=regex, convert=convert,
)
# GH 22083, TypeError or ValueError occurred within error handling
# causes infinite loop. Cast and retry only if not objectblock.
if is_object_dtype(self):
raise AssertionError
# try again with a compatible block
block = self.astype(object)
return block.replace(
to_replace=to_replace,
value=value,
inplace=inplace,
regex=regex,
convert=convert,
)
values = self.values
if lib.is_scalar(to_replace) and isinstance(values, np.ndarray):
# The only non-DatetimeLike class that also has a non-trivial
# try_coerce_args is ObjectBlock, but that overrides replace,
# so does not get here.
to_replace = convert_scalar_for_putitemlike(to_replace, values.dtype)
mask = missing.mask_missing(values, to_replace)
try:
blocks = self.putmask(mask, value, inplace=inplace)
# Note: it is _not_ the case that self._can_hold_element(value)
# is always true at this point. In particular, that can fail
# for:
# "2u" with bool-dtype, float-dtype
# 0.5 with int64-dtype
# np.nan with int64-dtype
except (TypeError, ValueError):
# GH 22083, TypeError or ValueError occurred within error handling
# causes infinite loop. Cast and retry only if not objectblock.
if is_object_dtype(self):
raise
if not self.is_extension:
# TODO: https://github.com/pandas-dev/pandas/issues/32586
# Need an ExtensionArray._can_hold_element to indicate whether
# a scalar value can be placed in the array.
assert not self._can_hold_element(value), value
# try again with a compatible block
block = self.astype(object)
return block.replace(
to_replace=original_to_replace,
value=value,
inplace=inplace,
regex=regex,
convert=convert,
)
if convert:
blocks = [b.convert(numeric=False, copy=not inplace) for b in blocks]
return blocks
def _replace_single(self, *args, **kwargs):
""" no-op on a non-ObjectBlock """
return self if kwargs["inplace"] else self.copy()
def setitem(self, indexer, value):
"""
Attempt self.values[indexer] = value, possibly creating a new array.
Parameters
----------
indexer : tuple, list-like, array-like, slice
The subset of self.values to set
value : object
The value being set
Returns
-------
Block
Notes
-----
`indexer` is a direct slice/positional indexer. `value` must
be a compatible shape.
"""
transpose = self.ndim == 2
if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
raise ValueError(f"Cannot set values with ndim > {self.ndim}")
# coerce None values, if appropriate
if value is None:
if self.is_numeric:
value = np.nan
# coerce if block dtype can store value
values = self.values
if self._can_hold_element(value):
# We only get here for non-Extension Blocks, so _try_coerce_args
# is only relevant for DatetimeBlock and TimedeltaBlock
if lib.is_scalar(value):
value = convert_scalar_for_putitemlike(value, values.dtype)
else:
# current dtype cannot store value, coerce to common dtype
if hasattr(value, "dtype"):
dtype = value.dtype
elif lib.is_scalar(value) and not isna(value):
dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True)
else:
# e.g. we are bool dtype and value is nan
# TODO: watch out for case with listlike value and scalar/empty indexer
dtype, _ = maybe_promote(np.array(value).dtype)
return self.astype(dtype).setitem(indexer, value)
dtype = find_common_type([values.dtype, dtype])
assert not is_dtype_equal(self.dtype, dtype)
# otherwise should have _can_hold_element
return self.astype(dtype).setitem(indexer, value)
# value must be storable at this moment
if is_extension_array_dtype(getattr(value, "dtype", None)):
# We need to be careful not to allow through strings that
# can be parsed to EADtypes
is_ea_value = True
arr_value = value
else:
is_ea_value = False
arr_value = np.array(value)
if transpose:
values = values.T
# length checking
check_setitem_lengths(indexer, value, values)
exact_match = (
len(arr_value.shape)
and arr_value.shape[0] == values.shape[0]
and arr_value.size == values.size
)
if is_empty_indexer(indexer, arr_value):
# GH#8669 empty indexers
pass
elif is_scalar_indexer(indexer, self.ndim):
# setting a single element for each dim and with a rhs that could
# be e.g. a list; see GH#6043
values[indexer] = value
elif exact_match and is_categorical_dtype(arr_value.dtype):
# GH25495 - If the current dtype is not categorical,
# we need to create a new categorical block
values[indexer] = value
return self.make_block(Categorical(self.values, dtype=arr_value.dtype))
elif exact_match and is_ea_value:
# GH#32395 if we're going to replace the values entirely, just
# substitute in the new array
return self.make_block(arr_value)
# if we are an exact match (ex-broadcasting),
# then use the resultant dtype
elif exact_match:
# We are setting _all_ of the array's values, so can cast to new dtype
values[indexer] = value
values = values.astype(arr_value.dtype, copy=False)
# set
else:
values[indexer] = value
if transpose:
values = values.T
block = self.make_block(values)
return block
def putmask(
self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False,
) -> List["Block"]:
"""
putmask the data to the block; it is possible that we may create a
new dtype of block
Return the resulting block(s).
Parameters
----------
mask : np.ndarray[bool], SparseArray[bool], or BooleanArray
new : a ndarray/object
inplace : bool, default False
Perform inplace modification.
axis : int
transpose : bool, default False
Set to True if self is stored with axes reversed.
Returns
-------
List[Block]
"""
mask = _extract_bool_array(mask)
assert not isinstance(new, (ABCIndexClass, ABCSeries, ABCDataFrame))
new_values = self.values # delay copy if possible.
# if we are passed a scalar None, convert it here
if not is_list_like(new) and isna(new) and not self.is_object:
# FIXME: make sure we have compatible NA
new = self.fill_value
if self._can_hold_element(new):
# We only get here for non-Extension Blocks, so _try_coerce_args
# is only relevant for DatetimeBlock and TimedeltaBlock
if lib.is_scalar(new):
new = convert_scalar_for_putitemlike(new, self.values.dtype)
if transpose:
new_values = new_values.T
# If the default repeat behavior in np.putmask would go in the
# wrong direction, then explicitly repeat and reshape new instead
if getattr(new, "ndim", 0) >= 1:
if self.ndim - 1 == new.ndim and axis == 1:
new = np.repeat(new, new_values.shape[-1]).reshape(self.shape)
new = new.astype(new_values.dtype)
if new_values is self.values and not inplace:
new_values = new_values.copy()
# we require exact matches between the len of the
# values we are setting (or is compat). np.putmask
# doesn't check this and will simply truncate / pad
# the output, but we want sane error messages
#
# TODO: this prob needs some better checking
# for 2D cases
if (
is_list_like(new)
and np.any(mask[mask])
and getattr(new, "ndim", 1) == 1
):
if mask[mask].shape[-1] == len(new):
# GH 30567
# If length of ``new`` is less than the length of ``new_values``,
# `np.putmask` would first repeat the ``new`` array and then
# assign the masked values hence produces incorrect result.
# `np.place` on the other hand uses the ``new`` values at it is
# to place in the masked locations of ``new_values``
np.place(new_values, mask, new)
elif mask.shape[-1] == len(new) or len(new) == 1:
np.putmask(new_values, mask, new)
else:
raise ValueError("cannot assign mismatch length to masked array")
else:
np.putmask(new_values, mask, new)
# maybe upcast me
elif mask.any():
if transpose:
mask = mask.T
if isinstance(new, np.ndarray):
new = new.T
axis = new_values.ndim - axis - 1
# Pseudo-broadcast
if getattr(new, "ndim", 0) >= 1:
if self.ndim - 1 == new.ndim:
new_shape = list(new.shape)
new_shape.insert(axis, 1)
new = new.reshape(tuple(new_shape))
# operate column-by-column
def f(mask, val, idx):
if idx is None:
# ndim==1 case.
n = new
else:
if isinstance(new, np.ndarray):
n = np.squeeze(new[idx % new.shape[0]])
else:
n = np.array(new)
# type of the new block
dtype, _ = maybe_promote(n.dtype)
# we need to explicitly astype here to make a copy
n = n.astype(dtype)
nv = _putmask_smart(val, mask, n)
return nv
new_blocks = self.split_and_operate(mask, f, inplace)
return new_blocks
if inplace:
return [self]
if transpose:
if new_values is None:
new_values = self.values if inplace else self.values.copy()
new_values = new_values.T
return [self.make_block(new_values)]
def coerce_to_target_dtype(self, other):
"""
coerce the current block to a dtype compat for other
we will return a block, possibly object, and not raise
we can also safely try to coerce to the same dtype
and will receive the same block
"""
# if we cannot then coerce to object
dtype, _ = infer_dtype_from(other, pandas_dtype=True)
if is_dtype_equal(self.dtype, dtype):
return self
if self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype):
# we don't upcast to bool
return self.astype(object)
elif (self.is_float or self.is_complex) and (
is_integer_dtype(dtype) or is_float_dtype(dtype)
):
# don't coerce float/complex to int
return self
elif (
self.is_datetime
or is_datetime64_dtype(dtype)
or is_datetime64tz_dtype(dtype)
):
# not a datetime
if not (
(is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype))
and self.is_datetime
):
return self.astype(object)
# don't upcast timezone with different timezone or no timezone
mytz = getattr(self.dtype, "tz", None)
othertz = getattr(dtype, "tz", None)
if not tz_compare(mytz, othertz):
return self.astype(object)
raise AssertionError(
f"possible recursion in coerce_to_target_dtype: {self} {other}"
)
elif self.is_timedelta or is_timedelta64_dtype(dtype):
# not a timedelta
if not (is_timedelta64_dtype(dtype) and self.is_timedelta):
return self.astype(object)
raise AssertionError(
f"possible recursion in coerce_to_target_dtype: {self} {other}"
)
try:
return self.astype(dtype)
except (ValueError, TypeError, OverflowError):
return self.astype(object)
def interpolate(
self,
method: str = "pad",
axis: int = 0,
index: Optional["Index"] = None,
inplace: bool = False,
limit: Optional[int] = None,
limit_direction: str = "forward",
limit_area: Optional[str] = None,
fill_value: Optional[Any] = None,
coerce: bool = False,
downcast: Optional[str] = None,
**kwargs,
):
inplace = validate_bool_kwarg(inplace, "inplace")
# Only FloatBlocks will contain NaNs. timedelta subclasses IntBlock
if (self.is_bool or self.is_integer) and not self.is_timedelta:
return self if inplace else self.copy()
# a fill na type method
try:
m = missing.clean_fill_method(method)
except ValueError:
m = None
if m is not None:
return self._interpolate_with_fill(
method=m,
axis=axis,
inplace=inplace,
limit=limit,
fill_value=fill_value,
coerce=coerce,
downcast=downcast,
)
# validate the interp method
m = missing.clean_interp_method(method, **kwargs)
assert index is not None # for mypy
return self._interpolate(
method=m,
index=index,
axis=axis,
limit=limit,
limit_direction=limit_direction,
limit_area=limit_area,
fill_value=fill_value,
inplace=inplace,
downcast=downcast,
**kwargs,
)
def _interpolate_with_fill(
self,
method: str = "pad",
axis: int = 0,
inplace: bool = False,
limit: Optional[int] = None,
fill_value: Optional[Any] = None,
coerce: bool = False,
downcast: Optional[str] = None,
) -> List["Block"]:
""" fillna but using the interpolate machinery """
inplace = validate_bool_kwarg(inplace, "inplace")
# if we are coercing, then don't force the conversion
# if the block can't hold the type
if coerce:
if not self._can_hold_na:
if inplace:
return [self]
else:
return [self.copy()]
values = self.values if inplace else self.values.copy()
# We only get here for non-ExtensionBlock
fill_value = convert_scalar_for_putitemlike(fill_value, self.values.dtype)
values = missing.interpolate_2d(
values,
method=method,
axis=axis,
limit=limit,
fill_value=fill_value,
dtype=self.dtype,
)
blocks = [self.make_block_same_class(values, ndim=self.ndim)]
return self._maybe_downcast(blocks, downcast)
def _interpolate(
self,
method: str,
index: "Index",
fill_value: Optional[Any] = None,
axis: int = 0,
limit: Optional[int] = None,
limit_direction: str = "forward",
limit_area: Optional[str] = None,
inplace: bool = False,
downcast: Optional[str] = None,
**kwargs,
) -> List["Block"]:
""" interpolate using scipy wrappers """
inplace = validate_bool_kwarg(inplace, "inplace")
data = self.values if inplace else self.values.copy()
# only deal with floats
if not self.is_float:
if not self.is_integer:
return [self]
data = data.astype(np.float64)
if fill_value is None:
fill_value = self.fill_value
if method in ("krogh", "piecewise_polynomial", "pchip"):
if not index.is_monotonic:
raise ValueError(
f"{method} interpolation requires that the index be monotonic."
)
# process 1-d slices in the axis direction
def func(yvalues: np.ndarray) -> np.ndarray:
# process a 1-d slice, returning it
# should the axis argument be handled below in apply_along_axis?
# i.e. not an arg to missing.interpolate_1d
return missing.interpolate_1d(
xvalues=index,
yvalues=yvalues,
method=method,
limit=limit,
limit_direction=limit_direction,
limit_area=limit_area,
fill_value=fill_value,
bounds_error=False,
**kwargs,
)
# interp each column independently
interp_values = np.apply_along_axis(func, axis, data)
blocks = [self.make_block_same_class(interp_values)]
return self._maybe_downcast(blocks, downcast)
def take_nd(self, indexer, axis: int, new_mgr_locs=None, fill_value=lib.no_default):
"""
Take values according to indexer and return them as a block.bb
"""
# algos.take_nd dispatches for DatetimeTZBlock, CategoricalBlock
# so need to preserve types
# sparse is treated like an ndarray, but needs .get_values() shaping
values = self.values
if fill_value is lib.no_default:
fill_value = self.fill_value
allow_fill = False
else:
allow_fill = True
new_values = algos.take_nd(
values, indexer, axis=axis, allow_fill=allow_fill, fill_value=fill_value
)
# Called from three places in managers, all of which satisfy
# this assertion
assert not (axis == 0 and new_mgr_locs is None)
if new_mgr_locs is None:
new_mgr_locs = self.mgr_locs
if not is_dtype_equal(new_values.dtype, self.dtype):
return self.make_block(new_values, new_mgr_locs)
else:
return self.make_block_same_class(new_values, new_mgr_locs)
def diff(self, n: int, axis: int = 1) -> List["Block"]:
""" return block for the diff of the values """
new_values = algos.diff(self.values, n, axis=axis, stacklevel=7)
return [self.make_block(values=new_values)]
def shift(self, periods: int, axis: int = 0, fill_value=None):
""" shift the block by periods, possibly upcast """
# convert integer to float if necessary. need to do a lot more than
# that, handle boolean etc also
new_values, fill_value = maybe_upcast(self.values, fill_value)
new_values = shift(new_values, periods, axis, fill_value)
return [self.make_block(new_values)]
def where(
self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0,
) -> List["Block"]:
"""
evaluate the block; return result block(s) from the result
Parameters
----------
other : a ndarray/object
cond : np.ndarray[bool], SparseArray[bool], or BooleanArray
errors : str, {'raise', 'ignore'}, default 'raise'
- ``raise`` : allow exceptions to be raised
- ``ignore`` : suppress exceptions. On error return original object
axis : int, default 0
Returns
-------
List[Block]
"""
import pandas.core.computation.expressions as expressions
cond = _extract_bool_array(cond)
assert not isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame))
assert errors in ["raise", "ignore"]
transpose = self.ndim == 2
values = self.values
orig_other = other
if transpose:
values = values.T
# If the default broadcasting would go in the wrong direction, then
# explicitly reshape other instead
if getattr(other, "ndim", 0) >= 1:
if values.ndim - 1 == other.ndim and axis == 1:
other = other.reshape(tuple(other.shape + (1,)))
elif transpose and values.ndim == self.ndim - 1:
cond = cond.T
if not hasattr(cond, "shape"):
raise ValueError("where must have a condition that is ndarray like")
def where_func(cond, values, other):
if not (
(self.is_integer or self.is_bool)
and lib.is_float(other)
and np.isnan(other)
):
# np.where will cast integer array to floats in this case
if not self._can_hold_element(other):
raise TypeError
if lib.is_scalar(other) and isinstance(values, np.ndarray):
# convert datetime to datetime64, timedelta to timedelta64
other = convert_scalar_for_putitemlike(other, values.dtype)
# By the time we get here, we should have all Series/Index
# args extracted to ndarray
fastres = expressions.where(cond, values, other)
return fastres
if cond.ravel("K").all():
result = values
else:
# see if we can operate on the entire block, or need item-by-item
# or if we are a single block (ndim == 1)
try:
result = where_func(cond, values, other)
except TypeError:
# we cannot coerce, return a compat dtype
# we are explicitly ignoring errors
block = self.coerce_to_target_dtype(other)
blocks = block.where(
orig_other, cond, errors=errors, try_cast=try_cast, axis=axis,
)
return self._maybe_downcast(blocks, "infer")
if self._can_hold_na or self.ndim == 1:
if transpose:
result = result.T
return [self.make_block(result)]
# might need to separate out blocks
axis = cond.ndim - 1
cond = cond.swapaxes(axis, 0)
mask = np.array([cond[i].all() for i in range(cond.shape[0])], dtype=bool)
result_blocks = []
for m in [mask, ~mask]:
if m.any():
taken = result.take(m.nonzero()[0], axis=axis)
r = maybe_downcast_numeric(taken, self.dtype)
nb = self.make_block(r.T, placement=self.mgr_locs[m])
result_blocks.append(nb)
return result_blocks
def _unstack(self, unstacker, fill_value, new_placement):
"""
Return a list of unstacked blocks of self
Parameters
----------
unstacker : reshape._Unstacker
fill_value : int
Only used in ExtensionBlock._unstack
Returns
-------
blocks : list of Block
New blocks of unstacked values.
mask : array_like of bool
The mask of columns of `blocks` we should keep.
"""
new_values, mask = unstacker.get_new_values(
self.values.T, fill_value=fill_value
)
mask = mask.any(0)
# TODO: in all tests we have mask.all(); can we rely on that?
new_values = new_values.T[mask]
new_placement = new_placement[mask]
blocks = [make_block(new_values, placement=new_placement)]
return blocks, mask
def quantile(self, qs, interpolation="linear", axis: int = 0):
"""
compute the quantiles of the
Parameters
----------
qs: a scalar or list of the quantiles to be computed
interpolation: type of interpolation, default 'linear'
axis: axis to compute, default 0
Returns
-------
Block
"""
# We should always have ndim == 2 because Series dispatches to DataFrame
assert self.ndim == 2
values = self.get_values()
is_empty = values.shape[axis] == 0
orig_scalar = not is_list_like(qs)
if orig_scalar:
# make list-like, unpack later
qs = [qs]
if is_empty:
# create the array of na_values
# 2d len(values) * len(qs)
result = np.repeat(
np.array([self.fill_value] * len(qs)), len(values)
).reshape(len(values), len(qs))
else:
# asarray needed for Sparse, see GH#24600
mask = np.asarray(isna(values))
result = nanpercentile(
values,
np.array(qs) * 100,
axis=axis,
na_value=self.fill_value,
mask=mask,
ndim=values.ndim,
interpolation=interpolation,
)
result = np.array(result, copy=False)
result = result.T
if orig_scalar and not lib.is_scalar(result):
# result could be scalar in case with is_empty and self.ndim == 1
assert result.shape[-1] == 1, result.shape
result = result[..., 0]
result = lib.item_from_zerodim(result)
ndim = np.ndim(result)
return make_block(result, placement=np.arange(len(result)), ndim=ndim)
def _replace_coerce(
self,
to_replace,
value,
inplace: bool = True,
regex: bool = False,
convert: bool = False,
mask=None,
):
"""
Replace value corresponding to the given boolean array with another
value.
Parameters
----------
to_replace : object or pattern
Scalar to replace or regular expression to match.
value : object
Replacement object.
inplace : bool, default True
Perform inplace modification.
regex : bool, default False
If true, perform regular expression substitution.
convert : bool, default True
If true, try to coerce any object types to better types.
mask : array-like of bool, optional
True indicate corresponding element is ignored.
Returns
-------
A new block if there is anything to replace or the original block.
"""
if mask.any():
if not regex:
self = self.coerce_to_target_dtype(value)
return self.putmask(mask, value, inplace=inplace)
else:
return self._replace_single(
to_replace,
value,
inplace=inplace,
regex=regex,
convert=convert,
mask=mask,
)
return self
class ExtensionBlock(Block):
"""
Block for holding extension types.
Notes
-----
This holds all 3rd-party extension array types. It's also the immediate
parent class for our internal extension types' blocks, CategoricalBlock.
ExtensionArrays are limited to 1-D.
"""
_can_consolidate = False
_validate_ndim = False
is_extension = True
def __init__(self, values, placement, ndim=None):
"""
Initialize a non-consolidatable block.
'ndim' may be inferred from 'placement'.
This will call continue to call __init__ for the other base
classes mixed in with this Mixin.
"""
values = self._maybe_coerce_values(values)
# Placement must be converted to BlockPlacement so that we can check
# its length
if not isinstance(placement, libinternals.BlockPlacement):
placement = libinternals.BlockPlacement(placement)
# Maybe infer ndim from placement
if ndim is None:
if len(placement) != 1:
ndim = 1
else:
ndim = 2
super().__init__(values, placement, ndim=ndim)
if self.ndim == 2 and len(self.mgr_locs) != 1:
# TODO(EA2D): check unnecessary with 2D EAs
raise AssertionError("block.size != values.size")
@property
def shape(self):
# TODO(EA2D): override unnecessary with 2D EAs
if self.ndim == 1:
return ((len(self.values)),)
return (len(self.mgr_locs), len(self.values))
def iget(self, col):
if self.ndim == 2 and isinstance(col, tuple):
# TODO(EA2D): unnecessary with 2D EAs
col, loc = col
if not com.is_null_slice(col) and col != 0:
raise IndexError(f"{self} only contains one item")
elif isinstance(col, slice):
if col != slice(None):
raise NotImplementedError(col)
return self.values[[loc]]
return self.values[loc]
else:
if col != 0:
raise IndexError(f"{self} only contains one item")
return self.values
def should_store(self, value: ArrayLike) -> bool:
"""
Can we set the given array-like value inplace?
"""
return isinstance(value, self._holder)
def set(self, locs, values):
assert locs.tolist() == [0]
self.values = values
def putmask(
self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False,
) -> List["Block"]:
"""
See Block.putmask.__doc__
"""
inplace = validate_bool_kwarg(inplace, "inplace")
mask = _extract_bool_array(mask)
new_values = self.values if inplace else self.values.copy()
if isinstance(new, (np.ndarray, ExtensionArray)) and len(new) == len(mask):
new = new[mask]
mask = _safe_reshape(mask, new_values.shape)
new_values[mask] = new
return [self.make_block(values=new_values)]
def _maybe_coerce_values(self, values):
"""
Unbox to an extension array.
This will unbox an ExtensionArray stored in an Index or Series.
ExtensionArrays pass through. No dtype coercion is done.
Parameters
----------
values : Index, Series, ExtensionArray
Returns
-------
ExtensionArray
"""
return extract_array(values)
@property
def _holder(self):
# For extension blocks, the holder is values-dependent.
return type(self.values)
@property
def fill_value(self):
# Used in reindex_indexer
return self.values.dtype.na_value
@property
def _can_hold_na(self):
# The default ExtensionArray._can_hold_na is True
return self._holder._can_hold_na
@property
def is_view(self) -> bool:
"""Extension arrays are never treated as views."""
return False
@property
def is_numeric(self):
return self.values.dtype._is_numeric
def setitem(self, indexer, value):
"""
Attempt self.values[indexer] = value, possibly creating a new array.
This differs from Block.setitem by not allowing setitem to change
the dtype of the Block.
Parameters
----------
indexer : tuple, list-like, array-like, slice
The subset of self.values to set
value : object
The value being set
Returns
-------
Block
Notes
-----
`indexer` is a direct slice/positional indexer. `value` must
be a compatible shape.
"""
if isinstance(indexer, tuple):
# TODO(EA2D): not needed with 2D EAs
# we are always 1-D
indexer = indexer[0]
check_setitem_lengths(indexer, value, self.values)
self.values[indexer] = value
return self
def get_values(self, dtype=None):
# ExtensionArrays must be iterable, so this works.
# TODO(EA2D): reshape not needed with 2D EAs
return np.asarray(self.values).reshape(self.shape)
def array_values(self) -> ExtensionArray:
return self.values
def to_native_types(self, na_rep="nan", quoting=None, **kwargs):
"""override to use ExtensionArray astype for the conversion"""
values = self.values
mask = isna(values)
values = np.asarray(values.astype(object))
values[mask] = na_rep
# TODO(EA2D): reshape not needed with 2D EAs
# we are expected to return a 2-d ndarray
return values.reshape(1, len(values))
def take_nd(
self, indexer, axis: int = 0, new_mgr_locs=None, fill_value=lib.no_default
):
"""
Take values according to indexer and return them as a block.
"""
if fill_value is lib.no_default:
fill_value = None
# TODO(EA2D): special case not needed with 2D EAs
# axis doesn't matter; we are really a single-dim object
# but are passed the axis depending on the calling routing
# if its REALLY axis 0, then this will be a reindex and not a take
new_values = self.values.take(indexer, fill_value=fill_value, allow_fill=True)
# Called from three places in managers, all of which satisfy
# this assertion
assert not (self.ndim == 1 and new_mgr_locs is None)
if new_mgr_locs is None:
new_mgr_locs = self.mgr_locs
return self.make_block_same_class(new_values, new_mgr_locs)
def _can_hold_element(self, element: Any) -> bool:
# TODO: We may need to think about pushing this onto the array.
# We're doing the same as CategoricalBlock here.
return True
def _slice(self, slicer):
"""
Return a slice of my values.
Parameters
----------
slicer : slice, ndarray[int], or a tuple of these
Valid (non-reducing) indexer for self.values.
Returns
-------
np.ndarray or ExtensionArray
"""
# return same dims as we currently have
if not isinstance(slicer, tuple) and self.ndim == 2:
# reached via getitem_block via _slice_take_blocks_ax0
# TODO(EA2D): wont be necessary with 2D EAs
slicer = (slicer, slice(None))
if isinstance(slicer, tuple) and len(slicer) == 2:
first = slicer[0]
if not isinstance(first, slice):
raise AssertionError(
"invalid slicing for a 1-ndim ExtensionArray", first
)
# GH#32959 only full-slicers along fake-dim0 are valid
# TODO(EA2D): wont be necessary with 2D EAs
new_locs = self.mgr_locs[first]
if len(new_locs):
# effectively slice(None)
slicer = slicer[1]
else:
raise AssertionError(
"invalid slicing for a 1-ndim ExtensionArray", slicer
)
return self.values[slicer]
def fillna(self, value, limit=None, inplace=False, downcast=None):
values = self.values if inplace else self.values.copy()
values = values.fillna(value=value, limit=limit)
return [
self.make_block_same_class(
values=values, placement=self.mgr_locs, ndim=self.ndim
)
]
def interpolate(
self, method="pad", axis=0, inplace=False, limit=None, fill_value=None, **kwargs
):
values = self.values if inplace else self.values.copy()
return self.make_block_same_class(
values=values.fillna(value=fill_value, method=method, limit=limit),
placement=self.mgr_locs,
)
def diff(self, n: int, axis: int = 1) -> List["Block"]:
if axis == 0 and n != 0:
# n==0 case will be a no-op so let is fall through
# Since we only have one column, the result will be all-NA.
# Create this result by shifting along axis=0 past the length of
# our values.
return super().diff(len(self.values), axis=0)
if axis == 1:
# TODO(EA2D): unnecessary with 2D EAs
# we are by definition 1D.
axis = 0
return super().diff(n, axis)
def shift(
self, periods: int, axis: int = 0, fill_value: Any = None,
) -> List["ExtensionBlock"]:
"""
Shift the block by `periods`.
Dispatches to underlying ExtensionArray and re-boxes in an
ExtensionBlock.
"""
return [
self.make_block_same_class(
self.values.shift(periods=periods, fill_value=fill_value),
placement=self.mgr_locs,
ndim=self.ndim,
)
]
def where(
self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0,
) -> List["Block"]:
cond = _extract_bool_array(cond)
assert not isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame))
if isinstance(other, np.ndarray) and other.ndim == 2:
# TODO(EA2D): unnecessary with 2D EAs
assert other.shape[1] == 1
other = other[:, 0]
if isinstance(cond, np.ndarray) and cond.ndim == 2:
# TODO(EA2D): unnecessary with 2D EAs
assert cond.shape[1] == 1
cond = cond[:, 0]
if lib.is_scalar(other) and isna(other):
# The default `other` for Series / Frame is np.nan
# we want to replace that with the correct NA value
# for the type
other = self.dtype.na_value
if is_sparse(self.values):
# TODO(SparseArray.__setitem__): remove this if condition
# We need to re-infer the type of the data after doing the
# where, for cases where the subtypes don't match
dtype = None
else:
dtype = self.dtype
result = self.values.copy()
icond = ~cond
if lib.is_scalar(other):
set_other = other
else:
set_other = other[icond]
try:
result[icond] = set_other
except (NotImplementedError, TypeError):
# NotImplementedError for class not implementing `__setitem__`
# TypeError for SparseArray, which implements just to raise
# a TypeError
result = self._holder._from_sequence(
np.where(cond, self.values, other), dtype=dtype
)
return [self.make_block_same_class(result, placement=self.mgr_locs)]
def _unstack(self, unstacker, fill_value, new_placement):
# ExtensionArray-safe unstack.
# We override ObjectBlock._unstack, which unstacks directly on the
# values of the array. For EA-backed blocks, this would require
# converting to a 2-D ndarray of objects.
# Instead, we unstack an ndarray of integer positions, followed by
# a `take` on the actual values.
n_rows = self.shape[-1]
dummy_arr = np.arange(n_rows)
new_values, mask = unstacker.get_new_values(dummy_arr, fill_value=-1)
mask = mask.any(0)
# TODO: in all tests we have mask.all(); can we rely on that?
blocks = [
self.make_block_same_class(
self.values.take(indices, allow_fill=True, fill_value=fill_value),
[place],
)
for indices, place in zip(new_values.T, new_placement)
]
return blocks, mask
class ObjectValuesExtensionBlock(ExtensionBlock):
"""
Block providing backwards-compatibility for `.values`.
Used by PeriodArray and IntervalArray to ensure that
Series[T].values is an ndarray of objects.
"""
def external_values(self):
return self.values.astype(object)
class NumericBlock(Block):
__slots__ = ()
is_numeric = True
_can_hold_na = True
class FloatOrComplexBlock(NumericBlock):
__slots__ = ()
class FloatBlock(FloatOrComplexBlock):
__slots__ = ()
is_float = True
def _can_hold_element(self, element: Any) -> bool:
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
return issubclass(tipo.type, (np.floating, np.integer)) and not issubclass(
tipo.type, (np.datetime64, np.timedelta64)
)
return isinstance(
element, (float, int, np.floating, np.int_)
) and not isinstance(
element,
(bool, np.bool_, datetime, timedelta, np.datetime64, np.timedelta64),
)
def to_native_types(
self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs,
):
""" convert to our native types format """
values = self.values
# see gh-13418: no special formatting is desired at the
# output (important for appropriate 'quoting' behaviour),
# so do not pass it through the FloatArrayFormatter
if float_format is None and decimal == ".":
mask = isna(values)
if not quoting:
values = values.astype(str)
else:
values = np.array(values, dtype="object")
values[mask] = na_rep
return values
from pandas.io.formats.format import FloatArrayFormatter
formatter = FloatArrayFormatter(
values,
na_rep=na_rep,
float_format=float_format,
decimal=decimal,
quoting=quoting,
fixed_width=False,
)
return formatter.get_result_as_array()
class ComplexBlock(FloatOrComplexBlock):
__slots__ = ()
is_complex = True
def _can_hold_element(self, element: Any) -> bool:
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
return issubclass(tipo.type, (np.floating, np.integer, np.complexfloating))
return isinstance(
element, (float, int, complex, np.float_, np.int_)
) and not isinstance(element, (bool, np.bool_))
def should_store(self, value: ArrayLike) -> bool:
return issubclass(value.dtype.type, np.complexfloating)
class IntBlock(NumericBlock):
__slots__ = ()
is_integer = True
_can_hold_na = False
def _can_hold_element(self, element: Any) -> bool:
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
return (
issubclass(tipo.type, np.integer)
and not issubclass(tipo.type, (np.datetime64, np.timedelta64))
and self.dtype.itemsize >= tipo.itemsize
)
# We have not inferred an integer from the dtype
# check if we have a builtin int or a float equal to an int
return is_integer(element) or (is_float(element) and element.is_integer())
class DatetimeLikeBlockMixin:
"""Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock."""
@property
def _holder(self):
return DatetimeArray
@property
def fill_value(self):
return np.datetime64("NaT", "ns")
def get_values(self, dtype=None):
"""
return object dtype as boxed values, such as Timestamps/Timedelta
"""
if is_object_dtype(dtype):
# DTA/TDA constructor and astype can handle 2D
return self._holder(self.values).astype(object)
return self.values
def internal_values(self):
# Override to return DatetimeArray and TimedeltaArray
return self.array_values()
def array_values(self):
return self._holder._simple_new(self.values)
def iget(self, key):
# GH#31649 we need to wrap scalars in Timestamp/Timedelta
# TODO(EA2D): this can be removed if we ever have 2D EA
return self.array_values().reshape(self.shape)[key]
def shift(self, periods, axis=0, fill_value=None):
# TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs
values = self.array_values()
new_values = values.shift(periods, fill_value=fill_value, axis=axis)
return self.make_block_same_class(new_values)
class DatetimeBlock(DatetimeLikeBlockMixin, Block):
__slots__ = ()
is_datetime = True
def __init__(self, values, placement, ndim=None):
values = self._maybe_coerce_values(values)
super().__init__(values, placement=placement, ndim=ndim)
@property
def _can_hold_na(self):
return True
def _maybe_coerce_values(self, values):
"""
Input validation for values passed to __init__. Ensure that
we have datetime64ns, coercing if necessary.
Parameters
----------
values : array-like
Must be convertible to datetime64
Returns
-------
values : ndarray[datetime64ns]
Overridden by DatetimeTZBlock.
"""
if values.dtype != DT64NS_DTYPE:
values = conversion.ensure_datetime64ns(values)
if isinstance(values, DatetimeArray):
values = values._data
assert isinstance(values, np.ndarray), type(values)
return values
def astype(self, dtype, copy: bool = False, errors: str = "raise"):
"""
these automatically copy, so copy=True has no effect
raise on an except if raise == True
"""
dtype = pandas_dtype(dtype)
# if we are passed a datetime64[ns, tz]
if is_datetime64tz_dtype(dtype):
values = self.values
if copy:
# this should be the only copy
values = values.copy()
if getattr(values, "tz", None) is None:
values = DatetimeArray(values).tz_localize("UTC")
values = values.tz_convert(dtype.tz)
return self.make_block(values)
# delegate
return super().astype(dtype=dtype, copy=copy, errors=errors)
def _can_hold_element(self, element: Any) -> bool:
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
if self.is_datetimetz:
# require exact match, since non-nano does not exist
return is_dtype_equal(tipo, self.dtype) or is_valid_nat_for_dtype(
element, self.dtype
)
# GH#27419 if we get a non-nano datetime64 object
return is_datetime64_dtype(tipo)
elif element is NaT:
return True
elif isinstance(element, datetime):
if self.is_datetimetz:
return tz_compare(element.tzinfo, self.dtype.tz)
return element.tzinfo is None
return is_valid_nat_for_dtype(element, self.dtype)
def to_native_types(self, na_rep="NaT", date_format=None, **kwargs):
""" convert to our native types format """
dta = self.array_values()
result = dta._format_native_types(
na_rep=na_rep, date_format=date_format, **kwargs
)
return np.atleast_2d(result)
def set(self, locs, values):
"""
See Block.set.__doc__
"""
values = conversion.ensure_datetime64ns(values, copy=False)
self.values[locs] = values
class DatetimeTZBlock(ExtensionBlock, DatetimeBlock):
""" implement a datetime64 block with a tz attribute """
__slots__ = ()
is_datetimetz = True
is_extension = True
internal_values = Block.internal_values
_can_hold_element = DatetimeBlock._can_hold_element
to_native_types = DatetimeBlock.to_native_types
fill_value = np.datetime64("NaT", "ns")
should_store = Block.should_store
array_values = ExtensionBlock.array_values
@property
def _holder(self):
return DatetimeArray
def _maybe_coerce_values(self, values):
"""
Input validation for values passed to __init__. Ensure that
we have datetime64TZ, coercing if necessary.
Parameters
----------
values : array-like
Must be convertible to datetime64
Returns
-------
values : DatetimeArray
"""
if not isinstance(values, self._holder):
values = self._holder(values)
if values.tz is None:
raise ValueError("cannot create a DatetimeTZBlock without a tz")
return values
@property
def is_view(self) -> bool:
""" return a boolean if I am possibly a view """
# check the ndarray values of the DatetimeIndex values
return self.values._data.base is not None
def get_values(self, dtype=None):
"""
Returns an ndarray of values.
Parameters
----------
dtype : np.dtype
Only `object`-like dtypes are respected here (not sure
why).
Returns
-------
values : ndarray
When ``dtype=object``, then and object-dtype ndarray of
boxed values is returned. Otherwise, an M8[ns] ndarray
is returned.
DatetimeArray is always 1-d. ``get_values`` will reshape
the return value to be the same dimensionality as the
block.
"""
values = self.values
if is_object_dtype(dtype):
values = values.astype(object)
# TODO(EA2D): reshape unnecessary with 2D EAs
# Ensure that our shape is correct for DataFrame.
# ExtensionArrays are always 1-D, even in a DataFrame when
# the analogous NumPy-backed column would be a 2-D ndarray.
return np.asarray(values).reshape(self.shape)
def external_values(self):
# NB: this is different from np.asarray(self.values), since that
# return an object-dtype ndarray of Timestamps.
return np.asarray(self.values.astype("datetime64[ns]", copy=False))
def diff(self, n: int, axis: int = 0) -> List["Block"]:
"""
1st discrete difference.
Parameters
----------
n : int
Number of periods to diff.
axis : int, default 0
Axis to diff upon.
Returns
-------
A list with a new TimeDeltaBlock.
Notes
-----
The arguments here are mimicking shift so they are called correctly
by apply.
"""
if axis == 0:
# TODO(EA2D): special case not needed with 2D EAs
# Cannot currently calculate diff across multiple blocks since this
# function is invoked via apply
raise NotImplementedError
if n == 0:
# Fastpath avoids making a copy in `shift`
new_values = np.zeros(self.values.shape, dtype=np.int64)
else:
new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8
# Reshape the new_values like how algos.diff does for timedelta data
new_values = new_values.reshape(1, len(new_values))
new_values = new_values.astype("timedelta64[ns]")
return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)]
def fillna(self, value, limit=None, inplace=False, downcast=None):
# We support filling a DatetimeTZ with a `value` whose timezone
# is different by coercing to object.
if self._can_hold_element(value):
return super().fillna(value, limit, inplace, downcast)
# different timezones, or a non-tz
return self.astype(object).fillna(
value, limit=limit, inplace=inplace, downcast=downcast
)
def setitem(self, indexer, value):
# https://github.com/pandas-dev/pandas/issues/24020
# Need a dedicated setitem until #24020 (type promotion in setitem
# for extension arrays) is designed and implemented.
if self._can_hold_element(value) or (
isinstance(indexer, np.ndarray) and indexer.size == 0
):
return super().setitem(indexer, value)
obj_vals = self.values.astype(object)
newb = make_block(
obj_vals, placement=self.mgr_locs, klass=ObjectBlock, ndim=self.ndim
)
return newb.setitem(indexer, value)
def quantile(self, qs, interpolation="linear", axis=0):
naive = self.values.view("M8[ns]")
# TODO(EA2D): kludge for 2D block with 1D values
naive = naive.reshape(self.shape)
blk = self.make_block(naive)
res_blk = blk.quantile(qs, interpolation=interpolation, axis=axis)
# TODO(EA2D): ravel is kludge for 2D block with 1D values, assumes column-like
aware = self._holder(res_blk.values.ravel(), dtype=self.dtype)
return self.make_block_same_class(aware, ndim=res_blk.ndim)
def _check_ndim(self, values, ndim):
"""
ndim inference and validation.
This is overriden by the DatetimeTZBlock to check the case of 2D
data (values.ndim == 2), which should only be allowed if ndim is
also 2.
The case of 1D array is still allowed with both ndim of 1 or 2, as
if the case for other EAs. Therefore, we are only checking
`values.ndim > ndim` instead of `values.ndim != ndim` as for
consolidated blocks.
"""
if ndim is None:
ndim = values.ndim
if values.ndim > ndim:
raise ValueError(
"Wrong number of dimensions. "
f"values.ndim != ndim [{values.ndim} != {ndim}]"
)
return ndim
class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock):
__slots__ = ()
is_timedelta = True
_can_hold_na = True
is_numeric = False
fill_value = np.timedelta64("NaT", "ns")
def __init__(self, values, placement, ndim=None):
if values.dtype != TD64NS_DTYPE:
# e.g. non-nano or int64
values = TimedeltaArray._from_sequence(values)._data
if isinstance(values, TimedeltaArray):
values = values._data
assert isinstance(values, np.ndarray), type(values)
super().__init__(values, placement=placement, ndim=ndim)
@property
def _holder(self):
return TimedeltaArray
def _can_hold_element(self, element: Any) -> bool:
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
return issubclass(tipo.type, np.timedelta64)
elif element is NaT:
return True
elif isinstance(element, (timedelta, np.timedelta64)):
return True
return is_valid_nat_for_dtype(element, self.dtype)
def fillna(self, value, **kwargs):
# allow filling with integers to be
# interpreted as nanoseconds
if is_integer(value):
# Deprecation GH#24694, GH#19233
raise TypeError(
"Passing integers to fillna for timedelta64[ns] dtype is no "
"longer supported. To obtain the old behavior, pass "
"`pd.Timedelta(seconds=n)` instead."
)
return super().fillna(value, **kwargs)
def to_native_types(self, na_rep="NaT", **kwargs):
""" convert to our native types format """
tda = self.array_values()
return tda._format_native_types(na_rep, **kwargs)
class BoolBlock(NumericBlock):
__slots__ = ()
is_bool = True
_can_hold_na = False
def _can_hold_element(self, element: Any) -> bool:
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
return issubclass(tipo.type, np.bool_)
return isinstance(element, (bool, np.bool_))
def replace(self, to_replace, value, inplace=False, regex=False, convert=True):
inplace = validate_bool_kwarg(inplace, "inplace")
to_replace_values = np.atleast_1d(to_replace)
if not np.can_cast(to_replace_values, bool):
return self
return super().replace(
to_replace, value, inplace=inplace, regex=regex, convert=convert,
)
class ObjectBlock(Block):
__slots__ = ()
is_object = True
_can_hold_na = True
def __init__(self, values, placement=None, ndim=2):
if issubclass(values.dtype.type, str):
values = np.array(values, dtype=object)
super().__init__(values, ndim=ndim, placement=placement)
@property
def is_bool(self):
"""
we can be a bool if we have only bool values but are of type
object
"""
return lib.is_bool_array(self.values.ravel("K"))
def convert(
self,
copy: bool = True,
datetime: bool = True,
numeric: bool = True,
timedelta: bool = True,
coerce: bool = False,
):
"""
attempt to coerce any object types to better types return a copy of
the block (if copy = True) by definition we ARE an ObjectBlock!!!!!
can return multiple blocks!
"""
# operate column-by-column
def f(mask, val, idx):
shape = val.shape
values = soft_convert_objects(
val.ravel(),
datetime=datetime,
numeric=numeric,
timedelta=timedelta,
coerce=coerce,
copy=copy,
)
if isinstance(values, np.ndarray):
# TODO(EA2D): allow EA once reshape is supported
values = values.reshape(shape)
return values
if self.ndim == 2:
blocks = self.split_and_operate(None, f, False)
else:
values = f(None, self.values.ravel(), None)
blocks = [make_block(values, ndim=self.ndim, placement=self.mgr_locs)]
return blocks
def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"]:
if downcast is not None:
return blocks
# split and convert the blocks
return _extend_blocks([b.convert(datetime=True, numeric=False) for b in blocks])
def _can_hold_element(self, element: Any) -> bool:
return True
def replace(self, to_replace, value, inplace=False, regex=False, convert=True):
to_rep_is_list = is_list_like(to_replace)
value_is_list = is_list_like(value)
both_lists = to_rep_is_list and value_is_list
either_list = to_rep_is_list or value_is_list
result_blocks = []
blocks = [self]
if not either_list and is_re(to_replace):
return self._replace_single(
to_replace, value, inplace=inplace, regex=True, convert=convert,
)
elif not (either_list or regex):
return super().replace(
to_replace, value, inplace=inplace, regex=regex, convert=convert,
)
elif both_lists:
for to_rep, v in zip(to_replace, value):
result_blocks = []
for b in blocks:
result = b._replace_single(
to_rep, v, inplace=inplace, regex=regex, convert=convert,
)
result_blocks = _extend_blocks(result, result_blocks)
blocks = result_blocks
return result_blocks
elif to_rep_is_list and regex:
for to_rep in to_replace:
result_blocks = []
for b in blocks:
result = b._replace_single(
to_rep, value, inplace=inplace, regex=regex, convert=convert,
)
result_blocks = _extend_blocks(result, result_blocks)
blocks = result_blocks
return result_blocks
return self._replace_single(
to_replace, value, inplace=inplace, convert=convert, regex=regex,
)
def _replace_single(
self, to_replace, value, inplace=False, regex=False, convert=True, mask=None,
):
"""
Replace elements by the given value.
Parameters
----------
to_replace : object or pattern
Scalar to replace or regular expression to match.
value : object
Replacement object.
inplace : bool, default False
Perform inplace modification.
regex : bool, default False
If true, perform regular expression substitution.
convert : bool, default True
If true, try to coerce any object types to better types.
mask : array-like of bool, optional
True indicate corresponding element is ignored.
Returns
-------
a new block, the result after replacing
"""
inplace = validate_bool_kwarg(inplace, "inplace")
# to_replace is regex compilable
to_rep_re = regex and is_re_compilable(to_replace)
# regex is regex compilable
regex_re = is_re_compilable(regex)
# only one will survive
if to_rep_re and regex_re:
raise AssertionError(
"only one of to_replace and regex can be regex compilable"
)
# if regex was passed as something that can be a regex (rather than a
# boolean)
if regex_re:
to_replace = regex
regex = regex_re or to_rep_re
# try to get the pattern attribute (compiled re) or it's a string
if is_re(to_replace):
pattern = to_replace.pattern
else:
pattern = to_replace
# if the pattern is not empty and to_replace is either a string or a
# regex
if regex and pattern:
rx = re.compile(to_replace)
else:
# if the thing to replace is not a string or compiled regex call
# the superclass method -> to_replace is some kind of object
return super().replace(to_replace, value, inplace=inplace, regex=regex)
new_values = self.values if inplace else self.values.copy()
# deal with replacing values with objects (strings) that match but
# whose replacement is not a string (numeric, nan, object)
if isna(value) or not isinstance(value, str):
def re_replacer(s):
if is_re(rx) and isinstance(s, str):
return value if rx.search(s) is not None else s
else:
return s
else:
# value is guaranteed to be a string here, s can be either a string
# or null if it's null it gets returned
def re_replacer(s):
if is_re(rx) and isinstance(s, str):
return rx.sub(value, s)
else:
return s
f = np.vectorize(re_replacer, otypes=[self.dtype])
if mask is None:
new_values[:] = f(new_values)
else:
new_values[mask] = f(new_values[mask])
# convert
block = self.make_block(new_values)
if convert:
block = block.convert(numeric=False)
return block
def _replace_coerce(
self, to_replace, value, inplace=True, regex=False, convert=False, mask=None
):
"""
Replace value corresponding to the given boolean array with another
value.
Parameters
----------
to_replace : object or pattern
Scalar to replace or regular expression to match.
value : object
Replacement object.
inplace : bool, default False
Perform inplace modification.
regex : bool, default False
If true, perform regular expression substitution.
convert : bool, default True
If true, try to coerce any object types to better types.
mask : array-like of bool, optional
True indicate corresponding element is ignored.
Returns
-------
A new block if there is anything to replace or the original block.
"""
if mask.any():
block = super()._replace_coerce(
to_replace=to_replace,
value=value,
inplace=inplace,
regex=regex,
convert=convert,
mask=mask,
)
if convert:
block = [b.convert(numeric=False, copy=True) for b in block]
return block
if convert:
return [self.convert(numeric=False, copy=True)]
return self
class CategoricalBlock(ExtensionBlock):
__slots__ = ()
is_categorical = True
_can_hold_na = True
should_store = Block.should_store
def __init__(self, values, placement, ndim=None):
# coerce to categorical if we can
values = extract_array(values)
assert isinstance(values, Categorical), type(values)
super().__init__(values, placement=placement, ndim=ndim)
@property
def _holder(self):
return Categorical
def replace(
self,
to_replace,
value,
inplace: bool = False,
regex: bool = False,
convert: bool = True,
):
inplace = validate_bool_kwarg(inplace, "inplace")
result = self if inplace else self.copy()
result.values.replace(to_replace, value, inplace=True)
return result
# -----------------------------------------------------------------
# Constructor Helpers
def get_block_type(values, dtype=None):
"""
Find the appropriate Block subclass to use for the given values and dtype.
Parameters
----------
values : ndarray-like
dtype : numpy or pandas dtype
Returns
-------
cls : class, subclass of Block
"""
dtype = dtype or values.dtype
vtype = dtype.type
if is_sparse(dtype):
# Need this first(ish) so that Sparse[datetime] is sparse
cls = ExtensionBlock
elif is_categorical_dtype(values.dtype):
cls = CategoricalBlock
elif issubclass(vtype, np.datetime64):
assert not is_datetime64tz_dtype(values.dtype)
cls = DatetimeBlock
elif is_datetime64tz_dtype(values.dtype):
cls = DatetimeTZBlock
elif is_interval_dtype(dtype) or is_period_dtype(dtype):
cls = ObjectValuesExtensionBlock
elif is_extension_array_dtype(values.dtype):
cls = ExtensionBlock
elif issubclass(vtype, np.floating):
cls = FloatBlock
elif issubclass(vtype, np.timedelta64):
assert issubclass(vtype, np.integer)
cls = TimeDeltaBlock
elif issubclass(vtype, np.complexfloating):
cls = ComplexBlock
elif issubclass(vtype, np.integer):
cls = IntBlock
elif dtype == np.bool_:
cls = BoolBlock
else:
cls = ObjectBlock
return cls
def make_block(values, placement, klass=None, ndim=None, dtype=None):
# Ensure that we don't allow PandasArray / PandasDtype in internals.
# For now, blocks should be backed by ndarrays when possible.
if isinstance(values, ABCPandasArray):
values = values.to_numpy()
if ndim and ndim > 1:
# TODO(EA2D): special case not needed with 2D EAs
values = np.atleast_2d(values)
if isinstance(dtype, PandasDtype):
dtype = dtype.numpy_dtype
if klass is None:
dtype = dtype or values.dtype
klass = get_block_type(values, dtype)
elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values.dtype):
# TODO: This is no longer hit internally; does it need to be retained
# for e.g. pyarrow?
values = DatetimeArray._simple_new(values, dtype=dtype)
return klass(values, ndim=ndim, placement=placement)
# -----------------------------------------------------------------
def _extend_blocks(result, blocks=None):
""" return a new extended blocks, given the result """
if blocks is None:
blocks = []
if isinstance(result, list):
for r in result:
if isinstance(r, list):
blocks.extend(r)
else:
blocks.append(r)
else:
assert isinstance(result, Block), type(result)
blocks.append(result)
return blocks
def _block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike:
""" guarantee the shape of the values to be at least 1 d """
if values.ndim < ndim:
shape = values.shape
if not is_extension_array_dtype(values.dtype):
# TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023
# block.shape is incorrect for "2D" ExtensionArrays
# We can't, and don't need to, reshape.
values = values.reshape(tuple((1,) + shape)) # type: ignore
return values
def _safe_reshape(arr, new_shape):
"""
If possible, reshape `arr` to have shape `new_shape`,
with a couple of exceptions (see gh-13012):
1) If `arr` is a ExtensionArray or Index, `arr` will be
returned as is.
2) If `arr` is a Series, the `_values` attribute will
be reshaped and returned.
Parameters
----------
arr : array-like, object to be reshaped
new_shape : int or tuple of ints, the new shape
"""
if isinstance(arr, ABCSeries):
arr = arr._values
if not is_extension_array_dtype(arr.dtype):
# Note: this will include TimedeltaArray and tz-naive DatetimeArray
# TODO(EA2D): special case will be unnecessary with 2D EAs
arr = np.asarray(arr).reshape(new_shape)
return arr
def _putmask_smart(v: np.ndarray, mask: np.ndarray, n) -> np.ndarray:
"""
Return a new ndarray, try to preserve dtype if possible.
Parameters
----------
v : np.ndarray
`values`, updated in-place.
mask : np.ndarray[bool]
Applies to both sides (array like).
n : `new values` either scalar or an array like aligned with `values`
Returns
-------
values : ndarray with updated values
this *may* be a copy of the original
See Also
--------
ndarray.putmask
"""
# we cannot use np.asarray() here as we cannot have conversions
# that numpy does when numeric are mixed with strings
# n should be the length of the mask or a scalar here
if not is_list_like(n):
n = np.repeat(n, len(mask))
# see if we are only masking values that if putted
# will work in the current dtype
try:
nn = n[mask]
except TypeError:
# TypeError: only integer scalar arrays can be converted to a scalar index
pass
else:
# make sure that we have a nullable type
# if we have nulls
if not _isna_compat(v, nn[0]):
pass
elif not (is_float_dtype(nn.dtype) or is_integer_dtype(nn.dtype)):
# only compare integers/floats
pass
elif not (is_float_dtype(v.dtype) or is_integer_dtype(v.dtype)):
# only compare integers/floats
pass
else:
# we ignore ComplexWarning here
with warnings.catch_warnings(record=True):
warnings.simplefilter("ignore", np.ComplexWarning)
nn_at = nn.astype(v.dtype)
comp = nn == nn_at
if is_list_like(comp) and comp.all():
nv = v.copy()
nv[mask] = nn_at
return nv
n = np.asarray(n)
def _putmask_preserve(nv, n):
try:
nv[mask] = n[mask]
except (IndexError, ValueError):
nv[mask] = n
return nv
# preserves dtype if possible
if v.dtype.kind == n.dtype.kind:
return _putmask_preserve(v, n)
# change the dtype if needed
dtype, _ = maybe_promote(n.dtype)
v = v.astype(dtype)
return _putmask_preserve(v, n)
def _extract_bool_array(mask: ArrayLike) -> np.ndarray:
"""
If we have a SparseArray or BooleanArray, convert it to ndarray[bool].
"""
if isinstance(mask, ExtensionArray):
# We could have BooleanArray, Sparse[bool], ...
mask = np.asarray(mask, dtype=np.bool_)
assert isinstance(mask, np.ndarray), type(mask)
assert mask.dtype == bool, mask.dtype
return mask