Source code for zarr.core.dtype.npy.string

from __future__ import annotations

import re
from dataclasses import dataclass
from typing import (
    TYPE_CHECKING,
    ClassVar,
    Literal,
    Protocol,
    Self,
    TypedDict,
    TypeGuard,
    overload,
    runtime_checkable,
)

import numpy as np

from zarr.core.common import NamedConfig
from zarr.core.dtype.common import (
    DataTypeValidationError,
    DTypeConfig_V2,
    DTypeJSON,
    HasEndianness,
    HasItemSize,
    HasLength,
    HasObjectCodec,
    check_dtype_spec_v2,
    v3_unstable_dtype_warning,
)
from zarr.core.dtype.npy.common import (
    check_json_str,
    endianness_to_numpy_str,
    get_endianness_from_numpy_dtype,
)
from zarr.core.dtype.wrapper import TDType_co, ZDType

if TYPE_CHECKING:
    from zarr.core.common import JSON, ZarrFormat
    from zarr.core.dtype.wrapper import TBaseDType

_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType")


@runtime_checkable
class SupportsStr(Protocol):
    def __str__(self) -> str: ...


class LengthBytesConfig(TypedDict):
    """
    Configuration for a fixed-length string data type in Zarr V3.

    Attributes
    ----------
    length_bytes : int
        The length in bytes of the data associated with this configuration.
    """

    length_bytes: int


[docs] class FixedLengthUTF32JSON_V2(DTypeConfig_V2[str, None]): """ A wrapper around the JSON representation of the ``FixedLengthUTF32`` data type in Zarr V2. The ``name`` field of this class contains the value that would appear under the ``dtype`` field in Zarr V2 array metadata. References ---------- The structure of the ``name`` field is defined in the Zarr V2 `specification document <https://github.com/zarr-developers/zarr-specs/blob/main/docs/v2/v2.0.rst#data-type-encoding>`__. Examples -------- .. code-block:: python { "name": "<U12", "object_codec_id": None } """
[docs] class FixedLengthUTF32JSON_V3(NamedConfig[Literal["fixed_length_utf32"], LengthBytesConfig]): """ The JSON representation of the ``FixedLengthUTF32`` data type in Zarr V3. References ---------- This representation is not currently defined in an external specification. Examples -------- .. code-block:: python { "name": "fixed_length_utf32", "configuration": { "length_bytes": 12 } """
[docs] @dataclass(frozen=True, kw_only=True) class FixedLengthUTF32( ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize ): """ A Zarr data type for arrays containing fixed-length UTF-32 strings. Wraps the ``np.dtypes.StrDType`` data type. Scalars for this data type are instances of ``np.str_``. Attributes ---------- dtype_cls : Type[np.dtypes.StrDType] The NumPy dtype class for this data type. _zarr_v3_name : ClassVar[Literal["fixed_length_utf32"]] The name of this data type in Zarr V3. code_point_bytes : ClassVar[int] = 4 The number of bytes per code point in UTF-32, which is 4. """ dtype_cls = np.dtypes.StrDType _zarr_v3_name: ClassVar[Literal["fixed_length_utf32"]] = "fixed_length_utf32" code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point def __post_init__(self) -> None: """ We don't allow instances of this class with length less than 1 because there is no way such a data type can contain actual data. """ if self.length < 1: raise ValueError(f"length must be >= 1, got {self.length}.")
[docs] @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ Create a FixedLengthUTF32 from a NumPy data type. Parameters ---------- dtype : TBaseDType The NumPy data type. Returns ------- Self An instance of this data type. """ if cls._check_native_dtype(dtype): endianness = get_endianness_from_numpy_dtype(dtype) return cls( length=dtype.itemsize // (cls.code_point_bytes), endianness=endianness, ) raise DataTypeValidationError( f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" )
[docs] def to_native_dtype(self) -> np.dtypes.StrDType[int]: """ Convert the FixedLengthUTF32 instance to a NumPy data type. Returns ------- np.dtypes.StrDType[int] The NumPy data type. """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls(self.length).newbyteorder(byte_order)
@classmethod def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V2]: """ Check that the input is a valid JSON representation of a NumPy U dtype. Parameters ---------- data : DTypeJSON The JSON data. Returns ------- TypeGuard[FixedLengthUTF32JSON_V2] Whether the input is a valid JSON representation of a NumPy U dtype. """ return ( check_dtype_spec_v2(data) and isinstance(data["name"], str) and re.match(r"^[><]U\d+$", data["name"]) is not None and data["object_codec_id"] is None ) @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V3]: """ Check that the input is a valid JSON representation of this class in Zarr V3. Parameters ---------- data : DTypeJSON The JSON data. Returns ------- TypeGuard[FixedLengthUTF32JSONV3] Whether the input is a valid JSON representation of a NumPy U dtype. """ return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} and data["name"] == cls._zarr_v3_name and "configuration" in data and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"length_bytes"} and isinstance(data["configuration"]["length_bytes"], int) ) @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... @overload def to_json(self, zarr_format: Literal[3]) -> FixedLengthUTF32JSON_V3: ...
[docs] def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[str, None] | FixedLengthUTF32JSON_V3: """ Convert the FixedLengthUTF32 instance to a JSON representation. Parameters ---------- zarr_format : ZarrFormat The Zarr format to use. Returns ------- DTypeConfig_V2[str, None] | FixedLengthUTF32JSON_V3 The JSON representation of the data type. """ if zarr_format == 2: return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: v3_unstable_dtype_warning(self) return { "name": self._zarr_v3_name, "configuration": {"length_bytes": self.length * self.code_point_bytes}, } raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover
@classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: """ Create a FixedLengthUTF32 from a JSON representation of a NumPy U dtype. Parameters ---------- data : DTypeJSON The JSON data. Returns ------- Self An instance of this data type. """ if cls._check_json_v2(data): # Construct the NumPy dtype instead of string parsing. name = data["name"] return cls.from_native_dtype(np.dtype(name)) raise DataTypeValidationError( f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string representation of a NumPy U dtype." ) @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: """ Create a FixedLengthUTF32 from a JSON representation of a NumPy U dtype. Parameters ---------- data : DTypeJSON The JSON data. Returns ------- Self An instance of this data type. """ if cls._check_json_v3(data): return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." raise DataTypeValidationError(msg)
[docs] def default_scalar(self) -> np.str_: """ Return the default scalar value for this data type. Returns ------- ``np.str_`` The default scalar value. """ return np.str_("")
[docs] def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: """ Convert the scalar value to a JSON representation. Parameters ---------- data : object The scalar value. zarr_format : ZarrFormat The Zarr format to use. Returns ------- str The JSON representation of the scalar value. """ return str(data)
[docs] def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: """ Convert the JSON representation of a scalar value to the native scalar value. Parameters ---------- data : JSON The JSON data. zarr_format : ZarrFormat The Zarr format to use. Returns ------- ``np.str_`` The native scalar value. """ if check_json_str(data): return self.to_native_dtype().type(data) raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover
def _check_scalar(self, data: object) -> TypeGuard[SupportsStr]: """ Check that the input is a valid scalar value for this data type. Parameters ---------- data : object The scalar value. Returns ------- TypeGuard[SupportsStr] Whether the input is a valid scalar value for this data type. """ # this is generous for backwards compatibility return isinstance(data, SupportsStr)
[docs] def cast_scalar(self, data: object) -> np.str_: """ Cast the scalar value to the native scalar value. Parameters ---------- data : object The scalar value. Returns ------- ``np.str_`` The native scalar value. """ if self._check_scalar(data): # We explicitly truncate before casting because of the following NumPy behavior: # >>> x = np.dtype('U3').type('hello world') # >>> x # np.str_('hello world') # >>> x.dtype # dtype('U11') return self.to_native_dtype().type(str(data)[: self.length]) msg = ( # pragma: no cover f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " f"data type {self}." ) raise TypeError(msg) # pragma: no-cover
@property def item_size(self) -> int: """ The size of a single scalar in bytes. Returns ------- int The size of a single scalar in bytes. """ return self.length * self.code_point_bytes
def check_vlen_string_json_scalar(data: object) -> TypeGuard[int | str | float]: """ Check if the input is a valid JSON scalar for a variable-length string. This function is generous for backwards compatibility, as Zarr Python v2 would use ints for variable-length string fill values. Parameters ---------- data : object The JSON value to check. Returns ------- TypeGuard[int | str | float] True if the input is a valid scalar for a variable-length string. """ return isinstance(data, int | str | float)
[docs] class VariableLengthUTF8JSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]): """ A wrapper around the JSON representation of the ``VariableLengthUTF8`` data type in Zarr V2. The ``name`` field of this class contains the value that would appear under the ``dtype`` field in Zarr V2 array metadata. The ``object_codec_id`` field is always ``"vlen-utf8"``. References ---------- The structure of the ``name`` field is defined in the Zarr V2 `specification document <https://github.com/zarr-developers/zarr-specs/blob/main/docs/v2/v2.0.rst#data-type-encoding>`__. Examples -------- .. code-block:: python { "name": "|O", "object_codec_id": "vlen-utf8" } """
# VariableLengthUTF8 is defined in two places, conditioned on the version of NumPy. # If NumPy 2 is installed, then VariableLengthUTF8 is defined with the NumPy variable length # string dtype as the native dtype. Otherwise, VariableLengthUTF8 is defined with the NumPy object # dtype as the native dtype. class UTF8Base(ZDType[TDType_co, str], HasObjectCodec): """ A base class for variable-length UTF-8 string data types. Not intended for direct use, but as a base for concrete implementations. Attributes ---------- object_codec_id : ClassVar[Literal["vlen-utf8"]] The object codec ID for this data type. References ---------- This data type does not have a Zarr V3 specification. The Zarr V2 data type specification can be found `here <https://github.com/zarr-developers/zarr-specs/blob/main/docs/v2/v2.0.rst#data-type-encoding>`__. """ _zarr_v3_name: ClassVar[Literal["string"]] = "string" object_codec_id: ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ Create an instance of this data type from a compatible NumPy data type. Parameters ---------- dtype : TBaseDType The native data type. Returns ------- Self An instance of this data type. Raises ------ DataTypeValidationError If the input is not compatible with this data type. """ if cls._check_native_dtype(dtype): return cls() raise DataTypeValidationError( f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) @classmethod def _check_json_v2( cls, data: DTypeJSON, ) -> TypeGuard[VariableLengthUTF8JSON_V2]: """ "Check if the input is a valid JSON representation of a variable-length UTF-8 string dtype for Zarr v2." Parameters ---------- data : DTypeJSON The JSON data to check. Returns ------- ``TypeGuard[VariableLengthUTF8JSON_V2]`` Whether the input is a valid JSON representation of a NumPy "object" data type, and that the object codec id is appropriate for variable-length UTF-8 strings. """ return ( check_dtype_spec_v2(data) and data["name"] == "|O" and data["object_codec_id"] == cls.object_codec_id ) @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_utf8"]]: """ Check that the input is a valid JSON representation of this class in Zarr V3. Parameters ---------- data : DTypeJSON The JSON data to check. Returns ------- TypeGuard[Literal["variable_length_utf8"]] Whether the input is a valid JSON representation of a variable length UTF-8 string data type. """ return data == cls._zarr_v3_name @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: """ Create an instance of this class from a JSON representation of a NumPy "object" dtype. Parameters ---------- data : DTypeJSON The JSON data to create an instance from. Returns ------- Self An instance of this data type. """ if cls._check_json_v2(data): return cls() msg = ( f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string '|O'" ) raise DataTypeValidationError(msg) @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: """ Create an instance of this class from a JSON representation of a variable length UTF-8 string data type. Parameters ---------- data : DTypeJSON The JSON data to create an instance from. Returns ------- Self An instance of this data type. """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." raise DataTypeValidationError(msg) @overload def to_json(self, zarr_format: Literal[2]) -> VariableLengthUTF8JSON_V2: ... @overload def to_json(self, zarr_format: Literal[3]) -> Literal["string"]: ... def to_json(self, zarr_format: ZarrFormat) -> VariableLengthUTF8JSON_V2 | Literal["string"]: """ Convert this data type to a JSON representation. Parameters ---------- zarr_format : int The zarr format to use for the JSON representation. Returns ------- ``VariableLengthUTF8JSON_V2 | Literal["string"]`` The JSON representation of this data type. """ if zarr_format == 2: return {"name": "|O", "object_codec_id": self.object_codec_id} elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_scalar(self) -> str: """ Return the default scalar value for this data type. Returns ------- str The default scalar value. """ return "" def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: """ Convert a scalar value to a JSON representation. Parameters ---------- data : object The scalar value to convert. zarr_format : int The zarr format to use for the JSON representation. Returns ------- str The JSON representation of the scalar value. """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) raise TypeError(f"Invalid type: {data}. Expected a string.") def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: """ Convert a JSON representation of a scalar value to the native scalar type. Parameters ---------- data : JSON The JSON representation of the scalar value. zarr_format : int The zarr format to use for the JSON representation. Returns ------- str The native scalar type of the scalar value. """ if not check_vlen_string_json_scalar(data): raise TypeError(f"Invalid type: {data}. Expected a string or number.") return str(data) def _check_scalar(self, data: object) -> TypeGuard[SupportsStr]: """ Check that the input is a valid scalar value for this data type. Parameters ---------- data : object The scalar value to check. Returns ------- TypeGuard[SupportsStr] Whether the input is a valid scalar value for this data type. """ return isinstance(data, SupportsStr) def _cast_scalar_unchecked(self, data: SupportsStr) -> str: """ Cast a scalar value to a string. Parameters ---------- data : object The scalar value to cast. Returns ------- str The string representation of the scalar value. """ return str(data) def cast_scalar(self, data: object) -> str: """ Cast an object to a string. Parameters ---------- data : object The value to cast. Returns ------- str The input cast to str. """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) msg = ( # pragma: no cover f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " f"data type {self}." ) raise TypeError(msg) # pragma: no cover if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) class VariableLengthUTF8(UTF8Base[np.dtypes.StringDType]): # type: ignore[type-var] """ A Zarr data type for arrays containing variable-length UTF-8 strings. Wraps the ``np.dtypes.StringDType`` data type. Scalars for this data type are instances of ``str``. Attributes ---------- dtype_cls : Type[np.dtypes.StringDType] The NumPy dtype class for this data type. _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" The name of this data type in Zarr V3. object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" The object codec ID for this data type. """ dtype_cls = np.dtypes.StringDType def to_native_dtype(self) -> np.dtypes.StringDType: """ Create a NumPy string dtype from this VariableLengthUTF8 ZDType. Returns ------- np.dtypes.StringDType The NumPy string dtype. """ return self.dtype_cls() else: # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead.
[docs] @dataclass(frozen=True, kw_only=True) class VariableLengthUTF8(UTF8Base[np.dtypes.ObjectDType]): # type: ignore[no-redef] """ A Zarr data type for arrays containing variable-length UTF-8 strings. Wraps the ``np.dtypes.ObjectDType`` data type. Scalars for this data type are instances of ``str``. Attributes ---------- dtype_cls : Type[np.dtypes.ObjectDType] The NumPy dtype class for this data type. _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" The name of this data type in Zarr V3. object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" The object codec ID for this data type. """ dtype_cls = np.dtypes.ObjectDType
[docs] def to_native_dtype(self) -> np.dtypes.ObjectDType: """ Create a NumPy object dtype from this VariableLengthUTF8 ZDType. Returns ------- np.dtypes.ObjectDType The NumPy object dtype. """ return self.dtype_cls()