From 9b36337d2bd6e3284b9aa6c8e1d99e3a625057ff Mon Sep 17 00:00:00 2001 From: Sam Verhasselt Date: Mon, 22 Dec 2025 00:32:57 -0800 Subject: [PATCH 1/4] feat: Add Iceberg v3 geospatial primitive types (geometry and geography) Implement support for Iceberg v3 geospatial types as specified in the Iceberg specification: - Add GeometryType(crs) and GeographyType(crs, algorithm) to types.py - Default CRS is "OGC:CRS84", default algorithm is "spherical" - Types require format version 3 (minimum_format_version() returns 3) - Values are stored as WKB (Well-Known Binary) bytes at runtime - Avro schema conversion maps to "bytes" - PyArrow conversion maps to large_binary() - Add type string parsing for geometry('CRS') and geography('CRS', 'algo') - Add visitor pattern support in schema.py and resolver.py Note: JSON single-value encoding (WKB<->WKT) raises NotImplementedError as it requires external libraries (e.g., Shapely) which are not included to avoid heavy dependencies. --- pyiceberg/avro/resolver.py | 26 ++++ pyiceberg/conversions.py | 92 ++++++++++++ pyiceberg/io/pyarrow.py | 28 ++++ pyiceberg/schema.py | 26 ++++ pyiceberg/types.py | 217 +++++++++++++++++++++++++++ pyiceberg/utils/schema_conversion.py | 10 ++ tests/test_types.py | 192 ++++++++++++++++++++++++ 7 files changed, 591 insertions(+) diff --git a/pyiceberg/avro/resolver.py b/pyiceberg/avro/resolver.py index 65f06ca8b1..81b573aa79 100644 --- a/pyiceberg/avro/resolver.py +++ b/pyiceberg/avro/resolver.py @@ -87,6 +87,8 @@ DoubleType, FixedType, FloatType, + GeographyType, + GeometryType, IcebergType, IntegerType, ListType, @@ -204,6 +206,14 @@ def visit_binary(self, binary_type: BinaryType) -> Writer: def visit_unknown(self, unknown_type: UnknownType) -> Writer: return UnknownWriter() + def visit_geometry(self, geometry_type: "GeometryType") -> Writer: + """Geometry is written as WKB bytes in Avro.""" + return BinaryWriter() + + def visit_geography(self, geography_type: "GeographyType") -> Writer: + """Geography 
is written as WKB bytes in Avro.""" + return BinaryWriter() + CONSTRUCT_WRITER_VISITOR = ConstructWriter() @@ -359,6 +369,14 @@ def visit_binary(self, binary_type: BinaryType, partner: IcebergType | None) -> def visit_unknown(self, unknown_type: UnknownType, partner: IcebergType | None) -> Writer: return UnknownWriter() + def visit_geometry(self, geometry_type: "GeometryType", partner: IcebergType | None) -> Writer: + """Geometry is written as WKB bytes in Avro.""" + return BinaryWriter() + + def visit_geography(self, geography_type: "GeographyType", partner: IcebergType | None) -> Writer: + """Geography is written as WKB bytes in Avro.""" + return BinaryWriter() + class ReadSchemaResolver(PrimitiveWithPartnerVisitor[IcebergType, Reader]): __slots__ = ("read_types", "read_enums", "context") @@ -498,6 +516,14 @@ def visit_binary(self, binary_type: BinaryType, partner: IcebergType | None) -> def visit_unknown(self, unknown_type: UnknownType, partner: IcebergType | None) -> Reader: return UnknownReader() + def visit_geometry(self, geometry_type: "GeometryType", partner: IcebergType | None) -> Reader: + """Geometry is read as WKB bytes from Avro.""" + return BinaryReader() + + def visit_geography(self, geography_type: "GeographyType", partner: IcebergType | None) -> Reader: + """Geography is read as WKB bytes from Avro.""" + return BinaryReader() + class SchemaPartnerAccessor(PartnerAccessor[IcebergType]): def schema_partner(self, partner: IcebergType | None) -> IcebergType | None: diff --git a/pyiceberg/conversions.py b/pyiceberg/conversions.py index 8739a1ab08..43f8e1a469 100644 --- a/pyiceberg/conversions.py +++ b/pyiceberg/conversions.py @@ -49,6 +49,8 @@ DoubleType, FixedType, FloatType, + GeographyType, + GeometryType, IntegerType, LongType, PrimitiveType, @@ -182,6 +184,18 @@ def _(type_: UnknownType, _: str) -> None: return None +@partition_to_py.register(GeometryType) +@partition_to_py.register(GeographyType) +@handle_none +def _(_: PrimitiveType, value_str: 
str) -> bytes: + """Convert a geometry/geography partition string to bytes. + + Note: Partition values for geometry/geography types are expected to be + hex-encoded WKB (Well-Known Binary) strings. + """ + return bytes.fromhex(value_str) + + @singledispatch def to_bytes( primitive_type: PrimitiveType, _: bool | bytes | Decimal | date | datetime | float | int | str | time | uuid.UUID @@ -273,6 +287,8 @@ def _(_: UUIDType, value: uuid.UUID | bytes) -> bytes: @to_bytes.register(BinaryType) @to_bytes.register(FixedType) +@to_bytes.register(GeometryType) +@to_bytes.register(GeographyType) def _(_: PrimitiveType, value: bytes) -> bytes: return value @@ -354,6 +370,8 @@ def _(_: StringType, b: bytes) -> str: @from_bytes.register(BinaryType) @from_bytes.register(FixedType) @from_bytes.register(UUIDType) +@from_bytes.register(GeometryType) +@from_bytes.register(GeographyType) def _(_: PrimitiveType, b: bytes) -> bytes: return b @@ -474,6 +492,40 @@ def _(_: UUIDType, val: uuid.UUID) -> str: return str(val) +@to_json.register(GeometryType) +def _(_: GeometryType, val: bytes) -> str: + """Serialize geometry to WKT string per Iceberg spec. + + Note: This requires WKB to WKT conversion which is not yet implemented. + The Iceberg spec requires geometry values to be serialized as WKT strings + in JSON, but PyIceberg stores geometry as WKB bytes at runtime. + + Raises: + NotImplementedError: WKB to WKT conversion is not yet supported. + """ + raise NotImplementedError( + "Geometry JSON serialization requires WKB to WKT conversion, which is not yet implemented. " + "See https://iceberg.apache.org/spec/#json-single-value-serialization for spec details." + ) + + +@to_json.register(GeographyType) +def _(_: GeographyType, val: bytes) -> str: + """Serialize geography to WKT string per Iceberg spec. + + Note: This requires WKB to WKT conversion which is not yet implemented. 
+ The Iceberg spec requires geography values to be serialized as WKT strings + in JSON, but PyIceberg stores geography as WKB bytes at runtime. + + Raises: + NotImplementedError: WKB to WKT conversion is not yet supported. + """ + raise NotImplementedError( + "Geography JSON serialization requires WKB to WKT conversion, which is not yet implemented. " + "See https://iceberg.apache.org/spec/#json-single-value-serialization for spec details." + ) + + @singledispatch # type: ignore def from_json(primitive_type: PrimitiveType, val: Any) -> L: # type: ignore """Convert JSON value types into built-in python values. @@ -593,3 +645,43 @@ def _(_: UUIDType, val: str | bytes | uuid.UUID) -> uuid.UUID: return uuid.UUID(bytes=val) else: return val + + +@from_json.register(GeometryType) +def _(_: GeometryType, val: str | bytes) -> bytes: + """Convert JSON WKT string into WKB bytes per Iceberg spec. + + Note: This requires WKT to WKB conversion which is not yet implemented. + The Iceberg spec requires geometry values to be represented as WKT strings + in JSON, but PyIceberg stores geometry as WKB bytes at runtime. + + Raises: + NotImplementedError: WKT to WKB conversion is not yet supported. + """ + if isinstance(val, bytes): + # Already WKB bytes, return as-is + return val + raise NotImplementedError( + "Geometry JSON deserialization requires WKT to WKB conversion, which is not yet implemented. " + "See https://iceberg.apache.org/spec/#json-single-value-serialization for spec details." + ) + + +@from_json.register(GeographyType) +def _(_: GeographyType, val: str | bytes) -> bytes: + """Convert JSON WKT string into WKB bytes per Iceberg spec. + + Note: This requires WKT to WKB conversion which is not yet implemented. + The Iceberg spec requires geography values to be represented as WKT strings + in JSON, but PyIceberg stores geography as WKB bytes at runtime. + + Raises: + NotImplementedError: WKT to WKB conversion is not yet supported. 
+ """ + if isinstance(val, bytes): + # Already WKB bytes, return as-is + return val + raise NotImplementedError( + "Geography JSON deserialization requires WKT to WKB conversion, which is not yet implemented. " + "See https://iceberg.apache.org/spec/#json-single-value-serialization for spec details." + ) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index efeb72cbd4..fab019138e 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -157,6 +157,8 @@ DoubleType, FixedType, FloatType, + GeographyType, + GeometryType, IcebergType, IntegerType, ListType, @@ -798,6 +800,26 @@ def visit_unknown(self, _: UnknownType) -> pa.DataType: def visit_binary(self, _: BinaryType) -> pa.DataType: return pa.large_binary() + def visit_geometry(self, geometry_type: GeometryType) -> pa.DataType: + """Convert geometry type to PyArrow binary. + + Note: PyArrow 21.0.0+ supports native GEOMETRY logical type from Arrow GEO. + For now, we use large_binary which stores WKB bytes. + Future enhancement: detect PyArrow version and use pa.geometry() when available. + """ + # TODO: When PyArrow 21.0.0+ is available, use pa.geometry() with CRS metadata + return pa.large_binary() + + def visit_geography(self, geography_type: GeographyType) -> pa.DataType: + """Convert geography type to PyArrow binary. + + Note: PyArrow 21.0.0+ supports native GEOGRAPHY logical type from Arrow GEO. + For now, we use large_binary which stores WKB bytes. + Future enhancement: detect PyArrow version and use pa.geography() when available. 
+ """ + # TODO: When PyArrow 21.0.0+ is available, use pa.geography() with CRS and algorithm metadata + return pa.large_binary() + def _convert_scalar(value: Any, iceberg_type: IcebergType) -> pa.scalar: if not isinstance(iceberg_type, PrimitiveType): @@ -2111,6 +2133,12 @@ def visit_binary(self, binary_type: BinaryType) -> str: def visit_unknown(self, unknown_type: UnknownType) -> str: return "UNKNOWN" + def visit_geometry(self, geometry_type: GeometryType) -> str: + return "BYTE_ARRAY" + + def visit_geography(self, geography_type: GeographyType) -> str: + return "BYTE_ARRAY" + _PRIMITIVE_TO_PHYSICAL_TYPE_VISITOR = PrimitiveToPhysicalType() diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index 5896e7e1eb..22941a987a 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -43,6 +43,8 @@ DoubleType, FixedType, FloatType, + GeographyType, + GeometryType, IcebergType, IntegerType, ListType, @@ -553,6 +555,10 @@ def primitive(self, primitive: PrimitiveType, primitive_partner: P | None) -> T: return self.visit_binary(primitive, primitive_partner) elif isinstance(primitive, UnknownType): return self.visit_unknown(primitive, primitive_partner) + elif isinstance(primitive, GeometryType): + return self.visit_geometry(primitive, primitive_partner) + elif isinstance(primitive, GeographyType): + return self.visit_geography(primitive, primitive_partner) else: raise ValueError(f"Type not recognized: {primitive}") @@ -624,6 +630,14 @@ def visit_binary(self, binary_type: BinaryType, partner: P | None) -> T: def visit_unknown(self, unknown_type: UnknownType, partner: P | None) -> T: """Visit a UnknownType.""" + @abstractmethod + def visit_geometry(self, geometry_type: GeometryType, partner: P | None) -> T: + """Visit a GeometryType.""" + + @abstractmethod + def visit_geography(self, geography_type: GeographyType, partner: P | None) -> T: + """Visit a GeographyType.""" + class PartnerAccessor(Generic[P], ABC): @abstractmethod @@ -747,6 +761,10 @@ def primitive(self, 
primitive: PrimitiveType) -> T: return self.visit_binary(primitive) elif isinstance(primitive, UnknownType): return self.visit_unknown(primitive) + elif isinstance(primitive, GeometryType): + return self.visit_geometry(primitive) + elif isinstance(primitive, GeographyType): + return self.visit_geography(primitive) else: raise ValueError(f"Type not recognized: {primitive}") @@ -818,6 +836,14 @@ def visit_binary(self, binary_type: BinaryType) -> T: def visit_unknown(self, unknown_type: UnknownType) -> T: """Visit a UnknownType.""" + @abstractmethod + def visit_geometry(self, geometry_type: GeometryType) -> T: + """Visit a GeometryType.""" + + @abstractmethod + def visit_geography(self, geography_type: GeographyType) -> T: + """Visit a GeographyType.""" + @dataclass(init=True, eq=True, frozen=True) class Accessor: diff --git a/pyiceberg/types.py b/pyiceberg/types.py index 742da00f57..29f28fd2d4 100644 --- a/pyiceberg/types.py +++ b/pyiceberg/types.py @@ -61,6 +61,17 @@ FIXED = "fixed" FIXED_PARSER = ParseNumberFromBrackets(FIXED) +# Default CRS for geometry and geography types per Iceberg v3 spec +DEFAULT_GEOMETRY_CRS = "OGC:CRS84" +DEFAULT_GEOGRAPHY_CRS = "OGC:CRS84" +DEFAULT_GEOGRAPHY_ALGORITHM = "spherical" + +# Regex patterns for parsing geometry and geography type strings +# Matches: geometry, geometry('CRS'), geometry("CRS") +GEOMETRY_REGEX = re.compile(r"geometry(?:\(\s*['\"]([^'\"]+)['\"]\s*\))?$") +# Matches: geography, geography('CRS'), geography('CRS', 'algo') +GEOGRAPHY_REGEX = re.compile(r"geography(?:\(\s*['\"]([^'\"]+)['\"](?:\s*,\s*['\"]([^'\"]+)['\"])?\s*\))?$") + def transform_dict_value_to_str(dict: dict[str, Any]) -> dict[str, str]: """Transform all values in the dictionary to string. Raise an error if any value is None.""" @@ -92,6 +103,53 @@ def _parse_fixed_type(fixed: Any) -> int: return fixed +def _parse_geometry_type(geometry: Any) -> str: + """Parse geometry type string and return CRS. 
+ + Args: + geometry: The geometry type specification (string or dict). + + Returns: + The CRS string (defaults to DEFAULT_GEOMETRY_CRS if not specified). + """ + if isinstance(geometry, str): + match = GEOMETRY_REGEX.match(geometry) + if match: + crs = match.group(1) + return crs if crs else DEFAULT_GEOMETRY_CRS + else: + raise ValidationError(f"Could not parse {geometry} into a GeometryType") + elif isinstance(geometry, dict): + return geometry.get("crs", DEFAULT_GEOMETRY_CRS) + else: + return geometry + + +def _parse_geography_type(geography: Any) -> tuple[str, str]: + """Parse geography type string and return (CRS, algorithm). + + Args: + geography: The geography type specification (string or dict). + + Returns: + Tuple of (CRS, algorithm) with defaults applied where not specified. + """ + if isinstance(geography, str): + match = GEOGRAPHY_REGEX.match(geography) + if match: + crs = match.group(1) if match.group(1) else DEFAULT_GEOGRAPHY_CRS + algorithm = match.group(2) if match.group(2) else DEFAULT_GEOGRAPHY_ALGORITHM + return crs, algorithm + else: + raise ValidationError(f"Could not parse {geography} into a GeographyType") + elif isinstance(geography, dict): + crs = geography.get("crs", DEFAULT_GEOGRAPHY_CRS) + algorithm = geography.get("algorithm", DEFAULT_GEOGRAPHY_ALGORITHM) + return crs, algorithm + else: + return geography + + def strtobool(val: str) -> bool: """Convert a string representation of truth to true (1) or false (0). @@ -124,6 +182,13 @@ def handle_primitive_type(cls, v: Any, handler: ValidatorFunctionWrapHandler) -> # Pydantic works mostly around dicts, and there seems to be something # by not serializing into a RootModel, might revisit this. 
if isinstance(v, str): + # When constructing GeometryType/GeographyType directly with CRS/algorithm values, + # skip the type string parsing to avoid infinite recursion + if cls.__name__ == "GeometryType" and not v.startswith("geometry"): + return handler(v) + if cls.__name__ == "GeographyType" and not v.startswith("geography"): + return handler(v) + if v == "boolean": return BooleanType() elif v == "string": @@ -159,6 +224,12 @@ def handle_primitive_type(cls, v: Any, handler: ValidatorFunctionWrapHandler) -> if v.startswith("decimal"): precision, scale = _parse_decimal_type(v) return DecimalType(precision, scale) + if v.startswith("geometry"): + crs = _parse_geometry_type(v) + return GeometryType(crs) + if v.startswith("geography"): + crs, algorithm = _parse_geography_type(v) + return GeographyType(crs, algorithm) else: raise ValueError(f"Type not recognized: {v}") if isinstance(v, dict) and cls == IcebergType: @@ -879,3 +950,149 @@ class UnknownType(PrimitiveType): def minimum_format_version(self) -> TableVersion: return 3 + + +class GeometryType(PrimitiveType): + """A geometry data type in Iceberg (v3+) for storing spatial geometries. + + Geometries are stored as WKB (Well-Known Binary) at runtime. The CRS (Coordinate Reference System) + parameter specifies the spatial reference system for the geometry values. 
+ + Example: + >>> column_foo = GeometryType() + >>> isinstance(column_foo, GeometryType) + True + >>> column_foo + GeometryType() + >>> column_foo.crs + 'OGC:CRS84' + >>> GeometryType("EPSG:4326") + GeometryType(crs='EPSG:4326') + >>> str(GeometryType("EPSG:4326")) + "geometry('EPSG:4326')" + """ + + root: str = Field(default=DEFAULT_GEOMETRY_CRS) + + def __init__(self, crs: str = DEFAULT_GEOMETRY_CRS) -> None: + super().__init__(root=crs) + + @model_serializer + def ser_model(self) -> str: + """Serialize the model to a string.""" + if self.crs == DEFAULT_GEOMETRY_CRS: + return "geometry" + return f"geometry('{self.crs}')" + + @property + def crs(self) -> str: + """Return the CRS (Coordinate Reference System) of the geometry.""" + return self.root + + def __repr__(self) -> str: + """Return the string representation of the GeometryType class.""" + if self.crs == DEFAULT_GEOMETRY_CRS: + return "GeometryType()" + return f"GeometryType(crs={self.crs!r})" + + def __str__(self) -> str: + """Return the string representation.""" + if self.crs == DEFAULT_GEOMETRY_CRS: + return "geometry" + return f"geometry('{self.crs}')" + + def __hash__(self) -> int: + """Return the hash of the CRS.""" + return hash(self.root) + + def __getnewargs__(self) -> tuple[str]: + """Pickle the GeometryType class.""" + return (self.crs,) + + def __eq__(self, other: Any) -> bool: + """Compare to another object.""" + return self.root == other.root if isinstance(other, GeometryType) else False + + def minimum_format_version(self) -> TableVersion: + """Geometry type requires Iceberg format version 3.""" + return 3 + + +class GeographyType(PrimitiveType): + """A geography data type in Iceberg (v3+) for storing spatial geographies. + + Geographies are stored as WKB (Well-Known Binary) at runtime. The CRS (Coordinate Reference System) + and algorithm parameters specify the spatial reference system and the algorithm used for + geographic calculations. 
+ + Example: + >>> column_foo = GeographyType() + >>> isinstance(column_foo, GeographyType) + True + >>> column_foo + GeographyType() + >>> column_foo.crs + 'OGC:CRS84' + >>> column_foo.algorithm + 'spherical' + >>> GeographyType("EPSG:4326", "planar") + GeographyType(crs='EPSG:4326', algorithm='planar') + >>> str(GeographyType("EPSG:4326", "planar")) + "geography('EPSG:4326', 'planar')" + """ + + root: tuple[str, str] = Field(default=(DEFAULT_GEOGRAPHY_CRS, DEFAULT_GEOGRAPHY_ALGORITHM)) + + def __init__(self, crs: str = DEFAULT_GEOGRAPHY_CRS, algorithm: str = DEFAULT_GEOGRAPHY_ALGORITHM) -> None: + super().__init__(root=(crs, algorithm)) + + @model_serializer + def ser_model(self) -> str: + """Serialize the model to a string.""" + if self.crs == DEFAULT_GEOGRAPHY_CRS and self.algorithm == DEFAULT_GEOGRAPHY_ALGORITHM: + return "geography" + if self.algorithm == DEFAULT_GEOGRAPHY_ALGORITHM: + return f"geography('{self.crs}')" + return f"geography('{self.crs}', '{self.algorithm}')" + + @property + def crs(self) -> str: + """Return the CRS (Coordinate Reference System) of the geography.""" + return self.root[0] + + @property + def algorithm(self) -> str: + """Return the algorithm used for geographic calculations.""" + return self.root[1] + + def __repr__(self) -> str: + """Return the string representation of the GeographyType class.""" + if self.crs == DEFAULT_GEOGRAPHY_CRS and self.algorithm == DEFAULT_GEOGRAPHY_ALGORITHM: + return "GeographyType()" + if self.algorithm == DEFAULT_GEOGRAPHY_ALGORITHM: + return f"GeographyType(crs={self.crs!r})" + return f"GeographyType(crs={self.crs!r}, algorithm={self.algorithm!r})" + + def __str__(self) -> str: + """Return the string representation.""" + if self.crs == DEFAULT_GEOGRAPHY_CRS and self.algorithm == DEFAULT_GEOGRAPHY_ALGORITHM: + return "geography" + if self.algorithm == DEFAULT_GEOGRAPHY_ALGORITHM: + return f"geography('{self.crs}')" + return f"geography('{self.crs}', '{self.algorithm}')" + + def __hash__(self) -> int: 
+ """Return the hash of the tuple.""" + return hash(self.root) + + def __getnewargs__(self) -> tuple[str, str]: + """Pickle the GeographyType class.""" + return self.crs, self.algorithm + + def __eq__(self, other: Any) -> bool: + """Compare to another object.""" + return self.root == other.root if isinstance(other, GeographyType) else False + + def minimum_format_version(self) -> TableVersion: + """Geography type requires Iceberg format version 3.""" + return 3 diff --git a/pyiceberg/utils/schema_conversion.py b/pyiceberg/utils/schema_conversion.py index 66e57d5d9f..f734adc3dd 100644 --- a/pyiceberg/utils/schema_conversion.py +++ b/pyiceberg/utils/schema_conversion.py @@ -37,6 +37,8 @@ DoubleType, FixedType, FloatType, + GeographyType, + GeometryType, IcebergType, IntegerType, ListType, @@ -634,3 +636,11 @@ def visit_binary(self, binary_type: BinaryType) -> AvroType: def visit_unknown(self, unknown_type: UnknownType) -> AvroType: return "null" + + def visit_geometry(self, geometry_type: GeometryType) -> AvroType: + """Convert geometry type to Avro bytes (WKB format per Iceberg spec).""" + return "bytes" + + def visit_geography(self, geography_type: GeographyType) -> AvroType: + """Convert geography type to Avro bytes (WKB format per Iceberg spec).""" + return "bytes" diff --git a/tests/test_types.py b/tests/test_types.py index 707deb160e..ca51254366 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -22,6 +22,9 @@ from pyiceberg.exceptions import ValidationError from pyiceberg.types import ( + DEFAULT_GEOGRAPHY_ALGORITHM, + DEFAULT_GEOGRAPHY_CRS, + DEFAULT_GEOMETRY_CRS, BinaryType, BooleanType, DateType, @@ -29,6 +32,8 @@ DoubleType, FixedType, FloatType, + GeographyType, + GeometryType, IcebergType, IntegerType, ListType, @@ -728,3 +733,190 @@ def test_transform_dict_value_to_str() -> None: input_dict["key6"] = None with pytest.raises(ValueError, match="None type is not a supported value in properties: key6"): transform_dict_value_to_str(input_dict) + + 
+# Geometry Type Tests + + +def test_geometry_type_default() -> None: + """Test GeometryType with default CRS.""" + type_var = GeometryType() + assert type_var.crs == DEFAULT_GEOMETRY_CRS + assert str(type_var) == "geometry" + assert repr(type_var) == "GeometryType()" + assert type_var.minimum_format_version() == 3 + assert type_var.is_primitive + + +def test_geometry_type_custom_crs() -> None: + """Test GeometryType with custom CRS.""" + type_var = GeometryType("EPSG:4326") + assert type_var.crs == "EPSG:4326" + assert str(type_var) == "geometry('EPSG:4326')" + assert repr(type_var) == "GeometryType(crs='EPSG:4326')" + + +def test_geometry_type_equality() -> None: + """Test GeometryType equality and hashing.""" + assert GeometryType() == GeometryType() + assert GeometryType("EPSG:4326") == GeometryType("EPSG:4326") + assert GeometryType() != GeometryType("EPSG:4326") + assert hash(GeometryType()) == hash(GeometryType()) + assert hash(GeometryType("EPSG:4326")) == hash(GeometryType("EPSG:4326")) + + +def test_geometry_type_pickle() -> None: + """Test GeometryType pickle round-trip.""" + assert GeometryType() == pickle.loads(pickle.dumps(GeometryType())) + assert GeometryType("EPSG:4326") == pickle.loads(pickle.dumps(GeometryType("EPSG:4326"))) + + +def test_geometry_type_serialization() -> None: + """Test GeometryType JSON serialization.""" + assert GeometryType().model_dump_json() == '"geometry"' + assert GeometryType("EPSG:4326").model_dump_json() == "\"geometry('EPSG:4326')\"" + + +def test_geometry_type_deserialization() -> None: + """Test GeometryType JSON deserialization.""" + assert GeometryType.model_validate_json('"geometry"') == GeometryType() + assert GeometryType.model_validate_json("\"geometry('EPSG:4326')\"") == GeometryType("EPSG:4326") + + +def test_geometry_type_deserialization_failure() -> None: + """Test GeometryType deserialization with invalid input.""" + with pytest.raises(ValidationError) as exc_info: + 
GeometryType.model_validate_json('"geometry(invalid)"') + assert "Could not parse geometry(invalid) into a GeometryType" in str(exc_info.value) + + +def test_geometry_type_singleton() -> None: + """Test that GeometryType uses singleton pattern for same CRS.""" + assert id(GeometryType()) == id(GeometryType()) + assert id(GeometryType("EPSG:4326")) == id(GeometryType("EPSG:4326")) + assert id(GeometryType()) != id(GeometryType("EPSG:4326")) + + +# Geography Type Tests + + +def test_geography_type_default() -> None: + """Test GeographyType with default CRS and algorithm.""" + type_var = GeographyType() + assert type_var.crs == DEFAULT_GEOGRAPHY_CRS + assert type_var.algorithm == DEFAULT_GEOGRAPHY_ALGORITHM + assert str(type_var) == "geography" + assert repr(type_var) == "GeographyType()" + assert type_var.minimum_format_version() == 3 + assert type_var.is_primitive + + +def test_geography_type_custom_crs() -> None: + """Test GeographyType with custom CRS.""" + type_var = GeographyType("EPSG:4326") + assert type_var.crs == "EPSG:4326" + assert type_var.algorithm == DEFAULT_GEOGRAPHY_ALGORITHM + assert str(type_var) == "geography('EPSG:4326')" + assert repr(type_var) == "GeographyType(crs='EPSG:4326')" + + +def test_geography_type_custom_crs_and_algorithm() -> None: + """Test GeographyType with custom CRS and algorithm.""" + type_var = GeographyType("EPSG:4326", "planar") + assert type_var.crs == "EPSG:4326" + assert type_var.algorithm == "planar" + assert str(type_var) == "geography('EPSG:4326', 'planar')" + assert repr(type_var) == "GeographyType(crs='EPSG:4326', algorithm='planar')" + + +def test_geography_type_equality() -> None: + """Test GeographyType equality and hashing.""" + assert GeographyType() == GeographyType() + assert GeographyType("EPSG:4326") == GeographyType("EPSG:4326") + assert GeographyType("EPSG:4326", "planar") == GeographyType("EPSG:4326", "planar") + assert GeographyType() != GeographyType("EPSG:4326") + assert GeographyType("EPSG:4326") != 
GeographyType("EPSG:4326", "planar") + assert hash(GeographyType()) == hash(GeographyType()) + + +def test_geography_type_pickle() -> None: + """Test GeographyType pickle round-trip.""" + assert GeographyType() == pickle.loads(pickle.dumps(GeographyType())) + assert GeographyType("EPSG:4326", "planar") == pickle.loads(pickle.dumps(GeographyType("EPSG:4326", "planar"))) + + +def test_geography_type_serialization() -> None: + """Test GeographyType JSON serialization.""" + assert GeographyType().model_dump_json() == '"geography"' + assert GeographyType("EPSG:4326").model_dump_json() == "\"geography('EPSG:4326')\"" + assert GeographyType("EPSG:4326", "planar").model_dump_json() == "\"geography('EPSG:4326', 'planar')\"" + + +def test_geography_type_deserialization() -> None: + """Test GeographyType JSON deserialization.""" + assert GeographyType.model_validate_json('"geography"') == GeographyType() + assert GeographyType.model_validate_json("\"geography('EPSG:4326')\"") == GeographyType("EPSG:4326") + assert GeographyType.model_validate_json("\"geography('EPSG:4326', 'planar')\"") == GeographyType("EPSG:4326", "planar") + + +def test_geography_type_deserialization_failure() -> None: + """Test GeographyType deserialization with invalid input.""" + with pytest.raises(ValidationError) as exc_info: + GeographyType.model_validate_json('"geography(invalid)"') + assert "Could not parse geography(invalid) into a GeographyType" in str(exc_info.value) + + +def test_geography_type_singleton() -> None: + """Test that GeographyType uses singleton pattern for same parameters.""" + assert id(GeographyType()) == id(GeographyType()) + assert id(GeographyType("EPSG:4326")) == id(GeographyType("EPSG:4326")) + assert id(GeographyType("EPSG:4326", "planar")) == id(GeographyType("EPSG:4326", "planar")) + assert id(GeographyType()) != id(GeographyType("EPSG:4326")) + + +# NestedField with geometry/geography types + + +def test_nested_field_with_geometry() -> None: + """Test NestedField with 
GeometryType.""" + field = NestedField(1, "location", GeometryType(), required=True) + assert isinstance(field.field_type, GeometryType) + assert field.field_type.crs == DEFAULT_GEOMETRY_CRS + + +def test_nested_field_with_geography() -> None: + """Test NestedField with GeographyType.""" + field = NestedField(1, "location", GeographyType("EPSG:4326", "planar"), required=True) + assert isinstance(field.field_type, GeographyType) + assert field.field_type.crs == "EPSG:4326" + assert field.field_type.algorithm == "planar" + + +def test_nested_field_geometry_as_string() -> None: + """Test NestedField with geometry type specified as string.""" + field = NestedField(1, "location", "geometry", required=True) + assert isinstance(field.field_type, GeometryType) + assert field.field_type.crs == DEFAULT_GEOMETRY_CRS + + +def test_nested_field_geography_as_string() -> None: + """Test NestedField with geography type specified as string.""" + field = NestedField(1, "location", "geography", required=True) + assert isinstance(field.field_type, GeographyType) + assert field.field_type.crs == DEFAULT_GEOGRAPHY_CRS + assert field.field_type.algorithm == DEFAULT_GEOGRAPHY_ALGORITHM + + +def test_nested_field_geometry_with_params_as_string() -> None: + """Test NestedField with parameterized geometry type as string.""" + field = NestedField(1, "location", "geometry('EPSG:4326')", required=True) + assert isinstance(field.field_type, GeometryType) + assert field.field_type.crs == "EPSG:4326" + + +def test_nested_field_geography_with_params_as_string() -> None: + """Test NestedField with parameterized geography type as string.""" + field = NestedField(1, "location", "geography('EPSG:4326', 'planar')", required=True) + assert isinstance(field.field_type, GeographyType) + assert field.field_type.crs == "EPSG:4326" + assert field.field_type.algorithm == "planar" From a7ac0d4f5b2cb116bc2d71b6a9ea898ed2432959 Mon Sep 17 00:00:00 2001 From: Sam Verhasselt Date: Mon, 22 Dec 2025 15:52:23 -0800 
Subject: [PATCH 2/4] Add geoarrow dependency and document current capabilities --- mkdocs/docs/geospatial.md | 110 ++++++++++++++++++++++++++++++++++++++ pyiceberg/io/pyarrow.py | 35 +++++++----- pyproject.toml | 1 + tests/io/test_pyarrow.py | 104 +++++++++++++++++++++++++++++++++++ uv.lock | 91 ++++++++++++++++++++++++++++++- 5 files changed, 328 insertions(+), 13 deletions(-) create mode 100644 mkdocs/docs/geospatial.md diff --git a/mkdocs/docs/geospatial.md b/mkdocs/docs/geospatial.md new file mode 100644 index 0000000000..797143114a --- /dev/null +++ b/mkdocs/docs/geospatial.md @@ -0,0 +1,110 @@ +# Geospatial Types + +PyIceberg supports Iceberg v3 geospatial primitive types: `geometry` and `geography`. + +## Overview + +Iceberg v3 introduces native support for spatial data types: + +- **`geometry(C)`**: Represents geometric shapes in a coordinate reference system (CRS) +- **`geography(C, A)`**: Represents geographic shapes with CRS and calculation algorithm + +Both types store values as WKB (Well-Known Binary) bytes. 
+ +## Requirements + +- Iceberg format version 3 or higher +- `geoarrow-pyarrow` for full GeoArrow extension type support (optional: `pip install pyiceberg[geoarrow]`) + +## Usage + +### Declaring Columns + +```python +from pyiceberg.schema import Schema +from pyiceberg.types import NestedField, GeometryType, GeographyType, IntegerType + +# Schema with geometry and geography columns +schema = Schema( + NestedField(1, "id", IntegerType(), required=True), + NestedField(2, "location", GeometryType(), required=True), + NestedField(3, "boundary", GeographyType(), required=False), +) +``` + +### Type Parameters + +#### GeometryType + +```python +# Default CRS (OGC:CRS84) +GeometryType() + +# Custom CRS +GeometryType("EPSG:4326") +``` + +#### GeographyType + +```python +# Default CRS (OGC:CRS84) and algorithm (spherical) +GeographyType() + +# Custom CRS +GeographyType("EPSG:4326") + +# Custom CRS and algorithm +GeographyType("EPSG:4326", "planar") +``` + +### String Type Syntax + +Types can also be specified as strings in schema definitions: + +```python +# Using string type names +NestedField(1, "point", "geometry", required=True) +NestedField(2, "region", "geography", required=True) + +# With parameters +NestedField(3, "location", "geometry('EPSG:4326')", required=True) +NestedField(4, "boundary", "geography('EPSG:4326', 'planar')", required=True) +``` + +## Data Representation + +Values are represented as WKB (Well-Known Binary) bytes at runtime: + +```python +# Example: Point(0, 0) in WKB format +point_wkb = bytes.fromhex("0101000000000000000000000000000000000000") +``` + +## Current Limitations + +1. **WKB/WKT Conversion**: Converting between WKB bytes and WKT strings requires external libraries (like Shapely). PyIceberg does not include this conversion to avoid heavy dependencies. + +2. **Spatial Predicates**: Spatial filtering (e.g., ST_Contains, ST_Intersects) is not yet supported for query pushdown. + +3. 
**Bounds Metrics**: Geometry/geography columns do not currently contribute to data file bounds metrics. + +4. **Without geoarrow-pyarrow**: When the `geoarrow-pyarrow` package is not installed, geometry and geography columns are stored as binary without GeoArrow extension type metadata. The Iceberg schema preserves type information, but other tools reading the Parquet files directly may not recognize them as spatial types. Install with `pip install pyiceberg[geoarrow]` for full GeoArrow support. + +## Format Version + +Geometry and geography types require Iceberg format version 3: + +```python +from pyiceberg.table import TableProperties + +# Creating a v3 table +table = catalog.create_table( + identifier="db.spatial_table", + schema=schema, + properties={ + TableProperties.FORMAT_VERSION: "3" + } +) +``` + +Attempting to use these types with format version 1 or 2 will raise a validation error. diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index fab019138e..cde9e9986b 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -801,24 +801,35 @@ def visit_binary(self, _: BinaryType) -> pa.DataType: return pa.large_binary() def visit_geometry(self, geometry_type: GeometryType) -> pa.DataType: - """Convert geometry type to PyArrow binary. + """Convert geometry type to PyArrow type. - Note: PyArrow 21.0.0+ supports native GEOMETRY logical type from Arrow GEO. - For now, we use large_binary which stores WKB bytes. - Future enhancement: detect PyArrow version and use pa.geometry() when available. + When geoarrow-pyarrow is available, returns a GeoArrow WKB extension type + with CRS metadata. Otherwise, falls back to large_binary which stores WKB bytes. 
""" - # TODO: When PyArrow 21.0.0+ is available, use pa.geometry() with CRS metadata - return pa.large_binary() + try: + import geoarrow.pyarrow as ga + + return ga.wkb().with_crs(geometry_type.crs) + except ImportError: + return pa.large_binary() def visit_geography(self, geography_type: GeographyType) -> pa.DataType: - """Convert geography type to PyArrow binary. + """Convert geography type to PyArrow type. - Note: PyArrow 21.0.0+ supports native GEOGRAPHY logical type from Arrow GEO. - For now, we use large_binary which stores WKB bytes. - Future enhancement: detect PyArrow version and use pa.geography() when available. + When geoarrow-pyarrow is available, returns a GeoArrow WKB extension type + with CRS and edge type metadata. Otherwise, falls back to large_binary which stores WKB bytes. """ - # TODO: When PyArrow 21.0.0+ is available, use pa.geography() with CRS and algorithm metadata - return pa.large_binary() + try: + import geoarrow.pyarrow as ga + + wkb_type = ga.wkb().with_crs(geography_type.crs) + # Map Iceberg algorithm to GeoArrow edge type + if geography_type.algorithm == "spherical": + wkb_type = wkb_type.with_edge_type(ga.EdgeType.SPHERICAL) + # "planar" is the default edge type in GeoArrow, no need to set explicitly + return wkb_type + except ImportError: + return pa.large_binary() def _convert_scalar(value: Any, iceberg_type: IcebergType) -> pa.scalar: diff --git a/pyproject.toml b/pyproject.toml index 7fd1f3fdfa..1a9ad35b9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,6 +96,7 @@ hf = ["huggingface-hub>=0.24.0"] pyiceberg-core = ["pyiceberg-core>=0.5.1,<0.8.0"] datafusion = ["datafusion>=45,<49"] gcp-auth = ["google-auth>=2.4.0"] +geoarrow = ["geoarrow-pyarrow>=0.2.0"] [dependency-groups] dev = [ diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index ea2928cae2..3f737e4306 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -98,6 +98,8 @@ DoubleType, FixedType, FloatType, + GeographyType, + 
GeometryType, IntegerType, ListType, LongType, @@ -596,6 +598,108 @@ def test_binary_type_to_pyarrow() -> None: assert visit(iceberg_type, _ConvertToArrowSchema()) == pa.large_binary() +def test_geometry_type_to_pyarrow_without_geoarrow() -> None: + """Test geometry type falls back to large_binary when geoarrow is not available.""" + import sys + + iceberg_type = GeometryType() + + # Remove geoarrow from sys.modules if present and block re-import + saved_modules = {} + for mod_name in list(sys.modules.keys()): + if mod_name.startswith("geoarrow"): + saved_modules[mod_name] = sys.modules.pop(mod_name) + + import builtins + + original_import = builtins.__import__ + + def mock_import(name: str, *args: Any, **kwargs: Any) -> Any: + if name.startswith("geoarrow"): + raise ImportError(f"No module named '{name}'") + return original_import(name, *args, **kwargs) + + try: + builtins.__import__ = mock_import + result = visit(iceberg_type, _ConvertToArrowSchema()) + assert result == pa.large_binary() + finally: + builtins.__import__ = original_import + sys.modules.update(saved_modules) + + +def test_geography_type_to_pyarrow_without_geoarrow() -> None: + """Test geography type falls back to large_binary when geoarrow is not available.""" + import sys + + iceberg_type = GeographyType() + + # Remove geoarrow from sys.modules if present and block re-import + saved_modules = {} + for mod_name in list(sys.modules.keys()): + if mod_name.startswith("geoarrow"): + saved_modules[mod_name] = sys.modules.pop(mod_name) + + import builtins + + original_import = builtins.__import__ + + def mock_import(name: str, *args: Any, **kwargs: Any) -> Any: + if name.startswith("geoarrow"): + raise ImportError(f"No module named '{name}'") + return original_import(name, *args, **kwargs) + + try: + builtins.__import__ = mock_import + result = visit(iceberg_type, _ConvertToArrowSchema()) + assert result == pa.large_binary() + finally: + builtins.__import__ = original_import + 
sys.modules.update(saved_modules) + + +def test_geometry_type_to_pyarrow_with_geoarrow() -> None: + """Test geometry type uses geoarrow WKB extension type when available.""" + pytest.importorskip("geoarrow.pyarrow") + import geoarrow.pyarrow as ga + + # Test default CRS + iceberg_type = GeometryType() + result = visit(iceberg_type, _ConvertToArrowSchema()) + expected = ga.wkb().with_crs("OGC:CRS84") + assert result == expected + + # Test custom CRS + iceberg_type_custom = GeometryType("EPSG:4326") + result_custom = visit(iceberg_type_custom, _ConvertToArrowSchema()) + expected_custom = ga.wkb().with_crs("EPSG:4326") + assert result_custom == expected_custom + + +def test_geography_type_to_pyarrow_with_geoarrow() -> None: + """Test geography type uses geoarrow WKB extension type with edge type when available.""" + pytest.importorskip("geoarrow.pyarrow") + import geoarrow.pyarrow as ga + + # Test default (spherical algorithm) + iceberg_type = GeographyType() + result = visit(iceberg_type, _ConvertToArrowSchema()) + expected = ga.wkb().with_crs("OGC:CRS84").with_edge_type(ga.EdgeType.SPHERICAL) + assert result == expected + + # Test custom CRS with spherical + iceberg_type_custom = GeographyType("EPSG:4326", "spherical") + result_custom = visit(iceberg_type_custom, _ConvertToArrowSchema()) + expected_custom = ga.wkb().with_crs("EPSG:4326").with_edge_type(ga.EdgeType.SPHERICAL) + assert result_custom == expected_custom + + # Test planar algorithm (no edge type set, uses default) + iceberg_type_planar = GeographyType("OGC:CRS84", "planar") + result_planar = visit(iceberg_type_planar, _ConvertToArrowSchema()) + expected_planar = ga.wkb().with_crs("OGC:CRS84") + assert result_planar == expected_planar + + def test_struct_type_to_pyarrow(table_schema_simple: Schema) -> None: expected = pa.struct( [ diff --git a/uv.lock b/uv.lock index b0bab96be4..880d4fc20b 100644 --- a/uv.lock +++ b/uv.lock @@ -1430,6 +1430,91 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/b1/bf/e664cbeab8f2a8b097c0604252760410fde191fe6ac7d5081b29e601ac52/gcsfs-2025.12.0-py3-none-any.whl", hash = "sha256:e06aaec53797dc6b83d5cc90c4d3ae7247b4ee0cf8d8b1ce50e8d6b78e3a9aea", size = 41204, upload-time = "2025-12-03T15:44:58.464Z" }, ] +[[package]] +name = "geoarrow-c" +version = "0.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/21/28/fe727122e139df3ebca696d9ed2c4eae43ae916a3d6beb92fe7f845f8337/geoarrow_c-0.3.1.tar.gz", hash = "sha256:a488794aab6631f5c54c3f77fcff734b2dda162e2ef74cd22fff6989d7aed89d", size = 299004, upload-time = "2025-10-10T03:04:30.772Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/01/16/3b96e1decaaff9dab927075b8f1f0ef8ae9e4bf33af7ba35bff4762f0e26/geoarrow_c-0.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d064c2a8b1884821429e543b24e146966a2bbfd0cb7fb09937d63522c2196303", size = 571130, upload-time = "2025-10-10T03:05:35.586Z" }, + { url = "https://files.pythonhosted.org/packages/07/13/0d5d12d42653280f302ef662b07c22cac16fd4326af95cfbe19963fa52a4/geoarrow_c-0.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b2d7e708c94946ab6a50cbcc97aeb4dcc4088bca186440d0845e7a3528533845", size = 558043, upload-time = "2025-10-10T03:05:26.911Z" }, + { url = "https://files.pythonhosted.org/packages/34/aa/ffe9dd57dde3aff36032ee1ae40a3da60aad8e4983b54c533df8b74d55c0/geoarrow_c-0.3.1-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e83df66541132b49d1492919642a42fe0d77b1c0d64911a6cf47710df26ce202", size = 1873023, upload-time = "2025-10-10T03:04:42.245Z" }, + { url = "https://files.pythonhosted.org/packages/40/30/270c4625498980a81f256b64e867f5db9f67dce5a18979fbe04a5d3c1cc2/geoarrow_c-0.3.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1a59a1663727705c710cb47a75d1ebd69dcbec2958cc4f2cec39d3bdd91171ae", size = 1899985, upload-time = "2025-10-10T03:05:05.584Z" }, + 
{ url = "https://files.pythonhosted.org/packages/8c/48/1fb4d9e20c6d86696aa7bf6de5bd72380b7d1ba4133da110edb4eca8a433/geoarrow_c-0.3.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:47968f89178026889a4e2c460f084b299134c1ed2a242a43c8b69c9154c14e39", size = 2811463, upload-time = "2025-10-10T03:04:43.71Z" }, + { url = "https://files.pythonhosted.org/packages/a3/7a/36b13b1d4d0b92a11c20aabcef9eb97c6ad57d58454f06e2d8778173d220/geoarrow_c-0.3.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bd91674ac439f419323bee1cbb2f28c11c798f846f93f0ab9e93270cd12b5580", size = 2913250, upload-time = "2025-10-10T03:05:07.161Z" }, + { url = "https://files.pythonhosted.org/packages/db/e5/2946b49ca3df684f4ae878fdfee46e7beb07f42ac7ce94c3e239e065f2f3/geoarrow_c-0.3.1-cp310-cp310-win32.whl", hash = "sha256:8e8d2b23fd3a0c23908ee650650438d55d60d90254cbdca12c3dd91a95c77e68", size = 430386, upload-time = "2025-10-10T03:06:00.865Z" }, + { url = "https://files.pythonhosted.org/packages/a7/2d/1b125adf985652def385e494d7757e5d79d18fdfd67966efb14f5e115795/geoarrow_c-0.3.1-cp310-cp310-win_amd64.whl", hash = "sha256:a8e27a260a3ec6b214bd807a40d7be94b14dea028464c48c0e8704f2237fa9d0", size = 448872, upload-time = "2025-10-10T03:05:43.709Z" }, + { url = "https://files.pythonhosted.org/packages/3f/a8/e2637190a66d05cd9ad2ba2a75a2fa08fc836bd12aa6c61e9dedba0d733e/geoarrow_c-0.3.1-cp310-cp310-win_arm64.whl", hash = "sha256:c49094074218ac98a9c26d37ec2e94129e6d1e4185b79f83997b3aea0115a2ce", size = 424638, upload-time = "2025-10-10T03:05:53.691Z" }, + { url = "https://files.pythonhosted.org/packages/73/ad/b688d7825a786b0dfbe1363ad087f9436e6c26bcd8788f97487f44d0512c/geoarrow_c-0.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:946bc5816f7158092d27b76e2e436e03edad814a90b767745534cf02dff2d4b1", size = 574886, upload-time = "2025-10-10T03:05:36.676Z" }, + { url = 
"https://files.pythonhosted.org/packages/07/7a/53dfdb581d28755c206f7245fb259ca68758a75169b3b1cbcf671dad4915/geoarrow_c-0.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a4f8d7d269446772c887bf8682d67877ea6ba12187de86ee2f7d7ae679b00bed", size = 561122, upload-time = "2025-10-10T03:05:27.884Z" }, + { url = "https://files.pythonhosted.org/packages/c5/f4/b46d6a95415fa385894b22444106fd506e0b3e514ceec205d6960e57d669/geoarrow_c-0.3.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:081ec4d7eeaefa1a381e43e7f676ca3ba4c718ef97c26c834627cac77821f4c1", size = 1931003, upload-time = "2025-10-10T03:04:44.926Z" }, + { url = "https://files.pythonhosted.org/packages/a4/b7/6b6a007f1b59aace626c244040dc76a6ea996bc06baf7ecb00717f6381b5/geoarrow_c-0.3.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c11b3d1fe1f4c54a5666ca9816af68764197e73c3c548d935f336cbf43d705ea", size = 1953137, upload-time = "2025-10-10T03:05:08.787Z" }, + { url = "https://files.pythonhosted.org/packages/92/38/a9b7422d5f4c84799ef9d541f93bb5b8e65b0d2b815b0a594f2e44203281/geoarrow_c-0.3.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:85de6694e6d304ed002aa81d3f1d8625455317c97fde05f1e12e985ca37ad53e", size = 2870043, upload-time = "2025-10-10T03:04:46.493Z" }, + { url = "https://files.pythonhosted.org/packages/5b/bf/19175d8a61ca41ef316437a8f3db2fc338cf67f511a23feccb152bd615f8/geoarrow_c-0.3.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:49871c87a7c0ce2a416e006fb4d4273b0302f92aa76bb2706379e2b61e9f8fd2", size = 2964436, upload-time = "2025-10-10T03:05:09.937Z" }, + { url = "https://files.pythonhosted.org/packages/7f/a5/505c59de3abee3af7b0a15418cab32349a51bb63cfeba0a150114023ac98/geoarrow_c-0.3.1-cp311-cp311-win32.whl", hash = "sha256:a70edf5531198a3bcb590566e3927c62b5f25cfc030e551625ca01238278f8db", size = 431483, upload-time = "2025-10-10T03:06:01.893Z" }, + { url = 
"https://files.pythonhosted.org/packages/de/71/1b95bc02a9eb165026e6c404cbb7f8de42cc8856142ef2a6cf506ba842e7/geoarrow_c-0.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:7e9aa0f1e063fab642d2b9d24f2bb8a8311e8cfdc48cd2df1340d90b387a9a8b", size = 450199, upload-time = "2025-10-10T03:05:44.705Z" }, + { url = "https://files.pythonhosted.org/packages/f0/1c/85d256b0c95861e2251684d7e9d5faeb41a9d5723b85924a13bf62541ad3/geoarrow_c-0.3.1-cp311-cp311-win_arm64.whl", hash = "sha256:60707a335d40807e806ca295a80087d8cd2c6e061dc4135ca22116208dfa477f", size = 425425, upload-time = "2025-10-10T03:05:54.712Z" }, + { url = "https://files.pythonhosted.org/packages/01/3c/5b2ada5b6166495ad1fd1f93127a6ad313833a25c38f98bab0c9b4acc95d/geoarrow_c-0.3.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:49b0cf15d73223138fae38b6f8426897c3352006d2b4216be55d5c38383fb5d5", size = 574941, upload-time = "2025-10-10T03:05:37.628Z" }, + { url = "https://files.pythonhosted.org/packages/66/ec/ffd43f0671a5c7d46776afe4a3d15d21a04206339df3054ef676b3c36fbf/geoarrow_c-0.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d0bab5f31ad17e7c6baaf2a7a0e65b67011e1b008daf4bd4ecbf9172085f0a20", size = 560325, upload-time = "2025-10-10T03:05:29.035Z" }, + { url = "https://files.pythonhosted.org/packages/74/05/c28749e651921e469d22c06c694b3e42dac5605228ac1c34add3371bb4ba/geoarrow_c-0.3.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0700091e9bbb0ffdee37e5ac59ef2a38aab5683e33df14f99a7bfec487dd12f1", size = 1910515, upload-time = "2025-10-10T03:04:48.188Z" }, + { url = "https://files.pythonhosted.org/packages/2a/c8/71f5632a1912551e0f1a04280c7a14fa6378f01b3fe849d78bb3817226a2/geoarrow_c-0.3.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3aa9370ac8bb295fc192c70f499502e8ae419b335fbe29ecb344a2a368f3afb1", size = 1949508, upload-time = "2025-10-10T03:05:11.23Z" }, + { url = 
"https://files.pythonhosted.org/packages/e7/e4/e7a4de01e1ac1cbcf8e20f4747182a5c2d8655a0532c1e3eada1182904b1/geoarrow_c-0.3.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:434536677f328923ea9be366824abe6fd177881eb46ebcf7cca4d7b73546cf0c", size = 2846395, upload-time = "2025-10-10T03:04:49.857Z" }, + { url = "https://files.pythonhosted.org/packages/7f/c8/f62fbad76e8baf7209b8f20fa849457a7b493639f5b0ec4343b69ebcf554/geoarrow_c-0.3.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:08d48af947d76a983c6cbaf05587c106d81149fad63c18ac644b72c599e70711", size = 2947165, upload-time = "2025-10-10T03:05:12.321Z" }, + { url = "https://files.pythonhosted.org/packages/14/8a/dda4390f01ebc29a8a4b2190f132e9e251afecdb12afe1c0143f3525b0b2/geoarrow_c-0.3.1-cp312-cp312-win32.whl", hash = "sha256:bee0205900f3b88f6a3e833ab08007b654298840bd9e4e39f6aa1595cf294f1e", size = 430547, upload-time = "2025-10-10T03:06:02.895Z" }, + { url = "https://files.pythonhosted.org/packages/36/1f/63d7d5491f1dc81e6c793276b898b983824faa6167616ed4130477d4629e/geoarrow_c-0.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:b9101c94ee44687f2c62ce3cc55acef99bb032b0653e17ab12c83f3505f8b4b9", size = 451193, upload-time = "2025-10-10T03:05:46.211Z" }, + { url = "https://files.pythonhosted.org/packages/1b/fb/a153ad1381899459442b46c028819bc945837459bc8d7e0847c74bd375d9/geoarrow_c-0.3.1-cp312-cp312-win_arm64.whl", hash = "sha256:be77085242d5db5f85271da15a1aa5adc84c30e198691af04490b4fa91f09a8b", size = 424403, upload-time = "2025-10-10T03:05:55.602Z" }, + { url = "https://files.pythonhosted.org/packages/f4/66/f408e4c5cea55d056657ff3a8ae46ca1c9f47adb4d4edeffaacbdccd940d/geoarrow_c-0.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:105ce761cf7e59b335ebff256f7d65ae7ddedda587bc1c5a63191169f14a18ef", size = 576484, upload-time = "2025-10-10T03:05:38.585Z" }, + { url = 
"https://files.pythonhosted.org/packages/9c/ba/d3a5e04f9ebd511dd5e269af2b4227463b7327a17a817267d64fadbfe662/geoarrow_c-0.3.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ea64cb38ae2af47973a28c7d36ac089f35dcf2916765511aace63fc2c868f39a", size = 566953, upload-time = "2025-10-10T03:05:29.998Z" }, + { url = "https://files.pythonhosted.org/packages/8c/24/9b956c023046f7c082347d57de6d11683d713b4b4a6edf92b364f551da1f/geoarrow_c-0.3.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e5849c656a8ab8359215befbaaf197a0e2eddfa3b030ee11f49a58c985c4c59c", size = 1955242, upload-time = "2025-10-10T03:04:51.061Z" }, + { url = "https://files.pythonhosted.org/packages/01/b0/7226069e909fb54c9b799ea6f5ff97c500c689311bd0b81eabf22f5c6eec/geoarrow_c-0.3.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b2b90b22caf3756e63f7038a4277368865e3541101872ba73e38cebd462c790", size = 1995358, upload-time = "2025-10-10T03:05:13.477Z" }, + { url = "https://files.pythonhosted.org/packages/dc/0d/01d32c9b57f86fdb38aca62ec29c24c02c324982d7fa8108e38744ec460f/geoarrow_c-0.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a1f544cf77a724f7853a0bb167640915a458dbce802dc0869f3f40968f2022cf", size = 2884255, upload-time = "2025-10-10T03:04:52.2Z" }, + { url = "https://files.pythonhosted.org/packages/88/d1/51e16079cb33cb2271717391585f95002930a2886e7f43127f9a1b644711/geoarrow_c-0.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:15b1b920b01532903bc11a1fac351cb93d12b6965a97b365f2c88e1667ef488d", size = 2988622, upload-time = "2025-10-10T03:05:14.751Z" }, + { url = "https://files.pythonhosted.org/packages/d4/79/f57f8f8971770285d62d5d76afe42bca248a25304fbc4733bac58d14ab0e/geoarrow_c-0.3.1-cp313-cp313-win32.whl", hash = "sha256:1664774a1540721ab42ac3131a9e0941e6394ba4f82e408f7ce27800cbd1aed2", size = 431565, upload-time = "2025-10-10T03:06:03.778Z" }, + { url = 
"https://files.pythonhosted.org/packages/b5/de/2b53a0c1a3b7641f2662a69089a0bb15fd7e7d25b04a92d2330a73a4de96/geoarrow_c-0.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:dd13290d9fab82e7c6470da6ec6ee0f4a2e1d05a6b4050650c71f5bfb2b023e9", size = 455825, upload-time = "2025-10-10T03:05:47.245Z" }, + { url = "https://files.pythonhosted.org/packages/53/e1/1b99b2252163fc68197fd0514087569ca610a75e0926716a4c591803822a/geoarrow_c-0.3.1-cp313-cp313-win_arm64.whl", hash = "sha256:ab4511159b0405c1b094b631c259ad07a16b4f1a3b84d453999e9dc60c302397", size = 427610, upload-time = "2025-10-10T03:05:56.549Z" }, + { url = "https://files.pythonhosted.org/packages/3f/e9/36e6cfeeb1713c1f1b409e81065a2011a4a1c618cebc663153cedae47e0a/geoarrow_c-0.3.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8204245dd903f99fcd2f9309c1103392d1150735a2221f02575b07a5c66fa285", size = 577089, upload-time = "2025-10-10T03:05:39.69Z" }, + { url = "https://files.pythonhosted.org/packages/f2/f1/b6aae1316d28a44b9604a66cc4f3a7b38f0f1e7ef86e68e1e6a7c7e13ea7/geoarrow_c-0.3.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4a69d707e21d7d2acdbd61eb11df7b052ce1b6c08cd92793c329342d4e501ce7", size = 567589, upload-time = "2025-10-10T03:05:30.965Z" }, + { url = "https://files.pythonhosted.org/packages/a4/a3/0d68cb5eff8e18bfdf8b6a624678e36ed4c47b2a32d5a27d48e0f130e7e0/geoarrow_c-0.3.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:80060160896849351f998b77b536853e91cc8eaddc13fedd5785b2459f9ce0a0", size = 1953668, upload-time = "2025-10-10T03:04:53.838Z" }, + { url = "https://files.pythonhosted.org/packages/f1/56/f12733a85737688a74029fd598a15e5f97fff999210d395fd25e3d70ae58/geoarrow_c-0.3.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1c566c80aec907de482f9b87095b6f026a69538ff37cf701a7efbcaee6ba5389", size = 1988200, upload-time = "2025-10-10T03:05:16.355Z" }, + { url = 
"https://files.pythonhosted.org/packages/ea/b5/1529f5d8cc892790c3524252f6961e0887a6116a9c8b1f8c4f6441c5b95f/geoarrow_c-0.3.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:775615b2e77d7615ca23ac37f3a00ae11696eed3bb2a1e153a5cc094c662799f", size = 2883956, upload-time = "2025-10-10T03:04:55.185Z" }, + { url = "https://files.pythonhosted.org/packages/8b/0d/d269507838d049ffb090108d2000c4638f9a27606fdaee74ff6d6b400b29/geoarrow_c-0.3.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ec1f29501d9d28c2642b0f84b2a1b958e3332375ebf1c6f2d7cb1b3876daa500", size = 2982956, upload-time = "2025-10-10T03:05:17.633Z" }, + { url = "https://files.pythonhosted.org/packages/09/46/d1fc8d114ca713d9f25b0a130fa1e2a5f250b75680020cf63c610044a7eb/geoarrow_c-0.3.1-cp314-cp314-win32.whl", hash = "sha256:e3a310a06edebb7289785cdab19a9e64a11150644d3b657c23125aafe404f0d4", size = 436258, upload-time = "2025-10-10T03:06:04.953Z" }, + { url = "https://files.pythonhosted.org/packages/7d/48/33c41748a4f7dd461745b7fbedb19e81f773771144d851e12ff12b2fc546/geoarrow_c-0.3.1-cp314-cp314-win_amd64.whl", hash = "sha256:86f87599950abe7412a85a20fd5a045a859e7bb8f69d9376ccaa3c3a7cbf293a", size = 460679, upload-time = "2025-10-10T03:05:48.491Z" }, + { url = "https://files.pythonhosted.org/packages/87/21/c7d14179893f83c76b528276aeeefc5f6ed3409ea0ef723d8e157840dd8d/geoarrow_c-0.3.1-cp314-cp314-win_arm64.whl", hash = "sha256:6502ab08968f427e94e267ae223020c0c5eb7a09db00885e349a6fab3fe632c6", size = 433657, upload-time = "2025-10-10T03:05:57.449Z" }, + { url = "https://files.pythonhosted.org/packages/f0/87/96bda2d65fa1da8a27f11f56bfd5d99f335829ab9892149a68acc04bf04e/geoarrow_c-0.3.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9bafb69791a1ca78273b0f69ab7358d1ec90e5fa80b20816fb8b7228c107bf42", size = 584495, upload-time = "2025-10-10T03:05:40.686Z" }, + { url = 
"https://files.pythonhosted.org/packages/5e/a8/772aa9c0df3e285d14eb4ae1c47d310b4c2dcc8ad96225f16841d86613c3/geoarrow_c-0.3.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:dcf61d7b35999c3fbde3152a3774ba5b6633dbcbe7fc40fea385a96ff3b5f42a", size = 576697, upload-time = "2025-10-10T03:05:31.987Z" }, + { url = "https://files.pythonhosted.org/packages/2e/ad/8b10f728f2b0467c9859b1170dcc4c460350f935bb6600842d7bbd51b377/geoarrow_c-0.3.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:24da8395d5b298f2071f54497d876645daad5bee78fcf627bee746dc425cba2b", size = 1992239, upload-time = "2025-10-10T03:04:56.306Z" }, + { url = "https://files.pythonhosted.org/packages/73/69/521cf08fa59d1d492b9906b7d43238bbb0b50e9d5a119b94af6ce89cfc3c/geoarrow_c-0.3.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:82350f13e1a31d1d4825fb65e6efb8d1b0368c17ddf5145d81049c532e3a1198", size = 1988446, upload-time = "2025-10-10T03:05:18.901Z" }, + { url = "https://files.pythonhosted.org/packages/d9/56/abee559124e989d9f60101aedb8385e13bc808ffaabdb711ee77f27bbf7a/geoarrow_c-0.3.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:0eb54b4d6ecddfdfdf726fefbed477b64b006873523c014d73b7d3b37818592c", size = 2901820, upload-time = "2025-10-10T03:04:57.524Z" }, + { url = "https://files.pythonhosted.org/packages/7a/39/dab9032f617c5e6bec8a1bfe2037fd26853576c958c75af2aacfc76bb7ac/geoarrow_c-0.3.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:aa17482255471b2d954866588480aa5a494d41fc3f8e00a8ab6f25d25b06157d", size = 2975926, upload-time = "2025-10-10T03:05:20.669Z" }, + { url = "https://files.pythonhosted.org/packages/b8/60/ce8fbc75f1d6dab9faccbd002bb11eaa2d584df948bc33a4abca15df6aa3/geoarrow_c-0.3.1-cp314-cp314t-win32.whl", hash = "sha256:dea8b9e16699909725e80c214ca4b4968b4c512213816876fae9e56087cf1f57", size = 452711, upload-time = "2025-10-10T03:06:06.012Z" }, + { url = 
"https://files.pythonhosted.org/packages/91/2d/fb9923dcb9f04f8ee4479ca9e1c601e1327cdeb63cc67c25d0bf3c1465df/geoarrow_c-0.3.1-cp314-cp314t-win_amd64.whl", hash = "sha256:3394440734affd699dfa766eeaec1e53d76d0fe1ece5f5ee3c80e06a0e647abf", size = 482826, upload-time = "2025-10-10T03:05:49.903Z" }, + { url = "https://files.pythonhosted.org/packages/65/c4/c96e10f0e26ec59fe89470686486f6ef75d13b8de4ee423cefd9b4ae8823/geoarrow_c-0.3.1-cp314-cp314t-win_arm64.whl", hash = "sha256:430a177b6a2145f0bcf7e39d575f2e01352608054f848e53198bfda7f59e59fb", size = 442172, upload-time = "2025-10-10T03:05:58.508Z" }, +] + +[[package]] +name = "geoarrow-pyarrow" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "geoarrow-c" }, + { name = "geoarrow-types" }, + { name = "pyarrow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/e0/ff8430e5e1049e310c35f335cf7e14682bfeafeb773f48c5a7425359a4ba/geoarrow_pyarrow-0.2.0.tar.gz", hash = "sha256:5c981f5cae26fa6cdfb6f9b83fb490d36bf0fe6f6fa360c4c8983e0a8a457926", size = 33488, upload-time = "2025-05-27T03:51:45.071Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ed/d2/f47453e6aefb69acdde7af8b0efb3fe28c82e7c37650b0ce1ad61ab847e5/geoarrow_pyarrow-0.2.0-py3-none-any.whl", hash = "sha256:dcc1d4684e11771c3f59ba18e71fa7cc6d7cb8fd01db7bdc73ffb88c66cd0446", size = 25600, upload-time = "2025-05-27T03:51:44.113Z" }, +] + +[[package]] +name = "geoarrow-types" +version = "0.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a5/97/fa35f5d13a803b8f16e59c1f18f06607b9df5683c08bd7cd7a48a29ce988/geoarrow_types-0.3.0.tar.gz", hash = "sha256:82243e4be88b268fa978ae5bba6c6680c3556735e795965b2fe3e6fbfea9f9ee", size = 23708, upload-time = "2025-05-27T03:39:39.979Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/b6/16/e37cb1b0894c9cf3f9b1c50ebcfab56a0d9fe7c3b6f97d5680a7eb27ca08/geoarrow_types-0.3.0-py3-none-any.whl", hash = "sha256:439df6101632080442beccc7393cac54d6c7f6965da897554349e94d2492f613", size = 19025, upload-time = "2025-05-27T03:39:38.652Z" }, +] + [[package]] name = "ghp-import" version = "2.1.0" @@ -3728,6 +3813,9 @@ gcp-auth = [ gcsfs = [ { name = "gcsfs" }, ] +geoarrow = [ + { name = "geoarrow-pyarrow" }, +] glue = [ { name = "boto3" }, ] @@ -3827,6 +3915,7 @@ requires-dist = [ { name = "duckdb", marker = "extra == 'duckdb'", specifier = ">=0.5.0,<2.0.0" }, { name = "fsspec", specifier = ">=2023.1.0" }, { name = "gcsfs", marker = "extra == 'gcsfs'", specifier = ">=2023.1.0" }, + { name = "geoarrow-pyarrow", marker = "extra == 'geoarrow'", specifier = ">=0.2.0" }, { name = "google-auth", marker = "extra == 'gcp-auth'", specifier = ">=2.4.0" }, { name = "google-cloud-bigquery", marker = "extra == 'bigquery'", specifier = ">=3.33.0,<4" }, { name = "huggingface-hub", marker = "extra == 'hf'", specifier = ">=0.24.0" }, @@ -3860,7 +3949,7 @@ requires-dist = [ { name = "thrift-sasl", marker = "extra == 'hive-kerberos'", specifier = ">=0.4.3" }, { name = "zstandard", specifier = ">=0.13.0,<1.0.0" }, ] -provides-extras = ["pyarrow", "pandas", "duckdb", "ray", "bodo", "daft", "polars", "snappy", "hive", "hive-kerberos", "s3fs", "glue", "adlfs", "dynamodb", "bigquery", "sql-postgres", "sql-sqlite", "gcsfs", "rest-sigv4", "hf", "pyiceberg-core", "datafusion", "gcp-auth"] +provides-extras = ["pyarrow", "pandas", "duckdb", "ray", "bodo", "daft", "polars", "snappy", "hive", "hive-kerberos", "s3fs", "glue", "adlfs", "dynamodb", "bigquery", "sql-postgres", "sql-sqlite", "gcsfs", "rest-sigv4", "hf", "pyiceberg-core", "datafusion", "gcp-auth", "geoarrow"] [package.metadata.requires-dev] dev = [ From 0d1d7a1a01141a455a5c0569da5643e1dd1ca3eb Mon Sep 17 00:00:00 2001 From: Sam Verhasselt Date: Mon, 22 Dec 2025 16:07:59 -0800 Subject: 
[PATCH 3/4] Add RFC --- mkdocs/docs/dev/rfc-geospatial-types.md | 145 ++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 mkdocs/docs/dev/rfc-geospatial-types.md diff --git a/mkdocs/docs/dev/rfc-geospatial-types.md b/mkdocs/docs/dev/rfc-geospatial-types.md new file mode 100644 index 0000000000..7a28a15d2b --- /dev/null +++ b/mkdocs/docs/dev/rfc-geospatial-types.md @@ -0,0 +1,145 @@ +# RFC: Iceberg v3 Geospatial Primitive Types + +## Motivation + +Apache Iceberg v3 introduces native geospatial types (`geometry` and `geography`) to support spatial data workloads. These types enable: + +1. **Interoperability**: Consistent spatial data representation across Iceberg implementations +2. **Query optimization**: Future support for spatial predicate pushdown +3. **Standards compliance**: Alignment with OGC and ISO spatial data standards + +This RFC describes the design and implementation of these types in PyIceberg. + +## Scope + +**In scope:** +- `geometry(C)` and `geography(C, A)` primitive type definitions +- Type parsing and serialization (round-trip support) +- Avro mapping (WKB bytes) +- PyArrow/Parquet conversion (falls back to binary when `geoarrow-pyarrow` is not installed) +- Format version enforcement (v3 required) + +**Out of scope (future work):** +- Spatial predicate pushdown (e.g., ST_Contains, ST_Intersects) +- WKB/WKT conversion (requires external dependencies) +- Geometry/geography bounds metrics +- Spatial indexing + +## Non-Goals + +- Adding heavy dependencies like Shapely, GEOS, or GeoPandas +- Implementing spatial operations or computations +- Supporting format versions < 3 + +## Design + +### Type Parameters + +**GeometryType:** +- `crs` (string): Coordinate Reference System, defaults to `"OGC:CRS84"` + +**GeographyType:** +- `crs` (string): Coordinate Reference System, defaults to `"OGC:CRS84"` +- `algorithm` (string): Geographic algorithm, defaults to `"spherical"` + +### Type String Format + +```python +# Default parameters +"geometry" +"geography" + +# 
With custom CRS +"geometry('EPSG:4326')" +"geography('EPSG:4326')" + +# With custom CRS and algorithm +"geography('EPSG:4326', 'planar')" +``` + +### Runtime Representation + +Values are stored as WKB (Well-Known Binary) bytes at runtime. This matches the Avro and Parquet physical representation per the Iceberg spec. + +### JSON Single-Value Serialization + +Per the Iceberg spec, geometry/geography values should be serialized as WKT (Well-Known Text) strings in JSON. However, since we represent values as WKB bytes at runtime, conversion between WKB and WKT would require external dependencies. + +**Current behavior:** `NotImplementedError` is raised for JSON serialization/deserialization until a conversion strategy is established. + +### Avro Mapping + +Both geometry and geography types map to Avro `bytes` type, consistent with `BinaryType` handling. + +### PyArrow/Parquet Mapping + +**With geoarrow-pyarrow installed:** +- Geometry types convert to GeoArrow WKB extension type with CRS metadata +- Geography types convert to GeoArrow WKB extension type with CRS and edge type metadata +- Uses `geoarrow.pyarrow.wkb().with_crs()` and `.with_edge_type()` for full GeoArrow compatibility + +**Without geoarrow-pyarrow:** +- Geometry and geography types fall back to `pa.large_binary()` +- This provides WKB storage without GEO logical type metadata + +## Compatibility + +### Format Version + +Geometry and geography types require Iceberg format version 3. Attempting to use them with format version 1 or 2 will raise a validation error via `Schema.check_format_version_compatibility()`. + +### geoarrow-pyarrow + +- **Optional dependency**: Install with `pip install pyiceberg[geoarrow]` +- **Without geoarrow**: Geometry/geography stored as binary columns (WKB) +- **With geoarrow**: Full GeoArrow extension type support with CRS/edge metadata + +### Breaking Changes + +None. These are new types that do not affect existing functionality. 
+ +## Dependency/Versioning + +**Required:** +- PyIceberg core (no new dependencies) + +**Optional for full functionality:** +- PyArrow 21.0.0+ for native Parquet GEO logical types + +## Testing Strategy + +1. **Unit tests** (`test_types.py`): + - Type creation with default/custom parameters + - `__str__` and `__repr__` methods + - JSON serialization/deserialization round-trip + - Equality, hashing, and pickling + - `minimum_format_version()` enforcement + +2. **Integration tests** (future): + - End-to-end table creation with geometry/geography columns + - Parquet file round-trip with PyArrow + +## Known Limitations + +1. **No WKB/WKT conversion**: JSON single-value serialization raises `NotImplementedError` +2. **No bounds metrics**: Cannot extract bounds from WKB without parsing +3. **No spatial predicates**: Query optimization for spatial filters not yet implemented +4. **Without geoarrow-pyarrow (or with PyArrow < 21.0.0)**: Falls back to binary type without GEO metadata +5. **Reverse conversion from Parquet**: Binary columns cannot be distinguished from geometry/geography without Iceberg schema metadata + +## File Locations + +| Component | File | +|-----------|------| +| Type definitions | `pyiceberg/types.py` | +| Conversions | `pyiceberg/conversions.py` | +| Schema visitors | `pyiceberg/schema.py` | +| Avro conversion | `pyiceberg/utils/schema_conversion.py` | +| PyArrow conversion | `pyiceberg/io/pyarrow.py` | +| Unit tests | `tests/test_types.py` | + +## References + +- [Iceberg v3 Type Specification](https://iceberg.apache.org/spec/#schemas-and-data-types) +- [Arrow GEO Proposal](https://arrow.apache.org/docs/format/GeoArrow.html) +- [Arrow PR #45459](https://github.com/apache/arrow/pull/45459) From f4359b48afff0213f0d8b3c69d8eb27c17dbb354 Mon Sep 17 00:00:00 2001 From: Sam Verhasselt Date: Mon, 22 Dec 2025 16:29:00 -0800 Subject: [PATCH 4/4] markdown lint fix --- mkdocs/docs/dev/rfc-geospatial-types.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git 
a/mkdocs/docs/dev/rfc-geospatial-types.md b/mkdocs/docs/dev/rfc-geospatial-types.md index 7a28a15d2b..8b5d492b43 100644 --- a/mkdocs/docs/dev/rfc-geospatial-types.md +++ b/mkdocs/docs/dev/rfc-geospatial-types.md @@ -13,6 +13,7 @@ This RFC describes the design and implementation of these types in PyIceberg. ## Scope **In scope:** + - `geometry(C)` and `geography(C, A)` primitive type definitions - Type parsing and serialization (round-trip support) - Avro mapping (WKB bytes) @@ -20,6 +21,7 @@ This RFC describes the design and implementation of these types in PyIceberg. - Format version enforcement (v3 required) **Out of scope (future work):** + - Spatial predicate pushdown (e.g., ST_Contains, ST_Intersects) - WKB/WKT conversion (requires external dependencies) - Geometry/geography bounds metrics @@ -36,9 +38,11 @@ This RFC describes the design and implementation of these types in PyIceberg. ### Type Parameters **GeometryType:** + - `crs` (string): Coordinate Reference System, defaults to `"OGC:CRS84"` **GeographyType:** + - `crs` (string): Coordinate Reference System, defaults to `"OGC:CRS84"` - `algorithm` (string): Geographic algorithm, defaults to `"spherical"` @@ -74,11 +78,13 @@ Both geometry and geography types map to Avro `bytes` type, consistent with `Bin ### PyArrow/Parquet Mapping **With geoarrow-pyarrow installed:** + - Geometry types convert to GeoArrow WKB extension type with CRS metadata - Geography types convert to GeoArrow WKB extension type with CRS and edge type metadata - Uses `geoarrow.pyarrow.wkb().with_crs()` and `.with_edge_type()` for full GeoArrow compatibility **Without geoarrow-pyarrow:** + - Geometry and geography types fall back to `pa.large_binary()` - This provides WKB storage without GEO logical type metadata @@ -101,9 +107,11 @@ None. These are new types that do not affect existing functionality. 
## Dependency/Versioning **Required:** + - PyIceberg core (no new dependencies) **Optional for full functionality:** + - PyArrow 21.0.0+ for native Parquet GEO logical types ## Testing Strategy