diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index e05450a97f7f..911604cd0bd1 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -443,10 +443,10 @@ impl RecordBatch { /// // Initially, the metadata is empty /// assert!(batch.schema().metadata().get("key").is_none()); /// // Insert a key-value pair into the metadata - /// batch.schema_metadata_mut().insert("key".into(), "value".into()); + /// batch.schema_metadata_mut().insert("key", "value"); /// assert_eq!(batch.schema().metadata().get("key"), Some(&String::from("value"))); /// ``` - pub fn schema_metadata_mut(&mut self) -> &mut std::collections::HashMap { + pub fn schema_metadata_mut(&mut self) -> &mut arrow_schema::Metadata { let schema = Arc::make_mut(&mut self.schema); &mut schema.metadata } @@ -1709,9 +1709,7 @@ mod tests { batch.clone().with_schema(required_schema).unwrap_err(); // Can add metadata - let metadata = vec![("foo".to_string(), "bar".to_string())] - .into_iter() - .collect(); + let metadata = arrow_schema::Metadata::from([("foo", "bar")]); let metadata_schema = nullable_schema.as_ref().clone().with_metadata(metadata); let batch = batch.with_schema(Arc::new(metadata_schema)).unwrap(); diff --git a/arrow-avro/src/reader/async_reader/mod.rs b/arrow-avro/src/reader/async_reader/mod.rs index 31df7cdf110b..9ec8465a0165 100644 --- a/arrow-avro/src/reader/async_reader/mod.rs +++ b/arrow-avro/src/reader/async_reader/mod.rs @@ -553,7 +553,7 @@ mod tests { use arrow_array::cast::AsArray; use arrow_array::types::{Int32Type, Int64Type}; use arrow_array::*; - use arrow_schema::{DataType, Field, Schema, SchemaRef, TimeUnit}; + use arrow_schema::{DataType, Field, Metadata, Schema, SchemaRef, TimeUnit}; use futures::{StreamExt, TryStreamExt}; use object_store::local::LocalFileSystem; use object_store::path::Path; @@ -1630,7 +1630,7 @@ mod tests { let expected_schema = get_alltypes_schema() .as_ref() .clone() - .with_metadata(Default::default()); + .with_metadata(Metadata::default()); // Build reader without providing reader schema - should use writer schema from file let reader = AsyncAvroFileReader::builder(file_reader, file_size, 1024) @@ -1657,7 +1657,7 @@ mod tests { let schema = get_alltypes_schema() .project(&[0, 1, 7]) .unwrap() - .with_metadata(Default::default()); + .with_metadata(Metadata::default()); let reader_schema = AvroSchema::try_from(&schema).unwrap(); let expected_schema = schema.clone(); @@ -1690,7 +1690,7 @@ mod tests { let expected_schema = get_nested_records_schema() .as_ref() .clone() - .with_metadata(Default::default()); + .with_metadata(Metadata::default()); let reader = AsyncAvroFileReader::builder(file_reader, file_size, 1024) .try_build() diff --git a/arrow-avro/src/reader/mod.rs b/arrow-avro/src/reader/mod.rs index 6dbf5b1553c2..9070ff5a7a6d 100644 --- a/arrow-avro/src/reader/mod.rs +++ b/arrow-avro/src/reader/mod.rs @@ -5482,34 +5482,14 @@ mod test { #[cfg(not(feature = "avro_custom_types"))] { let schema = Arc::new(Schema::new(vec![ - Field::new("duration_time_nanos", DataType::Int64, false).with_metadata( - [( - "logicalType".to_string(), - "arrow.duration-nanos".to_string(), - )] - .into(), - ), - Field::new("duration_time_micros", DataType::Int64, false).with_metadata( - [( - "logicalType".to_string(), - "arrow.duration-micros".to_string(), - )] - .into(), - ), - Field::new("duration_time_millis", DataType::Int64, false).with_metadata( - [( - "logicalType".to_string(), - "arrow.duration-millis".to_string(), - )] - .into(), - ), - Field::new("duration_time_seconds", DataType::Int64, false).with_metadata( - [( - "logicalType".to_string(), - "arrow.duration-seconds".to_string(), - )] - .into(), - ), + Field::new("duration_time_nanos", DataType::Int64, false) + .with_metadata([("logicalType", "arrow.duration-nanos")]), + Field::new("duration_time_micros", DataType::Int64, false) + .with_metadata([("logicalType", "arrow.duration-micros")]), + Field::new("duration_time_millis", DataType::Int64, false) + .with_metadata([("logicalType", "arrow.duration-millis")]), + Field::new("duration_time_seconds", DataType::Int64, false) + .with_metadata([("logicalType", "arrow.duration-seconds")]), ])); let nanos = @@ -8431,7 +8411,7 @@ mod test { const UUID_EXT_KEY: &str = "ARROW:extension:name"; const UUID_LOGICAL_KEY: &str = "logicalType"; - let uuid_md_top: Option> = batch + let uuid_md_top: Option = batch .schema() .field_with_name("uuid_str") .ok() @@ -8449,7 +8429,7 @@ mod test { } }); - let uuid_md_union: Option> = batch + let uuid_md_union: Option = batch .schema() .field_with_name("union_uuid_or_fixed10") .ok() diff --git a/arrow-avro/src/schema.rs b/arrow-avro/src/schema.rs index 1b0c2e26f773..28965e5e99e9 100644 --- a/arrow-avro/src/schema.rs +++ b/arrow-avro/src/schema.rs @@ -20,8 +20,8 @@ #[cfg(feature = "canonical_extension_types")] use arrow_schema::extension::ExtensionType; use arrow_schema::{ - ArrowError, DataType, Field as ArrowField, IntervalUnit, Schema as ArrowSchema, TimeUnit, - UnionMode, + ArrowError, DataType, Field as ArrowField, IntervalUnit, Metadata, Schema as ArrowSchema, + TimeUnit, UnionMode, }; use serde::{Deserialize, Serialize}; use serde_json::{Map as JsonMap, Value, json}; @@ -1155,10 +1155,7 @@ fn is_internal_arrow_key(key: &str) -> bool { /// skipping keys that are Avro-reserved, internal Arrow keys, or /// nested under the `avro.schema.` namespace. Values that parse as /// JSON are inserted as JSON; otherwise the raw string is preserved. -fn extend_with_passthrough_metadata( - target: &mut JsonMap, - metadata: &HashMap, -) { +fn extend_with_passthrough_metadata(target: &mut JsonMap, metadata: &Metadata) { for (meta_key, meta_val) in metadata { if meta_key.starts_with("avro.") || is_internal_arrow_key(meta_key) { continue; @@ -1318,7 +1315,7 @@ fn union_branch_signature(branch: &Value) -> Result { fn datatype_to_avro( dt: &DataType, field_name: &str, - metadata: &HashMap, + metadata: &Metadata, name_gen: &mut NameGenerator, null_order: Nullability, strip: bool, @@ -1915,7 +1912,7 @@ fn datatype_to_avro( fn process_datatype( dt: &DataType, field_name: &str, - metadata: &HashMap, + metadata: &Metadata, name_gen: &mut NameGenerator, null_order: Nullability, is_nullable: bool, diff --git a/arrow-avro/src/writer/mod.rs b/arrow-avro/src/writer/mod.rs index 8d078b1b8268..235db3f0f7cf 100644 --- a/arrow-avro/src/writer/mod.rs +++ b/arrow-avro/src/writer/mod.rs @@ -1883,8 +1883,8 @@ mod tests { /// Checks that `actual_meta` contains all of `expected_meta`, and any additional /// keys in `actual_meta` are from a permitted set. fn assert_metadata_is_superset( - expected_meta: &HashMap, - actual_meta: &HashMap, + expected_meta: &arrow_schema::Metadata, + actual_meta: &arrow_schema::Metadata, context: &str, ) { let allowed_additions: HashSet<&str> = diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index e26072fea917..b2a3ca73e17b 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -682,7 +682,7 @@ impl Decoder { fn parse( rows: &StringRecords<'_>, fields: &Fields, - metadata: Option>, + metadata: Option, projection: Option<&Vec>, line_number: usize, null_regex: &NullRegex, @@ -1326,7 +1326,7 @@ mod tests { assert_eq!(37, batch.num_rows()); assert_eq!(3, batch.num_columns()); - assert_eq!(&metadata, batch.schema().metadata()); + assert_eq!(batch.schema().metadata(), &metadata); } #[test] diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index e0aa3ecf855a..ac027554b250 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -1341,18 +1341,10 @@ mod tests { let nanos_tz = Some("Africa/Johannesburg".into()); let schema = Schema::new(vec![ - Field::new("bools-with-metadata-map", DataType::Boolean, true).with_metadata( - [("k".to_string(), "v".to_string())] - .iter() - .cloned() - .collect(), - ), - Field::new("bools-with-metadata-vec", DataType::Boolean, true).with_metadata( - [("k2".to_string(), "v2".to_string())] - .iter() - .cloned() - .collect(), - ), + Field::new("bools-with-metadata-map", DataType::Boolean, true) + .with_metadata([("k", "v")]), + Field::new("bools-with-metadata-vec", DataType::Boolean, true) + .with_metadata([("k2", "v2")]), Field::new("bools", DataType::Boolean, true), Field::new("int8s", DataType::Int8, true), Field::new("int16s", DataType::Int16, true), diff --git a/arrow-integration-test/src/schema.rs b/arrow-integration-test/src/schema.rs index 7777c48c1f4b..8e0dd730d9d6 100644 --- a/arrow-integration-test/src/schema.rs +++ b/arrow-integration-test/src/schema.rs @@ -22,9 +22,14 @@ use std::collections::HashMap; /// Generate a JSON representation of the `Schema`. pub fn schema_to_json(schema: &Schema) -> serde_json::Value { + let metadata: serde_json::Map = schema + .metadata() + .iter() + .map(|(k, v)| (k.clone(), serde_json::Value::String(v.clone()))) + .collect(); serde_json::json!({ "fields": schema.fields().iter().map(|f| field_to_json(f.as_ref())).collect::>(), - "metadata": serde_json::to_value(schema.metadata()).unwrap() + "metadata": metadata }) } diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index 16e61deadb0f..5b65c81a94da 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -131,14 +131,12 @@ impl<'a> IpcSchemaEncoder<'a> { /// Push a key-value metadata into a FlatBufferBuilder and return [WIPOffset] pub fn metadata_to_fb<'a>( fbb: &mut FlatBufferBuilder<'a>, - metadata: &HashMap, + metadata: &Metadata, ) -> WIPOffset>>> { - let mut ordered_keys = metadata.keys().collect::>(); - ordered_keys.sort(); - let custom_metadata = ordered_keys - .into_iter() - .map(|k| { - let v = metadata.get(k).unwrap(); + // `Metadata` iterates in deterministic (sorted) key order + let custom_metadata = metadata + .iter() + .map(|(k, v)| { let fb_key_name = fbb.create_string(k); let fb_val_name = fbb.create_string(v); diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 46e2dd7739e0..c18075df3c58 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1082,7 +1082,7 @@ pub struct FileWriter { /// Keeps track of dictionaries that have been written dictionary_tracker: DictionaryTracker, /// User level customized metadata - custom_metadata: HashMap, + custom_metadata: Metadata, data_gen: IpcDataGenerator, @@ -1146,7 +1146,7 @@ impl FileWriter { record_blocks: vec![], finished: false, dictionary_tracker, - custom_metadata: HashMap::new(), + custom_metadata: Default::default(), data_gen, compression_context: CompressionContext::default(), }) diff --git a/arrow-schema/src/datatype_display.rs b/arrow-schema/src/datatype_display.rs index cca7cf254fc3..203d4a1584b1 100644 --- a/arrow-schema/src/datatype_display.rs +++ b/arrow-schema/src/datatype_display.rs @@ -16,12 +16,13 @@ // under the License. use crate::DataType; +use crate::Metadata; +use std::fmt; use std::fmt::Display; -use std::{collections::HashMap, fmt}; impl Display for DataType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fn format_metadata(metadata: &HashMap) -> String { + fn format_metadata(metadata: &Metadata) -> String { format!("{}", FormatMetadata(metadata)) } @@ -183,8 +184,8 @@ impl Display for DataType { } } -/// Adapter to format a metadata HashMap consistently. -struct FormatMetadata<'a>(&'a HashMap); +/// Adapter to format [`Metadata`] consistently. +struct FormatMetadata<'a>(&'a Metadata); impl fmt::Display for FormatMetadata<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -192,10 +193,9 @@ impl fmt::Display for FormatMetadata<'_> { if metadata.is_empty() { Ok(()) } else { - let mut entries: Vec<(&String, &String)> = metadata.iter().collect(); - entries.sort_by(|a, b| a.0.cmp(b.0)); + // `Metadata` iterates in sorted key order write!(f, ", metadata: ")?; - f.debug_map().entries(entries).finish() + f.debug_map().entries(metadata.iter()).finish() } } } @@ -203,6 +203,7 @@ impl fmt::Display for FormatMetadata<'_> { #[cfg(test)] mod tests { + use std::collections::HashMap; use std::sync::Arc; use crate::Field; diff --git a/arrow-schema/src/extension/canonical/bool8.rs b/arrow-schema/src/extension/canonical/bool8.rs index 95482bc54044..17eeb240ac29 100644 --- a/arrow-schema/src/extension/canonical/bool8.rs +++ b/arrow-schema/src/extension/canonical/bool8.rs @@ -102,11 +102,8 @@ mod tests { #[test] #[should_panic(expected = "Extension type name missing")] fn missing_name() { - let field = Field::new("", DataType::Int8, false).with_metadata( - [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "".to_owned())] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::Int8, false) + .with_metadata([(EXTENSION_TYPE_METADATA_KEY, "")]); field.extension_type::(); } @@ -119,28 +116,18 @@ mod tests { #[test] #[should_panic(expected = "Bool8 extension type expects an empty string as metadata")] fn missing_metadata() { - let field = Field::new("", DataType::Int8, false).with_metadata( - [(EXTENSION_TYPE_NAME_KEY.to_owned(), Bool8::NAME.to_owned())] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::Int8, false) + .with_metadata([(EXTENSION_TYPE_NAME_KEY, Bool8::NAME)]); field.extension_type::(); } #[test] #[should_panic(expected = "Bool8 extension type expects an empty string as metadata")] fn invalid_metadata() { - let field = Field::new("", DataType::Int8, false).with_metadata( - [ - (EXTENSION_TYPE_NAME_KEY.to_owned(), Bool8::NAME.to_owned()), - ( - EXTENSION_TYPE_METADATA_KEY.to_owned(), - "non-empty".to_owned(), - ), - ] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::Int8, false).with_metadata([ + (EXTENSION_TYPE_NAME_KEY, Bool8::NAME), + (EXTENSION_TYPE_METADATA_KEY, "non-empty"), + ]); field.extension_type::(); } } diff --git a/arrow-schema/src/extension/canonical/fixed_shape_tensor.rs b/arrow-schema/src/extension/canonical/fixed_shape_tensor.rs index 5157eefe9ebb..00c856435002 100644 --- a/arrow-schema/src/extension/canonical/fixed_shape_tensor.rs +++ b/arrow-schema/src/extension/canonical/fixed_shape_tensor.rs @@ -475,14 +475,10 @@ mod tests { fn missing_name() { let field = Field::new_fixed_size_list("", Field::new("", DataType::Float32, false), 3, false) - .with_metadata( - [( - EXTENSION_TYPE_METADATA_KEY.to_owned(), - r#"{ "shape": [100, 200, 500], }"#.to_owned(), - )] - .into_iter() - .collect(), - ); + .with_metadata([( + EXTENSION_TYPE_METADATA_KEY, + r#"{ "shape": [100, 200, 500], }"#, + )]); field.extension_type::(); } @@ -505,14 +501,7 @@ mod tests { fn missing_metadata() { let field = Field::new_fixed_size_list("", Field::new("", DataType::Float32, false), 3, false) - .with_metadata( - [( - EXTENSION_TYPE_NAME_KEY.to_owned(), - FixedShapeTensor::NAME.to_owned(), - )] - .into_iter() - .collect(), - ); + .with_metadata([(EXTENSION_TYPE_NAME_KEY, FixedShapeTensor::NAME)]); field.extension_type::(); } @@ -528,20 +517,10 @@ mod tests { i32::try_from(fixed_shape_tensor.list_size()).expect("overflow"), false, ) - .with_metadata( - [ - ( - EXTENSION_TYPE_NAME_KEY.to_owned(), - FixedShapeTensor::NAME.to_owned(), - ), - ( - EXTENSION_TYPE_METADATA_KEY.to_owned(), - r#"{ "not-shape": [] }"#.to_owned(), - ), - ] - .into_iter() - .collect(), - ); + .with_metadata([ + (EXTENSION_TYPE_NAME_KEY, FixedShapeTensor::NAME), + (EXTENSION_TYPE_METADATA_KEY, r#"{ "not-shape": [] }"#), + ]); field.extension_type::(); } diff --git a/arrow-schema/src/extension/canonical/json.rs b/arrow-schema/src/extension/canonical/json.rs index 99b78fd6ef52..1188a6dda439 100644 --- a/arrow-schema/src/extension/canonical/json.rs +++ b/arrow-schema/src/extension/canonical/json.rs @@ -228,11 +228,8 @@ mod tests { #[test] #[should_panic(expected = "Extension type name missing")] fn missing_name() { - let field = Field::new("", DataType::Int8, false).with_metadata( - [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "{}".to_owned())] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::Int8, false) + .with_metadata([(EXTENSION_TYPE_METADATA_KEY, "{}")]); field.extension_type::(); } @@ -247,14 +244,10 @@ mod tests { expected = "Json extension type metadata is either an empty string or a JSON string with an empty object" )] fn invalid_metadata() { - let field = Field::new("", DataType::Utf8, false).with_metadata( - [ - (EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned()), - (EXTENSION_TYPE_METADATA_KEY.to_owned(), "1234".to_owned()), - ] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::Utf8, false).with_metadata([ + (EXTENSION_TYPE_NAME_KEY, Json::NAME), + (EXTENSION_TYPE_METADATA_KEY, "1234"), + ]); field.extension_type::(); } @@ -263,11 +256,8 @@ mod tests { expected = "Json extension type metadata is either an empty string or a JSON string with an empty object" )] fn missing_metadata() { - let field = Field::new("", DataType::LargeUtf8, false).with_metadata( - [(EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned())] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::LargeUtf8, false) + .with_metadata([(EXTENSION_TYPE_NAME_KEY, Json::NAME)]); field.extension_type::(); } } diff --git a/arrow-schema/src/extension/canonical/opaque.rs b/arrow-schema/src/extension/canonical/opaque.rs index 5a5eb1b66818..62ae062ca12e 100644 --- a/arrow-schema/src/extension/canonical/opaque.rs +++ b/arrow-schema/src/extension/canonical/opaque.rs @@ -291,25 +291,18 @@ mod tests { #[test] #[should_panic(expected = "Extension type name missing")] fn missing_name() { - let field = Field::new("", DataType::Null, false).with_metadata( - [( - EXTENSION_TYPE_METADATA_KEY.to_owned(), - r#"{ "type_name": "type", "vendor_name": "vendor" }"#.to_owned(), - )] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::Null, false).with_metadata([( + EXTENSION_TYPE_METADATA_KEY, + r#"{ "type_name": "type", "vendor_name": "vendor" }"#, + )]); field.extension_type::(); } #[test] #[should_panic(expected = "Opaque extension types requires metadata")] fn missing_metadata() { - let field = Field::new("", DataType::Null, false).with_metadata( - [(EXTENSION_TYPE_NAME_KEY.to_owned(), Opaque::NAME.to_owned())] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::Null, false) + .with_metadata([(EXTENSION_TYPE_NAME_KEY, Opaque::NAME)]); field.extension_type::(); } @@ -318,17 +311,13 @@ mod tests { expected = "Opaque metadata deserialization failed: missing field `vendor_name`" )] fn invalid_metadata() { - let field = Field::new("", DataType::Null, false).with_metadata( - [ - (EXTENSION_TYPE_NAME_KEY.to_owned(), Opaque::NAME.to_owned()), - ( - EXTENSION_TYPE_METADATA_KEY.to_owned(), - r#"{ "type_name": "no-vendor" }"#.to_owned(), - ), - ] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::Null, false).with_metadata([ + (EXTENSION_TYPE_NAME_KEY, Opaque::NAME), + ( + EXTENSION_TYPE_METADATA_KEY, + r#"{ "type_name": "no-vendor" }"#, + ), + ]); field.extension_type::(); } } diff --git a/arrow-schema/src/extension/canonical/timestamp_with_offset.rs b/arrow-schema/src/extension/canonical/timestamp_with_offset.rs index 06ea98b3a0a1..84b49564aee6 100644 --- a/arrow-schema/src/extension/canonical/timestamp_with_offset.rs +++ b/arrow-schema/src/extension/canonical/timestamp_with_offset.rs @@ -307,7 +307,7 @@ mod tests { #[should_panic(expected = "Extension type name missing")] fn missing_name() { let field = make_valid_field_primitive(TimeUnit::Second) - .with_metadata([(EXTENSION_TYPE_METADATA_KEY.to_owned(), "".to_owned())].into()); + .with_metadata([(EXTENSION_TYPE_METADATA_KEY, "")]); field.extension_type::(); } @@ -509,28 +509,17 @@ mod tests { #[test] fn no_metadata() { - let field = make_valid_field_primitive(TimeUnit::Second).with_metadata( - [( - EXTENSION_TYPE_NAME_KEY.to_owned(), - TimestampWithOffset::NAME.to_owned(), - )] - .into(), - ); + let field = make_valid_field_primitive(TimeUnit::Second) + .with_metadata([(EXTENSION_TYPE_NAME_KEY, TimestampWithOffset::NAME)]); field.extension_type::(); } #[test] fn empty_metadata() { - let field = make_valid_field_primitive(TimeUnit::Second).with_metadata( - [ - ( - EXTENSION_TYPE_NAME_KEY.to_owned(), - TimestampWithOffset::NAME.to_owned(), - ), - (EXTENSION_TYPE_METADATA_KEY.to_owned(), String::new()), - ] - .into(), - ); + let field = make_valid_field_primitive(TimeUnit::Second).with_metadata([ + (EXTENSION_TYPE_NAME_KEY, TimestampWithOffset::NAME), + (EXTENSION_TYPE_METADATA_KEY, ""), + ]); field.extension_type::(); } } diff --git a/arrow-schema/src/extension/canonical/uuid.rs b/arrow-schema/src/extension/canonical/uuid.rs index 7a9f6e6e538e..16e7de438092 100644 --- a/arrow-schema/src/extension/canonical/uuid.rs +++ b/arrow-schema/src/extension/canonical/uuid.rs @@ -123,30 +123,19 @@ mod tests { #[test] #[should_panic(expected = "Uuid extension type expects no metadata")] fn with_metadata() { - let field = Field::new("", DataType::FixedSizeBinary(16), false).with_metadata( - [ - (EXTENSION_TYPE_NAME_KEY.to_owned(), Uuid::NAME.to_owned()), - ( - EXTENSION_TYPE_METADATA_KEY.to_owned(), - "unexpected".to_owned(), - ), - ] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::FixedSizeBinary(16), false).with_metadata([ + (EXTENSION_TYPE_NAME_KEY, Uuid::NAME), + (EXTENSION_TYPE_METADATA_KEY, "unexpected"), + ]); field.extension_type::(); } #[test] fn empty_metadata_string_is_treated_as_none() -> Result<(), ArrowError> { - let field = Field::new("", DataType::FixedSizeBinary(16), false).with_metadata( - [ - (EXTENSION_TYPE_NAME_KEY.to_owned(), Uuid::NAME.to_owned()), - (EXTENSION_TYPE_METADATA_KEY.to_owned(), "".to_owned()), - ] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::FixedSizeBinary(16), false).with_metadata([ + (EXTENSION_TYPE_NAME_KEY, Uuid::NAME), + (EXTENSION_TYPE_METADATA_KEY, ""), + ]); field.try_extension_type::()?; Ok(()) } diff --git a/arrow-schema/src/extension/canonical/variable_shape_tensor.rs b/arrow-schema/src/extension/canonical/variable_shape_tensor.rs index fbc641f54366..762790a37988 100644 --- a/arrow-schema/src/extension/canonical/variable_shape_tensor.rs +++ b/arrow-schema/src/extension/canonical/variable_shape_tensor.rs @@ -548,11 +548,7 @@ mod tests { ], false, ) - .with_metadata( - [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "{}".to_owned())] - .into_iter() - .collect(), - ); + .with_metadata([(EXTENSION_TYPE_METADATA_KEY, "{}")]); field.extension_type::(); } @@ -601,14 +597,7 @@ mod tests { ], false, ) - .with_metadata( - [( - EXTENSION_TYPE_NAME_KEY.to_owned(), - VariableShapeTensor::NAME.to_owned(), - )] - .into_iter() - .collect(), - ); + .with_metadata([(EXTENSION_TYPE_NAME_KEY, VariableShapeTensor::NAME)]); field.extension_type::(); } @@ -632,20 +621,13 @@ mod tests { ], false, ) - .with_metadata( - [ - ( - EXTENSION_TYPE_NAME_KEY.to_owned(), - VariableShapeTensor::NAME.to_owned(), - ), - ( - EXTENSION_TYPE_METADATA_KEY.to_owned(), - r#"{ "dim_names": [1, null, 3, 4] }"#.to_owned(), - ), - ] - .into_iter() - .collect(), - ); + .with_metadata([ + (EXTENSION_TYPE_NAME_KEY, VariableShapeTensor::NAME), + ( + EXTENSION_TYPE_METADATA_KEY, + r#"{ "dim_names": [1, null, 3, 4] }"#, + ), + ]); field.extension_type::(); } diff --git a/arrow-schema/src/extension/mod.rs b/arrow-schema/src/extension/mod.rs index 3dd1f8d3547f..58bbb70497ca 100644 --- a/arrow-schema/src/extension/mod.rs +++ b/arrow-schema/src/extension/mod.rs @@ -23,7 +23,6 @@ mod canonical; pub use canonical::*; use crate::{ArrowError, DataType}; -use std::collections::HashMap; /// The metadata key for the string name identifying an [`ExtensionType`]. pub const EXTENSION_TYPE_NAME_KEY: &str = "ARROW:extension:name"; @@ -283,7 +282,7 @@ pub trait ExtensionType: Sized { /// [`Field`]: crate::Field fn try_new_from_field_metadata( data_type: &DataType, - metadata: &HashMap, + metadata: &crate::Metadata, ) -> Result { // Check the extension name in the metadata match metadata.get(EXTENSION_TYPE_NAME_KEY).map(|s| s.as_str()) { diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index c2df07376f84..0cd97645b3a2 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -929,7 +929,7 @@ mod tests { Field::new("address", DataType::Utf8, false), Field::new("priority", DataType::UInt8, false), ]) - .with_metadata([("hello".to_string(), "world".to_string())].into()); + .with_metadata([("hello", "world")]); round_trip_schema(schema); diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 0d12728ca23b..18d35d4c7c26 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -17,7 +17,6 @@ use crate::error::ArrowError; use std::cmp::Ordering; -use std::collections::HashMap; use std::hash::{Hash, Hasher}; use std::sync::Arc; @@ -26,7 +25,7 @@ use crate::datatype::DataType; use crate::extension::CanonicalExtensionType; use crate::schema::SchemaBuilder; use crate::{ - Fields, UnionFields, UnionMode, + Fields, Metadata, UnionFields, UnionMode, extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY, ExtensionType}, }; @@ -57,7 +56,7 @@ pub struct Field { dict_id: i64, dict_is_ordered: bool, /// A map of key-value pairs containing additional custom meta data. - metadata: HashMap, + metadata: Metadata, } impl std::fmt::Debug for Field { @@ -128,31 +127,7 @@ impl Ord for Field { .cmp(other.name()) .then_with(|| self.data_type.cmp(other.data_type())) .then_with(|| self.nullable.cmp(&other.nullable)) - .then_with(|| { - // ensure deterministic key order - let mut keys: Vec<&String> = - self.metadata.keys().chain(other.metadata.keys()).collect(); - keys.sort(); - for k in keys { - match (self.metadata.get(k), other.metadata.get(k)) { - (None, None) => {} - (Some(_), None) => { - return Ordering::Less; - } - (None, Some(_)) => { - return Ordering::Greater; - } - (Some(v1), Some(v2)) => match v1.cmp(v2) { - Ordering::Equal => {} - other => { - return other; - } - }, - } - } - - Ordering::Equal - }) + .then_with(|| self.metadata.cmp(&other.metadata)) } } @@ -161,14 +136,8 @@ impl Hash for Field { self.name.hash(state); self.data_type.hash(state); self.nullable.hash(state); - - // ensure deterministic key order - let mut keys: Vec<&String> = self.metadata.keys().collect(); - keys.sort(); - for k in keys { - k.hash(state); - self.metadata.get(k).expect("key valid").hash(state); - } + // `Metadata` iterates in deterministic (sorted) key order + self.metadata.hash(state); } } @@ -197,7 +166,7 @@ impl Field { nullable, dict_id: 0, dict_is_ordered: false, - metadata: HashMap::default(), + metadata: Default::default(), } } @@ -238,7 +207,7 @@ impl Field { nullable, dict_id, dict_is_ordered, - metadata: HashMap::default(), + metadata: Default::default(), } } @@ -368,25 +337,25 @@ impl Field { /// Sets the `Field`'s optional custom metadata. #[inline] - pub fn set_metadata(&mut self, metadata: HashMap) { - self.metadata = metadata; + pub fn set_metadata(&mut self, metadata: impl Into) { + self.metadata = metadata.into(); } /// Sets the metadata of this `Field` to be `metadata` and returns self - pub fn with_metadata(mut self, metadata: HashMap) -> Self { + pub fn with_metadata(mut self, metadata: impl Into) -> Self { self.set_metadata(metadata); self } /// Returns the immutable reference to the `Field`'s optional custom metadata. #[inline] - pub const fn metadata(&self) -> &HashMap { + pub const fn metadata(&self) -> &Metadata { &self.metadata } /// Returns a mutable reference to the `Field`'s optional custom metadata. #[inline] - pub fn metadata_mut(&mut self) -> &mut HashMap { + pub fn metadata_mut(&mut self) -> &mut Metadata { &mut self.metadata } @@ -464,11 +433,8 @@ impl Field { /// let field = Field::new("", DataType::Null, false); /// assert_eq!(field.extension_type_name(), None); /// - /// let field = Field::new("", DataType::Null, false).with_metadata( - /// [(EXTENSION_TYPE_NAME_KEY.to_owned(), "example".to_owned())] - /// .into_iter() - /// .collect(), - /// ); + /// let field = Field::new("", DataType::Null, false) + /// .with_metadata([(EXTENSION_TYPE_NAME_KEY, "example")]); /// assert_eq!(field.extension_type_name(), Some("example")); /// ``` pub fn extension_type_name(&self) -> Option<&str> { @@ -491,11 +457,8 @@ impl Field { /// let field = Field::new("", DataType::Null, false); /// assert_eq!(field.extension_type_metadata(), None); /// - /// let field = Field::new("", DataType::Null, false).with_metadata( - /// [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "example".to_owned())] - /// .into_iter() - /// .collect(), - /// ); + /// let field = Field::new("", DataType::Null, false) + /// .with_metadata([(EXTENSION_TYPE_METADATA_KEY, "example")]); /// assert_eq!(field.extension_type_metadata(), Some("example")); /// ``` pub fn extension_type_metadata(&self) -> Option<&str> { @@ -963,7 +926,7 @@ impl Field { std::mem::size_of_val(self) - std::mem::size_of_val(&self.data_type) + self.data_type.size() + self.name.capacity() - + (std::mem::size_of::<(String, String)>() * self.metadata.capacity()) + + (std::mem::size_of::<(String, String)>() * self.metadata.len()) + self .metadata .iter() @@ -1009,6 +972,7 @@ impl std::fmt::Display for Field { #[cfg(test)] mod test { use super::*; + use std::collections::HashMap; use std::collections::hash_map::DefaultHasher; #[derive(Debug, Clone, Copy)] @@ -1048,40 +1012,21 @@ mod test { let no_extension = Field::new("f", DataType::Null, false); assert!(!no_extension.has_valid_extension_type::()); - let matching_name = Field::new("f", DataType::Null, false).with_metadata( - [( - EXTENSION_TYPE_NAME_KEY.to_owned(), - TestExtensionType::NAME.to_owned(), - )] - .into_iter() - .collect(), - ); + let matching_name = Field::new("f", DataType::Null, false) + .with_metadata([(EXTENSION_TYPE_NAME_KEY, TestExtensionType::NAME)]); assert!(matching_name.has_valid_extension_type::()); let matching_name_with_invalid_metadata = Field::new("f", DataType::Null, false) - .with_metadata( - [ - ( - EXTENSION_TYPE_NAME_KEY.to_owned(), - TestExtensionType::NAME.to_owned(), - ), - (EXTENSION_TYPE_METADATA_KEY.to_owned(), "invalid".to_owned()), - ] - .into_iter() - .collect(), - ); + .with_metadata([ + (EXTENSION_TYPE_NAME_KEY, TestExtensionType::NAME), + (EXTENSION_TYPE_METADATA_KEY, "invalid"), + ]); assert!( !matching_name_with_invalid_metadata.has_valid_extension_type::() ); - let different_name = Field::new("f", DataType::Null, false).with_metadata( - [( - EXTENSION_TYPE_NAME_KEY.to_owned(), - "some.other_extension".to_owned(), - )] - .into_iter() - .collect(), - ); + let different_name = Field::new("f", DataType::Null, false) + .with_metadata([(EXTENSION_TYPE_NAME_KEY, "some.other_extension")]); assert!(!different_name.has_valid_extension_type::()); } diff --git a/arrow-schema/src/lib.rs b/arrow-schema/src/lib.rs index 1eeeb4d106fb..4f6771a1c953 100644 --- a/arrow-schema/src/lib.rs +++ b/arrow-schema/src/lib.rs @@ -37,6 +37,8 @@ mod field; pub use field::*; mod fields; pub use fields::*; +mod metadata; +pub use metadata::*; mod schema; pub use schema::*; use std::ops; diff --git a/arrow-schema/src/metadata.rs b/arrow-schema/src/metadata.rs new file mode 100644 index 000000000000..7b9ec707c747 --- /dev/null +++ b/arrow-schema/src/metadata.rs @@ -0,0 +1,473 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::{BTreeMap, HashMap, btree_map}; +use std::fmt; +use std::ops::Index; +use std::sync::Arc; + +/// A cheaply clonable map of key-value metadata, used by +/// [`Field`](crate::Field) and [`Schema`](crate::Schema). +/// +/// Cloning a `Metadata` is always cheap, as the underlying map is +/// reference-counted. Mutating a shared `Metadata` (e.g. via +/// [`Metadata::insert`]) will clone the underlying map if (and only if) +/// it is shared (copy-on-write). +/// +/// The entries are stored in a [`BTreeMap`], so iteration order is +/// deterministic (sorted by key). +/// +/// # Example +/// ``` +/// # use arrow_schema::Metadata; +/// let mut metadata = Metadata::new(); +/// metadata.insert("key", "value"); +/// +/// let clone = metadata.clone(); // cheap +/// assert_eq!(clone.get("key"), Some(&"value".to_string())); +/// +/// // Mutating one does not affect the other: +/// metadata.insert("key2", "value2"); +/// assert_eq!(metadata.len(), 2); +/// assert_eq!(clone.len(), 1); +/// ``` +#[derive(Clone, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Metadata( + // Invariant: the inner map is never empty (`None` encodes the empty map). + // This ensures the derived implementations of `PartialEq`, `Ord`, `Hash`, … + // treat an empty `Metadata` consistently, and the empty case never allocates. + // We use `BTreeMap` for deterministic iteration order. + Option>>, +); + +impl Metadata { + /// Creates an empty [`Metadata`]. + /// + /// This does not allocate. + pub const fn new() -> Self { + Self(None) + } + + /// Returns the number of entries. + pub fn len(&self) -> usize { + self.0.as_ref().map_or(0, |map| map.len()) + } + + /// Returns `true` if there are no entries. + pub fn is_empty(&self) -> bool { + self.0.is_none() + } + + /// Returns a reference to the value corresponding to the key. + pub fn get(&self, key: &str) -> Option<&String> { + self.0.as_ref()?.get(key) + } + + /// Returns `true` if the map contains a value for the specified key. + pub fn contains_key(&self, key: &str) -> bool { + self.0.as_ref().is_some_and(|map| map.contains_key(key)) + } + + /// Inserts a key-value pair, returning the old value of the key, if any. + /// + /// If the map already contained this key, the value is replaced. + /// + /// Clones the underlying map if (and only if) it is shared. + pub fn insert(&mut self, key: impl Into, value: impl Into) -> Option { + Arc::make_mut(self.0.get_or_insert_default()).insert(key.into(), value.into()) + } + + /// Inserts a key-value pair and returns `self`, for builder-style chaining. + /// + /// If the map already contained this key, the value is replaced. + /// + /// # Example + /// ``` + /// # use arrow_schema::Metadata; + /// let metadata = Metadata::new().with("a", "1").with("b", "2"); + /// assert_eq!(metadata.len(), 2); + /// ``` + #[must_use] + pub fn with(mut self, key: impl Into, value: impl Into) -> Self { + self.insert(key, value); + self + } + + /// Removes a key from the map, returning the value at the key + /// if the key was previously in the map. + /// + /// Clones the underlying map if (and only if) it is shared and contains the key. + pub fn remove(&mut self, key: &str) -> Option { + let map = self.0.as_mut()?; + if !map.contains_key(key) { + return None; + } + let removed = Arc::make_mut(map).remove(key); + if map.is_empty() { + self.0 = None; + } + removed + } + + /// Removes all entries. + pub fn clear(&mut self) { + self.0 = None; + } + + /// Returns an iterator over the entries, sorted by key. + pub fn iter(&self) -> MetadataIter<'_> { + self.0 + .as_deref() + .map(|map| map.iter()) + .into_iter() + .flatten() + } + + /// Returns an iterator over the keys, in sorted order. + pub fn keys(&self) -> impl Iterator { + self.iter().map(|(key, _)| key) + } + + /// Returns an iterator over the values, sorted by key. + pub fn values(&self) -> impl Iterator { + self.iter().map(|(_, value)| value) + } +} + +/// Iterator over the entries of a [`Metadata`], sorted by key. +pub type MetadataIter<'a> = + std::iter::Flatten>>; + +impl fmt::Debug for Metadata { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_map().entries(self.iter()).finish() + } +} + +impl Index<&str> for Metadata { + type Output = String; + + fn index(&self, key: &str) -> &String { + self.get(key) + .unwrap_or_else(|| panic!("no entry found for key {key:?}")) + } +} + +impl From> for Metadata { + fn from(map: BTreeMap) -> Self { + if map.is_empty() { + Self(None) + } else { + Self(Some(Arc::new(map))) + } + } +} + +impl From> for Metadata { + fn from(map: HashMap) -> Self { + map.into_iter().collect::>().into() + } +} + +impl From>> for Metadata { + fn from(map: Arc>) -> Self { + if map.is_empty() { + Self(None) + } else { + Self(Some(map)) + } + } +} + +impl, V: Into, const N: usize> From<[(K, V); N]> for Metadata { + fn from(entries: [(K, V); N]) -> Self { + entries.into_iter().collect() + } +} + +impl From for BTreeMap { + fn from(metadata: Metadata) -> Self { + match metadata.0 { + None => BTreeMap::new(), + Some(map) => Arc::try_unwrap(map).unwrap_or_else(|map| (*map).clone()), + } + } +} + +impl From for HashMap { + fn from(metadata: Metadata) -> Self { + metadata.into_iter().collect() + } +} + +impl From<&Metadata> for HashMap { + fn from(metadata: &Metadata) -> Self { + metadata + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect() + } +} + +impl, V: Into> FromIterator<(K, V)> for Metadata { + fn from_iter>(iter: T) -> Self { + iter.into_iter() + .map(|(k, v)| (k.into(), v.into())) + .collect::>() + .into() + } +} + +impl, V: Into> Extend<(K, V)> for Metadata { + fn extend>(&mut self, iter: T) { + let mut iter = iter + .into_iter() + .map(|(k, v)| (k.into(), v.into())) + .peekable(); + if iter.peek().is_none() { + return; // Avoid cloning a shared map for an empty iterator + } + Arc::make_mut(self.0.get_or_insert_default()).extend(iter); + } +} + +impl<'a> IntoIterator for &'a Metadata { + type Item = (&'a String, &'a String); + type IntoIter = MetadataIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +impl IntoIterator for Metadata { + type Item = (String, String); + type IntoIter = btree_map::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + BTreeMap::from(self).into_iter() + } +} + +impl PartialEq> for Metadata { + fn eq(&self, other: &HashMap) -> bool { + self.len() == other.len() + && self + .iter() + .all(|(k, v)| other.get(k).is_some_and(|other_v| v == other_v)) + } +} + +impl PartialEq> for Metadata { + fn eq(&self, other: &BTreeMap) -> bool { + self.iter().eq(other.iter()) + } +} + +#[cfg(feature = "serde")] +mod serde_impl { + use super::Metadata; + use std::collections::BTreeMap; + + impl serde_core::Serialize for Metadata { + fn serialize(&self, serializer: S) -> Result { + use serde_core::ser::SerializeMap as _; + let mut map = serializer.serialize_map(Some(self.len()))?; + for (key, value) in self { + map.serialize_entry(key, value)?; + } + map.end() + } + } + + impl<'de> serde_core::Deserialize<'de> for Metadata { + fn deserialize>( + deserializer: D, + ) -> Result { + Ok(BTreeMap::::deserialize(deserializer)?.into()) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_empty() { + let metadata = Metadata::new(); + assert!(metadata.is_empty()); + assert_eq!(metadata.len(), 0); + assert_eq!(metadata.get("key"), None); + assert!(!metadata.contains_key("key")); + assert_eq!(metadata.iter().count(), 0); + assert_eq!(metadata, Metadata::default()); + + // Empty maps don't allocate: + assert!(Metadata::from(HashMap::new()).0.is_none()); + assert!(Metadata::from(BTreeMap::new()).0.is_none()); + assert!( + std::iter::empty::<(String, String)>() + .collect::() + .0 + .is_none() + ); + } + + #[test] + fn test_insert_get_remove() { + let mut metadata = Metadata::new(); + assert_eq!(metadata.insert("a", "1"), None); + assert_eq!(metadata.insert("a", "2"), Some("1".to_string())); + assert_eq!(metadata.insert("b", "3"), None); + + assert_eq!(metadata.len(), 2); + assert_eq!(metadata.get("a"), Some(&"2".to_string())); + assert_eq!(metadata["b"], "3"); + assert!(metadata.contains_key("a")); + assert!(!metadata.contains_key("c")); + + assert_eq!(metadata.remove("c"), None); + assert_eq!(metadata.remove("a"), Some("2".to_string())); + assert_eq!(metadata.remove("b"), Some("3".to_string())); + + // Invariant: removing the last entry restores the unallocated state + assert!(metadata.is_empty()); + assert!(metadata.0.is_none()); + } + + #[test] + fn test_clear() { + let mut metadata = Metadata::from([("a", "1")]); + metadata.clear(); + assert!(metadata.is_empty()); + assert!(metadata.0.is_none()); + } + + #[test] + #[should_panic(expected = "no entry found for key \"missing\"")] + fn test_index_panics() { + let metadata = Metadata::from([("a", "1")]); + let _ = &metadata["missing"]; + } + + #[test] + fn test_copy_on_write() { + let mut metadata = Metadata::from([("a", "1")]); + let clone = metadata.clone(); + + // Cloning is shallow: + let arc = metadata.0.as_ref().expect("non-empty"); + assert!(Arc::ptr_eq(arc, clone.0.as_ref().expect("non-empty"))); + + // Mutation clones the shared map, leaving the clone untouched: + metadata.insert("b", "2"); + assert_eq!(metadata.len(), 2); + assert_eq!(clone.len(), 1); + + // Mutating an unshared map does not clone it: + let arc = metadata.0.as_ref().expect("non-empty").clone(); + metadata.insert("c", "3"); + assert!( + Arc::ptr_eq(&arc, metadata.0.as_ref().expect("non-empty")) + || Arc::strong_count(&arc) == 1 + ); + + // Removing a missing key from a shared map does not clone it: + let clone = metadata.clone(); + let arc = metadata.0.as_ref().expect("non-empty"); + assert!(Arc::ptr_eq(arc, clone.0.as_ref().expect("non-empty"))); + let mut metadata2 = metadata.clone(); + assert_eq!(metadata2.remove("missing"), None); + assert!(Arc::ptr_eq( + metadata.0.as_ref().expect("non-empty"), + metadata2.0.as_ref().expect("non-empty") + )); + } + + #[test] + fn test_deterministic_iteration_order() { + let metadata: Metadata = [("b", "2"), ("a", "1"), ("c", "3")].into_iter().collect(); + let keys: Vec<&String> = metadata.keys().collect(); + assert_eq!(keys, ["a", "b", "c"]); + let values: Vec<&String> = metadata.values().collect(); + assert_eq!(values, ["1", "2", "3"]); + } + + #[test] + fn test_eq_with_std_maps() { + let metadata = Metadata::from([("a", "1"), ("b", "2")]); + + let hash_map: HashMap = metadata.clone().into(); + assert_eq!(metadata, hash_map); + + let btree_map: BTreeMap = metadata.clone().into(); + assert_eq!(metadata, btree_map); + + let mut different = hash_map.clone(); + different.insert("c".to_string(), "3".to_string()); + assert_ne!(metadata, different); + + let mut different = hash_map; + different.insert("a".to_string(), "other".to_string()); + assert_ne!(metadata, different); + } + + #[test] + fn test_extend() { + let mut metadata = Metadata::new(); + let clone = metadata.clone(); + metadata.extend(std::iter::empty::<(String, String)>()); + assert!(metadata.0.is_none()); // No allocation for an empty extend + + metadata.extend([("a", "1"), ("b", "2")]); + assert_eq!(metadata.len(), 2); + assert!(clone.is_empty()); + } + + #[test] + fn test_debug() { + let metadata = Metadata::from([("b", "2"), ("a", "1")]); + assert_eq!(format!("{metadata:?}"), r#"{"a": "1", "b": "2"}"#); + assert_eq!(format!("{:?}", Metadata::new()), "{}"); + } + + #[test] + #[cfg(feature = "serde")] + fn test_serde_round_trip() { + for metadata in [Metadata::new(), Metadata::from([("a", "1"), ("b", "2")])] { + let serialized = postcard::to_stdvec(&metadata).expect("serialize"); + let deserialized: Metadata = postcard::from_bytes(&serialized).expect("deserialize"); + assert_eq!(metadata, deserialized); + } + } + + #[test] + #[cfg(feature = "serde")] + fn test_serde_matches_std_map_format() { + // `Metadata` must serialize exactly like the maps it replaced + let metadata = Metadata::from([("a", "1"), ("b", "2")]); + let as_btree_map: BTreeMap = metadata.clone().into(); + + let serialized = postcard::to_stdvec(&metadata).expect("serialize"); + let map_serialized = postcard::to_stdvec(&as_btree_map).expect("serialize"); + assert_eq!(serialized, map_serialized); + + let deserialized: Metadata = postcard::from_bytes(&map_serialized).expect("deserialize"); + assert_eq!(metadata, deserialized); + } +} diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 0c7db39dfbb5..aa9ef877b183 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -15,20 +15,19 @@ // specific language governing permissions and limitations // under the License. -use std::collections::HashMap; use std::fmt; use std::hash::Hash; use std::sync::Arc; use crate::error::ArrowError; use crate::field::Field; -use crate::{DataType, FieldRef, Fields}; +use crate::{DataType, FieldRef, Fields, Metadata}; /// A builder to facilitate building a [`Schema`] from iteratively from [`FieldRef`] #[derive(Debug, Default)] pub struct SchemaBuilder { fields: Vec, - metadata: HashMap, + metadata: Metadata, } impl SchemaBuilder { @@ -78,12 +77,12 @@ impl SchemaBuilder { } /// Returns an immutable reference to the Map of custom metadata key-value pairs. - pub fn metadata(&mut self) -> &HashMap { + pub fn metadata(&mut self) -> &Metadata { &self.metadata } /// Returns a mutable reference to the Map of custom metadata key-value pairs. - pub fn metadata_mut(&mut self) -> &mut HashMap { + pub fn metadata_mut(&mut self) -> &mut Metadata { &mut self.metadata } @@ -182,13 +181,13 @@ pub type SchemaRef = Arc; /// /// Note that this information is only part of the meta-data and not part of the physical /// memory layout. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Schema { /// A sequence of fields that describe the schema. pub fields: Fields, /// A map of key-value pairs containing additional metadata. - pub metadata: HashMap, + pub metadata: Metadata, } impl Schema { @@ -196,7 +195,7 @@ impl Schema { pub fn empty() -> Self { Self { fields: Default::default(), - metadata: HashMap::new(), + metadata: Default::default(), } } @@ -212,7 +211,7 @@ impl Schema { /// let schema = Schema::new(vec![field_a, field_b]); /// ``` pub fn new(fields: impl Into) -> Self { - Self::new_with_metadata(fields, HashMap::new()) + Self::new_with_metadata(fields, Metadata::new()) } /// Creates a new [`Schema`] from a sequence of [`Field`] values @@ -222,27 +221,22 @@ impl Schema { /// /// ``` /// # use arrow_schema::*; - /// # use std::collections::HashMap; - /// /// let field_a = Field::new("a", DataType::Int64, false); /// let field_b = Field::new("b", DataType::Boolean, false); /// - /// let mut metadata: HashMap = HashMap::new(); - /// metadata.insert("row_count".to_string(), "100".to_string()); - /// - /// let schema = Schema::new_with_metadata(vec![field_a, field_b], metadata); + /// let schema = Schema::new_with_metadata(vec![field_a, field_b], [("row_count", "100")]); /// ``` #[inline] - pub fn new_with_metadata(fields: impl Into, metadata: HashMap) -> Self { + pub fn new_with_metadata(fields: impl Into, metadata: impl Into) -> Self { Self { fields: fields.into(), - metadata, + metadata: metadata.into(), } } /// Sets the metadata of this `Schema` to be `metadata` and returns self - pub fn with_metadata(mut self, metadata: HashMap) -> Self { - self.metadata = metadata; + pub fn with_metadata(mut self, metadata: impl Into) -> Self { + self.metadata = metadata.into(); self } @@ -293,7 +287,7 @@ impl Schema { /// ); /// ``` pub fn try_merge(schemas: impl IntoIterator) -> Result { - let mut out_meta = HashMap::new(); + let mut out_meta = Metadata::new(); let mut out_fields = SchemaBuilder::new(); for schema in schemas { let Schema { metadata, fields } = schema; @@ -407,7 +401,7 @@ impl Schema { /// Returns an immutable reference to the Map of custom metadata key-value pairs. #[inline] - pub const fn metadata(&self) -> &HashMap { + pub const fn metadata(&self) -> &Metadata { &self.metadata } @@ -530,22 +524,6 @@ impl fmt::Display for Schema { } } -// need to implement `Hash` manually because `HashMap` implement Eq but no `Hash` -#[allow(clippy::derived_hash_with_manual_eq)] -impl Hash for Schema { - fn hash(&self, state: &mut H) { - self.fields.hash(state); - - // ensure deterministic key order - let mut keys: Vec<&String> = self.metadata.keys().collect(); - keys.sort(); - for k in keys { - k.hash(state); - self.metadata.get(k).expect("key valid").hash(state); - } - } -} - impl AsRef for Schema { fn as_ref(&self) -> &Schema { self @@ -556,6 +534,7 @@ impl AsRef for Schema { mod tests { use crate::datatype::DataType; use crate::{TimeUnit, UnionMode}; + use std::collections::HashMap; use super::*; @@ -594,8 +573,7 @@ mod tests { assert_eq!(schema, de_schema); // ser/de with non-empty metadata - let schema = - schema.with_metadata([("key".to_owned(), "val".to_owned())].into_iter().collect()); + let schema = schema.with_metadata([("key", "val")]); let json = serde_json::to_string(&schema).unwrap(); let de_schema = serde_json::from_str(&json).unwrap(); @@ -705,12 +683,7 @@ mod tests { assert_ne!(schema2, schema4); assert_ne!(schema3, schema4); - let f = Field::new("c1", DataType::Utf8, false).with_metadata( - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect(), - ); + let f = Field::new("c1", DataType::Utf8, false).with_metadata([("foo", "bar")]); let schema5 = Schema::new(vec![ f, Field::new("c2", DataType::Float64, true), @@ -1275,49 +1248,24 @@ mod tests { assert_eq!(f1.metadata(), f2.metadata()); // 3. Some + Some - let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata( - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect(), - ); - let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata( - [("foo2".to_string(), "bar2".to_string())] - .iter() - .cloned() - .collect(), - ); + let mut f1 = + Field::new("first_name", DataType::Utf8, false).with_metadata([("foo", "bar")]); + let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata([("foo2", "bar2")]); assert!(f1.try_merge(&f2).is_ok()); assert!(!f1.metadata().is_empty()); assert_eq!( - f1.metadata().clone(), - [ - ("foo".to_string(), "bar".to_string()), - ("foo2".to_string(), "bar2".to_string()) - ] - .iter() - .cloned() - .collect() + f1.metadata(), + &Metadata::from([("foo", "bar"), ("foo2", "bar2")]) ); // 4. Some + None. - let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata( - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect(), - ); + let mut f1 = + Field::new("first_name", DataType::Utf8, false).with_metadata([("foo", "bar")]); let f2 = Field::new("first_name", DataType::Utf8, false); assert!(f1.try_merge(&f2).is_ok()); assert!(!f1.metadata().is_empty()); - assert_eq!( - f1.metadata().clone(), - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect() - ); + assert_eq!(f1.metadata(), &Metadata::from([("foo", "bar")])); // 5. None + None. let mut f1 = Field::new("first_name", DataType::Utf8, false); @@ -1491,12 +1439,12 @@ mod tests { #[test] fn test_schema_builder_metadata() { - let mut metadata = HashMap::with_capacity(1); + let mut metadata: HashMap = HashMap::with_capacity(1); metadata.insert("key".to_string(), "value".to_string()); let fields = vec![Field::new("test", DataType::Int8, true)]; let mut builder: SchemaBuilder = Schema::new(fields).with_metadata(metadata).into(); - builder.metadata_mut().insert("k".into(), "v".into()); + builder.metadata_mut().insert("k", "v"); let out = builder.finish(); assert_eq!(out.metadata.len(), 2); assert_eq!(out.metadata["k"], "v"); diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index 4286128a76e1..451fdf025155 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -27,6 +27,6 @@ pub use arrow_array::{ArrowNativeTypeOp, ArrowNumericType, ArrowPrimitiveType}; pub use arrow_buffer::{ArrowNativeType, ToByteSlice, i256}; pub use arrow_data::decimal::*; pub use arrow_schema::{ - DataType, Field, FieldRef, Fields, IntervalUnit, Schema, SchemaBuilder, SchemaRef, TimeUnit, - UnionFields, UnionMode, + DataType, Field, FieldRef, Fields, IntervalUnit, Metadata, Schema, SchemaBuilder, SchemaRef, + TimeUnit, UnionFields, UnionMode, }; diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 12c3e192cdfc..75c59e5b2cce 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -3951,9 +3951,7 @@ pub(crate) mod tests { let schema_without_metadata = Arc::new(Schema::new(vec![field.clone()])); - let metadata = [("key".to_string(), "value".to_string())] - .into_iter() - .collect(); + let metadata = arrow_schema::Metadata::from([("key".to_string(), "value".to_string())]); let schema_with_metadata = Arc::new(Schema::new(vec![field.with_metadata(metadata)])); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 21650fe26eac..472d2da83f29 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -4653,11 +4653,7 @@ mod tests { #[test] fn test_arrow_writer_metadata() { let batch_schema = Schema::new(vec![Field::new("int32", DataType::Int32, false)]); - let file_schema = batch_schema.clone().with_metadata( - vec![("foo".to_string(), "bar".to_string())] - .into_iter() - .collect(), - ); + let file_schema = batch_schema.clone().with_metadata([("foo", "bar")]); let batch = RecordBatch::try_new( Arc::new(batch_schema), diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 161b26302825..99dc3d4dc762 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -493,10 +493,9 @@ impl Visitor { .with_nullable(false), ); let value_field = Arc::new(convert_field(map_value, &value, arrow_value, true)?); - let field_metadata = match arrow_map { - Some(field) => field.metadata().clone(), - _ => HashMap::default(), - }; + let field_metadata = arrow_map + .map(|field| field.metadata().clone()) + .unwrap_or_default(); let map_field = Field::new_struct( map_key_value.name(), diff --git a/parquet/src/arrow/schema/virtual_type.rs b/parquet/src/arrow/schema/virtual_type.rs index f66352ae1981..dc9e191e8f4b 100644 --- a/parquet/src/arrow/schema/virtual_type.rs +++ b/parquet/src/arrow/schema/virtual_type.rs @@ -153,11 +153,8 @@ mod tests { #[test] #[should_panic(expected = "Extension type name missing")] fn row_number_missing_name() { - let field = Field::new("", DataType::Int64, false).with_metadata( - [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "".to_owned())] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::Int64, false) + .with_metadata([(EXTENSION_TYPE_METADATA_KEY, "")]); field.extension_type::(); } @@ -170,34 +167,18 @@ mod tests { #[test] #[should_panic(expected = "Virtual column extension type expects an empty string as metadata")] fn row_number_missing_metadata() { - let field = Field::new("", DataType::Int64, false).with_metadata( - [( - EXTENSION_TYPE_NAME_KEY.to_owned(), - RowNumber::NAME.to_owned(), - )] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::Int64, false) + .with_metadata([(EXTENSION_TYPE_NAME_KEY, RowNumber::NAME)]); field.extension_type::(); } #[test] #[should_panic(expected = "Virtual column extension type expects an empty string as metadata")] fn row_number_invalid_metadata() { - let field = Field::new("", DataType::Int64, false).with_metadata( - [ - ( - EXTENSION_TYPE_NAME_KEY.to_owned(), - RowNumber::NAME.to_owned(), - ), - ( - EXTENSION_TYPE_METADATA_KEY.to_owned(), - "non-empty".to_owned(), - ), - ] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::Int64, false).with_metadata([ + (EXTENSION_TYPE_NAME_KEY, RowNumber::NAME), + (EXTENSION_TYPE_METADATA_KEY, "non-empty"), + ]); field.extension_type::(); } @@ -213,11 +194,8 @@ mod tests { #[test] #[should_panic(expected = "Extension type name missing")] fn row_group_index_missing_name() { - let field = Field::new("", DataType::Int64, false).with_metadata( - [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "".to_owned())] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::Int64, false) + .with_metadata([(EXTENSION_TYPE_METADATA_KEY, "")]); field.extension_type::(); } @@ -230,34 +208,18 @@ mod tests { #[test] #[should_panic(expected = "Virtual column extension type expects an empty string as metadata")] fn row_group_index_missing_metadata() { - let field = Field::new("", DataType::Int64, false).with_metadata( - [( - EXTENSION_TYPE_NAME_KEY.to_owned(), - RowGroupIndex::NAME.to_owned(), - )] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::Int64, false) + .with_metadata([(EXTENSION_TYPE_NAME_KEY, RowGroupIndex::NAME)]); field.extension_type::(); } #[test] #[should_panic(expected = "Virtual column extension type expects an empty string as metadata")] fn row_group_index_invalid_metadata() { - let field = Field::new("", DataType::Int64, false).with_metadata( - [ - ( - EXTENSION_TYPE_NAME_KEY.to_owned(), - RowGroupIndex::NAME.to_owned(), - ), - ( - EXTENSION_TYPE_METADATA_KEY.to_owned(), - "non-empty".to_owned(), - ), - ] - .into_iter() - .collect(), - ); + let field = Field::new("", DataType::Int64, false).with_metadata([ + (EXTENSION_TYPE_NAME_KEY, RowGroupIndex::NAME), + (EXTENSION_TYPE_METADATA_KEY, "non-empty"), + ]); field.extension_type::(); } }