From c3c71b792a76782ec6e98ab136c7ccd52028f1e6 Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Fri, 20 Dec 2024 14:20:27 -0800 Subject: [PATCH 01/20] Implement Variant encoding --- parquet-variant/pom.xml | 88 +++ .../variant/MalformedVariantException.java | 23 + .../variant/UnknownVariantTypeException.java | 39 ++ .../org/apache/parquet/variant/Variant.java | 446 ++++++++++++ .../parquet/variant/VariantBuilder.java | 631 +++++++++++++++++ .../variant/VariantDuplicateKeyException.java | 39 ++ .../variant/VariantSizeLimitException.java | 24 + .../apache/parquet/variant/VariantUtil.java | 646 ++++++++++++++++++ .../parquet/variant/TestVariantEncoding.java | 490 +++++++++++++ pom.xml | 1 + 10 files changed, 2427 insertions(+) create mode 100644 parquet-variant/pom.xml create mode 100644 parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java create mode 100644 parquet-variant/src/main/java/org/apache/parquet/variant/UnknownVariantTypeException.java create mode 100644 parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java create mode 100644 parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java create mode 100644 parquet-variant/src/main/java/org/apache/parquet/variant/VariantDuplicateKeyException.java create mode 100644 parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java create mode 100644 parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java create mode 100644 parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java diff --git a/parquet-variant/pom.xml b/parquet-variant/pom.xml new file mode 100644 index 0000000000..6bfc2ff525 --- /dev/null +++ b/parquet-variant/pom.xml @@ -0,0 +1,88 @@ + + + + org.apache.parquet + parquet + ../pom.xml + 1.16.0-SNAPSHOT + + + 4.0.0 + + parquet-variant + jar + + Apache Parquet Variant + https://parquet.apache.org + + + + + + + org.apache.parquet + parquet-jackson + ${project.version} + 
runtime + + + ${jackson.groupId} + jackson-core + ${jackson.version} + + + ${jackson.groupId} + jackson-databind + ${jackson-databind.version} + test + + + com.google.guava + guava + ${guava.version} + test + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + test + + + org.slf4j + slf4j-api + ${slf4j.version} + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + org.apache.maven.plugins + maven-shade-plugin + + + + + diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java new file mode 100644 index 0000000000..e9bff469d2 --- /dev/null +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.parquet.variant; + +/** + * An exception indicating that the Variant is malformed. 
+ */ +public class MalformedVariantException extends RuntimeException { +} diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/UnknownVariantTypeException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/UnknownVariantTypeException.java new file mode 100644 index 0000000000..2f0bd5dce6 --- /dev/null +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/UnknownVariantTypeException.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.parquet.variant; + +/** + * An exception indicating that the Variant contains an unknown type. + */ +public class UnknownVariantTypeException extends RuntimeException { + public final int typeId; + + /** + * @param typeId the type id that was unknown + */ + public UnknownVariantTypeException(int typeId) { + super("Unknown type in Variant. 
id: " + typeId); + this.typeId = typeId; + } + + /** + * @return the type id that was unknown + */ + public int getTypeId() { + return typeId; + } +} diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java new file mode 100644 index 0000000000..4fcdb6b0e5 --- /dev/null +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -0,0 +1,446 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.variant; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonGenerator; + +import java.io.CharArrayWriter; +import java.io.IOException; +import java.math.BigDecimal; +import java.time.Instant; +import java.time.LocalDate; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.temporal.ChronoUnit; +import java.util.Arrays; +import java.util.Base64; +import java.util.Locale; + +import static java.time.temporal.ChronoField.*; +import static java.time.temporal.ChronoField.SECOND_OF_MINUTE; +import static org.apache.parquet.variant.VariantUtil.*; + +/** + * This Variant class holds the Variant-encoded value and metadata binary values. + */ +public final class Variant { + final byte[] value; + final byte[] metadata; + /** + * The starting index into `value` where the variant value starts. This is used to avoid copying + * the value binary when reading a sub-variant in the array/object element. + */ + final int pos; + + public Variant(byte[] value, byte[] metadata) { + this(value, metadata, 0); + } + + Variant(byte[] value, byte[] metadata, int pos) { + this.value = value; + this.metadata = metadata; + this.pos = pos; + // There is currently only one allowed version. 
+ if (metadata.length < 1 || (metadata[0] & VERSION_MASK) != VERSION) { + throw malformedVariant(); + } + } + + public byte[] getValue() { + if (pos == 0) return value; + int size = valueSize(value, pos); + checkIndex(pos + size - 1, value.length); + return Arrays.copyOfRange(value, pos, pos + size); + } + + public byte[] getMetadata() { + return metadata; + } + + /** + * @return the boolean value + */ + public boolean getBoolean() { + return VariantUtil.getBoolean(value, pos); + } + + /** + * @return the long value + */ + public long getLong() { + return VariantUtil.getLong(value, pos); + } + + /** + * @return the double value + */ + public double getDouble() { + return VariantUtil.getDouble(value, pos); + } + + /** + * @return the decimal value + */ + public BigDecimal getDecimal() { + return VariantUtil.getDecimal(value, pos); + } + + /** + * @return the float value + */ + public float getFloat() { + return VariantUtil.getFloat(value, pos); + } + + /** + * @return the binary value + */ + public byte[] getBinary() { + return VariantUtil.getBinary(value, pos); + } + + /** + * @return the string value + */ + public String getString() { + return VariantUtil.getString(value, pos); + } + + /** + * @return the type info bits from a variant value + */ + public int getTypeInfo() { + return VariantUtil.getTypeInfo(value, pos); + } + + /** + * @return the type of the variant value + */ + public Type getType() { + return VariantUtil.getType(value, pos); + } + + /** + * @return the number of object fields in the variant. `getType()` must be `Type.OBJECT`. + */ + public int objectSize() { + return handleObject(value, pos, + (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> size); + } + + // Find the field value whose key is equal to `key`. Return null if the key is not found. + // It is only legal to call it when `getType()` is `Type.OBJECT`. + + /** + * Returns the object field Variant value whose key is equal to `key`. + * Return null if the key is not found. 
`getType()` must be `Type.OBJECT`.
+   * @param key the key to look up
+   * @return the field value whose key is equal to `key`, or null if key is not found
+   */
+  public Variant getFieldByKey(String key) {
+    return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> {
+      // Use linear search for a short list. Switch to binary search when the length reaches
+      // `BINARY_SEARCH_THRESHOLD`.
+      final int BINARY_SEARCH_THRESHOLD = 32;
+      if (size < BINARY_SEARCH_THRESHOLD) {
+        for (int i = 0; i < size; ++i) {
+          // Read the i-th field id and resolve it to its key through the metadata.
+          int id = readUnsigned(value, idStart + idSize * i, idSize);
+          if (key.equals(getMetadataKey(metadata, id))) {
+            // The stored offset is relative to `dataStart`.
+            int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize);
+            return new Variant(value, metadata, dataStart + offset);
+          }
+        }
+      } else {
+        // NOTE(review): the binary search is only correct if the field list is ordered so that
+        // the keys ascend under `String.compareTo` -- confirm against the writer.
+        int low = 0;
+        int high = size - 1;
+        while (low <= high) {
+          // Use unsigned right shift to compute the middle of `low` and `high`. This is not just
+          // a performance optimization: it also properly handles the case where `low + high`
+          // overflows int.
+          int mid = (low + high) >>> 1;
+          int id = readUnsigned(value, idStart + idSize * mid, idSize);
+          int cmp = getMetadataKey(metadata, id).compareTo(key);
+          if (cmp < 0) {
+            low = mid + 1;
+          } else if (cmp > 0) {
+            high = mid - 1;
+          } else {
+            int offset = readUnsigned(value, offsetStart + offsetSize * mid, offsetSize);
+            return new Variant(value, metadata, dataStart + offset);
+          }
+        }
+      }
+      // No field with the requested key.
+      return null;
+    });
+  }
+
+  /**
+   * A field in a Variant object: the field's key paired with its Variant value.
+   */
+  public static final class ObjectField {
+    public final String key;
+    public final Variant value;
+
+    public ObjectField(String key, Variant value) {
+      this.key = key;
+      this.value = value;
+    }
+  }
+
+  /**
+   * Returns the object field at the `index` slot. Return null if `index` is out of the bound of
+   * `[0, objectSize())`. `getType()` must be `Type.OBJECT`.
+   * @param index the index of the object field to get
+   * @return the ObjectField at the `index` slot, or null if `index` is out of bounds
+   */
+  public ObjectField getFieldAtIndex(int index) {
+    return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> {
+      if (index < 0 || index >= size) return null;
+      int id = readUnsigned(value, idStart + idSize * index, idSize);
+      int offset = readUnsigned(value, offsetStart + offsetSize * index, offsetSize);
+      String key = getMetadataKey(metadata, id);
+      // The sub-variant shares the `value` and `metadata` arrays; only the start position differs.
+      Variant v = new Variant(value, metadata, dataStart + offset);
+      return new ObjectField(key, v);
+    });
+  }
+
+  /**
+   * Returns the dictionary ID for the object field at the `index` slot.
+   * `getType()` must be `Type.OBJECT`.
+   * Unlike `getFieldAtIndex`, an out-of-bounds `index` throws instead of returning null.
+   * @param index the index of the object field to get the dictionary ID for
+   * @return the dictionary ID for the object field at the `index` slot
+   * @throws MalformedVariantException if `index` is out of bounds
+   */
+  public int getDictionaryIdAtIndex(int index) {
+    return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> {
+      if (index < 0 || index >= size) {
+        throw malformedVariant();
+      }
+      return readUnsigned(value, idStart + idSize * index, idSize);
+    });
+  }
+
+  /**
+   * @return the number of array elements. `getType()` must be `Type.ARRAY`.
+   */
+  public int arraySize() {
+    return handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> size);
+  }
+
+  /**
+   * Returns the array element Variant value at the `index` slot. Returns null if `index` is
+   * out of the bound of `[0, arraySize())`. `getType()` must be `Type.ARRAY`.
+ * @param index the index of the array element to get + * @return the array element Variant at the `index` slot, or null if `index` is out of bounds + */ + public Variant getElementAtIndex(int index) { + return handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { + if (index < 0 || index >= size) return null; + int offset = readUnsigned(value, offsetStart + offsetSize * index, offsetSize); + return new Variant(value, metadata, dataStart + offset); + }); + } + + /** + * @param zoneId The ZoneId to use for formatting timestamps + * @return the JSON representation of the variant + * @throws MalformedVariantException if the variant is malformed + */ + public String toJson(ZoneId zoneId) { + return toJson(zoneId, false); + } + + /** + * @param zoneId The ZoneId to use for formatting timestamps + * @param truncateTrailingZeros Whether to truncate trailing zeros in decimal values or timestamps + * @return the JSON representation of the variant + * @throws MalformedVariantException if the variant is malformed + */ + public String toJson(ZoneId zoneId, boolean truncateTrailingZeros) { + StringBuilder sb = new StringBuilder(); + toJsonImpl(value, metadata, pos, sb, zoneId, truncateTrailingZeros); + return sb.toString(); + } + + /** + * Escapes a string so that it can be pasted into a JSON structure. For example, if `str` + * only contains a new-line character, then the result is "\n" (4 characters) + * @param str the string to escape + * @return the escaped string + */ + private static String escapeJson(String str) { + try (CharArrayWriter writer = new CharArrayWriter(); + JsonGenerator gen = new JsonFactory().createGenerator(writer)) { + gen.writeString(str); + gen.flush(); + return writer.toString(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + // A simplified and more performant version of `sb.append(escapeJson(str))`. It is used when we + // know `str` doesn't contain any special character that needs escaping. 
+ /** + * Appends a quoted string to a StringBuilder. It is used when we know `str` doesn't contain any + * special characters that needs escaping. This is more performant than + * `sb.append(escapeJson(str))`. + * @param sb the StringBuilder to append to + * @param str the string to append + */ + private static void appendQuoted(StringBuilder sb, String str) { + sb.append('"'); + sb.append(str); + sb.append('"'); + } + + /** The format for a timestamp without time zone. */ + private static final DateTimeFormatter TIMESTAMP_NTZ_FORMATTER = new DateTimeFormatterBuilder() + .append(DateTimeFormatter.ISO_LOCAL_DATE) + .appendLiteral('T') + .appendValue(HOUR_OF_DAY, 2) + .appendLiteral(':') + .appendValue(MINUTE_OF_HOUR, 2) + .optionalStart() + .appendLiteral(':') + .appendValue(SECOND_OF_MINUTE, 2) + .appendFraction(MICRO_OF_SECOND, 6, 6, true) + .toFormatter(Locale.US); + + /** The format for a timestamp with time zone. */ + private static final DateTimeFormatter TIMESTAMP_FORMATTER = new DateTimeFormatterBuilder() + .append(TIMESTAMP_NTZ_FORMATTER) + .appendOffset("+HH:MM", "+00:00") + .toFormatter(Locale.US); + + /** The format for a timestamp without time zone, truncating trailing microsecond zeros. */ + private static final DateTimeFormatter TIMESTAMP_NTZ_TRUNC_FORMATTER = + new DateTimeFormatterBuilder() + .append(DateTimeFormatter.ISO_LOCAL_DATE) + .appendLiteral('T') + .appendValue(HOUR_OF_DAY, 2) + .appendLiteral(':') + .appendValue(MINUTE_OF_HOUR, 2) + .optionalStart() + .appendLiteral(':') + .appendValue(SECOND_OF_MINUTE, 2) + .optionalStart() + .appendFraction(MICRO_OF_SECOND, 0, 6, true) + .toFormatter(Locale.US); + + /** The format for a timestamp with time zone, truncating trailing microsecond zeros. 
*/ + private static final DateTimeFormatter TIMESTAMP_TRUNC_FORMATTER = new DateTimeFormatterBuilder() + .append(TIMESTAMP_NTZ_TRUNC_FORMATTER) + .appendOffset("+HH:MM", "+00:00") + .toFormatter(Locale.US); + + private static Instant microsToInstant(long timestamp) { + return Instant.EPOCH.plus(timestamp, ChronoUnit.MICROS); + } + + private static void toJsonImpl(byte[] value, byte[] metadata, int pos, StringBuilder sb, + ZoneId zoneId, boolean truncateTrailingZeros) { + switch (VariantUtil.getType(value, pos)) { + case OBJECT: + handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + sb.append('{'); + for (int i = 0; i < size; ++i) { + int id = readUnsigned(value, idStart + idSize * i, idSize); + int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + int elementPos = dataStart + offset; + if (i != 0) sb.append(','); + sb.append(escapeJson(getMetadataKey(metadata, id))); + sb.append(':'); + toJsonImpl(value, metadata, elementPos, sb, zoneId, truncateTrailingZeros); + } + sb.append('}'); + return null; + }); + break; + case ARRAY: + handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { + sb.append('['); + for (int i = 0; i < size; ++i) { + int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + int elementPos = dataStart + offset; + if (i != 0) sb.append(','); + toJsonImpl(value, metadata, elementPos, sb, zoneId, truncateTrailingZeros); + } + sb.append(']'); + return null; + }); + break; + case NULL: + sb.append("null"); + break; + case BOOLEAN: + sb.append(VariantUtil.getBoolean(value, pos)); + break; + case LONG: + sb.append(VariantUtil.getLong(value, pos)); + break; + case STRING: + sb.append(escapeJson(VariantUtil.getString(value, pos))); + break; + case DOUBLE: + sb.append(VariantUtil.getDouble(value, pos)); + break; + case DECIMAL: + if (truncateTrailingZeros) { + sb.append(VariantUtil.getDecimal(value, pos).stripTrailingZeros().toPlainString()); + } else { + 
sb.append(VariantUtil.getDecimal(value, pos).toPlainString()); + } + break; + case DATE: + appendQuoted(sb, LocalDate.ofEpochDay((int) VariantUtil.getLong(value, pos)).toString()); + break; + case TIMESTAMP: + if (truncateTrailingZeros) { + appendQuoted(sb, TIMESTAMP_TRUNC_FORMATTER.format( + microsToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId))); + } else { + appendQuoted(sb, TIMESTAMP_FORMATTER.format( + microsToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId))); + } + break; + case TIMESTAMP_NTZ: + if (truncateTrailingZeros) { + appendQuoted(sb, TIMESTAMP_NTZ_TRUNC_FORMATTER.format( + microsToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC))); + } else { + appendQuoted(sb, TIMESTAMP_NTZ_FORMATTER.format( + microsToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC))); + } + break; + case FLOAT: + sb.append(VariantUtil.getFloat(value, pos)); + break; + case BINARY: + appendQuoted(sb, Base64.getEncoder().encodeToString(VariantUtil.getBinary(value, pos))); + break; + } + } +} diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java new file mode 100644 index 0000000000..574c8fdbde --- /dev/null +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java @@ -0,0 +1,631 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.parquet.variant; + +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.util.*; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.core.exc.InputCoercionException; + +import static org.apache.parquet.variant.VariantUtil.*; + +/** + * Builder for creating Variant value and metadata. + */ +public class VariantBuilder { + public VariantBuilder(boolean allowDuplicateKeys) { + this(allowDuplicateKeys, DEFAULT_SIZE_LIMIT); + } + + public VariantBuilder(boolean allowDuplicateKeys, int sizeLimitBytes) { + this.allowDuplicateKeys = allowDuplicateKeys; + this.sizeLimitBytes = sizeLimitBytes; + } + + /** + * Parse a JSON string as a Variant value. + * @param json the JSON string to parse + * @return the Variant value + * @throws IOException if any JSON parsing error happens + * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed + * the size limit + */ + public static Variant parseJson(String json) throws IOException { + return parseJson(json, new VariantBuilder(false)); + } + + /** + * Parse a JSON string as a Variant value. 
+ * @param json the JSON string to parse + * @param builder the VariantBuilder to use for building the Variant + * @return the Variant value + * @throws IOException if any JSON parsing error happens + * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed + * the size limit + */ + public static Variant parseJson(String json, VariantBuilder builder) throws IOException { + try (JsonParser parser = new JsonFactory().createParser(json)) { + parser.nextToken(); + return parseJson(parser, builder); + } + } + + /** + * Parse a JSON parser as a Variant value. + * @param parser the JSON parser to use + * @param builder the VariantBuilder to use for building the Variant + * @return the Variant value + * @throws IOException if any JSON parsing error happens + * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed + * the size limit + */ + public static Variant parseJson(JsonParser parser, VariantBuilder builder) + throws IOException { + builder.buildFromJsonParser(parser); + return builder.result(); + } + + /** + * @return the Variant value + * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed + * the size limit + */ + public Variant result() { + int numKeys = dictionaryKeys.size(); + // Use long to avoid overflow in accumulating lengths. + long dictionaryStringSize = 0; + for (byte[] key : dictionaryKeys) { + dictionaryStringSize += key.length; + } + // Determine the number of bytes required per offset entry. + // The largest offset is the one-past-the-end value, which is total string size. It's very + // unlikely that the number of keys could be larger, but incorporate that into the calculation + // in case of pathological data. 
+ long maxSize = Math.max(dictionaryStringSize, numKeys); + if (maxSize > sizeLimitBytes) { + throw new VariantSizeLimitException(); + } + int offsetSize = getMinIntegerSize((int)maxSize); + + int offsetStart = 1 + offsetSize; + int stringStart = offsetStart + (numKeys + 1) * offsetSize; + long metadataSize = stringStart + dictionaryStringSize; + + if (metadataSize > sizeLimitBytes) { + throw new VariantSizeLimitException(); + } + byte[] metadata = new byte[(int) metadataSize]; + int headerByte = VERSION | ((offsetSize - 1) << 6); + writeLong(metadata, 0, headerByte, 1); + writeLong(metadata, 1, numKeys, offsetSize); + int currentOffset = 0; + for (int i = 0; i < numKeys; ++i) { + writeLong(metadata, offsetStart + i * offsetSize, currentOffset, offsetSize); + byte[] key = dictionaryKeys.get(i); + System.arraycopy(key, 0, metadata, stringStart + currentOffset, key.length); + currentOffset += key.length; + } + writeLong(metadata, offsetStart + numKeys * offsetSize, currentOffset, offsetSize); + return new Variant(Arrays.copyOfRange(writeBuffer, 0, writePos), metadata); + } + + public void appendString(String str) { + byte[] text = str.getBytes(StandardCharsets.UTF_8); + boolean longStr = text.length > MAX_SHORT_STR_SIZE; + checkCapacity((longStr ? 1 + U32_SIZE : 1) + text.length); + if (longStr) { + writeBuffer[writePos++] = primitiveHeader(LONG_STR); + writeLong(writeBuffer, writePos, text.length, U32_SIZE); + writePos += U32_SIZE; + } else { + writeBuffer[writePos++] = shortStrHeader(text.length); + } + System.arraycopy(text, 0, writeBuffer, writePos, text.length); + writePos += text.length; + } + + public void appendNull() { + checkCapacity(1); + writeBuffer[writePos++] = primitiveHeader(NULL); + } + + public void appendBoolean(boolean b) { + checkCapacity(1); + writeBuffer[writePos++] = primitiveHeader(b ? TRUE : FALSE); + } + + /** + * Appends a long value to the variant builder. The actual encoded integer type depends on the + * value range of the long value. 
+ * @param l the long value to append + */ + public void appendLong(long l) { + checkCapacity(1 + 8); + if (l == (byte) l) { + writeBuffer[writePos++] = primitiveHeader(INT1); + writeLong(writeBuffer, writePos, l, 1); + writePos += 1; + } else if (l == (short) l) { + writeBuffer[writePos++] = primitiveHeader(INT2); + writeLong(writeBuffer, writePos, l, 2); + writePos += 2; + } else if (l == (int) l) { + writeBuffer[writePos++] = primitiveHeader(INT4); + writeLong(writeBuffer, writePos, l, 4); + writePos += 4; + } else { + writeBuffer[writePos++] = primitiveHeader(INT8); + writeLong(writeBuffer, writePos, l, 8); + writePos += 8; + } + } + + public void appendDouble(double d) { + checkCapacity(1 + 8); + writeBuffer[writePos++] = primitiveHeader(DOUBLE); + writeLong(writeBuffer, writePos, Double.doubleToLongBits(d), 8); + writePos += 8; + } + + /** + * Appends a decimal value to the variant builder. The actual encoded decimal type depends on the + * precision and scale of the decimal value. + * @param d the decimal value to append + */ + public void appendDecimal(BigDecimal d) { + checkCapacity(2 + 16); + BigInteger unscaled = d.unscaledValue(); + if (d.scale() <= MAX_DECIMAL4_PRECISION && d.precision() <= MAX_DECIMAL4_PRECISION) { + writeBuffer[writePos++] = primitiveHeader(DECIMAL4); + writeBuffer[writePos++] = (byte) d.scale(); + writeLong(writeBuffer, writePos, unscaled.intValueExact(), 4); + writePos += 4; + } else if (d.scale() <= MAX_DECIMAL8_PRECISION && d.precision() <= MAX_DECIMAL8_PRECISION) { + writeBuffer[writePos++] = primitiveHeader(DECIMAL8); + writeBuffer[writePos++] = (byte) d.scale(); + writeLong(writeBuffer, writePos, unscaled.longValueExact(), 8); + writePos += 8; + } else { + assert d.scale() <= MAX_DECIMAL16_PRECISION && d.precision() <= MAX_DECIMAL16_PRECISION; + writeBuffer[writePos++] = primitiveHeader(DECIMAL16); + writeBuffer[writePos++] = (byte) d.scale(); + // `toByteArray` returns a big-endian representation. 
We need to copy it reversely and sign + // extend it to 16 bytes. + byte[] bytes = unscaled.toByteArray(); + for (int i = 0; i < bytes.length; ++i) { + writeBuffer[writePos + i] = bytes[bytes.length - 1 - i]; + } + byte sign = (byte) (bytes[0] < 0 ? -1 : 0); + for (int i = bytes.length; i < 16; ++i) { + writeBuffer[writePos + i] = sign; + } + writePos += 16; + } + } + + public void appendDate(int daysSinceEpoch) { + checkCapacity(1 + 4); + writeBuffer[writePos++] = primitiveHeader(DATE); + writeLong(writeBuffer, writePos, daysSinceEpoch, 4); + writePos += 4; + } + + public void appendTimestamp(long microsSinceEpoch) { + checkCapacity(1 + 8); + writeBuffer[writePos++] = primitiveHeader(TIMESTAMP); + writeLong(writeBuffer, writePos, microsSinceEpoch, 8); + writePos += 8; + } + + public void appendTimestampNtz(long microsSinceEpoch) { + checkCapacity(1 + 8); + writeBuffer[writePos++] = primitiveHeader(TIMESTAMP_NTZ); + writeLong(writeBuffer, writePos, microsSinceEpoch, 8); + writePos += 8; + } + + public void appendFloat(float f) { + checkCapacity(1 + 4); + writeBuffer[writePos++] = primitiveHeader(FLOAT); + writeLong(writeBuffer, writePos, Float.floatToIntBits(f), 8); + writePos += 4; + } + + public void appendBinary(byte[] binary) { + checkCapacity(1 + U32_SIZE + binary.length); + writeBuffer[writePos++] = primitiveHeader(BINARY); + writeLong(writeBuffer, writePos, binary.length, U32_SIZE); + writePos += U32_SIZE; + System.arraycopy(binary, 0, writeBuffer, writePos, binary.length); + writePos += binary.length; + } + + /** + * Adds a key to the Variant dictionary. If the key already exists, the dictionary is unmodified. 
+ * @param key the key to add + * @return the id of the key + */ + public int addKey(String key) { + int id; + if (dictionary.containsKey(key)) { + id = dictionary.get(key); + } else { + id = dictionaryKeys.size(); + dictionary.put(key, id); + dictionaryKeys.add(key.getBytes(StandardCharsets.UTF_8)); + } + return id; + } + + /** + * @return the current write position of the variant builder + */ + public int getWritePos() { + return writePos; + } + + // Finish writing a variant object after all of its fields have already been written. The process + // is as follows: + // 1. The caller calls `getWritePos` before writing any fields to obtain the `start` parameter. + // 2. The caller appends all the object fields to the builder. In the meantime, it should maintain + // the `fields` parameter. Before appending each field, it should append an entry to `fields` to + // record the offset of the field. The offset is computed as `getWritePos() - start`. + // 3. The caller calls `finishWritingObject` to finish writing a variant object. + // + // This function is responsible to sort the fields by key. If there are duplicate field keys: + // - when `allowDuplicateKeys` is true, the field with the greatest offset value (the last + // appended one) is kept. + // - otherwise, throw an exception. + /** + * Finish writing a Variant object after all of its fields have already been written. The process + * is as follows: + * 1. The caller calls `getWritePos()` before writing any fields to obtain the `start` parameter. + * 2. The caller appends all the object fields to the builder. In the meantime, it should maintain + * the `fields` parameter. Before appending each field, it should append an entry to `fields` to + * record the offset of the field. The offset is computed as `getWritePos() - start`. + * 3. The caller calls `finishWritingObject` to finish writing the Variant object. + * + * This method will sort the fields by key. 
If there are duplicate field keys: + * - when `allowDuplicateKeys` is true, the field with the greatest offset value (the last + * appended one) is kept. + * - otherwise, throw an exception. + * @param start the start position of the object in the write buffer + * @param fields the list of `FieldEntry` in the object + * @throws VariantDuplicateKeyException if there are duplicate keys and `allowDuplicateKeys` is + * false + */ + public void finishWritingObject(int start, ArrayList fields) { + int size = fields.size(); + Collections.sort(fields); + int maxId = size == 0 ? 0 : fields.get(0).id; + if (allowDuplicateKeys) { + int distinctPos = 0; + // Maintain a list of distinct keys in-place. + for (int i = 1; i < size; ++i) { + maxId = Math.max(maxId, fields.get(i).id); + if (fields.get(i).id == fields.get(i - 1).id) { + // Found a duplicate key. Keep the field with the greater offset. + if (fields.get(distinctPos).offset < fields.get(i).offset) { + fields.set(distinctPos, fields.get(distinctPos).withNewOffset(fields.get(i).offset)); + } + } else { + // Found a distinct key. Add the field to the list. + ++distinctPos; + fields.set(distinctPos, fields.get(i)); + } + } + if (distinctPos + 1 < fields.size()) { + size = distinctPos + 1; + // Resize `fields` to `size`. + fields.subList(size, fields.size()).clear(); + // Sort the fields by offsets so that we can move the value data of each field to the new + // offset without overwriting the fields after it. 
+ fields.sort(Comparator.comparingInt(f -> f.offset)); + int currentOffset = 0; + for (int i = 0; i < size; ++i) { + int oldOffset = fields.get(i).offset; + int fieldSize = VariantUtil.valueSize(writeBuffer, start + oldOffset); + System.arraycopy(writeBuffer, start + oldOffset, + writeBuffer, start + currentOffset, fieldSize); + fields.set(i, fields.get(i).withNewOffset(currentOffset)); + currentOffset += fieldSize; + } + writePos = start + currentOffset; + // Change back to the sort order by field keys, required by the Variant specification. + Collections.sort(fields); + } + } else { + for (int i = 1; i < size; ++i) { + maxId = Math.max(maxId, fields.get(i).id); + String key = fields.get(i).key; + if (key.equals(fields.get(i - 1).key)) { + throw new VariantDuplicateKeyException(key); + } + } + } + int dataSize = writePos - start; + boolean largeSize = size > U8_MAX; + int sizeBytes = largeSize ? U32_SIZE : 1; + int idSize = getMinIntegerSize(maxId); + int offsetSize = getMinIntegerSize(dataSize); + // The space for header byte, object size, id list, and offset list. + int headerSize = 1 + sizeBytes + size * idSize + (size + 1) * offsetSize; + checkCapacity(headerSize); + // Shift the just-written field data to make room for the object header section. + System.arraycopy(writeBuffer, start, writeBuffer, start + headerSize, dataSize); + writePos += headerSize; + writeBuffer[start] = objectHeader(largeSize, idSize, offsetSize); + writeLong(writeBuffer, start + 1, size, sizeBytes); + int idStart = start + 1 + sizeBytes; + int offsetStart = idStart + size * idSize; + for (int i = 0; i < size; ++i) { + writeLong(writeBuffer, idStart + i * idSize, fields.get(i).id, idSize); + writeLong(writeBuffer, offsetStart + i * offsetSize, fields.get(i).offset, offsetSize); + } + writeLong(writeBuffer, offsetStart + size * offsetSize, dataSize, offsetSize); + } + + /** + * Finish writing a Variant array after all of its elements have already been written. 
The process
   * is similar to that of `finishWritingObject`.
   * @param start the start position of the array in the write buffer
   * @param offsets the list of offsets of the array elements
   */
  // NOTE(review): `offsets` shows up here as a raw `ArrayList`, yet its elements are passed to
  // `writeLong` as numbers. Presumably the declared type is `ArrayList<Integer>` - confirm.
  public void finishWritingArray(int start, ArrayList offsets) {
    // Everything written since `start` is the element data of this array.
    int dataSize = writePos - start;
    int size = offsets.size();
    boolean largeSize = size > U8_MAX;
    int sizeBytes = largeSize ? U32_SIZE : 1;
    int offsetSize = getMinIntegerSize(dataSize);
    // The space for header byte, array size, and offset list.
    int headerSize = 1 + sizeBytes + (size + 1) * offsetSize;
    checkCapacity(headerSize);
    // Shift the just-written element data to make room for the header section.
    System.arraycopy(writeBuffer, start, writeBuffer, start + headerSize, dataSize);
    writePos += headerSize;
    writeBuffer[start] = arrayHeader(largeSize, offsetSize);
    writeLong(writeBuffer, start + 1, size, sizeBytes);
    int offsetStart = start + 1 + sizeBytes;
    for (int i = 0; i < size; ++i) {
      writeLong(writeBuffer, offsetStart + i * offsetSize, offsets.get(i), offsetSize);
    }
    // The offset list has `size + 1` entries; the last one is the total size of the element data.
    writeLong(writeBuffer, offsetStart + size * offsetSize, dataSize, offsetSize);
  }

  /**
   * Appends a Variant value to the Variant builder. The input Variant keys must be inserted into
   * the builder dictionary and rebuilt with new field ids. For scalar values in the input
   * Variant, we can directly copy the binary slice.
+ * @param v the Variant value to append + */ + public void appendVariant(Variant v) { + appendVariantImpl(v.value, v.metadata, v.pos); + } + + private void appendVariantImpl(byte[] value, byte[] metadata, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + switch (basicType) { + case OBJECT: + handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + ArrayList fields = new ArrayList<>(size); + int start = writePos; + for (int i = 0; i < size; ++i) { + int id = readUnsigned(value, idStart + idSize * i, idSize); + int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + int elementPos = dataStart + offset; + String key = getMetadataKey(metadata, id); + int newId = addKey(key); + fields.add(new FieldEntry(key, newId, writePos - start)); + appendVariantImpl(value, metadata, elementPos); + } + finishWritingObject(start, fields); + return null; + }); + break; + case ARRAY: + handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { + ArrayList offsets = new ArrayList<>(size); + int start = writePos; + for (int i = 0; i < size; ++i) { + int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + int elementPos = dataStart + offset; + offsets.add(writePos - start); + appendVariantImpl(value, metadata, elementPos); + } + finishWritingArray(start, offsets); + return null; + }); + break; + default: + shallowAppendVariantImpl(value, pos); + break; + } + } + + private void shallowAppendVariantImpl(byte[] value, int pos) { + int size = valueSize(value, pos); + checkIndex(pos + size - 1, value.length); + checkCapacity(size); + System.arraycopy(value, pos, writeBuffer, writePos, size); + writePos += size; + } + + private void checkCapacity(int additionalBytes) { + int requiredBytes = writePos + additionalBytes; + if (requiredBytes > writeBuffer.length) { + // Allocate a new buffer with a capacity of the next power of 2 of `requiredBytes`. 
+ int newCapacity = Integer.highestOneBit(requiredBytes); + newCapacity = newCapacity < requiredBytes ? newCapacity * 2 : newCapacity; + if (newCapacity > sizeLimitBytes) { + throw new VariantSizeLimitException(); + } + byte[] newValue = new byte[newCapacity]; + System.arraycopy(writeBuffer, 0, newValue, 0, writePos); + writeBuffer = newValue; + } + } + + // Temporarily store the information of a field. We need to collect all fields in an JSON object, + // sort them by their keys, and build the variant object in sorted order. + + /** + * Class to store the information of a Variant object field. We need to collect all fields of + * an object, sort them by their keys, and build the Variant object in sorted order. + */ + public static final class FieldEntry implements Comparable { + final String key; + final int id; + final int offset; + + public FieldEntry(String key, int id, int offset) { + this.key = key; + this.id = id; + this.offset = offset; + } + + FieldEntry withNewOffset(int newOffset) { + return new FieldEntry(key, id, newOffset); + } + + @Override + public int compareTo(FieldEntry other) { + return key.compareTo(other.key); + } + } + + private void buildFromJsonParser(JsonParser parser) throws IOException { + JsonToken token = parser.currentToken(); + if (token == null) { + throw new JsonParseException(parser, "Unexpected null token"); + } + switch (token) { + case START_OBJECT: { + ArrayList fields = new ArrayList<>(); + int start = writePos; + while (parser.nextToken() != JsonToken.END_OBJECT) { + String key = parser.currentName(); + parser.nextToken(); + int id = addKey(key); + fields.add(new FieldEntry(key, id, writePos - start)); + buildFromJsonParser(parser); + } + finishWritingObject(start, fields); + break; + } + case START_ARRAY: { + ArrayList offsets = new ArrayList<>(); + int start = writePos; + while (parser.nextToken() != JsonToken.END_ARRAY) { + offsets.add(writePos - start); + buildFromJsonParser(parser); + } + finishWritingArray(start, 
offsets); + break; + } + case VALUE_STRING: + appendString(parser.getText()); + break; + case VALUE_NUMBER_INT: + try { + appendLong(parser.getLongValue()); + } catch (InputCoercionException ignored) { + // If the value doesn't fit any integer type, parse it as decimal or floating instead. + parseAndAppendFloatingPoint(parser); + } + break; + case VALUE_NUMBER_FLOAT: + parseAndAppendFloatingPoint(parser); + break; + case VALUE_TRUE: + appendBoolean(true); + break; + case VALUE_FALSE: + appendBoolean(false); + break; + case VALUE_NULL: + appendNull(); + break; + default: + throw new JsonParseException(parser, "Unexpected token " + token); + } + } + + /** + * Returns the size (number of bytes) of the smallest unsigned integer type that can store + * `value`. It must be within `[0, U24_MAX]`. + * @param value the value to get the size for + * @return the size (number of bytes) of the smallest unsigned integer type that can store `value` + */ + private int getMinIntegerSize(int value) { + assert value >= 0 && value <= U24_MAX; + if (value <= U8_MAX) return 1; + if (value <= U16_MAX) return 2; + return U24_SIZE; + } + + /** + * Parse a JSON number as a floating point value. If the number can be parsed as a decimal, it + * will be appended as a decimal value. Otherwise, it will be appended as a double value. + * @param parser the JSON parser to use + */ + private void parseAndAppendFloatingPoint(JsonParser parser) throws IOException { + if (!tryParseDecimal(parser.getText())) { + appendDouble(parser.getDoubleValue()); + } + } + + /** + * Try to parse a JSON number as a decimal. The input must only use the decimal format + * (an integer value with an optional '.' in it) and must not use scientific notation. It also + * must fit into the precision limitation of decimal types. 
+ * @param input the input string to parse as decimal + * @return whether the parsing succeeds + */ + private boolean tryParseDecimal(String input) { + for (int i = 0; i < input.length(); ++i) { + char ch = input.charAt(i); + if (ch != '-' && ch != '.' && !(ch >= '0' && ch <= '9')) { + return false; + } + } + BigDecimal d = new BigDecimal(input); + if (d.scale() <= MAX_DECIMAL16_PRECISION && d.precision() <= MAX_DECIMAL16_PRECISION) { + appendDecimal(d); + return true; + } + return false; + } + + /** The buffer for building the Variant value. The first `writePos` bytes have been written. */ + private byte[] writeBuffer = new byte[128]; + private int writePos = 0; + /** The dictionary for mapping keys to monotonically increasing ids. */ + private final HashMap dictionary = new HashMap<>(); + /** The keys in the dictionary, in id order. */ + private final ArrayList dictionaryKeys = new ArrayList<>(); + + private final boolean allowDuplicateKeys; + private final int sizeLimitBytes; +} diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantDuplicateKeyException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantDuplicateKeyException.java new file mode 100644 index 0000000000..12e94416c4 --- /dev/null +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantDuplicateKeyException.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.parquet.variant; + +/** + * An exception indicating that the Variant contains a duplicate key. + */ +public class VariantDuplicateKeyException extends RuntimeException { + public final String key; + + /** + * @param key the key that was duplicated + */ + public VariantDuplicateKeyException(String key) { + super("Failed to build Variant because of duplicate object key: " + key); + this.key = key; + } + + /** + * @return the key that was duplicated + */ + public String getKey() { + return key; + } +} diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java new file mode 100644 index 0000000000..08556e762e --- /dev/null +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.parquet.variant; + +/** + * An exception indicating that the metadata or data size of the Variant exceeds the + * configured size limit. + */ +public class VariantSizeLimitException extends RuntimeException { +} diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java new file mode 100644 index 0000000000..aeebfe67e1 --- /dev/null +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java @@ -0,0 +1,646 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.parquet.variant; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.util.Arrays; + +/** + * This class defines constants related to the Variant format and provides functions for + * manipulating Variant binaries. + + * A Variant is made up of 2 binaries: value and metadata. A Variant value consists of a one-byte + * header and a number of content bytes (can be zero). The header byte is divided into upper 6 bits + * (called "type info") and lower 2 bits (called "basic type"). The content format is explained in + * the below constants for all possible basic type and type info values. + + * The Variant metadata includes a version id and a dictionary of distinct strings (case-sensitive). + * Its binary format is: + * - Version: 1-byte unsigned integer. The only acceptable value is 1 currently. + * - Dictionary size: 4-byte little-endian unsigned integer. The number of keys in the + * dictionary. + * - Offsets: (size + 1) * 4-byte little-endian unsigned integers. `offsets[i]` represents the + * starting position of string i, counting starting from the address of `offsets[0]`. Strings + * must be stored contiguously, so we don’t need to store the string size, instead, we compute it + * with `offset[i + 1] - offset[i]`. + * - UTF-8 string data. + */ +public class VariantUtil { + public static final int BASIC_TYPE_BITS = 2; + public static final int BASIC_TYPE_MASK = 0x3; + public static final int TYPE_INFO_MASK = 0x3F; + /** The inclusive maximum value of the type info value. It is the size limit of `SHORT_STR`. */ + public static final int MAX_SHORT_STR_SIZE = 0x3F; + + // The basic types + + /** + * Primitive value. + * The type info value must be one of the values in the "Primitive" section below. + */ + public static final int PRIMITIVE = 0; + /** + * Short string value. + * The type info value is the string size, which must be in `[0, MAX_SHORT_STR_SIZE]`. + * The string content bytes directly follow the header byte. 
*/
  public static final int SHORT_STR = 1;
  /**
   * Object value.
   * The content contains a size, a list of field ids, a list of field offsets, and
   * the actual field values. The list of field ids has `size` ids, while the list of field offsets
   * has `size + 1` offsets, where the last offset represents the total size of the field values
   * data. The list of field ids must be sorted by the field name in alphabetical order.
   * Duplicate field names within one object are not allowed.
   * 5 bits in the type info are used to specify the integer type of the object header. It is
   * 0_b4_b3b2_b1b0 (MSB is 0), where:
   * - b4: the integer type of size. When it is 0/1, `size` is a little-endian 1/4-byte
   * unsigned integer.
   * - b3b2: the integer type of ids. When the 2 bits are 0/1/2, the id list contains
   * 1/2/3-byte little-endian unsigned integers.
   * - b1b0: the integer type of offset. When the 2 bits are 0/1/2, the offset list contains
   * 1/2/3-byte little-endian unsigned integers.
   */
  public static final int OBJECT = 2;
  /**
   * Array value.
   * The content contains a size, a list of element offsets, and the actual element values.
   * It is similar to an object without the id list. The length of the offset list
   * is `size + 1`, where the last offset represents the total size of the element data.
   * Its type info is: 000_b2_b1b0:
   * - b2: the type of size.
   * - b1b0: the integer type of offset.
   */
  public static final int ARRAY = 3;

  // The primitive types

  /** JSON Null value. Empty content. */
  public static final int NULL = 0;
  /** True value. Empty content. */
  public static final int TRUE = 1;
  /** False value. Empty content. */
  public static final int FALSE = 2;
  /** 1-byte little-endian signed integer. */
  public static final int INT1 = 3;
  /** 2-byte little-endian signed integer. */
  public static final int INT2 = 4;
  /** 4-byte little-endian signed integer. */
  public static final int INT4 = 5;
  /** 8-byte little-endian signed integer. */
  public static final int INT8 = 6;
  /** 8-byte IEEE double. */
  public static final int DOUBLE = 7;
  /** 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed integer. */
  public static final int DECIMAL4 = 8;
  /** 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed integer. */
  public static final int DECIMAL8 = 9;
  /** 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed integer. */
  public static final int DECIMAL16 = 10;
  /**
   * Date value. Content is 4-byte little-endian signed integer that represents the
   * number of days from the Unix epoch.
   */
  public static final int DATE = 11;
  /**
   * Timestamp value. Content is 8-byte little-endian signed integer that represents the number of
   * microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. It is displayed to users in
   * their local time zones and may be displayed differently depending on the execution environment.
   */
  public static final int TIMESTAMP = 12;
  /**
   * Timestamp_ntz value. It has the same content as `TIMESTAMP` but should always be interpreted
   * as if the local time zone is UTC.
   */
  public static final int TIMESTAMP_NTZ = 13;
  /** 4-byte IEEE float. */
  public static final int FLOAT = 14;
  /**
   * Binary value. The content is (4-byte little-endian unsigned integer representing the binary
   * size) + (size bytes of binary content).
   */
  public static final int BINARY = 15;
  /**
   * Long string value. The content is (4-byte little-endian unsigned integer representing the
   * string size) + (size bytes of string content).
   */
  public static final int LONG_STR = 16;

  // The metadata version.
  public static final byte VERSION = 1;
  // The lower 4 bits of the first metadata byte contain the version.
  public static final byte VERSION_MASK = 0x0F;

  // Constants for various unsigned integer sizes.
+ public static final int U8_MAX = 0xFF; + public static final int U16_MAX = 0xFFFF; + public static final int U24_MAX = 0xFFFFFF; + public static final int U24_SIZE = 3; + public static final int U32_SIZE = 4; + + // Max decimal precision for each decimal type. + public static final int MAX_DECIMAL4_PRECISION = 9; + public static final int MAX_DECIMAL8_PRECISION = 18; + public static final int MAX_DECIMAL16_PRECISION = 38; + + // Default size limit for both variant value and variant metadata. + public static final int DEFAULT_SIZE_LIMIT = U24_MAX + 1; + + /** + * Write the least significant `numBytes` bytes in `value` into `bytes[pos, pos + numBytes)` in + * little endian. + * @param bytes The byte array to write into + * @param pos The starting index of the byte array to write into + * @param value The value to write + * @param numBytes The number of bytes to write + */ + public static void writeLong(byte[] bytes, int pos, long value, int numBytes) { + for (int i = 0; i < numBytes; ++i) { + bytes[pos + i] = (byte) ((value >>> (8 * i)) & 0xFF); + } + } + + public static byte primitiveHeader(int type) { + return (byte) (type << 2 | PRIMITIVE); + } + + public static byte shortStrHeader(int size) { + return (byte) (size << 2 | SHORT_STR); + } + + public static byte objectHeader(boolean largeSize, int idSize, int offsetSize) { + return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 4)) | + ((idSize - 1) << (BASIC_TYPE_BITS + 2)) | + ((offsetSize - 1) << BASIC_TYPE_BITS) | OBJECT); + } + + public static byte arrayHeader(boolean largeSize, int offsetSize) { + return (byte) (((largeSize ? 
1 : 0) << (BASIC_TYPE_BITS + 2)) | + ((offsetSize - 1) << BASIC_TYPE_BITS) | ARRAY); + } + + public static MalformedVariantException malformedVariant() { + return new MalformedVariantException(); + } + + public static UnknownVariantTypeException unknownPrimitiveTypeInVariant(int id) { + return new UnknownVariantTypeException(id); + } + + /** + * Check the validity of an array index `pos`. + * @param pos The index to check + * @param length The length of the array + * @throws MalformedVariantException if the index is out of bound + */ + public static void checkIndex(int pos, int length) { + if (pos < 0 || pos >= length) throw malformedVariant(); + } + + /** + * Reads a little-endian signed long value from `bytes[pos, pos + numBytes)`. + * @param bytes The byte array to read from + * @param pos The starting index of the byte array to read from + * @param numBytes The number of bytes to read + * @return The long value + */ + static long readLong(byte[] bytes, int pos, int numBytes) { + checkIndex(pos, bytes.length); + checkIndex(pos + numBytes - 1, bytes.length); + long result = 0; + // All bytes except the most significant byte should be unsigned-extended and shifted + // (so we need & 0xFF`). The most significant byte should be sign-extended and is handled + // after the loop. + for (int i = 0; i < numBytes - 1; ++i) { + long unsignedByteValue = bytes[pos + i] & 0xFF; + result |= unsignedByteValue << (8 * i); + } + long signedByteValue = bytes[pos + numBytes - 1]; + result |= signedByteValue << (8 * (numBytes - 1)); + return result; + } + + /** + * Read a little-endian unsigned int value from `bytes[pos, pos + numBytes)`. The value must fit + * into a non-negative int (`[0, Integer.MAX_VALUE]`). + */ + static int readUnsigned(byte[] bytes, int pos, int numBytes) { + checkIndex(pos, bytes.length); + checkIndex(pos + numBytes - 1, bytes.length); + int result = 0; + // Similar to the `readLong` loop, but all bytes should be unsigned-extended. 
+ for (int i = 0; i < numBytes; ++i) { + int unsignedByteValue = bytes[pos + i] & 0xFF; + result |= unsignedByteValue << (8 * i); + } + if (result < 0) throw malformedVariant(); + return result; + } + + /** + * The value type of Variant value. It is determined by the header byte but not a 1:1 mapping + * (for example, INT1/2/4/8 all maps to `Type.LONG`). + */ + public enum Type { + OBJECT, + ARRAY, + NULL, + BOOLEAN, + LONG, + STRING, + DOUBLE, + DECIMAL, + DATE, + TIMESTAMP, + TIMESTAMP_NTZ, + FLOAT, + BINARY, + } + + public static int getTypeInfo(byte[] value, int pos) { + checkIndex(pos, value.length); + return (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + } + + /** + * Returns the value type of Variant value `value[pos...]`. It is only legal to call `get*` if + * `getType` returns the corresponding type. For example, it is only legal to call + * `getLong` if this method returns `Type.Long`. + * @param value The Variant value to get the type from + * @param pos The starting index of the Variant value + * @return The type of the Variant value + */ + public static Type getType(byte[] value, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + switch (basicType) { + case SHORT_STR: + return Type.STRING; + case OBJECT: + return Type.OBJECT; + case ARRAY: + return Type.ARRAY; + default: + switch (typeInfo) { + case NULL: + return Type.NULL; + case TRUE: + case FALSE: + return Type.BOOLEAN; + case INT1: + case INT2: + case INT4: + case INT8: + return Type.LONG; + case DOUBLE: + return Type.DOUBLE; + case DECIMAL4: + case DECIMAL8: + case DECIMAL16: + return Type.DECIMAL; + case DATE: + return Type.DATE; + case TIMESTAMP: + return Type.TIMESTAMP; + case TIMESTAMP_NTZ: + return Type.TIMESTAMP_NTZ; + case FLOAT: + return Type.FLOAT; + case BINARY: + return Type.BINARY; + case LONG_STR: + return Type.STRING; + default: + throw 
unknownPrimitiveTypeInVariant(typeInfo); + } + } + } + + /** + * Computes the actual size (in bytes) of the Variant value at `value[pos...]`. + * `value.length - pos` is an upper bound of the size, but the actual size may be smaller. + * @param value The Variant value + * @param pos The starting index of the Variant value + * @return The actual size of the Variant value + * @throws MalformedVariantException if the Variant is malformed + */ + public static int valueSize(byte[] value, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + switch (basicType) { + case SHORT_STR: + return 1 + typeInfo; + case OBJECT: + return handleObject(value, pos, + (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> + dataStart - pos + readUnsigned(value, offsetStart + size * offsetSize, offsetSize)); + case ARRAY: + return handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> + dataStart - pos + readUnsigned(value, offsetStart + size * offsetSize, offsetSize)); + default: + switch (typeInfo) { + case NULL: + case TRUE: + case FALSE: + return 1; + case INT1: + return 2; + case INT2: + return 3; + case INT4: + case DATE: + case FLOAT: + return 5; + case INT8: + case DOUBLE: + case TIMESTAMP: + case TIMESTAMP_NTZ: + return 9; + case DECIMAL4: + return 6; + case DECIMAL8: + return 10; + case DECIMAL16: + return 18; + case BINARY: + case LONG_STR: + return 1 + U32_SIZE + readUnsigned(value, pos + 1, U32_SIZE); + default: + throw unknownPrimitiveTypeInVariant(typeInfo); + } + } + } + + private static IllegalStateException unexpectedType(Type type) { + return new IllegalStateException("Expect type to be " + type); + } + + public static boolean getBoolean(byte[] value, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE || 
(typeInfo != TRUE && typeInfo != FALSE)) { + throw unexpectedType(Type.BOOLEAN); + } + return typeInfo == TRUE; + } + + /** + * Returns a long value from Variant value `value[pos...]`. + * It is only legal to call it if `getType` returns one of Type.LONG, DATE, TIMESTAMP, + * TIMESTAMP_NTZ. + * If the type is `DATE`, the return value is guaranteed to fit into an int and + * represents the number of days from the Unix epoch. + * If the type is `TIMESTAMP/TIMESTAMP_NTZ`, the return value represents the number of + * microseconds from the Unix epoch. + * @param value The Variant value + * @param pos The starting index of the Variant value + * @return The long value + */ + public static long getLong(byte[] value, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + String exceptionMessage = "Expect type to be LONG/DATE/TIMESTAMP/TIMESTAMP_NTZ"; + if (basicType != PRIMITIVE) throw new IllegalStateException(exceptionMessage); + switch (typeInfo) { + case INT1: + return readLong(value, pos + 1, 1); + case INT2: + return readLong(value, pos + 1, 2); + case INT4: + case DATE: + return readLong(value, pos + 1, 4); + case INT8: + case TIMESTAMP: + case TIMESTAMP_NTZ: + return readLong(value, pos + 1, 8); + default: + throw new IllegalStateException(exceptionMessage); + } + } + + public static double getDouble(byte[] value, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE || typeInfo != DOUBLE) throw unexpectedType(Type.DOUBLE); + return Double.longBitsToDouble(readLong(value, pos + 1, 8)); + } + + /** + * Checks whether the precision and scale of the decimal are within the limit. 
+   * @param d The decimal value to check
+   * @param maxPrecision The maximum precision allowed
+   * @throws MalformedVariantException if the decimal is malformed
+   */
+  private static void checkDecimal(BigDecimal d, int maxPrecision) {
+    if (d.precision() > maxPrecision || d.scale() > maxPrecision) {
+      throw malformedVariant();
+    }
+  }
+
+  /**
+   * Returns the decimal stored at `value[pos...]` with exactly the scale recorded in the
+   * encoding. The header byte is followed by one unsigned scale byte and then the unscaled
+   * value: 4 bytes for DECIMAL4, 8 bytes for DECIMAL8 and 16 bytes for DECIMAL16.
+   * @param value The Variant value
+   * @param pos The starting index of the Variant value
+   * @return The decoded decimal
+   * @throws MalformedVariantException if the precision or scale exceeds the limit of the
+   *     decimal width
+   */
+  public static BigDecimal getDecimalWithOriginalScale(byte[] value, int pos) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    // Validate the type before touching the payload, so that a non-decimal value is always
+    // reported as a type error even when it is too short to carry a scale byte.
+    if (basicType != PRIMITIVE
+        || (typeInfo != DECIMAL4 && typeInfo != DECIMAL8 && typeInfo != DECIMAL16)) {
+      throw unexpectedType(Type.DECIMAL);
+    }
+    // Guard the scale byte with the same bounds helper that guards the header byte instead of
+    // letting a truncated buffer surface as a raw ArrayIndexOutOfBoundsException.
+    checkIndex(pos + 1, value.length);
+    // Interpret the scale byte as unsigned. If it is a negative byte, the unsigned value must be
+    // greater than `MAX_DECIMAL16_PRECISION` and will trigger an error in `checkDecimal`.
+    int scale = value[pos + 1] & 0xFF;
+    BigDecimal result;
+    switch (typeInfo) {
+      case DECIMAL4:
+        result = BigDecimal.valueOf(readLong(value, pos + 2, 4), scale);
+        checkDecimal(result, MAX_DECIMAL4_PRECISION);
+        break;
+      case DECIMAL8:
+        result = BigDecimal.valueOf(readLong(value, pos + 2, 8), scale);
+        checkDecimal(result, MAX_DECIMAL8_PRECISION);
+        break;
+      case DECIMAL16:
+        // The unscaled value occupies value[pos + 2 .. pos + 17].
+        checkIndex(pos + 17, value.length);
+        byte[] bytes = new byte[16];
+        // Copy the bytes reversely because the `BigInteger` constructor expects a big-endian
+        // representation.
+        for (int i = 0; i < 16; ++i) {
+          bytes[i] = value[pos + 17 - i];
+        }
+        result = new BigDecimal(new BigInteger(bytes), scale);
+        checkDecimal(result, MAX_DECIMAL16_PRECISION);
+        break;
+      default:
+        // Not reachable after the type check above; kept so `result` is definitely assigned.
+        throw unexpectedType(Type.DECIMAL);
+    }
+    return result;
+  }
+
+  /**
+   * Returns the decimal stored at `value[pos...]`. Currently identical to
+   * {@link #getDecimalWithOriginalScale(byte[], int)}.
+   * @param value The Variant value
+   * @param pos The starting index of the Variant value
+   * @return The decoded decimal
+   */
+  public static BigDecimal getDecimal(byte[] value, int pos) {
+    return getDecimalWithOriginalScale(value, pos);
+  }
+
+  /**
+   * Returns the FLOAT primitive stored at `value[pos...]`.
+   * @param value The Variant value
+   * @param pos The starting index of the Variant value
+   * @return The float whose bit pattern is the 4-byte payload following the header byte
+   */
+  public static float getFloat(byte[] value, int pos) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    if (basicType != PRIMITIVE || typeInfo != FLOAT) throw unexpectedType(Type.FLOAT);
+    return Float.intBitsToFloat((int) readLong(value, pos + 1, 4));
+  }
+
+  /**
+   * Returns a copy of the BINARY primitive stored at `value[pos...]`. The header byte is
+   * followed by a `U32_SIZE`-byte length and then that many payload bytes.
+   * @param value The Variant value
+   * @param pos The starting index of the Variant value
+   * @return A new array holding the binary payload
+   */
+  public static byte[] getBinary(byte[] value, int pos) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    if (basicType != PRIMITIVE || typeInfo != BINARY) throw unexpectedType(Type.BINARY);
+    int start = pos + 1 + U32_SIZE;
+    int length = readUnsigned(value, pos + 1, U32_SIZE);
+    // Check the index of the last payload byte.
+    checkIndex(start + length - 1, value.length);
+    return Arrays.copyOfRange(value, start, start + length);
+  }
+
+  /**
+   * Returns the string stored at `value[pos...]`. Two encodings are accepted: a short string,
+   * whose length is carried in the type info bits of the header, and a long string primitive,
+   * whose length is a `U32_SIZE`-byte field following the header.
+   * @param value The Variant value
+   * @param pos The starting index of the Variant value
+   * @return The decoded string
+   */
+  public static String getString(byte[] value, int pos) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    if (basicType == SHORT_STR || (basicType == PRIMITIVE && typeInfo == LONG_STR)) {
+      int start;
+      int length;
+      if (basicType == SHORT_STR) {
+        start = pos + 1;
+        length = typeInfo;
+      } else {
+        start = pos + 1 + U32_SIZE;
+        length = readUnsigned(value, pos + 1, U32_SIZE);
+      }
+      // Check the index of the last string byte.
+      checkIndex(start + length - 1, value.length);
+      // Variant strings are UTF-8. Decode explicitly rather than with the platform default
+      // charset, which differs between JVMs and operating systems.
+      return new String(value, start, length, java.nio.charset.StandardCharsets.UTF_8);
+    }
+    throw unexpectedType(Type.STRING);
+  }
+
+  /**
+   * An interface for the Variant object handler.
+   * @param <T> The return type of the handler
+   */
+  public interface ObjectHandler<T> {
+    /**
+     * @param size Number of object fields.
+     * @param idSize The integer size of the field id list.
+     * @param offsetSize The integer size of the offset list.
+     * @param idStart The starting index of the field id list in the variant value array.
+     * @param offsetStart The starting index of the offset list in the variant value array.
+     * @param dataStart The starting index of field data in the variant value array.
+     */
+    T apply(int size, int idSize, int offsetSize, int idStart, int offsetStart, int dataStart);
+  }
+
+  /**
+   * A helper function to access a Variant object, at `value[pos...]`.
+   * It decodes the object header and passes the resulting layout to `handler`.
+   * @param value The Variant value
+   * @param pos The starting index of the Variant value
+   * @param handler The handler to process the object
+   * @return The result of the handler
+   * @param <T> The return type of the handler
+   */
+  public static <T> T handleObject(byte[] value, int pos, ObjectHandler<T> handler) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    if (basicType != OBJECT) throw unexpectedType(Type.OBJECT);
+    // Refer to the comment of the `OBJECT` constant for the details of the object header encoding.
+    // Suppose `typeInfo` has a bit representation of 0_b4_b3b2_b1b0, the following line extracts
+    // b4 to determine whether the object uses a 1/4-byte size.
+    boolean largeSize = ((typeInfo >> 4) & 0x1) != 0;
+    int sizeBytes = (largeSize ? U32_SIZE : 1);
+    int size = readUnsigned(value, pos + 1, sizeBytes);
+    // Extracts b3b2 to determine the integer size of the field id list.
+    int idSize = ((typeInfo >> 2) & 0x3) + 1;
+    // Extracts b1b0 to determine the integer size of the offset list.
+    int offsetSize = (typeInfo & 0x3) + 1;
+    // Layout after the size field: `size` field ids, then `size + 1` offsets, then field data.
+    int idStart = pos + 1 + sizeBytes;
+    int offsetStart = idStart + size * idSize;
+    int dataStart = offsetStart + (size + 1) * offsetSize;
+    return handler.apply(size, idSize, offsetSize, idStart, offsetStart, dataStart);
+  }
+
+  /**
+   * An interface for the Variant array handler.
+   * @param <T> The return type of the handler
+   */
+  public interface ArrayHandler<T> {
+    /**
+     * @param size Number of array elements.
+     * @param offsetSize The integer size of the offset list.
+     * @param offsetStart The starting index of the offset list in the variant value array.
+     * @param dataStart The starting index of element data in the variant value array.
+     */
+    T apply(int size, int offsetSize, int offsetStart, int dataStart);
+  }
+
+  /**
+   * A helper function to access a Variant array, at `value[pos...]`.
+   * It decodes the array header and passes the resulting layout to `handler`.
+   * @param value The Variant value
+   * @param pos The starting index of the Variant value
+   * @param handler The handler to process the array
+   * @return The result of the handler
+   * @param <T> The return type of the handler
+   */
+  public static <T> T handleArray(byte[] value, int pos, ArrayHandler<T> handler) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    if (basicType != ARRAY) throw unexpectedType(Type.ARRAY);
+    // Refer to the comment of the `ARRAY` constant for the details of the array header encoding.
+    // Suppose `typeInfo` has a bit representation of 000_b2_b1b0, the following line extracts
+    // b2 to determine whether the array uses a 1/4-byte size.
+    boolean largeSize = ((typeInfo >> 2) & 0x1) != 0;
+    int sizeBytes = (largeSize ? U32_SIZE : 1);
+    int size = readUnsigned(value, pos + 1, sizeBytes);
+    // Extracts b1b0 to determine the integer size of the offset list.
+    int offsetSize = (typeInfo & 0x3) + 1;
+    // Layout after the size field: `size + 1` offsets, then element data.
+    int offsetStart = pos + 1 + sizeBytes;
+    int dataStart = offsetStart + (size + 1) * offsetSize;
+    return handler.apply(size, offsetSize, offsetStart, dataStart);
+  }
+
+  /**
+   * Returns a key at `id` in the Variant metadata.
+   * @param metadata The Variant metadata
+   * @param id The key id
+   * @return The key
+   * @throws MalformedVariantException if the Variant is malformed or if the id is out of bounds
+   */
+  public static String getMetadataKey(byte[] metadata, int id) {
+    checkIndex(0, metadata.length);
+    // Extracts the highest 2 bits in the metadata header to determine the integer size of the
+    // offset list.
+    int offsetSize = ((metadata[0] >> 6) & 0x3) + 1;
+    int dictSize = readUnsigned(metadata, 1, offsetSize);
+    // Reject negative ids as well: they would otherwise make the offset reads below land on the
+    // dictionary size field and return an unrelated slice instead of failing.
+    if (id < 0 || id >= dictSize) throw malformedVariant();
+    // There are a header byte, a `dictSize` with `offsetSize` bytes, and `(dictSize + 1)` offsets
+    // before the string data.
+    int stringStart = 1 + (dictSize + 2) * offsetSize;
+    int offset = readUnsigned(metadata, 1 + (id + 1) * offsetSize, offsetSize);
+    int nextOffset = readUnsigned(metadata, 1 + (id + 2) * offsetSize, offsetSize);
+    if (offset > nextOffset) throw malformedVariant();
+    checkIndex(stringStart + nextOffset - 1, metadata.length);
+    // Dictionary keys are UTF-8. Decode explicitly rather than with the platform default charset.
+    return new String(
+        metadata, stringStart + offset, nextOffset - offset, java.nio.charset.StandardCharsets.UTF_8);
+  }
+}
diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java
new file mode 100644
index 0000000000..ea661df4dd
--- /dev/null
+++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java
@@ -0,0 +1,490 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.security.SecureRandom;
+import java.time.Instant;
+import java.time.LocalDate;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
+import java.util.Arrays;
+import java.util.Base64;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.IntStream;
+import com.fasterxml.jackson.core.StreamReadConstraints;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * Tests for building Variant values (from JSON and through the {@link VariantBuilder} append
+ * methods) and reading them back.
+ */
+public class TestVariantEncoding {
+  private static final Logger LOG = LoggerFactory.getLogger(TestVariantEncoding.class);
+  private static final String RANDOM_CHARS =
+      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
+  private static final List<String> SAMPLE_JSON_VALUES = Arrays.asList(
+      "null",
+      "true",
+      "false",
+      "12",
+      "-9876543210",
+      "4.5678E123",
+      "8.765E-2",
+      "\"string value\"",
+      "-9876.543",
+      "234.456789",
+      "{\"a\": 1, \"b\": {\"e\": -4, \"f\": 5.5}, \"c\": true}",
+      "[1, -2, 4.5, -6.7, \"str\", true]"
+  );
+
+  /** Random number generator for generating random strings */
+  private static final SecureRandom RANDOM = new SecureRandom();
+
+  static {
+    // Raise the Jackson nesting limit once, at class load, i.e. before the per-test
+    // `ObjectMapper` below is constructed. Doing this inside `checkJson` applied it only after
+    // the mapper of the first executed test already existed.
+    // NOTE(review): assumes a Jackson factory captures the default constraints when it is
+    // created - confirm against the Jackson version in use.
+    StreamReadConstraints.overrideDefaultStreamReadConstraints(
+        StreamReadConstraints.builder().maxNestingDepth(100000).build());
+  }
+
+  /** Object mapper for comparing json values */
+  private final ObjectMapper mapper = new ObjectMapper();
+
+  /**
+   * Parses `jsonValue` into a Variant, renders it back to JSON and asserts that both documents
+   * are equal as Jackson trees.
+   */
+  private void checkJson(String jsonValue) {
+    try {
+      Variant v = VariantBuilder.parseJson(jsonValue);
+      Assert.assertEquals(mapper.readTree(jsonValue),
+          mapper.readTree(v.toJson(ZoneId.systemDefault())));
+    } catch (IOException e) {
+      Assert.fail("Failed to parse json: " + jsonValue + " " + e);
+    }
+  }
+
+  /** Asserts the basic type bits and the type info of the header byte of `v`. */
+  private void checkType(Variant v, int expectedBasicType, int expectedTypeInfo) {
+    Assert.assertEquals(expectedBasicType, v.value[v.pos] & VariantUtil.BASIC_TYPE_MASK);
+    Assert.assertEquals(expectedTypeInfo, v.getTypeInfo());
+  }
+
+  /** Converts an instant to microseconds since the Unix epoch, dropping sub-microsecond digits. */
+  private long microsSinceEpoch(Instant instant) {
+    return TimeUnit.SECONDS.toMicros(instant.getEpochSecond()) + instant.getNano() / 1000;
+  }
+
+  /** Returns a random string of `len` characters drawn from `RANDOM_CHARS`. */
+  private String randomString(int len) {
+    StringBuilder sb = new StringBuilder(len);
+    for (int i = 0; i < len; i++) {
+      sb.append(RANDOM_CHARS.charAt(RANDOM.nextInt(RANDOM_CHARS.length())));
+    }
+    return sb.toString();
+  }
+
+  @Test
+  public void testNullJson() {
+    checkJson("null");
+  }
+
+  @Test
+  public void testBooleanJson() {
+    Arrays.asList("true", "false").forEach(this::checkJson);
+  }
+
+  @Test
+  public void testIntegerJson() {
+    Arrays.asList(
+        "0",
+        Byte.toString(Byte.MIN_VALUE), Byte.toString(Byte.MAX_VALUE),
+        Short.toString(Short.MIN_VALUE), Short.toString(Short.MAX_VALUE),
+        Integer.toString(Integer.MIN_VALUE), Integer.toString(Integer.MAX_VALUE),
+        Long.toString(Long.MIN_VALUE), Long.toString(Long.MAX_VALUE)
+    ).forEach(this::checkJson);
+  }
+
+  @Test
+  public void testFloatJson() {
+    Arrays.asList(
+        Float.toString(Float.MIN_VALUE), Float.toString(Float.MAX_VALUE),
+        Double.toString(Double.MIN_VALUE), Double.toString(Double.MAX_VALUE)
+    ).forEach(this::checkJson);
+  }
+
+  @Test
+  public void testStringJson() {
+    Arrays.asList(
+        "\"short string\"",
+        "\"long string: " + new String(new char[1000]).replace("\0", "x") + "\""
+    ).forEach(this::checkJson);
+  }
+
+  @Test
+  public void testDecimalJson() {
+    Arrays.asList(
+        "12.34", "-43.21",
+        "10.2147483647", "-1021474836.47",
+        "109223372036854775.807", "-109.223372036854775807"
+    ).forEach(this::checkJson);
+  }
+
+  @Test
+  public void testNullBuilder() {
+    VariantBuilder vb = new VariantBuilder(false);
+    vb.appendNull();
+    checkType(vb.result(), VariantUtil.NULL, 0);
+  }
+
+  @Test
+  public void testBooleanBuilder() {
+    Arrays.asList(true, false).forEach(b -> {
+      VariantBuilder vb2 = new VariantBuilder(false);
+      vb2.appendBoolean(b);
+      checkType(vb2.result(), VariantUtil.PRIMITIVE, b ? VariantUtil.TRUE : VariantUtil.FALSE);
+    });
+  }
+
+  @Test
+  public void testIntegerBuilder() {
+    Arrays.asList(
+        0L,
+        (long) Byte.MIN_VALUE, (long) Byte.MAX_VALUE,
+        (long) Short.MIN_VALUE, (long) Short.MAX_VALUE,
+        (long) Integer.MIN_VALUE, (long) Integer.MAX_VALUE,
+        Long.MIN_VALUE, Long.MAX_VALUE
+    ).forEach(l -> {
+      VariantBuilder vb2 = new VariantBuilder(false);
+      vb2.appendLong(l);
+      Variant v = vb2.result();
+      // The builder is expected to pick the narrowest integer width that can hold the value.
+      if (Byte.MIN_VALUE <= l && l <= Byte.MAX_VALUE) {
+        checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT1);
+      } else if (Short.MIN_VALUE <= l && l <= Short.MAX_VALUE) {
+        checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT2);
+      } else if (Integer.MIN_VALUE <= l && l <= Integer.MAX_VALUE) {
+        checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT4);
+      } else {
+        checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT8);
+      }
+      Assert.assertEquals((long) l, v.getLong());
+    });
+  }
+
+  @Test
+  public void testFloatBuilder() {
+    Arrays.asList(Float.MIN_VALUE, Float.MAX_VALUE).forEach(f -> {
+      VariantBuilder vb2 = new VariantBuilder(false);
+      vb2.appendFloat(f);
+      Variant v = vb2.result();
+      checkType(v, VariantUtil.PRIMITIVE, VariantUtil.FLOAT);
+      Assert.assertEquals(f, v.getFloat(), 0.000001);
+    });
+  }
+
+  @Test
+  public void testDoubleBuilder() {
+    Arrays.asList(Double.MIN_VALUE, Double.MAX_VALUE).forEach(d -> {
+      VariantBuilder vb2 = new VariantBuilder(false);
+      vb2.appendDouble(d);
+      Variant v = vb2.result();
+      checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DOUBLE);
+      Assert.assertEquals(d, v.getDouble(), 0.000001);
+    });
+  }
+
+  @Test
+  public void testStringBuilder() {
+    // Lengths on both sides of the short-string limit.
+    IntStream.range(VariantUtil.MAX_SHORT_STR_SIZE - 3,
+        VariantUtil.MAX_SHORT_STR_SIZE + 3).forEach(len -> {
+      VariantBuilder vb2 = new VariantBuilder(false);
+      String s = randomString(len);
+      vb2.appendString(s);
+      Variant v = vb2.result();
+      if (len <= VariantUtil.MAX_SHORT_STR_SIZE) {
+        checkType(v, VariantUtil.SHORT_STR, len);
+      } else {
+        checkType(v, VariantUtil.PRIMITIVE, VariantUtil.LONG_STR);
+      }
+      Assert.assertEquals(s, v.getString());
+    });
+  }
+
+  @Test
+  public void testDecimalBuilder() {
+    // decimal4
+    Arrays.asList(new BigDecimal("123.456"), new BigDecimal("-987.654")).forEach(d -> {
+      VariantBuilder vb2 = new VariantBuilder(false);
+      vb2.appendDecimal(d);
+      Variant v = vb2.result();
+      checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DECIMAL4);
+      Assert.assertEquals(d, v.getDecimal());
+    });
+
+    // decimal8
+    Arrays.asList(
+        new BigDecimal("10.2147483647"),
+        new BigDecimal("-1021474836.47")
+    ).forEach(d -> {
+      VariantBuilder vb2 = new VariantBuilder(false);
+      vb2.appendDecimal(d);
+      Variant v = vb2.result();
+      checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DECIMAL8);
+      Assert.assertEquals(d, v.getDecimal());
+    });
+
+    // decimal16
+    Arrays.asList(
+        new BigDecimal("109223372036854775.807"),
+        new BigDecimal("-109.223372036854775807")
+    ).forEach(d -> {
+      VariantBuilder vb2 = new VariantBuilder(false);
+      vb2.appendDecimal(d);
+      Variant v = vb2.result();
+      checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DECIMAL16);
+      Assert.assertEquals(d, v.getDecimal());
+    });
+  }
+
+  @Test
+  public void testDate() {
+    VariantBuilder vb = new VariantBuilder(false);
+    int days = Math.toIntExact(LocalDate.of(2024, 12, 16).toEpochDay());
+    vb.appendDate(days);
+    Assert.assertEquals("\"2024-12-16\"", vb.result().toJson(ZoneId.systemDefault()));
+    Assert.assertEquals(days, vb.result().getLong());
+  }
+
+  @Test
+  public void testTimestamp() {
+    VariantBuilder vb = new VariantBuilder(false);
+    long micros = microsSinceEpoch(Instant.parse("2024-12-16T10:23:45.321456-08:00"));
+    vb.appendTimestamp(micros);
+    Assert.assertEquals("\"2024-12-16T10:23:45.321456-08:00\"",
+        vb.result().toJson(ZoneId.of("-08:00")));
+    Assert.assertEquals("\"2024-12-16T19:23:45.321456+01:00\"",
+        vb.result().toJson(ZoneId.of("+01:00")));
+    Assert.assertEquals(micros, vb.result().getLong());
+  }
+
+  @Test
+  public void testTimestampNtz() {
+    DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME;
+    VariantBuilder vb = new VariantBuilder(false);
+    long micros = microsSinceEpoch(Instant.from(dtf.parse("2024-01-01T23:00:00.000001Z")));
+    vb.appendTimestampNtz(micros);
+    Assert.assertEquals("\"2024-01-01T23:00:00.000001\"",
+        vb.result().toJson(ZoneId.of("-08:00")));
+    // The rendering of a timestamp without time zone must not depend on the requested zone.
+    Assert.assertEquals(vb.result().toJson(ZoneId.of("-08:00")),
+        vb.result().toJson(ZoneId.of("+02:00")));
+    Assert.assertEquals(micros, vb.result().getLong());
+  }
+
+  @Test
+  public void testBinary() {
+    VariantBuilder vb = new VariantBuilder(false);
+    byte[] binary = new byte[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+    vb.appendBinary(binary);
+    Assert.assertEquals("\"" + Base64.getEncoder().encodeToString(binary) + "\"",
+        vb.result().toJson(ZoneId.systemDefault()));
+    Assert.assertArrayEquals(binary, vb.result().getBinary());
+  }
+
+  @Test
+  public void testObject() {
+    // simple object
+    StringBuilder sb = new StringBuilder();
+    sb.append("{");
+    for (int i = 0; i < SAMPLE_JSON_VALUES.size(); i++) {
+      if (i > 0) sb.append(", ");
+      sb.append("\"field" + i + "\": ").append(SAMPLE_JSON_VALUES.get(i));
+    }
+    sb.append("}");
+    checkJson(sb.toString());
+
+    // wide object
+    sb = new StringBuilder();
+    sb.append("{");
+    for (int i = 0; i < 50000; i++) {
+      if (i > 0) sb.append(", ");
+      sb.append("\"field" + i + "\": ")
+          .append(SAMPLE_JSON_VALUES.get(i % SAMPLE_JSON_VALUES.size()));
+    }
+    sb.append("}");
+    checkJson(sb.toString());
+
+    // deep object
+    sb = new StringBuilder();
+    // Jackson object mapper hit a stack overflow if json is too deep
+    for (int i = 0; i < 1000; i++) {
+      sb.append("{").append("\"field" + i + "\": ");
+    }
+    sb.append("{");
+    for (int i = 0; i < SAMPLE_JSON_VALUES.size(); i++) {
+      if (i > 0) sb.append(", ");
+      sb.append("\"field" + i + "\": ").append(SAMPLE_JSON_VALUES.get(i));
+    }
+    sb.append("}");
+    for (int i = 0; i < 1000; i++) {
+      sb.append("}");
+    }
+    checkJson(sb.toString());
+  }
+
+  @Test
+  public void testArray() {
+    // simple array
+    StringBuilder sb = new StringBuilder();
+    sb.append("[");
+    for (int i = 0; i < SAMPLE_JSON_VALUES.size(); i++) {
+      if (i > 0) sb.append(", ");
+      sb.append(SAMPLE_JSON_VALUES.get(i));
+    }
+    sb.append("]");
+    checkJson(sb.toString());
+
+    // large array
+    sb = new StringBuilder();
+    sb.append("[");
+    for (int i = 0; i < 50000; i++) {
+      if (i > 0) sb.append(", ");
+      sb.append(SAMPLE_JSON_VALUES.get(i % SAMPLE_JSON_VALUES.size()));
+    }
+    sb.append("]");
+    checkJson(sb.toString());
+  }
+
+  @Test
+  public void testSizeLimit() {
+    // large metadata size
+    try {
+      VariantBuilder.parseJson(
+          "{\"12345678901234567890\": 1, \"123456789012345678901\": 2}",
+          new VariantBuilder(false, 20));
+      Assert.fail("Expected VariantSizeLimitException with large metadata");
+    } catch (IOException e) {
+      Assert.fail("Expected VariantSizeLimitException with large metadata");
+    } catch (VariantSizeLimitException e) {
+      // Expected
+    }
+
+    // large data size
+    try {
+      StringBuilder sb = new StringBuilder();
+      sb.append("[");
+      for (int i = 0; i < 100; i++) {
+        if (i > 0) sb.append(", ");
+        sb.append("{\"a\":1}");
+      }
+      sb.append("]");
+      VariantBuilder.parseJson(sb.toString(), new VariantBuilder(false, 20));
+      Assert.fail("Expected VariantSizeLimitException with large data");
+    } catch (IOException e) {
+      Assert.fail("Expected VariantSizeLimitException with large data");
+    } catch (VariantSizeLimitException e) {
+      // Expected
+    }
+  }
+
+  @Test
+  public void testAllowDuplicateKeys() {
+    // disallow duplicate keys
+    try {
+      VariantBuilder.parseJson("{\"a\": 1, \"a\": 2}");
+      Assert.fail("Expected VariantDuplicateKeyException with duplicate keys");
+    } catch (IOException e) {
+      Assert.fail("Expected VariantDuplicateKeyException with duplicate keys");
+    } catch (VariantDuplicateKeyException e) {
+      // Expected
+    }
+
+    // allow duplicate keys
+    try {
+      Variant v = VariantBuilder.parseJson("{\"a\": 1, \"a\": 2}",
+          new VariantBuilder(true, VariantUtil.DEFAULT_SIZE_LIMIT));
+      Assert.assertEquals(1, v.objectSize());
+      Assert.assertEquals(VariantUtil.Type.LONG, v.getFieldByKey("a").getType());
+      Assert.assertEquals(2, v.getFieldByKey("a").getLong());
+    } catch (Exception e) {
+      Assert.fail("Unexpected exception: " + e);
+    }
+  }
+
+  @Test
+  public void testTruncateTrailingZeroDecimal() {
+    // Each pair is {input and untruncated rendering, rendering with trailing zeros truncated}.
+    for (String[] strings : Arrays.asList(
+        // decimal4
+        // truncate all trailing zeros
+        new String[]{"1234.0000", "1234"},
+        // truncate some trailing zeros
+        new String[]{"1234.5600", "1234.56"},
+        // truncate no trailing zeros
+        new String[]{"1234.5678", "1234.5678"},
+        // decimal8
+        // truncate all trailing zeros
+        new String[]{"-10.0000000000", "-10"},
+        // truncate some trailing zeros
+        new String[]{"-10.2147000000", "-10.2147"},
+        // truncate no trailing zeros
+        new String[]{"-10.2147483647", "-10.2147483647"},
+        // decimal16
+        // truncate all trailing zeros
+        new String[]{"1092233720368547.00000", "1092233720368547"},
+        // truncate some trailing zeros
+        new String[]{"1092233720368547.75800", "1092233720368547.758"},
+        // truncate no trailing zeros
+        new String[]{"1092233720368547.75807", "1092233720368547.75807"})) {
+      VariantBuilder vb = new VariantBuilder(false);
+      BigDecimal d = new BigDecimal(strings[0]);
+      vb.appendDecimal(d);
+      Variant v = vb.result();
+      Assert.assertEquals(strings[0], v.toJson(ZoneId.of("-08:00")));
+      Assert.assertEquals(strings[1], v.toJson(ZoneId.of("-08:00"), true));
+    }
+  }
+
+  @Test
+  public void testTruncateTrailingZeroTimestamp() {
+    // timestamp
+    for (String[] strings : Arrays.asList(
+        // truncate all trailing zeros
+        new String[] {"2024-12-16T10:23:45.000000-08:00", "2024-12-16T10:23:45-08:00"},
+        // truncate some trailing zeros
+        new String[] {"2024-12-16T10:23:45.123000-08:00", "2024-12-16T10:23:45.123-08:00"},
+        // truncate no trailing zeros
+        new String[] {"2024-12-16T10:23:45.123456-08:00", "2024-12-16T10:23:45.123456-08:00"})) {
+      VariantBuilder vb = new VariantBuilder(false);
+      long micros = microsSinceEpoch(Instant.parse(strings[0]));
+      vb.appendTimestamp(micros);
+      Variant v = vb.result();
+      Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00")));
+      Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true));
+    }
+
+    // timestampNTZ
+    DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME;
+    for (String[] strings : Arrays.asList(
+        // truncate all trailing zeros
+        new String[] {"2024-12-16T10:23:45.000000", "2024-12-16T10:23:45"},
+        // truncate some trailing zeros
+        new String[] {"2024-12-16T10:23:45.123000", "2024-12-16T10:23:45.123"},
+        // truncate no trailing zeros
+        new String[] {"2024-12-16T10:23:45.123456", "2024-12-16T10:23:45.123456"})) {
+      VariantBuilder vb = new VariantBuilder(false);
+
+      long micros = microsSinceEpoch(Instant.from(dtf.parse(String.format("%sZ", strings[0]))));
+      vb.appendTimestampNtz(micros);
+      Variant v = vb.result();
+      Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00")));
+      Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true));
+      Assert.assertEquals(micros, vb.result().getLong());
+    }
+  }
+}
diff --git a/pom.xml b/pom.xml
index 2496171867..5f49bf1764 100644
--- a/pom.xml
+++ b/pom.xml
@@ -165,6 +165,7 @@ parquet-protobuf parquet-thrift parquet-hadoop-bundle + parquet-variant From c5d19e652b22380407fe277ab9bc9c82a90564d3 Mon Sep 17
00:00:00 2001 From: Gene Pang Date: Tue, 7 Jan 2025 09:42:45 -0800 Subject: [PATCH 02/20] remove optional --- .../src/main/java/org/apache/parquet/variant/Variant.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index 4fcdb6b0e5..dfdb85b96e 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -323,7 +323,6 @@ private static void appendQuoted(StringBuilder sb, String str) { .appendValue(HOUR_OF_DAY, 2) .appendLiteral(':') .appendValue(MINUTE_OF_HOUR, 2) - .optionalStart() .appendLiteral(':') .appendValue(SECOND_OF_MINUTE, 2) .appendFraction(MICRO_OF_SECOND, 6, 6, true) @@ -343,7 +342,6 @@ private static void appendQuoted(StringBuilder sb, String str) { .appendValue(HOUR_OF_DAY, 2) .appendLiteral(':') .appendValue(MINUTE_OF_HOUR, 2) - .optionalStart() .appendLiteral(':') .appendValue(SECOND_OF_MINUTE, 2) .optionalStart() From 0086b3476e5badbfea775a5bbdf55d488d6019b7 Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Tue, 7 Jan 2025 10:46:45 -0800 Subject: [PATCH 03/20] split test --- .../java/org/apache/parquet/variant/TestVariantEncoding.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java index ea661df4dd..0a8740e3c0 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java @@ -452,7 +452,6 @@ public void testTruncateTrailingZeroDecimal() { @Test public void testTruncateTrailingZeroTimestamp() { - // timestamp for (String[] strings : Arrays.asList( // truncate all trailing zeros new String[] 
{"2024-12-16T10:23:45.000000-08:00", "2024-12-16T10:23:45-08:00"}, @@ -467,8 +466,10 @@ public void testTruncateTrailingZeroTimestamp() { Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); } + } - // timestampNTZ + @Test + public void testTruncateTrailingZeroTimestampNtz() { DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME; for (String[] strings : Arrays.asList( // truncate all trailing zeros From 5af337ffe3214a66e7dee16b704eb0a507d719b2 Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Tue, 7 Jan 2025 10:50:24 -0800 Subject: [PATCH 04/20] cleanup --- .../org/apache/parquet/variant/Variant.java | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index dfdb85b96e..d88c38f0df 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -320,11 +320,7 @@ private static void appendQuoted(StringBuilder sb, String str) { private static final DateTimeFormatter TIMESTAMP_NTZ_FORMATTER = new DateTimeFormatterBuilder() .append(DateTimeFormatter.ISO_LOCAL_DATE) .appendLiteral('T') - .appendValue(HOUR_OF_DAY, 2) - .appendLiteral(':') - .appendValue(MINUTE_OF_HOUR, 2) - .appendLiteral(':') - .appendValue(SECOND_OF_MINUTE, 2) + .appendPattern("HH:mm:ss") .appendFraction(MICRO_OF_SECOND, 6, 6, true) .toFormatter(Locale.US); @@ -339,13 +335,10 @@ private static void appendQuoted(StringBuilder sb, String str) { new DateTimeFormatterBuilder() .append(DateTimeFormatter.ISO_LOCAL_DATE) .appendLiteral('T') - .appendValue(HOUR_OF_DAY, 2) - .appendLiteral(':') - .appendValue(MINUTE_OF_HOUR, 2) - .appendLiteral(':') - .appendValue(SECOND_OF_MINUTE, 2) + .appendPattern("HH:mm:ss") 
.optionalStart() .appendFraction(MICRO_OF_SECOND, 0, 6, true) + .optionalEnd() .toFormatter(Locale.US); /** The format for a timestamp with time zone, truncating trailing microsecond zeros. */ @@ -354,8 +347,8 @@ private static void appendQuoted(StringBuilder sb, String str) { .appendOffset("+HH:MM", "+00:00") .toFormatter(Locale.US); - private static Instant microsToInstant(long timestamp) { - return Instant.EPOCH.plus(timestamp, ChronoUnit.MICROS); + private static Instant microsToInstant(long microsSinceEpoch) { + return Instant.EPOCH.plus(microsSinceEpoch, ChronoUnit.MICROS); } private static void toJsonImpl(byte[] value, byte[] metadata, int pos, StringBuilder sb, From 599773287c3ce96de9809497c9969c4dcf98948f Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Tue, 7 Jan 2025 13:47:07 -0800 Subject: [PATCH 05/20] cleanup comment --- .../src/main/java/org/apache/parquet/variant/Variant.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index d88c38f0df..acb635119f 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -301,8 +301,6 @@ private static String escapeJson(String str) { } } - // A simplified and more performant version of `sb.append(escapeJson(str))`. It is used when we - // know `str` doesn't contain any special character that needs escaping. /** * Appends a quoted string to a StringBuilder. It is used when we know `str` doesn't contain any * special characters that needs escaping. 
This is more performant than From de96bac13747f6934fd6079f86751b75324f5e91 Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Tue, 21 Jan 2025 09:09:39 -0800 Subject: [PATCH 06/20] Run mvn spotless:apply --- .../variant/MalformedVariantException.java | 3 +- .../org/apache/parquet/variant/Variant.java | 67 +++--- .../parquet/variant/VariantBuilder.java | 24 +- .../variant/VariantSizeLimitException.java | 3 +- .../apache/parquet/variant/VariantUtil.java | 25 ++- .../parquet/variant/TestVariantEncoding.java | 209 +++++++++--------- 6 files changed, 167 insertions(+), 164 deletions(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java index e9bff469d2..95e9925f40 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java @@ -19,5 +19,4 @@ /** * An exception indicating that the Variant is malformed. 
*/ -public class MalformedVariantException extends RuntimeException { -} +public class MalformedVariantException extends RuntimeException {} diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index acb635119f..cb7b9f642f 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -18,9 +18,11 @@ */ package org.apache.parquet.variant; +import static java.time.temporal.ChronoField.*; +import static org.apache.parquet.variant.VariantUtil.*; + import com.fasterxml.jackson.core.JsonFactory; import com.fasterxml.jackson.core.JsonGenerator; - import java.io.CharArrayWriter; import java.io.IOException; import java.math.BigDecimal; @@ -35,10 +37,6 @@ import java.util.Base64; import java.util.Locale; -import static java.time.temporal.ChronoField.*; -import static java.time.temporal.ChronoField.SECOND_OF_MINUTE; -import static org.apache.parquet.variant.VariantUtil.*; - /** * This Variant class holds the Variant-encoded value and metadata binary values. */ @@ -143,8 +141,7 @@ public Type getType() { * @return the number of object fields in the variant. `getType()` must be `Type.OBJECT`. */ public int objectSize() { - return handleObject(value, pos, - (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> size); + return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> size); } // Find the field value whose key is equal to `key`. Return null if the key is not found. 
@@ -292,7 +289,7 @@ public String toJson(ZoneId zoneId, boolean truncateTrailingZeros) { */ private static String escapeJson(String str) { try (CharArrayWriter writer = new CharArrayWriter(); - JsonGenerator gen = new JsonFactory().createGenerator(writer)) { + JsonGenerator gen = new JsonFactory().createGenerator(writer)) { gen.writeString(str); gen.flush(); return writer.toString(); @@ -329,15 +326,14 @@ private static void appendQuoted(StringBuilder sb, String str) { .toFormatter(Locale.US); /** The format for a timestamp without time zone, truncating trailing microsecond zeros. */ - private static final DateTimeFormatter TIMESTAMP_NTZ_TRUNC_FORMATTER = - new DateTimeFormatterBuilder() - .append(DateTimeFormatter.ISO_LOCAL_DATE) - .appendLiteral('T') - .appendPattern("HH:mm:ss") - .optionalStart() - .appendFraction(MICRO_OF_SECOND, 0, 6, true) - .optionalEnd() - .toFormatter(Locale.US); + private static final DateTimeFormatter TIMESTAMP_NTZ_TRUNC_FORMATTER = new DateTimeFormatterBuilder() + .append(DateTimeFormatter.ISO_LOCAL_DATE) + .appendLiteral('T') + .appendPattern("HH:mm:ss") + .optionalStart() + .appendFraction(MICRO_OF_SECOND, 0, 6, true) + .optionalEnd() + .toFormatter(Locale.US); /** The format for a timestamp with time zone, truncating trailing microsecond zeros. 
*/ private static final DateTimeFormatter TIMESTAMP_TRUNC_FORMATTER = new DateTimeFormatterBuilder() @@ -349,8 +345,8 @@ private static Instant microsToInstant(long microsSinceEpoch) { return Instant.EPOCH.plus(microsSinceEpoch, ChronoUnit.MICROS); } - private static void toJsonImpl(byte[] value, byte[] metadata, int pos, StringBuilder sb, - ZoneId zoneId, boolean truncateTrailingZeros) { + private static void toJsonImpl( + byte[] value, byte[] metadata, int pos, StringBuilder sb, ZoneId zoneId, boolean truncateTrailingZeros) { switch (VariantUtil.getType(value, pos)) { case OBJECT: handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { @@ -398,30 +394,43 @@ private static void toJsonImpl(byte[] value, byte[] metadata, int pos, StringBui break; case DECIMAL: if (truncateTrailingZeros) { - sb.append(VariantUtil.getDecimal(value, pos).stripTrailingZeros().toPlainString()); + sb.append(VariantUtil.getDecimal(value, pos) + .stripTrailingZeros() + .toPlainString()); } else { sb.append(VariantUtil.getDecimal(value, pos).toPlainString()); } break; case DATE: - appendQuoted(sb, LocalDate.ofEpochDay((int) VariantUtil.getLong(value, pos)).toString()); + appendQuoted( + sb, + LocalDate.ofEpochDay((int) VariantUtil.getLong(value, pos)) + .toString()); break; case TIMESTAMP: if (truncateTrailingZeros) { - appendQuoted(sb, TIMESTAMP_TRUNC_FORMATTER.format( - microsToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId))); + appendQuoted( + sb, + TIMESTAMP_TRUNC_FORMATTER.format(microsToInstant(VariantUtil.getLong(value, pos)) + .atZone(zoneId))); } else { - appendQuoted(sb, TIMESTAMP_FORMATTER.format( - microsToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId))); + appendQuoted( + sb, + TIMESTAMP_FORMATTER.format(microsToInstant(VariantUtil.getLong(value, pos)) + .atZone(zoneId))); } break; case TIMESTAMP_NTZ: if (truncateTrailingZeros) { - appendQuoted(sb, TIMESTAMP_NTZ_TRUNC_FORMATTER.format( - 
microsToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC))); + appendQuoted( + sb, + TIMESTAMP_NTZ_TRUNC_FORMATTER.format(microsToInstant(VariantUtil.getLong(value, pos)) + .atZone(ZoneOffset.UTC))); } else { - appendQuoted(sb, TIMESTAMP_NTZ_FORMATTER.format( - microsToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC))); + appendQuoted( + sb, + TIMESTAMP_NTZ_FORMATTER.format(microsToInstant(VariantUtil.getLong(value, pos)) + .atZone(ZoneOffset.UTC))); } break; case FLOAT: diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java index 574c8fdbde..e6ef7f34ba 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java @@ -16,19 +16,18 @@ */ package org.apache.parquet.variant; -import java.io.IOException; -import java.math.BigDecimal; -import java.math.BigInteger; -import java.nio.charset.StandardCharsets; -import java.util.*; +import static org.apache.parquet.variant.VariantUtil.*; import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.core.JsonToken; import com.fasterxml.jackson.core.exc.InputCoercionException; - -import static org.apache.parquet.variant.VariantUtil.*; +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.util.*; /** * Builder for creating Variant value and metadata. 
@@ -80,8 +79,7 @@ public static Variant parseJson(String json, VariantBuilder builder) throws IOEx * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed * the size limit */ - public static Variant parseJson(JsonParser parser, VariantBuilder builder) - throws IOException { + public static Variant parseJson(JsonParser parser, VariantBuilder builder) throws IOException { builder.buildFromJsonParser(parser); return builder.result(); } @@ -106,7 +104,7 @@ public Variant result() { if (maxSize > sizeLimitBytes) { throw new VariantSizeLimitException(); } - int offsetSize = getMinIntegerSize((int)maxSize); + int offsetSize = getMinIntegerSize((int) maxSize); int offsetStart = 1 + offsetSize; int stringStart = offsetStart + (numKeys + 1) * offsetSize; @@ -346,8 +344,7 @@ public void finishWritingObject(int start, ArrayList fields) { for (int i = 0; i < size; ++i) { int oldOffset = fields.get(i).offset; int fieldSize = VariantUtil.valueSize(writeBuffer, start + oldOffset); - System.arraycopy(writeBuffer, start + oldOffset, - writeBuffer, start + currentOffset, fieldSize); + System.arraycopy(writeBuffer, start + oldOffset, writeBuffer, start + currentOffset, fieldSize); fields.set(i, fields.get(i).withNewOffset(currentOffset)); currentOffset += fieldSize; } @@ -620,6 +617,7 @@ private boolean tryParseDecimal(String input) { /** The buffer for building the Variant value. The first `writePos` bytes have been written. */ private byte[] writeBuffer = new byte[128]; + private int writePos = 0; /** The dictionary for mapping keys to monotonically increasing ids. 
*/ private final HashMap dictionary = new HashMap<>(); diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java index 08556e762e..29722d21d2 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java @@ -20,5 +20,4 @@ * An exception indicating that the metadata or data size of the Variant exceeds the * configured size limit. */ -public class VariantSizeLimitException extends RuntimeException { -} +public class VariantSizeLimitException extends RuntimeException {} diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java index aeebfe67e1..f61c684f54 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java @@ -23,12 +23,12 @@ /** * This class defines constants related to the Variant format and provides functions for * manipulating Variant binaries. - + * * A Variant is made up of 2 binaries: value and metadata. A Variant value consists of a one-byte * header and a number of content bytes (can be zero). The header byte is divided into upper 6 bits * (called "type info") and lower 2 bits (called "basic type"). The content format is explained in * the below constants for all possible basic type and type info values. - + * * The Variant metadata includes a version id and a dictionary of distinct strings (case-sensitive). * Its binary format is: * - Version: 1-byte unsigned integer. The only acceptable value is 1 currently. 
@@ -184,14 +184,14 @@ public static byte shortStrHeader(int size) { } public static byte objectHeader(boolean largeSize, int idSize, int offsetSize) { - return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 4)) | - ((idSize - 1) << (BASIC_TYPE_BITS + 2)) | - ((offsetSize - 1) << BASIC_TYPE_BITS) | OBJECT); + return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 4)) + | ((idSize - 1) << (BASIC_TYPE_BITS + 2)) + | ((offsetSize - 1) << BASIC_TYPE_BITS) + | OBJECT); } public static byte arrayHeader(boolean largeSize, int offsetSize) { - return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 2)) | - ((offsetSize - 1) << BASIC_TYPE_BITS) | ARRAY); + return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 2)) | ((offsetSize - 1) << BASIC_TYPE_BITS) | ARRAY); } public static MalformedVariantException malformedVariant() { @@ -348,12 +348,17 @@ public static int valueSize(byte[] value, int pos) { case SHORT_STR: return 1 + typeInfo; case OBJECT: - return handleObject(value, pos, + return handleObject( + value, + pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> dataStart - pos + readUnsigned(value, offsetStart + size * offsetSize, offsetSize)); case ARRAY: - return handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> - dataStart - pos + readUnsigned(value, offsetStart + size * offsetSize, offsetSize)); + return handleArray( + value, + pos, + (size, offsetSize, offsetStart, dataStart) -> + dataStart - pos + readUnsigned(value, offsetStart + size * offsetSize, offsetSize)); default: switch (typeInfo) { case NULL: diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java index 0a8740e3c0..09434732bb 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java @@ -18,6 +18,8 @@ */ package 
org.apache.parquet.variant; +import com.fasterxml.jackson.core.*; +import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.math.BigDecimal; import java.security.SecureRandom; @@ -30,18 +32,14 @@ import java.util.List; import java.util.concurrent.TimeUnit; import java.util.stream.IntStream; -import com.fasterxml.jackson.core.*; -import com.fasterxml.jackson.databind.ObjectMapper; import org.junit.Assert; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - public class TestVariantEncoding { private static final Logger LOG = LoggerFactory.getLogger(TestVariantEncoding.class); - private static final String RANDOM_CHARS = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + private static final String RANDOM_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; private static final List SAMPLE_JSON_VALUES = Arrays.asList( "null", "true", @@ -54,8 +52,7 @@ public class TestVariantEncoding { "-9876.543", "234.456789", "{\"a\": 1, \"b\": {\"e\": -4, \"f\": 5.5}, \"c\": true}", - "[1, -2, 4.5, -6.7, \"str\", true]" - ); + "[1, -2, 4.5, -6.7, \"str\", true]"); /** Random number generator for generating random strings */ private static SecureRandom random = new SecureRandom(); @@ -67,8 +64,7 @@ private void checkJson(String jsonValue) { StreamReadConstraints.overrideDefaultStreamReadConstraints( StreamReadConstraints.builder().maxNestingDepth(100000).build()); Variant v = VariantBuilder.parseJson(jsonValue); - Assert.assertEquals(mapper.readTree(jsonValue), - mapper.readTree(v.toJson(ZoneId.systemDefault()))); + Assert.assertEquals(mapper.readTree(jsonValue), mapper.readTree(v.toJson(ZoneId.systemDefault()))); } catch (IOException e) { Assert.fail("Failed to parse json: " + jsonValue + " " + e); } @@ -80,7 +76,7 @@ private void checkType(Variant v, int expectedBasicType, int expectedTypeInfo) { } private long microsSinceEpoch(Instant instant) { - return 
TimeUnit.SECONDS.toMicros(instant.getEpochSecond()) + instant.getNano() / 1000; + return TimeUnit.SECONDS.toMicros(instant.getEpochSecond()) + instant.getNano() / 1000; } private String randomString(int len) { @@ -104,37 +100,39 @@ public void testBooleanJson() { @Test public void testIntegerJson() { Arrays.asList( - "0", - Byte.toString(Byte.MIN_VALUE), Byte.toString(Byte.MAX_VALUE), - Short.toString(Short.MIN_VALUE), Short.toString(Short.MAX_VALUE), - Integer.toString(Integer.MIN_VALUE), Integer.toString(Integer.MAX_VALUE), - Long.toString(Long.MIN_VALUE), Long.toString(Long.MAX_VALUE) - ).forEach(this::checkJson); + "0", + Byte.toString(Byte.MIN_VALUE), + Byte.toString(Byte.MAX_VALUE), + Short.toString(Short.MIN_VALUE), + Short.toString(Short.MAX_VALUE), + Integer.toString(Integer.MIN_VALUE), + Integer.toString(Integer.MAX_VALUE), + Long.toString(Long.MIN_VALUE), + Long.toString(Long.MAX_VALUE)) + .forEach(this::checkJson); } @Test public void testFloatJson() { Arrays.asList( - Float.toString(Float.MIN_VALUE), Float.toString(Float.MAX_VALUE), - Double.toString(Double.MIN_VALUE), Double.toString(Double.MAX_VALUE) - ).forEach(this::checkJson); + Float.toString(Float.MIN_VALUE), Float.toString(Float.MAX_VALUE), + Double.toString(Double.MIN_VALUE), Double.toString(Double.MAX_VALUE)) + .forEach(this::checkJson); } @Test public void testStringJson() { - Arrays.asList( - "\"short string\"", - "\"long string: " + new String(new char[1000]).replace("\0", "x") + "\"" - ).forEach(this::checkJson); + Arrays.asList("\"short string\"", "\"long string: " + new String(new char[1000]).replace("\0", "x") + "\"") + .forEach(this::checkJson); } @Test public void testDecimalJson() { Arrays.asList( - "12.34", "-43.21", - "10.2147483647", "-1021474836.47", - "109223372036854775.807", "-109.223372036854775807" - ).forEach(this::checkJson); + "12.34", "-43.21", + "10.2147483647", "-1021474836.47", + "109223372036854775.807", "-109.223372036854775807") + .forEach(this::checkJson); } 
@Test @@ -146,7 +144,7 @@ public void testNullBuilder() { @Test public void testBooleanBuilder() { - Arrays.asList(true, false).forEach( b -> { + Arrays.asList(true, false).forEach(b -> { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendBoolean(b); checkType(vb2.result(), VariantUtil.PRIMITIVE, b ? VariantUtil.TRUE : VariantUtil.FALSE); @@ -156,31 +154,35 @@ public void testBooleanBuilder() { @Test public void testIntegerBuilder() { Arrays.asList( - 0L, - (long)Byte.MIN_VALUE, (long)Byte.MAX_VALUE, - (long)Short.MIN_VALUE, (long)Short.MAX_VALUE, - (long)Integer.MIN_VALUE, (long)Integer.MAX_VALUE, - Long.MIN_VALUE, Long.MAX_VALUE - ).forEach( l -> { - VariantBuilder vb2 = new VariantBuilder(false); - vb2.appendLong(l); - Variant v = vb2.result(); - if (Byte.MIN_VALUE <= l && l <= Byte.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT1); - } else if (Short.MIN_VALUE <= l && l <= Short.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT2); - } else if (Integer.MIN_VALUE <= l && l <= Integer.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT4); - } else { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT8); - } - Assert.assertEquals((long)l, v.getLong()); - }); + 0L, + (long) Byte.MIN_VALUE, + (long) Byte.MAX_VALUE, + (long) Short.MIN_VALUE, + (long) Short.MAX_VALUE, + (long) Integer.MIN_VALUE, + (long) Integer.MAX_VALUE, + Long.MIN_VALUE, + Long.MAX_VALUE) + .forEach(l -> { + VariantBuilder vb2 = new VariantBuilder(false); + vb2.appendLong(l); + Variant v = vb2.result(); + if (Byte.MIN_VALUE <= l && l <= Byte.MAX_VALUE) { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT1); + } else if (Short.MIN_VALUE <= l && l <= Short.MAX_VALUE) { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT2); + } else if (Integer.MIN_VALUE <= l && l <= Integer.MAX_VALUE) { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT4); + } else { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT8); + } + 
Assert.assertEquals((long) l, v.getLong()); + }); } @Test public void testFloatBuilder() { - Arrays.asList(Float.MIN_VALUE, Float.MAX_VALUE).forEach( f -> { + Arrays.asList(Float.MIN_VALUE, Float.MAX_VALUE).forEach(f -> { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendFloat(f); Variant v = vb2.result(); @@ -191,7 +193,7 @@ public void testFloatBuilder() { @Test public void testDoubleBuilder() { - Arrays.asList(Double.MIN_VALUE, Double.MAX_VALUE).forEach( d -> { + Arrays.asList(Double.MIN_VALUE, Double.MAX_VALUE).forEach(d -> { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendDouble(d); Variant v = vb2.result(); @@ -202,25 +204,25 @@ public void testDoubleBuilder() { @Test public void testStringBuilder() { - IntStream.range(VariantUtil.MAX_SHORT_STR_SIZE - 3, - VariantUtil.MAX_SHORT_STR_SIZE + 3).forEach( len -> { - VariantBuilder vb2 = new VariantBuilder(false); - String s = randomString(len); - vb2.appendString(s); - Variant v = vb2.result(); - if (len <= VariantUtil.MAX_SHORT_STR_SIZE) { - checkType(v, VariantUtil.SHORT_STR, len); - } else { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.LONG_STR); - } - Assert.assertEquals(s, v.getString()); - }); + IntStream.range(VariantUtil.MAX_SHORT_STR_SIZE - 3, VariantUtil.MAX_SHORT_STR_SIZE + 3) + .forEach(len -> { + VariantBuilder vb2 = new VariantBuilder(false); + String s = randomString(len); + vb2.appendString(s); + Variant v = vb2.result(); + if (len <= VariantUtil.MAX_SHORT_STR_SIZE) { + checkType(v, VariantUtil.SHORT_STR, len); + } else { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.LONG_STR); + } + Assert.assertEquals(s, v.getString()); + }); } @Test public void testDecimalBuilder() { // decimal4 - Arrays.asList(new BigDecimal("123.456"), new BigDecimal("-987.654")).forEach( d -> { + Arrays.asList(new BigDecimal("123.456"), new BigDecimal("-987.654")).forEach(d -> { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendDecimal(d); Variant v = vb2.result(); @@ -229,28 +231,24 @@ 
public void testDecimalBuilder() { }); // decimal8 - Arrays.asList( - new BigDecimal("10.2147483647"), - new BigDecimal("-1021474836.47") - ).forEach( d -> { - VariantBuilder vb2 = new VariantBuilder(false); - vb2.appendDecimal(d); - Variant v = vb2.result(); - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DECIMAL8); - Assert.assertEquals(d, v.getDecimal()); - }); + Arrays.asList(new BigDecimal("10.2147483647"), new BigDecimal("-1021474836.47")) + .forEach(d -> { + VariantBuilder vb2 = new VariantBuilder(false); + vb2.appendDecimal(d); + Variant v = vb2.result(); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DECIMAL8); + Assert.assertEquals(d, v.getDecimal()); + }); // decimal16 - Arrays.asList( - new BigDecimal("109223372036854775.807"), - new BigDecimal("-109.223372036854775807") - ).forEach( d -> { - VariantBuilder vb2 = new VariantBuilder(false); - vb2.appendDecimal(d); - Variant v = vb2.result(); - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DECIMAL16); - Assert.assertEquals(d, v.getDecimal()); - }); + Arrays.asList(new BigDecimal("109223372036854775.807"), new BigDecimal("-109.223372036854775807")) + .forEach(d -> { + VariantBuilder vb2 = new VariantBuilder(false); + vb2.appendDecimal(d); + Variant v = vb2.result(); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DECIMAL16); + Assert.assertEquals(d, v.getDecimal()); + }); } @Test @@ -267,10 +265,8 @@ public void testTimestamp() { VariantBuilder vb = new VariantBuilder(false); long micros = microsSinceEpoch(Instant.parse("2024-12-16T10:23:45.321456-08:00")); vb.appendTimestamp(micros); - Assert.assertEquals("\"2024-12-16T10:23:45.321456-08:00\"", - vb.result().toJson(ZoneId.of("-08:00"))); - Assert.assertEquals("\"2024-12-16T19:23:45.321456+01:00\"", - vb.result().toJson(ZoneId.of("+01:00"))); + Assert.assertEquals("\"2024-12-16T10:23:45.321456-08:00\"", vb.result().toJson(ZoneId.of("-08:00"))); + Assert.assertEquals("\"2024-12-16T19:23:45.321456+01:00\"", vb.result().toJson(ZoneId.of("+01:00"))); 
Assert.assertEquals(micros, vb.result().getLong()); } @@ -280,10 +276,8 @@ public void testTimestampNtz() { VariantBuilder vb = new VariantBuilder(false); long micros = microsSinceEpoch(Instant.from(dtf.parse("2024-01-01T23:00:00.000001Z"))); vb.appendTimestampNtz(micros); - Assert.assertEquals("\"2024-01-01T23:00:00.000001\"", - vb.result().toJson(ZoneId.of("-08:00"))); - Assert.assertEquals(vb.result().toJson(ZoneId.of("-08:00")), - vb.result().toJson(ZoneId.of("+02:00"))); + Assert.assertEquals("\"2024-01-01T23:00:00.000001\"", vb.result().toJson(ZoneId.of("-08:00"))); + Assert.assertEquals(vb.result().toJson(ZoneId.of("-08:00")), vb.result().toJson(ZoneId.of("+02:00"))); Assert.assertEquals(micros, vb.result().getLong()); } @@ -292,7 +286,8 @@ public void testBinary() { VariantBuilder vb = new VariantBuilder(false); byte[] binary = new byte[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; vb.appendBinary(binary); - Assert.assertEquals("\"" + Base64.getEncoder().encodeToString(binary) + "\"", + Assert.assertEquals( + "\"" + Base64.getEncoder().encodeToString(binary) + "\"", vb.result().toJson(ZoneId.systemDefault())); Assert.assertArrayEquals(binary, vb.result().getBinary()); } @@ -314,8 +309,7 @@ public void testObject() { sb.append("{"); for (int i = 0; i < 50000; i++) { if (i > 0) sb.append(", "); - sb.append("\"field" + i + "\": ") - .append(SAMPLE_JSON_VALUES.get(i % SAMPLE_JSON_VALUES.size())); + sb.append("\"field" + i + "\": ").append(SAMPLE_JSON_VALUES.get(i % SAMPLE_JSON_VALUES.size())); } sb.append("}"); checkJson(sb.toString()); @@ -366,8 +360,7 @@ public void testSizeLimit() { // large metadata size try { VariantBuilder.parseJson( - "{\"12345678901234567890\": 1, \"123456789012345678901\": 2}", - new VariantBuilder(false, 20)); + "{\"12345678901234567890\": 1, \"123456789012345678901\": 2}", new VariantBuilder(false, 20)); Assert.fail("Expected VariantSizeLimitException with large metadata"); } catch (IOException e) { Assert.fail("Expected 
VariantSizeLimitException with large metadata"); @@ -407,8 +400,8 @@ public void testAllowDuplicateKeys() { // allow duplicate keys try { - Variant v = VariantBuilder.parseJson("{\"a\": 1, \"a\": 2}", - new VariantBuilder(true, VariantUtil.DEFAULT_SIZE_LIMIT)); + Variant v = VariantBuilder.parseJson( + "{\"a\": 1, \"a\": 2}", new VariantBuilder(true, VariantUtil.DEFAULT_SIZE_LIMIT)); Assert.assertEquals(1, v.objectSize()); Assert.assertEquals(VariantUtil.Type.LONG, v.getFieldByKey("a").getType()); Assert.assertEquals(2, v.getFieldByKey("a").getLong()); @@ -422,25 +415,25 @@ public void testTruncateTrailingZeroDecimal() { for (String[] strings : Arrays.asList( // decimal4 // truncate all trailing zeros - new String[]{"1234.0000", "1234"}, + new String[] {"1234.0000", "1234"}, // truncate some trailing zeros - new String[]{"1234.5600", "1234.56"}, + new String[] {"1234.5600", "1234.56"}, // truncate no trailing zeros - new String[]{"1234.5678", "1234.5678"}, + new String[] {"1234.5678", "1234.5678"}, // decimal8 // truncate all trailing zeros - new String[]{"-10.0000000000", "-10"}, + new String[] {"-10.0000000000", "-10"}, // truncate some trailing zeros - new String[]{"-10.2147000000", "-10.2147"}, + new String[] {"-10.2147000000", "-10.2147"}, // truncate no trailing zeros - new String[]{"-10.2147483647", "-10.2147483647"}, + new String[] {"-10.2147483647", "-10.2147483647"}, // decimal16 // truncate all trailing zeros - new String[]{"1092233720368547.00000", "1092233720368547"}, + new String[] {"1092233720368547.00000", "1092233720368547"}, // truncate some trailing zeros - new String[]{"1092233720368547.75800", "1092233720368547.758"}, + new String[] {"1092233720368547.75800", "1092233720368547.758"}, // truncate no trailing zeros - new String[]{"1092233720368547.75807", "1092233720368547.75807"})) { + new String[] {"1092233720368547.75807", "1092233720368547.75807"})) { VariantBuilder vb = new VariantBuilder(false); BigDecimal d = new BigDecimal(strings[0]); 
vb.appendDecimal(d); From 848ddcb5c36525d7f4911b910e624baf42379ef1 Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Tue, 21 Jan 2025 11:34:11 -0800 Subject: [PATCH 07/20] Fix dependencies --- parquet-variant/pom.xml | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/parquet-variant/pom.xml b/parquet-variant/pom.xml index 6bfc2ff525..9fc9d97266 100644 --- a/parquet-variant/pom.xml +++ b/parquet-variant/pom.xml @@ -53,22 +53,11 @@ ${jackson-databind.version} test - - com.google.guava - guava - ${guava.version} - test - - - org.slf4j - slf4j-log4j12 - ${slf4j.version} - test - org.slf4j slf4j-api ${slf4j.version} + test From 1a448ea1742e6b42bfef7f9e6cdf87cb68f65e78 Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Tue, 21 Jan 2025 13:45:22 -0800 Subject: [PATCH 08/20] Fix tests for older jdk versions --- .../apache/parquet/variant/TestVariantEncoding.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java index 09434732bb..6b78b8295d 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java @@ -262,8 +262,9 @@ public void testDate() { @Test public void testTimestamp() { + DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME; VariantBuilder vb = new VariantBuilder(false); - long micros = microsSinceEpoch(Instant.parse("2024-12-16T10:23:45.321456-08:00")); + long micros = microsSinceEpoch(Instant.from(dtf.parse("2024-12-16T10:23:45.321456-08:00"))); vb.appendTimestamp(micros); Assert.assertEquals("\"2024-12-16T10:23:45.321456-08:00\"", vb.result().toJson(ZoneId.of("-08:00"))); Assert.assertEquals("\"2024-12-16T19:23:45.321456+01:00\"", vb.result().toJson(ZoneId.of("+01:00"))); @@ -317,7 +318,7 @@ public void testObject() { // deep 
object sb = new StringBuilder(); // Jackson object mapper hit a stack overflow if json is too deep - for (int i = 0; i < 1000; i++) { + for (int i = 0; i < 500; i++) { sb.append("{").append("\"field" + i + "\": "); } sb.append("{"); @@ -326,7 +327,7 @@ public void testObject() { sb.append("\"field" + i + "\": ").append(SAMPLE_JSON_VALUES.get(i)); } sb.append("}"); - for (int i = 0; i < 1000; i++) { + for (int i = 0; i < 500; i++) { sb.append("}"); } checkJson(sb.toString()); @@ -445,6 +446,7 @@ public void testTruncateTrailingZeroDecimal() { @Test public void testTruncateTrailingZeroTimestamp() { + DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME; for (String[] strings : Arrays.asList( // truncate all trailing zeros new String[] {"2024-12-16T10:23:45.000000-08:00", "2024-12-16T10:23:45-08:00"}, @@ -453,7 +455,7 @@ public void testTruncateTrailingZeroTimestamp() { // truncate no trailing zeros new String[] {"2024-12-16T10:23:45.123456-08:00", "2024-12-16T10:23:45.123456-08:00"})) { VariantBuilder vb = new VariantBuilder(false); - long micros = microsSinceEpoch(Instant.parse(strings[0])); + long micros = microsSinceEpoch(Instant.from(dtf.parse(strings[0]))); vb.appendTimestamp(micros); Variant v = vb.result(); Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); From 205629735d1eddce74e0485448925ee0e5700282 Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Wed, 5 Feb 2025 15:05:28 -0800 Subject: [PATCH 09/20] Address PR comments --- .../variant/MalformedVariantException.java | 10 +- .../org/apache/parquet/variant/Variant.java | 254 +++++++++--------- .../parquet/variant/VariantBuilder.java | 217 +++++++-------- .../variant/VariantSizeLimitException.java | 8 +- .../apache/parquet/variant/VariantUtil.java | 34 ++- .../parquet/variant/TestVariantEncoding.java | 60 ++++- 6 files changed, 330 insertions(+), 253 deletions(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java 
b/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java index 95e9925f40..ffbd0786b7 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java @@ -19,4 +19,12 @@ /** * An exception indicating that the Variant is malformed. */ -public class MalformedVariantException extends RuntimeException {} +public class MalformedVariantException extends RuntimeException { + public MalformedVariantException() { + super(); + } + + public MalformedVariantException(String message) { + super(message); + } +} diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index cb7b9f642f..75aa73869d 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -18,9 +18,6 @@ */ package org.apache.parquet.variant; -import static java.time.temporal.ChronoField.*; -import static org.apache.parquet.variant.VariantUtil.*; - import com.fasterxml.jackson.core.JsonFactory; import com.fasterxml.jackson.core.JsonGenerator; import java.io.CharArrayWriter; @@ -32,6 +29,7 @@ import java.time.ZoneOffset; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatterBuilder; +import java.time.temporal.ChronoField; import java.time.temporal.ChronoUnit; import java.util.Arrays; import java.util.Base64; @@ -49,6 +47,14 @@ public final class Variant { */ final int pos; + /** + * The threshold to switch from linear search to binary search when looking up a field by key in + * an object. This is a performance optimization to avoid the overhead of binary search for a + * short list. 
+ */ + static final int BINARY_SEARCH_THRESHOLD = 32; + static final ZoneId UTC = ZoneId.of("UTC"); + public Variant(byte[] value, byte[] metadata) { this(value, metadata, 0); } @@ -58,15 +64,19 @@ public Variant(byte[] value, byte[] metadata) { this.metadata = metadata; this.pos = pos; // There is currently only one allowed version. - if (metadata.length < 1 || (metadata[0] & VERSION_MASK) != VERSION) { - throw malformedVariant(); + if (metadata.length < 1 || (metadata[0] & VariantUtil.VERSION_MASK) != VariantUtil.VERSION) { + throw VariantUtil.malformedVariant(String.format( + "Unsupported variant metadata version: %02X", metadata[0] & VariantUtil.VERSION_MASK)); } } public byte[] getValue() { - if (pos == 0) return value; - int size = valueSize(value, pos); - checkIndex(pos + size - 1, value.length); + if (pos == 0) { + // Position 0 means the entire value is used. Return the original value. + return value; + } + int size = VariantUtil.valueSize(value, pos); + VariantUtil.checkIndex(pos + size - 1, value.length); return Arrays.copyOfRange(value, pos, pos + size); } @@ -124,16 +134,16 @@ public String getString() { } /** - * @return the type info bits from a variant value + * @return the primitive type id from a variant value */ - public int getTypeInfo() { - return VariantUtil.getTypeInfo(value, pos); + public int getPrimitiveTypeId() { + return VariantUtil.getPrimitiveTypeId(value, pos); } /** * @return the type of the variant value */ - public Type getType() { + public VariantUtil.Type getType() { return VariantUtil.getType(value, pos); } @@ -141,12 +151,10 @@ public Type getType() { * @return the number of object fields in the variant. `getType()` must be `Type.OBJECT`. 
*/ public int objectSize() { - return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> size); + return VariantUtil.handleObject( + value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> size); } - // Find the field value whose key is equal to `key`. Return null if the key is not found. - // It is only legal to call it when `getType()` is `Type.OBJECT`. - /** * Returns the object field Variant value whose key is equal to `key`. * Return null if the key is not found. `getType()` must be `Type.OBJECT`. @@ -154,16 +162,15 @@ public int objectSize() { * @return the field value whose key is equal to `key`, or null if key is not found */ public Variant getFieldByKey(String key) { - return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + return VariantUtil.handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { // Use linear search for a short list. Switch to binary search when the length reaches // `BINARY_SEARCH_THRESHOLD`. - final int BINARY_SEARCH_THRESHOLD = 32; if (size < BINARY_SEARCH_THRESHOLD) { for (int i = 0; i < size; ++i) { - int id = readUnsigned(value, idStart + idSize * i, idSize); - if (key.equals(getMetadataKey(metadata, id))) { - int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); - return new Variant(value, metadata, dataStart + offset); + ObjectField field = + getFieldAtIndex(i, value, metadata, idSize, offsetSize, idStart, offsetStart, dataStart); + if (field.key.equals(key)) { + return field.value; } } } else { @@ -174,15 +181,15 @@ public Variant getFieldByKey(String key) { // performance optimization, because it can properly handle the case where `low + high` // overflows int. 
int mid = (low + high) >>> 1; - int id = readUnsigned(value, idStart + idSize * mid, idSize); - int cmp = getMetadataKey(metadata, id).compareTo(key); + ObjectField field = + getFieldAtIndex(mid, value, metadata, idSize, offsetSize, idStart, offsetStart, dataStart); + int cmp = field.key.compareTo(key); if (cmp < 0) { low = mid + 1; } else if (cmp > 0) { high = mid - 1; } else { - int offset = readUnsigned(value, offsetStart + offsetSize * mid, offsetSize); - return new Variant(value, metadata, dataStart + offset); + return field.value; } } } @@ -203,26 +210,37 @@ public ObjectField(String key, Variant value) { } } - // Get the object field at the `index` slot. Return null if `index` is out of the bound of - // `[0, objectSize())`. - // It is only legal to call it when `getType()` is `Type.OBJECT`. /** - * Returns the object field at the `index` slot. Return null if `index` is out of the bound of + * Returns the ObjectField at the `index` slot. Return null if `index` is out of the bound of * `[0, objectSize())`. `getType()` must be `Type.OBJECT`. 
* @param index the index of the object field to get * @return the Objectfield at the `index` slot, or null if `index` is out of bounds */ public ObjectField getFieldAtIndex(int index) { - return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { - if (index < 0 || index >= size) return null; - int id = readUnsigned(value, idStart + idSize * index, idSize); - int offset = readUnsigned(value, offsetStart + offsetSize * index, offsetSize); - String key = getMetadataKey(metadata, id); - Variant v = new Variant(value, metadata, dataStart + offset); - return new ObjectField(key, v); + return VariantUtil.handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + if (index < 0 || index >= size) { + return null; + } + return getFieldAtIndex(index, value, metadata, idSize, offsetSize, idStart, offsetStart, dataStart); }); } + private static ObjectField getFieldAtIndex( + int index, + byte[] value, + byte[] metadata, + int idSize, + int offsetSize, + int idStart, + int offsetStart, + int dataStart) { + int id = VariantUtil.readUnsigned(value, idStart + idSize * index, idSize); + int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize * index, offsetSize); + String key = VariantUtil.getMetadataKey(metadata, id); + Variant v = new Variant(value, metadata, dataStart + offset); + return new ObjectField(key, v); + } + /** * Returns the dictionary ID for the object field at the `index` slot. * `getType()` must be `Type.OBJECT`. 
@@ -231,11 +249,11 @@ public ObjectField getFieldAtIndex(int index) { * @throws MalformedVariantException if `index` is out of bounds */ public int getDictionaryIdAtIndex(int index) { - return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + return VariantUtil.handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { if (index < 0 || index >= size) { - throw malformedVariant(); + throw VariantUtil.malformedVariant(); } - return readUnsigned(value, idStart + idSize * index, idSize); + return VariantUtil.readUnsigned(value, idStart + idSize * index, idSize); }); } @@ -243,7 +261,7 @@ public int getDictionaryIdAtIndex(int index) { * @return the number of array elements. `getType()` must be `Type.ARRAY`. */ public int arraySize() { - return handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> size); + return VariantUtil.handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> size); } /** @@ -253,13 +271,28 @@ public int arraySize() { * @return the array element Variant at the `index` slot, or null if `index` is out of bounds */ public Variant getElementAtIndex(int index) { - return handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { - if (index < 0 || index >= size) return null; - int offset = readUnsigned(value, offsetStart + offsetSize * index, offsetSize); - return new Variant(value, metadata, dataStart + offset); + return VariantUtil.handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { + if (index < 0 || index >= size) { + return null; + } + return getElementAtIndex(index, value, metadata, offsetSize, offsetStart, dataStart); }); } + private static Variant getElementAtIndex( + int index, byte[] value, byte[] metadata, int offsetSize, int offsetStart, int dataStart) { + int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize * index, offsetSize); + return new Variant(value, metadata, dataStart + offset); + } + 
+ /** + * @return the JSON representation of the variant + * @throws MalformedVariantException if the variant is malformed + */ + public String toJson() { + return toJson(UTC, false); + } + /** * @param zoneId The ZoneId to use for formatting timestamps * @return the JSON representation of the variant @@ -276,21 +309,9 @@ public String toJson(ZoneId zoneId) { * @throws MalformedVariantException if the variant is malformed */ public String toJson(ZoneId zoneId, boolean truncateTrailingZeros) { - StringBuilder sb = new StringBuilder(); - toJsonImpl(value, metadata, pos, sb, zoneId, truncateTrailingZeros); - return sb.toString(); - } - - /** - * Escapes a string so that it can be pasted into a JSON structure. For example, if `str` - * only contains a new-line character, then the result is "\n" (4 characters) - * @param str the string to escape - * @return the escaped string - */ - private static String escapeJson(String str) { try (CharArrayWriter writer = new CharArrayWriter(); JsonGenerator gen = new JsonFactory().createGenerator(writer)) { - gen.writeString(str); + toJsonImpl(value, metadata, pos, gen, zoneId, truncateTrailingZeros); gen.flush(); return writer.toString(); } catch (IOException e) { @@ -298,25 +319,12 @@ private static String escapeJson(String str) { } } - /** - * Appends a quoted string to a StringBuilder. It is used when we know `str` doesn't contain any - * special characters that needs escaping. This is more performant than - * `sb.append(escapeJson(str))`. - * @param sb the StringBuilder to append to - * @param str the string to append - */ - private static void appendQuoted(StringBuilder sb, String str) { - sb.append('"'); - sb.append(str); - sb.append('"'); - } - /** The format for a timestamp without time zone. 
*/ private static final DateTimeFormatter TIMESTAMP_NTZ_FORMATTER = new DateTimeFormatterBuilder() .append(DateTimeFormatter.ISO_LOCAL_DATE) .appendLiteral('T') .appendPattern("HH:mm:ss") - .appendFraction(MICRO_OF_SECOND, 6, 6, true) + .appendFraction(ChronoField.MICRO_OF_SECOND, 6, 6, true) .toFormatter(Locale.US); /** The format for a timestamp with time zone. */ @@ -331,7 +339,7 @@ private static void appendQuoted(StringBuilder sb, String str) { .appendLiteral('T') .appendPattern("HH:mm:ss") .optionalStart() - .appendFraction(MICRO_OF_SECOND, 0, 6, true) + .appendFraction(ChronoField.MICRO_OF_SECOND, 0, 6, true) .optionalEnd() .toFormatter(Locale.US); @@ -346,99 +354,101 @@ private static Instant microsToInstant(long microsSinceEpoch) { } private static void toJsonImpl( - byte[] value, byte[] metadata, int pos, StringBuilder sb, ZoneId zoneId, boolean truncateTrailingZeros) { + byte[] value, byte[] metadata, int pos, JsonGenerator gen, ZoneId zoneId, boolean truncateTrailingZeros) + throws IOException { switch (VariantUtil.getType(value, pos)) { case OBJECT: - handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { - sb.append('{'); - for (int i = 0; i < size; ++i) { - int id = readUnsigned(value, idStart + idSize * i, idSize); - int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); - int elementPos = dataStart + offset; - if (i != 0) sb.append(','); - sb.append(escapeJson(getMetadataKey(metadata, id))); - sb.append(':'); - toJsonImpl(value, metadata, elementPos, sb, zoneId, truncateTrailingZeros); + VariantUtil.handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + try { + gen.writeStartObject(); + for (int i = 0; i < size; ++i) { + ObjectField field = getFieldAtIndex( + i, value, metadata, idSize, offsetSize, idStart, offsetStart, dataStart); + gen.writeFieldName(field.key); + toJsonImpl( + field.value.value, + field.value.metadata, + field.value.pos, + gen, + zoneId, 
+ truncateTrailingZeros); + } + gen.writeEndObject(); + } catch (IOException e) { + throw new RuntimeException(e); } - sb.append('}'); return null; }); break; case ARRAY: - handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { - sb.append('['); - for (int i = 0; i < size; ++i) { - int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); - int elementPos = dataStart + offset; - if (i != 0) sb.append(','); - toJsonImpl(value, metadata, elementPos, sb, zoneId, truncateTrailingZeros); + VariantUtil.handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { + try { + gen.writeStartArray(); + for (int i = 0; i < size; ++i) { + Variant v = getElementAtIndex(i, value, metadata, offsetSize, offsetStart, dataStart); + toJsonImpl(v.value, v.metadata, v.pos, gen, zoneId, truncateTrailingZeros); + } + gen.writeEndArray(); + } catch (IOException e) { + throw new RuntimeException(e); } - sb.append(']'); return null; }); break; case NULL: - sb.append("null"); + gen.writeNull(); break; case BOOLEAN: - sb.append(VariantUtil.getBoolean(value, pos)); + gen.writeBoolean(VariantUtil.getBoolean(value, pos)); break; case LONG: - sb.append(VariantUtil.getLong(value, pos)); + gen.writeNumber(VariantUtil.getLong(value, pos)); break; case STRING: - sb.append(escapeJson(VariantUtil.getString(value, pos))); + gen.writeString(VariantUtil.getString(value, pos)); break; case DOUBLE: - sb.append(VariantUtil.getDouble(value, pos)); + gen.writeNumber(VariantUtil.getDouble(value, pos)); break; case DECIMAL: if (truncateTrailingZeros) { - sb.append(VariantUtil.getDecimal(value, pos) + gen.writeNumber(VariantUtil.getDecimal(value, pos) .stripTrailingZeros() .toPlainString()); } else { - sb.append(VariantUtil.getDecimal(value, pos).toPlainString()); + gen.writeNumber(VariantUtil.getDecimal(value, pos).toPlainString()); } break; case DATE: - appendQuoted( - sb, - LocalDate.ofEpochDay((int) VariantUtil.getLong(value, pos)) - .toString()); + 
gen.writeString(LocalDate.ofEpochDay((int) VariantUtil.getLong(value, pos)) + .toString()); break; case TIMESTAMP: if (truncateTrailingZeros) { - appendQuoted( - sb, - TIMESTAMP_TRUNC_FORMATTER.format(microsToInstant(VariantUtil.getLong(value, pos)) - .atZone(zoneId))); + gen.writeString(TIMESTAMP_TRUNC_FORMATTER.format( + microsToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId))); } else { - appendQuoted( - sb, - TIMESTAMP_FORMATTER.format(microsToInstant(VariantUtil.getLong(value, pos)) - .atZone(zoneId))); + gen.writeString(TIMESTAMP_FORMATTER.format( + microsToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId))); } break; case TIMESTAMP_NTZ: if (truncateTrailingZeros) { - appendQuoted( - sb, - TIMESTAMP_NTZ_TRUNC_FORMATTER.format(microsToInstant(VariantUtil.getLong(value, pos)) - .atZone(ZoneOffset.UTC))); + gen.writeString(TIMESTAMP_NTZ_TRUNC_FORMATTER.format( + microsToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC))); } else { - appendQuoted( - sb, - TIMESTAMP_NTZ_FORMATTER.format(microsToInstant(VariantUtil.getLong(value, pos)) - .atZone(ZoneOffset.UTC))); + gen.writeString(TIMESTAMP_NTZ_FORMATTER.format( + microsToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC))); } break; case FLOAT: - sb.append(VariantUtil.getFloat(value, pos)); + gen.writeNumber(VariantUtil.getFloat(value, pos)); break; case BINARY: - appendQuoted(sb, Base64.getEncoder().encodeToString(VariantUtil.getBinary(value, pos))); + gen.writeString(Base64.getEncoder().encodeToString(VariantUtil.getBinary(value, pos))); break; + default: + throw new IllegalArgumentException("Unsupported type: " + VariantUtil.getType(value, pos)); } } } diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java index e6ef7f34ba..2c681795c1 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java +++ 
b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java @@ -16,8 +16,6 @@ */ package org.apache.parquet.variant; -import static org.apache.parquet.variant.VariantUtil.*; - import com.fasterxml.jackson.core.JsonFactory; import com.fasterxml.jackson.core.JsonParseException; import com.fasterxml.jackson.core.JsonParser; @@ -27,16 +25,31 @@ import java.math.BigDecimal; import java.math.BigInteger; import java.nio.charset.StandardCharsets; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; /** * Builder for creating Variant value and metadata. */ public class VariantBuilder { + /** + * Creates a VariantBuilder. + * @param allowDuplicateKeys if true, only the last occurrence of a duplicate key will be kept. + * Otherwise, an exception will be thrown. + */ public VariantBuilder(boolean allowDuplicateKeys) { - this(allowDuplicateKeys, DEFAULT_SIZE_LIMIT); + this(allowDuplicateKeys, VariantUtil.DEFAULT_SIZE_LIMIT); } + /** + * Creates a VariantBuilder. + * @param allowDuplicateKeys if true, only the last occurrence of a duplicate key will be kept. + * Otherwise, an exception will be thrown. + * @param sizeLimitBytes the maximum size (in bytes) of the resulting Variant value or metadata + */ public VariantBuilder(boolean allowDuplicateKeys, int sizeLimitBytes) { this.allowDuplicateKeys = allowDuplicateKeys; this.sizeLimitBytes = sizeLimitBytes; @@ -102,7 +115,7 @@ public Variant result() { // in case of pathological data. 
long maxSize = Math.max(dictionaryStringSize, numKeys); if (maxSize > sizeLimitBytes) { - throw new VariantSizeLimitException(); + throw new VariantSizeLimitException(sizeLimitBytes, maxSize); } int offsetSize = getMinIntegerSize((int) maxSize); @@ -111,33 +124,33 @@ public Variant result() { long metadataSize = stringStart + dictionaryStringSize; if (metadataSize > sizeLimitBytes) { - throw new VariantSizeLimitException(); + throw new VariantSizeLimitException(sizeLimitBytes, metadataSize); } byte[] metadata = new byte[(int) metadataSize]; - int headerByte = VERSION | ((offsetSize - 1) << 6); - writeLong(metadata, 0, headerByte, 1); - writeLong(metadata, 1, numKeys, offsetSize); + int headerByte = VariantUtil.VERSION | ((offsetSize - 1) << 6); + VariantUtil.writeLong(metadata, 0, headerByte, 1); + VariantUtil.writeLong(metadata, 1, numKeys, offsetSize); int currentOffset = 0; for (int i = 0; i < numKeys; ++i) { - writeLong(metadata, offsetStart + i * offsetSize, currentOffset, offsetSize); + VariantUtil.writeLong(metadata, offsetStart + i * offsetSize, currentOffset, offsetSize); byte[] key = dictionaryKeys.get(i); System.arraycopy(key, 0, metadata, stringStart + currentOffset, key.length); currentOffset += key.length; } - writeLong(metadata, offsetStart + numKeys * offsetSize, currentOffset, offsetSize); + VariantUtil.writeLong(metadata, offsetStart + numKeys * offsetSize, currentOffset, offsetSize); return new Variant(Arrays.copyOfRange(writeBuffer, 0, writePos), metadata); } public void appendString(String str) { byte[] text = str.getBytes(StandardCharsets.UTF_8); - boolean longStr = text.length > MAX_SHORT_STR_SIZE; - checkCapacity((longStr ? 1 + U32_SIZE : 1) + text.length); + boolean longStr = text.length > VariantUtil.MAX_SHORT_STR_SIZE; + checkCapacity((longStr ? 
1 + VariantUtil.U32_SIZE : 1) + text.length); if (longStr) { - writeBuffer[writePos++] = primitiveHeader(LONG_STR); - writeLong(writeBuffer, writePos, text.length, U32_SIZE); - writePos += U32_SIZE; + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.LONG_STR); + VariantUtil.writeLong(writeBuffer, writePos, text.length, VariantUtil.U32_SIZE); + writePos += VariantUtil.U32_SIZE; } else { - writeBuffer[writePos++] = shortStrHeader(text.length); + writeBuffer[writePos++] = VariantUtil.shortStrHeader(text.length); } System.arraycopy(text, 0, writeBuffer, writePos, text.length); writePos += text.length; @@ -145,12 +158,12 @@ public void appendString(String str) { public void appendNull() { checkCapacity(1); - writeBuffer[writePos++] = primitiveHeader(NULL); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.NULL); } public void appendBoolean(boolean b) { checkCapacity(1); - writeBuffer[writePos++] = primitiveHeader(b ? TRUE : FALSE); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(b ? 
VariantUtil.TRUE : VariantUtil.FALSE); } /** @@ -159,30 +172,33 @@ public void appendBoolean(boolean b) { * @param l the long value to append */ public void appendLong(long l) { - checkCapacity(1 + 8); if (l == (byte) l) { - writeBuffer[writePos++] = primitiveHeader(INT1); - writeLong(writeBuffer, writePos, l, 1); + checkCapacity(1 + 1); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.INT1); + VariantUtil.writeLong(writeBuffer, writePos, l, 1); writePos += 1; } else if (l == (short) l) { - writeBuffer[writePos++] = primitiveHeader(INT2); - writeLong(writeBuffer, writePos, l, 2); + checkCapacity(1 + 2); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.INT2); + VariantUtil.writeLong(writeBuffer, writePos, l, 2); writePos += 2; } else if (l == (int) l) { - writeBuffer[writePos++] = primitiveHeader(INT4); - writeLong(writeBuffer, writePos, l, 4); + checkCapacity(1 + 4); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.INT4); + VariantUtil.writeLong(writeBuffer, writePos, l, 4); writePos += 4; } else { - writeBuffer[writePos++] = primitiveHeader(INT8); - writeLong(writeBuffer, writePos, l, 8); + checkCapacity(1 + 8); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.INT8); + VariantUtil.writeLong(writeBuffer, writePos, l, 8); writePos += 8; } } public void appendDouble(double d) { checkCapacity(1 + 8); - writeBuffer[writePos++] = primitiveHeader(DOUBLE); - writeLong(writeBuffer, writePos, Double.doubleToLongBits(d), 8); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.DOUBLE); + VariantUtil.writeLong(writeBuffer, writePos, Double.doubleToLongBits(d), 8); writePos += 8; } @@ -192,21 +208,25 @@ public void appendDouble(double d) { * @param d the decimal value to append */ public void appendDecimal(BigDecimal d) { - checkCapacity(2 + 16); BigInteger unscaled = d.unscaledValue(); - if (d.scale() <= MAX_DECIMAL4_PRECISION && d.precision() <= MAX_DECIMAL4_PRECISION) { - 
writeBuffer[writePos++] = primitiveHeader(DECIMAL4); + if (d.scale() <= VariantUtil.MAX_DECIMAL4_PRECISION && d.precision() <= VariantUtil.MAX_DECIMAL4_PRECISION) { + checkCapacity(2 + 4); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.DECIMAL4); writeBuffer[writePos++] = (byte) d.scale(); - writeLong(writeBuffer, writePos, unscaled.intValueExact(), 4); + VariantUtil.writeLong(writeBuffer, writePos, unscaled.intValueExact(), 4); writePos += 4; - } else if (d.scale() <= MAX_DECIMAL8_PRECISION && d.precision() <= MAX_DECIMAL8_PRECISION) { - writeBuffer[writePos++] = primitiveHeader(DECIMAL8); + } else if (d.scale() <= VariantUtil.MAX_DECIMAL8_PRECISION + && d.precision() <= VariantUtil.MAX_DECIMAL8_PRECISION) { + checkCapacity(2 + 8); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.DECIMAL8); writeBuffer[writePos++] = (byte) d.scale(); - writeLong(writeBuffer, writePos, unscaled.longValueExact(), 8); + VariantUtil.writeLong(writeBuffer, writePos, unscaled.longValueExact(), 8); writePos += 8; } else { - assert d.scale() <= MAX_DECIMAL16_PRECISION && d.precision() <= MAX_DECIMAL16_PRECISION; - writeBuffer[writePos++] = primitiveHeader(DECIMAL16); + assert d.scale() <= VariantUtil.MAX_DECIMAL16_PRECISION + && d.precision() <= VariantUtil.MAX_DECIMAL16_PRECISION; + checkCapacity(2 + 16); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.DECIMAL16); writeBuffer[writePos++] = (byte) d.scale(); // `toByteArray` returns a big-endian representation. We need to copy it reversely and sign // extend it to 16 bytes. 
@@ -224,37 +244,37 @@ public void appendDecimal(BigDecimal d) { public void appendDate(int daysSinceEpoch) { checkCapacity(1 + 4); - writeBuffer[writePos++] = primitiveHeader(DATE); - writeLong(writeBuffer, writePos, daysSinceEpoch, 4); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.DATE); + VariantUtil.writeLong(writeBuffer, writePos, daysSinceEpoch, 4); writePos += 4; } public void appendTimestamp(long microsSinceEpoch) { checkCapacity(1 + 8); - writeBuffer[writePos++] = primitiveHeader(TIMESTAMP); - writeLong(writeBuffer, writePos, microsSinceEpoch, 8); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.TIMESTAMP); + VariantUtil.writeLong(writeBuffer, writePos, microsSinceEpoch, 8); writePos += 8; } public void appendTimestampNtz(long microsSinceEpoch) { checkCapacity(1 + 8); - writeBuffer[writePos++] = primitiveHeader(TIMESTAMP_NTZ); - writeLong(writeBuffer, writePos, microsSinceEpoch, 8); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.TIMESTAMP_NTZ); + VariantUtil.writeLong(writeBuffer, writePos, microsSinceEpoch, 8); writePos += 8; } public void appendFloat(float f) { checkCapacity(1 + 4); - writeBuffer[writePos++] = primitiveHeader(FLOAT); - writeLong(writeBuffer, writePos, Float.floatToIntBits(f), 8); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.FLOAT); + VariantUtil.writeLong(writeBuffer, writePos, Float.floatToIntBits(f), 8); writePos += 4; } public void appendBinary(byte[] binary) { - checkCapacity(1 + U32_SIZE + binary.length); - writeBuffer[writePos++] = primitiveHeader(BINARY); - writeLong(writeBuffer, writePos, binary.length, U32_SIZE); - writePos += U32_SIZE; + checkCapacity(1 + VariantUtil.U32_SIZE + binary.length); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.BINARY); + VariantUtil.writeLong(writeBuffer, writePos, binary.length, VariantUtil.U32_SIZE); + writePos += VariantUtil.U32_SIZE; System.arraycopy(binary, 0, writeBuffer, writePos, 
binary.length); writePos += binary.length; } @@ -265,15 +285,11 @@ public void appendBinary(byte[] binary) { * @return the id of the key */ public int addKey(String key) { - int id; - if (dictionary.containsKey(key)) { - id = dictionary.get(key); - } else { - id = dictionaryKeys.size(); - dictionary.put(key, id); - dictionaryKeys.add(key.getBytes(StandardCharsets.UTF_8)); - } - return id; + return dictionary.computeIfAbsent(key, newKey -> { + int id = dictionaryKeys.size(); + dictionaryKeys.add(newKey.getBytes(StandardCharsets.UTF_8)); + return id; + }); } /** @@ -283,18 +299,6 @@ public int getWritePos() { return writePos; } - // Finish writing a variant object after all of its fields have already been written. The process - // is as follows: - // 1. The caller calls `getWritePos` before writing any fields to obtain the `start` parameter. - // 2. The caller appends all the object fields to the builder. In the meantime, it should maintain - // the `fields` parameter. Before appending each field, it should append an entry to `fields` to - // record the offset of the field. The offset is computed as `getWritePos() - start`. - // 3. The caller calls `finishWritingObject` to finish writing a variant object. - // - // This function is responsible to sort the fields by key. If there are duplicate field keys: - // - when `allowDuplicateKeys` is true, the field with the greatest offset value (the last - // appended one) is kept. - // - otherwise, throw an exception. /** * Finish writing a Variant object after all of its fields have already been written. The process * is as follows: @@ -362,8 +366,8 @@ public void finishWritingObject(int start, ArrayList fields) { } } int dataSize = writePos - start; - boolean largeSize = size > U8_MAX; - int sizeBytes = largeSize ? U32_SIZE : 1; + boolean largeSize = size > VariantUtil.U8_MAX; + int sizeBytes = largeSize ? 
VariantUtil.U32_SIZE : 1; int idSize = getMinIntegerSize(maxId); int offsetSize = getMinIntegerSize(dataSize); // The space for header byte, object size, id list, and offset list. @@ -372,15 +376,15 @@ public void finishWritingObject(int start, ArrayList fields) { // Shift the just-written field data to make room for the object header section. System.arraycopy(writeBuffer, start, writeBuffer, start + headerSize, dataSize); writePos += headerSize; - writeBuffer[start] = objectHeader(largeSize, idSize, offsetSize); - writeLong(writeBuffer, start + 1, size, sizeBytes); + writeBuffer[start] = VariantUtil.objectHeader(largeSize, idSize, offsetSize); + VariantUtil.writeLong(writeBuffer, start + 1, size, sizeBytes); int idStart = start + 1 + sizeBytes; int offsetStart = idStart + size * idSize; for (int i = 0; i < size; ++i) { - writeLong(writeBuffer, idStart + i * idSize, fields.get(i).id, idSize); - writeLong(writeBuffer, offsetStart + i * offsetSize, fields.get(i).offset, offsetSize); + VariantUtil.writeLong(writeBuffer, idStart + i * idSize, fields.get(i).id, idSize); + VariantUtil.writeLong(writeBuffer, offsetStart + i * offsetSize, fields.get(i).offset, offsetSize); } - writeLong(writeBuffer, offsetStart + size * offsetSize, dataSize, offsetSize); + VariantUtil.writeLong(writeBuffer, offsetStart + size * offsetSize, dataSize, offsetSize); } /** @@ -392,8 +396,8 @@ public void finishWritingObject(int start, ArrayList fields) { public void finishWritingArray(int start, ArrayList offsets) { int dataSize = writePos - start; int size = offsets.size(); - boolean largeSize = size > U8_MAX; - int sizeBytes = largeSize ? U32_SIZE : 1; + boolean largeSize = size > VariantUtil.U8_MAX; + int sizeBytes = largeSize ? VariantUtil.U32_SIZE : 1; int offsetSize = getMinIntegerSize(dataSize); // The space for header byte, object size, and offset list. 
int headerSize = 1 + sizeBytes + (size + 1) * offsetSize; @@ -401,13 +405,13 @@ public void finishWritingArray(int start, ArrayList offsets) { // Shift the just-written field data to make room for the header section. System.arraycopy(writeBuffer, start, writeBuffer, start + headerSize, dataSize); writePos += headerSize; - writeBuffer[start] = arrayHeader(largeSize, offsetSize); - writeLong(writeBuffer, start + 1, size, sizeBytes); + writeBuffer[start] = VariantUtil.arrayHeader(largeSize, offsetSize); + VariantUtil.writeLong(writeBuffer, start + 1, size, sizeBytes); int offsetStart = start + 1 + sizeBytes; for (int i = 0; i < size; ++i) { - writeLong(writeBuffer, offsetStart + i * offsetSize, offsets.get(i), offsetSize); + VariantUtil.writeLong(writeBuffer, offsetStart + i * offsetSize, offsets.get(i), offsetSize); } - writeLong(writeBuffer, offsetStart + size * offsetSize, dataSize, offsetSize); + VariantUtil.writeLong(writeBuffer, offsetStart + size * offsetSize, dataSize, offsetSize); } /** @@ -421,18 +425,18 @@ public void appendVariant(Variant v) { } private void appendVariantImpl(byte[] value, byte[] metadata, int pos) { - checkIndex(pos, value.length); - int basicType = value[pos] & BASIC_TYPE_MASK; + VariantUtil.checkIndex(pos, value.length); + int basicType = value[pos] & VariantUtil.BASIC_TYPE_MASK; switch (basicType) { - case OBJECT: - handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + case VariantUtil.OBJECT: + VariantUtil.handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { ArrayList fields = new ArrayList<>(size); int start = writePos; for (int i = 0; i < size; ++i) { - int id = readUnsigned(value, idStart + idSize * i, idSize); - int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + int id = VariantUtil.readUnsigned(value, idStart + idSize * i, idSize); + int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize * i, offsetSize); int 
elementPos = dataStart + offset; - String key = getMetadataKey(metadata, id); + String key = VariantUtil.getMetadataKey(metadata, id); int newId = addKey(key); fields.add(new FieldEntry(key, newId, writePos - start)); appendVariantImpl(value, metadata, elementPos); @@ -441,12 +445,12 @@ private void appendVariantImpl(byte[] value, byte[] metadata, int pos) { return null; }); break; - case ARRAY: - handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { + case VariantUtil.ARRAY: + VariantUtil.handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { ArrayList offsets = new ArrayList<>(size); int start = writePos; for (int i = 0; i < size; ++i) { - int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize * i, offsetSize); int elementPos = dataStart + offset; offsets.add(writePos - start); appendVariantImpl(value, metadata, elementPos); @@ -462,8 +466,8 @@ private void appendVariantImpl(byte[] value, byte[] metadata, int pos) { } private void shallowAppendVariantImpl(byte[] value, int pos) { - int size = valueSize(value, pos); - checkIndex(pos + size - 1, value.length); + int size = VariantUtil.valueSize(value, pos); + VariantUtil.checkIndex(pos + size - 1, value.length); checkCapacity(size); System.arraycopy(value, pos, writeBuffer, writePos, size); writePos += size; @@ -476,7 +480,7 @@ private void checkCapacity(int additionalBytes) { int newCapacity = Integer.highestOneBit(requiredBytes); newCapacity = newCapacity < requiredBytes ? newCapacity * 2 : newCapacity; if (newCapacity > sizeLimitBytes) { - throw new VariantSizeLimitException(); + throw new VariantSizeLimitException(sizeLimitBytes, newCapacity); } byte[] newValue = new byte[newCapacity]; System.arraycopy(writeBuffer, 0, newValue, 0, writePos); @@ -484,9 +488,6 @@ private void checkCapacity(int additionalBytes) { } } - // Temporarily store the information of a field. 
We need to collect all fields in an JSON object, - // sort them by their keys, and build the variant object in sorted order. - /** * Class to store the information of a Variant object field. We need to collect all fields of * an object, sort them by their keys, and build the Variant object in sorted order. @@ -576,10 +577,14 @@ private void buildFromJsonParser(JsonParser parser) throws IOException { * @return the size (number of bytes) of the smallest unsigned integer type that can store `value` */ private int getMinIntegerSize(int value) { - assert value >= 0 && value <= U24_MAX; - if (value <= U8_MAX) return 1; - if (value <= U16_MAX) return 2; - return U24_SIZE; + assert value >= 0 && value <= VariantUtil.U24_MAX; + if (value <= VariantUtil.U8_MAX) { + return VariantUtil.U8_SIZE; + } + if (value <= VariantUtil.U16_MAX) { + return VariantUtil.U16_SIZE; + } + return VariantUtil.U24_SIZE; } /** @@ -608,7 +613,7 @@ private boolean tryParseDecimal(String input) { } } BigDecimal d = new BigDecimal(input); - if (d.scale() <= MAX_DECIMAL16_PRECISION && d.precision() <= MAX_DECIMAL16_PRECISION) { + if (d.scale() <= VariantUtil.MAX_DECIMAL16_PRECISION && d.precision() <= VariantUtil.MAX_DECIMAL16_PRECISION) { appendDecimal(d); return true; } diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java index 29722d21d2..a86a41ad6e 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java @@ -20,4 +20,10 @@ * An exception indicating that the metadata or data size of the Variant exceeds the * configured size limit. 
*/ -public class VariantSizeLimitException extends RuntimeException {} +public class VariantSizeLimitException extends RuntimeException { + public VariantSizeLimitException(long sizeLimitBytes, long estimatedSizeBytes) { + super(String.format( + "Variant size exceeds the limit of %d bytes. Estimated size: %d bytes", + sizeLimitBytes, estimatedSizeBytes)); + } +} diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java index f61c684f54..39ee9da470 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java @@ -43,7 +43,7 @@ public class VariantUtil { public static final int BASIC_TYPE_BITS = 2; public static final int BASIC_TYPE_MASK = 0x3; - public static final int TYPE_INFO_MASK = 0x3F; + public static final int PRIMITIVE_TYPE_MASK = 0x3F; /** The inclusive maximum value of the type info value. It is the size limit of `SHORT_STR`. */ public static final int MAX_SHORT_STR_SIZE = 0x3F; @@ -150,6 +150,8 @@ public class VariantUtil { public static final int U8_MAX = 0xFF; public static final int U16_MAX = 0xFFFF; public static final int U24_MAX = 0xFFFFFF; + public static final int U8_SIZE = 1; + public static final int U16_SIZE = 2; public static final int U24_SIZE = 3; public static final int U32_SIZE = 4; @@ -194,6 +196,10 @@ public static byte arrayHeader(boolean largeSize, int offsetSize) { return (byte) (((largeSize ? 
1 : 0) << (BASIC_TYPE_BITS + 2)) | ((offsetSize - 1) << BASIC_TYPE_BITS) | ARRAY); } + public static MalformedVariantException malformedVariant(String message) { + return new MalformedVariantException(message); + } + public static MalformedVariantException malformedVariant() { return new MalformedVariantException(); } @@ -272,9 +278,9 @@ public enum Type { BINARY, } - public static int getTypeInfo(byte[] value, int pos) { + public static int getPrimitiveTypeId(byte[] value, int pos) { checkIndex(pos, value.length); - return (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + return (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; } /** @@ -288,7 +294,7 @@ public static int getTypeInfo(byte[] value, int pos) { public static Type getType(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; switch (basicType) { case SHORT_STR: return Type.STRING; @@ -343,7 +349,7 @@ public static Type getType(byte[] value, int pos) { public static int valueSize(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; switch (basicType) { case SHORT_STR: return 1 + typeInfo; @@ -400,7 +406,7 @@ private static IllegalStateException unexpectedType(Type type) { public static boolean getBoolean(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != PRIMITIVE || (typeInfo != TRUE && typeInfo != FALSE)) { throw unexpectedType(Type.BOOLEAN); } @@ -422,7 +428,7 @@ public static boolean getBoolean(byte[] value, int pos) { 
public static long getLong(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; String exceptionMessage = "Expect type to be LONG/DATE/TIMESTAMP/TIMESTAMP_NTZ"; if (basicType != PRIMITIVE) throw new IllegalStateException(exceptionMessage); switch (typeInfo) { @@ -445,7 +451,7 @@ public static long getLong(byte[] value, int pos) { public static double getDouble(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != PRIMITIVE || typeInfo != DOUBLE) throw unexpectedType(Type.DOUBLE); return Double.longBitsToDouble(readLong(value, pos + 1, 8)); } @@ -465,7 +471,7 @@ private static void checkDecimal(BigDecimal d, int maxPrecision) { public static BigDecimal getDecimalWithOriginalScale(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != PRIMITIVE) throw unexpectedType(Type.DECIMAL); // Interpret the scale byte as unsigned. If it is a negative byte, the unsigned value must be // greater than `MAX_DECIMAL16_PRECISION` and will trigger an error in `checkDecimal`. 
@@ -504,7 +510,7 @@ public static BigDecimal getDecimal(byte[] value, int pos) { public static float getFloat(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != PRIMITIVE || typeInfo != FLOAT) throw unexpectedType(Type.FLOAT); return Float.intBitsToFloat((int) readLong(value, pos + 1, 4)); } @@ -512,7 +518,7 @@ public static float getFloat(byte[] value, int pos) { public static byte[] getBinary(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != PRIMITIVE || typeInfo != BINARY) throw unexpectedType(Type.BINARY); int start = pos + 1 + U32_SIZE; int length = readUnsigned(value, pos + 1, U32_SIZE); @@ -523,7 +529,7 @@ public static byte[] getBinary(byte[] value, int pos) { public static String getString(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType == SHORT_STR || (basicType == PRIMITIVE && typeInfo == LONG_STR)) { int start; int length; @@ -567,7 +573,7 @@ public interface ObjectHandler { public static T handleObject(byte[] value, int pos, ObjectHandler handler) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != OBJECT) throw unexpectedType(Type.OBJECT); // Refer to the comment of the `OBJECT` constant for the details of the object header encoding. 
// Suppose `typeInfo` has a bit representation of 0_b4_b3b2_b1b0, the following line extracts @@ -610,7 +616,7 @@ public interface ArrayHandler { public static T handleArray(byte[] value, int pos, ArrayHandler handler) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != ARRAY) throw unexpectedType(Type.ARRAY); // Refer to the comment of the `ARRAY` constant for the details of the object header encoding. // Suppose `typeInfo` has a bit representation of 000_b2_b1b0, the following line extracts diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java index 6b78b8295d..1ceb91839f 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java @@ -64,15 +64,15 @@ private void checkJson(String jsonValue) { StreamReadConstraints.overrideDefaultStreamReadConstraints( StreamReadConstraints.builder().maxNestingDepth(100000).build()); Variant v = VariantBuilder.parseJson(jsonValue); - Assert.assertEquals(mapper.readTree(jsonValue), mapper.readTree(v.toJson(ZoneId.systemDefault()))); + Assert.assertEquals(mapper.readTree(jsonValue), mapper.readTree(v.toJson())); } catch (IOException e) { Assert.fail("Failed to parse json: " + jsonValue + " " + e); } } - private void checkType(Variant v, int expectedBasicType, int expectedTypeInfo) { + private void checkType(Variant v, int expectedBasicType, int expectedPrimitiveTypeId) { Assert.assertEquals(expectedBasicType, v.value[v.pos] & VariantUtil.BASIC_TYPE_MASK); - Assert.assertEquals(expectedTypeInfo, v.getTypeInfo()); + Assert.assertEquals(expectedPrimitiveTypeId, v.getPrimitiveTypeId()); } private long 
microsSinceEpoch(Instant instant) { @@ -256,7 +256,7 @@ public void testDate() { VariantBuilder vb = new VariantBuilder(false); int days = Math.toIntExact(LocalDate.of(2024, 12, 16).toEpochDay()); vb.appendDate(days); - Assert.assertEquals("\"2024-12-16\"", vb.result().toJson(ZoneId.systemDefault())); + Assert.assertEquals("\"2024-12-16\"", vb.result().toJson()); Assert.assertEquals(days, vb.result().getLong()); } @@ -266,6 +266,7 @@ public void testTimestamp() { VariantBuilder vb = new VariantBuilder(false); long micros = microsSinceEpoch(Instant.from(dtf.parse("2024-12-16T10:23:45.321456-08:00"))); vb.appendTimestamp(micros); + Assert.assertEquals("\"2024-12-16T18:23:45.321456+00:00\"", vb.result().toJson()); Assert.assertEquals("\"2024-12-16T10:23:45.321456-08:00\"", vb.result().toJson(ZoneId.of("-08:00"))); Assert.assertEquals("\"2024-12-16T19:23:45.321456+01:00\"", vb.result().toJson(ZoneId.of("+01:00"))); Assert.assertEquals(micros, vb.result().getLong()); @@ -277,6 +278,7 @@ public void testTimestampNtz() { VariantBuilder vb = new VariantBuilder(false); long micros = microsSinceEpoch(Instant.from(dtf.parse("2024-01-01T23:00:00.000001Z"))); vb.appendTimestampNtz(micros); + Assert.assertEquals("\"2024-01-01T23:00:00.000001\"", vb.result().toJson()); Assert.assertEquals("\"2024-01-01T23:00:00.000001\"", vb.result().toJson(ZoneId.of("-08:00"))); Assert.assertEquals(vb.result().toJson(ZoneId.of("-08:00")), vb.result().toJson(ZoneId.of("+02:00"))); Assert.assertEquals(micros, vb.result().getLong()); @@ -288,8 +290,7 @@ public void testBinary() { byte[] binary = new byte[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; vb.appendBinary(binary); Assert.assertEquals( - "\"" + Base64.getEncoder().encodeToString(binary) + "\"", - vb.result().toJson(ZoneId.systemDefault())); + "\"" + Base64.getEncoder().encodeToString(binary) + "\"", vb.result().toJson()); Assert.assertArrayEquals(binary, vb.result().getBinary()); } @@ -333,6 +334,47 @@ public void testObject() { 
checkJson(sb.toString()); } + @Test + public void testGetObjectFields() throws IOException { + // Create small object for linear search + StringBuilder sb = new StringBuilder(); + sb.append("{"); + for (int i = 0; i < Variant.BINARY_SEARCH_THRESHOLD / 2; i++) { + if (i > 0) sb.append(", "); + sb.append("\"field" + i + "\": ").append(i); + } + sb.append("}"); + Variant v = VariantBuilder.parseJson(sb.toString()); + Assert.assertEquals(Variant.BINARY_SEARCH_THRESHOLD / 2, v.objectSize()); + for (int i = 0; i < Variant.BINARY_SEARCH_THRESHOLD / 2; i++) { + String actual = v.getFieldByKey("field" + i).toJson(); + Assert.assertEquals(String.valueOf(i), actual); + // check by index + Variant.ObjectField field = v.getFieldAtIndex(i); + Assert.assertTrue(field.key.startsWith("field")); + Assert.assertEquals(field.key.substring("field".length()), field.value.toJson()); + } + + // Create larger object for binary search + sb = new StringBuilder(); + sb.append("{"); + for (int i = 0; i < 2 * Variant.BINARY_SEARCH_THRESHOLD; i++) { + if (i > 0) sb.append(", "); + sb.append("\"field" + i + "\": ").append(i); + } + sb.append("}"); + v = VariantBuilder.parseJson(sb.toString()); + Assert.assertEquals(2 * Variant.BINARY_SEARCH_THRESHOLD, v.objectSize()); + for (int i = 0; i < 2 * Variant.BINARY_SEARCH_THRESHOLD; i++) { + String actual = v.getFieldByKey("field" + i).toJson(); + Assert.assertEquals(String.valueOf(i), actual); + // check by index + Variant.ObjectField field = v.getFieldAtIndex(i); + Assert.assertTrue(field.key.startsWith("field")); + Assert.assertEquals(field.key.substring("field".length()), field.value.toJson()); + } + } + @Test public void testArray() { // simple array @@ -378,7 +420,7 @@ public void testSizeLimit() { sb.append("{\"a\":1}"); } sb.append("]"); - VariantBuilder.parseJson(sb.toString(), new VariantBuilder(false, 20)); + VariantBuilder.parseJson(sb.toString(), new VariantBuilder(false, 100)); Assert.fail("Expected VariantSizeLimitException with large 
data"); } catch (IOException e) { Assert.fail("Expected VariantSizeLimitException with large data"); @@ -439,8 +481,8 @@ public void testTruncateTrailingZeroDecimal() { BigDecimal d = new BigDecimal(strings[0]); vb.appendDecimal(d); Variant v = vb.result(); - Assert.assertEquals(strings[0], v.toJson(ZoneId.of("-08:00"))); - Assert.assertEquals(strings[1], v.toJson(ZoneId.of("-08:00"), true)); + Assert.assertEquals(strings[0], v.toJson()); + Assert.assertEquals(strings[1], v.toJson(ZoneId.of("UTC"), true)); } } From 1ea911cb251e1bd6b8d884de6963f6f11b335eb7 Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Wed, 5 Feb 2025 15:55:18 -0800 Subject: [PATCH 10/20] Add new variant types --- .../org/apache/parquet/variant/Variant.java | 100 ++++++++++++++- .../parquet/variant/VariantBuilder.java | 29 +++++ .../apache/parquet/variant/VariantUtil.java | 60 ++++++++- .../parquet/variant/TestVariantEncoding.java | 115 +++++++++++++++++- 4 files changed, 298 insertions(+), 6 deletions(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index 75aa73869d..c4e0474591 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -23,10 +23,9 @@ import java.io.CharArrayWriter; import java.io.IOException; import java.math.BigDecimal; -import java.time.Instant; -import java.time.LocalDate; -import java.time.ZoneId; -import java.time.ZoneOffset; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.time.*; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatterBuilder; import java.time.temporal.ChronoField; @@ -53,6 +52,7 @@ public final class Variant { * short list. 
*/ static final int BINARY_SEARCH_THRESHOLD = 32; + static final ZoneId UTC = ZoneId.of("UTC"); public Variant(byte[] value, byte[] metadata) { @@ -126,6 +126,13 @@ public byte[] getBinary() { return VariantUtil.getBinary(value, pos); } + /** + * @return the UUID value + */ + public byte[] getUUID() { + return VariantUtil.getUUID(value, pos); + } + /** * @return the string value */ @@ -327,12 +334,32 @@ public String toJson(ZoneId zoneId, boolean truncateTrailingZeros) { .appendFraction(ChronoField.MICRO_OF_SECOND, 6, 6, true) .toFormatter(Locale.US); + /** The format for a timestamp without time zone, with nanosecond precision. */ + private static final DateTimeFormatter TIMESTAMP_NANOS_NTZ_FORMATTER = new DateTimeFormatterBuilder() + .append(DateTimeFormatter.ISO_LOCAL_DATE) + .appendLiteral('T') + .appendPattern("HH:mm:ss") + .appendFraction(ChronoField.NANO_OF_SECOND, 9, 9, true) + .toFormatter(Locale.US); + /** The format for a timestamp with time zone. */ private static final DateTimeFormatter TIMESTAMP_FORMATTER = new DateTimeFormatterBuilder() .append(TIMESTAMP_NTZ_FORMATTER) .appendOffset("+HH:MM", "+00:00") .toFormatter(Locale.US); + /** The format for a timestamp with time zone, with nanosecond precision. */ + private static final DateTimeFormatter TIMESTAMP_NANOS_FORMATTER = new DateTimeFormatterBuilder() + .append(TIMESTAMP_NANOS_NTZ_FORMATTER) + .appendOffset("+HH:MM", "+00:00") + .toFormatter(Locale.US); + + /** The format for a time. */ + private static final DateTimeFormatter TIME_FORMATTER = new DateTimeFormatterBuilder() + .appendPattern("HH:mm:ss") + .appendFraction(ChronoField.MICRO_OF_SECOND, 6, 6, true) + .toFormatter(Locale.US); + /** The format for a timestamp without time zone, truncating trailing microsecond zeros. 
*/ private static final DateTimeFormatter TIMESTAMP_NTZ_TRUNC_FORMATTER = new DateTimeFormatterBuilder() .append(DateTimeFormatter.ISO_LOCAL_DATE) @@ -343,16 +370,50 @@ public String toJson(ZoneId zoneId, boolean truncateTrailingZeros) { .optionalEnd() .toFormatter(Locale.US); + /** + * The format for a timestamp without time zone, with nanosecond precision, truncating + * trailing nanosecond zeros. + */ + private static final DateTimeFormatter TIMESTAMP_NANOS_NTZ_TRUNC_FORMATTER = new DateTimeFormatterBuilder() + .append(DateTimeFormatter.ISO_LOCAL_DATE) + .appendLiteral('T') + .appendPattern("HH:mm:ss") + .optionalStart() + .appendFraction(ChronoField.NANO_OF_SECOND, 0, 9, true) + .optionalEnd() + .toFormatter(Locale.US); + /** The format for a timestamp with time zone, truncating trailing microsecond zeros. */ private static final DateTimeFormatter TIMESTAMP_TRUNC_FORMATTER = new DateTimeFormatterBuilder() .append(TIMESTAMP_NTZ_TRUNC_FORMATTER) .appendOffset("+HH:MM", "+00:00") .toFormatter(Locale.US); + /** + * The format for a timestamp with time zone, with nanosecond precision, truncating trailing + * nanosecond zeros. + */ + private static final DateTimeFormatter TIMESTAMP_NANOS_TRUNC_FORMATTER = new DateTimeFormatterBuilder() + .append(TIMESTAMP_NANOS_NTZ_TRUNC_FORMATTER) + .appendOffset("+HH:MM", "+00:00") + .toFormatter(Locale.US); + + /** The format for a time, truncating trailing microsecond zeros. 
*/ + private static final DateTimeFormatter TIME_TRUNC_FORMATTER = new DateTimeFormatterBuilder() + .appendPattern("HH:mm:ss") + .optionalStart() + .appendFraction(ChronoField.MICRO_OF_SECOND, 0, 6, true) + .optionalEnd() + .toFormatter(Locale.US); + private static Instant microsToInstant(long microsSinceEpoch) { return Instant.EPOCH.plus(microsSinceEpoch, ChronoUnit.MICROS); } + private static Instant nanosToInstant(long timestampNanos) { + return Instant.EPOCH.plus(timestampNanos, ChronoUnit.NANOS); + } + private static void toJsonImpl( byte[] value, byte[] metadata, int pos, JsonGenerator gen, ZoneId zoneId, boolean truncateTrailingZeros) throws IOException { @@ -447,6 +508,37 @@ private static void toJsonImpl( case BINARY: gen.writeString(Base64.getEncoder().encodeToString(VariantUtil.getBinary(value, pos))); break; + case TIME: + if (truncateTrailingZeros) { + gen.writeString(TIME_TRUNC_FORMATTER.format( + LocalTime.ofNanoOfDay(VariantUtil.getLong(value, pos) * 1_000))); + } else { + gen.writeString( + TIME_FORMATTER.format(LocalTime.ofNanoOfDay(VariantUtil.getLong(value, pos) * 1_000))); + } + break; + case TIMESTAMP_NANOS: + if (truncateTrailingZeros) { + gen.writeString(TIMESTAMP_NANOS_TRUNC_FORMATTER.format( + nanosToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId))); + } else { + gen.writeString(TIMESTAMP_NANOS_FORMATTER.format( + nanosToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId))); + } + break; + case TIMESTAMP_NANOS_NTZ: + if (truncateTrailingZeros) { + gen.writeString(TIMESTAMP_NANOS_NTZ_TRUNC_FORMATTER.format( + nanosToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC))); + } else { + gen.writeString(TIMESTAMP_NANOS_NTZ_FORMATTER.format( + nanosToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC))); + } + break; + case UUID: + ByteBuffer bb = ByteBuffer.wrap(VariantUtil.getUUID(value, pos)).order(ByteOrder.BIG_ENDIAN); + gen.writeString(new java.util.UUID(bb.getLong(), bb.getLong()).toString()); + break; 
default: throw new IllegalArgumentException("Unsupported type: " + VariantUtil.getType(value, pos)); } diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java index 2c681795c1..3eb7955444 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java @@ -263,6 +263,27 @@ public void appendTimestampNtz(long microsSinceEpoch) { writePos += 8; } + public void appendTime(long microsSinceMidnight) { + checkCapacity(1 + 8); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.TIME); + VariantUtil.writeLong(writeBuffer, writePos, microsSinceMidnight, 8); + writePos += 8; + } + + public void appendTimestampNanos(long nanosSinceEpoch) { + checkCapacity(1 + 8); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.TIMESTAMP_NANOS); + VariantUtil.writeLong(writeBuffer, writePos, nanosSinceEpoch, 8); + writePos += 8; + } + + public void appendTimestampNanosNtz(long nanosSinceEpoch) { + checkCapacity(1 + 8); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.TIMESTAMP_NANOS_NTZ); + VariantUtil.writeLong(writeBuffer, writePos, nanosSinceEpoch, 8); + writePos += 8; + } + public void appendFloat(float f) { checkCapacity(1 + 4); writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.FLOAT); @@ -279,6 +300,14 @@ public void appendBinary(byte[] binary) { writePos += binary.length; } + public void appendUUID(byte[] uuid) { + assert uuid.length == VariantUtil.UUID_SIZE; + checkCapacity(1 + VariantUtil.UUID_SIZE); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.UUID); + System.arraycopy(uuid, 0, writeBuffer, writePos, uuid.length); + writePos += uuid.length; + } + /** * Adds a key to the Variant dictionary. If the key already exists, the dictionary is unmodified. 
* @param key the key to add diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java index 39ee9da470..ee973892c9 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java @@ -140,6 +140,28 @@ public class VariantUtil { * string size) + (size bytes of string content). */ public static final int LONG_STR = 16; + /** + * Time value. Values can be from 00:00:00 to 23:59:59.999999. + * Content is 8-byte little-endian unsigned integer that represents the number of microseconds + * since midnight. + */ + public static final int TIME = 17; + /** + * Timestamp nanos value. Similar to `TIMESTAMP`, but represents the number of nanoseconds + * elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. + */ + public static final int TIMESTAMP_NANOS = 18; + /** + * Timestamp nanos (without timestamp) value. It has the same content as `TIMESTAMP_NANOS` but + * should always be interpreted as if the local time zone is UTC. + */ + public static final int TIMESTAMP_NANOS_NTZ = 19; + /** + * UUID value. The content is a 16-byte binary, encoded using big-endian. + * For example, UUID 00112233-4455-6677-8899-aabbccddeeff is encoded as the bytes + * 00 11 22 33 44 55 66 77 88 99 aa bb cc dd ee ff. + */ + public static final int UUID = 20; // The metadata version. public static final byte VERSION = 1; @@ -160,6 +182,9 @@ public class VariantUtil { public static final int MAX_DECIMAL8_PRECISION = 18; public static final int MAX_DECIMAL16_PRECISION = 38; + // The size (in bytes) of a UUID. + public static final int UUID_SIZE = 16; + // Default size limit for both variant value and variant metadata. 
public static final int DEFAULT_SIZE_LIMIT = U24_MAX + 1; @@ -276,6 +301,10 @@ public enum Type { TIMESTAMP_NTZ, FLOAT, BINARY, + TIME, + TIMESTAMP_NANOS, + TIMESTAMP_NANOS_NTZ, + UUID } public static int getPrimitiveTypeId(byte[] value, int pos) { @@ -332,6 +361,14 @@ public static Type getType(byte[] value, int pos) { return Type.BINARY; case LONG_STR: return Type.STRING; + case TIME: + return Type.TIME; + case TIMESTAMP_NANOS: + return Type.TIMESTAMP_NANOS; + case TIMESTAMP_NANOS_NTZ: + return Type.TIMESTAMP_NANOS_NTZ; + case UUID: + return Type.UUID; default: throw unknownPrimitiveTypeInVariant(typeInfo); } @@ -383,6 +420,9 @@ public static int valueSize(byte[] value, int pos) { case DOUBLE: case TIMESTAMP: case TIMESTAMP_NTZ: + case TIME: + case TIMESTAMP_NANOS: + case TIMESTAMP_NANOS_NTZ: return 9; case DECIMAL4: return 6; @@ -393,6 +433,8 @@ public static int valueSize(byte[] value, int pos) { case BINARY: case LONG_STR: return 1 + U32_SIZE + readUnsigned(value, pos + 1, U32_SIZE); + case UUID: + return 1 + UUID_SIZE; default: throw unknownPrimitiveTypeInVariant(typeInfo); } @@ -416,11 +458,14 @@ public static boolean getBoolean(byte[] value, int pos) { /** * Returns a long value from Variant value `value[pos...]`. * It is only legal to call it if `getType` returns one of Type.LONG, DATE, TIMESTAMP, - * TIMESTAMP_NTZ. + * TIMESTAMP_NTZ, TIME, TIMESTAMP_NANOS, TIMESTAMP_NANOS_NTZ. * If the type is `DATE`, the return value is guaranteed to fit into an int and * represents the number of days from the Unix epoch. * If the type is `TIMESTAMP/TIMESTAMP_NTZ`, the return value represents the number of * microseconds from the Unix epoch. + * If the type is `TIME`, the return value represents the number of microseconds since midnight. + * If the type is `TIMESTAMP_NANOS/TIMESTAMP_NANOS_NTZ`, the return value represents the number of + * nanoseconds from the Unix epoch. 
* @param value The Variant value * @param pos The starting index of the Variant value * @return The long value @@ -442,6 +487,9 @@ public static long getLong(byte[] value, int pos) { case INT8: case TIMESTAMP: case TIMESTAMP_NTZ: + case TIME: + case TIMESTAMP_NANOS: + case TIMESTAMP_NANOS_NTZ: return readLong(value, pos + 1, 8); default: throw new IllegalStateException(exceptionMessage); @@ -546,6 +594,16 @@ public static String getString(byte[] value, int pos) { throw unexpectedType(Type.STRING); } + public static byte[] getUUID(byte[] value, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; + if (basicType != PRIMITIVE || typeInfo != UUID) throw unexpectedType(Type.UUID); + int start = pos + 1; + checkIndex(start + UUID_SIZE - 1, value.length); + return Arrays.copyOfRange(value, start, start + UUID_SIZE); + } + /** * An interface for the Variant object handler. * @param The return type of the handler diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java index 1ceb91839f..0fef8f66a5 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java @@ -25,6 +25,7 @@ import java.security.SecureRandom; import java.time.Instant; import java.time.LocalDate; +import java.time.LocalTime; import java.time.ZoneId; import java.time.format.DateTimeFormatter; import java.util.Arrays; @@ -79,6 +80,10 @@ private long microsSinceEpoch(Instant instant) { return TimeUnit.SECONDS.toMicros(instant.getEpochSecond()) + instant.getNano() / 1000; } + private long nanosSinceEpoch(Instant instant) { + return TimeUnit.SECONDS.toNanos(instant.getEpochSecond()) + instant.getNano(); + } + private String randomString(int len) { StringBuilder sb 
= new StringBuilder(len); for (int i = 0; i < len; i++) { @@ -284,16 +289,65 @@ public void testTimestampNtz() { Assert.assertEquals(micros, vb.result().getLong()); } + @Test + public void testTime() { + for (String timeStr : Arrays.asList( + "00:00:00.000000", "00:00:00.000120", "12:00:00.000000", "12:00:00.002300", "23:59:59.999999")) { + VariantBuilder vb = new VariantBuilder(false); + long micros = LocalTime.parse(timeStr).toNanoOfDay() / 1_000; + vb.appendTime(micros); + Assert.assertEquals(String.format("\"%s\"", timeStr), vb.result().toJson()); + Assert.assertEquals(micros, vb.result().getLong()); + } + } + + @Test + public void testTimestampNanos() { + VariantBuilder vb = new VariantBuilder(false); + long nanos = nanosSinceEpoch(Instant.parse("2024-12-16T10:23:45.321456987-08:00")); + vb.appendTimestampNanos(nanos); + Assert.assertEquals( + "\"2024-12-16T18:23:45.321456987+00:00\"", vb.result().toJson()); + Assert.assertEquals( + "\"2024-12-16T10:23:45.321456987-08:00\"", vb.result().toJson(ZoneId.of("-08:00"))); + Assert.assertEquals( + "\"2024-12-16T19:23:45.321456987+01:00\"", vb.result().toJson(ZoneId.of("+01:00"))); + Assert.assertEquals(nanos, vb.result().getLong()); + } + + @Test + public void testTimestampNanosNtz() { + DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME; + VariantBuilder vb = new VariantBuilder(false); + long nanos = nanosSinceEpoch(Instant.from(dtf.parse("2024-01-01T23:00:00.839280983Z"))); + vb.appendTimestampNanosNtz(nanos); + Assert.assertEquals("\"2024-01-01T23:00:00.839280983\"", vb.result().toJson()); + Assert.assertEquals("\"2024-01-01T23:00:00.839280983\"", vb.result().toJson(ZoneId.of("-08:00"))); + Assert.assertEquals(vb.result().toJson(ZoneId.of("-08:00")), vb.result().toJson(ZoneId.of("+02:00"))); + Assert.assertEquals(nanos, vb.result().getLong()); + } + @Test public void testBinary() { VariantBuilder vb = new VariantBuilder(false); byte[] binary = new byte[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; 
vb.appendBinary(binary); Assert.assertEquals( - "\"" + Base64.getEncoder().encodeToString(binary) + "\"", vb.result().toJson()); + "\"" + Base64.getEncoder().encodeToString(binary) + "\"", + vb.result().toJson()); Assert.assertArrayEquals(binary, vb.result().getBinary()); } + @Test + public void testUUID() { + VariantBuilder vb = new VariantBuilder(false); + byte[] uuid = new byte[] {0, 17, 34, 51, 68, 85, 102, 119, -120, -103, -86, -69, -52, -35, -18, -1}; + vb.appendUUID(uuid); + Assert.assertEquals( + "\"00112233-4455-6677-8899-aabbccddeeff\"", vb.result().toJson()); + Assert.assertArrayEquals(uuid, vb.result().getUUID()); + } + @Test public void testObject() { // simple object @@ -525,4 +579,63 @@ public void testTruncateTrailingZeroTimestampNtz() { Assert.assertEquals(micros, vb.result().getLong()); } } + + @Test + public void testTruncateTrailingZeroTime() { + for (String[] strings : Arrays.asList( + // truncate all trailing zeros + new String[] {"10:23:45.000000", "10:23:45"}, + // truncate some trailing zeros + new String[] {"10:23:45.123000", "10:23:45.123"}, + // truncate no trailing zeros + new String[] {"10:23:45.123456", "10:23:45.123456"})) { + VariantBuilder vb = new VariantBuilder(false); + + long micros = LocalTime.parse(strings[0]).toNanoOfDay() / 1_000; + vb.appendTime(micros); + Variant v = vb.result(); + Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); + Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); + Assert.assertEquals(micros, vb.result().getLong()); + } + } + + @Test + public void testTruncateTrailingZeroTimestampNanos() { + for (String[] strings : Arrays.asList( + // truncate all trailing zeros + new String[] {"2024-12-16T10:23:45.000000000-08:00", "2024-12-16T10:23:45-08:00"}, + // truncate some trailing zeros + new String[] {"2024-12-16T10:23:45.123450000-08:00", "2024-12-16T10:23:45.12345-08:00"}, + // truncate no trailing zeros + new String[] 
{"2024-12-16T10:23:45.123456789-08:00", "2024-12-16T10:23:45.123456789-08:00"})) { + VariantBuilder vb = new VariantBuilder(false); + long nanos = nanosSinceEpoch(Instant.parse(strings[0])); + vb.appendTimestampNanos(nanos); + Variant v = vb.result(); + Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); + Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); + } + } + + @Test + public void testTruncateTrailingZeroTimestampNanosNtz() { + DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME; + for (String[] strings : Arrays.asList( + // truncate all trailing zeros + new String[] {"2024-12-16T10:23:45.000000000", "2024-12-16T10:23:45"}, + // truncate some trailing zeros + new String[] {"2024-12-16T10:23:45.123450000", "2024-12-16T10:23:45.12345"}, + // truncate no trailing zeros + new String[] {"2024-12-16T10:23:45.123456789", "2024-12-16T10:23:45.123456789"})) { + VariantBuilder vb = new VariantBuilder(false); + + long nanos = nanosSinceEpoch(Instant.from(dtf.parse(String.format("%sZ", strings[0])))); + vb.appendTimestampNanosNtz(nanos); + Variant v = vb.result(); + Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); + Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); + Assert.assertEquals(nanos, vb.result().getLong()); + } + } } From cb954a64f5dcff64a06e705bf51b825e351e88bc Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Mon, 10 Feb 2025 10:05:38 -0800 Subject: [PATCH 11/20] Fix tests for older JDK versions --- .../org/apache/parquet/variant/TestVariantEncoding.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java index 0fef8f66a5..68375ec7ce 100644 --- 
a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java @@ -303,8 +303,9 @@ public void testTime() { @Test public void testTimestampNanos() { + DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME; VariantBuilder vb = new VariantBuilder(false); - long nanos = nanosSinceEpoch(Instant.parse("2024-12-16T10:23:45.321456987-08:00")); + long nanos = nanosSinceEpoch(Instant.from(dtf.parse("2024-12-16T10:23:45.321456987-08:00"))); vb.appendTimestampNanos(nanos); Assert.assertEquals( "\"2024-12-16T18:23:45.321456987+00:00\"", vb.result().toJson()); @@ -602,6 +603,7 @@ public void testTruncateTrailingZeroTime() { @Test public void testTruncateTrailingZeroTimestampNanos() { + DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME; for (String[] strings : Arrays.asList( // truncate all trailing zeros new String[] {"2024-12-16T10:23:45.000000000-08:00", "2024-12-16T10:23:45-08:00"}, @@ -610,7 +612,7 @@ public void testTruncateTrailingZeroTimestampNanos() { // truncate no trailing zeros new String[] {"2024-12-16T10:23:45.123456789-08:00", "2024-12-16T10:23:45.123456789-08:00"})) { VariantBuilder vb = new VariantBuilder(false); - long nanos = nanosSinceEpoch(Instant.parse(strings[0])); + long nanos = nanosSinceEpoch(Instant.from(dtf.parse(strings[0]))); vb.appendTimestampNanos(nanos); Variant v = vb.result(); Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); From db6b98ed7431c17288dcf4a991e6a4e2998231e6 Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Thu, 13 Feb 2025 10:58:32 -0800 Subject: [PATCH 12/20] Return UUID --- .../src/main/java/org/apache/parquet/variant/Variant.java | 8 ++++++-- .../org/apache/parquet/variant/TestVariantEncoding.java | 8 +++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java 
b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index c4e0474591..fe4a26718a 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -33,6 +33,7 @@ import java.util.Arrays; import java.util.Base64; import java.util.Locale; +import java.util.UUID; /** * This Variant class holds the Variant-encoded value and metadata binary values. @@ -129,8 +130,11 @@ public byte[] getBinary() { /** * @return the UUID value */ - public byte[] getUUID() { - return VariantUtil.getUUID(value, pos); + public UUID getUUID() { + byte[] uuidBytes = VariantUtil.getUUID(value, pos); + long msb = ByteBuffer.wrap(uuidBytes, 0, 8).order(ByteOrder.BIG_ENDIAN).getLong(); + long lsb = ByteBuffer.wrap(uuidBytes, 8, 8).order(ByteOrder.BIG_ENDIAN).getLong(); + return new UUID(msb, lsb); } /** diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java index 68375ec7ce..173e123d4a 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java @@ -22,6 +22,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.security.SecureRandom; import java.time.Instant; import java.time.LocalDate; @@ -31,6 +33,7 @@ import java.util.Arrays; import java.util.Base64; import java.util.List; +import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.stream.IntStream; import org.junit.Assert; @@ -346,7 +349,10 @@ public void testUUID() { vb.appendUUID(uuid); Assert.assertEquals( "\"00112233-4455-6677-8899-aabbccddeeff\"", vb.result().toJson()); - Assert.assertArrayEquals(uuid, vb.result().getUUID()); + long msb = 
ByteBuffer.wrap(uuid, 0, 8).order(ByteOrder.BIG_ENDIAN).getLong(); + long lsb = ByteBuffer.wrap(uuid, 8, 8).order(ByteOrder.BIG_ENDIAN).getLong(); + UUID expected = new UUID(msb, lsb); + Assert.assertEquals(expected, vb.result().getUUID()); } @Test From c220c3c29b109709b17b2ff2faba64f288b2829f Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Fri, 14 Feb 2025 12:41:07 -0800 Subject: [PATCH 13/20] Return java.util.UUID --- .../src/main/java/org/apache/parquet/variant/Variant.java | 8 ++------ .../main/java/org/apache/parquet/variant/VariantUtil.java | 7 +++++-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index fe4a26718a..d7d48c11c0 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -131,10 +131,7 @@ public byte[] getBinary() { * @return the UUID value */ public UUID getUUID() { - byte[] uuidBytes = VariantUtil.getUUID(value, pos); - long msb = ByteBuffer.wrap(uuidBytes, 0, 8).order(ByteOrder.BIG_ENDIAN).getLong(); - long lsb = ByteBuffer.wrap(uuidBytes, 8, 8).order(ByteOrder.BIG_ENDIAN).getLong(); - return new UUID(msb, lsb); + return VariantUtil.getUUID(value, pos); } /** @@ -540,8 +537,7 @@ private static void toJsonImpl( } break; case UUID: - ByteBuffer bb = ByteBuffer.wrap(VariantUtil.getUUID(value, pos)).order(ByteOrder.BIG_ENDIAN); - gen.writeString(new java.util.UUID(bb.getLong(), bb.getLong()).toString()); + gen.writeString(VariantUtil.getUUID(value, pos).toString()); break; default: throw new IllegalArgumentException("Unsupported type: " + VariantUtil.getType(value, pos)); diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java index ee973892c9..88b041bad7 100644 --- 
a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java @@ -18,6 +18,8 @@ import java.math.BigDecimal; import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.Arrays; /** @@ -594,14 +596,15 @@ public static String getString(byte[] value, int pos) { throw unexpectedType(Type.STRING); } - public static byte[] getUUID(byte[] value, int pos) { + public static java.util.UUID getUUID(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != PRIMITIVE || typeInfo != UUID) throw unexpectedType(Type.UUID); int start = pos + 1; checkIndex(start + UUID_SIZE - 1, value.length); - return Arrays.copyOfRange(value, start, start + UUID_SIZE); + ByteBuffer bb = ByteBuffer.wrap(value, start, UUID_SIZE).order(ByteOrder.BIG_ENDIAN); + return new java.util.UUID(bb.getLong(), bb.getLong()); } /** From 23181149d8ab53dfc9b95e36d5f5028b93500b9d Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Tue, 25 Feb 2025 10:11:15 -0800 Subject: [PATCH 14/20] mvn spotless:apply --- .../src/main/java/org/apache/parquet/variant/Variant.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index d7d48c11c0..cf8df980a1 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -23,8 +23,6 @@ import java.io.CharArrayWriter; import java.io.IOException; import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; import java.time.*; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatterBuilder; From 553dbe9a17a3911755a769355a31da14aa1c5883 
Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Thu, 27 Feb 2025 13:11:00 -0800 Subject: [PATCH 15/20] review feedback --- parquet-variant/pom.xml | 5 + .../variant/MalformedVariantException.java | 4 - .../variant/UnknownVariantTypeException.java | 2 +- .../org/apache/parquet/variant/Variant.java | 174 ++++++----- .../parquet/variant/VariantBuilder.java | 45 +-- .../apache/parquet/variant/VariantUtil.java | 271 +++++++++++++----- .../parquet/variant/TestVariantEncoding.java | 96 ++++++- 7 files changed, 420 insertions(+), 177 deletions(-) diff --git a/parquet-variant/pom.xml b/parquet-variant/pom.xml index 9fc9d97266..365fd826ea 100644 --- a/parquet-variant/pom.xml +++ b/parquet-variant/pom.xml @@ -36,6 +36,11 @@ + + org.apache.parquet + parquet-cli + ${project.version} + org.apache.parquet parquet-jackson diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java index ffbd0786b7..3ecc707a11 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java @@ -20,10 +20,6 @@ * An exception indicating that the Variant is malformed. 
*/ public class MalformedVariantException extends RuntimeException { - public MalformedVariantException() { - super(); - } - public MalformedVariantException(String message) { super(message); } diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/UnknownVariantTypeException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/UnknownVariantTypeException.java index 2f0bd5dce6..3cdacb5d99 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/UnknownVariantTypeException.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/UnknownVariantTypeException.java @@ -33,7 +33,7 @@ public UnknownVariantTypeException(int typeId) { /** * @return the type id that was unknown */ - public int getTypeId() { + public int typeId() { return typeId; } } diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index cf8df980a1..497c3d9848 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -32,6 +32,7 @@ import java.util.Base64; import java.util.Locale; import java.util.UUID; +import org.apache.parquet.cli.util.RuntimeIOException; /** * This Variant class holds the Variant-encoded value and metadata binary values. @@ -64,7 +65,7 @@ public Variant(byte[] value, byte[] metadata) { this.pos = pos; // There is currently only one allowed version. 
if (metadata.length < 1 || (metadata[0] & VariantUtil.VERSION_MASK) != VariantUtil.VERSION) { - throw VariantUtil.malformedVariant(String.format( + throw new MalformedVariantException(String.format( "Unsupported variant metadata version: %02X", metadata[0] & VariantUtil.VERSION_MASK)); } } @@ -90,6 +91,39 @@ public boolean getBoolean() { return VariantUtil.getBoolean(value, pos); } + /** + * @return the byte value + */ + public byte getByte() { + long longValue = VariantUtil.getLong(value, pos); + if (longValue < Byte.MIN_VALUE || longValue > Byte.MAX_VALUE) { + throw new IllegalStateException("Value out of range for byte: " + longValue); + } + return (byte) longValue; + } + + /** + * @return the short value + */ + public short getShort() { + long longValue = VariantUtil.getLong(value, pos); + if (longValue < Short.MIN_VALUE || longValue > Short.MAX_VALUE) { + throw new IllegalStateException("Value out of range for short: " + longValue); + } + return (short) longValue; + } + + /** + * @return the int value + */ + public int getInt() { + long longValue = VariantUtil.getLong(value, pos); + if (longValue < Integer.MIN_VALUE || longValue > Integer.MAX_VALUE) { + throw new IllegalStateException("Value out of range for int: " + longValue); + } + return (int) longValue; + } + /** * @return the long value */ @@ -156,9 +190,8 @@ public VariantUtil.Type getType() { /** * @return the number of object fields in the variant. `getType()` must be `Type.OBJECT`. 
*/ - public int objectSize() { - return VariantUtil.handleObject( - value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> size); + public int numObjectElements() { + return VariantUtil.handleObject(value, pos, (info) -> info.numElements); } /** @@ -168,27 +201,41 @@ public int objectSize() { * @return the field value whose key is equal to `key`, or null if key is not found */ public Variant getFieldByKey(String key) { - return VariantUtil.handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + return VariantUtil.handleObject(value, pos, (info) -> { // Use linear search for a short list. Switch to binary search when the length reaches // `BINARY_SEARCH_THRESHOLD`. - if (size < BINARY_SEARCH_THRESHOLD) { - for (int i = 0; i < size; ++i) { - ObjectField field = - getFieldAtIndex(i, value, metadata, idSize, offsetSize, idStart, offsetStart, dataStart); + if (info.numElements < BINARY_SEARCH_THRESHOLD) { + for (int i = 0; i < info.numElements; ++i) { + ObjectField field = getFieldAtIndex( + i, + value, + metadata, + info.idSize, + info.offsetSize, + info.idStart, + info.offsetStart, + info.dataStart); if (field.key.equals(key)) { return field.value; } } } else { int low = 0; - int high = size - 1; + int high = info.numElements - 1; while (low <= high) { // Use unsigned right shift to compute the middle of `low` and `high`. This is not only a // performance optimization, because it can properly handle the case where `low + high` // overflows int. 
int mid = (low + high) >>> 1; - ObjectField field = - getFieldAtIndex(mid, value, metadata, idSize, offsetSize, idStart, offsetStart, dataStart); + ObjectField field = getFieldAtIndex( + mid, + value, + metadata, + info.idSize, + info.offsetSize, + info.idStart, + info.offsetStart, + info.dataStart); int cmp = field.key.compareTo(key); if (cmp < 0) { low = mid + 1; @@ -220,14 +267,22 @@ public ObjectField(String key, Variant value) { * Returns the ObjectField at the `index` slot. Return null if `index` is out of the bound of * `[0, objectSize())`. `getType()` must be `Type.OBJECT`. * @param index the index of the object field to get - * @return the Objectfield at the `index` slot, or null if `index` is out of bounds + * @return the ObjectField at the `index` slot, or null if `index` is out of bounds */ public ObjectField getFieldAtIndex(int index) { - return VariantUtil.handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { - if (index < 0 || index >= size) { + return VariantUtil.handleObject(value, pos, (info) -> { + if (index < 0 || index >= info.numElements) { return null; } - return getFieldAtIndex(index, value, metadata, idSize, offsetSize, idStart, offsetStart, dataStart); + return getFieldAtIndex( + index, + value, + metadata, + info.idSize, + info.offsetSize, + info.idStart, + info.offsetStart, + info.dataStart); }); } @@ -247,27 +302,11 @@ private static ObjectField getFieldAtIndex( return new ObjectField(key, v); } - /** - * Returns the dictionary ID for the object field at the `index` slot. - * `getType()` must be `Type.OBJECT`. 
- * @param index the index of the object field to get the dictionary ID for - * @return the dictionary ID for the object field at the `index` slot - * @throws MalformedVariantException if `index` is out of bounds - */ - public int getDictionaryIdAtIndex(int index) { - return VariantUtil.handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { - if (index < 0 || index >= size) { - throw VariantUtil.malformedVariant(); - } - return VariantUtil.readUnsigned(value, idStart + idSize * index, idSize); - }); - } - /** * @return the number of array elements. `getType()` must be `Type.ARRAY`. */ - public int arraySize() { - return VariantUtil.handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> size); + public int numArrayElements() { + return VariantUtil.handleArray(value, pos, (info) -> info.numElements); } /** @@ -277,11 +316,11 @@ public int arraySize() { * @return the array element Variant at the `index` slot, or null if `index` is out of bounds */ public Variant getElementAtIndex(int index) { - return VariantUtil.handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { - if (index < 0 || index >= size) { + return VariantUtil.handleArray(value, pos, (info) -> { + if (index < 0 || index >= info.numElements) { return null; } - return getElementAtIndex(index, value, metadata, offsetSize, offsetStart, dataStart); + return getElementAtIndex(index, value, metadata, info.offsetSize, info.offsetStart, info.dataStart); }); } @@ -321,7 +360,7 @@ public String toJson(ZoneId zoneId, boolean truncateTrailingZeros) { gen.flush(); return writer.toString(); } catch (IOException e) { - throw new RuntimeException(e); + throw new RuntimeIOException("Failed to convert variant to json", e); } } @@ -418,40 +457,40 @@ private static void toJsonImpl( throws IOException { switch (VariantUtil.getType(value, pos)) { case OBJECT: - VariantUtil.handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { 
- try { - gen.writeStartObject(); - for (int i = 0; i < size; ++i) { - ObjectField field = getFieldAtIndex( - i, value, metadata, idSize, offsetSize, idStart, offsetStart, dataStart); - gen.writeFieldName(field.key); - toJsonImpl( - field.value.value, - field.value.metadata, - field.value.pos, - gen, - zoneId, - truncateTrailingZeros); - } - gen.writeEndObject(); - } catch (IOException e) { - throw new RuntimeException(e); + VariantUtil.handleObjectException(value, pos, (info) -> { + gen.writeStartObject(); + for (int i = 0; i < info.numElements; ++i) { + ObjectField field = getFieldAtIndex( + i, + value, + metadata, + info.idSize, + info.offsetSize, + info.idStart, + info.offsetStart, + info.dataStart); + gen.writeFieldName(field.key); + toJsonImpl( + field.value.value, + field.value.metadata, + field.value.pos, + gen, + zoneId, + truncateTrailingZeros); } + gen.writeEndObject(); return null; }); break; case ARRAY: - VariantUtil.handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { - try { - gen.writeStartArray(); - for (int i = 0; i < size; ++i) { - Variant v = getElementAtIndex(i, value, metadata, offsetSize, offsetStart, dataStart); - toJsonImpl(v.value, v.metadata, v.pos, gen, zoneId, truncateTrailingZeros); - } - gen.writeEndArray(); - } catch (IOException e) { - throw new RuntimeException(e); + VariantUtil.handleArrayException(value, pos, (info) -> { + gen.writeStartArray(); + for (int i = 0; i < info.numElements; ++i) { + Variant v = getElementAtIndex( + i, value, metadata, info.offsetSize, info.offsetStart, info.dataStart); + toJsonImpl(v.value, v.metadata, v.pos, gen, zoneId, truncateTrailingZeros); } + gen.writeEndArray(); return null; }); break; @@ -461,6 +500,9 @@ private static void toJsonImpl( case BOOLEAN: gen.writeBoolean(VariantUtil.getBoolean(value, pos)); break; + case BYTE: + case SHORT: + case INT: case LONG: gen.writeNumber(VariantUtil.getLong(value, pos)); break; diff --git 
a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java index 3eb7955444..7cf28ef1b9 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java @@ -24,6 +24,8 @@ import java.io.IOException; import java.math.BigDecimal; import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; @@ -174,22 +176,22 @@ public void appendBoolean(boolean b) { public void appendLong(long l) { if (l == (byte) l) { checkCapacity(1 + 1); - writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.INT1); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.INT8); VariantUtil.writeLong(writeBuffer, writePos, l, 1); writePos += 1; } else if (l == (short) l) { checkCapacity(1 + 2); - writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.INT2); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.INT16); VariantUtil.writeLong(writeBuffer, writePos, l, 2); writePos += 2; } else if (l == (int) l) { checkCapacity(1 + 4); - writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.INT4); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.INT32); VariantUtil.writeLong(writeBuffer, writePos, l, 4); writePos += 4; } else { checkCapacity(1 + 8); - writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.INT8); + writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.INT64); VariantUtil.writeLong(writeBuffer, writePos, l, 8); writePos += 8; } @@ -300,12 +302,15 @@ public void appendBinary(byte[] binary) { writePos += binary.length; } - public void appendUUID(byte[] uuid) { - assert uuid.length == VariantUtil.UUID_SIZE; + public void appendUUID(java.util.UUID uuid) { 
checkCapacity(1 + VariantUtil.UUID_SIZE); writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.UUID); - System.arraycopy(uuid, 0, writeBuffer, writePos, uuid.length); - writePos += uuid.length; + + ByteBuffer bb = + ByteBuffer.wrap(writeBuffer, writePos, VariantUtil.UUID_SIZE).order(ByteOrder.BIG_ENDIAN); + bb.putLong(uuid.getMostSignificantBits()); + bb.putLong(uuid.getLeastSignificantBits()); + writePos += VariantUtil.UUID_SIZE; } /** @@ -458,13 +463,14 @@ private void appendVariantImpl(byte[] value, byte[] metadata, int pos) { int basicType = value[pos] & VariantUtil.BASIC_TYPE_MASK; switch (basicType) { case VariantUtil.OBJECT: - VariantUtil.handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { - ArrayList fields = new ArrayList<>(size); + VariantUtil.handleObject(value, pos, (info) -> { + ArrayList fields = new ArrayList<>(info.numElements); int start = writePos; - for (int i = 0; i < size; ++i) { - int id = VariantUtil.readUnsigned(value, idStart + idSize * i, idSize); - int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize * i, offsetSize); - int elementPos = dataStart + offset; + for (int i = 0; i < info.numElements; ++i) { + int id = VariantUtil.readUnsigned(value, info.idStart + info.idSize * i, info.idSize); + int offset = VariantUtil.readUnsigned( + value, info.offsetStart + info.offsetSize * i, info.offsetSize); + int elementPos = info.dataStart + offset; String key = VariantUtil.getMetadataKey(metadata, id); int newId = addKey(key); fields.add(new FieldEntry(key, newId, writePos - start)); @@ -475,12 +481,13 @@ private void appendVariantImpl(byte[] value, byte[] metadata, int pos) { }); break; case VariantUtil.ARRAY: - VariantUtil.handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { - ArrayList offsets = new ArrayList<>(size); + VariantUtil.handleArray(value, pos, (info) -> { + ArrayList offsets = new ArrayList<>(info.numElements); int start = writePos; - for (int i = 
0; i < size; ++i) { - int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize * i, offsetSize); - int elementPos = dataStart + offset; + for (int i = 0; i < info.numElements; ++i) { + int offset = VariantUtil.readUnsigned( + value, info.offsetStart + info.offsetSize * i, info.offsetSize); + int elementPos = info.dataStart + offset; offsets.add(writePos - start); appendVariantImpl(value, metadata, elementPos); } diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java index 88b041bad7..097c4cd811 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java @@ -16,6 +16,7 @@ */ package org.apache.parquet.variant; +import java.io.IOException; import java.math.BigDecimal; import java.math.BigInteger; import java.nio.ByteBuffer; @@ -44,10 +45,10 @@ */ public class VariantUtil { public static final int BASIC_TYPE_BITS = 2; - public static final int BASIC_TYPE_MASK = 0x3; - public static final int PRIMITIVE_TYPE_MASK = 0x3F; + public static final int BASIC_TYPE_MASK = 0b00000011; + public static final int PRIMITIVE_TYPE_MASK = 0b00111111; /** The inclusive maximum value of the type info value. It is the size limit of `SHORT_STR`. */ - public static final int MAX_SHORT_STR_SIZE = 0x3F; + public static final int MAX_SHORT_STR_SIZE = 0b00111111; // The basic types @@ -99,13 +100,13 @@ public class VariantUtil { /** False value. Empty content. */ public static final int FALSE = 2; /** 1-byte little-endian signed integer. */ - public static final int INT1 = 3; + public static final int INT8 = 3; /** 2-byte little-endian signed integer. */ - public static final int INT2 = 4; + public static final int INT16 = 4; /** 4-byte little-endian signed integer. 
*/ - public static final int INT4 = 5; + public static final int INT32 = 5; /** 4-byte little-endian signed integer. */ - public static final int INT8 = 6; + public static final int INT64 = 6; /** 8-byte IEEE double. */ public static final int DOUBLE = 7; /** 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed integer. */ @@ -223,18 +224,6 @@ public static byte arrayHeader(boolean largeSize, int offsetSize) { return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 2)) | ((offsetSize - 1) << BASIC_TYPE_BITS) | ARRAY); } - public static MalformedVariantException malformedVariant(String message) { - return new MalformedVariantException(message); - } - - public static MalformedVariantException malformedVariant() { - return new MalformedVariantException(); - } - - public static UnknownVariantTypeException unknownPrimitiveTypeInVariant(int id) { - return new UnknownVariantTypeException(id); - } - /** * Check the validity of an array index `pos`. * @param pos The index to check @@ -242,7 +231,10 @@ public static UnknownVariantTypeException unknownPrimitiveTypeInVariant(int id) * @throws MalformedVariantException if the index is out of bound */ public static void checkIndex(int pos, int length) { - if (pos < 0 || pos >= length) throw malformedVariant(); + if (pos < 0 || pos >= length) { + throw new IllegalArgumentException( + String.format("Invalid byte-array offset (%d). length: %d", pos, length)); + } } /** @@ -281,7 +273,9 @@ static int readUnsigned(byte[] bytes, int pos, int numBytes) { int unsignedByteValue = bytes[pos + i] & 0xFF; result |= unsignedByteValue << (8 * i); } - if (result < 0) throw malformedVariant(); + if (result < 0) { + throw new MalformedVariantException(String.format("Failed to read unsigned int. 
numBytes: %d", numBytes)); + } return result; } @@ -294,6 +288,9 @@ public enum Type { ARRAY, NULL, BOOLEAN, + BYTE, + SHORT, + INT, LONG, STRING, DOUBLE, @@ -340,10 +337,13 @@ public static Type getType(byte[] value, int pos) { case TRUE: case FALSE: return Type.BOOLEAN; - case INT1: - case INT2: - case INT4: case INT8: + return Type.BYTE; + case INT16: + return Type.SHORT; + case INT32: + return Type.INT; + case INT64: return Type.LONG; case DOUBLE: return Type.DOUBLE; @@ -372,7 +372,7 @@ public static Type getType(byte[] value, int pos) { case UUID: return Type.UUID; default: - throw unknownPrimitiveTypeInVariant(typeInfo); + throw new UnknownVariantTypeException(typeInfo); } } } @@ -396,29 +396,33 @@ public static int valueSize(byte[] value, int pos) { return handleObject( value, pos, - (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> - dataStart - pos + readUnsigned(value, offsetStart + size * offsetSize, offsetSize)); + (info) -> info.dataStart + - pos + + readUnsigned( + value, info.offsetStart + info.numElements * info.offsetSize, info.offsetSize)); case ARRAY: return handleArray( value, pos, - (size, offsetSize, offsetStart, dataStart) -> - dataStart - pos + readUnsigned(value, offsetStart + size * offsetSize, offsetSize)); + (info) -> info.dataStart + - pos + + readUnsigned( + value, info.offsetStart + info.numElements * info.offsetSize, info.offsetSize)); default: switch (typeInfo) { case NULL: case TRUE: case FALSE: return 1; - case INT1: + case INT8: return 2; - case INT2: + case INT16: return 3; - case INT4: + case INT32: case DATE: case FLOAT: return 5; - case INT8: + case INT64: case DOUBLE: case TIMESTAMP: case TIMESTAMP_NTZ: @@ -438,13 +442,13 @@ public static int valueSize(byte[] value, int pos) { case UUID: return 1 + UUID_SIZE; default: - throw unknownPrimitiveTypeInVariant(typeInfo); + throw new UnknownVariantTypeException(typeInfo); } } } - private static IllegalStateException unexpectedType(Type type) { - return new 
IllegalStateException("Expect type to be " + type); + private static MalformedVariantException unexpectedType(Type type) { + return new MalformedVariantException("Expected type to be " + type); } public static boolean getBoolean(byte[] value, int pos) { @@ -459,8 +463,8 @@ public static boolean getBoolean(byte[] value, int pos) { /** * Returns a long value from Variant value `value[pos...]`. - * It is only legal to call it if `getType` returns one of Type.LONG, DATE, TIMESTAMP, - * TIMESTAMP_NTZ, TIME, TIMESTAMP_NANOS, TIMESTAMP_NANOS_NTZ. + * It is only legal to call it if `getType` returns one of Type.BYTE, SHORT, INT, LONG, + * DATE, TIMESTAMP, TIMESTAMP_NTZ, TIME, TIMESTAMP_NANOS, TIMESTAMP_NANOS_NTZ. * If the type is `DATE`, the return value is guaranteed to fit into an int and * represents the number of days from the Unix epoch. * If the type is `TIMESTAMP/TIMESTAMP_NTZ`, the return value represents the number of @@ -476,17 +480,20 @@ public static long getLong(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; - String exceptionMessage = "Expect type to be LONG/DATE/TIMESTAMP/TIMESTAMP_NTZ"; - if (basicType != PRIMITIVE) throw new IllegalStateException(exceptionMessage); + String exceptionMessage = + "Expect type to be one of: BYTE, SHORT, INT, LONG, TIMESTAMP, TIMESTAMP_NTZ, TIME, TIMESTAMP_NANOS, TIMESTAMP_NANOS_NTZ"; + if (basicType != PRIMITIVE) { + throw new IllegalStateException(exceptionMessage); + } switch (typeInfo) { - case INT1: + case INT8: return readLong(value, pos + 1, 1); - case INT2: + case INT16: return readLong(value, pos + 1, 2); - case INT4: + case INT32: case DATE: return readLong(value, pos + 1, 4); - case INT8: + case INT64: case TIMESTAMP: case TIMESTAMP_NTZ: case TIME: @@ -502,7 +509,9 @@ public static double getDouble(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & 
BASIC_TYPE_MASK; int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; - if (basicType != PRIMITIVE || typeInfo != DOUBLE) throw unexpectedType(Type.DOUBLE); + if (basicType != PRIMITIVE || typeInfo != DOUBLE) { + throw unexpectedType(Type.DOUBLE); + } return Double.longBitsToDouble(readLong(value, pos + 1, 8)); } @@ -514,7 +523,9 @@ public static double getDouble(byte[] value, int pos) { */ private static void checkDecimal(BigDecimal d, int maxPrecision) { if (d.precision() > maxPrecision || d.scale() > maxPrecision) { - throw malformedVariant(); + throw new MalformedVariantException(String.format( + "Decimal (precision: %d, scale: %d) exceeds max precision %d", + d.precision(), d.scale(), maxPrecision)); } } @@ -522,7 +533,9 @@ public static BigDecimal getDecimalWithOriginalScale(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; - if (basicType != PRIMITIVE) throw unexpectedType(Type.DECIMAL); + if (basicType != PRIMITIVE) { + throw unexpectedType(Type.DECIMAL); + } // Interpret the scale byte as unsigned. If it is a negative byte, the unsigned value must be // greater than `MAX_DECIMAL16_PRECISION` and will trigger an error in `checkDecimal`. 
int scale = value[pos + 1] & 0xFF; @@ -561,7 +574,9 @@ public static float getFloat(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; - if (basicType != PRIMITIVE || typeInfo != FLOAT) throw unexpectedType(Type.FLOAT); + if (basicType != PRIMITIVE || typeInfo != FLOAT) { + throw unexpectedType(Type.FLOAT); + } return Float.intBitsToFloat((int) readLong(value, pos + 1, 4)); } @@ -569,7 +584,9 @@ public static byte[] getBinary(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; - if (basicType != PRIMITIVE || typeInfo != BINARY) throw unexpectedType(Type.BINARY); + if (basicType != PRIMITIVE || typeInfo != BINARY) { + throw unexpectedType(Type.BINARY); + } int start = pos + 1 + U32_SIZE; int length = readUnsigned(value, pos + 1, U32_SIZE); checkIndex(start + length - 1, value.length); @@ -600,27 +617,62 @@ public static java.util.UUID getUUID(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; - if (basicType != PRIMITIVE || typeInfo != UUID) throw unexpectedType(Type.UUID); + if (basicType != PRIMITIVE || typeInfo != UUID) { + throw unexpectedType(Type.UUID); + } int start = pos + 1; checkIndex(start + UUID_SIZE - 1, value.length); ByteBuffer bb = ByteBuffer.wrap(value, start, UUID_SIZE).order(ByteOrder.BIG_ENDIAN); return new java.util.UUID(bb.getLong(), bb.getLong()); } + /** + * A helper class representing the details of a Variant object, used for `ObjectHandler`. + */ + public static class ObjectInfo { + /** Number of object fields. */ + public final int numElements; + /** The integer size of the field id list. */ + public final int idSize; + /** The integer size of the offset list. 
*/ + public final int offsetSize; + /** The starting index of the field id list in the variant value array. */ + public final int idStart; + /** The starting index of the offset list in the variant value array. */ + public final int offsetStart; + /** The starting index of field data in the variant value array. */ + public final int dataStart; + + public ObjectInfo(int numElements, int idSize, int offsetSize, int idStart, int offsetStart, int dataStart) { + this.numElements = numElements; + this.idSize = idSize; + this.offsetSize = offsetSize; + this.idStart = idStart; + this.offsetStart = offsetStart; + this.dataStart = dataStart; + } + } + /** * An interface for the Variant object handler. * @param The return type of the handler */ public interface ObjectHandler { /** - * @param size Number of object fields. - * @param idSize The integer size of the field id list. - * @param offsetSize The integer size of the offset list. - * @param idStart The starting index of the field id list in the variant value array. - * @param offsetStart The starting index of the offset list in the variant value array. - * @param dataStart The starting index of field data in the variant value array. + * @param objectInfo The details of the Variant object + */ + T apply(ObjectInfo objectInfo); + } + + /** + * An interface for the Variant object handler. + * @param The return type of the handler + */ + public interface ObjectHandlerException { + /** + * @param objectInfo The details of the Variant object */ - T apply(int size, int idSize, int offsetSize, int idStart, int offsetStart, int dataStart); + T apply(ObjectInfo objectInfo) throws IOException; } /** @@ -632,24 +684,64 @@ public interface ObjectHandler { * @param The return type of the handler */ public static T handleObject(byte[] value, int pos, ObjectHandler handler) { + ObjectInfo info = parseObject(value, pos); + return handler.apply(info); + } + + /** + * Same as `handleObject` but handler can throw IOException. 
+ */ + public static T handleObjectException(byte[] value, int pos, ObjectHandlerException handler) + throws IOException { + ObjectInfo info = parseObject(value, pos); + return handler.apply(info); + } + + /** + * Parses the object at `value[pos...]`, and returns the object details. + */ + private static ObjectInfo parseObject(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; - if (basicType != OBJECT) throw unexpectedType(Type.OBJECT); + if (basicType != OBJECT) { + throw unexpectedType(Type.OBJECT); + } // Refer to the comment of the `OBJECT` constant for the details of the object header encoding. // Suppose `typeInfo` has a bit representation of 0_b4_b3b2_b1b0, the following line extracts // b4 to determine whether the object uses a 1/4-byte size. boolean largeSize = ((typeInfo >> 4) & 0x1) != 0; int sizeBytes = (largeSize ? U32_SIZE : 1); - int size = readUnsigned(value, pos + 1, sizeBytes); + int numElements = readUnsigned(value, pos + 1, sizeBytes); // Extracts b3b2 to determine the integer size of the field id list. int idSize = ((typeInfo >> 2) & 0x3) + 1; // Extracts b1b0 to determine the integer size of the offset list. int offsetSize = (typeInfo & 0x3) + 1; int idStart = pos + 1 + sizeBytes; - int offsetStart = idStart + size * idSize; - int dataStart = offsetStart + (size + 1) * offsetSize; - return handler.apply(size, idSize, offsetSize, idStart, offsetStart, dataStart); + int offsetStart = idStart + numElements * idSize; + int dataStart = offsetStart + (numElements + 1) * offsetSize; + return new ObjectInfo(numElements, idSize, offsetSize, idStart, offsetStart, dataStart); + } + + /** + * A helper class representing the details of a Variant array, used for `ArrayHandler`. + */ + public static class ArrayInfo { + /** Number of object fields. */ + public final int numElements; + /** The integer size of the offset list. 
*/ + public final int offsetSize; + /** The starting index of the offset list in the variant value array. */ + public final int offsetStart; + /** The starting index of field data in the variant value array. */ + public final int dataStart; + + public ArrayInfo(int numElements, int offsetSize, int offsetStart, int dataStart) { + this.numElements = numElements; + this.offsetSize = offsetSize; + this.offsetStart = offsetStart; + this.dataStart = dataStart; + } } /** @@ -658,12 +750,20 @@ public static T handleObject(byte[] value, int pos, ObjectHandler handler */ public interface ArrayHandler { /** - * @param size Number of array elements. - * @param offsetSize The integer size of the offset list. - * @param offsetStart The starting index of the offset list in the variant value array. - * @param dataStart The starting index of element data in the variant value array. + * @param arrayInfo The details of the Variant array + */ + T apply(ArrayInfo arrayInfo); + } + + /** + * An interface for the Variant array handler. + * @param The return type of the handler + */ + public interface ArrayHandlerException { + /** + * @param arrayInfo The details of the Variant array */ - T apply(int size, int offsetSize, int offsetStart, int dataStart); + T apply(ArrayInfo arrayInfo) throws IOException; } /** @@ -675,21 +775,40 @@ public interface ArrayHandler { * @param The return type of the handler */ public static T handleArray(byte[] value, int pos, ArrayHandler handler) { + ArrayInfo info = parseArray(value, pos); + return handler.apply(info); + } + + /** + * Same as `handleArray` but handler can throw IOException. + */ + public static T handleArrayException(byte[] value, int pos, ArrayHandlerException handler) + throws IOException { + ArrayInfo info = parseArray(value, pos); + return handler.apply(info); + } + + /** + * Parses the array at `value[pos...]`, and returns the array details. 
+ */ + private static ArrayInfo parseArray(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; - if (basicType != ARRAY) throw unexpectedType(Type.ARRAY); + if (basicType != ARRAY) { + throw unexpectedType(Type.ARRAY); + } // Refer to the comment of the `ARRAY` constant for the details of the object header encoding. // Suppose `typeInfo` has a bit representation of 000_b2_b1b0, the following line extracts // b2 to determine whether the object uses a 1/4-byte size. boolean largeSize = ((typeInfo >> 2) & 0x1) != 0; int sizeBytes = (largeSize ? U32_SIZE : 1); - int size = readUnsigned(value, pos + 1, sizeBytes); + int numElements = readUnsigned(value, pos + 1, sizeBytes); // Extracts b1b0 to determine the integer size of the offset list. int offsetSize = (typeInfo & 0x3) + 1; int offsetStart = pos + 1 + sizeBytes; - int dataStart = offsetStart + (size + 1) * offsetSize; - return handler.apply(size, offsetSize, offsetStart, dataStart); + int dataStart = offsetStart + (numElements + 1) * offsetSize; + return new ArrayInfo(numElements, offsetSize, offsetStart, dataStart); } /** @@ -705,13 +824,19 @@ public static String getMetadataKey(byte[] metadata, int id) { // offset list. int offsetSize = ((metadata[0] >> 6) & 0x3) + 1; int dictSize = readUnsigned(metadata, 1, offsetSize); - if (id >= dictSize) throw malformedVariant(); + if (id >= dictSize) { + throw new MalformedVariantException( + String.format("Invalid dictionary id: %d. dictionary size: %d", id, dictSize)); + } // There are a header byte, a `dictSize` with `offsetSize` bytes, and `(dictSize + 1)` offsets // before the string data. 
int stringStart = 1 + (dictSize + 2) * offsetSize; int offset = readUnsigned(metadata, 1 + (id + 1) * offsetSize, offsetSize); int nextOffset = readUnsigned(metadata, 1 + (id + 2) * offsetSize, offsetSize); - if (offset > nextOffset) throw malformedVariant(); + if (offset > nextOffset) { + throw new MalformedVariantException( + String.format("Invalid offset: %d. next offset: %d", offset, nextOffset)); + } checkIndex(stringStart + nextOffset - 1, metadata.length); return new String(metadata, stringStart + offset, nextOffset - offset); } diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java index 173e123d4a..9735f7afc0 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java @@ -63,12 +63,22 @@ public class TestVariantEncoding { /** Object mapper for comparing json values */ private final ObjectMapper mapper = new ObjectMapper(); + private void checkJson(String expected, String actual) { + try { + StreamReadConstraints.overrideDefaultStreamReadConstraints( + StreamReadConstraints.builder().maxNestingDepth(100000).build()); + Assert.assertEquals(mapper.readTree(expected), mapper.readTree(actual)); + } catch (IOException e) { + Assert.fail("Failed to parse json: " + e); + } + } + private void checkJson(String jsonValue) { try { StreamReadConstraints.overrideDefaultStreamReadConstraints( StreamReadConstraints.builder().maxNestingDepth(100000).build()); Variant v = VariantBuilder.parseJson(jsonValue); - Assert.assertEquals(mapper.readTree(jsonValue), mapper.readTree(v.toJson())); + checkJson(jsonValue, v.toJson()); } catch (IOException e) { Assert.fail("Failed to parse json: " + jsonValue + " " + e); } @@ -176,16 +186,59 @@ public void testIntegerBuilder() { vb2.appendLong(l); Variant v = vb2.result(); if (Byte.MIN_VALUE <= 
l && l <= Byte.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT1); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT8); } else if (Short.MIN_VALUE <= l && l <= Short.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT2); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT16); } else if (Integer.MIN_VALUE <= l && l <= Integer.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT4); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT32); } else { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT8); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT64); } Assert.assertEquals((long) l, v.getLong()); }); + + Arrays.asList( + 0, + (int) Byte.MIN_VALUE, + (int) Byte.MAX_VALUE, + (int) Short.MIN_VALUE, + (int) Short.MAX_VALUE, + Integer.MIN_VALUE, + Integer.MAX_VALUE) + .forEach(i -> { + VariantBuilder vb2 = new VariantBuilder(false); + vb2.appendLong((long) i); + Variant v = vb2.result(); + if (Byte.MIN_VALUE <= i && i <= Byte.MAX_VALUE) { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT8); + } else if (Short.MIN_VALUE <= i && i <= Short.MAX_VALUE) { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT16); + } else { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT32); + } + Assert.assertEquals((int) i, v.getInt()); + }); + + Arrays.asList((short) 0, (short) Byte.MIN_VALUE, (short) Byte.MAX_VALUE, Short.MIN_VALUE, Short.MAX_VALUE) + .forEach(s -> { + VariantBuilder vb2 = new VariantBuilder(false); + vb2.appendLong(s); + Variant v = vb2.result(); + if (Byte.MIN_VALUE <= s && s <= Byte.MAX_VALUE) { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT8); + } else { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT16); + } + Assert.assertEquals((short) s, v.getShort()); + }); + + Arrays.asList((byte) 0, Byte.MIN_VALUE, Byte.MAX_VALUE).forEach(b -> { + VariantBuilder vb2 = new VariantBuilder(false); + vb2.appendLong(b); + Variant v = vb2.result(); + checkType(v, VariantUtil.PRIMITIVE, 
VariantUtil.INT8); + Assert.assertEquals((byte) b, v.getByte()); + }); } @Test @@ -265,7 +318,7 @@ public void testDate() { int days = Math.toIntExact(LocalDate.of(2024, 12, 16).toEpochDay()); vb.appendDate(days); Assert.assertEquals("\"2024-12-16\"", vb.result().toJson()); - Assert.assertEquals(days, vb.result().getLong()); + Assert.assertEquals(days, vb.result().getInt()); } @Test @@ -346,12 +399,13 @@ public void testBinary() { public void testUUID() { VariantBuilder vb = new VariantBuilder(false); byte[] uuid = new byte[] {0, 17, 34, 51, 68, 85, 102, 119, -120, -103, -86, -69, -52, -35, -18, -1}; - vb.appendUUID(uuid); - Assert.assertEquals( - "\"00112233-4455-6677-8899-aabbccddeeff\"", vb.result().toJson()); long msb = ByteBuffer.wrap(uuid, 0, 8).order(ByteOrder.BIG_ENDIAN).getLong(); long lsb = ByteBuffer.wrap(uuid, 8, 8).order(ByteOrder.BIG_ENDIAN).getLong(); UUID expected = new UUID(msb, lsb); + + vb.appendUUID(expected); + Assert.assertEquals( + "\"00112233-4455-6677-8899-aabbccddeeff\"", vb.result().toJson()); Assert.assertEquals(expected, vb.result().getUUID()); } @@ -406,7 +460,7 @@ public void testGetObjectFields() throws IOException { } sb.append("}"); Variant v = VariantBuilder.parseJson(sb.toString()); - Assert.assertEquals(Variant.BINARY_SEARCH_THRESHOLD / 2, v.objectSize()); + Assert.assertEquals(Variant.BINARY_SEARCH_THRESHOLD / 2, v.numObjectElements()); for (int i = 0; i < Variant.BINARY_SEARCH_THRESHOLD / 2; i++) { String actual = v.getFieldByKey("field" + i).toJson(); Assert.assertEquals(String.valueOf(i), actual); @@ -425,7 +479,7 @@ public void testGetObjectFields() throws IOException { } sb.append("}"); v = VariantBuilder.parseJson(sb.toString()); - Assert.assertEquals(2 * Variant.BINARY_SEARCH_THRESHOLD, v.objectSize()); + Assert.assertEquals(2 * Variant.BINARY_SEARCH_THRESHOLD, v.numObjectElements()); for (int i = 0; i < 2 * Variant.BINARY_SEARCH_THRESHOLD; i++) { String actual = v.getFieldByKey("field" + i).toJson(); 
Assert.assertEquals(String.valueOf(i), actual); @@ -437,7 +491,7 @@ public void testGetObjectFields() throws IOException { } @Test - public void testArray() { + public void testArray() throws IOException { // simple array StringBuilder sb = new StringBuilder(); sb.append("["); @@ -447,6 +501,13 @@ public void testArray() { } sb.append("]"); checkJson(sb.toString()); + // Check array elements + Variant v = VariantBuilder.parseJson(sb.toString()); + Assert.assertEquals(SAMPLE_JSON_VALUES.size(), v.numArrayElements()); + for (int i = 0; i < SAMPLE_JSON_VALUES.size(); i++) { + String actual = v.getElementAtIndex(i).toJson(); + checkJson(SAMPLE_JSON_VALUES.get(i), actual); + } // large array sb = new StringBuilder(); @@ -457,6 +518,13 @@ public void testArray() { } sb.append("]"); checkJson(sb.toString()); + // Check array elements + v = VariantBuilder.parseJson(sb.toString()); + Assert.assertEquals(50000, v.numArrayElements()); + for (int i = 0; i < 50000; i++) { + String actual = v.getElementAtIndex(i).toJson(); + checkJson(SAMPLE_JSON_VALUES.get(i % SAMPLE_JSON_VALUES.size()), actual); + } } @Test @@ -506,8 +574,8 @@ public void testAllowDuplicateKeys() { try { Variant v = VariantBuilder.parseJson( "{\"a\": 1, \"a\": 2}", new VariantBuilder(true, VariantUtil.DEFAULT_SIZE_LIMIT)); - Assert.assertEquals(1, v.objectSize()); - Assert.assertEquals(VariantUtil.Type.LONG, v.getFieldByKey("a").getType()); + Assert.assertEquals(1, v.numObjectElements()); + Assert.assertEquals(VariantUtil.Type.BYTE, v.getFieldByKey("a").getType()); Assert.assertEquals(2, v.getFieldByKey("a").getLong()); } catch (Exception e) { Assert.fail("Unexpected exception: " + e); From 7f2cd6ecde0cc3e6937763be41a30f3a3356e4f6 Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Mon, 24 Mar 2025 14:01:09 -0700 Subject: [PATCH 16/20] Cleanup/improve apis --- .../org/apache/parquet/variant/Variant.java | 218 ++++++++-------- .../parquet/variant/VariantBuilder.java | 82 +++--- 
.../variant/VariantSizeLimitException.java | 29 --- .../apache/parquet/variant/VariantUtil.java | 243 ++++++------------ .../parquet/variant/TestVariantEncoding.java | 73 ++---- 5 files changed, 237 insertions(+), 408 deletions(-) delete mode 100644 parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index 497c3d9848..a8ac01c52c 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -65,7 +65,7 @@ public Variant(byte[] value, byte[] metadata) { this.pos = pos; // There is currently only one allowed version. if (metadata.length < 1 || (metadata[0] & VariantUtil.VERSION_MASK) != VariantUtil.VERSION) { - throw new MalformedVariantException(String.format( + throw new UnsupportedOperationException(String.format( "Unsupported variant metadata version: %02X", metadata[0] & VariantUtil.VERSION_MASK)); } } @@ -173,13 +173,6 @@ public String getString() { return VariantUtil.getString(value, pos); } - /** - * @return the primitive type id from a variant value - */ - public int getPrimitiveTypeId() { - return VariantUtil.getPrimitiveTypeId(value, pos); - } - /** * @return the type of the variant value */ @@ -191,7 +184,7 @@ public VariantUtil.Type getType() { * @return the number of object fields in the variant. `getType()` must be `Type.OBJECT`. 
*/ public int numObjectElements() { - return VariantUtil.handleObject(value, pos, (info) -> info.numElements); + return VariantUtil.getObjectInfo(value, pos).numElements; } /** @@ -201,53 +194,52 @@ public int numObjectElements() { * @return the field value whose key is equal to `key`, or null if key is not found */ public Variant getFieldByKey(String key) { - return VariantUtil.handleObject(value, pos, (info) -> { - // Use linear search for a short list. Switch to binary search when the length reaches - // `BINARY_SEARCH_THRESHOLD`. - if (info.numElements < BINARY_SEARCH_THRESHOLD) { - for (int i = 0; i < info.numElements; ++i) { - ObjectField field = getFieldAtIndex( - i, - value, - metadata, - info.idSize, - info.offsetSize, - info.idStart, - info.offsetStart, - info.dataStart); - if (field.key.equals(key)) { - return field.value; - } + VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, pos); + // Use linear search for a short list. Switch to binary search when the length reaches + // `BINARY_SEARCH_THRESHOLD`. + if (info.numElements < BINARY_SEARCH_THRESHOLD) { + for (int i = 0; i < info.numElements; ++i) { + ObjectField field = getFieldAtIndex( + i, + value, + metadata, + info.idSize, + info.offsetSize, + pos + info.idStartOffset, + pos + info.offsetStartOffset, + pos + info.dataStartOffset); + if (field.key.equals(key)) { + return field.value; } - } else { - int low = 0; - int high = info.numElements - 1; - while (low <= high) { - // Use unsigned right shift to compute the middle of `low` and `high`. This is not only a - // performance optimization, because it can properly handle the case where `low + high` - // overflows int. 
- int mid = (low + high) >>> 1; - ObjectField field = getFieldAtIndex( - mid, - value, - metadata, - info.idSize, - info.offsetSize, - info.idStart, - info.offsetStart, - info.dataStart); - int cmp = field.key.compareTo(key); - if (cmp < 0) { - low = mid + 1; - } else if (cmp > 0) { - high = mid - 1; - } else { - return field.value; - } + } + } else { + int low = 0; + int high = info.numElements - 1; + while (low <= high) { + // Use unsigned right shift to compute the middle of `low` and `high`. This is not only a + // performance optimization, because it can properly handle the case where `low + high` + // overflows int. + int mid = (low + high) >>> 1; + ObjectField field = getFieldAtIndex( + mid, + value, + metadata, + info.idSize, + info.offsetSize, + pos + info.idStartOffset, + pos + info.offsetStartOffset, + pos + info.dataStartOffset); + int cmp = field.key.compareTo(key); + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return field.value; } } - return null; - }); + } + return null; } /** @@ -270,20 +262,19 @@ public ObjectField(String key, Variant value) { * @return the ObjectField at the `index` slot, or null if `index` is out of bounds */ public ObjectField getFieldAtIndex(int index) { - return VariantUtil.handleObject(value, pos, (info) -> { - if (index < 0 || index >= info.numElements) { - return null; - } - return getFieldAtIndex( - index, - value, - metadata, - info.idSize, - info.offsetSize, - info.idStart, - info.offsetStart, - info.dataStart); - }); + VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, pos); + if (index < 0 || index >= info.numElements) { + return null; + } + return getFieldAtIndex( + index, + value, + metadata, + info.idSize, + info.offsetSize, + pos + info.idStartOffset, + pos + info.offsetStartOffset, + pos + info.dataStartOffset); } private static ObjectField getFieldAtIndex( @@ -306,7 +297,7 @@ private static ObjectField getFieldAtIndex( * @return the number of array elements. 
`getType()` must be `Type.ARRAY`. */ public int numArrayElements() { - return VariantUtil.handleArray(value, pos, (info) -> info.numElements); + return VariantUtil.getArrayInfo(value, pos).numElements; } /** @@ -316,12 +307,12 @@ public int numArrayElements() { * @return the array element Variant at the `index` slot, or null if `index` is out of bounds */ public Variant getElementAtIndex(int index) { - return VariantUtil.handleArray(value, pos, (info) -> { - if (index < 0 || index >= info.numElements) { - return null; - } - return getElementAtIndex(index, value, metadata, info.offsetSize, info.offsetStart, info.dataStart); - }); + VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value, pos); + if (index < 0 || index >= info.numElements) { + return null; + } + return getElementAtIndex( + index, value, metadata, info.offsetSize, pos + info.offsetStartOffset, pos + info.dataStartOffset); } private static Variant getElementAtIndex( @@ -456,44 +447,47 @@ private static void toJsonImpl( byte[] value, byte[] metadata, int pos, JsonGenerator gen, ZoneId zoneId, boolean truncateTrailingZeros) throws IOException { switch (VariantUtil.getType(value, pos)) { - case OBJECT: - VariantUtil.handleObjectException(value, pos, (info) -> { - gen.writeStartObject(); - for (int i = 0; i < info.numElements; ++i) { - ObjectField field = getFieldAtIndex( - i, - value, - metadata, - info.idSize, - info.offsetSize, - info.idStart, - info.offsetStart, - info.dataStart); - gen.writeFieldName(field.key); - toJsonImpl( - field.value.value, - field.value.metadata, - field.value.pos, - gen, - zoneId, - truncateTrailingZeros); - } - gen.writeEndObject(); - return null; - }); + case OBJECT: { + VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, pos); + gen.writeStartObject(); + for (int i = 0; i < info.numElements; ++i) { + ObjectField field = getFieldAtIndex( + i, + value, + metadata, + info.idSize, + info.offsetSize, + pos + info.idStartOffset, + pos + info.offsetStartOffset, + 
pos + info.dataStartOffset); + gen.writeFieldName(field.key); + toJsonImpl( + field.value.value, + field.value.metadata, + field.value.pos, + gen, + zoneId, + truncateTrailingZeros); + } + gen.writeEndObject(); break; - case ARRAY: - VariantUtil.handleArrayException(value, pos, (info) -> { - gen.writeStartArray(); - for (int i = 0; i < info.numElements; ++i) { - Variant v = getElementAtIndex( - i, value, metadata, info.offsetSize, info.offsetStart, info.dataStart); - toJsonImpl(v.value, v.metadata, v.pos, gen, zoneId, truncateTrailingZeros); - } - gen.writeEndArray(); - return null; - }); + } + case ARRAY: { + VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value, pos); + gen.writeStartArray(); + for (int i = 0; i < info.numElements; ++i) { + Variant v = getElementAtIndex( + i, + value, + metadata, + info.offsetSize, + pos + info.offsetStartOffset, + pos + info.dataStartOffset); + toJsonImpl(v.value, v.metadata, v.pos, gen, zoneId, truncateTrailingZeros); + } + gen.writeEndArray(); break; + } case NULL: gen.writeNull(); break; @@ -512,7 +506,9 @@ private static void toJsonImpl( case DOUBLE: gen.writeNumber(VariantUtil.getDouble(value, pos)); break; - case DECIMAL: + case DECIMAL4: + case DECIMAL8: + case DECIMAL16: if (truncateTrailingZeros) { gen.writeNumber(VariantUtil.getDecimal(value, pos) .stripTrailingZeros() diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java index 7cf28ef1b9..cdf23b55c1 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java @@ -62,7 +62,6 @@ public VariantBuilder(boolean allowDuplicateKeys, int sizeLimitBytes) { * @param json the JSON string to parse * @return the Variant value * @throws IOException if any JSON parsing error happens - * @throws VariantSizeLimitException if the resulting variant value or 
metadata would exceed * the size limit */ public static Variant parseJson(String json) throws IOException { @@ -75,8 +74,6 @@ public static Variant parseJson(String json) throws IOException { * @param builder the VariantBuilder to use for building the Variant * @return the Variant value * @throws IOException if any JSON parsing error happens - * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed - * the size limit */ public static Variant parseJson(String json, VariantBuilder builder) throws IOException { try (JsonParser parser = new JsonFactory().createParser(json)) { @@ -91,8 +88,6 @@ public static Variant parseJson(String json, VariantBuilder builder) throws IOEx * @param builder the VariantBuilder to use for building the Variant * @return the Variant value * @throws IOException if any JSON parsing error happens - * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed - * the size limit */ public static Variant parseJson(JsonParser parser, VariantBuilder builder) throws IOException { builder.buildFromJsonParser(parser); @@ -101,8 +96,6 @@ public static Variant parseJson(JsonParser parser, VariantBuilder builder) throw /** * @return the Variant value - * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed - * the size limit */ public Variant result() { int numKeys = dictionaryKeys.size(); @@ -116,18 +109,12 @@ public Variant result() { // unlikely that the number of keys could be larger, but incorporate that into the calculation // in case of pathological data. 
long maxSize = Math.max(dictionaryStringSize, numKeys); - if (maxSize > sizeLimitBytes) { - throw new VariantSizeLimitException(sizeLimitBytes, maxSize); - } int offsetSize = getMinIntegerSize((int) maxSize); int offsetStart = 1 + offsetSize; int stringStart = offsetStart + (numKeys + 1) * offsetSize; long metadataSize = stringStart + dictionaryStringSize; - if (metadataSize > sizeLimitBytes) { - throw new VariantSizeLimitException(sizeLimitBytes, metadataSize); - } byte[] metadata = new byte[(int) metadataSize]; int headerByte = VariantUtil.VERSION | ((offsetSize - 1) << 6); VariantUtil.writeLong(metadata, 0, headerByte, 1); @@ -462,39 +449,37 @@ private void appendVariantImpl(byte[] value, byte[] metadata, int pos) { VariantUtil.checkIndex(pos, value.length); int basicType = value[pos] & VariantUtil.BASIC_TYPE_MASK; switch (basicType) { - case VariantUtil.OBJECT: - VariantUtil.handleObject(value, pos, (info) -> { - ArrayList fields = new ArrayList<>(info.numElements); - int start = writePos; - for (int i = 0; i < info.numElements; ++i) { - int id = VariantUtil.readUnsigned(value, info.idStart + info.idSize * i, info.idSize); - int offset = VariantUtil.readUnsigned( - value, info.offsetStart + info.offsetSize * i, info.offsetSize); - int elementPos = info.dataStart + offset; - String key = VariantUtil.getMetadataKey(metadata, id); - int newId = addKey(key); - fields.add(new FieldEntry(key, newId, writePos - start)); - appendVariantImpl(value, metadata, elementPos); - } - finishWritingObject(start, fields); - return null; - }); + case VariantUtil.OBJECT: { + VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, pos); + ArrayList fields = new ArrayList<>(info.numElements); + int start = writePos; + for (int i = 0; i < info.numElements; ++i) { + int id = VariantUtil.readUnsigned(value, pos + info.idStartOffset + info.idSize * i, info.idSize); + int offset = VariantUtil.readUnsigned( + value, pos + info.offsetStartOffset + info.offsetSize * i, 
info.offsetSize); + int elementPos = pos + info.dataStartOffset + offset; + String key = VariantUtil.getMetadataKey(metadata, id); + int newId = addKey(key); + fields.add(new FieldEntry(key, newId, writePos - start)); + appendVariantImpl(value, metadata, elementPos); + } + finishWritingObject(start, fields); break; - case VariantUtil.ARRAY: - VariantUtil.handleArray(value, pos, (info) -> { - ArrayList offsets = new ArrayList<>(info.numElements); - int start = writePos; - for (int i = 0; i < info.numElements; ++i) { - int offset = VariantUtil.readUnsigned( - value, info.offsetStart + info.offsetSize * i, info.offsetSize); - int elementPos = info.dataStart + offset; - offsets.add(writePos - start); - appendVariantImpl(value, metadata, elementPos); - } - finishWritingArray(start, offsets); - return null; - }); + } + case VariantUtil.ARRAY: { + VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value, pos); + ArrayList offsets = new ArrayList<>(info.numElements); + int start = writePos; + for (int i = 0; i < info.numElements; ++i) { + int offset = VariantUtil.readUnsigned( + value, pos + info.offsetStartOffset + info.offsetSize * i, info.offsetSize); + int elementPos = pos + info.dataStartOffset + offset; + offsets.add(writePos - start); + appendVariantImpl(value, metadata, elementPos); + } + finishWritingArray(start, offsets); break; + } default: shallowAppendVariantImpl(value, pos); break; @@ -515,9 +500,6 @@ private void checkCapacity(int additionalBytes) { // Allocate a new buffer with a capacity of the next power of 2 of `requiredBytes`. int newCapacity = Integer.highestOneBit(requiredBytes); newCapacity = newCapacity < requiredBytes ? 
newCapacity * 2 : newCapacity; - if (newCapacity > sizeLimitBytes) { - throw new VariantSizeLimitException(sizeLimitBytes, newCapacity); - } byte[] newValue = new byte[newCapacity]; System.arraycopy(writeBuffer, 0, newValue, 0, writePos); writeBuffer = newValue; @@ -585,8 +567,10 @@ private void buildFromJsonParser(JsonParser parser) throws IOException { try { appendLong(parser.getLongValue()); } catch (InputCoercionException ignored) { - // If the value doesn't fit any integer type, parse it as decimal or floating instead. - parseAndAppendFloatingPoint(parser); + // If the value doesn't fit any integer type, try to parse it as decimal instead. + if (!tryParseDecimal(parser.getText())) { + throw new JsonParseException(parser, "Cannot parse token as int/decimal. token: " + token); + } } break; case VALUE_NUMBER_FLOAT: diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java deleted file mode 100644 index a86a41ad6e..0000000000 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.parquet.variant; - -/** - * An exception indicating that the metadata or data size of the Variant exceeds the - * configured size limit. - */ -public class VariantSizeLimitException extends RuntimeException { - public VariantSizeLimitException(long sizeLimitBytes, long estimatedSizeBytes) { - super(String.format( - "Variant size exceeds the limit of %d bytes. Estimated size: %d bytes", - sizeLimitBytes, estimatedSizeBytes)); - } -} diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java index 097c4cd811..5c2e693e98 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java @@ -16,7 +16,6 @@ */ package org.apache.parquet.variant; -import java.io.IOException; import java.math.BigDecimal; import java.math.BigInteger; import java.nio.ByteBuffer; @@ -228,7 +227,7 @@ public static byte arrayHeader(boolean largeSize, int offsetSize) { * Check the validity of an array index `pos`. * @param pos The index to check * @param length The length of the array - * @throws MalformedVariantException if the index is out of bound + * @throws IllegalArgumentException if the index is out of bound */ public static void checkIndex(int pos, int length) { if (pos < 0 || pos >= length) { @@ -274,14 +273,13 @@ static int readUnsigned(byte[] bytes, int pos, int numBytes) { result |= unsignedByteValue << (8 * i); } if (result < 0) { - throw new MalformedVariantException(String.format("Failed to read unsigned int. numBytes: %d", numBytes)); + throw new IllegalArgumentException(String.format("Failed to read unsigned int. numBytes: %d", numBytes)); } return result; } /** - * The value type of Variant value. 
It is determined by the header byte but not a 1:1 mapping - * (for example, INT1/2/4/8 all maps to `Type.LONG`). + * The value type of Variant value. It is determined by the header byte. */ public enum Type { OBJECT, @@ -294,7 +292,9 @@ public enum Type { LONG, STRING, DOUBLE, - DECIMAL, + DECIMAL4, + DECIMAL8, + DECIMAL16, DATE, TIMESTAMP, TIMESTAMP_NTZ, @@ -348,9 +348,11 @@ public static Type getType(byte[] value, int pos) { case DOUBLE: return Type.DOUBLE; case DECIMAL4: + return Type.DECIMAL4; case DECIMAL8: + return Type.DECIMAL8; case DECIMAL16: - return Type.DECIMAL; + return Type.DECIMAL16; case DATE: return Type.DATE; case TIMESTAMP: @@ -383,7 +385,6 @@ public static Type getType(byte[] value, int pos) { * @param value The Variant value * @param pos The starting index of the Variant value * @return The actual size of the Variant value - * @throws MalformedVariantException if the Variant is malformed */ public static int valueSize(byte[] value, int pos) { checkIndex(pos, value.length); @@ -392,22 +393,22 @@ public static int valueSize(byte[] value, int pos) { switch (basicType) { case SHORT_STR: return 1 + typeInfo; - case OBJECT: - return handleObject( - value, - pos, - (info) -> info.dataStart - - pos - + readUnsigned( - value, info.offsetStart + info.numElements * info.offsetSize, info.offsetSize)); - case ARRAY: - return handleArray( - value, - pos, - (info) -> info.dataStart - - pos - + readUnsigned( - value, info.offsetStart + info.numElements * info.offsetSize, info.offsetSize)); + case OBJECT: { + VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, pos); + return info.dataStartOffset + + readUnsigned( + value, + pos + info.offsetStartOffset + info.numElements * info.offsetSize, + info.offsetSize); + } + case ARRAY: { + VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value, pos); + return info.dataStartOffset + + readUnsigned( + value, + pos + info.offsetStartOffset + info.numElements * info.offsetSize, + info.offsetSize); + } 
default: switch (typeInfo) { case NULL: @@ -447,8 +448,12 @@ public static int valueSize(byte[] value, int pos) { } } - private static MalformedVariantException unexpectedType(Type type) { - return new MalformedVariantException("Expected type to be " + type); + private static IllegalArgumentException unexpectedType(Type type) { + return new IllegalArgumentException("Expected type to be " + type); + } + + private static IllegalArgumentException unexpectedType(Type[] types) { + return new IllegalArgumentException("Expected type to be one of: " + Arrays.toString(types)); } public static boolean getBoolean(byte[] value, int pos) { @@ -515,26 +520,12 @@ public static double getDouble(byte[] value, int pos) { return Double.longBitsToDouble(readLong(value, pos + 1, 8)); } - /** - * Checks whether the precision and scale of the decimal are within the limit. - * @param d The decimal value to check - * @param maxPrecision The maximum precision allowed - * @throws MalformedVariantException if the decimal is malformed - */ - private static void checkDecimal(BigDecimal d, int maxPrecision) { - if (d.precision() > maxPrecision || d.scale() > maxPrecision) { - throw new MalformedVariantException(String.format( - "Decimal (precision: %d, scale: %d) exceeds max precision %d", - d.precision(), d.scale(), maxPrecision)); - } - } - public static BigDecimal getDecimalWithOriginalScale(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != PRIMITIVE) { - throw unexpectedType(Type.DECIMAL); + throw unexpectedType(new Type[] {Type.DECIMAL4, Type.DECIMAL8, Type.DECIMAL16}); } // Interpret the scale byte as unsigned. If it is a negative byte, the unsigned value must be // greater than `MAX_DECIMAL16_PRECISION` and will trigger an error in `checkDecimal`. 
@@ -543,11 +534,9 @@ public static BigDecimal getDecimalWithOriginalScale(byte[] value, int pos) { switch (typeInfo) { case DECIMAL4: result = BigDecimal.valueOf(readLong(value, pos + 2, 4), scale); - checkDecimal(result, MAX_DECIMAL4_PRECISION); break; case DECIMAL8: result = BigDecimal.valueOf(readLong(value, pos + 2, 8), scale); - checkDecimal(result, MAX_DECIMAL8_PRECISION); break; case DECIMAL16: checkIndex(pos + 17, value.length); @@ -558,10 +547,9 @@ public static BigDecimal getDecimalWithOriginalScale(byte[] value, int pos) { bytes[i] = value[pos + 17 - i]; } result = new BigDecimal(new BigInteger(bytes), scale); - checkDecimal(result, MAX_DECIMAL16_PRECISION); break; default: - throw unexpectedType(Type.DECIMAL); + throw unexpectedType(new Type[] {Type.DECIMAL4, Type.DECIMAL8, Type.DECIMAL16}); } return result; } @@ -636,71 +624,33 @@ public static class ObjectInfo { public final int idSize; /** The integer size of the offset list. */ public final int offsetSize; - /** The starting index of the field id list in the variant value array. */ - public final int idStart; - /** The starting index of the offset list in the variant value array. */ - public final int offsetStart; - /** The starting index of field data in the variant value array. */ - public final int dataStart; - - public ObjectInfo(int numElements, int idSize, int offsetSize, int idStart, int offsetStart, int dataStart) { + /** The byte offset (from the beginning of the Variant object) of the field id list. */ + public final int idStartOffset; + /** The byte offset (from the beginning of the Variant object) of the offset list. */ + public final int offsetStartOffset; + /** The byte offset (from the beginning of the Variant object) of the field data. 
*/ + public final int dataStartOffset; + + public ObjectInfo( + int numElements, + int idSize, + int offsetSize, + int idStartOffset, + int offsetStartOffset, + int dataStartOffset) { this.numElements = numElements; this.idSize = idSize; this.offsetSize = offsetSize; - this.idStart = idStart; - this.offsetStart = offsetStart; - this.dataStart = dataStart; + this.idStartOffset = idStartOffset; + this.offsetStartOffset = offsetStartOffset; + this.dataStartOffset = dataStartOffset; } } - /** - * An interface for the Variant object handler. - * @param The return type of the handler - */ - public interface ObjectHandler { - /** - * @param objectInfo The details of the Variant object - */ - T apply(ObjectInfo objectInfo); - } - - /** - * An interface for the Variant object handler. - * @param The return type of the handler - */ - public interface ObjectHandlerException { - /** - * @param objectInfo The details of the Variant object - */ - T apply(ObjectInfo objectInfo) throws IOException; - } - - /** - * A helper function to access a Variant object, at `value[pos...]`. - * @param value The Variant value - * @param pos The starting index of the Variant value - * @param handler The handler to process the object - * @return The result of the handler - * @param The return type of the handler - */ - public static T handleObject(byte[] value, int pos, ObjectHandler handler) { - ObjectInfo info = parseObject(value, pos); - return handler.apply(info); - } - - /** - * Same as `handleObject` but handler can throw IOException. - */ - public static T handleObjectException(byte[] value, int pos, ObjectHandlerException handler) - throws IOException { - ObjectInfo info = parseObject(value, pos); - return handler.apply(info); - } - /** * Parses the object at `value[pos...]`, and returns the object details. 
*/ - private static ObjectInfo parseObject(byte[] value, int pos) { + public static ObjectInfo getObjectInfo(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; @@ -717,10 +667,11 @@ private static ObjectInfo parseObject(byte[] value, int pos) { int idSize = ((typeInfo >> 2) & 0x3) + 1; // Extracts b1b0 to determine the integer size of the offset list. int offsetSize = (typeInfo & 0x3) + 1; - int idStart = pos + 1 + sizeBytes; - int offsetStart = idStart + numElements * idSize; - int dataStart = offsetStart + (numElements + 1) * offsetSize; - return new ObjectInfo(numElements, idSize, offsetSize, idStart, offsetStart, dataStart); + // int idStart = pos + 1 + sizeBytes; + int idStartOffset = 1 + sizeBytes; + int offsetStartOffset = idStartOffset + numElements * idSize; + int dataStartOffset = offsetStartOffset + (numElements + 1) * offsetSize; + return new ObjectInfo(numElements, idSize, offsetSize, idStartOffset, offsetStartOffset, dataStartOffset); } /** @@ -731,67 +682,23 @@ public static class ArrayInfo { public final int numElements; /** The integer size of the offset list. */ public final int offsetSize; - /** The starting index of the offset list in the variant value array. */ - public final int offsetStart; - /** The starting index of field data in the variant value array. */ - public final int dataStart; + /** The byte offset (from the beginning of the Variant array) of the offset list. */ + public final int offsetStartOffset; + /** The byte offset (from the beginning of the Variant array) of the field data. 
*/ + public final int dataStartOffset; - public ArrayInfo(int numElements, int offsetSize, int offsetStart, int dataStart) { + public ArrayInfo(int numElements, int offsetSize, int offsetStartOffset, int dataStartOffset) { this.numElements = numElements; this.offsetSize = offsetSize; - this.offsetStart = offsetStart; - this.dataStart = dataStart; + this.offsetStartOffset = offsetStartOffset; + this.dataStartOffset = dataStartOffset; } } - /** - * An interface for the Variant array handler. - * @param The return type of the handler - */ - public interface ArrayHandler { - /** - * @param arrayInfo The details of the Variant array - */ - T apply(ArrayInfo arrayInfo); - } - - /** - * An interface for the Variant array handler. - * @param The return type of the handler - */ - public interface ArrayHandlerException { - /** - * @param arrayInfo The details of the Variant array - */ - T apply(ArrayInfo arrayInfo) throws IOException; - } - - /** - * A helper function to access a Variant array, at `value[pos...]`. - * @param value The Variant value - * @param pos The starting index of the Variant value - * @param handler The handler to process the array - * @return The result of the handler - * @param The return type of the handler - */ - public static T handleArray(byte[] value, int pos, ArrayHandler handler) { - ArrayInfo info = parseArray(value, pos); - return handler.apply(info); - } - - /** - * Same as `handleArray` but handler can throw IOException. - */ - public static T handleArrayException(byte[] value, int pos, ArrayHandlerException handler) - throws IOException { - ArrayInfo info = parseArray(value, pos); - return handler.apply(info); - } - /** * Parses the array at `value[pos...]`, and returns the array details. 
*/ - private static ArrayInfo parseArray(byte[] value, int pos) { + public static ArrayInfo getArrayInfo(byte[] value, int pos) { checkIndex(pos, value.length); int basicType = value[pos] & BASIC_TYPE_MASK; int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; @@ -806,9 +713,9 @@ private static ArrayInfo parseArray(byte[] value, int pos) { int numElements = readUnsigned(value, pos + 1, sizeBytes); // Extracts b1b0 to determine the integer size of the offset list. int offsetSize = (typeInfo & 0x3) + 1; - int offsetStart = pos + 1 + sizeBytes; - int dataStart = offsetStart + (numElements + 1) * offsetSize; - return new ArrayInfo(numElements, offsetSize, offsetStart, dataStart); + int offsetStartOffset = 1 + sizeBytes; + int dataStartOffset = offsetStartOffset + (numElements + 1) * offsetSize; + return new ArrayInfo(numElements, offsetSize, offsetStartOffset, dataStartOffset); } /** @@ -816,7 +723,8 @@ private static ArrayInfo parseArray(byte[] value, int pos) { * @param metadata The Variant metadata * @param id The key id * @return The key - * @throws MalformedVariantException if the Variant is malformed or if the id is out of bounds + * @throws MalformedVariantException if the Variant is malformed + * @throws IllegalArgumentException the id is out of bounds */ public static String getMetadataKey(byte[] metadata, int id) { checkIndex(0, metadata.length); @@ -825,19 +733,20 @@ public static String getMetadataKey(byte[] metadata, int id) { int offsetSize = ((metadata[0] >> 6) & 0x3) + 1; int dictSize = readUnsigned(metadata, 1, offsetSize); if (id >= dictSize) { - throw new MalformedVariantException( + throw new IllegalArgumentException( String.format("Invalid dictionary id: %d. dictionary size: %d", id, dictSize)); } - // There are a header byte, a `dictSize` with `offsetSize` bytes, and `(dictSize + 1)` offsets - // before the string data. 
- int stringStart = 1 + (dictSize + 2) * offsetSize; + // The offset list after the header byte, and a `dictSize` with `offsetSize` bytes. + int offsetListOffset = 1 + offsetSize; + // The data starts after the offset list, and `(dictSize + 1)` offset values. + int dataOffset = offsetListOffset + (dictSize + 1) * offsetSize; int offset = readUnsigned(metadata, 1 + (id + 1) * offsetSize, offsetSize); int nextOffset = readUnsigned(metadata, 1 + (id + 2) * offsetSize, offsetSize); if (offset > nextOffset) { throw new MalformedVariantException( String.format("Invalid offset: %d. next offset: %d", offset, nextOffset)); } - checkIndex(stringStart + nextOffset - 1, metadata.length); - return new String(metadata, stringStart + offset, nextOffset - offset); + checkIndex(dataOffset + nextOffset - 1, metadata.length); + return new String(metadata, dataOffset + offset, nextOffset - offset); } } diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java index 9735f7afc0..60a767df0c 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java @@ -84,9 +84,9 @@ private void checkJson(String jsonValue) { } } - private void checkType(Variant v, int expectedBasicType, int expectedPrimitiveTypeId) { + private void checkType(Variant v, int expectedBasicType, VariantUtil.Type expectedType) { Assert.assertEquals(expectedBasicType, v.value[v.pos] & VariantUtil.BASIC_TYPE_MASK); - Assert.assertEquals(expectedPrimitiveTypeId, v.getPrimitiveTypeId()); + Assert.assertEquals(expectedType, v.getType()); } private long microsSinceEpoch(Instant instant) { @@ -157,7 +157,7 @@ public void testDecimalJson() { public void testNullBuilder() { VariantBuilder vb = new VariantBuilder(false); vb.appendNull(); - checkType(vb.result(), VariantUtil.NULL, 0); + 
checkType(vb.result(), VariantUtil.NULL, VariantUtil.Type.NULL); } @Test @@ -165,7 +165,7 @@ public void testBooleanBuilder() { Arrays.asList(true, false).forEach(b -> { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendBoolean(b); - checkType(vb2.result(), VariantUtil.PRIMITIVE, b ? VariantUtil.TRUE : VariantUtil.FALSE); + checkType(vb2.result(), VariantUtil.PRIMITIVE, VariantUtil.Type.BOOLEAN); }); } @@ -186,13 +186,13 @@ public void testIntegerBuilder() { vb2.appendLong(l); Variant v = vb2.result(); if (Byte.MIN_VALUE <= l && l <= Byte.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT8); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.BYTE); } else if (Short.MIN_VALUE <= l && l <= Short.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT16); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.SHORT); } else if (Integer.MIN_VALUE <= l && l <= Integer.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT32); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.INT); } else { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT64); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.LONG); } Assert.assertEquals((long) l, v.getLong()); }); @@ -210,11 +210,11 @@ public void testIntegerBuilder() { vb2.appendLong((long) i); Variant v = vb2.result(); if (Byte.MIN_VALUE <= i && i <= Byte.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT8); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.BYTE); } else if (Short.MIN_VALUE <= i && i <= Short.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT16); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.SHORT); } else { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT32); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.INT); } Assert.assertEquals((int) i, v.getInt()); }); @@ -225,9 +225,9 @@ public void testIntegerBuilder() { vb2.appendLong(s); Variant v = vb2.result(); if (Byte.MIN_VALUE <= s && s <= 
Byte.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT8); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.BYTE); } else { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT16); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.SHORT); } Assert.assertEquals((short) s, v.getShort()); }); @@ -236,7 +236,7 @@ public void testIntegerBuilder() { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendLong(b); Variant v = vb2.result(); - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT8); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.BYTE); Assert.assertEquals((byte) b, v.getByte()); }); } @@ -247,7 +247,7 @@ public void testFloatBuilder() { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendFloat(f); Variant v = vb2.result(); - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.FLOAT); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.FLOAT); Assert.assertEquals(f, v.getFloat(), 0.000001); }); } @@ -258,7 +258,7 @@ public void testDoubleBuilder() { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendDouble(d); Variant v = vb2.result(); - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DOUBLE); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.DOUBLE); Assert.assertEquals(d, v.getDouble(), 0.000001); }); } @@ -272,9 +272,9 @@ public void testStringBuilder() { vb2.appendString(s); Variant v = vb2.result(); if (len <= VariantUtil.MAX_SHORT_STR_SIZE) { - checkType(v, VariantUtil.SHORT_STR, len); + checkType(v, VariantUtil.SHORT_STR, VariantUtil.Type.STRING); } else { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.LONG_STR); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.STRING); } Assert.assertEquals(s, v.getString()); }); @@ -287,7 +287,7 @@ public void testDecimalBuilder() { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendDecimal(d); Variant v = vb2.result(); - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DECIMAL4); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.DECIMAL4); 
Assert.assertEquals(d, v.getDecimal()); }); @@ -297,7 +297,7 @@ public void testDecimalBuilder() { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendDecimal(d); Variant v = vb2.result(); - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DECIMAL8); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.DECIMAL8); Assert.assertEquals(d, v.getDecimal()); }); @@ -307,7 +307,7 @@ public void testDecimalBuilder() { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendDecimal(d); Variant v = vb2.result(); - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DECIMAL16); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.DECIMAL16); Assert.assertEquals(d, v.getDecimal()); }); } @@ -527,37 +527,6 @@ public void testArray() throws IOException { } } - @Test - public void testSizeLimit() { - // large metadata size - try { - VariantBuilder.parseJson( - "{\"12345678901234567890\": 1, \"123456789012345678901\": 2}", new VariantBuilder(false, 20)); - Assert.fail("Expected VariantSizeLimitException with large metadata"); - } catch (IOException e) { - Assert.fail("Expected VariantSizeLimitException with large metadata"); - } catch (VariantSizeLimitException e) { - // Expected - } - - // large data size - try { - StringBuilder sb = new StringBuilder(); - sb.append("["); - for (int i = 0; i < 100; i++) { - if (i > 0) sb.append(", "); - sb.append("{\"a\":1}"); - } - sb.append("]"); - VariantBuilder.parseJson(sb.toString(), new VariantBuilder(false, 100)); - Assert.fail("Expected VariantSizeLimitException with large data"); - } catch (IOException e) { - Assert.fail("Expected VariantSizeLimitException with large data"); - } catch (VariantSizeLimitException e) { - // Expected - } - } - @Test public void testAllowDuplicateKeys() { // disallow duplicate keys From f310080eff10dea2508435ae95f5b59fe06da1e8 Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Mon, 24 Mar 2025 14:25:31 -0700 Subject: [PATCH 17/20] cleanup unused constructor/member --- 
.../org/apache/parquet/variant/VariantBuilder.java | 12 ------------ .../apache/parquet/variant/TestVariantEncoding.java | 3 +-- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java index cdf23b55c1..5f814c3722 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java @@ -43,18 +43,7 @@ public class VariantBuilder { * Otherwise, an exception will be thrown. */ public VariantBuilder(boolean allowDuplicateKeys) { - this(allowDuplicateKeys, VariantUtil.DEFAULT_SIZE_LIMIT); - } - - /** - * Creates a VariantBuilder. - * @param allowDuplicateKeys if true, only the last occurrence of a duplicate key will be kept. - * Otherwise, an exception will be thrown. - * @param sizeLimitBytes the maximum size (in bytes) of the resulting Variant value or metadata - */ - public VariantBuilder(boolean allowDuplicateKeys, int sizeLimitBytes) { this.allowDuplicateKeys = allowDuplicateKeys; - this.sizeLimitBytes = sizeLimitBytes; } /** @@ -650,5 +639,4 @@ private boolean tryParseDecimal(String input) { private final ArrayList dictionaryKeys = new ArrayList<>(); private final boolean allowDuplicateKeys; - private final int sizeLimitBytes; } diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java index 60a767df0c..9326d866c7 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java @@ -541,8 +541,7 @@ public void testAllowDuplicateKeys() { // allow duplicate keys try { - Variant v = VariantBuilder.parseJson( - "{\"a\": 1, \"a\": 2}", new VariantBuilder(true, 
VariantUtil.DEFAULT_SIZE_LIMIT)); + Variant v = VariantBuilder.parseJson("{\"a\": 1, \"a\": 2}", new VariantBuilder(true)); Assert.assertEquals(1, v.numObjectElements()); Assert.assertEquals(VariantUtil.Type.BYTE, v.getFieldByKey("a").getType()); Assert.assertEquals(2, v.getFieldByKey("a").getLong()); From 1040ae892001f0183d54577440adb34a29f7e025 Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Tue, 25 Mar 2025 14:24:10 -0700 Subject: [PATCH 18/20] Update api to use byte-array + offset --- .../org/apache/parquet/variant/Variant.java | 206 ++++++----- .../parquet/variant/VariantBuilder.java | 31 +- .../apache/parquet/variant/VariantUtil.java | 24 +- .../parquet/variant/TestVariantEncoding.java | 345 +++++++++++------- 4 files changed, 345 insertions(+), 261 deletions(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index a8ac01c52c..92ee094feb 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -28,7 +28,6 @@ import java.time.format.DateTimeFormatterBuilder; import java.time.temporal.ChronoField; import java.time.temporal.ChronoUnit; -import java.util.Arrays; import java.util.Base64; import java.util.Locale; import java.util.UUID; @@ -38,13 +37,15 @@ * This Variant class holds the Variant-encoded value and metadata binary values. */ public final class Variant { + /** The buffer that contains the Variant value. */ final byte[] value; + /** The starting index into `value` where the Variant value begins. */ + final int valuePos; + + /** The buffer that contains the Variant metadata. */ final byte[] metadata; - /** - * The starting index into `value` where the variant value starts. This is used to avoid copying - * the value binary when reading a sub-variant in the array/object element. 
- */ - final int pos; + /** The starting index into `metadata` where the Variant metadata begins. */ + final int metadataPos; /** * The threshold to switch from linear search to binary search when looking up a field by key in @@ -56,46 +57,42 @@ public final class Variant { static final ZoneId UTC = ZoneId.of("UTC"); public Variant(byte[] value, byte[] metadata) { - this(value, metadata, 0); + this(value, 0, metadata, 0); } - Variant(byte[] value, byte[] metadata, int pos) { + Variant(byte[] value, int valuePos, byte[] metadata, int metadataPos) { + if (valuePos < 0 || valuePos >= value.length) { + throw new IllegalArgumentException( + String.format("Invalid valuePos: %d. value.length: %d", valuePos, value.length)); + } + if (metadataPos < 0 || metadataPos >= metadata.length) { + throw new IllegalArgumentException( + String.format("Invalid metadataPos: %d. metadata.length: %d", metadataPos, metadata.length)); + } this.value = value; + this.valuePos = valuePos; + this.metadata = metadata; - this.pos = pos; + this.metadataPos = metadataPos; // There is currently only one allowed version. - if (metadata.length < 1 || (metadata[0] & VariantUtil.VERSION_MASK) != VariantUtil.VERSION) { + if ((metadata[metadataPos] & VariantUtil.VERSION_MASK) != VariantUtil.VERSION) { throw new UnsupportedOperationException(String.format( - "Unsupported variant metadata version: %02X", metadata[0] & VariantUtil.VERSION_MASK)); + "Unsupported variant metadata version: %02X", metadata[metadataPos] & VariantUtil.VERSION_MASK)); } } - public byte[] getValue() { - if (pos == 0) { - // Position 0 means the entire value is used. Return the original value. 
- return value; - } - int size = VariantUtil.valueSize(value, pos); - VariantUtil.checkIndex(pos + size - 1, value.length); - return Arrays.copyOfRange(value, pos, pos + size); - } - - public byte[] getMetadata() { - return metadata; - } - /** * @return the boolean value */ public boolean getBoolean() { - return VariantUtil.getBoolean(value, pos); + return VariantUtil.getBoolean(value, valuePos); } /** * @return the byte value */ public byte getByte() { - long longValue = VariantUtil.getLong(value, pos); + long longValue = VariantUtil.getLong(value, valuePos); if (longValue < Byte.MIN_VALUE || longValue > Byte.MAX_VALUE) { throw new IllegalStateException("Value out of range for byte: " + longValue); } @@ -106,7 +103,7 @@ public byte getByte() { * @return the short value */ public short getShort() { - long longValue = VariantUtil.getLong(value, pos); + long longValue = VariantUtil.getLong(value, valuePos); if (longValue < Short.MIN_VALUE || longValue > Short.MAX_VALUE) { throw new IllegalStateException("Value out of range for short: " + longValue); } @@ -117,7 +114,7 @@ public short getShort() { * @return the int value */ public int getInt() { - long longValue = VariantUtil.getLong(value, pos); + long longValue = VariantUtil.getLong(value, valuePos); if (longValue < Integer.MIN_VALUE || longValue > Integer.MAX_VALUE) { throw new IllegalStateException("Value out of range for int: " + longValue); } @@ -128,63 +125,63 @@ public int getInt() { * @return the long value */ public long getLong() { - return VariantUtil.getLong(value, pos); + return VariantUtil.getLong(value, valuePos); } /** * @return the double value */ public double getDouble() { - return VariantUtil.getDouble(value, pos); + return VariantUtil.getDouble(value, valuePos); } /** * @return the decimal value */ public BigDecimal getDecimal() { - return VariantUtil.getDecimal(value, pos); + return VariantUtil.getDecimal(value, valuePos); } /** * @return the float value */ public float getFloat() { - return 
VariantUtil.getFloat(value, pos); + return VariantUtil.getFloat(value, valuePos); } /** * @return the binary value */ public byte[] getBinary() { - return VariantUtil.getBinary(value, pos); + return VariantUtil.getBinary(value, valuePos); } /** * @return the UUID value */ public UUID getUUID() { - return VariantUtil.getUUID(value, pos); + return VariantUtil.getUUID(value, valuePos); } /** * @return the string value */ public String getString() { - return VariantUtil.getString(value, pos); + return VariantUtil.getString(value, valuePos); } /** * @return the type of the variant value */ public VariantUtil.Type getType() { - return VariantUtil.getType(value, pos); + return VariantUtil.getType(value, valuePos); } /** * @return the number of object fields in the variant. `getType()` must be `Type.OBJECT`. */ public int numObjectElements() { - return VariantUtil.getObjectInfo(value, pos).numElements; + return VariantUtil.getObjectInfo(value, valuePos).numElements; } /** @@ -194,7 +191,7 @@ public int numObjectElements() { * @return the field value whose key is equal to `key`, or null if key is not found */ public Variant getFieldByKey(String key) { - VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, pos); + VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, valuePos); // Use linear search for a short list. Switch to binary search when the length reaches // `BINARY_SEARCH_THRESHOLD`. 
if (info.numElements < BINARY_SEARCH_THRESHOLD) { @@ -203,11 +200,12 @@ public Variant getFieldByKey(String key) { i, value, metadata, + metadataPos, info.idSize, info.offsetSize, - pos + info.idStartOffset, - pos + info.offsetStartOffset, - pos + info.dataStartOffset); + valuePos + info.idStartOffset, + valuePos + info.offsetStartOffset, + valuePos + info.dataStartOffset); if (field.key.equals(key)) { return field.value; } @@ -224,11 +222,12 @@ public Variant getFieldByKey(String key) { mid, value, metadata, + metadataPos, info.idSize, info.offsetSize, - pos + info.idStartOffset, - pos + info.offsetStartOffset, - pos + info.dataStartOffset); + valuePos + info.idStartOffset, + valuePos + info.offsetStartOffset, + valuePos + info.dataStartOffset); int cmp = field.key.compareTo(key); if (cmp < 0) { low = mid + 1; @@ -262,7 +261,7 @@ public ObjectField(String key, Variant value) { * @return the ObjectField at the `index` slot, or null if `index` is out of bounds */ public ObjectField getFieldAtIndex(int index) { - VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, pos); + VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, valuePos); if (index < 0 || index >= info.numElements) { return null; } @@ -270,26 +269,29 @@ public ObjectField getFieldAtIndex(int index) { index, value, metadata, + metadataPos, info.idSize, info.offsetSize, - pos + info.idStartOffset, - pos + info.offsetStartOffset, - pos + info.dataStartOffset); + valuePos + info.idStartOffset, + valuePos + info.offsetStartOffset, + valuePos + info.dataStartOffset); } private static ObjectField getFieldAtIndex( int index, byte[] value, byte[] metadata, + int metadataPos, int idSize, int offsetSize, int idStart, int offsetStart, int dataStart) { + // idStart, offsetStart, and dataStart are absolute positions in the `value` buffer. 
int id = VariantUtil.readUnsigned(value, idStart + idSize * index, idSize); int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize * index, offsetSize); - String key = VariantUtil.getMetadataKey(metadata, id); - Variant v = new Variant(value, metadata, dataStart + offset); + String key = VariantUtil.getMetadataKey(metadata, metadataPos, id); + Variant v = new Variant(value, dataStart + offset, metadata, metadataPos); return new ObjectField(key, v); } @@ -297,7 +299,7 @@ private static ObjectField getFieldAtIndex( * @return the number of array elements. `getType()` must be `Type.ARRAY`. */ public int numArrayElements() { - return VariantUtil.getArrayInfo(value, pos).numElements; + return VariantUtil.getArrayInfo(value, valuePos).numElements; } /** @@ -307,18 +309,25 @@ public int numArrayElements() { * @return the array element Variant at the `index` slot, or null if `index` is out of bounds */ public Variant getElementAtIndex(int index) { - VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value, pos); + VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value, valuePos); if (index < 0 || index >= info.numElements) { return null; } return getElementAtIndex( - index, value, metadata, info.offsetSize, pos + info.offsetStartOffset, pos + info.dataStartOffset); + index, + value, + metadata, + metadataPos, + info.offsetSize, + valuePos + info.offsetStartOffset, + valuePos + info.dataStartOffset); } private static Variant getElementAtIndex( - int index, byte[] value, byte[] metadata, int offsetSize, int offsetStart, int dataStart) { + int index, byte[] value, byte[] metadata, int metadataPos, int offsetSize, int offsetStart, int dataStart) { + // offsetStart and dataStart are absolute positions in the `value` buffer. 
int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize * index, offsetSize); - return new Variant(value, metadata, dataStart + offset); + return new Variant(value, dataStart + offset, metadata, metadataPos); } /** @@ -347,7 +356,7 @@ public String toJson(ZoneId zoneId) { public String toJson(ZoneId zoneId, boolean truncateTrailingZeros) { try (CharArrayWriter writer = new CharArrayWriter(); JsonGenerator gen = new JsonFactory().createGenerator(writer)) { - toJsonImpl(value, metadata, pos, gen, zoneId, truncateTrailingZeros); + toJsonImpl(value, valuePos, metadata, metadataPos, gen, zoneId, truncateTrailingZeros); gen.flush(); return writer.toString(); } catch (IOException e) { @@ -444,27 +453,35 @@ private static Instant nanosToInstant(long timestampNanos) { } private static void toJsonImpl( - byte[] value, byte[] metadata, int pos, JsonGenerator gen, ZoneId zoneId, boolean truncateTrailingZeros) + byte[] value, + int valuePos, + byte[] metadata, + int metadataPos, + JsonGenerator gen, + ZoneId zoneId, + boolean truncateTrailingZeros) throws IOException { - switch (VariantUtil.getType(value, pos)) { + switch (VariantUtil.getType(value, valuePos)) { case OBJECT: { - VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, pos); + VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, valuePos); gen.writeStartObject(); for (int i = 0; i < info.numElements; ++i) { ObjectField field = getFieldAtIndex( i, value, metadata, + metadataPos, info.idSize, info.offsetSize, - pos + info.idStartOffset, - pos + info.offsetStartOffset, - pos + info.dataStartOffset); + valuePos + info.idStartOffset, + valuePos + info.offsetStartOffset, + valuePos + info.dataStartOffset); gen.writeFieldName(field.key); toJsonImpl( field.value.value, + field.value.valuePos, field.value.metadata, - field.value.pos, + metadataPos, gen, zoneId, truncateTrailingZeros); @@ -473,17 +490,18 @@ private static void toJsonImpl( break; } case ARRAY: { - VariantUtil.ArrayInfo info = 
VariantUtil.getArrayInfo(value, pos); + VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value, valuePos); gen.writeStartArray(); for (int i = 0; i < info.numElements; ++i) { Variant v = getElementAtIndex( i, value, metadata, + metadataPos, info.offsetSize, - pos + info.offsetStartOffset, - pos + info.dataStartOffset); - toJsonImpl(v.value, v.metadata, v.pos, gen, zoneId, truncateTrailingZeros); + valuePos + info.offsetStartOffset, + valuePos + info.dataStartOffset); + toJsonImpl(v.value, v.valuePos, v.metadata, metadataPos, gen, zoneId, truncateTrailingZeros); } gen.writeEndArray(); break; @@ -492,91 +510,93 @@ private static void toJsonImpl( gen.writeNull(); break; case BOOLEAN: - gen.writeBoolean(VariantUtil.getBoolean(value, pos)); + gen.writeBoolean(VariantUtil.getBoolean(value, valuePos)); break; case BYTE: case SHORT: case INT: case LONG: - gen.writeNumber(VariantUtil.getLong(value, pos)); + gen.writeNumber(VariantUtil.getLong(value, valuePos)); break; case STRING: - gen.writeString(VariantUtil.getString(value, pos)); + gen.writeString(VariantUtil.getString(value, valuePos)); break; case DOUBLE: - gen.writeNumber(VariantUtil.getDouble(value, pos)); + gen.writeNumber(VariantUtil.getDouble(value, valuePos)); break; case DECIMAL4: case DECIMAL8: case DECIMAL16: if (truncateTrailingZeros) { - gen.writeNumber(VariantUtil.getDecimal(value, pos) + gen.writeNumber(VariantUtil.getDecimal(value, valuePos) .stripTrailingZeros() .toPlainString()); } else { - gen.writeNumber(VariantUtil.getDecimal(value, pos).toPlainString()); + gen.writeNumber(VariantUtil.getDecimal(value, valuePos).toPlainString()); } break; case DATE: - gen.writeString(LocalDate.ofEpochDay((int) VariantUtil.getLong(value, pos)) + gen.writeString(LocalDate.ofEpochDay((int) VariantUtil.getLong(value, valuePos)) .toString()); break; case TIMESTAMP: if (truncateTrailingZeros) { - gen.writeString(TIMESTAMP_TRUNC_FORMATTER.format( - microsToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId))); + 
gen.writeString( + TIMESTAMP_TRUNC_FORMATTER.format(microsToInstant(VariantUtil.getLong(value, valuePos)) + .atZone(zoneId))); } else { - gen.writeString(TIMESTAMP_FORMATTER.format( - microsToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId))); + gen.writeString(TIMESTAMP_FORMATTER.format(microsToInstant(VariantUtil.getLong(value, valuePos)) + .atZone(zoneId))); } break; case TIMESTAMP_NTZ: if (truncateTrailingZeros) { - gen.writeString(TIMESTAMP_NTZ_TRUNC_FORMATTER.format( - microsToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC))); + gen.writeString( + TIMESTAMP_NTZ_TRUNC_FORMATTER.format(microsToInstant(VariantUtil.getLong(value, valuePos)) + .atZone(ZoneOffset.UTC))); } else { - gen.writeString(TIMESTAMP_NTZ_FORMATTER.format( - microsToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC))); + gen.writeString(TIMESTAMP_NTZ_FORMATTER.format(microsToInstant(VariantUtil.getLong(value, valuePos)) + .atZone(ZoneOffset.UTC))); } break; case FLOAT: - gen.writeNumber(VariantUtil.getFloat(value, pos)); + gen.writeNumber(VariantUtil.getFloat(value, valuePos)); break; case BINARY: - gen.writeString(Base64.getEncoder().encodeToString(VariantUtil.getBinary(value, pos))); + gen.writeString(Base64.getEncoder().encodeToString(VariantUtil.getBinary(value, valuePos))); break; case TIME: if (truncateTrailingZeros) { gen.writeString(TIME_TRUNC_FORMATTER.format( - LocalTime.ofNanoOfDay(VariantUtil.getLong(value, pos) * 1_000))); + LocalTime.ofNanoOfDay(VariantUtil.getLong(value, valuePos) * 1_000))); } else { gen.writeString( - TIME_FORMATTER.format(LocalTime.ofNanoOfDay(VariantUtil.getLong(value, pos) * 1_000))); + TIME_FORMATTER.format(LocalTime.ofNanoOfDay(VariantUtil.getLong(value, valuePos) * 1_000))); } break; case TIMESTAMP_NANOS: if (truncateTrailingZeros) { gen.writeString(TIMESTAMP_NANOS_TRUNC_FORMATTER.format( - nanosToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId))); + nanosToInstant(VariantUtil.getLong(value, 
valuePos)).atZone(zoneId))); } else { gen.writeString(TIMESTAMP_NANOS_FORMATTER.format( - nanosToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId))); + nanosToInstant(VariantUtil.getLong(value, valuePos)).atZone(zoneId))); } break; case TIMESTAMP_NANOS_NTZ: if (truncateTrailingZeros) { gen.writeString(TIMESTAMP_NANOS_NTZ_TRUNC_FORMATTER.format( - nanosToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC))); + nanosToInstant(VariantUtil.getLong(value, valuePos)).atZone(ZoneOffset.UTC))); } else { gen.writeString(TIMESTAMP_NANOS_NTZ_FORMATTER.format( - nanosToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC))); + nanosToInstant(VariantUtil.getLong(value, valuePos)).atZone(ZoneOffset.UTC))); } break; case UUID: - gen.writeString(VariantUtil.getUUID(value, pos).toString()); + gen.writeString(VariantUtil.getUUID(value, valuePos).toString()); break; default: - throw new IllegalArgumentException("Unsupported type: " + VariantUtil.getType(value, pos)); + throw new IllegalArgumentException("Unsupported type: " + VariantUtil.getType(value, valuePos)); } } } diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java index 5f814c3722..41f9e017f1 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java @@ -431,46 +431,47 @@ public void finishWritingArray(int start, ArrayList offsets) { * @param v the Variant value to append */ public void appendVariant(Variant v) { - appendVariantImpl(v.value, v.metadata, v.pos); + appendVariantImpl(v.value, v.valuePos, v.metadata, v.metadataPos); } - private void appendVariantImpl(byte[] value, byte[] metadata, int pos) { - VariantUtil.checkIndex(pos, value.length); - int basicType = value[pos] & VariantUtil.BASIC_TYPE_MASK; + private void appendVariantImpl(byte[] value, int valuePos, 
byte[] metadata, int metadataPos) { + VariantUtil.checkIndex(valuePos, value.length); + int basicType = value[valuePos] & VariantUtil.BASIC_TYPE_MASK; switch (basicType) { case VariantUtil.OBJECT: { - VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, pos); + VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, valuePos); ArrayList fields = new ArrayList<>(info.numElements); int start = writePos; for (int i = 0; i < info.numElements; ++i) { - int id = VariantUtil.readUnsigned(value, pos + info.idStartOffset + info.idSize * i, info.idSize); + int id = VariantUtil.readUnsigned( + value, valuePos + info.idStartOffset + info.idSize * i, info.idSize); int offset = VariantUtil.readUnsigned( - value, pos + info.offsetStartOffset + info.offsetSize * i, info.offsetSize); - int elementPos = pos + info.dataStartOffset + offset; - String key = VariantUtil.getMetadataKey(metadata, id); + value, valuePos + info.offsetStartOffset + info.offsetSize * i, info.offsetSize); + int elementPos = valuePos + info.dataStartOffset + offset; + String key = VariantUtil.getMetadataKey(metadata, metadataPos, id); int newId = addKey(key); fields.add(new FieldEntry(key, newId, writePos - start)); - appendVariantImpl(value, metadata, elementPos); + appendVariantImpl(value, elementPos, metadata, metadataPos); } finishWritingObject(start, fields); break; } case VariantUtil.ARRAY: { - VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value, pos); + VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value, valuePos); ArrayList offsets = new ArrayList<>(info.numElements); int start = writePos; for (int i = 0; i < info.numElements; ++i) { int offset = VariantUtil.readUnsigned( - value, pos + info.offsetStartOffset + info.offsetSize * i, info.offsetSize); - int elementPos = pos + info.dataStartOffset + offset; + value, valuePos + info.offsetStartOffset + info.offsetSize * i, info.offsetSize); + int elementPos = valuePos + info.dataStartOffset + offset; offsets.add(writePos - 
start); - appendVariantImpl(value, metadata, elementPos); + appendVariantImpl(value, elementPos, metadata, metadataPos); } finishWritingArray(start, offsets); break; } default: - shallowAppendVariantImpl(value, pos); + shallowAppendVariantImpl(value, valuePos); break; } } diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java index 5c2e693e98..1c94074bf3 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java @@ -720,33 +720,35 @@ public static ArrayInfo getArrayInfo(byte[] value, int pos) { /** * Returns a key at `id` in the Variant metadata. + * * @param metadata The Variant metadata + * @param metadataPos the position of the metadata in the byte array * @param id The key id * @return The key * @throws MalformedVariantException if the Variant is malformed - * @throws IllegalArgumentException the id is out of bounds + * @throws IllegalArgumentException the id is out of bounds */ - public static String getMetadataKey(byte[] metadata, int id) { - checkIndex(0, metadata.length); + public static String getMetadataKey(byte[] metadata, int metadataPos, int id) { + checkIndex(metadataPos, metadata.length); // Extracts the highest 2 bits in the metadata header to determine the integer size of the // offset list. - int offsetSize = ((metadata[0] >> 6) & 0x3) + 1; - int dictSize = readUnsigned(metadata, 1, offsetSize); + int offsetSize = ((metadata[metadataPos] >> 6) & 0x3) + 1; + int dictSize = readUnsigned(metadata, metadataPos + 1, offsetSize); if (id >= dictSize) { throw new IllegalArgumentException( String.format("Invalid dictionary id: %d. dictionary size: %d", id, dictSize)); } // The offset list after the header byte, and a `dictSize` with `offsetSize` bytes. 
- int offsetListOffset = 1 + offsetSize; + int offsetListPos = metadataPos + 1 + offsetSize; // The data starts after the offset list, and `(dictSize + 1)` offset values. - int dataOffset = offsetListOffset + (dictSize + 1) * offsetSize; - int offset = readUnsigned(metadata, 1 + (id + 1) * offsetSize, offsetSize); - int nextOffset = readUnsigned(metadata, 1 + (id + 2) * offsetSize, offsetSize); + int dataPos = offsetListPos + (dictSize + 1) * offsetSize; + int offset = readUnsigned(metadata, offsetListPos + (id) * offsetSize, offsetSize); + int nextOffset = readUnsigned(metadata, offsetListPos + (id + 1) * offsetSize, offsetSize); if (offset > nextOffset) { throw new MalformedVariantException( String.format("Invalid offset: %d. next offset: %d", offset, nextOffset)); } - checkIndex(dataOffset + nextOffset - 1, metadata.length); - return new String(metadata, dataOffset + offset, nextOffset - offset); + checkIndex(dataPos + nextOffset - 1, metadata.length); + return new String(metadata, dataPos + offset, nextOffset - offset); } } diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java index 9326d866c7..7dfef93185 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java @@ -35,6 +35,7 @@ import java.util.List; import java.util.UUID; import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; import java.util.stream.IntStream; import org.junit.Assert; import org.junit.Test; @@ -85,7 +86,7 @@ private void checkJson(String jsonValue) { } private void checkType(Variant v, int expectedBasicType, VariantUtil.Type expectedType) { - Assert.assertEquals(expectedBasicType, v.value[v.pos] & VariantUtil.BASIC_TYPE_MASK); + Assert.assertEquals(expectedBasicType, v.value[v.valuePos] & VariantUtil.BASIC_TYPE_MASK); 
Assert.assertEquals(expectedType, v.getType()); } @@ -105,6 +106,19 @@ private String randomString(int len) { return sb.toString(); } + private void testVariant(Variant v, Consumer consumer) { + consumer.accept(v); + // Create new Variant with different byte offsets + byte[] newValue = new byte[v.value.length + 50]; + byte[] newMetadata = new byte[v.metadata.length + 50]; + Arrays.fill(newValue, (byte) 0xFF); + Arrays.fill(newMetadata, (byte) 0xFF); + System.arraycopy(v.value, 0, newValue, 25, v.value.length); + System.arraycopy(v.metadata, 0, newMetadata, 25, v.metadata.length); + Variant v2 = new Variant(newValue, 25 + v.valuePos, newMetadata, 25 + v.metadataPos); + consumer.accept(v2); + } + @Test public void testNullJson() { checkJson("null"); @@ -157,15 +171,18 @@ public void testDecimalJson() { public void testNullBuilder() { VariantBuilder vb = new VariantBuilder(false); vb.appendNull(); - checkType(vb.result(), VariantUtil.NULL, VariantUtil.Type.NULL); + testVariant(vb.result(), v -> checkType(v, VariantUtil.NULL, VariantUtil.Type.NULL)); } @Test public void testBooleanBuilder() { Arrays.asList(true, false).forEach(b -> { - VariantBuilder vb2 = new VariantBuilder(false); - vb2.appendBoolean(b); - checkType(vb2.result(), VariantUtil.PRIMITIVE, VariantUtil.Type.BOOLEAN); + VariantBuilder vb = new VariantBuilder(false); + vb.appendBoolean(b); + testVariant(vb.result(), v -> { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.BOOLEAN); + Assert.assertEquals(b, v.getBoolean()); + }); }); } @@ -184,17 +201,18 @@ public void testIntegerBuilder() { .forEach(l -> { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendLong(l); - Variant v = vb2.result(); - if (Byte.MIN_VALUE <= l && l <= Byte.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.BYTE); - } else if (Short.MIN_VALUE <= l && l <= Short.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.SHORT); - } else if (Integer.MIN_VALUE <= l && l <= Integer.MAX_VALUE) { - 
checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.INT); - } else { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.LONG); - } - Assert.assertEquals((long) l, v.getLong()); + testVariant(vb2.result(), v -> { + if (Byte.MIN_VALUE <= l && l <= Byte.MAX_VALUE) { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.BYTE); + } else if (Short.MIN_VALUE <= l && l <= Short.MAX_VALUE) { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.SHORT); + } else if (Integer.MIN_VALUE <= l && l <= Integer.MAX_VALUE) { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.INT); + } else { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.LONG); + } + Assert.assertEquals((long) l, v.getLong()); + }); }); Arrays.asList( @@ -208,58 +226,63 @@ public void testIntegerBuilder() { .forEach(i -> { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendLong((long) i); - Variant v = vb2.result(); - if (Byte.MIN_VALUE <= i && i <= Byte.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.BYTE); - } else if (Short.MIN_VALUE <= i && i <= Short.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.SHORT); - } else { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.INT); - } - Assert.assertEquals((int) i, v.getInt()); + testVariant(vb2.result(), v -> { + if (Byte.MIN_VALUE <= i && i <= Byte.MAX_VALUE) { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.BYTE); + } else if (Short.MIN_VALUE <= i && i <= Short.MAX_VALUE) { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.SHORT); + } else { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.INT); + } + Assert.assertEquals((int) i, v.getInt()); + }); }); Arrays.asList((short) 0, (short) Byte.MIN_VALUE, (short) Byte.MAX_VALUE, Short.MIN_VALUE, Short.MAX_VALUE) .forEach(s -> { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendLong(s); - Variant v = vb2.result(); - if (Byte.MIN_VALUE <= s && s <= Byte.MAX_VALUE) { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.BYTE); - 
} else { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.SHORT); - } - Assert.assertEquals((short) s, v.getShort()); + testVariant(vb2.result(), v -> { + if (Byte.MIN_VALUE <= s && s <= Byte.MAX_VALUE) { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.BYTE); + } else { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.SHORT); + } + Assert.assertEquals((short) s, v.getShort()); + }); }); Arrays.asList((byte) 0, Byte.MIN_VALUE, Byte.MAX_VALUE).forEach(b -> { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendLong(b); - Variant v = vb2.result(); - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.BYTE); - Assert.assertEquals((byte) b, v.getByte()); + testVariant(vb2.result(), v -> { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.BYTE); + Assert.assertEquals((byte) b, v.getByte()); + }); }); } @Test public void testFloatBuilder() { - Arrays.asList(Float.MIN_VALUE, Float.MAX_VALUE).forEach(f -> { + Arrays.asList(Float.MIN_VALUE, 0f, Float.MAX_VALUE).forEach(f -> { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendFloat(f); - Variant v = vb2.result(); - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.FLOAT); - Assert.assertEquals(f, v.getFloat(), 0.000001); + testVariant(vb2.result(), v -> { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.FLOAT); + Assert.assertEquals(f, v.getFloat(), 0.000001); + }); }); } @Test public void testDoubleBuilder() { - Arrays.asList(Double.MIN_VALUE, Double.MAX_VALUE).forEach(d -> { + Arrays.asList(Double.MIN_VALUE, 0d, Double.MAX_VALUE).forEach(d -> { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendDouble(d); - Variant v = vb2.result(); - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.DOUBLE); - Assert.assertEquals(d, v.getDouble(), 0.000001); + testVariant(vb2.result(), v -> { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.DOUBLE); + Assert.assertEquals(d, v.getDouble(), 0.000001); + }); }); } @@ -270,13 +293,14 @@ public void testStringBuilder() { 
VariantBuilder vb2 = new VariantBuilder(false); String s = randomString(len); vb2.appendString(s); - Variant v = vb2.result(); - if (len <= VariantUtil.MAX_SHORT_STR_SIZE) { - checkType(v, VariantUtil.SHORT_STR, VariantUtil.Type.STRING); - } else { - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.STRING); - } - Assert.assertEquals(s, v.getString()); + testVariant(vb2.result(), v -> { + if (len <= VariantUtil.MAX_SHORT_STR_SIZE) { + checkType(v, VariantUtil.SHORT_STR, VariantUtil.Type.STRING); + } else { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.STRING); + } + Assert.assertEquals(s, v.getString()); + }); }); } @@ -286,9 +310,10 @@ public void testDecimalBuilder() { Arrays.asList(new BigDecimal("123.456"), new BigDecimal("-987.654")).forEach(d -> { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendDecimal(d); - Variant v = vb2.result(); - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.DECIMAL4); - Assert.assertEquals(d, v.getDecimal()); + testVariant(vb2.result(), v -> { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.DECIMAL4); + Assert.assertEquals(d, v.getDecimal()); + }); }); // decimal8 @@ -296,9 +321,10 @@ public void testDecimalBuilder() { .forEach(d -> { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendDecimal(d); - Variant v = vb2.result(); - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.DECIMAL8); - Assert.assertEquals(d, v.getDecimal()); + testVariant(vb2.result(), v -> { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.DECIMAL8); + Assert.assertEquals(d, v.getDecimal()); + }); }); // decimal16 @@ -306,19 +332,36 @@ public void testDecimalBuilder() { .forEach(d -> { VariantBuilder vb2 = new VariantBuilder(false); vb2.appendDecimal(d); - Variant v = vb2.result(); - checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.DECIMAL16); - Assert.assertEquals(d, v.getDecimal()); + testVariant(vb2.result(), v -> { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.Type.DECIMAL16); + Assert.assertEquals(d, 
v.getDecimal()); + }); }); } + @Test + public void testVariantBuilder() throws IOException { + Variant subV = + VariantBuilder.parseJson("{\"a\": 1.1, \"b\": {\"d\": [[1], \"foo\"]}, \"c\": [true, {\"a\": 2}]}"); + testVariant(subV, v -> { + VariantBuilder vb = new VariantBuilder(false); + vb.appendVariant(v); + testVariant(vb.result(), v2 -> { + checkType(v2, VariantUtil.OBJECT, VariantUtil.Type.OBJECT); + checkJson(v.toJson(), v2.toJson()); + }); + }); + } + @Test public void testDate() { VariantBuilder vb = new VariantBuilder(false); int days = Math.toIntExact(LocalDate.of(2024, 12, 16).toEpochDay()); vb.appendDate(days); - Assert.assertEquals("\"2024-12-16\"", vb.result().toJson()); - Assert.assertEquals(days, vb.result().getInt()); + testVariant(vb.result(), v -> { + Assert.assertEquals("\"2024-12-16\"", v.toJson()); + Assert.assertEquals(days, v.getInt()); + }); } @Test @@ -327,10 +370,12 @@ public void testTimestamp() { VariantBuilder vb = new VariantBuilder(false); long micros = microsSinceEpoch(Instant.from(dtf.parse("2024-12-16T10:23:45.321456-08:00"))); vb.appendTimestamp(micros); - Assert.assertEquals("\"2024-12-16T18:23:45.321456+00:00\"", vb.result().toJson()); - Assert.assertEquals("\"2024-12-16T10:23:45.321456-08:00\"", vb.result().toJson(ZoneId.of("-08:00"))); - Assert.assertEquals("\"2024-12-16T19:23:45.321456+01:00\"", vb.result().toJson(ZoneId.of("+01:00"))); - Assert.assertEquals(micros, vb.result().getLong()); + testVariant(vb.result(), v -> { + Assert.assertEquals("\"2024-12-16T18:23:45.321456+00:00\"", v.toJson()); + Assert.assertEquals("\"2024-12-16T10:23:45.321456-08:00\"", v.toJson(ZoneId.of("-08:00"))); + Assert.assertEquals("\"2024-12-16T19:23:45.321456+01:00\"", v.toJson(ZoneId.of("+01:00"))); + Assert.assertEquals(micros, v.getLong()); + }); } @Test @@ -339,10 +384,12 @@ public void testTimestampNtz() { VariantBuilder vb = new VariantBuilder(false); long micros = 
microsSinceEpoch(Instant.from(dtf.parse("2024-01-01T23:00:00.000001Z"))); vb.appendTimestampNtz(micros); - Assert.assertEquals("\"2024-01-01T23:00:00.000001\"", vb.result().toJson()); - Assert.assertEquals("\"2024-01-01T23:00:00.000001\"", vb.result().toJson(ZoneId.of("-08:00"))); - Assert.assertEquals(vb.result().toJson(ZoneId.of("-08:00")), vb.result().toJson(ZoneId.of("+02:00"))); - Assert.assertEquals(micros, vb.result().getLong()); + testVariant(vb.result(), v -> { + Assert.assertEquals("\"2024-01-01T23:00:00.000001\"", v.toJson()); + Assert.assertEquals("\"2024-01-01T23:00:00.000001\"", v.toJson(ZoneId.of("-08:00"))); + Assert.assertEquals(v.toJson(ZoneId.of("-08:00")), v.toJson(ZoneId.of("+02:00"))); + Assert.assertEquals(micros, v.getLong()); + }); } @Test @@ -352,8 +399,10 @@ public void testTime() { VariantBuilder vb = new VariantBuilder(false); long micros = LocalTime.parse(timeStr).toNanoOfDay() / 1_000; vb.appendTime(micros); - Assert.assertEquals(String.format("\"%s\"", timeStr), vb.result().toJson()); - Assert.assertEquals(micros, vb.result().getLong()); + testVariant(vb.result(), v -> { + Assert.assertEquals(String.format("\"%s\"", timeStr), v.toJson()); + Assert.assertEquals(micros, v.getLong()); + }); } } @@ -363,13 +412,12 @@ public void testTimestampNanos() { VariantBuilder vb = new VariantBuilder(false); long nanos = nanosSinceEpoch(Instant.from(dtf.parse("2024-12-16T10:23:45.321456987-08:00"))); vb.appendTimestampNanos(nanos); - Assert.assertEquals( - "\"2024-12-16T18:23:45.321456987+00:00\"", vb.result().toJson()); - Assert.assertEquals( - "\"2024-12-16T10:23:45.321456987-08:00\"", vb.result().toJson(ZoneId.of("-08:00"))); - Assert.assertEquals( - "\"2024-12-16T19:23:45.321456987+01:00\"", vb.result().toJson(ZoneId.of("+01:00"))); - Assert.assertEquals(nanos, vb.result().getLong()); + testVariant(vb.result(), v -> { + Assert.assertEquals("\"2024-12-16T18:23:45.321456987+00:00\"", v.toJson()); + 
Assert.assertEquals("\"2024-12-16T10:23:45.321456987-08:00\"", v.toJson(ZoneId.of("-08:00"))); + Assert.assertEquals("\"2024-12-16T19:23:45.321456987+01:00\"", v.toJson(ZoneId.of("+01:00"))); + Assert.assertEquals(nanos, v.getLong()); + }); } @Test @@ -378,10 +426,12 @@ public void testTimestampNanosNtz() { VariantBuilder vb = new VariantBuilder(false); long nanos = nanosSinceEpoch(Instant.from(dtf.parse("2024-01-01T23:00:00.839280983Z"))); vb.appendTimestampNanosNtz(nanos); - Assert.assertEquals("\"2024-01-01T23:00:00.839280983\"", vb.result().toJson()); - Assert.assertEquals("\"2024-01-01T23:00:00.839280983\"", vb.result().toJson(ZoneId.of("-08:00"))); - Assert.assertEquals(vb.result().toJson(ZoneId.of("-08:00")), vb.result().toJson(ZoneId.of("+02:00"))); - Assert.assertEquals(nanos, vb.result().getLong()); + testVariant(vb.result(), v -> { + Assert.assertEquals("\"2024-01-01T23:00:00.839280983\"", v.toJson()); + Assert.assertEquals("\"2024-01-01T23:00:00.839280983\"", v.toJson(ZoneId.of("-08:00"))); + Assert.assertEquals(v.toJson(ZoneId.of("-08:00")), v.toJson(ZoneId.of("+02:00"))); + Assert.assertEquals(nanos, v.getLong()); + }); } @Test @@ -389,10 +439,10 @@ public void testBinary() { VariantBuilder vb = new VariantBuilder(false); byte[] binary = new byte[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; vb.appendBinary(binary); - Assert.assertEquals( - "\"" + Base64.getEncoder().encodeToString(binary) + "\"", - vb.result().toJson()); - Assert.assertArrayEquals(binary, vb.result().getBinary()); + testVariant(vb.result(), v -> { + Assert.assertEquals("\"" + Base64.getEncoder().encodeToString(binary) + "\"", v.toJson()); + Assert.assertArrayEquals(binary, v.getBinary()); + }); } @Test @@ -404,9 +454,10 @@ public void testUUID() { UUID expected = new UUID(msb, lsb); vb.appendUUID(expected); - Assert.assertEquals( - "\"00112233-4455-6677-8899-aabbccddeeff\"", vb.result().toJson()); - Assert.assertEquals(expected, vb.result().getUUID()); + testVariant(vb.result(), v -> { + 
Assert.assertEquals("\"00112233-4455-6677-8899-aabbccddeeff\"", v.toJson()); + Assert.assertEquals(expected, v.getUUID()); + }); } @Test @@ -459,16 +510,17 @@ public void testGetObjectFields() throws IOException { sb.append("\"field" + i + "\": ").append(i); } sb.append("}"); - Variant v = VariantBuilder.parseJson(sb.toString()); - Assert.assertEquals(Variant.BINARY_SEARCH_THRESHOLD / 2, v.numObjectElements()); - for (int i = 0; i < Variant.BINARY_SEARCH_THRESHOLD / 2; i++) { - String actual = v.getFieldByKey("field" + i).toJson(); - Assert.assertEquals(String.valueOf(i), actual); - // check by index - Variant.ObjectField field = v.getFieldAtIndex(i); - Assert.assertTrue(field.key.startsWith("field")); - Assert.assertEquals(field.key.substring("field".length()), field.value.toJson()); - } + testVariant(VariantBuilder.parseJson(sb.toString()), v -> { + Assert.assertEquals(Variant.BINARY_SEARCH_THRESHOLD / 2, v.numObjectElements()); + for (int i = 0; i < Variant.BINARY_SEARCH_THRESHOLD / 2; i++) { + String actual = v.getFieldByKey("field" + i).toJson(); + Assert.assertEquals(String.valueOf(i), actual); + // check by index + Variant.ObjectField field = v.getFieldAtIndex(i); + Assert.assertTrue(field.key.startsWith("field")); + Assert.assertEquals(field.key.substring("field".length()), field.value.toJson()); + } + }); // Create larger object for binary search sb = new StringBuilder(); @@ -478,16 +530,17 @@ public void testGetObjectFields() throws IOException { sb.append("\"field" + i + "\": ").append(i); } sb.append("}"); - v = VariantBuilder.parseJson(sb.toString()); - Assert.assertEquals(2 * Variant.BINARY_SEARCH_THRESHOLD, v.numObjectElements()); - for (int i = 0; i < 2 * Variant.BINARY_SEARCH_THRESHOLD; i++) { - String actual = v.getFieldByKey("field" + i).toJson(); - Assert.assertEquals(String.valueOf(i), actual); - // check by index - Variant.ObjectField field = v.getFieldAtIndex(i); - Assert.assertTrue(field.key.startsWith("field")); - 
Assert.assertEquals(field.key.substring("field".length()), field.value.toJson()); - } + testVariant(VariantBuilder.parseJson(sb.toString()), v -> { + Assert.assertEquals(2 * Variant.BINARY_SEARCH_THRESHOLD, v.numObjectElements()); + for (int i = 0; i < 2 * Variant.BINARY_SEARCH_THRESHOLD; i++) { + String actual = v.getFieldByKey("field" + i).toJson(); + Assert.assertEquals(String.valueOf(i), actual); + // check by index + Variant.ObjectField field = v.getFieldAtIndex(i); + Assert.assertTrue(field.key.startsWith("field")); + Assert.assertEquals(field.key.substring("field".length()), field.value.toJson()); + } + }); } @Test @@ -502,12 +555,13 @@ public void testArray() throws IOException { sb.append("]"); checkJson(sb.toString()); // Check array elements - Variant v = VariantBuilder.parseJson(sb.toString()); - Assert.assertEquals(SAMPLE_JSON_VALUES.size(), v.numArrayElements()); - for (int i = 0; i < SAMPLE_JSON_VALUES.size(); i++) { - String actual = v.getElementAtIndex(i).toJson(); - checkJson(SAMPLE_JSON_VALUES.get(i), actual); - } + testVariant(VariantBuilder.parseJson(sb.toString()), v -> { + Assert.assertEquals(SAMPLE_JSON_VALUES.size(), v.numArrayElements()); + for (int i = 0; i < SAMPLE_JSON_VALUES.size(); i++) { + String actual = v.getElementAtIndex(i).toJson(); + checkJson(SAMPLE_JSON_VALUES.get(i), actual); + } + }); // large array sb = new StringBuilder(); @@ -519,12 +573,13 @@ public void testArray() throws IOException { sb.append("]"); checkJson(sb.toString()); // Check array elements - v = VariantBuilder.parseJson(sb.toString()); - Assert.assertEquals(50000, v.numArrayElements()); - for (int i = 0; i < 50000; i++) { - String actual = v.getElementAtIndex(i).toJson(); - checkJson(SAMPLE_JSON_VALUES.get(i % SAMPLE_JSON_VALUES.size()), actual); - } + testVariant(VariantBuilder.parseJson(sb.toString()), v -> { + Assert.assertEquals(50000, v.numArrayElements()); + for (int i = 0; i < 50000; i++) { + String actual = v.getElementAtIndex(i).toJson(); + 
checkJson(SAMPLE_JSON_VALUES.get(i % SAMPLE_JSON_VALUES.size()), actual); + } + }); } @Test @@ -577,9 +632,10 @@ public void testTruncateTrailingZeroDecimal() { VariantBuilder vb = new VariantBuilder(false); BigDecimal d = new BigDecimal(strings[0]); vb.appendDecimal(d); - Variant v = vb.result(); - Assert.assertEquals(strings[0], v.toJson()); - Assert.assertEquals(strings[1], v.toJson(ZoneId.of("UTC"), true)); + testVariant(vb.result(), v -> { + Assert.assertEquals(strings[0], v.toJson()); + Assert.assertEquals(strings[1], v.toJson(ZoneId.of("UTC"), true)); + }); } } @@ -596,9 +652,10 @@ public void testTruncateTrailingZeroTimestamp() { VariantBuilder vb = new VariantBuilder(false); long micros = microsSinceEpoch(Instant.from(dtf.parse(strings[0]))); vb.appendTimestamp(micros); - Variant v = vb.result(); - Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); - Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); + testVariant(vb.result(), v -> { + Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); + Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); + }); } } @@ -616,10 +673,11 @@ public void testTruncateTrailingZeroTimestampNtz() { long micros = microsSinceEpoch(Instant.from(dtf.parse(String.format("%sZ", strings[0])))); vb.appendTimestampNtz(micros); - Variant v = vb.result(); - Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); - Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); - Assert.assertEquals(micros, vb.result().getLong()); + testVariant(vb.result(), v -> { + Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); + Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); + Assert.assertEquals(micros, v.getLong()); + }); } } @@ -636,10 +694,11 
@@ public void testTruncateTrailingZeroTime() { long micros = LocalTime.parse(strings[0]).toNanoOfDay() / 1_000; vb.appendTime(micros); - Variant v = vb.result(); - Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); - Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); - Assert.assertEquals(micros, vb.result().getLong()); + testVariant(vb.result(), v -> { + Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); + Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); + Assert.assertEquals(micros, v.getLong()); + }); } } @@ -656,9 +715,10 @@ public void testTruncateTrailingZeroTimestampNanos() { VariantBuilder vb = new VariantBuilder(false); long nanos = nanosSinceEpoch(Instant.from(dtf.parse(strings[0]))); vb.appendTimestampNanos(nanos); - Variant v = vb.result(); - Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); - Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); + testVariant(vb.result(), v -> { + Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); + Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); + }); } } @@ -676,10 +736,11 @@ public void testTruncateTrailingZeroTimestampNanosNtz() { long nanos = nanosSinceEpoch(Instant.from(dtf.parse(String.format("%sZ", strings[0])))); vb.appendTimestampNanosNtz(nanos); - Variant v = vb.result(); - Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); - Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); - Assert.assertEquals(nanos, vb.result().getLong()); + testVariant(vb.result(), v -> { + Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); + Assert.assertEquals(String.format("\"%s\"", 
strings[1]), v.toJson(ZoneId.of("-08:00"), true)); + Assert.assertEquals(nanos, vb.result().getLong()); + }); } } } From ba905c8fa45179e0018e1a5c9dbf77a555b97aca Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Wed, 26 Mar 2025 14:58:29 -0700 Subject: [PATCH 19/20] Use ScalarToJson interface --- .../parquet/variant/DefaultScalarToJson.java | 157 +++++++++++ .../org/apache/parquet/variant/Variant.java | 246 +++++------------- .../parquet/variant/TestVariantEncoding.java | 148 ----------- 3 files changed, 227 insertions(+), 324 deletions(-) create mode 100644 parquet-variant/src/main/java/org/apache/parquet/variant/DefaultScalarToJson.java diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/DefaultScalarToJson.java b/parquet-variant/src/main/java/org/apache/parquet/variant/DefaultScalarToJson.java new file mode 100644 index 0000000000..d9d5813472 --- /dev/null +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/DefaultScalarToJson.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.variant; + +import com.fasterxml.jackson.core.JsonGenerator; +import java.io.IOException; +import java.math.BigDecimal; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalTime; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.temporal.ChronoField; +import java.time.temporal.ChronoUnit; +import java.util.Base64; +import java.util.Locale; +import java.util.UUID; + +/** + * This converts Variant scalar values to JSON. + */ +public class DefaultScalarToJson implements Variant.ScalarToJson { + /** The format for a timestamp without time zone. */ + private static final DateTimeFormatter TIMESTAMP_NTZ_FORMATTER = new DateTimeFormatterBuilder() + .append(DateTimeFormatter.ISO_LOCAL_DATE) + .appendLiteral('T') + .appendPattern("HH:mm:ss") + .appendFraction(ChronoField.MICRO_OF_SECOND, 6, 6, true) + .toFormatter(Locale.US); + + /** The format for a timestamp without time zone, with nanosecond precision. */ + private static final DateTimeFormatter TIMESTAMP_NANOS_NTZ_FORMATTER = new DateTimeFormatterBuilder() + .append(DateTimeFormatter.ISO_LOCAL_DATE) + .appendLiteral('T') + .appendPattern("HH:mm:ss") + .appendFraction(ChronoField.NANO_OF_SECOND, 9, 9, true) + .toFormatter(Locale.US); + + /** The format for a timestamp with time zone. */ + private static final DateTimeFormatter TIMESTAMP_FORMATTER = new DateTimeFormatterBuilder() + .append(TIMESTAMP_NTZ_FORMATTER) + .appendOffset("+HH:MM", "+00:00") + .toFormatter(Locale.US); + + /** The format for a timestamp with time zone, with nanosecond precision. */ + private static final DateTimeFormatter TIMESTAMP_NANOS_FORMATTER = new DateTimeFormatterBuilder() + .append(TIMESTAMP_NANOS_NTZ_FORMATTER) + .appendOffset("+HH:MM", "+00:00") + .toFormatter(Locale.US); + + /** The format for a time. 
*/ + private static final DateTimeFormatter TIME_FORMATTER = new DateTimeFormatterBuilder() + .appendPattern("HH:mm:ss") + .appendFraction(ChronoField.MICRO_OF_SECOND, 6, 6, true) + .toFormatter(Locale.US); + + public void writeNull(JsonGenerator gen) throws IOException { + gen.writeNull(); + } + + public void writeBoolean(JsonGenerator gen, boolean value) throws IOException { + gen.writeBoolean(value); + } + + public void writeByte(JsonGenerator gen, byte value) throws IOException { + gen.writeNumber(value); + } + + public void writeShort(JsonGenerator gen, short value) throws IOException { + gen.writeNumber(value); + } + + public void writeInt(JsonGenerator gen, int value) throws IOException { + gen.writeNumber(value); + } + + public void writeLong(JsonGenerator gen, long value) throws IOException { + gen.writeNumber(value); + } + + public void writeFloat(JsonGenerator gen, float value) throws IOException { + gen.writeNumber(value); + } + + public void writeDouble(JsonGenerator gen, double value) throws IOException { + gen.writeNumber(value); + } + + public void writeString(JsonGenerator gen, String value) throws IOException { + gen.writeString(value); + } + + public void writeBinary(JsonGenerator gen, byte[] value) throws IOException { + gen.writeString(Base64.getEncoder().encodeToString(value)); + } + + public void writeDecimal(JsonGenerator gen, BigDecimal value) throws IOException { + gen.writeNumber(value.toPlainString()); + } + + public void writeUUID(JsonGenerator gen, UUID value) throws IOException { + gen.writeString(value.toString()); + } + + public void writeDate(JsonGenerator gen, int value) throws IOException { + gen.writeString(LocalDate.ofEpochDay(value).toString()); + } + + public void writeTime(JsonGenerator gen, long microsSinceMidnight) throws IOException { + gen.writeString(TIME_FORMATTER.format(LocalTime.ofNanoOfDay(microsSinceMidnight * 1_000))); + } + + public void writeTimestamp(JsonGenerator gen, long microsSinceEpoch) throws IOException 
{ + gen.writeString( + TIMESTAMP_FORMATTER.format(microsToInstant(microsSinceEpoch).atZone(ZoneOffset.UTC))); + } + + public void writeTimestampNtz(JsonGenerator gen, long microsSinceEpoch) throws IOException { + gen.writeString( + TIMESTAMP_NTZ_FORMATTER.format(microsToInstant(microsSinceEpoch).atZone(ZoneOffset.UTC))); + } + + public void writeTimestampNanos(JsonGenerator gen, long nanosSinceEpoch) throws IOException { + gen.writeString( + TIMESTAMP_NANOS_FORMATTER.format(nanosToInstant(nanosSinceEpoch).atZone(ZoneOffset.UTC))); + } + + public void writeTimestampNanosNtz(JsonGenerator gen, long nanosSinceEpoch) throws IOException { + gen.writeString(TIMESTAMP_NANOS_NTZ_FORMATTER.format( + nanosToInstant(nanosSinceEpoch).atZone(ZoneOffset.UTC))); + } + + protected Instant microsToInstant(long microsSinceEpoch) { + return Instant.EPOCH.plus(microsSinceEpoch, ChronoUnit.MICROS); + } + + protected Instant nanosToInstant(long timestampNanos) { + return Instant.EPOCH.plus(timestampNanos, ChronoUnit.NANOS); + } +} diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index 92ee094feb..54e465e0e3 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -23,13 +23,6 @@ import java.io.CharArrayWriter; import java.io.IOException; import java.math.BigDecimal; -import java.time.*; -import java.time.format.DateTimeFormatter; -import java.time.format.DateTimeFormatterBuilder; -import java.time.temporal.ChronoField; -import java.time.temporal.ChronoUnit; -import java.util.Base64; -import java.util.Locale; import java.util.UUID; import org.apache.parquet.cli.util.RuntimeIOException; @@ -54,8 +47,6 @@ public final class Variant { */ static final int BINARY_SEARCH_THRESHOLD = 32; - static final ZoneId UTC = ZoneId.of("UTC"); - public Variant(byte[] value, byte[] metadata) { 
this(value, 0, metadata, 0); } @@ -331,32 +322,63 @@ private static Variant getElementAtIndex( } /** - * @return the JSON representation of the variant - * @throws MalformedVariantException if the variant is malformed + * An interface to write Variant scalar values to a JSON generator. */ - public String toJson() { - return toJson(UTC, false); + public interface ScalarToJson { + void writeNull(JsonGenerator gen) throws IOException; + + void writeBoolean(JsonGenerator gen, boolean value) throws IOException; + + void writeByte(JsonGenerator gen, byte value) throws IOException; + + void writeShort(JsonGenerator gen, short value) throws IOException; + + void writeInt(JsonGenerator gen, int value) throws IOException; + + void writeLong(JsonGenerator gen, long value) throws IOException; + + void writeFloat(JsonGenerator gen, float value) throws IOException; + + void writeDouble(JsonGenerator gen, double value) throws IOException; + + void writeString(JsonGenerator gen, String value) throws IOException; + + void writeBinary(JsonGenerator gen, byte[] value) throws IOException; + + void writeDecimal(JsonGenerator gen, BigDecimal value) throws IOException; + + void writeUUID(JsonGenerator gen, UUID value) throws IOException; + + void writeDate(JsonGenerator gen, int value) throws IOException; + + void writeTime(JsonGenerator gen, long microsSinceMidnight) throws IOException; + + void writeTimestamp(JsonGenerator gen, long microsSinceEpoch) throws IOException; + + void writeTimestampNtz(JsonGenerator gen, long microsSinceEpoch) throws IOException; + + void writeTimestampNanos(JsonGenerator gen, long nanosSinceEpoch) throws IOException; + + void writeTimestampNanosNtz(JsonGenerator gen, long nanosSinceEpoch) throws IOException; } /** - * @param zoneId The ZoneId to use for formatting timestamps * @return the JSON representation of the variant * @throws MalformedVariantException if the variant is malformed */ - public String toJson(ZoneId zoneId) { - return toJson(zoneId, 
false); + public String toJson() { + return toJson(new DefaultScalarToJson()); } /** - * @param zoneId The ZoneId to use for formatting timestamps - * @param truncateTrailingZeros Whether to truncate trailing zeros in decimal values or timestamps + * @param scalarWriter the writer to use for writing scalar values * @return the JSON representation of the variant * @throws MalformedVariantException if the variant is malformed */ - public String toJson(ZoneId zoneId, boolean truncateTrailingZeros) { + public String toJson(ScalarToJson scalarWriter) { try (CharArrayWriter writer = new CharArrayWriter(); JsonGenerator gen = new JsonFactory().createGenerator(writer)) { - toJsonImpl(value, valuePos, metadata, metadataPos, gen, zoneId, truncateTrailingZeros); + toJsonImpl(value, valuePos, metadata, metadataPos, gen, scalarWriter); gen.flush(); return writer.toString(); } catch (IOException e) { @@ -364,102 +386,8 @@ public String toJson(ZoneId zoneId, boolean truncateTrailingZeros) { } } - /** The format for a timestamp without time zone. */ - private static final DateTimeFormatter TIMESTAMP_NTZ_FORMATTER = new DateTimeFormatterBuilder() - .append(DateTimeFormatter.ISO_LOCAL_DATE) - .appendLiteral('T') - .appendPattern("HH:mm:ss") - .appendFraction(ChronoField.MICRO_OF_SECOND, 6, 6, true) - .toFormatter(Locale.US); - - /** The format for a timestamp without time zone, with nanosecond precision. */ - private static final DateTimeFormatter TIMESTAMP_NANOS_NTZ_FORMATTER = new DateTimeFormatterBuilder() - .append(DateTimeFormatter.ISO_LOCAL_DATE) - .appendLiteral('T') - .appendPattern("HH:mm:ss") - .appendFraction(ChronoField.NANO_OF_SECOND, 9, 9, true) - .toFormatter(Locale.US); - - /** The format for a timestamp with time zone. 
*/ - private static final DateTimeFormatter TIMESTAMP_FORMATTER = new DateTimeFormatterBuilder() - .append(TIMESTAMP_NTZ_FORMATTER) - .appendOffset("+HH:MM", "+00:00") - .toFormatter(Locale.US); - - /** The format for a timestamp with time zone, with nanosecond precision. */ - private static final DateTimeFormatter TIMESTAMP_NANOS_FORMATTER = new DateTimeFormatterBuilder() - .append(TIMESTAMP_NANOS_NTZ_FORMATTER) - .appendOffset("+HH:MM", "+00:00") - .toFormatter(Locale.US); - - /** The format for a time. */ - private static final DateTimeFormatter TIME_FORMATTER = new DateTimeFormatterBuilder() - .appendPattern("HH:mm:ss") - .appendFraction(ChronoField.MICRO_OF_SECOND, 6, 6, true) - .toFormatter(Locale.US); - - /** The format for a timestamp without time zone, truncating trailing microsecond zeros. */ - private static final DateTimeFormatter TIMESTAMP_NTZ_TRUNC_FORMATTER = new DateTimeFormatterBuilder() - .append(DateTimeFormatter.ISO_LOCAL_DATE) - .appendLiteral('T') - .appendPattern("HH:mm:ss") - .optionalStart() - .appendFraction(ChronoField.MICRO_OF_SECOND, 0, 6, true) - .optionalEnd() - .toFormatter(Locale.US); - - /** - * The format for a timestamp without time zone, with nanosecond precision, truncating - * trailing nanosecond zeros. - */ - private static final DateTimeFormatter TIMESTAMP_NANOS_NTZ_TRUNC_FORMATTER = new DateTimeFormatterBuilder() - .append(DateTimeFormatter.ISO_LOCAL_DATE) - .appendLiteral('T') - .appendPattern("HH:mm:ss") - .optionalStart() - .appendFraction(ChronoField.NANO_OF_SECOND, 0, 9, true) - .optionalEnd() - .toFormatter(Locale.US); - - /** The format for a timestamp with time zone, truncating trailing microsecond zeros. 
*/ - private static final DateTimeFormatter TIMESTAMP_TRUNC_FORMATTER = new DateTimeFormatterBuilder() - .append(TIMESTAMP_NTZ_TRUNC_FORMATTER) - .appendOffset("+HH:MM", "+00:00") - .toFormatter(Locale.US); - - /** - * The format for a timestamp with time zone, with nanosecond precision, truncating trailing - * nanosecond zeros. - */ - private static final DateTimeFormatter TIMESTAMP_NANOS_TRUNC_FORMATTER = new DateTimeFormatterBuilder() - .append(TIMESTAMP_NANOS_NTZ_TRUNC_FORMATTER) - .appendOffset("+HH:MM", "+00:00") - .toFormatter(Locale.US); - - /** The format for a time, truncating trailing microsecond zeros. */ - private static final DateTimeFormatter TIME_TRUNC_FORMATTER = new DateTimeFormatterBuilder() - .appendPattern("HH:mm:ss") - .optionalStart() - .appendFraction(ChronoField.MICRO_OF_SECOND, 0, 6, true) - .optionalEnd() - .toFormatter(Locale.US); - - private static Instant microsToInstant(long microsSinceEpoch) { - return Instant.EPOCH.plus(microsSinceEpoch, ChronoUnit.MICROS); - } - - private static Instant nanosToInstant(long timestampNanos) { - return Instant.EPOCH.plus(timestampNanos, ChronoUnit.NANOS); - } - private static void toJsonImpl( - byte[] value, - int valuePos, - byte[] metadata, - int metadataPos, - JsonGenerator gen, - ZoneId zoneId, - boolean truncateTrailingZeros) + byte[] value, int valuePos, byte[] metadata, int metadataPos, JsonGenerator gen, ScalarToJson scalarWriter) throws IOException { switch (VariantUtil.getType(value, valuePos)) { case OBJECT: { @@ -483,8 +411,7 @@ private static void toJsonImpl( field.value.metadata, metadataPos, gen, - zoneId, - truncateTrailingZeros); + scalarWriter); } gen.writeEndObject(); break; @@ -501,99 +428,66 @@ private static void toJsonImpl( info.offsetSize, valuePos + info.offsetStartOffset, valuePos + info.dataStartOffset); - toJsonImpl(v.value, v.valuePos, v.metadata, metadataPos, gen, zoneId, truncateTrailingZeros); + toJsonImpl(v.value, v.valuePos, v.metadata, metadataPos, gen, 
scalarWriter); } gen.writeEndArray(); break; } case NULL: - gen.writeNull(); + scalarWriter.writeNull(gen); break; case BOOLEAN: - gen.writeBoolean(VariantUtil.getBoolean(value, valuePos)); + scalarWriter.writeBoolean(gen, VariantUtil.getBoolean(value, valuePos)); break; case BYTE: + scalarWriter.writeByte(gen, (byte) VariantUtil.getLong(value, valuePos)); + break; case SHORT: + scalarWriter.writeShort(gen, (short) VariantUtil.getLong(value, valuePos)); + break; case INT: + scalarWriter.writeInt(gen, (int) VariantUtil.getLong(value, valuePos)); + break; case LONG: - gen.writeNumber(VariantUtil.getLong(value, valuePos)); + scalarWriter.writeLong(gen, VariantUtil.getLong(value, valuePos)); break; case STRING: - gen.writeString(VariantUtil.getString(value, valuePos)); + scalarWriter.writeString(gen, VariantUtil.getString(value, valuePos)); + break; + case BINARY: + scalarWriter.writeBinary(gen, VariantUtil.getBinary(value, valuePos)); + break; + case FLOAT: + scalarWriter.writeFloat(gen, VariantUtil.getFloat(value, valuePos)); break; case DOUBLE: - gen.writeNumber(VariantUtil.getDouble(value, valuePos)); + scalarWriter.writeDouble(gen, VariantUtil.getDouble(value, valuePos)); break; case DECIMAL4: case DECIMAL8: case DECIMAL16: - if (truncateTrailingZeros) { - gen.writeNumber(VariantUtil.getDecimal(value, valuePos) - .stripTrailingZeros() - .toPlainString()); - } else { - gen.writeNumber(VariantUtil.getDecimal(value, valuePos).toPlainString()); - } + scalarWriter.writeDecimal(gen, VariantUtil.getDecimal(value, valuePos)); break; case DATE: - gen.writeString(LocalDate.ofEpochDay((int) VariantUtil.getLong(value, valuePos)) - .toString()); + scalarWriter.writeDate(gen, (int) VariantUtil.getLong(value, valuePos)); break; case TIMESTAMP: - if (truncateTrailingZeros) { - gen.writeString( - TIMESTAMP_TRUNC_FORMATTER.format(microsToInstant(VariantUtil.getLong(value, valuePos)) - .atZone(zoneId))); - } else { - 
gen.writeString(TIMESTAMP_FORMATTER.format(microsToInstant(VariantUtil.getLong(value, valuePos)) - .atZone(zoneId))); - } + scalarWriter.writeTimestamp(gen, VariantUtil.getLong(value, valuePos)); break; case TIMESTAMP_NTZ: - if (truncateTrailingZeros) { - gen.writeString( - TIMESTAMP_NTZ_TRUNC_FORMATTER.format(microsToInstant(VariantUtil.getLong(value, valuePos)) - .atZone(ZoneOffset.UTC))); - } else { - gen.writeString(TIMESTAMP_NTZ_FORMATTER.format(microsToInstant(VariantUtil.getLong(value, valuePos)) - .atZone(ZoneOffset.UTC))); - } - break; - case FLOAT: - gen.writeNumber(VariantUtil.getFloat(value, valuePos)); - break; - case BINARY: - gen.writeString(Base64.getEncoder().encodeToString(VariantUtil.getBinary(value, valuePos))); + scalarWriter.writeTimestampNtz(gen, VariantUtil.getLong(value, valuePos)); break; case TIME: - if (truncateTrailingZeros) { - gen.writeString(TIME_TRUNC_FORMATTER.format( - LocalTime.ofNanoOfDay(VariantUtil.getLong(value, valuePos) * 1_000))); - } else { - gen.writeString( - TIME_FORMATTER.format(LocalTime.ofNanoOfDay(VariantUtil.getLong(value, valuePos) * 1_000))); - } + scalarWriter.writeTime(gen, VariantUtil.getLong(value, valuePos)); break; case TIMESTAMP_NANOS: - if (truncateTrailingZeros) { - gen.writeString(TIMESTAMP_NANOS_TRUNC_FORMATTER.format( - nanosToInstant(VariantUtil.getLong(value, valuePos)).atZone(zoneId))); - } else { - gen.writeString(TIMESTAMP_NANOS_FORMATTER.format( - nanosToInstant(VariantUtil.getLong(value, valuePos)).atZone(zoneId))); - } + scalarWriter.writeTimestampNanos(gen, VariantUtil.getLong(value, valuePos)); break; case TIMESTAMP_NANOS_NTZ: - if (truncateTrailingZeros) { - gen.writeString(TIMESTAMP_NANOS_NTZ_TRUNC_FORMATTER.format( - nanosToInstant(VariantUtil.getLong(value, valuePos)).atZone(ZoneOffset.UTC))); - } else { - gen.writeString(TIMESTAMP_NANOS_NTZ_FORMATTER.format( - nanosToInstant(VariantUtil.getLong(value, valuePos)).atZone(ZoneOffset.UTC))); - } + scalarWriter.writeTimestampNanosNtz(gen, 
VariantUtil.getLong(value, valuePos)); break; case UUID: - gen.writeString(VariantUtil.getUUID(value, valuePos).toString()); + scalarWriter.writeUUID(gen, VariantUtil.getUUID(value, valuePos)); break; default: throw new IllegalArgumentException("Unsupported type: " + VariantUtil.getType(value, valuePos)); diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java index 7dfef93185..1c7543d964 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java @@ -28,7 +28,6 @@ import java.time.Instant; import java.time.LocalDate; import java.time.LocalTime; -import java.time.ZoneId; import java.time.format.DateTimeFormatter; import java.util.Arrays; import java.util.Base64; @@ -372,8 +371,6 @@ public void testTimestamp() { vb.appendTimestamp(micros); testVariant(vb.result(), v -> { Assert.assertEquals("\"2024-12-16T18:23:45.321456+00:00\"", v.toJson()); - Assert.assertEquals("\"2024-12-16T10:23:45.321456-08:00\"", v.toJson(ZoneId.of("-08:00"))); - Assert.assertEquals("\"2024-12-16T19:23:45.321456+01:00\"", v.toJson(ZoneId.of("+01:00"))); Assert.assertEquals(micros, v.getLong()); }); } @@ -386,8 +383,6 @@ public void testTimestampNtz() { vb.appendTimestampNtz(micros); testVariant(vb.result(), v -> { Assert.assertEquals("\"2024-01-01T23:00:00.000001\"", v.toJson()); - Assert.assertEquals("\"2024-01-01T23:00:00.000001\"", v.toJson(ZoneId.of("-08:00"))); - Assert.assertEquals(v.toJson(ZoneId.of("-08:00")), v.toJson(ZoneId.of("+02:00"))); Assert.assertEquals(micros, v.getLong()); }); } @@ -414,8 +409,6 @@ public void testTimestampNanos() { vb.appendTimestampNanos(nanos); testVariant(vb.result(), v -> { Assert.assertEquals("\"2024-12-16T18:23:45.321456987+00:00\"", v.toJson()); - Assert.assertEquals("\"2024-12-16T10:23:45.321456987-08:00\"", 
v.toJson(ZoneId.of("-08:00"))); - Assert.assertEquals("\"2024-12-16T19:23:45.321456987+01:00\"", v.toJson(ZoneId.of("+01:00"))); Assert.assertEquals(nanos, v.getLong()); }); } @@ -428,8 +421,6 @@ public void testTimestampNanosNtz() { vb.appendTimestampNanosNtz(nanos); testVariant(vb.result(), v -> { Assert.assertEquals("\"2024-01-01T23:00:00.839280983\"", v.toJson()); - Assert.assertEquals("\"2024-01-01T23:00:00.839280983\"", v.toJson(ZoneId.of("-08:00"))); - Assert.assertEquals(v.toJson(ZoneId.of("-08:00")), v.toJson(ZoneId.of("+02:00"))); Assert.assertEquals(nanos, v.getLong()); }); } @@ -604,143 +595,4 @@ public void testAllowDuplicateKeys() { Assert.fail("Unexpected exception: " + e); } } - - @Test - public void testTruncateTrailingZeroDecimal() { - for (String[] strings : Arrays.asList( - // decimal4 - // truncate all trailing zeros - new String[] {"1234.0000", "1234"}, - // truncate some trailing zeros - new String[] {"1234.5600", "1234.56"}, - // truncate no trailing zeros - new String[] {"1234.5678", "1234.5678"}, - // decimal8 - // truncate all trailing zeros - new String[] {"-10.0000000000", "-10"}, - // truncate some trailing zeros - new String[] {"-10.2147000000", "-10.2147"}, - // truncate no trailing zeros - new String[] {"-10.2147483647", "-10.2147483647"}, - // decimal16 - // truncate all trailing zeros - new String[] {"1092233720368547.00000", "1092233720368547"}, - // truncate some trailing zeros - new String[] {"1092233720368547.75800", "1092233720368547.758"}, - // truncate no trailing zeros - new String[] {"1092233720368547.75807", "1092233720368547.75807"})) { - VariantBuilder vb = new VariantBuilder(false); - BigDecimal d = new BigDecimal(strings[0]); - vb.appendDecimal(d); - testVariant(vb.result(), v -> { - Assert.assertEquals(strings[0], v.toJson()); - Assert.assertEquals(strings[1], v.toJson(ZoneId.of("UTC"), true)); - }); - } - } - - @Test - public void testTruncateTrailingZeroTimestamp() { - DateTimeFormatter dtf = 
DateTimeFormatter.ISO_DATE_TIME; - for (String[] strings : Arrays.asList( - // truncate all trailing zeros - new String[] {"2024-12-16T10:23:45.000000-08:00", "2024-12-16T10:23:45-08:00"}, - // truncate all trailing zeros - new String[] {"2024-12-16T10:23:45.123000-08:00", "2024-12-16T10:23:45.123-08:00"}, - // truncate no trailing zeros - new String[] {"2024-12-16T10:23:45.123456-08:00", "2024-12-16T10:23:45.123456-08:00"})) { - VariantBuilder vb = new VariantBuilder(false); - long micros = microsSinceEpoch(Instant.from(dtf.parse(strings[0]))); - vb.appendTimestamp(micros); - testVariant(vb.result(), v -> { - Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); - Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); - }); - } - } - - @Test - public void testTruncateTrailingZeroTimestampNtz() { - DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME; - for (String[] strings : Arrays.asList( - // truncate all trailing zeros - new String[] {"2024-12-16T10:23:45.000000", "2024-12-16T10:23:45"}, - // truncate all trailing zeros - new String[] {"2024-12-16T10:23:45.123000", "2024-12-16T10:23:45.123"}, - // truncate no trailing zeros - new String[] {"2024-12-16T10:23:45.123456", "2024-12-16T10:23:45.123456"})) { - VariantBuilder vb = new VariantBuilder(false); - - long micros = microsSinceEpoch(Instant.from(dtf.parse(String.format("%sZ", strings[0])))); - vb.appendTimestampNtz(micros); - testVariant(vb.result(), v -> { - Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); - Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); - Assert.assertEquals(micros, v.getLong()); - }); - } - } - - @Test - public void testTruncateTrailingZeroTime() { - for (String[] strings : Arrays.asList( - // truncate all trailing zeros - new String[] {"10:23:45.000000", "10:23:45"}, - // truncate some trailing zeros - new String[] 
{"10:23:45.123000", "10:23:45.123"}, - // truncate no trailing zeros - new String[] {"10:23:45.123456", "10:23:45.123456"})) { - VariantBuilder vb = new VariantBuilder(false); - - long micros = LocalTime.parse(strings[0]).toNanoOfDay() / 1_000; - vb.appendTime(micros); - testVariant(vb.result(), v -> { - Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); - Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); - Assert.assertEquals(micros, v.getLong()); - }); - } - } - - @Test - public void testTruncateTrailingZeroTimestampNanos() { - DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME; - for (String[] strings : Arrays.asList( - // truncate all trailing zeros - new String[] {"2024-12-16T10:23:45.000000000-08:00", "2024-12-16T10:23:45-08:00"}, - // truncate some trailing zeros - new String[] {"2024-12-16T10:23:45.123450000-08:00", "2024-12-16T10:23:45.12345-08:00"}, - // truncate no trailing zeros - new String[] {"2024-12-16T10:23:45.123456789-08:00", "2024-12-16T10:23:45.123456789-08:00"})) { - VariantBuilder vb = new VariantBuilder(false); - long nanos = nanosSinceEpoch(Instant.from(dtf.parse(strings[0]))); - vb.appendTimestampNanos(nanos); - testVariant(vb.result(), v -> { - Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); - Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); - }); - } - } - - @Test - public void testTruncateTrailingZeroTimestampNanosNtz() { - DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME; - for (String[] strings : Arrays.asList( - // truncate all trailing zeros - new String[] {"2024-12-16T10:23:45.000000000", "2024-12-16T10:23:45"}, - // truncate some trailing zeros - new String[] {"2024-12-16T10:23:45.123450000", "2024-12-16T10:23:45.12345"}, - // truncate no trailing zeros - new String[] {"2024-12-16T10:23:45.123456789", "2024-12-16T10:23:45.123456789"})) { - 
VariantBuilder vb = new VariantBuilder(false); - - long nanos = nanosSinceEpoch(Instant.from(dtf.parse(String.format("%sZ", strings[0])))); - vb.appendTimestampNanosNtz(nanos); - testVariant(vb.result(), v -> { - Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); - Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); - Assert.assertEquals(nanos, vb.result().getLong()); - }); - } - } } From 968e9a10935498f7af55201d49864f24a85b2e52 Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Sun, 6 Apr 2025 10:51:12 -0700 Subject: [PATCH 20/20] Use ByteBuffer instead --- .../org/apache/parquet/variant/Variant.java | 179 ++++++------- .../parquet/variant/VariantBuilder.java | 26 +- .../apache/parquet/variant/VariantUtil.java | 247 ++++++++++-------- .../parquet/variant/TestVariantEncoding.java | 18 +- 4 files changed, 250 insertions(+), 220 deletions(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index 54e465e0e3..c278dae93e 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -23,6 +23,7 @@ import java.io.CharArrayWriter; import java.io.IOException; import java.math.BigDecimal; +import java.nio.ByteBuffer; import java.util.UUID; import org.apache.parquet.cli.util.RuntimeIOException; @@ -31,14 +32,10 @@ */ public final class Variant { /** The buffer that contains the Variant value. */ - final byte[] value; - /** The starting index into `value` where the Variant value begins. */ - final int valuePos; + final ByteBuffer value; /** The buffer that contains the Variant metadata. */ - final byte[] metadata; - /** The starting index into `metadata` where the Variant metadata begins.
*/ - final int metadataPos; + final ByteBuffer metadata; /** * The threshold to switch from linear search to binary search when looking up a field by key in @@ -52,23 +49,23 @@ public Variant(byte[] value, byte[] metadata) { } Variant(byte[] value, int valuePos, byte[] metadata, int metadataPos) { - if (valuePos < 0 || valuePos >= value.length) { - throw new IllegalArgumentException( - String.format("Invalid valuePos: %d. value.length: %d", valuePos, value.length)); - } - if (metadataPos < 0 || metadataPos >= metadata.length) { - throw new IllegalArgumentException( - String.format("Invalid metadataPos: %d. metadata.length: %d", metadataPos, metadata.length)); - } - this.value = value; - this.valuePos = valuePos; + this( + ByteBuffer.wrap(value, valuePos, value.length - valuePos), + ByteBuffer.wrap(metadata, metadataPos, metadata.length - metadataPos)); + } + + Variant(ByteBuffer value, ByteBuffer metadata) { + this.value = value.asReadOnlyBuffer(); + this.value.mark(); + + this.metadata = metadata.asReadOnlyBuffer(); + this.metadata.mark(); - this.metadata = metadata; - this.metadataPos = metadataPos; // There is currently only one allowed version. 
- if ((metadata[metadataPos] & VariantUtil.VERSION_MASK) != VariantUtil.VERSION) { + if ((metadata.get(metadata.position()) & VariantUtil.VERSION_MASK) != VariantUtil.VERSION) { throw new UnsupportedOperationException(String.format( - "Unsupported variant metadata version: %02X", metadata[metadataPos] & VariantUtil.VERSION_MASK)); + "Unsupported variant metadata version: %02X", + metadata.get(metadata.position()) & VariantUtil.VERSION_MASK)); } } @@ -76,14 +73,14 @@ public Variant(byte[] value, byte[] metadata) { * @return the boolean value */ public boolean getBoolean() { - return VariantUtil.getBoolean(value, valuePos); + return VariantUtil.getBoolean(value); } /** * @return the byte value */ public byte getByte() { - long longValue = VariantUtil.getLong(value, valuePos); + long longValue = VariantUtil.getLong(value); if (longValue < Byte.MIN_VALUE || longValue > Byte.MAX_VALUE) { throw new IllegalStateException("Value out of range for byte: " + longValue); } @@ -94,7 +91,7 @@ public byte getByte() { * @return the short value */ public short getShort() { - long longValue = VariantUtil.getLong(value, valuePos); + long longValue = VariantUtil.getLong(value); if (longValue < Short.MIN_VALUE || longValue > Short.MAX_VALUE) { throw new IllegalStateException("Value out of range for short: " + longValue); } @@ -105,7 +102,7 @@ public short getShort() { * @return the int value */ public int getInt() { - long longValue = VariantUtil.getLong(value, valuePos); + long longValue = VariantUtil.getLong(value); if (longValue < Integer.MIN_VALUE || longValue > Integer.MAX_VALUE) { throw new IllegalStateException("Value out of range for int: " + longValue); } @@ -116,63 +113,63 @@ public int getInt() { * @return the long value */ public long getLong() { - return VariantUtil.getLong(value, valuePos); + return VariantUtil.getLong(value); } /** * @return the double value */ public double getDouble() { - return VariantUtil.getDouble(value, valuePos); + return 
VariantUtil.getDouble(value); } /** * @return the decimal value */ public BigDecimal getDecimal() { - return VariantUtil.getDecimal(value, valuePos); + return VariantUtil.getDecimal(value); } /** * @return the float value */ public float getFloat() { - return VariantUtil.getFloat(value, valuePos); + return VariantUtil.getFloat(value); } /** * @return the binary value */ public byte[] getBinary() { - return VariantUtil.getBinary(value, valuePos); + return VariantUtil.getBinary(value); } /** * @return the UUID value */ public UUID getUUID() { - return VariantUtil.getUUID(value, valuePos); + return VariantUtil.getUUID(value); } /** * @return the string value */ public String getString() { - return VariantUtil.getString(value, valuePos); + return VariantUtil.getString(value); } /** * @return the type of the variant value */ public VariantUtil.Type getType() { - return VariantUtil.getType(value, valuePos); + return VariantUtil.getType(value); } /** * @return the number of object fields in the variant. `getType()` must be `Type.OBJECT`. */ public int numObjectElements() { - return VariantUtil.getObjectInfo(value, valuePos).numElements; + return VariantUtil.getObjectInfo(value).numElements; } /** @@ -182,7 +179,7 @@ public int numObjectElements() { * @return the field value whose key is equal to `key`, or null if key is not found */ public Variant getFieldByKey(String key) { - VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, valuePos); + VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value); // Use linear search for a short list. Switch to binary search when the length reaches // `BINARY_SEARCH_THRESHOLD`. 
if (info.numElements < BINARY_SEARCH_THRESHOLD) { @@ -191,12 +188,11 @@ public Variant getFieldByKey(String key) { i, value, metadata, - metadataPos, info.idSize, info.offsetSize, - valuePos + info.idStartOffset, - valuePos + info.offsetStartOffset, - valuePos + info.dataStartOffset); + value.position() + info.idStartOffset, + value.position() + info.offsetStartOffset, + value.position() + info.dataStartOffset); if (field.key.equals(key)) { return field.value; } @@ -213,12 +209,11 @@ public Variant getFieldByKey(String key) { mid, value, metadata, - metadataPos, info.idSize, info.offsetSize, - valuePos + info.idStartOffset, - valuePos + info.offsetStartOffset, - valuePos + info.dataStartOffset); + value.position() + info.idStartOffset, + value.position() + info.offsetStartOffset, + value.position() + info.dataStartOffset); int cmp = field.key.compareTo(key); if (cmp < 0) { low = mid + 1; @@ -252,7 +247,7 @@ public ObjectField(String key, Variant value) { * @return the ObjectField at the `index` slot, or null if `index` is out of bounds */ public ObjectField getFieldAtIndex(int index) { - VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, valuePos); + VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value); if (index < 0 || index >= info.numElements) { return null; } @@ -260,19 +255,17 @@ public ObjectField getFieldAtIndex(int index) { index, value, metadata, - metadataPos, info.idSize, info.offsetSize, - valuePos + info.idStartOffset, - valuePos + info.offsetStartOffset, - valuePos + info.dataStartOffset); + value.position() + info.idStartOffset, + value.position() + info.offsetStartOffset, + value.position() + info.dataStartOffset); } private static ObjectField getFieldAtIndex( int index, - byte[] value, - byte[] metadata, - int metadataPos, + ByteBuffer value, + ByteBuffer metadata, int idSize, int offsetSize, int idStart, @@ -281,8 +274,8 @@ private static ObjectField getFieldAtIndex( // idStart, offsetStart, and dataStart are absolute 
positions in the `value` buffer. int id = VariantUtil.readUnsigned(value, idStart + idSize * index, idSize); int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize * index, offsetSize); - String key = VariantUtil.getMetadataKey(metadata, metadataPos, id); - Variant v = new Variant(value, dataStart + offset, metadata, metadataPos); + String key = VariantUtil.getMetadataKey(metadata, id); + Variant v = new Variant(VariantUtil.slice(value, dataStart + offset), metadata); return new ObjectField(key, v); } @@ -290,7 +283,7 @@ private static ObjectField getFieldAtIndex( * @return the number of array elements. `getType()` must be `Type.ARRAY`. */ public int numArrayElements() { - return VariantUtil.getArrayInfo(value, valuePos).numElements; + return VariantUtil.getArrayInfo(value).numElements; } /** @@ -300,7 +293,7 @@ public int numArrayElements() { * @return the array element Variant at the `index` slot, or null if `index` is out of bounds */ public Variant getElementAtIndex(int index) { - VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value, valuePos); + VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value); if (index < 0 || index >= info.numElements) { return null; } @@ -308,17 +301,16 @@ public Variant getElementAtIndex(int index) { index, value, metadata, - metadataPos, info.offsetSize, - valuePos + info.offsetStartOffset, - valuePos + info.dataStartOffset); + value.position() + info.offsetStartOffset, + value.position() + info.dataStartOffset); } private static Variant getElementAtIndex( - int index, byte[] value, byte[] metadata, int metadataPos, int offsetSize, int offsetStart, int dataStart) { + int index, ByteBuffer value, ByteBuffer metadata, int offsetSize, int offsetStart, int dataStart) { // offsetStart and dataStart are absolute positions in the `value` buffer. 
int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize * index, offsetSize); - return new Variant(value, dataStart + offset, metadata, metadataPos); + return new Variant(VariantUtil.slice(value, dataStart + offset), metadata); } /** @@ -378,7 +370,7 @@ public String toJson() { public String toJson(ScalarToJson scalarWriter) { try (CharArrayWriter writer = new CharArrayWriter(); JsonGenerator gen = new JsonFactory().createGenerator(writer)) { - toJsonImpl(value, valuePos, metadata, metadataPos, gen, scalarWriter); + toJsonImpl(value, metadata, gen, scalarWriter); gen.flush(); return writer.toString(); } catch (IOException e) { @@ -386,49 +378,40 @@ public String toJson(ScalarToJson scalarWriter) { } } - private static void toJsonImpl( - byte[] value, int valuePos, byte[] metadata, int metadataPos, JsonGenerator gen, ScalarToJson scalarWriter) + private static void toJsonImpl(ByteBuffer value, ByteBuffer metadata, JsonGenerator gen, ScalarToJson scalarWriter) throws IOException { - switch (VariantUtil.getType(value, valuePos)) { + switch (VariantUtil.getType(value)) { case OBJECT: { - VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, valuePos); + VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value); gen.writeStartObject(); for (int i = 0; i < info.numElements; ++i) { ObjectField field = getFieldAtIndex( i, value, metadata, - metadataPos, info.idSize, info.offsetSize, - valuePos + info.idStartOffset, - valuePos + info.offsetStartOffset, - valuePos + info.dataStartOffset); + value.position() + info.idStartOffset, + value.position() + info.offsetStartOffset, + value.position() + info.dataStartOffset); gen.writeFieldName(field.key); - toJsonImpl( - field.value.value, - field.value.valuePos, - field.value.metadata, - metadataPos, - gen, - scalarWriter); + toJsonImpl(field.value.value, field.value.metadata, gen, scalarWriter); } gen.writeEndObject(); break; } case ARRAY: { - VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value, 
valuePos); + VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value); gen.writeStartArray(); for (int i = 0; i < info.numElements; ++i) { Variant v = getElementAtIndex( i, value, metadata, - metadataPos, info.offsetSize, - valuePos + info.offsetStartOffset, - valuePos + info.dataStartOffset); - toJsonImpl(v.value, v.valuePos, v.metadata, metadataPos, gen, scalarWriter); + value.position() + info.offsetStartOffset, + value.position() + info.dataStartOffset); + toJsonImpl(v.value, v.metadata, gen, scalarWriter); } gen.writeEndArray(); break; @@ -437,60 +420,60 @@ private static void toJsonImpl( scalarWriter.writeNull(gen); break; case BOOLEAN: - scalarWriter.writeBoolean(gen, VariantUtil.getBoolean(value, valuePos)); + scalarWriter.writeBoolean(gen, VariantUtil.getBoolean(value)); break; case BYTE: - scalarWriter.writeByte(gen, (byte) VariantUtil.getLong(value, valuePos)); + scalarWriter.writeByte(gen, (byte) VariantUtil.getLong(value)); break; case SHORT: - scalarWriter.writeShort(gen, (short) VariantUtil.getLong(value, valuePos)); + scalarWriter.writeShort(gen, (short) VariantUtil.getLong(value)); break; case INT: - scalarWriter.writeInt(gen, (int) VariantUtil.getLong(value, valuePos)); + scalarWriter.writeInt(gen, (int) VariantUtil.getLong(value)); break; case LONG: - scalarWriter.writeLong(gen, VariantUtil.getLong(value, valuePos)); + scalarWriter.writeLong(gen, VariantUtil.getLong(value)); break; case STRING: - scalarWriter.writeString(gen, VariantUtil.getString(value, valuePos)); + scalarWriter.writeString(gen, VariantUtil.getString(value)); break; case BINARY: - scalarWriter.writeBinary(gen, VariantUtil.getBinary(value, valuePos)); + scalarWriter.writeBinary(gen, VariantUtil.getBinary(value)); break; case FLOAT: - scalarWriter.writeFloat(gen, VariantUtil.getFloat(value, valuePos)); + scalarWriter.writeFloat(gen, VariantUtil.getFloat(value)); break; case DOUBLE: - scalarWriter.writeDouble(gen, VariantUtil.getDouble(value, valuePos)); + 
scalarWriter.writeDouble(gen, VariantUtil.getDouble(value)); break; case DECIMAL4: case DECIMAL8: case DECIMAL16: - scalarWriter.writeDecimal(gen, VariantUtil.getDecimal(value, valuePos)); + scalarWriter.writeDecimal(gen, VariantUtil.getDecimal(value)); break; case DATE: - scalarWriter.writeDate(gen, (int) VariantUtil.getLong(value, valuePos)); + scalarWriter.writeDate(gen, (int) VariantUtil.getLong(value)); break; case TIMESTAMP: - scalarWriter.writeTimestamp(gen, VariantUtil.getLong(value, valuePos)); + scalarWriter.writeTimestamp(gen, VariantUtil.getLong(value)); break; case TIMESTAMP_NTZ: - scalarWriter.writeTimestampNtz(gen, VariantUtil.getLong(value, valuePos)); + scalarWriter.writeTimestampNtz(gen, VariantUtil.getLong(value)); break; case TIME: - scalarWriter.writeTime(gen, VariantUtil.getLong(value, valuePos)); + scalarWriter.writeTime(gen, VariantUtil.getLong(value)); break; case TIMESTAMP_NANOS: - scalarWriter.writeTimestampNanos(gen, VariantUtil.getLong(value, valuePos)); + scalarWriter.writeTimestampNanos(gen, VariantUtil.getLong(value)); break; case TIMESTAMP_NANOS_NTZ: - scalarWriter.writeTimestampNanosNtz(gen, VariantUtil.getLong(value, valuePos)); + scalarWriter.writeTimestampNanosNtz(gen, VariantUtil.getLong(value)); break; case UUID: - scalarWriter.writeUUID(gen, VariantUtil.getUUID(value, valuePos)); + scalarWriter.writeUUID(gen, VariantUtil.getUUID(value)); break; default: - throw new IllegalArgumentException("Unsupported type: " + VariantUtil.getType(value, valuePos)); + throw new IllegalArgumentException("Unsupported type: " + VariantUtil.getType(value)); } } } diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java index 41f9e017f1..41434cc8a8 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java @@ -431,15 +431,15 
@@ public void finishWritingArray(int start, ArrayList offsets) { * @param v the Variant value to append */ public void appendVariant(Variant v) { - appendVariantImpl(v.value, v.valuePos, v.metadata, v.metadataPos); + appendVariantImpl(v.value, v.value.position(), v.metadata); } - private void appendVariantImpl(byte[] value, int valuePos, byte[] metadata, int metadataPos) { - VariantUtil.checkIndex(valuePos, value.length); - int basicType = value[valuePos] & VariantUtil.BASIC_TYPE_MASK; + private void appendVariantImpl(ByteBuffer value, int valuePos, ByteBuffer metadata) { + VariantUtil.checkIndex(valuePos, value.limit()); + int basicType = value.get(valuePos) & VariantUtil.BASIC_TYPE_MASK; switch (basicType) { case VariantUtil.OBJECT: { - VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, valuePos); + VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(VariantUtil.slice(value, valuePos)); ArrayList fields = new ArrayList<>(info.numElements); int start = writePos; for (int i = 0; i < info.numElements; ++i) { @@ -448,16 +448,16 @@ private void appendVariantImpl(byte[] value, int valuePos, byte[] metadata, int int offset = VariantUtil.readUnsigned( value, valuePos + info.offsetStartOffset + info.offsetSize * i, info.offsetSize); int elementPos = valuePos + info.dataStartOffset + offset; - String key = VariantUtil.getMetadataKey(metadata, metadataPos, id); + String key = VariantUtil.getMetadataKey(metadata, id); int newId = addKey(key); fields.add(new FieldEntry(key, newId, writePos - start)); - appendVariantImpl(value, elementPos, metadata, metadataPos); + appendVariantImpl(value, elementPos, metadata); } finishWritingObject(start, fields); break; } case VariantUtil.ARRAY: { - VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value, valuePos); + VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(VariantUtil.slice(value, valuePos)); ArrayList offsets = new ArrayList<>(info.numElements); int start = writePos; for (int i = 0; i < info.numElements; 
++i) { @@ -465,7 +465,7 @@ private void appendVariantImpl(byte[] value, int valuePos, byte[] metadata, int value, valuePos + info.offsetStartOffset + info.offsetSize * i, info.offsetSize); int elementPos = valuePos + info.dataStartOffset + offset; offsets.add(writePos - start); - appendVariantImpl(value, elementPos, metadata, metadataPos); + appendVariantImpl(value, elementPos, metadata); } finishWritingArray(start, offsets); break; @@ -476,11 +476,11 @@ private void appendVariantImpl(byte[] value, int valuePos, byte[] metadata, int } } - private void shallowAppendVariantImpl(byte[] value, int pos) { - int size = VariantUtil.valueSize(value, pos); - VariantUtil.checkIndex(pos + size - 1, value.length); + private void shallowAppendVariantImpl(ByteBuffer value, int valuePos) { + int size = VariantUtil.valueSize(value, valuePos); + VariantUtil.checkIndex(valuePos + size - 1, value.limit()); checkCapacity(size); - System.arraycopy(value, pos, writeBuffer, writePos, size); + VariantUtil.slice(value, valuePos).get(writeBuffer, writePos, size); writePos += size; } diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java index 1c94074bf3..9c4c13472e 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java @@ -187,9 +187,6 @@ public class VariantUtil { // The size (in bytes) of a UUID. public static final int UUID_SIZE = 16; - // Default size limit for both variant value and variant metadata. - public static final int DEFAULT_SIZE_LIMIT = U24_MAX + 1; - /** * Write the least significant `numBytes` bytes in `value` into `bytes[pos, pos + numBytes)` in * little endian. @@ -237,24 +234,24 @@ public static void checkIndex(int pos, int length) { } /** - * Reads a little-endian signed long value from `bytes[pos, pos + numBytes)`. 
- * @param bytes The byte array to read from - * @param pos The starting index of the byte array to read from + * Reads a little-endian signed long value from `buffer[pos, pos + numBytes)`. + * @param buffer The ByteBuffer to read from + * @param pos The starting index of the buffer to read from * @param numBytes The number of bytes to read * @return The long value */ - static long readLong(byte[] bytes, int pos, int numBytes) { - checkIndex(pos, bytes.length); - checkIndex(pos + numBytes - 1, bytes.length); + static long readLong(ByteBuffer buffer, int pos, int numBytes) { + checkIndex(pos, buffer.limit()); + checkIndex(pos + numBytes - 1, buffer.limit()); long result = 0; // All bytes except the most significant byte should be unsigned-extended and shifted // (so we need & 0xFF`). The most significant byte should be sign-extended and is handled // after the loop. for (int i = 0; i < numBytes - 1; ++i) { - long unsignedByteValue = bytes[pos + i] & 0xFF; + long unsignedByteValue = buffer.get(pos + i) & 0xFF; result |= unsignedByteValue << (8 * i); } - long signedByteValue = bytes[pos + numBytes - 1]; + long signedByteValue = buffer.get(pos + numBytes - 1); result |= signedByteValue << (8 * (numBytes - 1)); return result; } @@ -278,6 +275,25 @@ static int readUnsigned(byte[] bytes, int pos, int numBytes) { return result; } + /** + * Read a little-endian unsigned int value from `bytes[pos, pos + numBytes)`. The value must fit + * into a non-negative int (`[0, Integer.MAX_VALUE]`). + */ + static int readUnsigned(ByteBuffer bytes, int pos, int numBytes) { + checkIndex(pos, bytes.limit()); + checkIndex(pos + numBytes - 1, bytes.limit()); + int result = 0; + // Similar to the `readLong` loop, but all bytes should be unsigned-extended. + for (int i = 0; i < numBytes; ++i) { + int unsignedByteValue = bytes.get(pos + i) & 0xFF; + result |= unsignedByteValue << (8 * i); + } + if (result < 0) { + throw new IllegalArgumentException(String.format("Failed to read unsigned int. 
numBytes: %d", numBytes)); + } + return result; + } + /** * The value type of Variant value. It is determined by the header byte. */ @@ -306,23 +322,17 @@ public enum Type { UUID } - public static int getPrimitiveTypeId(byte[] value, int pos) { - checkIndex(pos, value.length); - return (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; - } - /** * Returns the value type of Variant value `value[pos...]`. It is only legal to call `get*` if * `getType` returns the corresponding type. For example, it is only legal to call * `getLong` if this method returns `Type.Long`. * @param value The Variant value to get the type from - * @param pos The starting index of the Variant value * @return The type of the Variant value */ - public static Type getType(byte[] value, int pos) { - checkIndex(pos, value.length); - int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; + public static Type getType(ByteBuffer value) { + checkIndex(value.position(), value.limit()); + int basicType = value.get(value.position()) & BASIC_TYPE_MASK; + int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; switch (basicType) { case SHORT_STR: return Type.STRING; @@ -387,14 +397,18 @@ public static Type getType(byte[] value, int pos) { * @return The actual size of the Variant value */ public static int valueSize(byte[] value, int pos) { - checkIndex(pos, value.length); - int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; + return valueSize(ByteBuffer.wrap(value), pos); + } + + public static int valueSize(ByteBuffer value, int pos) { + checkIndex(pos, value.limit()); + int basicType = value.get(pos) & BASIC_TYPE_MASK; + int typeInfo = (value.get(pos) >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; switch (basicType) { case SHORT_STR: return 1 + typeInfo; case OBJECT: { - VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value, pos); + 
VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(slice(value, pos)); return info.dataStartOffset + readUnsigned( value, @@ -402,7 +416,7 @@ public static int valueSize(byte[] value, int pos) { info.offsetSize); } case ARRAY: { - VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value, pos); + VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(slice(value, pos)); return info.dataStartOffset + readUnsigned( value, @@ -456,10 +470,10 @@ private static IllegalArgumentException unexpectedType(Type[] types) { return new IllegalArgumentException("Expected type to be one of: " + Arrays.toString(types)); } - public static boolean getBoolean(byte[] value, int pos) { - checkIndex(pos, value.length); - int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; + public static boolean getBoolean(ByteBuffer value) { + checkIndex(value.position(), value.limit()); + int basicType = value.get(value.position()) & BASIC_TYPE_MASK; + int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != PRIMITIVE || (typeInfo != TRUE && typeInfo != FALSE)) { throw unexpectedType(Type.BOOLEAN); } @@ -478,13 +492,12 @@ public static boolean getBoolean(byte[] value, int pos) { * If the type is `TIMESTAMP_NANOS/TIMESTAMP_NANOS_NTZ`, the return value represents the number of * nanoseconds from the Unix epoch. 
* @param value The Variant value - * @param pos The starting index of the Variant value * @return The long value */ - public static long getLong(byte[] value, int pos) { - checkIndex(pos, value.length); - int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; + public static long getLong(ByteBuffer value) { + checkIndex(value.position(), value.limit()); + int basicType = value.get(value.position()) & BASIC_TYPE_MASK; + int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; String exceptionMessage = "Expect type to be one of: BYTE, SHORT, INT, LONG, TIMESTAMP, TIMESTAMP_NTZ, TIME, TIMESTAMP_NANOS, TIMESTAMP_NANOS_NTZ"; if (basicType != PRIMITIVE) { @@ -492,59 +505,59 @@ public static long getLong(byte[] value, int pos) { } switch (typeInfo) { case INT8: - return readLong(value, pos + 1, 1); + return readLong(value, value.position() + 1, 1); case INT16: - return readLong(value, pos + 1, 2); + return readLong(value, value.position() + 1, 2); case INT32: case DATE: - return readLong(value, pos + 1, 4); + return readLong(value, value.position() + 1, 4); case INT64: case TIMESTAMP: case TIMESTAMP_NTZ: case TIME: case TIMESTAMP_NANOS: case TIMESTAMP_NANOS_NTZ: - return readLong(value, pos + 1, 8); + return readLong(value, value.position() + 1, 8); default: throw new IllegalStateException(exceptionMessage); } } - public static double getDouble(byte[] value, int pos) { - checkIndex(pos, value.length); - int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; + public static double getDouble(ByteBuffer value) { + checkIndex(value.position(), value.limit()); + int basicType = value.get(value.position()) & BASIC_TYPE_MASK; + int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != PRIMITIVE || typeInfo != DOUBLE) { throw unexpectedType(Type.DOUBLE); } - return 
Double.longBitsToDouble(readLong(value, pos + 1, 8)); + return Double.longBitsToDouble(readLong(value, value.position() + 1, 8)); } - public static BigDecimal getDecimalWithOriginalScale(byte[] value, int pos) { - checkIndex(pos, value.length); - int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; + public static BigDecimal getDecimalWithOriginalScale(ByteBuffer value) { + checkIndex(value.position(), value.limit()); + int basicType = value.get(value.position()) & BASIC_TYPE_MASK; + int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != PRIMITIVE) { throw unexpectedType(new Type[] {Type.DECIMAL4, Type.DECIMAL8, Type.DECIMAL16}); } // Interpret the scale byte as unsigned. If it is a negative byte, the unsigned value must be // greater than `MAX_DECIMAL16_PRECISION` and will trigger an error in `checkDecimal`. - int scale = value[pos + 1] & 0xFF; + int scale = value.get(value.position() + 1) & 0xFF; BigDecimal result; switch (typeInfo) { case DECIMAL4: - result = BigDecimal.valueOf(readLong(value, pos + 2, 4), scale); + result = BigDecimal.valueOf(readLong(value, value.position() + 2, 4), scale); break; case DECIMAL8: - result = BigDecimal.valueOf(readLong(value, pos + 2, 8), scale); + result = BigDecimal.valueOf(readLong(value, value.position() + 2, 8), scale); break; case DECIMAL16: - checkIndex(pos + 17, value.length); + checkIndex(value.position() + 17, value.limit()); byte[] bytes = new byte[16]; // Copy the bytes reversely because the `BigInteger` constructor expects a big-endian // representation. 
for (int i = 0; i < 16; ++i) { - bytes[i] = value[pos + 17 - i]; + bytes[i] = value.get(value.position() + 17 - i); } result = new BigDecimal(new BigInteger(bytes), scale); break; @@ -554,66 +567,90 @@ public static BigDecimal getDecimalWithOriginalScale(byte[] value, int pos) { return result; } - public static BigDecimal getDecimal(byte[] value, int pos) { - return getDecimalWithOriginalScale(value, pos); + public static BigDecimal getDecimal(ByteBuffer value) { + return getDecimalWithOriginalScale(value); } - public static float getFloat(byte[] value, int pos) { - checkIndex(pos, value.length); - int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; + public static float getFloat(ByteBuffer value) { + checkIndex(value.position(), value.limit()); + int basicType = value.get(value.position()) & BASIC_TYPE_MASK; + int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != PRIMITIVE || typeInfo != FLOAT) { throw unexpectedType(Type.FLOAT); } - return Float.intBitsToFloat((int) readLong(value, pos + 1, 4)); + return Float.intBitsToFloat((int) readLong(value, value.position() + 1, 4)); } - public static byte[] getBinary(byte[] value, int pos) { - checkIndex(pos, value.length); - int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; + public static byte[] getBinary(ByteBuffer value) { + checkIndex(value.position(), value.limit()); + int basicType = value.get(value.position()) & BASIC_TYPE_MASK; + int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != PRIMITIVE || typeInfo != BINARY) { throw unexpectedType(Type.BINARY); } - int start = pos + 1 + U32_SIZE; - int length = readUnsigned(value, pos + 1, U32_SIZE); - checkIndex(start + length - 1, value.length); - return Arrays.copyOfRange(value, start, start + length); + int start = value.position() + 1 + 
U32_SIZE; + int length = readUnsigned(value, value.position() + 1, U32_SIZE); + checkIndex(start + length - 1, value.limit()); + byte[] ret = new byte[length]; + slice(value, start).get(ret); + return ret; } - public static String getString(byte[] value, int pos) { - checkIndex(pos, value.length); - int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; + public static String getString(ByteBuffer value) { + checkIndex(value.position(), value.limit()); + int basicType = value.get(value.position()) & BASIC_TYPE_MASK; + int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType == SHORT_STR || (basicType == PRIMITIVE && typeInfo == LONG_STR)) { int start; int length; if (basicType == SHORT_STR) { - start = pos + 1; + start = value.position() + 1; length = typeInfo; } else { - start = pos + 1 + U32_SIZE; - length = readUnsigned(value, pos + 1, U32_SIZE); + start = value.position() + 1 + U32_SIZE; + length = readUnsigned(value, value.position() + 1, U32_SIZE); + } + checkIndex(start + length - 1, value.limit()); + if (value.hasArray()) { + // If the buffer is backed by an array, we can use the array directly. + return new String(value.array(), value.arrayOffset() + start, length); + } else { + // If the buffer is not backed by an array, we need to copy the bytes into a new array. 
+ byte[] valueArray = new byte[length]; + slice(value, start).get(valueArray); + return new String(valueArray); } - checkIndex(start + length - 1, value.length); - return new String(value, start, length); } throw unexpectedType(Type.STRING); } - public static java.util.UUID getUUID(byte[] value, int pos) { - checkIndex(pos, value.length); - int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; + public static java.util.UUID getUUID(ByteBuffer value) { + checkIndex(value.position(), value.limit()); + int basicType = value.get(value.position()) & BASIC_TYPE_MASK; + int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != PRIMITIVE || typeInfo != UUID) { throw unexpectedType(Type.UUID); } - int start = pos + 1; - checkIndex(start + UUID_SIZE - 1, value.length); - ByteBuffer bb = ByteBuffer.wrap(value, start, UUID_SIZE).order(ByteOrder.BIG_ENDIAN); + int start = value.position() + 1; + checkIndex(start + UUID_SIZE - 1, value.limit()); + ByteBuffer bb = VariantUtil.slice(value, start).order(ByteOrder.BIG_ENDIAN); return new java.util.UUID(bb.getLong(), bb.getLong()); } + /** + * Slices the `value` buffer starting from `start` index. + * @param value The ByteBuffer to slice + * @param start The starting index of the slice + * @return The sliced ByteBuffer + */ + public static ByteBuffer slice(ByteBuffer value, int start) { + int oldPos = value.position(); + value.position(start); + ByteBuffer newSlice = value.slice(); + value.position(oldPos); + return newSlice; + } + /** * A helper class representing the details of a Variant object, used for `ObjectHandler`. */ @@ -650,10 +687,10 @@ public ObjectInfo( /** * Parses the object at `value[pos...]`, and returns the object details. 
*/ - public static ObjectInfo getObjectInfo(byte[] value, int pos) { - checkIndex(pos, value.length); - int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; + public static ObjectInfo getObjectInfo(ByteBuffer value) { + checkIndex(value.position(), value.limit()); + int basicType = value.get(value.position()) & BASIC_TYPE_MASK; + int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != OBJECT) { throw unexpectedType(Type.OBJECT); } @@ -662,12 +699,11 @@ public static ObjectInfo getObjectInfo(byte[] value, int pos) { // b4 to determine whether the object uses a 1/4-byte size. boolean largeSize = ((typeInfo >> 4) & 0x1) != 0; int sizeBytes = (largeSize ? U32_SIZE : 1); - int numElements = readUnsigned(value, pos + 1, sizeBytes); + int numElements = readUnsigned(value, value.position() + 1, sizeBytes); // Extracts b3b2 to determine the integer size of the field id list. int idSize = ((typeInfo >> 2) & 0x3) + 1; // Extracts b1b0 to determine the integer size of the offset list. int offsetSize = (typeInfo & 0x3) + 1; - // int idStart = pos + 1 + sizeBytes; int idStartOffset = 1 + sizeBytes; int offsetStartOffset = idStartOffset + numElements * idSize; int dataStartOffset = offsetStartOffset + (numElements + 1) * offsetSize; @@ -698,10 +734,10 @@ public ArrayInfo(int numElements, int offsetSize, int offsetStartOffset, int dat /** * Parses the array at `value[pos...]`, and returns the array details. 
*/ - public static ArrayInfo getArrayInfo(byte[] value, int pos) { - checkIndex(pos, value.length); - int basicType = value[pos] & BASIC_TYPE_MASK; - int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; + public static ArrayInfo getArrayInfo(ByteBuffer value) { + checkIndex(value.position(), value.limit()); + int basicType = value.get(value.position()) & BASIC_TYPE_MASK; + int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; if (basicType != ARRAY) { throw unexpectedType(Type.ARRAY); } @@ -710,7 +746,7 @@ public static ArrayInfo getArrayInfo(byte[] value, int pos) { // b2 to determine whether the object uses a 1/4-byte size. boolean largeSize = ((typeInfo >> 2) & 0x1) != 0; int sizeBytes = (largeSize ? U32_SIZE : 1); - int numElements = readUnsigned(value, pos + 1, sizeBytes); + int numElements = readUnsigned(value, value.position() + 1, sizeBytes); // Extracts b1b0 to determine the integer size of the offset list. int offsetSize = (typeInfo & 0x3) + 1; int offsetStartOffset = 1 + sizeBytes; @@ -722,24 +758,22 @@ public static ArrayInfo getArrayInfo(byte[] value, int pos) { * Returns a key at `id` in the Variant metadata. * * @param metadata The Variant metadata - * @param metadataPos the position of the metadata in the byte array * @param id The key id * @return The key * @throws MalformedVariantException if the Variant is malformed * @throws IllegalArgumentException the id is out of bounds */ - public static String getMetadataKey(byte[] metadata, int metadataPos, int id) { - checkIndex(metadataPos, metadata.length); + public static String getMetadataKey(ByteBuffer metadata, int id) { // Extracts the highest 2 bits in the metadata header to determine the integer size of the // offset list. 
- int offsetSize = ((metadata[metadataPos] >> 6) & 0x3) + 1; - int dictSize = readUnsigned(metadata, metadataPos + 1, offsetSize); + int offsetSize = ((metadata.get(metadata.position()) >> 6) & 0x3) + 1; + int dictSize = readUnsigned(metadata, metadata.position() + 1, offsetSize); if (id >= dictSize) { throw new IllegalArgumentException( String.format("Invalid dictionary id: %d. dictionary size: %d", id, dictSize)); } // The offset list after the header byte, and a `dictSize` with `offsetSize` bytes. - int offsetListPos = metadataPos + 1 + offsetSize; + int offsetListPos = metadata.position() + 1 + offsetSize; // The data starts after the offset list, and `(dictSize + 1)` offset values. int dataPos = offsetListPos + (dictSize + 1) * offsetSize; int offset = readUnsigned(metadata, offsetListPos + (id) * offsetSize, offsetSize); @@ -748,7 +782,14 @@ public static String getMetadataKey(byte[] metadata, int metadataPos, int id) { throw new MalformedVariantException( String.format("Invalid offset: %d. next offset: %d", offset, nextOffset)); } - checkIndex(dataPos + nextOffset - 1, metadata.length); - return new String(metadata, dataPos + offset, nextOffset - offset); + checkIndex(dataPos + nextOffset - 1, metadata.limit()); + if (metadata.hasArray()) { + return new String(metadata.array(), metadata.arrayOffset() + dataPos + offset, nextOffset - offset); + } else { + // ByteBuffer does not have an array, so we need to use the `get` method to read the bytes. 
+ byte[] metadataArray = new byte[nextOffset - offset]; + slice(metadata, dataPos + offset).get(metadataArray); + return new String(metadataArray); + } } } diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java index 1c7543d964..fbeac8e24f 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java @@ -85,7 +85,7 @@ private void checkJson(String jsonValue) { } private void checkType(Variant v, int expectedBasicType, VariantUtil.Type expectedType) { - Assert.assertEquals(expectedBasicType, v.value[v.valuePos] & VariantUtil.BASIC_TYPE_MASK); + Assert.assertEquals(expectedBasicType, v.value.get(v.value.position()) & VariantUtil.BASIC_TYPE_MASK); Assert.assertEquals(expectedType, v.getType()); } @@ -108,13 +108,19 @@ private String randomString(int len) { private void testVariant(Variant v, Consumer consumer) { consumer.accept(v); // Create new Variant with different byte offsets - byte[] newValue = new byte[v.value.length + 50]; - byte[] newMetadata = new byte[v.metadata.length + 50]; + byte[] newValue = new byte[v.value.capacity() + 50]; + byte[] newMetadata = new byte[v.metadata.capacity() + 50]; Arrays.fill(newValue, (byte) 0xFF); Arrays.fill(newMetadata, (byte) 0xFF); - System.arraycopy(v.value, 0, newValue, 25, v.value.length); - System.arraycopy(v.metadata, 0, newMetadata, 25, v.metadata.length); - Variant v2 = new Variant(newValue, 25 + v.valuePos, newMetadata, 25 + v.metadataPos); + v.value.position(0); + v.value.get(newValue, 25, v.value.capacity()); + v.value.position(0); + v.metadata.position(0); + v.metadata.get(newMetadata, 25, v.metadata.capacity()); + v.metadata.position(0); + Variant v2 = new Variant( + ByteBuffer.wrap(newValue, 25, v.value.capacity()), + ByteBuffer.wrap(newMetadata, 25, 
v.metadata.capacity())); consumer.accept(v2); }