diff --git a/src/Database/Database.php b/src/Database/Database.php index ba66fbc82..b5ac3cdc9 100644 --- a/src/Database/Database.php +++ b/src/Database/Database.php @@ -412,6 +412,11 @@ private function shouldUseRelationshipBulkWrites(): bool */ protected array $relationshipDeleteStack = []; + /** + * Reusable instance of single-pass DocumentProcessor + */ + private ?DocumentProcessor $singlePassProcessor = null; + private bool $adapterFiltersRegistered = false; /** * @param Adapter $adapter * @param Cache $cache @@ -3552,8 +3557,19 @@ public function getDocument(string $collection, string $id, array $queries = [], } } - $document = $this->casting($collection, $document); - $document = $this->decode($collection, $document, $selections); + if ($this->shouldUseSinglePassProcessor() && $this->canUseSinglePass($collection)) { + $this->singlePassProcessor ??= new DocumentProcessor(); + $document = $this->singlePassProcessor->processRead( + $collection, + $document, + fn (string $k) => $this->adapter->filter($k), + $selections, + $this->adapter->getSupportForCasting() + ); + } else { + $document = $this->casting($collection, $document); + $document = $this->decode($collection, $document, $selections); + } // Skip relationship population if we're in batch mode (relationships will be populated later) if (!$this->inBatchRelationshipPopulation && $this->resolveRelationships && !empty($relationships) && (empty($selects) || !empty($nestedSelections))) { @@ -7231,15 +7247,27 @@ public function find(string $collection, array $queries = [], string $forPermiss } } - foreach ($results as $index => $node) { - $node = $this->casting($collection, $node); - $node = $this->decode($collection, $node, $selections); + if ($this->shouldUseSinglePassProcessor() && $this->canUseSinglePass($collection)) { + $this->singlePassProcessor ??= new DocumentProcessor(); + $results = $this->singlePassProcessor->processReadBatch( + $collection, + $results, + fn (string $k) => $this->adapter->filter($k), 
+ $selections, + $this->adapter->getSupportForCasting() + ); + } else { + foreach ($results as $index => $node) { + $node = $this->casting($collection, $node); + $node = $this->decode($collection, $node, $selections); + $results[$index] = $node; + } + } + foreach ($results as $index => $node) { if (!$node->isEmpty()) { $node->setAttribute('$collection', $collection->getId()); } - - $results[$index] = $node; } $this->trigger(self::EVENT_DOCUMENT_FIND, $results); @@ -7247,6 +7275,78 @@ public function find(string $collection, array $queries = [], string $forPermiss return $results; } + private function shouldUseSinglePassProcessor(): bool + { + $val = getenv('DB_SINGLE_PASS_PROCESSOR'); + if ($val === false || $val === '') { + return false; + } + $val = strtolower((string)$val); + if (in_array($val, ['0', 'false', 'off'], true)) { + return false; + } + // Do not use single-pass when relationship resolution is disabled + if (!$this->resolveRelationships) { + return false; + } + return true; + } + + private function canUseSinglePass(Document $collection): bool + { + // Register adapter-aware filters (spatial) once so support list is complete + if (!$this->adapterFiltersRegistered) { + DocumentProcessor::registerAdapterFilters($this->adapter); + $this->adapterFiltersRegistered = true; + } + + // Safe if: no relationships AND all filters are within DocumentProcessor supported set + $supported = DocumentProcessor::getSupportedFilters(); + $instanceFilterNames = \array_keys($this->getInstanceFilters()); + + // Guard against disabled relationship handling + if (!$this->resolveRelationships) { + return false; + } + + // Respect runtime filter flags to preserve decode semantics + if ($this->filter === false) { + return false; + } + + if (!empty($this->disabledFilters)) { + return false; + } + + $attributes = $collection->getAttribute('attributes', []); + foreach ($attributes as $attr) { + $filters = $attr['filters'] ?? 
[]; + foreach ($filters as $filter) { + if (!in_array($filter, $supported, true)) { + return false; + } + // If an instance filter overrides behavior, skip single-pass to keep parity + if (in_array($filter, $instanceFilterNames, true)) { + return false; + } + } + } + + // Internal attributes allowed if within supported filters + foreach (Database::INTERNAL_ATTRIBUTES as $internal) { + $filters = $internal['filters']; + foreach ($filters as $filter) { + if (!in_array($filter, $supported, true)) { + return false; + } + if (in_array($filter, $instanceFilterNames, true)) { + return false; + } + } + } + return true; + } + /** * Call callback for each document of the given collection * that matches the given queries diff --git a/src/Database/DocumentProcessor.php b/src/Database/DocumentProcessor.php new file mode 100644 index 000000000..5185022d3 --- /dev/null +++ b/src/Database/DocumentProcessor.php @@ -0,0 +1,553 @@ + + */ + private static array $filters = []; + + /** + * Guard to ensure we only register filters once. + */ + private static bool $initialized = false; + + public function __construct() + { + self::ensureInitialized(); + } + + private static function ensureInitialized(): void + { + if (self::$initialized) { + return; + } + // Register standard filters (matching Database class filters semantics used on read) + // Note: simdjson provides ~10-15% improvement on JSON parsing but throws exceptions + // For production use with controlled JSON, consider json_decode for better compatibility + self::$filters["json"] = [ + "decode" => function (mixed $value) { + if (!is_string($value)) { + return $value; + } + + // Use standard json_decode for reliability + // (simdjson is faster but has compatibility issues with some edge cases) + $value = json_decode($value, true) ?? 
[]; + + if (is_array($value) && array_key_exists('$id', $value)) { + return new Document($value); + } + if (is_array($value)) { + // Manual loop faster than array_map for small arrays + foreach ($value as $i => $item) { + if (is_array($item) && array_key_exists('$id', $item)) { + $value[$i] = new Document($item); + } + } + } + return $value; + }, + ]; + + self::$filters["datetime"] = [ + "decode" => function (?string $value) { + return DateTime::formatTz($value); + }, + ]; + + self::$initialized = true; + } + + /** + * Expose supported filter names for gating logic. + * + * @return array + */ + public static function getSupportedFilters(): array + { + self::ensureInitialized(); + return array_keys(self::$filters); + } + + /** + * Register adapter-aware decoders (spatial types) for single-pass processing. + * Safe to call multiple times; overwrites existing entries. + */ + public static function registerAdapterFilters(Adapter $adapter): void + { + self::ensureInitialized(); + + self::$filters[Database::VAR_POINT] = [ + 'decode' => function (?string $value) use ($adapter) { + if ($value === null) { + return null; + } + return $adapter->decodePoint($value); + }, + ]; + + self::$filters[Database::VAR_LINESTRING] = [ + 'decode' => function (?string $value) use ($adapter) { + if ($value === null) { + return null; + } + return $adapter->decodeLinestring($value); + }, + ]; + + self::$filters[Database::VAR_POLYGON] = [ + 'decode' => function (?string $value) use ($adapter) { + if ($value === null) { + return null; + } + return $adapter->decodePolygon($value); + }, + ]; + } + + /** + * Process document for read (decode + casting) in a single pass. 
+ * + * @param Document $collection + * @param Document $document + * @param callable|null $keyMapper + * @param array $selections + * @param bool $skipCasting + * @return Document + */ + public function processRead( + Document $collection, + Document $document, + ?callable $keyMapper = null, + array $selections = [], + bool $skipCasting = false + ): Document { + $attributes = $collection->getAttribute("attributes", []); + + // Pre-normalize relationship keys like Database::decode + $relationships = \array_filter( + $attributes, + fn ($attribute) => ($attribute['type'] ?? '') === Database::VAR_RELATIONSHIP + ); + if (!empty($relationships) && $keyMapper !== null) { + foreach ($relationships as $relationship) { + $rKey = $relationship['$id'] ?? ''; + if ($rKey === '') { + continue; + } + $mapped = $keyMapper($rKey); + $hasOriginal = \array_key_exists($rKey, (array)$document); + $hasMapped = is_string($mapped) && \array_key_exists($mapped, (array)$document); + if ($hasOriginal || $hasMapped) { + $value = $document->getAttribute($rKey); + if ($value === null && $hasMapped) { + $value = $document->getAttribute($mapped); + } + if ($hasMapped) { + $document->removeAttribute($mapped); + } + $document->setAttribute($rKey, $value); + } + } + } + + // Iterate attributes and skip relationships without creating a new array + $filteredValues = []; + foreach ($attributes as $attribute) { + if (($attribute['type'] ?? '') === Database::VAR_RELATIONSHIP) { + continue; + } + $key = $attribute['$id'] ?? ""; + $type = $attribute["type"] ?? ""; + $array = $attribute["array"] ?? false; + $filters = $attribute["filters"] ?? 
[]; + + if ($key === '$permissions') { + continue; + } + + // Prefer original key; fall back to adapter-mapped key if provided + $value = $document->getAttribute($key); + if ($value === null && $keyMapper !== null) { + $mapped = $keyMapper($key); + if (is_string($mapped) && $mapped !== $key) { + $value = $document->getAttribute($mapped); + if ($value !== null) { + $document->removeAttribute($mapped); + } + } + } + + if ($array) { + // In a single pass, if DB returns arrays as JSON strings, normalize once + if (is_string($value)) { + $decoded = json_decode($value, true); + $value = \is_array($decoded) ? $decoded : $value; + } + if (!\is_array($value)) { + $value = $value === null ? [] : [$value]; + } + + $revFilters = empty($filters) ? [] : array_reverse($filters); + foreach ($value as $i => $node) { + foreach ($revFilters as $filter) { + $node = $this->decodeAttribute($filter, $node); + } + $value[$i] = $skipCasting ? $node : $this->castNode($type, $node); + } + $filteredValues[$key] = $value; + if (empty($selections) || \in_array($key, $selections, true) || \in_array('*', $selections, true)) { + $document->setAttribute($key, $value); + } + } else { + // Apply filters for non-array values + if (!empty($filters)) { + foreach (array_reverse($filters) as $filter) { + $value = $this->decodeAttribute($filter, $value); + } + } + $final = $skipCasting ? 
$value : $this->castNode($type, $value); + $filteredValues[$key] = $final; + if (empty($selections) || \in_array($key, $selections, true) || \in_array('*', $selections, true)) { + $document->setAttribute($key, $final); + } + } + } + + // Apply internal attributes at the end to keep behavior consistent + // Note: All internal attributes have array=false, so no array handling needed + foreach (Database::INTERNAL_ATTRIBUTES as $attribute) { + $key = $attribute['$id']; + $type = $attribute["type"]; + $filters = $attribute["filters"]; + + if ($key === '$permissions') { + continue; + } + + $value = $document->getAttribute($key); + if ($value === null && $keyMapper !== null) { + $mapped = $keyMapper($key); + if (is_string($mapped) && $mapped !== $key) { + $value = $document->getAttribute($mapped); + if ($value !== null) { + $document->removeAttribute($mapped); + } + } + } + + if (!empty($filters)) { + foreach (array_reverse($filters) as $filter) { + $value = $this->decodeAttribute($filter, $value); + } + } + $final = $skipCasting ? $value : $this->castNode($type, $value); + if (empty($selections) || \in_array($key, $selections, true) || \in_array('*', $selections, true)) { + $document->setAttribute($key, $final); + } + } + + // Relationship selection semantics: if selecting relationship attributes, also include + // non-relationship attributes even if not explicitly selected. + $hasRelationshipSelections = false; + if (!empty($selections)) { + foreach ($selections as $sel) { + if (\str_contains($sel, '.')) { + $hasRelationshipSelections = true; + break; + } + } + } + if ($hasRelationshipSelections && !empty($selections) && !\in_array('*', $selections, true)) { + foreach ($collection->getAttribute('attributes', []) as $attribute) { + $key = $attribute['$id'] ?? ''; + if (($attribute['type'] ?? 
'') === Database::VAR_RELATIONSHIP || $key === '$permissions') { + continue; + } + if (!\in_array($key, $selections, true) && \array_key_exists($key, $filteredValues)) { + $document->setAttribute($key, $filteredValues[$key]); + } + } + } + + return $document; + } + + /** + * Prepare a per-collection plan for batch processing. + * + * @param Document $collection + * @param callable|null $keyMapper + * @param array $selections + * @param bool $skipCasting + * @return array{ + * relationships: array, + * attrs: array, selected:bool}>, + * internals: array, selected:bool}>, + * skipCasting: bool, + * hasRelSelects: bool + * } + */ + private function preparePlan(Document $collection, ?callable $keyMapper, array $selections, bool $skipCasting): array + { + $attributes = $collection->getAttribute('attributes', []); + + $relationships = []; + $attrs = []; + foreach ($attributes as $attr) { + $type = $attr['type'] ?? ''; + $key = $attr['$id'] ?? ''; + if ($type === Database::VAR_RELATIONSHIP) { + $mapped = ($keyMapper !== null) ? $keyMapper($key) : null; + $relationships[] = [ + 'key' => $key, + 'mapped' => (is_string($mapped) && $mapped !== $key) ? $mapped : null, + ]; + continue; + } + $mapped = ($keyMapper !== null) ? $keyMapper($key) : null; + $attrs[] = [ + 'key' => $key, + 'mapped' => (is_string($mapped) && $mapped !== $key) ? $mapped : null, + 'type' => $type, + 'array' => (bool)($attr['array'] ?? false), + 'filters' => array_reverse($attr['filters'] ?? 
[]), + 'selected' => empty($selections) || in_array($key, $selections, true) || in_array('*', $selections, true), + ]; + } + + $internals = []; + foreach (Database::INTERNAL_ATTRIBUTES as $attr) { + $key = $attr['$id']; + if ($key === '$permissions') { + continue; + } + $internals[] = [ + 'key' => $key, + 'type' => $attr['type'], + 'array' => (bool)$attr['array'], + 'filters' => array_reverse($attr['filters']), + 'selected' => empty($selections) || in_array($key, $selections, true) || in_array('*', $selections, true), + ]; + } + + // Detect relationship selections + $hasRelationshipSelections = false; + if (!empty($selections)) { + foreach ($selections as $sel) { + if (\str_contains($sel, '.')) { + $hasRelationshipSelections = true; + break; + } + } + } + + return [ + 'relationships' => $relationships, + 'attrs' => $attrs, + 'internals' => $internals, + 'skipCasting' => $skipCasting, + 'hasRelSelects' => $hasRelationshipSelections && !empty($selections) && !\in_array('*', $selections, true), + ]; + } + + /** + * Batch version of processRead preserving parity semantics. + * + * @param Document $collection + * @param array $documents + * @param callable|null $keyMapper + * @param array $selections + * @param bool $skipCasting + * @return array + */ + public function processReadBatch( + Document $collection, + array $documents, + ?callable $keyMapper = null, + array $selections = [], + bool $skipCasting = false + ): array { + if (empty($documents)) { + return $documents; + } + + $plan = $this->preparePlan($collection, $keyMapper, $selections, $skipCasting); + + foreach ($documents as $idx => $document) { + if (!$document instanceof Document) { + continue; + } + + // Relationship key normalization + if (!empty($plan['relationships'])) { + foreach ($plan['relationships'] as $rel) { + $key = $rel['key']; + $mapped = $rel['mapped'] ?? 
null; + $hasOriginal = array_key_exists($key, (array)$document); + $hasMapped = $mapped && array_key_exists($mapped, (array)$document); + if ($hasOriginal || $hasMapped) { + $value = $document->getAttribute($key); + if ($value === null && $hasMapped) { + $value = $document->getAttribute($mapped); + } + if ($hasMapped) { + $document->removeAttribute($mapped); + } + $document->setAttribute($key, $value); + } + } + } + + // Regular attributes + $filteredValues = []; + foreach ($plan['attrs'] as $a) { + $key = $a['key']; + if ($key === '$permissions') { + continue; + } + $value = $document->getAttribute($key); + if ($value === null && !empty($a['mapped'])) { + $value = $document->getAttribute($a['mapped']); + if ($value !== null) { + $document->removeAttribute($a['mapped']); + } + } + + if ($a['array']) { + if (is_string($value)) { + $decoded = json_decode($value, true); + $value = is_array($decoded) ? $decoded : $value; + } + if (!is_array($value)) { + $value = $value === null ? [] : [$value]; + } + foreach ($value as $i => $node) { + foreach ($a['filters'] as $filter) { + $node = $this->decodeAttribute($filter, $node); + } + $value[$i] = $plan['skipCasting'] ? $node : $this->castNode($a['type'], $node); + } + $filteredValues[$key] = $value; + if ($a['selected']) { + $document->setAttribute($key, $value); + } + } else { + foreach ($a['filters'] as $filter) { + $value = $this->decodeAttribute($filter, $value); + } + $final = $plan['skipCasting'] ? $value : $this->castNode($a['type'], $value); + $filteredValues[$key] = $final; + if ($a['selected']) { + $document->setAttribute($key, $final); + } + } + } + + // Internal attributes + foreach ($plan['internals'] as $a) { + $key = $a['key']; + $value = $document->getAttribute($key); + + if ($a['array']) { + if (is_string($value)) { + $decoded = json_decode($value, true); + $value = is_array($decoded) ? $decoded : $value; + } + if (!is_array($value)) { + $value = $value === null ? 
[] : [$value]; + } + foreach ($value as $i => $node) { + foreach ($a['filters'] as $filter) { + $node = $this->decodeAttribute($filter, $node); + } + $value[$i] = $plan['skipCasting'] ? $node : $this->castNode($a['type'], $node); + } + if ($a['selected']) { + $document->setAttribute($key, $value); + } + } else { + foreach ($a['filters'] as $filter) { + $value = $this->decodeAttribute($filter, $value); + } + $final = $plan['skipCasting'] ? $value : $this->castNode($a['type'], $value); + if ($a['selected']) { + $document->setAttribute($key, $final); + } + } + } + + // Relationship selection semantic adjustment + if ($plan['hasRelSelects']) { + foreach ($plan['attrs'] as $a) { + if ($a['selected']) { + continue; + } + $key = $a['key']; + if (\array_key_exists($key, $filteredValues)) { + $document->setAttribute($key, $filteredValues[$key]); + } + } + } + + $documents[$idx] = $document; + } + + return $documents; + } + + /** + * Apply a decode filter to a value + * + * @param string $filter + * @param mixed $value + * @return mixed + */ + protected function decodeAttribute(string $filter, mixed $value): mixed + { + if (!array_key_exists($filter, self::$filters)) { + return $value; // Unknown filter, pass through + } + return self::$filters[$filter]["decode"]($value); + } + + private function castNode(string $type, mixed $node): mixed + { + // Preserve null values like legacy decode does + if ($node === null) { + return null; + } + + switch ($type) { + case Database::VAR_ID: + return (string) $node; + case Database::VAR_BOOLEAN: + return (bool) $node; + case Database::VAR_INTEGER: + return (int) $node; + case Database::VAR_FLOAT: + return (float) $node; + default: + return $node; + } + } + + /** + * Add a custom filter + * + * @param string $name + * @param callable $decode + * @return void + */ + public static function addFilter(string $name, callable $decode): void + { + self::$filters[$name] = ["decode" => $decode]; + } +} diff --git 
a/tests/benchmarking/document_processor_benchmark.php b/tests/benchmarking/document_processor_benchmark.php new file mode 100755 index 000000000..81315e749 --- /dev/null +++ b/tests/benchmarking/document_processor_benchmark.php @@ -0,0 +1,476 @@ + ["docs" => 1000, "arrays" => 1, "array_size" => 10], + "MEDIUM" => ["docs" => 5000, "arrays" => 1, "array_size" => 10], + "HEAVY" => ["docs" => 10000, "arrays" => 2, "array_size" => 20], + // Spatial-heavy scenario: adds spatial attributes with decode filters + "SPATIAL" => ["docs" => 5000, "arrays" => 1, "array_size" => 10, "spatial" => true], +]; + +if (!isset($levels[$level])) { + fwrite(STDERR, "Invalid level: {$level}\n"); + exit(1); +} + +$cfg = $levels[$level]; +$docs = $cfg["docs"]; +$arraySize = $cfg["array_size"]; + +// Build a realistic collection schema with filters (optionally spatial) +function buildCollection(bool $spatial = false): Document +{ + $attributes = []; + for ($i = 1; $i <= 3; $i++) { + $attributes[] = ['$id' => "s{$i}", "type" => Database::VAR_STRING, "array" => false, "filters" => []]; + } + $attributes[] = ['$id' => "jsonData", "type" => Database::VAR_STRING, "array" => false, "filters" => ["json"]]; + $attributes[] = ['$id' => "jsonArray", "type" => Database::VAR_STRING, "array" => true, "filters" => ["json"]]; + for ($i = 1; $i <= 2; $i++) { + $attributes[] = ['$id' => "n{$i}", "type" => Database::VAR_INTEGER, "array" => false, "filters" => []]; + } + $attributes[] = ['$id' => "b1", "type" => Database::VAR_BOOLEAN, "array" => false, "filters" => []]; + $attributes[] = ['$id' => "d1", "type" => Database::VAR_DATETIME, "array" => false, "filters" => ["datetime"]]; + $attributes[] = ['$id' => "d2", "type" => Database::VAR_DATETIME, "array" => false, "filters" => ["datetime"]]; + $attributes[] = ['$id' => "arr", "type" => Database::VAR_STRING, "array" => true, "filters" => []]; + + if ($spatial) { + $attributes[] = ['$id' => 'p1', 'type' => Database::VAR_POINT, 'array' => false, 'filters' => 
[Database::VAR_POINT]]; + $attributes[] = ['$id' => 'ls1', 'type' => Database::VAR_LINESTRING, 'array' => false, 'filters' => [Database::VAR_LINESTRING]]; + $attributes[] = ['$id' => 'pg1', 'type' => Database::VAR_POLYGON, 'array' => false, 'filters' => [Database::VAR_POLYGON]]; + } + + return new Document(["attributes" => $attributes]); +} + +function makeDoc(int $i, int $arraySize, bool $spatial = false): Document +{ + $d = new Document([ + '$id' => "doc{$i}", + "s1" => "alpha{$i}", + "s2" => "beta{$i}", + "s3" => "gamma{$i}", + "jsonData" => ["nested" => "data", "count" => $i], + "jsonArray" => [["id" => 1], ["id" => 2]], + "n1" => $i, + "n2" => $i * 2, + "b1" => $i % 2 === 0, + "d1" => "2024-01-15 10:30:00", + "d2" => "2024-01-15 15:45:30", + "arr" => array_map(fn ($k) => "it{$k}", range(1, $arraySize)), + ]); + + if ($spatial) { + // Encode spatial as JSON strings to simulate adapter-encoded values + $lon = ($i % 180) - 90; + $lat = (($i * 2) % 180) - 90; + $d->setAttribute('p1', json_encode(['type' => 'Point', 'coordinates' => [$lon, $lat]])); + $d->setAttribute('ls1', json_encode(['type' => 'LineString', 'coordinates' => [[$lon, $lat], [$lon + 1, $lat + 1], [$lon + 2, $lat + 2]]])); + $d->setAttribute('pg1', json_encode(['type' => 'Polygon', 'coordinates' => [[[$lon, $lat], [$lon + 1, $lat], [$lon + 1, $lat + 1], [$lon, $lat + 1], [$lon, $lat]]]])); + } + + return $d; +} + +$collection = buildCollection((bool)($cfg['spatial'] ?? false)); + +/** + * @return array + */ +function measure(callable $fn, int $repeat = 1, int $warmup = 0): array +{ + for ($w = 0; $w < $warmup; $w++) { + $fn(); + } + $times = []; + for ($r = 0; $r < $repeat; $r++) { + $start = microtime(true); + $fn(); + $times[] = (microtime(true) - $start) * 1000; + } + sort($times); + return $times; // sorted ascending +} + +// Baseline and optimized functions +$baseline = new BaselineProcessor(); +$processor = new DocumentProcessorWithFilters(); + +$spatialEnabled = (bool)($cfg['spatial'] ?? 
false); + +$baselineTimes = measure(function () use ($baseline, $collection, $docs, $arraySize, $spatialEnabled) { + for ($i = 1; $i <= $docs; $i++) { + $doc = makeDoc($i, $arraySize, $spatialEnabled); + $doc = $baseline->decodeBaseline($collection, $doc); + $doc = $baseline->castingBaseline($collection, $doc); + } +}, $repeat, $warmup); + +$optimizedTimes = measure(function () use ($processor, $collection, $docs, $arraySize, $spatialEnabled) { + for ($i = 1; $i <= $docs; $i++) { + $doc = makeDoc($i, $arraySize, $spatialEnabled); + $doc = $processor->processRead($collection, $doc); + } +}, $repeat, $warmup); + +$baselineMs = (int) round($baselineTimes[(int) floor((count($baselineTimes) - 1) / 2)]); +$optMs = (int) round($optimizedTimes[(int) floor((count($optimizedTimes) - 1) / 2)]); + +$gain = $baselineMs > 0 ? (($baselineMs - $optMs) / $baselineMs) * 100 : 0; + +echo "\nDocument Processor Benchmark - {$level} (WITH FILTERS)\n"; +echo "+---------+----------+----------+--------+\n"; +echo "| Metric | Baseline | Optimized| Gain |\n"; +echo "+---------+----------+----------+--------+\n"; +printf("| %-7s | %8d | %8d | %6.1f%% |\n", "time", (int) $baselineMs, (int) $optMs, $gain); +if ($repeat > 1) { + echo "(median of {$repeat} runs, warmup={$warmup})\n"; +} +echo "\n"; + +// Optional parity assert mode +if ($assertParity) { + $checks = min($docs, 1000); + for ($i = 1; $i <= $checks; $i++) { + $docA = makeDoc($i, $arraySize, $spatialEnabled); + $base = $baseline->decodeBaseline($collection, clone $docA); + $base = $baseline->castingBaseline($collection, $base); + + $docB = makeDoc($i, $arraySize, $spatialEnabled); + $opt = $processor->processRead($collection, $docB); + + $a = $base->getArrayCopy(); + $b = $opt->getArrayCopy(); + if ($a != $b) { + fwrite(STDERR, "Parity mismatch on doc {$i}\n"); + // Find first differing key + foreach ($a as $k => $v) { + $va = $a[$k] ?? null; + $vb = $b[$k] ?? 
null; + if ($va != $vb) { + fwrite(STDERR, " - Attribute '{$k}' differs\n"); + break; + } + } + exit(1); + } + } + echo "Parity assertion passed on {$checks} docs.\n\n"; +} +echo "+---------+----------+----------+--------+\n\n"; + +/** + * DocumentProcessor with proper filter support for fair comparison + */ +class DocumentProcessorWithFilters +{ + /** + * @var array + */ + private static array $filters = []; + + public function __construct() + { + // Register the same filters as Database class + self::$filters["json"] = [ + "decode" => function (mixed $value) { + if (!is_string($value)) { + return $value; + } + $value = json_decode($value, true) ?? []; + if (array_key_exists('$id', $value)) { + return new Document($value); + } else { + $value = array_map(function ($item) { + if (is_array($item) && array_key_exists('$id', $item)) { + return new Document($item); + } + return $item; + }, $value); + } + return $value; + }, + ]; + + self::$filters["datetime"] = [ + "decode" => function (?string $value) { + return DateTime::formatTz($value); + }, + ]; + + // Spatial-like decoders for benchmark (decode JSON strings) + self::$filters[Database::VAR_POINT] = [ + 'decode' => function (?string $value) { + return is_string($value) ? (json_decode($value, true) ?? $value) : $value; + }, + ]; + self::$filters[Database::VAR_LINESTRING] = [ + 'decode' => function (?string $value) { + return is_string($value) ? (json_decode($value, true) ?? $value) : $value; + }, + ]; + self::$filters[Database::VAR_POLYGON] = [ + 'decode' => function (?string $value) { + return is_string($value) ? (json_decode($value, true) ?? 
$value) : $value; + }, + ]; + } + + public function processRead(Document $collection, Document $document): Document + { + $attributes = \array_filter( + $collection->getAttribute("attributes", []), + fn ($attribute) => $attribute["type"] !== Database::VAR_RELATIONSHIP, + ); + + foreach (Database::INTERNAL_ATTRIBUTES as $attribute) { + $attributes[] = $attribute; + } + + foreach ($attributes as $attribute) { + $key = $attribute['$id'] ?? ""; + $type = $attribute["type"] ?? ""; + $array = $attribute["array"] ?? false; + $filters = $attribute["filters"] ?? []; + + if ($key === '$permissions') { + continue; + } + + $value = $document->getAttribute($key); + + if ($array) { + if (is_string($value)) { + $decoded = json_decode($value, true); + $value = \is_array($decoded) ? $decoded : $value; + } + if (!\is_array($value)) { + $value = $value === null ? [] : [$value]; + } + + foreach ($value as $i => $node) { + // Apply filters in reverse order like Database::decode + foreach (array_reverse($filters) as $filter) { + $node = $this->decodeAttribute($filter, $node); + } + $value[$i] = $this->castNode($type, $node); + } + $document->setAttribute($key, $value); + } else { + // Apply filters + foreach (array_reverse($filters) as $filter) { + $value = $this->decodeAttribute($filter, $value); + } + $document->setAttribute($key, $this->castNode($type, $value)); + } + } + + return $document; + } + + protected function decodeAttribute(string $filter, mixed $value): mixed + { + if (!array_key_exists($filter, self::$filters)) { + return $value; // Unknown filter, pass through + } + return self::$filters[$filter]["decode"]($value); + } + + private function castNode(string $type, mixed $node): mixed + { + switch ($type) { + case Database::VAR_ID: + return (string) $node; + case Database::VAR_BOOLEAN: + return (bool) $node; + case Database::VAR_INTEGER: + return (int) $node; + case Database::VAR_FLOAT: + return (float) $node; + default: + return $node; + } + } +} + +/** + * Baseline 
/**
 * Baseline processor for the benchmark.
 *
 * Performs the classic two-step read path (type casting, then filter decoding)
 * with real filter handling, so that timings can be compared fairly against
 * the processor under test.
 */
class BaselineProcessor
{
    /**
     * Decode callbacks keyed by filter name; shared by every instance.
     *
     * @var array<string, array{decode: callable}>
     */
    private static array $filters = [];

    /**
     * Registers the decode filters used by the baseline.
     *
     * The registry is static, so it is populated only by the first instance.
     */
    public function __construct()
    {
        if (self::$filters !== []) {
            return;
        }

        // Register the same filters as the Database class.
        self::$filters['json'] = [
            'decode' => static function (mixed $value): mixed {
                if (!\is_string($value)) {
                    return $value;
                }

                $value = \json_decode($value, true) ?? [];

                // Scalar JSON ("42", "true", ...) has no keys to inspect; the
                // array helpers below would raise a TypeError on it.
                if (!\is_array($value)) {
                    return $value;
                }

                if (\array_key_exists('$id', $value)) {
                    return new Document($value);
                }

                // Promote nested documents (arrays carrying an '$id') one level down.
                return \array_map(
                    static function ($item) {
                        if (\is_array($item) && \array_key_exists('$id', $item)) {
                            return new Document($item);
                        }
                        return $item;
                    },
                    $value
                );
            },
        ];

        self::$filters['datetime'] = [
            'decode' => static function (?string $value) {
                return DateTime::formatTz($value);
            },
        ];

        // Spatial-like decoders for the benchmark: JSON strings are decoded and
        // every other value passes through unchanged. The parameter is `mixed`
        // (not `?string`) so an already-decoded array reaches the is_string()
        // guard instead of failing the type check.
        $decodeSpatial = static function (mixed $value): mixed {
            return \is_string($value) ? (\json_decode($value, true) ?? $value) : $value;
        };
        foreach ([Database::VAR_POINT, Database::VAR_LINESTRING, Database::VAR_POLYGON] as $spatialType) {
            self::$filters[$spatialType] = ['decode' => $decodeSpatial];
        }
    }

    /**
     * Applies each attribute's filters (in reverse order) to the document.
     *
     * Relationship attributes are skipped and '$permissions' is left untouched.
     * Non-array attributes are wrapped in a one-element list while filters run
     * and unwrapped again before being written back.
     *
     * @param Document $collection Collection definition providing 'attributes'
     * @param Document $document   Document to decode; modified in place
     * @return Document The same document instance
     */
    public function decodeBaseline(Document $collection, Document $document): Document
    {
        $attributes = \array_filter(
            $collection->getAttribute('attributes', []),
            static fn ($attribute) => ($attribute['type'] ?? '') !== Database::VAR_RELATIONSHIP,
        );
        foreach (Database::INTERNAL_ATTRIBUTES as $attribute) {
            $attributes[] = $attribute;
        }

        foreach ($attributes as $attribute) {
            $key = $attribute['$id'] ?? '';
            $array = $attribute['array'] ?? false;
            $filters = $attribute['filters'] ?? [];

            if ($key === '$permissions') {
                continue;
            }

            $value = $document->getAttribute($key);
            $value = $array ? $value : [$value];
            $value = \is_null($value) ? [] : $value;

            // Filters are undone in the reverse of the order they are declared.
            $reversed = \array_reverse($filters);
            foreach ($value as $index => $node) {
                foreach ($reversed as $filter) {
                    $node = $this->decodeAttribute($filter, $node);
                }
                $value[$index] = $node;
            }

            $document->setAttribute($key, $array ? $value : $value[0]);
        }

        return $document;
    }

    /**
     * Casts attribute values to the PHP type implied by the attribute type.
     *
     * NULL (or missing) values are left untouched rather than being coerced to
     * 0 / false / '' / [], so "no value" stays distinguishable from a real one.
     * Array attributes stored as a JSON string are decoded first; a lone
     * non-array value is wrapped into a one-element list.
     *
     * @param Document $collection Collection definition providing 'attributes'
     * @param Document $document   Document to cast; modified in place
     * @return Document The same document instance
     */
    public function castingBaseline(Document $collection, Document $document): Document
    {
        $attributes = $collection->getAttribute('attributes', []);
        foreach (Database::INTERNAL_ATTRIBUTES as $attribute) {
            $attributes[] = $attribute;
        }

        foreach ($attributes as $attribute) {
            $key = $attribute['$id'] ?? '';
            $type = $attribute['type'] ?? '';
            $array = $attribute['array'] ?? false;

            if ($key === '$permissions') {
                continue;
            }

            $value = $document->getAttribute($key);
            if ($value === null) {
                continue;
            }

            if (!$array) {
                $document->setAttribute($key, $this->castNode($type, $value));
                continue;
            }

            if (\is_string($value)) {
                $decoded = \json_decode($value, true);
                $value = \is_array($decoded) ? $decoded : $value;
            }
            if (!\is_array($value)) {
                $value = [$value];
            }
            foreach ($value as $i => $node) {
                $value[$i] = $this->castNode($type, $node);
            }
            $document->setAttribute($key, $value);
        }

        return $document;
    }

    /**
     * Runs a single registered decode filter over a value.
     *
     * @param string $filter Filter name
     * @param mixed  $value  Value to decode
     * @return mixed Decoded value, or the input unchanged for an unknown filter
     */
    protected function decodeAttribute(string $filter, mixed $value): mixed
    {
        if (!\array_key_exists($filter, self::$filters)) {
            return $value; // Unknown filter, pass through
        }

        return self::$filters[$filter]['decode']($value);
    }

    /**
     * Casts one scalar node according to the attribute type; other types pass through.
     */
    private function castNode(string $type, mixed $node): mixed
    {
        return match ($type) {
            Database::VAR_ID => (string) $node,
            Database::VAR_BOOLEAN => (bool) $node,
            Database::VAR_INTEGER => (int) $node,
            Database::VAR_FLOAT => (float) $node,
            default => $node,
        };
    }
}
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n" diff --git a/tests/unit/DocumentProcessorTest.php b/tests/unit/DocumentProcessorTest.php new file mode 100644 index 000000000..e3e2d316b --- /dev/null +++ b/tests/unit/DocumentProcessorTest.php @@ -0,0 +1,367 @@ +collection = new Document([ + '$id' => 'test_collection', + 'attributes' => [ + [ + '$id' => 'name', + 'type' => Database::VAR_STRING, + 'size' => 255, + 'required' => false, + 'signed' => true, + 'array' => false, + 'filters' => [], + ], + [ + '$id' => 'age', + 'type' => Database::VAR_INTEGER, + 'size' => 0, + 'required' => false, + 'signed' => true, + 'array' => false, + 'filters' => [], + ], + [ + '$id' => 'active', + 'type' => Database::VAR_BOOLEAN, + 'size' => 0, + 'required' => false, + 'signed' => true, + 'array' => false, + 'filters' => [], + ], + [ + '$id' => 'tags', + 'type' => Database::VAR_STRING, + 'size' => 255, + 'required' => false, + 'signed' => true, + 'array' => true, + 'filters' => ['json'], + ], + [ + '$id' => 'metadata', + 'type' => Database::VAR_STRING, + 'size' => 16777216, + 'required' => false, + 'signed' => true, + 'array' => false, + 'filters' => ['json'], + ], + [ + '$id' => 'created_at', + 'type' => Database::VAR_DATETIME, + 'size' => 0, + 'required' => false, + 'signed' => false, + 'array' => false, + 'filters' => ['datetime'], + ], + [ + '$id' => 'scores', + 'type' => Database::VAR_FLOAT, + 'size' => 0, + 'required' => false, + 'signed' => true, + 'array' => true, + 'filters' => [], + ], + ], + ]); + + $this->processor = new DocumentProcessor(); + } + + public function testStringAttributeEquivalence(): void + { + $doc = new Document([ + '$id' => 'doc1', + 'name' => 'John Doe', + ]); + + $legacyResult = $this->legacyProcess(clone $doc); + $processorResult = $this->processor->processRead($this->collection, clone $doc, null, [], false); + + $this->assertEquals($legacyResult->getAttribute('name'), $processorResult->getAttribute('name')); + } + + 
public function testIntegerCastingEquivalence(): void
    {
        $doc = new Document([
            '$id' => 'doc1',
            'age' => '25', // String that should be cast to int
        ]);

        $legacyResult = $this->legacyProcess(clone $doc);
        $processorResult = $this->processor->processRead($this->collection, clone $doc, null, [], false);

        $this->assertSame($legacyResult->getAttribute('age'), $processorResult->getAttribute('age'));
        $this->assertIsInt($processorResult->getAttribute('age'));
    }

    /**
     * An integer flag must come back as a real boolean from both paths.
     */
    public function testBooleanCastingEquivalence(): void
    {
        $doc = new Document([
            '$id' => 'doc1',
            'active' => 1, // Int that should be cast to bool
        ]);

        $legacyResult = $this->legacyProcess(clone $doc);
        $processorResult = $this->processor->processRead($this->collection, clone $doc, null, [], false);

        $this->assertSame($legacyResult->getAttribute('active'), $processorResult->getAttribute('active'));
        $this->assertIsBool($processorResult->getAttribute('active'));
    }

    /**
     * A JSON object string behind the 'json' filter must decode to the same array.
     */
    public function testJsonFilterEquivalence(): void
    {
        $doc = new Document([
            '$id' => 'doc1',
            'metadata' => '{"key":"value","nested":{"foo":"bar"}}',
        ]);

        $legacyResult = $this->legacyProcess(clone $doc);
        $processorResult = $this->processor->processRead($this->collection, clone $doc, null, [], false);

        $this->assertEquals($legacyResult->getAttribute('metadata'), $processorResult->getAttribute('metadata'));
        $this->assertIsArray($processorResult->getAttribute('metadata'));
    }

    /**
     * A JSON list string on an array attribute with the 'json' filter must decode identically.
     */
    public function testJsonArrayFilterEquivalence(): void
    {
        $doc = new Document([
            '$id' => 'doc1',
            'tags' => '["tag1","tag2","tag3"]',
        ]);

        $legacyResult = $this->legacyProcess(clone $doc);
        $processorResult = $this->processor->processRead($this->collection, clone $doc, null, [], false);

        $this->assertEquals($legacyResult->getAttribute('tags'), $processorResult->getAttribute('tags'));
        $this->assertIsArray($processorResult->getAttribute('tags'));
    }

    /**
     * The 'datetime' filter must format a timestamp the same way in both paths.
     */
    public function testDatetimeFilterEquivalence(): void
    {
        $doc = new Document([
            '$id' => 'doc1',
            'created_at' => '2024-01-15T10:30:00.000+00:00',
        ]);

        $legacyResult = $this->legacyProcess(clone $doc);
        $processorResult = $this->processor->processRead($this->collection, clone $doc, null, [], false);

        $this->assertEquals($legacyResult->getAttribute('created_at'), $processorResult->getAttribute('created_at'));
    }

    /**
     * A JSON-encoded float list must be decoded and every element cast to float.
     */
    public function testArrayAttributeEquivalence(): void
    {
        $doc = new Document([
            '$id' => 'doc1',
            'scores' => '[1.5, 2.3, 3.7]',
        ]);

        $legacyResult = $this->legacyProcess(clone $doc);
        $processorResult = $this->processor->processRead($this->collection, clone $doc, null, [], false);

        $this->assertEquals($legacyResult->getAttribute('scores'), $processorResult->getAttribute('scores'));
        $this->assertIsArray($processorResult->getAttribute('scores'));
        foreach ($processorResult->getAttribute('scores') as $score) {
            $this->assertIsFloat($score);
        }
    }

    /**
     * NULL values must stay NULL (not 0 or '') and match the legacy path.
     */
    public function testNullValueEquivalence(): void
    {
        $doc = new Document([
            '$id' => 'doc1',
            'name' => null,
            'age' => null,
        ]);

        $legacyResult = $this->legacyProcess(clone $doc);
        $processorResult = $this->processor->processRead($this->collection, clone $doc, null, [], false);

        $this->assertNull($processorResult->getAttribute('name'));
        $this->assertNull($processorResult->getAttribute('age'));

        // Actually compare against the legacy path; without this the test
        // would only check the processor in isolation.
        $this->assertSame($legacyResult->getAttribute('name'), $processorResult->getAttribute('name'));
        $this->assertSame($legacyResult->getAttribute('age'), $processorResult->getAttribute('age'));
    }

    /**
     * Selected and non-selected attributes must be treated the same by both paths.
     */
    public function testSelectionsEquivalence(): void
    {
        $doc = new Document([
            '$id' => 'doc1',
            'name' => 'John',
            'age' => 25,
            'active' => true,
        ]);

        $selections = ['name', 'age'];

        $legacyResult = $this->legacyProcess(clone $doc, $selections);
        $processorResult = $this->processor->processRead($this->collection, clone $doc, null, $selections, false);

        // Both should have selected attributes
        $this->assertEquals($legacyResult->getAttribute('name'), $processorResult->getAttribute('name'));
        $this->assertEquals($legacyResult->getAttribute('age'), $processorResult->getAttribute('age'));

        // Check if non-selected attributes are handled the same way
        $this->assertEquals(
            $legacyResult->getAttribute('active'),
            $processorResult->getAttribute('active')
        );
    }

    /**
     * A document exercising every attribute kind at once must match attribute by attribute.
     */
    public function testComplexDocumentEquivalence(): void
    {
        $doc = new Document([
            '$id' => 'doc1',
            '$permissions' => ['read("any")'],
            'name' => 'Complex Doc',
            'age' => '30',
            'active' => 1,
            'tags' => '["tag1","tag2"]',
            'metadata' => '{"nested":{"deep":"value"}}',
            'created_at' => '2024-01-15T10:30:00.000+00:00',
            'scores' => '[9.5, 8.3, 7.1]',
        ]);

        $legacyResult = $this->legacyProcess(clone $doc);
        $processorResult = $this->processor->processRead($this->collection, clone $doc, null, [], false);

        // Compare all attributes
        foreach ($this->collection->getAttribute('attributes', []) as $attr) {
            $key = $attr['$id'];
            $this->assertEquals(
                $legacyResult->getAttribute($key),
                $processorResult->getAttribute($key),
                "Attribute '$key' differs between legacy and processor"
            );
        }
    }

    /**
     * Simulate the legacy casting + decode process.
     *
     * Pass 1 casts every non-NULL attribute to its declared type (decoding
     * JSON-string arrays first); pass 2 applies the attribute's filters in
     * reverse order.
     *
     * @param Document $doc Document to process; modified in place
     * @param array<string> $selections Accepted for call-site symmetry with
     *                                  processRead(); this simulation does not use it
     * @return Document
     */
    private function legacyProcess(Document $doc, array $selections = []): Document
    {
        $attributes = $this->collection->getAttribute('attributes', []);

        // Simulate casting
        foreach ($attributes as $attribute) {
            $key = $attribute['$id'];
            $type = $attribute['type'];
            $array = $attribute['array'] ?? false;
            $value = $doc->getAttribute($key);

            if ($value === null) {
                continue;
            }

            if ($array) {
                if (is_string($value)) {
                    $value = json_decode($value, true) ?? [];
                }
                if (!is_array($value)) {
                    $value = [$value];
                }
                foreach ($value as $i => $node) {
                    $value[$i] = $this->castValue($type, $node);
                }
            } else {
                $value = $this->castValue($type, $value);
            }

            $doc->setAttribute($key, $value);
        }

        // Simulate decode (filters)
        foreach ($attributes as $attribute) {
            $key = $attribute['$id'];
            $filters = $attribute['filters'] ?? [];
            $array = $attribute['array'] ?? false;
            $value = $doc->getAttribute($key);

            if (empty($filters)) {
                continue;
            }

            foreach (array_reverse($filters) as $filter) {
                if ($array && is_array($value)) {
                    foreach ($value as $i => $node) {
                        $value[$i] = $this->applyFilter($filter, $node);
                    }
                } else {
                    $value = $this->applyFilter($filter, $value);
                }
            }

            $doc->setAttribute($key, $value);
        }

        return $doc;
    }

    /**
     * Cast a single value to the PHP type matching the attribute type.
     */
    private function castValue(string $type, mixed $value): mixed
    {
        return match ($type) {
            Database::VAR_STRING, Database::VAR_ID => (string) $value,
            Database::VAR_INTEGER => (int) $value,
            Database::VAR_FLOAT => (float) $value,
            Database::VAR_BOOLEAN => (bool) $value,
            default => $value,
        };
    }

    /**
     * Apply one decode filter ('json' or 'datetime'); unknown filters pass the value through.
     */
    private function applyFilter(string $filter, mixed $value): mixed
    {
        if ($filter === 'datetime') {
            return DateTime::formatTz($value);
        }

        if ($filter !== 'json' || !is_string($value)) {
            return $value;
        }

        $decoded = json_decode($value, true) ?? [];

        // Scalar JSON has no keys; guard before array_key_exists() so it is
        // returned as-is instead of raising a TypeError.
        if (!is_array($decoded)) {
            return $decoded;
        }

        if (array_key_exists('$id', $decoded)) {
            return new Document($decoded);
        }

        foreach ($decoded as $i => $item) {
            if (is_array($item) && array_key_exists('$id', $item)) {
                $decoded[$i] = new Document($item);
            }
        }

        return $decoded;
    }
}