From 4fc86703d29615aea2d8196c813d0790aeab08e6 Mon Sep 17 00:00:00 2001 From: Travis Cline Date: Wed, 28 Jan 2026 14:04:39 -0800 Subject: [PATCH 1/5] struct_arm64: cache struct types and pool instances in bundleStackArgs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cache the reflect.StructOf result and pool reflect.New instances for Darwin ARM64 stack argument bundling. This avoids recreating the packed struct type and allocating a new instance on every call when arguments spill to the stack. CFunc/10args: ~810 → ~560 ns/op (-31%), 23 → 21 allocs CFunc/15args: ~1840 → ~810 ns/op (-56%), 44 → 30 allocs Callbacks unchanged (bundling path is guarded by !isCallback). --- struct_arm64.go | 112 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 89 insertions(+), 23 deletions(-) diff --git a/struct_arm64.go b/struct_arm64.go index f55f1903..2b3bd2a8 100644 --- a/struct_arm64.go +++ b/struct_arm64.go @@ -8,12 +8,34 @@ import ( "reflect" "runtime" "strconv" - stdstrings "strings" + "sync" "unsafe" "github.com/ebitengine/purego/internal/strings" ) +// structTypeCache caches reflect.Type for bundled stack arg structs, keyed by +// a string built from the constituent arg types. +var structTypeCache sync.Map // map[string]cachedStructInfo + +// cachedStructInfo holds a cached struct type and the indices of non-padding fields. +type cachedStructInfo struct { + typ reflect.Type + valueIndices []int // indices of non-padding fields in the struct +} + +// structInstancePool pools reflect.Value instances for each cached struct type. +var structInstancePool sync.Map // map[string]*sync.Pool + +// fieldNames are pre-computed field name strings to avoid per-call strconv.Itoa. 
+var fieldNames [maxArgs * 2]string + +func init() { + for i := range fieldNames { + fieldNames[i] = strconv.Itoa(i) + } +} + func getStruct(outType reflect.Type, syscall syscall15Args) (v reflect.Value) { outSize := outType.Size() switch { @@ -485,18 +507,29 @@ const ( paddingFieldPrefix = "Pad" ) -// bundleStackArgs bundles remaining arguments for Darwin ARM64 C-style stack packing. -// It creates a packed struct with proper alignment and copies it to the stack in 8-byte chunks. -func bundleStackArgs(stackArgs []reflect.Value, addStack func(uintptr)) { - if runtime.GOOS != "darwin" { - panic("purego: bundleStackArgs should only be called on darwin") +// buildStructCacheKey builds a cache key string from the types of stack arguments. +func buildStructCacheKey(stackArgs []reflect.Value) string { + // Use a simple concatenation of type strings + var buf []byte + for i, val := range stackArgs { + if i > 0 { + buf = append(buf, ',') + } + buf = append(buf, val.Type().String()...) } - if len(stackArgs) == 0 { - return + return string(buf) +} + +// getOrCreateStructInfo returns a cached struct type and field indices for the +// given stack argument types, creating and caching them if necessary. 
+func getOrCreateStructInfo(stackArgs []reflect.Value, key string) cachedStructInfo { + if v, ok := structTypeCache.Load(key); ok { + return v.(cachedStructInfo) } // Build struct fields with proper C alignment and padding var fields []reflect.StructField + var valueIndices []int currentOffset := uintptr(0) fieldIndex := 0 @@ -513,7 +546,7 @@ func bundleStackArgs(stackArgs []reflect.Value, addStack func(uintptr)) { if currentOffset%uintptr(valAlign) != 0 { paddingNeeded := uintptr(valAlign) - (currentOffset % uintptr(valAlign)) fields = append(fields, reflect.StructField{ - Name: paddingFieldPrefix + strconv.Itoa(fieldIndex), + Name: paddingFieldPrefix + fieldNames[fieldIndex], Type: reflect.ArrayOf(int(paddingNeeded), reflect.TypeOf(byte(0))), }) currentOffset += paddingNeeded @@ -521,29 +554,62 @@ func bundleStackArgs(stackArgs []reflect.Value, addStack func(uintptr)) { } fields = append(fields, reflect.StructField{ - Name: "X" + strconv.Itoa(j), + Name: "X" + fieldNames[j], Type: val.Type(), }) + valueIndices = append(valueIndices, fieldIndex) currentOffset += valSize fieldIndex++ } - // Create and populate the packed struct - structType := reflect.StructOf(fields) - structInstance := reflect.New(structType).Elem() + info := cachedStructInfo{ + typ: reflect.StructOf(fields), + valueIndices: valueIndices, + } + structTypeCache.Store(key, info) + return info +} + +// getPooledStructInstance returns a pooled struct instance for the given cache key and type. 
+func getPooledStructInstance(key string, info cachedStructInfo) reflect.Value { + pool, _ := structInstancePool.LoadOrStore(key, &sync.Pool{ + New: func() any { + return reflect.New(info.typ) + }, + }) + return pool.(*sync.Pool).Get().(reflect.Value).Elem() +} - // Set values (skip padding fields) - argIndex := 0 - for j := 0; j < structInstance.NumField(); j++ { - fieldName := structType.Field(j).Name - if stdstrings.HasPrefix(fieldName, paddingFieldPrefix) { - continue - } - structInstance.Field(j).Set(stackArgs[argIndex]) - argIndex++ +// returnPooledStructInstance returns a struct instance to the pool. +func returnPooledStructInstance(key string, v reflect.Value) { + if pool, ok := structInstancePool.Load(key); ok { + pool.(*sync.Pool).Put(v.Addr()) + } +} + +// bundleStackArgs bundles remaining arguments for Darwin ARM64 C-style stack packing. +// It creates a packed struct with proper alignment and copies it to the stack in 8-byte chunks. +// Struct types and instances are cached/pooled to avoid per-call reflection overhead. 
+func bundleStackArgs(stackArgs []reflect.Value, addStack func(uintptr)) { + if runtime.GOOS != "darwin" { + panic("purego: bundleStackArgs should only be called on darwin") + } + if len(stackArgs) == 0 { + return + } + + key := buildStructCacheKey(stackArgs) + info := getOrCreateStructInfo(stackArgs, key) + structInstance := getPooledStructInstance(key, info) + + // Set values using pre-computed indices + for i, idx := range info.valueIndices { + structInstance.Field(idx).Set(stackArgs[i]) } ptr := unsafe.Pointer(structInstance.Addr().Pointer()) - size := structType.Size() + size := info.typ.Size() copyStruct8ByteChunks(ptr, size, addStack) + + returnPooledStructInstance(key, structInstance) } From 92422d21cb400e22fe71cc8fdb40e8e8e44145f1 Mon Sep 17 00:00:00 2001 From: Travis Cline Date: Wed, 28 Jan 2026 14:05:38 -0800 Subject: [PATCH 2/5] syscall_sysv: stack-allocate args slice in callbackWrap Replace make([]reflect.Value, fnType.NumIn()) with a stack-allocated fixed-size array, eliminating a heap allocation on every callback invocation. The profiling report identified this as the single largest allocator at 38.5% of total allocations (5.87 GB). 
--- syscall_sysv.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/syscall_sysv.go b/syscall_sysv.go index f96bfd0b..d3695979 100644 --- a/syscall_sysv.go +++ b/syscall_sysv.go @@ -143,7 +143,9 @@ func callbackWrap(a *callbackArgs) { fn := cbs.funcs[a.index] cbs.lock.Unlock() fnType := fn.Type() - args := make([]reflect.Value, fnType.NumIn()) + // maxArgs + numOfFloatRegisters is assumed to cover the max callback inputs + var argsBuf [maxArgs + numOfFloatRegisters]reflect.Value + args := argsBuf[:fnType.NumIn()] frame := (*[callbackMaxFrame]uintptr)(a.args) var floatsN int // floatsN represents the number of float arguments processed var intsN int // intsN represents the number of integer arguments processed From f2d7ee62c4d071a466980c06e9d80378d2233639 Mon Sep 17 00:00:00 2001 From: Travis Cline Date: Wed, 28 Jan 2026 14:07:32 -0800 Subject: [PATCH 3/5] func: remove defer from RegisterFunc hot path Replace defer-based runtime.KeepAlive and thePool.Put with explicit calls after the syscall returns. This eliminates deferred closure overhead on every RegisterFunc invocation. --- func.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/func.go b/func.go index 49f746f5..4c163434 100644 --- a/func.go +++ b/func.go @@ -273,10 +273,6 @@ func RegisterFunc(fptr any, cfn uintptr) { } var keepAlive []any - defer func() { - runtime.KeepAlive(keepAlive) - runtime.KeepAlive(args) - }() var arm64_r8 uintptr if ty.NumOut() == 1 && ty.Out(0).Kind() == reflect.Struct { @@ -321,7 +317,6 @@ func RegisterFunc(fptr any, cfn uintptr) { } syscall := thePool.Get().(*syscall15Args) - defer thePool.Put(syscall) if runtime.GOARCH == "loong64" || runtime.GOARCH == "riscv64" { *syscall = syscall15Args{ @@ -353,6 +348,9 @@ func RegisterFunc(fptr any, cfn uintptr) { syscall.f1 = syscall.a2 // on amd64 a2 stores the float return. 
On 32bit platforms floats aren't support } if ty.NumOut() == 0 { + thePool.Put(syscall) + runtime.KeepAlive(keepAlive) + runtime.KeepAlive(args) return nil } outType := ty.Out(0) @@ -388,13 +386,15 @@ func RegisterFunc(fptr any, cfn uintptr) { default: panic("purego: unsupported return kind: " + outType.Kind().String()) } + thePool.Put(syscall) + runtime.KeepAlive(keepAlive) + runtime.KeepAlive(args) if len(args) > 0 { // reuse args slice instead of allocating one when possible args[0] = v return args[:1] - } else { - return []reflect.Value{v} } + return []reflect.Value{v} }) fn.Set(v) } From 4120a3d8ecd02dc46dd4e8e99e4920c3b067b92a Mon Sep 17 00:00:00 2001 From: Travis Cline Date: Wed, 28 Jan 2026 14:09:23 -0800 Subject: [PATCH 4/5] func: pre-compute arg kinds at registration time Build a per-function argKind slice during RegisterFunc instead of type-switching via reflect.Kind on every call. This replaces the generic addValue dispatch with a direct switch on pre-computed enum values, reducing per-call reflect overhead. --- func.go | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 77 insertions(+), 8 deletions(-) diff --git a/func.go b/func.go index 4c163434..d5b4076a 100644 --- a/func.go +++ b/func.go @@ -14,7 +14,6 @@ import ( "unsafe" "github.com/ebitengine/purego/internal/strings" - "github.com/ebitengine/purego/internal/xreflect" ) const ( @@ -229,6 +228,52 @@ func RegisterFunc(fptr any, cfn uintptr) { // When callbacks can unpack tightly-packed arguments, this workaround can be removed. isCallback := isCallbackFunction(cfn) + // Pre-compute arg kinds at registration time to avoid per-call reflect overhead. 
+ type argKind uint8 + const ( + akUint argKind = iota + akInt + akFloat32 + akFloat64 + akString + akBool + akPtr + akFunc + akStruct + akVariadic // last arg is []any + ) + numIn := ty.NumIn() + argKinds := make([]argKind, numIn) + for i := 0; i < numIn; i++ { + switch ty.In(i).Kind() { + case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: + argKinds[i] = akInt + case reflect.Uintptr, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64: + argKinds[i] = akUint + case reflect.Float32: + argKinds[i] = akFloat32 + case reflect.Float64: + argKinds[i] = akFloat64 + case reflect.String: + argKinds[i] = akString + case reflect.Bool: + argKinds[i] = akBool + case reflect.Func: + argKinds[i] = akFunc + case reflect.Struct: + argKinds[i] = akStruct + case reflect.Ptr, reflect.UnsafePointer, reflect.Slice: + argKinds[i] = akPtr + } + } + // Check if the last arg is variadic []any + if numIn > 0 { + lastIn := ty.In(numIn - 1) + if lastIn.Kind() == reflect.Slice && lastIn.Elem().Kind() == reflect.Interface { + argKinds[numIn-1] = akVariadic + } + } + v := reflect.MakeFunc(ty, func(args []reflect.Value) (results []reflect.Value) { var sysargs [maxArgs]uintptr var floats [numOfFloatRegisters]uintptr @@ -291,29 +336,53 @@ func RegisterFunc(fptr any, cfn uintptr) { } } for i, v := range args { - if variadic, ok := xreflect.TypeAssert[[]any](args[i]); ok { + ak := argKinds[i] + // Handle variadic expansion + if ak == akVariadic { if i != len(args)-1 { panic("purego: can only expand last parameter") } + variadic := v.Interface().([]any) for _, x := range variadic { keepAlive = addValue(reflect.ValueOf(x), keepAlive, addInt, addFloat, addStack, &numInts, &numFloats, &numStack) } continue } // Check if we need to start Darwin ARM64 C-style stack packing - // Skip tight packing for callbacks since they still use 8-byte slot unpacking - // TODO: Remove !isCallback condition once callback unpacking supports tight packing if runtime.GOARCH 
== "arm64" && runtime.GOOS == "darwin" && !isCallback && shouldBundleStackArgs(v, numInts, numFloats) { - // Collect and separate remaining args into register vs stack stackArgs, newKeepAlive := collectStackArgs(args, i, numInts, numFloats, keepAlive, addInt, addFloat, addStack, &numInts, &numFloats, &numStack) keepAlive = newKeepAlive - - // Bundle stack arguments with C-style packing bundleStackArgs(stackArgs, addStack) break } - keepAlive = addValue(v, keepAlive, addInt, addFloat, addStack, &numInts, &numFloats, &numStack) + // Fast dispatch using pre-computed arg kind + switch ak { + case akUint: + addInt(uintptr(v.Uint())) + case akInt: + addInt(uintptr(v.Int())) + case akFloat32: + addFloat(uintptr(math.Float32bits(float32(v.Float())))) + case akFloat64: + addFloat(uintptr(math.Float64bits(v.Float()))) + case akString: + ptr := strings.CString(v.String()) + keepAlive = append(keepAlive, ptr) + addInt(uintptr(unsafe.Pointer(ptr))) + case akBool: + if v.Bool() { + addInt(1) + } else { + addInt(0) + } + case akPtr: + addInt(v.Pointer()) + case akFunc: + addInt(NewCallback(v.Interface())) + case akStruct: + keepAlive = addStruct(v, &numInts, &numFloats, &numStack, addInt, addFloat, addStack, keepAlive) + } } syscall := thePool.Get().(*syscall15Args) From 41f1060870a10da28b95e5ac12768c24bac17f7e Mon Sep 17 00:00:00 2001 From: Travis Cline Date: Wed, 28 Jan 2026 14:24:09 -0800 Subject: [PATCH 5/5] purego: pre-compute bundle info and stack-allocate collectStackArgs buffer Pre-compute struct cache key and bundle info at RegisterFunc registration time instead of rebuilding them on every call. This eliminates per-call buildStructCacheKey allocations (521 MB in profiling) and sync.Map lookups for Darwin ARM64 stack argument packing. Also stack-allocate the collectStackArgs buffer by accepting a caller- provided []reflect.Value slice, eliminating the per-call heap allocation for stack argument collection (1.09 GB in profiling). 
String arguments that spill to stack are mapped to *byte type in the pre-computed info since collectStackArgs converts them via CString before bundling. Variadic and struct-containing signatures fall back to runtime computation. RegisterFunc/CFunc/10args: -19% latency, -19% memory RegisterFunc/CFunc/15args: -30% latency, -45% memory, -9 allocs --- func.go | 24 +++++++++- struct_386.go | 8 +++- struct_amd64.go | 8 +++- struct_arm.go | 8 +++- struct_arm64.go | 118 +++++++++++++++++++++++++++++++++++++++++----- struct_loong64.go | 8 +++- struct_riscv64.go | 7 +++ 7 files changed, 164 insertions(+), 17 deletions(-) diff --git a/func.go b/func.go index d5b4076a..b93d7987 100644 --- a/func.go +++ b/func.go @@ -25,6 +25,21 @@ var thePool = sync.Pool{New: func() any { return new(syscall15Args) }} +// cachedStructInfo holds a cached struct type and the indices of non-padding fields. +type cachedStructInfo struct { + typ reflect.Type + valueIndices []int // indices of non-padding fields in the struct +} + +// preBundleInfo holds pre-computed bundling information created at registration time +// to avoid per-call cache key construction and sync.Map lookups for Darwin ARM64 +// stack argument packing. +type preBundleInfo struct { + key string + info cachedStructInfo + pool *sync.Pool +} + // RegisterLibFunc is a wrapper around RegisterFunc that uses the C function returned from Dlsym(handle, name). // It panics if it can't find the name symbol. func RegisterLibFunc(fptr any, handle uintptr, name string) { @@ -274,6 +289,10 @@ func RegisterFunc(fptr any, cfn uintptr) { } } + // Pre-compute bundle info for Darwin ARM64 stack argument packing. + // This avoids per-call cache key construction and sync.Map lookups. 
+ preBundleInfoVal := precomputeBundleInfo(ty) + v := reflect.MakeFunc(ty, func(args []reflect.Value) (results []reflect.Value) { var sysargs [maxArgs]uintptr var floats [numOfFloatRegisters]uintptr @@ -350,10 +369,11 @@ func RegisterFunc(fptr any, cfn uintptr) { } // Check if we need to start Darwin ARM64 C-style stack packing if runtime.GOARCH == "arm64" && runtime.GOOS == "darwin" && !isCallback && shouldBundleStackArgs(v, numInts, numFloats) { + var stackArgsBuf [maxArgs]reflect.Value stackArgs, newKeepAlive := collectStackArgs(args, i, numInts, numFloats, - keepAlive, addInt, addFloat, addStack, &numInts, &numFloats, &numStack) + keepAlive, addInt, addFloat, addStack, &numInts, &numFloats, &numStack, stackArgsBuf[:]) keepAlive = newKeepAlive - bundleStackArgs(stackArgs, addStack) + bundleStackArgsWithInfo(stackArgs, addStack, preBundleInfoVal) break } // Fast dispatch using pre-computed arg kind diff --git a/struct_386.go b/struct_386.go index 02c8ac45..b71f651c 100644 --- a/struct_386.go +++ b/struct_386.go @@ -31,11 +31,17 @@ func structFitsInRegisters(val reflect.Value, tempNumInts, tempNumFloats int) (b // collectStackArgs is not used on 386. func collectStackArgs(args []reflect.Value, startIdx int, numInts, numFloats int, keepAlive []any, addInt, addFloat, addStack func(uintptr), - pNumInts, pNumFloats, pNumStack *int) ([]reflect.Value, []any) { + pNumInts, pNumFloats, pNumStack *int, stackBuf []reflect.Value) ([]reflect.Value, []any) { panic("purego: collectStackArgs should not be called on 386") } +func precomputeBundleInfo(ty reflect.Type) *preBundleInfo { return nil } + // bundleStackArgs is not used on 386. 
func bundleStackArgs(stackArgs []reflect.Value, addStack func(uintptr)) { panic("purego: bundleStackArgs should not be called on 386") } + +func bundleStackArgsWithInfo(stackArgs []reflect.Value, addStack func(uintptr), pre *preBundleInfo) { + panic("purego: bundleStackArgsWithInfo should not be called on 386") +} diff --git a/struct_amd64.go b/struct_amd64.go index e3b6a7bb..daa851cd 100644 --- a/struct_amd64.go +++ b/struct_amd64.go @@ -277,11 +277,17 @@ func structFitsInRegisters(val reflect.Value, tempNumInts, tempNumFloats int) (b // collectStackArgs is not used on amd64. func collectStackArgs(args []reflect.Value, startIdx int, numInts, numFloats int, keepAlive []any, addInt, addFloat, addStack func(uintptr), - pNumInts, pNumFloats, pNumStack *int) ([]reflect.Value, []any) { + pNumInts, pNumFloats, pNumStack *int, stackBuf []reflect.Value) ([]reflect.Value, []any) { panic("purego: collectStackArgs should not be called on amd64") } +func precomputeBundleInfo(ty reflect.Type) *preBundleInfo { return nil } + // bundleStackArgs is not used on amd64. func bundleStackArgs(stackArgs []reflect.Value, addStack func(uintptr)) { panic("purego: bundleStackArgs should not be called on amd64") } + +func bundleStackArgsWithInfo(stackArgs []reflect.Value, addStack func(uintptr), pre *preBundleInfo) { + panic("purego: bundleStackArgsWithInfo should not be called on amd64") +} diff --git a/struct_arm.go b/struct_arm.go index d2a29f60..9356e72f 100644 --- a/struct_arm.go +++ b/struct_arm.go @@ -31,11 +31,17 @@ func structFitsInRegisters(val reflect.Value, tempNumInts, tempNumFloats int) (b // collectStackArgs is not used on arm. 
func collectStackArgs(args []reflect.Value, startIdx int, numInts, numFloats int, keepAlive []any, addInt, addFloat, addStack func(uintptr), - pNumInts, pNumFloats, pNumStack *int) ([]reflect.Value, []any) { + pNumInts, pNumFloats, pNumStack *int, stackBuf []reflect.Value) ([]reflect.Value, []any) { panic("purego: collectStackArgs should not be called on arm") } +func precomputeBundleInfo(ty reflect.Type) *preBundleInfo { return nil } + // bundleStackArgs is not used on arm. func bundleStackArgs(stackArgs []reflect.Value, addStack func(uintptr)) { panic("purego: bundleStackArgs should not be called on arm") } + +func bundleStackArgsWithInfo(stackArgs []reflect.Value, addStack func(uintptr), pre *preBundleInfo) { + panic("purego: bundleStackArgsWithInfo should not be called on arm") +} diff --git a/struct_arm64.go b/struct_arm64.go index 2b3bd2a8..70d3b371 100644 --- a/struct_arm64.go +++ b/struct_arm64.go @@ -18,11 +18,6 @@ import ( // a string built from the constituent arg types. var structTypeCache sync.Map // map[string]cachedStructInfo -// cachedStructInfo holds a cached struct type and the indices of non-padding fields. -type cachedStructInfo struct { - typ reflect.Type - valueIndices []int // indices of non-padding fields in the struct -} // structInstancePool pools reflect.Value instances for each cached struct type. var structInstancePool sync.Map // map[string]*sync.Pool @@ -449,15 +444,16 @@ func structFitsInRegisters(val reflect.Value, tempNumInts, tempNumFloats int) (b } // collectStackArgs separates remaining arguments into those that fit in registers vs those that go on stack. -// It returns the stack arguments and processes register arguments through addValue. +// It returns the stack arguments (using the provided buffer to avoid allocation) and processes +// register arguments through addValue. 
func collectStackArgs(args []reflect.Value, startIdx int, numInts, numFloats int, keepAlive []any, addInt, addFloat, addStack func(uintptr), - pNumInts, pNumFloats, pNumStack *int) ([]reflect.Value, []any) { + pNumInts, pNumFloats, pNumStack *int, stackBuf []reflect.Value) ([]reflect.Value, []any) { if runtime.GOOS != "darwin" { panic("purego: collectStackArgs should only be called on darwin") } - var stackArgs []reflect.Value + stackArgs := stackBuf[:0] tempNumInts := numInts tempNumFloats := numFloats @@ -589,8 +585,14 @@ func returnPooledStructInstance(key string, v reflect.Value) { // bundleStackArgs bundles remaining arguments for Darwin ARM64 C-style stack packing. // It creates a packed struct with proper alignment and copies it to the stack in 8-byte chunks. -// Struct types and instances are cached/pooled to avoid per-call reflection overhead. +// It passes nil bundle info, so the cache key is built and the struct info looked up on every call. func bundleStackArgs(stackArgs []reflect.Value, addStack func(uintptr)) { + bundleStackArgsWithInfo(stackArgs, addStack, nil) +} + +// bundleStackArgsWithInfo is the implementation of bundleStackArgs that optionally accepts +// pre-computed bundle info to skip per-call cache key construction and the structTypeCache lookup. 
+func bundleStackArgsWithInfo(stackArgs []reflect.Value, addStack func(uintptr), pre *preBundleInfo) { if runtime.GOOS != "darwin" { panic("purego: bundleStackArgs should only be called on darwin") } @@ -598,8 +600,15 @@ func bundleStackArgs(stackArgs []reflect.Value, addStack func(uintptr)) { return } - key := buildStructCacheKey(stackArgs) - info := getOrCreateStructInfo(stackArgs, key) + var info cachedStructInfo + var key string + if pre != nil { + info = pre.info + key = pre.key + } else { + key = buildStructCacheKey(stackArgs) + info = getOrCreateStructInfo(stackArgs, key) + } structInstance := getPooledStructInstance(key, info) // Set values using pre-computed indices @@ -613,3 +622,90 @@ func bundleStackArgs(stackArgs []reflect.Value, addStack func(uintptr)) { returnPooledStructInstance(key, structInstance) } + + +// precomputeBundleInfo simulates register assignment for the given function type +// and pre-computes the struct cache key and info for stack arguments. +// Returns nil if no args spill to stack, if any arg is a struct or a slice of interface type, or when not on darwin/arm64. +func precomputeBundleInfo(ty reflect.Type) *preBundleInfo { + if runtime.GOOS != "darwin" || runtime.GOARCH != "arm64" { + return nil + } + + // Simulate register assignment to find which args spill. + // String args are converted to *byte by collectStackArgs before bundling, + // so we must use the post-conversion type here. 
+ numInts, numFloats := 0, 0 + var spillTypes []reflect.Type + ptrByteType := reflect.TypeOf((*byte)(nil)) + for i := 0; i < ty.NumIn(); i++ { + arg := ty.In(i) + switch arg.Kind() { + case reflect.Float32, reflect.Float64: + if numFloats < numOfFloatRegisters { + numFloats++ + } else { + spillTypes = append(spillTypes, arg) + } + case reflect.Struct: + // Structs are complex; skip pre-computation for now + // (they go through structFitsInRegisters at runtime) + return nil + case reflect.Slice: + // Variadic []any — can't pre-compute + if arg.Elem().Kind() == reflect.Interface { + return nil + } + if numInts < numOfIntegerRegisters() { + numInts++ + } else { + spillTypes = append(spillTypes, arg) + } + default: + if numInts < numOfIntegerRegisters() { + numInts++ + } else { + // Strings become *byte after CString conversion + if arg.Kind() == reflect.String { + spillTypes = append(spillTypes, ptrByteType) + } else { + spillTypes = append(spillTypes, arg) + } + } + } + } + + if len(spillTypes) == 0 { + return nil + } + + // Build cache key from spill types + var buf []byte + for i, t := range spillTypes { + if i > 0 { + buf = append(buf, ',') + } + buf = append(buf, t.String()...) + } + key := string(buf) + + // Build dummy values to compute struct info + dummyArgs := make([]reflect.Value, len(spillTypes)) + for i, t := range spillTypes { + dummyArgs[i] = reflect.New(t).Elem() + } + info := getOrCreateStructInfo(dummyArgs, key) + + // Warm the pool + pool, _ := structInstancePool.LoadOrStore(key, &sync.Pool{ + New: func() any { + return reflect.New(info.typ) + }, + }) + + return &preBundleInfo{ + key: key, + info: info, + pool: pool.(*sync.Pool), + } +} diff --git a/struct_loong64.go b/struct_loong64.go index 7ad162f5..d3ee3382 100644 --- a/struct_loong64.go +++ b/struct_loong64.go @@ -203,11 +203,17 @@ func structFitsInRegisters(val reflect.Value, tempNumInts, tempNumFloats int) (b // collectStackArgs is not used on loong64. 
func collectStackArgs(args []reflect.Value, startIdx int, numInts, numFloats int, keepAlive []any, addInt, addFloat, addStack func(uintptr), - pNumInts, pNumFloats, pNumStack *int) ([]reflect.Value, []any) { + pNumInts, pNumFloats, pNumStack *int, stackBuf []reflect.Value) ([]reflect.Value, []any) { panic("purego: collectStackArgs should not be called on loong64") } +func precomputeBundleInfo(ty reflect.Type) *preBundleInfo { return nil } + // bundleStackArgs is not used on loong64. func bundleStackArgs(stackArgs []reflect.Value, addStack func(uintptr)) { panic("purego: bundleStackArgs should not be called on loong64") } + +func bundleStackArgsWithInfo(stackArgs []reflect.Value, addStack func(uintptr), pre *preBundleInfo) { + panic("purego: bundleStackArgsWithInfo should not be called on loong64") +} diff --git a/struct_riscv64.go b/struct_riscv64.go index eac0b880..c2fff479 100644 --- a/struct_riscv64.go +++ b/struct_riscv64.go @@ -134,6 +134,7 @@ func collectStackArgs( keepAlive []any, addInt, addFloat, addStack func(uintptr), numIntsPtr, numFloatsPtr, numStackPtr *int, + stackBuf []reflect.Value, ) ([]reflect.Value, []any) { return nil, keepAlive } @@ -141,3 +142,9 @@ func collectStackArgs( func bundleStackArgs(stackArgs []reflect.Value, addStack func(uintptr)) { panic("bundleStackArgs not supported on RISCV64") } + +func bundleStackArgsWithInfo(stackArgs []reflect.Value, addStack func(uintptr), pre *preBundleInfo) { + panic("purego: bundleStackArgsWithInfo should not be called on riscv64") +} + +func precomputeBundleInfo(ty reflect.Type) *preBundleInfo { return nil }