Skip to content

Commit c315033

Browse files
authored
Implement LOAD_ATTR inline caching with adaptive specialization (#7292)
* Implement LOAD_ATTR inline caching with adaptive specialization

  Add type version counter (tp_version_tag) to PyType with subclass invalidation cascade. Add cache read/write methods (u16/u32/u64) to CodeUnits. Implement adaptive specialization in load_attr that replaces the opcode with specialized variants on first execution:

  - LoadAttrMethodNoDict: cached method lookup for slotted types
  - LoadAttrMethodWithValues: cached method with dict shadow check
  - LoadAttrInstanceValue: direct dict lookup skipping descriptors

  Specialized opcodes guard on type_version_tag and deoptimize back to generic LOAD_ATTR with backoff counter on cache miss.

* Add BINARY_OP and CALL adaptive specialization

  BINARY_OP: Specialize int add/subtract/multiply and float add/subtract/multiply with type guards and deoptimization.

  CALL: Add func_version to PyFunction, specialize simple function calls (CallPyExactArgs, CallBoundMethodExactArgs) with invoke_exact_args fast path that skips FuncArgs allocation and fill_locals_from_args.

* Lazy quickening for adaptive specialization counters

  Move counter initialization from compile-time to RESUME execution, matching CPython's _PyCode_Quicken pattern. Store counter in CACHE entry's arg byte to preserve op=Instruction::Cache for dis/JIT. Add PyCode.quickened flag for one-time initialization.
* Add Instruction::deoptimize() and CodeUnits::original_bytes()

  - deoptimize() maps specialized opcodes back to their base adaptive variant
  - original_bytes() produces deoptimized bytecode with zeroed CACHE entries
  - co_code now returns deoptimized bytes, _co_code_adaptive returns current bytes
  - Marshal serialization uses original_bytes() instead of raw transmute

* Fix monitoring and specialization interaction

  - cache_entries() returns correct count for instrumented opcodes
  - deoptimize() maps instrumented opcodes back to base
  - quicken() skips adaptive counter for instrumented opcodes
  - instrument_code Phase 3 deoptimizes specialized opcodes and clears CACHE entries to prevent stale pointer dereferences

* Address review: bounds checks, UB fix, version overflow, error handling

  - Add bounds checks to read_cache_u16/u32/u64
  - Fix quicken() aliasing UB by using &mut directly
  - Add JumpBackwardJit/JumpBackwardNoJit to deoptimize()
  - Guard can_specialize_call with NEWLOCALS flag check
  - Use compare_exchange_weak for version tag to prevent wraparound
  - Propagate dict lookup errors in LoadAttrMethodWithValues
  - Apply adaptive backoff on version tag assignment failure
  - Remove duplicate imports in frame.rs
1 parent b1cddc4 commit c315033

File tree

12 files changed

+1033
-42
lines changed

12 files changed

+1033
-42
lines changed

.cspell.dict/cpython.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ copyslot
4444
cpucount
4545
defaultdict
4646
denom
47+
deopt
4748
dictbytype
4849
DICTFLAG
4950
dictoffset

.cspell.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
"dedentations",
6161
"dedents",
6262
"deduped",
63+
"deoptimize",
6364
"downcastable",
6465
"downcasted",
6566
"dumpable",
@@ -73,6 +74,7 @@
7374
"interps",
7475
"jitted",
7576
"jitting",
77+
"kwonly",
7678
"lossily",
7779
"makeunicodedata",
7880
"microbenchmark",

crates/codegen/src/ir.rs

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -457,11 +457,13 @@ impl CodeInfo {
457457
.map(|byte| CodeUnit::new(Instruction::ExtendedArg, byte))
458458
.chain([CodeUnit { op, arg: lo_arg }]),
459459
);
460-
// Emit CACHE code units after the instruction
461-
instructions.extend(core::iter::repeat_n(
462-
CodeUnit::new(Instruction::Cache, 0.into()),
463-
cache_count,
464-
));
460+
// Emit CACHE code units after the instruction (all zeroed)
461+
if cache_count > 0 {
462+
instructions.extend(core::iter::repeat_n(
463+
CodeUnit::new(Instruction::Cache, 0.into()),
464+
cache_count,
465+
));
466+
}
465467
current_offset = offset_after;
466468
}
467469
next_block = block.next;

crates/compiler-core/src/bytecode.rs

Lines changed: 144 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,11 @@ pub struct CodeUnit {
343343

344344
const _: () = assert!(mem::size_of::<CodeUnit>() == 2);
345345

346+
/// Adaptive specialization: number of executions before attempting specialization.
347+
pub const ADAPTIVE_WARMUP_VALUE: u8 = 50;
348+
/// Adaptive specialization: backoff counter after de-optimization.
349+
pub const ADAPTIVE_BACKOFF_VALUE: u8 = 250;
350+
346351
impl CodeUnit {
347352
pub const fn new(op: Instruction, arg: OpArgByte) -> Self {
348353
Self { op, arg }
@@ -391,7 +396,11 @@ impl TryFrom<&[u8]> for CodeUnits {
391396
return Err(Self::Error::InvalidBytecode);
392397
}
393398

394-
value.chunks_exact(2).map(CodeUnit::try_from).collect()
399+
let units: Self = value
400+
.chunks_exact(2)
401+
.map(CodeUnit::try_from)
402+
.collect::<Result<_, _>>()?;
403+
Ok(units)
395404
}
396405
}
397406

@@ -441,6 +450,140 @@ impl CodeUnits {
441450
core::ptr::write(op_ptr, new_op.into());
442451
}
443452
}
453+
454+
/// Write a u16 value into a CACHE code unit at `index`.
455+
/// Each CodeUnit is 2 bytes (#[repr(C)]: op u8 + arg u8), so one u16 fits exactly.
456+
///
457+
/// # Safety
458+
/// - `index` must be in bounds and point to a CACHE entry.
459+
/// - The caller must ensure no concurrent reads/writes to the same slot.
460+
pub unsafe fn write_cache_u16(&self, index: usize, value: u16) {
461+
unsafe {
462+
let units = &mut *self.0.get();
463+
let ptr = units.as_mut_ptr().add(index) as *mut u8;
464+
core::ptr::write_unaligned(ptr as *mut u16, value);
465+
}
466+
}
467+
468+
/// Read a u16 value from a CACHE code unit at `index`.
469+
///
470+
/// # Panics
471+
/// Panics if `index` is out of bounds.
472+
pub fn read_cache_u16(&self, index: usize) -> u16 {
473+
let units = unsafe { &*self.0.get() };
474+
assert!(index < units.len(), "read_cache_u16: index out of bounds");
475+
let ptr = units.as_ptr().wrapping_add(index) as *const u8;
476+
unsafe { core::ptr::read_unaligned(ptr as *const u16) }
477+
}
478+
479+
/// Write a u32 value across two consecutive CACHE code units starting at `index`.
480+
///
481+
/// # Safety
482+
/// Same requirements as `write_cache_u16`.
483+
pub unsafe fn write_cache_u32(&self, index: usize, value: u32) {
484+
unsafe {
485+
self.write_cache_u16(index, value as u16);
486+
self.write_cache_u16(index + 1, (value >> 16) as u16);
487+
}
488+
}
489+
490+
/// Read a u32 value from two consecutive CACHE code units starting at `index`.
491+
///
492+
/// # Panics
493+
/// Panics if `index + 1` is out of bounds.
494+
pub fn read_cache_u32(&self, index: usize) -> u32 {
495+
let lo = self.read_cache_u16(index) as u32;
496+
let hi = self.read_cache_u16(index + 1) as u32;
497+
lo | (hi << 16)
498+
}
499+
500+
/// Write a u64 value across four consecutive CACHE code units starting at `index`.
501+
///
502+
/// # Safety
503+
/// Same requirements as `write_cache_u16`.
504+
pub unsafe fn write_cache_u64(&self, index: usize, value: u64) {
505+
unsafe {
506+
self.write_cache_u32(index, value as u32);
507+
self.write_cache_u32(index + 2, (value >> 32) as u32);
508+
}
509+
}
510+
511+
/// Read a u64 value from four consecutive CACHE code units starting at `index`.
512+
///
513+
/// # Panics
514+
/// Panics if `index + 3` is out of bounds.
515+
pub fn read_cache_u64(&self, index: usize) -> u64 {
516+
let lo = self.read_cache_u32(index) as u64;
517+
let hi = self.read_cache_u32(index + 2) as u64;
518+
lo | (hi << 32)
519+
}
520+
521+
/// Read the adaptive counter from the first CACHE entry's `arg` byte.
522+
/// This preserves `op = Instruction::Cache`, unlike `read_cache_u16`.
523+
pub fn read_adaptive_counter(&self, index: usize) -> u8 {
524+
let units = unsafe { &*self.0.get() };
525+
u8::from(units[index].arg)
526+
}
527+
528+
/// Write the adaptive counter to the first CACHE entry's `arg` byte.
529+
/// This preserves `op = Instruction::Cache`, unlike `write_cache_u16`.
530+
///
531+
/// # Safety
532+
/// - `index` must be in bounds and point to a CACHE entry.
533+
pub unsafe fn write_adaptive_counter(&self, index: usize, value: u8) {
534+
let units = unsafe { &mut *self.0.get() };
535+
units[index].arg = OpArgByte::from(value);
536+
}
537+
538+
/// Produce a clean copy of the bytecode suitable for serialization
539+
/// (marshal) and `co_code`. Specialized opcodes are mapped back to their
540+
/// base variants via `deoptimize()` and all CACHE entries are zeroed.
541+
pub fn original_bytes(&self) -> Vec<u8> {
542+
let units = unsafe { &*self.0.get() };
543+
let mut out = Vec::with_capacity(units.len() * 2);
544+
let len = units.len();
545+
let mut i = 0;
546+
while i < len {
547+
let op = units[i].op.deoptimize();
548+
let caches = op.cache_entries();
549+
out.push(u8::from(op));
550+
out.push(u8::from(units[i].arg));
551+
// Zero-fill all CACHE entries (counter + cached data)
552+
for _ in 0..caches {
553+
i += 1;
554+
out.push(0); // op = Cache = 0
555+
out.push(0); // arg = 0
556+
}
557+
i += 1;
558+
}
559+
out
560+
}
561+
562+
/// Initialize adaptive warmup counters for all cacheable instructions.
563+
/// Called lazily at RESUME (first execution of a code object).
564+
/// Uses the `arg` byte of the first CACHE entry, preserving `op = Instruction::Cache`.
565+
pub fn quicken(&self) {
566+
let units = unsafe { &mut *self.0.get() };
567+
let len = units.len();
568+
let mut i = 0;
569+
while i < len {
570+
let op = units[i].op;
571+
let caches = op.cache_entries();
572+
if caches > 0 {
573+
// Don't write adaptive counter for instrumented opcodes;
574+
// specialization is skipped while monitoring is active.
575+
if !op.is_instrumented() {
576+
let cache_base = i + 1;
577+
if cache_base < len {
578+
units[cache_base].arg = OpArgByte::from(ADAPTIVE_WARMUP_VALUE);
579+
}
580+
}
581+
i += 1 + caches;
582+
} else {
583+
i += 1;
584+
}
585+
}
586+
}
444587
}
445588

446589
/// A Constant (which usually encapsulates data within it)

crates/compiler-core/src/bytecode/instruction.rs

Lines changed: 125 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,126 @@ impl Instruction {
512512
})
513513
}
514514

515+
/// Map a specialized opcode back to its adaptive (base) variant.
516+
/// `_PyOpcode_Deopt`
517+
pub fn deoptimize(self) -> Self {
518+
match self {
519+
// LOAD_ATTR specializations
520+
Self::LoadAttrClass
521+
| Self::LoadAttrClassWithMetaclassCheck
522+
| Self::LoadAttrGetattributeOverridden
523+
| Self::LoadAttrInstanceValue
524+
| Self::LoadAttrMethodLazyDict
525+
| Self::LoadAttrMethodNoDict
526+
| Self::LoadAttrMethodWithValues
527+
| Self::LoadAttrModule
528+
| Self::LoadAttrNondescriptorNoDict
529+
| Self::LoadAttrNondescriptorWithValues
530+
| Self::LoadAttrProperty
531+
| Self::LoadAttrSlot
532+
| Self::LoadAttrWithHint => Self::LoadAttr { idx: Arg::marker() },
533+
// BINARY_OP specializations
534+
Self::BinaryOpAddFloat
535+
| Self::BinaryOpAddInt
536+
| Self::BinaryOpAddUnicode
537+
| Self::BinaryOpExtend
538+
| Self::BinaryOpInplaceAddUnicode
539+
| Self::BinaryOpMultiplyFloat
540+
| Self::BinaryOpMultiplyInt
541+
| Self::BinaryOpSubscrDict
542+
| Self::BinaryOpSubscrGetitem
543+
| Self::BinaryOpSubscrListInt
544+
| Self::BinaryOpSubscrListSlice
545+
| Self::BinaryOpSubscrStrInt
546+
| Self::BinaryOpSubscrTupleInt
547+
| Self::BinaryOpSubtractFloat
548+
| Self::BinaryOpSubtractInt => Self::BinaryOp { op: Arg::marker() },
549+
// CALL specializations
550+
Self::CallAllocAndEnterInit
551+
| Self::CallBoundMethodExactArgs
552+
| Self::CallBoundMethodGeneral
553+
| Self::CallBuiltinClass
554+
| Self::CallBuiltinFast
555+
| Self::CallBuiltinFastWithKeywords
556+
| Self::CallBuiltinO
557+
| Self::CallIsinstance
558+
| Self::CallLen
559+
| Self::CallListAppend
560+
| Self::CallMethodDescriptorFast
561+
| Self::CallMethodDescriptorFastWithKeywords
562+
| Self::CallMethodDescriptorNoargs
563+
| Self::CallMethodDescriptorO
564+
| Self::CallNonPyGeneral
565+
| Self::CallPyExactArgs
566+
| Self::CallPyGeneral
567+
| Self::CallStr1
568+
| Self::CallTuple1
569+
| Self::CallType1 => Self::Call {
570+
nargs: Arg::marker(),
571+
},
572+
// CALL_KW specializations
573+
Self::CallKwBoundMethod | Self::CallKwNonPy | Self::CallKwPy => Self::CallKw {
574+
nargs: Arg::marker(),
575+
},
576+
// TO_BOOL specializations
577+
Self::ToBoolAlwaysTrue
578+
| Self::ToBoolBool
579+
| Self::ToBoolInt
580+
| Self::ToBoolList
581+
| Self::ToBoolNone
582+
| Self::ToBoolStr => Self::ToBool,
583+
// COMPARE_OP specializations
584+
Self::CompareOpFloat | Self::CompareOpInt | Self::CompareOpStr => {
585+
Self::CompareOp { op: Arg::marker() }
586+
}
587+
// CONTAINS_OP specializations
588+
Self::ContainsOpDict | Self::ContainsOpSet => Self::ContainsOp(Arg::marker()),
589+
// FOR_ITER specializations
590+
Self::ForIterGen | Self::ForIterList | Self::ForIterRange | Self::ForIterTuple => {
591+
Self::ForIter {
592+
target: Arg::marker(),
593+
}
594+
}
595+
// LOAD_GLOBAL specializations
596+
Self::LoadGlobalBuiltin | Self::LoadGlobalModule => Self::LoadGlobal(Arg::marker()),
597+
// STORE_ATTR specializations
598+
Self::StoreAttrInstanceValue | Self::StoreAttrSlot | Self::StoreAttrWithHint => {
599+
Self::StoreAttr { idx: Arg::marker() }
600+
}
601+
// LOAD_SUPER_ATTR specializations
602+
Self::LoadSuperAttrAttr | Self::LoadSuperAttrMethod => {
603+
Self::LoadSuperAttr { arg: Arg::marker() }
604+
}
605+
// STORE_SUBSCR specializations
606+
Self::StoreSubscrDict | Self::StoreSubscrListInt => Self::StoreSubscr,
607+
// UNPACK_SEQUENCE specializations
608+
Self::UnpackSequenceList | Self::UnpackSequenceTuple | Self::UnpackSequenceTwoTuple => {
609+
Self::UnpackSequence {
610+
size: Arg::marker(),
611+
}
612+
}
613+
// SEND specializations
614+
Self::SendGen => Self::Send {
615+
target: Arg::marker(),
616+
},
617+
// LOAD_CONST specializations
618+
Self::LoadConstImmortal | Self::LoadConstMortal => {
619+
Self::LoadConst { idx: Arg::marker() }
620+
}
621+
// RESUME specializations
622+
Self::ResumeCheck => Self::Resume { arg: Arg::marker() },
623+
// JUMP_BACKWARD specializations
624+
Self::JumpBackwardJit | Self::JumpBackwardNoJit => Self::JumpBackward {
625+
target: Arg::marker(),
626+
},
627+
// Instrumented opcodes map back to their base
628+
_ => match self.to_base() {
629+
Some(base) => base,
630+
None => self,
631+
},
632+
}
633+
}
634+
515635
/// Number of CACHE code units that follow this instruction.
516636
/// _PyOpcode_Caches
517637
pub fn cache_entries(self) -> usize {
@@ -626,8 +746,11 @@ impl Instruction {
626746
| Self::UnpackSequenceTuple
627747
| Self::UnpackSequenceTwoTuple => 1,
628748

629-
// Everything else: 0 cache entries
630-
_ => 0,
749+
// Instrumented opcodes have the same cache entries as their base
750+
_ => match self.to_base() {
751+
Some(base) => base.cache_entries(),
752+
None => 0,
753+
},
631754
}
632755
}
633756
}

crates/compiler-core/src/marshal.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -662,9 +662,8 @@ pub fn serialize_value<W: Write, D: Dumpable>(
662662

663663
pub fn serialize_code<W: Write, C: Constant>(buf: &mut W, code: &CodeObject<C>) {
664664
write_len(buf, code.instructions.len());
665-
// SAFETY: it's ok to transmute CodeUnit to [u8; 2]
666-
let (_, instructions_bytes, _) = unsafe { code.instructions.align_to() };
667-
buf.write_slice(instructions_bytes);
665+
let original = code.instructions.original_bytes();
666+
buf.write_slice(&original);
668667

669668
write_len(buf, code.locations.len());
670669
for (start, end) in &*code.locations {

0 commit comments

Comments (0)