Skip to content

Commit 5fb4bfc

Browse files
committed
cranelift: Remove f{min,max}_pseudo instructions
This commit removes these two instructions and replaces them instead with their equivalents using `fcmp` plus `select` or `bitselect` depending on the type (`bitselect` for vectors, `select` for scalars). The motivation for this commit is that incorrect optimizations for these instructions were removed in bytecodealliance#6859 and likely stemmed from the surprising definitions of these instructions. These originally were intended to correspond to operations in the SIMD proposal for WebAssembly but nowadays the functionality of these instructions is replaced with: * Lowering from wasm to clif uses the `fcmp` plus `select` combo instruction. * Backends that support optimizing this pattern use ISLE patterns to match the instruction and emit the specialization for the pseudo semantics. This means that while the instructions are removed here it should be the case that no functionality is lost and the output of Wasmtime/Cranelift should still be the same as it was before. Existing tests using the pseudo instructions were preserved except the riscv64 ones (where the lowering was deleted) and the dynamic AArch64 ones. Both s390x and x64 continue to have specialized patterns for this compare-plus-select.
1 parent 592ddc5 commit 5fb4bfc

File tree

25 files changed

+305
-589
lines changed

25 files changed

+305
-589
lines changed

cranelift/codegen/meta/src/shared/instructions.rs

Lines changed: 0 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -2931,24 +2931,6 @@ pub(crate) fn define(
29312931
]),
29322932
);
29332933

2934-
ig.push(
2935-
Inst::new(
2936-
"fmin_pseudo",
2937-
r#"
2938-
Floating point pseudo-minimum, propagating NaNs. This behaves differently from ``fmin``.
2939-
See <https://github.com/WebAssembly/simd/pull/122> for background.
2940-
2941-
The behaviour is defined as ``fmin_pseudo(a, b) = (b < a) ? b : a``, and the behaviour
2942-
for zero or NaN inputs follows from the behaviour of ``<`` with such inputs.
2943-
"#,
2944-
&formats.binary,
2945-
)
2946-
.operands_in(vec![Operand::new("x", Float), Operand::new("y", Float)])
2947-
.operands_out(vec![
2948-
Operand::new("a", Float).with_doc("The smaller of ``x`` and ``y``")
2949-
]),
2950-
);
2951-
29522934
ig.push(
29532935
Inst::new(
29542936
"fmax",
@@ -2968,24 +2950,6 @@ pub(crate) fn define(
29682950
]),
29692951
);
29702952

2971-
ig.push(
2972-
Inst::new(
2973-
"fmax_pseudo",
2974-
r#"
2975-
Floating point pseudo-maximum, propagating NaNs. This behaves differently from ``fmax``.
2976-
See <https://github.com/WebAssembly/simd/pull/122> for background.
2977-
2978-
The behaviour is defined as ``fmax_pseudo(a, b) = (a < b) ? b : a``, and the behaviour
2979-
for zero or NaN inputs follows from the behaviour of ``<`` with such inputs.
2980-
"#,
2981-
&formats.binary,
2982-
)
2983-
.operands_in(vec![Operand::new("x", Float), Operand::new("y", Float)])
2984-
.operands_out(vec![
2985-
Operand::new("a", Float).with_doc("The larger of ``x`` and ``y``")
2986-
]),
2987-
);
2988-
29892953
ig.push(
29902954
Inst::new(
29912955
"ceil",

cranelift/codegen/src/isa/aarch64/lower.isle

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -415,24 +415,6 @@
415415
(rule (lower (has_type (ty_scalar_float ty) (fmax rn rm)))
416416
(fpu_rrr (FPUOp2.Max) rn rm (scalar_size ty)))
417417

418-
;;;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
419-
420-
(rule -1 (lower (has_type ty @ (multi_lane _ _) (fmin_pseudo rm rn)))
421-
(bsl ty (vec_rrr (VecALUOp.Fcmgt) rm rn (vector_size ty)) rn rm))
422-
423-
(rule (lower (has_type (ty_scalar_float ty) (fmin_pseudo rm rn)))
424-
(with_flags (fpu_cmp (scalar_size ty) rm rn)
425-
(fpu_csel ty (Cond.Gt) rn rm)))
426-
427-
;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
428-
429-
(rule -1 (lower (has_type ty @ (multi_lane _ _) (fmax_pseudo rm rn)))
430-
(bsl ty (vec_rrr (VecALUOp.Fcmgt) rn rm (vector_size ty)) rn rm))
431-
432-
(rule (lower (has_type (ty_scalar_float ty) (fmax_pseudo rm rn)))
433-
(with_flags (fpu_cmp (scalar_size ty) rn rm)
434-
(fpu_csel ty (Cond.Gt) rn rm)))
435-
436418
;;;; Rules for `sqrt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
437419

438420
(rule -1 (lower (has_type ty @ (multi_lane _ _) (sqrt x)))

cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,18 +35,6 @@
3535
(rule -2 (lower (has_type ty @ (dynamic_lane _ _) (fmax x y)))
3636
(value_reg (vec_rrr (VecALUOp.Fmax) (put_in_reg x) (put_in_reg y) (vector_size ty))))
3737

38-
;;;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
39-
(rule -2 (lower (has_type ty @ (dynamic_lane _ _) (fmin_pseudo x y)))
40-
(value_reg (bsl ty
41-
(vec_rrr (VecALUOp.Fcmgt) (put_in_reg x) (put_in_reg y)
42-
(vector_size ty)) (put_in_reg y) (put_in_reg x))))
43-
44-
;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
45-
(rule -2 (lower (has_type ty @ (dynamic_lane _ _) (fmax_pseudo x y)))
46-
(value_reg (bsl ty
47-
(vec_rrr (VecALUOp.Fcmgt) (put_in_reg y) (put_in_reg x)
48-
(vector_size ty)) (put_in_reg y) (put_in_reg x))))
49-
5038
;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
5139
(rule -2 (lower (has_type (ty_dyn128_int ty) (snarrow x y)))
5240
(if-let _ (zero_value y))

cranelift/codegen/src/isa/riscv64/inst.isle

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -288,14 +288,6 @@
288288
(rs1 Reg)
289289
(rs2 Reg)
290290
(ty Type))
291-
(FloatSelectPseudo
292-
(op FloatSelectOP)
293-
(rd WritableReg)
294-
;; a integer register
295-
(tmp WritableReg)
296-
(rs1 Reg)
297-
(rs2 Reg)
298-
(ty Type))
299291

300292
;; popcnt if target doesn't support extension B
301293
;; use iteration to implement.
@@ -986,15 +978,6 @@
986978
(_ Unit (emit (MInst.FloatRound op rd tmp tmp2 rs ty))))
987979
(writable_reg_to_reg rd)))
988980

989-
(decl gen_float_select_pseudo (FloatSelectOP Reg Reg Type) Reg)
990-
(rule
991-
(gen_float_select_pseudo op x y ty)
992-
(let
993-
((rd WritableReg (temp_writable_reg ty))
994-
(tmp WritableXReg (temp_writable_xreg))
995-
(_ Unit (emit (MInst.FloatSelectPseudo op rd tmp x y ty))))
996-
(writable_reg_to_reg rd)))
997-
998981
(decl gen_float_select (FloatSelectOP Reg Reg Type) Reg)
999982
(rule
1000983
(gen_float_select op x y ty)

cranelift/codegen/src/isa/riscv64/inst/emit.rs

Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,6 @@ impl Inst {
458458
| Inst::DummyUse { .. }
459459
| Inst::FloatRound { .. }
460460
| Inst::FloatSelect { .. }
461-
| Inst::FloatSelectPseudo { .. }
462461
| Inst::Popcnt { .. }
463462
| Inst::Rev8 { .. }
464463
| Inst::Cltz { .. }
@@ -2242,53 +2241,6 @@ impl MachInstEmit for Inst {
22422241
Inst::gen_move(rd, rs, ty).emit(&[], sink, emit_info, state);
22432242
sink.bind_label(label_jump_over, &mut state.ctrl_plane);
22442243
}
2245-
&Inst::FloatSelectPseudo {
2246-
op,
2247-
rd,
2248-
tmp,
2249-
rs1,
2250-
rs2,
2251-
ty,
2252-
} => {
2253-
let rs1 = allocs.next(rs1);
2254-
let rs2 = allocs.next(rs2);
2255-
let tmp = allocs.next_writable(tmp);
2256-
let rd = allocs.next_writable(rd);
2257-
let label_rs2 = sink.get_label();
2258-
let label_jump_over = sink.get_label();
2259-
let lt_op = if ty == F32 {
2260-
FpuOPRRR::FltS
2261-
} else {
2262-
FpuOPRRR::FltD
2263-
};
2264-
Inst::FpuRRR {
2265-
alu_op: lt_op,
2266-
frm: None,
2267-
rd: tmp,
2268-
rs1: if op == FloatSelectOP::Max { rs1 } else { rs2 },
2269-
rs2: if op == FloatSelectOP::Max { rs2 } else { rs1 },
2270-
}
2271-
.emit(&[], sink, emit_info, state);
2272-
Inst::CondBr {
2273-
taken: BranchTarget::Label(label_rs2),
2274-
not_taken: BranchTarget::zero(),
2275-
kind: IntegerCompare {
2276-
kind: IntCC::NotEqual,
2277-
rs1: tmp.to_reg(),
2278-
rs2: zero_reg(),
2279-
},
2280-
}
2281-
.emit(&[], sink, emit_info, state);
2282-
// here select rs1 as result.
2283-
Inst::gen_move(rd, rs1, ty).emit(&[], sink, emit_info, state);
2284-
Inst::Jal {
2285-
dest: BranchTarget::Label(label_jump_over),
2286-
}
2287-
.emit(&[], sink, emit_info, state);
2288-
sink.bind_label(label_rs2, &mut state.ctrl_plane);
2289-
Inst::gen_move(rd, rs2, ty).emit(&[], sink, emit_info, state);
2290-
sink.bind_label(label_jump_over, &mut state.ctrl_plane);
2291-
}
22922244

22932245
&Inst::FloatSelect {
22942246
op,

cranelift/codegen/src/isa/riscv64/inst/mod.rs

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -636,13 +636,6 @@ fn riscv64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
636636
collector.reg_early_def(tmp);
637637
collector.reg_early_def(rd);
638638
}
639-
&Inst::FloatSelectPseudo {
640-
rd, tmp, rs1, rs2, ..
641-
} => {
642-
collector.reg_uses(&[rs1, rs2]);
643-
collector.reg_early_def(tmp);
644-
collector.reg_early_def(rd);
645-
}
646639
&Inst::Popcnt {
647640
sum, step, rs, tmp, ..
648641
} => {
@@ -1136,29 +1129,6 @@ impl Inst {
11361129
ty
11371130
)
11381131
}
1139-
&Inst::FloatSelectPseudo {
1140-
op,
1141-
rd,
1142-
tmp,
1143-
rs1,
1144-
rs2,
1145-
ty,
1146-
} => {
1147-
let rs1 = format_reg(rs1, allocs);
1148-
let rs2 = format_reg(rs2, allocs);
1149-
let tmp = format_reg(tmp.to_reg(), allocs);
1150-
let rd = format_reg(rd.to_reg(), allocs);
1151-
format!(
1152-
"f{}.{}.pseudo {},{},{}##tmp={} ty={}",
1153-
op.op_name(),
1154-
if ty == F32 { "s" } else { "d" },
1155-
rd,
1156-
rs1,
1157-
rs2,
1158-
tmp,
1159-
ty
1160-
)
1161-
}
11621132
&Inst::FloatSelect {
11631133
op,
11641134
rd,

cranelift/codegen/src/isa/riscv64/lower.isle

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1300,24 +1300,6 @@
13001300
(max VReg (rv_vfmax_vv x y (unmasked) ty)))
13011301
(rv_vmerge_vvm vec_nan max is_not_nan ty)))
13021302

1303-
;;;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1304-
1305-
(rule 0 (lower (has_type (ty_scalar_float ty) (fmin_pseudo x y)))
1306-
(gen_float_select_pseudo (FloatSelectOP.Min) x y ty))
1307-
1308-
(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fmin_pseudo x y)))
1309-
(let ((mask VReg (gen_fcmp_mask ty (FloatCC.LessThan) y x)))
1310-
(rv_vmerge_vvm x y mask ty)))
1311-
1312-
;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1313-
1314-
(rule 0 (lower (has_type (ty_scalar_float ty) (fmax_pseudo x y)))
1315-
(gen_float_select_pseudo (FloatSelectOP.Max) x y ty))
1316-
1317-
(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fmax_pseudo x y)))
1318-
(let ((mask VReg (gen_fcmp_mask ty (FloatCC.LessThan) x y)))
1319-
(rv_vmerge_vvm x y mask ty)))
1320-
13211303
;;;;; Rules for `stack_addr`;;;;;;;;;
13221304
(rule
13231305
(lower (stack_addr ss offset))

cranelift/codegen/src/isa/s390x/lower.isle

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,6 +1134,13 @@
11341134
(rule (lower (has_type (vr128_ty ty) (bitselect x y z)))
11351135
(vec_select ty y z x))
11361136

1137+
;; Special-case some float-selection instructions for min/max
1138+
(rule 3 (lower (has_type (ty_vec128 ty) (bitselect (bitcast _ (fcmp (FloatCC.LessThan) x y)) x y)))
1139+
(fmin_pseudo_reg ty y x))
1140+
(rule 4 (lower (has_type (ty_vec128 ty) (bitselect (bitcast _ (fcmp (FloatCC.LessThan) y x)) x y)))
1141+
(fmax_pseudo_reg ty y x))
1142+
1143+
11371144

11381145
;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
11391146

@@ -1389,20 +1396,6 @@
13891396
(fmax_reg ty x y))
13901397

13911398

1392-
;;;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1393-
1394-
;; Minimum of two registers.
1395-
(rule (lower (has_type ty (fmin_pseudo x y)))
1396-
(fmin_pseudo_reg ty x y))
1397-
1398-
1399-
;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1400-
1401-
;; Maximum of two registers.
1402-
(rule (lower (has_type ty (fmax_pseudo x y)))
1403-
(fmax_pseudo_reg ty x y))
1404-
1405-
14061399
;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
14071400

14081401
;; Copysign of two registers.
@@ -3719,6 +3712,12 @@
37193712
(select_bool_reg ty (value_nonzero val_cond)
37203713
(put_in_reg val_true) (put_in_reg val_false)))
37213714

3715+
;; Special-case some float-selection instructions for min/max
3716+
(rule 1 (lower (has_type (ty_scalar_float ty) (select (maybe_uextend (fcmp (FloatCC.LessThan) x y)) x y)))
3717+
(fmin_pseudo_reg ty y x))
3718+
(rule 2 (lower (has_type (ty_scalar_float ty) (select (maybe_uextend (fcmp (FloatCC.LessThan) y x)) x y)))
3719+
(fmax_pseudo_reg ty y x))
3720+
37223721

37233722
;;;; Rules for `select_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
37243723

cranelift/codegen/src/isa/x64/lower.isle

Lines changed: 27 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1383,6 +1383,21 @@
13831383
(decl pure vconst_all_ones_or_all_zeros () Constant)
13841384
(extern extractor vconst_all_ones_or_all_zeros vconst_all_ones_or_all_zeros)
13851385

1386+
;; Specializations for floating-pointer compares to generate a `minp*` or a
1387+
;; `maxp*` instruction. These are equivalent to the wasm `f32x4.{pmin,pmax}`
1388+
;; instructions and how they're lowered into CLIF. Note the careful ordering
1389+
;; of all the operands here to ensure that the input CLIF matched is implemented
1390+
;; by the corresponding x64 instruction.
1391+
(rule 2 (lower (has_type $F32X4 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) x y)) x y)))
1392+
(x64_minps x y))
1393+
(rule 2 (lower (has_type $F64X2 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) x y)) x y)))
1394+
(x64_minpd x y))
1395+
1396+
(rule 3 (lower (has_type $F32X4 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) y x)) x y)))
1397+
(x64_maxps x y))
1398+
(rule 3 (lower (has_type $F64X2 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) y x)) x y)))
1399+
(x64_maxpd x y))
1400+
13861401
;;;; Rules for `x86_blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
13871402

13881403
(rule (lower (has_type $I8X16
@@ -2021,6 +2036,18 @@
20212036
(let ((cond_result IcmpCondResult (cmp_zero_i128 (CC.Z) c)))
20222037
(select_icmp cond_result x y)))
20232038

2039+
;; Specializations for floating-point compares to generate a `mins*` or a
2040+
;; `maxs*` instruction. These are equivalent to the "pseudo-m{in,ax}"
2041+
;; specializations for vectors.
2042+
(rule 2 (lower (has_type $F32 (select (maybe_uextend (fcmp (FloatCC.LessThan) x y)) x y)))
2043+
(x64_minss x y))
2044+
(rule 2 (lower (has_type $F64 (select (maybe_uextend (fcmp (FloatCC.LessThan) x y)) x y)))
2045+
(x64_minsd x y))
2046+
(rule 3 (lower (has_type $F32 (select (maybe_uextend (fcmp (FloatCC.LessThan) y x)) x y)))
2047+
(x64_maxss x y))
2048+
(rule 3 (lower (has_type $F64 (select (maybe_uextend (fcmp (FloatCC.LessThan) y x)) x y)))
2049+
(x64_maxsd x y))
2050+
20242051
;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
20252052

20262053
;; If available, we can use a plain lzcnt instruction here. Note no
@@ -2677,28 +2704,6 @@
26772704
(final Xmm (x64_andnpd nan_fraction_mask max_blended_nan_positive)))
26782705
final))
26792706

2680-
;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2681-
2682-
(rule (lower (has_type $F32 (fmin_pseudo x y)))
2683-
(x64_minss y x))
2684-
(rule (lower (has_type $F64 (fmin_pseudo x y)))
2685-
(x64_minsd y x))
2686-
(rule (lower (has_type $F32X4 (fmin_pseudo x y)))
2687-
(x64_minps y x))
2688-
(rule (lower (has_type $F64X2 (fmin_pseudo x y)))
2689-
(x64_minpd y x))
2690-
2691-
;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2692-
2693-
(rule (lower (has_type $F32 (fmax_pseudo x y)))
2694-
(x64_maxss y x))
2695-
(rule (lower (has_type $F64 (fmax_pseudo x y)))
2696-
(x64_maxsd y x))
2697-
(rule (lower (has_type $F32X4 (fmax_pseudo x y)))
2698-
(x64_maxps y x))
2699-
(rule (lower (has_type $F64X2 (fmax_pseudo x y)))
2700-
(x64_maxpd y x))
2701-
27022707
;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
27032708

27042709
;; Base case for fma is to call out to one of two libcalls. For vectors they

cranelift/codegen/src/opts/selects.isle

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,4 +43,3 @@
4343
(rule (simplify (bitselect ty @ (multi_lane _ _) (sge _ x y) y x)) (smin ty x y))
4444
(rule (simplify (bitselect ty @ (multi_lane _ _) (ugt _ x y) y x)) (umin ty x y))
4545
(rule (simplify (bitselect ty @ (multi_lane _ _) (uge _ x y) y x)) (umin ty x y))
46-

0 commit comments

Comments
 (0)