From 5fb4bfc485df5052fb79f250af9b567093b2f52a Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Fri, 18 Aug 2023 14:44:24 -0700 Subject: [PATCH] cranelift: Remove `f{min,max}_pseudo` instructions This commit removes these two instructions and replaces them instead with their equivalents using `fcmp` plus `select` or `bitselect` depending on the type (`bitselect` for vectors, `select` for scalars). The motivation for this commit is that incorrect optimizations for these instructions were removed in #6859 and likely stemmed from the surprising definitions of these instructions. These originally were intended to correspond to operations in the SIMD proposal for WebAssembly but nowadays the functionality of these instructions is replaced with: * Lowering from wasm to clif uses the `fcmp` plus `select` combo instruction. * Backends that support optimizing this pattern use ISLE patterns to match the instruction and emit the specialization for the pseudo semantics. This means that while the instructions are removed here it should be the case that no functionality is lost and the output of Wasmtime/Cranelift should still be the same as it was before. Existing tests using the pseudo instructions were preserved except the riscv64 ones (where the lowering was deleted) and the dynamic AArch64 ones. Both s390x and x64 continue to have specialized patterns for this compare-plus-select. 
--- .../codegen/meta/src/shared/instructions.rs | 36 ----- cranelift/codegen/src/isa/aarch64/lower.isle | 18 --- .../src/isa/aarch64/lower_dynamic_neon.isle | 12 -- cranelift/codegen/src/isa/riscv64/inst.isle | 17 --- .../codegen/src/isa/riscv64/inst/emit.rs | 48 ------- cranelift/codegen/src/isa/riscv64/inst/mod.rs | 30 ---- cranelift/codegen/src/isa/riscv64/lower.isle | 18 --- cranelift/codegen/src/isa/s390x/lower.isle | 27 ++-- cranelift/codegen/src/isa/x64/lower.isle | 49 ++++--- cranelift/codegen/src/opts/selects.isle | 1 - .../isa/aarch64/dynamic-simd-neon.clif | 57 -------- .../isa/riscv64/simd-fmax-pseudo.clif | 92 ------------ .../isa/riscv64/simd-fmin-pseudo.clif | 92 ------------ .../filetests/isa/s390x/floating-point.clif | 20 +-- .../filetests/filetests/isa/s390x/vec-fp.clif | 24 ++-- .../filetests/isa/x64/float-avx.clif | 20 +-- .../runtests/dynamic-simd-arithmetic.clif | 52 ------- .../filetests/runtests/fmax-pseudo.clif | 28 ++-- .../filetests/runtests/fmin-pseudo.clif | 28 ++-- .../filetests/runtests/issue5569.clif | 6 +- .../runtests/simd-fmin-max-pseudo.clif | 24 ++-- .../filetests/filetests/wasm/x64-pmin.wat | 136 ++++++++++++++++++ cranelift/fuzzgen/src/function_generator.rs | 4 - cranelift/interpreter/src/step.rs | 10 -- cranelift/wasm/src/code_translator.rs | 45 ++++-- 25 files changed, 305 insertions(+), 589 deletions(-) delete mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-fmax-pseudo.clif delete mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-fmin-pseudo.clif create mode 100644 cranelift/filetests/filetests/wasm/x64-pmin.wat diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index 14e397c16b69..ff5aed499193 100644 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -2931,24 +2931,6 @@ pub(crate) fn define( ]), ); - ig.push( - Inst::new( - "fmin_pseudo", - r#" - Floating point pseudo-minimum, 
propagating NaNs. This behaves differently from ``fmin``. - See for background. - - The behaviour is defined as ``fmin_pseudo(a, b) = (b < a) ? b : a``, and the behaviour - for zero or NaN inputs follows from the behaviour of ``<`` with such inputs. - "#, - &formats.binary, - ) - .operands_in(vec![Operand::new("x", Float), Operand::new("y", Float)]) - .operands_out(vec![ - Operand::new("a", Float).with_doc("The smaller of ``x`` and ``y``") - ]), - ); - ig.push( Inst::new( "fmax", @@ -2968,24 +2950,6 @@ pub(crate) fn define( ]), ); - ig.push( - Inst::new( - "fmax_pseudo", - r#" - Floating point pseudo-maximum, propagating NaNs. This behaves differently from ``fmax``. - See for background. - - The behaviour is defined as ``fmax_pseudo(a, b) = (a < b) ? b : a``, and the behaviour - for zero or NaN inputs follows from the behaviour of ``<`` with such inputs. - "#, - &formats.binary, - ) - .operands_in(vec![Operand::new("x", Float), Operand::new("y", Float)]) - .operands_out(vec![ - Operand::new("a", Float).with_doc("The larger of ``x`` and ``y``") - ]), - ); - ig.push( Inst::new( "ceil", diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index 9d51812f2fc5..80fcec000ec9 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -415,24 +415,6 @@ (rule (lower (has_type (ty_scalar_float ty) (fmax rn rm))) (fpu_rrr (FPUOp2.Max) rn rm (scalar_size ty))) -;;;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule -1 (lower (has_type ty @ (multi_lane _ _) (fmin_pseudo rm rn))) - (bsl ty (vec_rrr (VecALUOp.Fcmgt) rm rn (vector_size ty)) rn rm)) - -(rule (lower (has_type (ty_scalar_float ty) (fmin_pseudo rm rn))) - (with_flags (fpu_cmp (scalar_size ty) rm rn) - (fpu_csel ty (Cond.Gt) rn rm))) - -;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule -1 (lower (has_type ty @ (multi_lane _ _) 
(fmax_pseudo rm rn))) - (bsl ty (vec_rrr (VecALUOp.Fcmgt) rn rm (vector_size ty)) rn rm)) - -(rule (lower (has_type (ty_scalar_float ty) (fmax_pseudo rm rn))) - (with_flags (fpu_cmp (scalar_size ty) rn rm) - (fpu_csel ty (Cond.Gt) rn rm))) - ;;;; Rules for `sqrt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule -1 (lower (has_type ty @ (multi_lane _ _) (sqrt x))) diff --git a/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle b/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle index 54adb887b87c..a3cd463c9e4f 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle +++ b/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle @@ -35,18 +35,6 @@ (rule -2 (lower (has_type ty @ (dynamic_lane _ _) (fmax x y))) (value_reg (vec_rrr (VecALUOp.Fmax) (put_in_reg x) (put_in_reg y) (vector_size ty)))) -;;;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule -2 (lower (has_type ty @ (dynamic_lane _ _) (fmin_pseudo x y))) - (value_reg (bsl ty - (vec_rrr (VecALUOp.Fcmgt) (put_in_reg x) (put_in_reg y) - (vector_size ty)) (put_in_reg y) (put_in_reg x)))) - -;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule -2 (lower (has_type ty @ (dynamic_lane _ _) (fmax_pseudo x y))) - (value_reg (bsl ty - (vec_rrr (VecALUOp.Fcmgt) (put_in_reg y) (put_in_reg x) - (vector_size ty)) (put_in_reg y) (put_in_reg x)))) - ;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule -2 (lower (has_type (ty_dyn128_int ty) (snarrow x y))) (if-let _ (zero_value y)) diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle index b7576a55317b..3b826ec46c84 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -288,14 +288,6 @@ (rs1 Reg) (rs2 Reg) (ty Type)) - (FloatSelectPseudo - (op FloatSelectOP) - (rd WritableReg) - ;; a integer register - (tmp 
WritableReg) - (rs1 Reg) - (rs2 Reg) - (ty Type)) ;; popcnt if target doesn't support extension B ;; use iteration to implement. @@ -986,15 +978,6 @@ (_ Unit (emit (MInst.FloatRound op rd tmp tmp2 rs ty)))) (writable_reg_to_reg rd))) -(decl gen_float_select_pseudo (FloatSelectOP Reg Reg Type) Reg) -(rule - (gen_float_select_pseudo op x y ty) - (let - ((rd WritableReg (temp_writable_reg ty)) - (tmp WritableXReg (temp_writable_xreg)) - (_ Unit (emit (MInst.FloatSelectPseudo op rd tmp x y ty)))) - (writable_reg_to_reg rd))) - (decl gen_float_select (FloatSelectOP Reg Reg Type) Reg) (rule (gen_float_select op x y ty) diff --git a/cranelift/codegen/src/isa/riscv64/inst/emit.rs b/cranelift/codegen/src/isa/riscv64/inst/emit.rs index 0e750bd608ca..a0a724f5ecf8 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/emit.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/emit.rs @@ -458,7 +458,6 @@ impl Inst { | Inst::DummyUse { .. } | Inst::FloatRound { .. } | Inst::FloatSelect { .. } - | Inst::FloatSelectPseudo { .. } | Inst::Popcnt { .. } | Inst::Rev8 { .. } | Inst::Cltz { .. 
} @@ -2242,53 +2241,6 @@ impl MachInstEmit for Inst { Inst::gen_move(rd, rs, ty).emit(&[], sink, emit_info, state); sink.bind_label(label_jump_over, &mut state.ctrl_plane); } - &Inst::FloatSelectPseudo { - op, - rd, - tmp, - rs1, - rs2, - ty, - } => { - let rs1 = allocs.next(rs1); - let rs2 = allocs.next(rs2); - let tmp = allocs.next_writable(tmp); - let rd = allocs.next_writable(rd); - let label_rs2 = sink.get_label(); - let label_jump_over = sink.get_label(); - let lt_op = if ty == F32 { - FpuOPRRR::FltS - } else { - FpuOPRRR::FltD - }; - Inst::FpuRRR { - alu_op: lt_op, - frm: None, - rd: tmp, - rs1: if op == FloatSelectOP::Max { rs1 } else { rs2 }, - rs2: if op == FloatSelectOP::Max { rs2 } else { rs1 }, - } - .emit(&[], sink, emit_info, state); - Inst::CondBr { - taken: BranchTarget::Label(label_rs2), - not_taken: BranchTarget::zero(), - kind: IntegerCompare { - kind: IntCC::NotEqual, - rs1: tmp.to_reg(), - rs2: zero_reg(), - }, - } - .emit(&[], sink, emit_info, state); - // here select rs1 as result. - Inst::gen_move(rd, rs1, ty).emit(&[], sink, emit_info, state); - Inst::Jal { - dest: BranchTarget::Label(label_jump_over), - } - .emit(&[], sink, emit_info, state); - sink.bind_label(label_rs2, &mut state.ctrl_plane); - Inst::gen_move(rd, rs2, ty).emit(&[], sink, emit_info, state); - sink.bind_label(label_jump_over, &mut state.ctrl_plane); - } &Inst::FloatSelect { op, diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs index 786c958ad4da..a6c90991ec66 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs @@ -636,13 +636,6 @@ fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut Operan collector.reg_early_def(tmp); collector.reg_early_def(rd); } - &Inst::FloatSelectPseudo { - rd, tmp, rs1, rs2, .. - } => { - collector.reg_uses(&[rs1, rs2]); - collector.reg_early_def(tmp); - collector.reg_early_def(rd); - } &Inst::Popcnt { sum, step, rs, tmp, .. 
} => { @@ -1136,29 +1129,6 @@ impl Inst { ty ) } - &Inst::FloatSelectPseudo { - op, - rd, - tmp, - rs1, - rs2, - ty, - } => { - let rs1 = format_reg(rs1, allocs); - let rs2 = format_reg(rs2, allocs); - let tmp = format_reg(tmp.to_reg(), allocs); - let rd = format_reg(rd.to_reg(), allocs); - format!( - "f{}.{}.pseudo {},{},{}##tmp={} ty={}", - op.op_name(), - if ty == F32 { "s" } else { "d" }, - rd, - rs1, - rs2, - tmp, - ty - ) - } &Inst::FloatSelect { op, rd, diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index 1dec264e41da..e7cb2f5c4b20 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -1300,24 +1300,6 @@ (max VReg (rv_vfmax_vv x y (unmasked) ty))) (rv_vmerge_vvm vec_nan max is_not_nan ty))) -;;;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule 0 (lower (has_type (ty_scalar_float ty) (fmin_pseudo x y))) - (gen_float_select_pseudo (FloatSelectOP.Min) x y ty)) - -(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fmin_pseudo x y))) - (let ((mask VReg (gen_fcmp_mask ty (FloatCC.LessThan) y x))) - (rv_vmerge_vvm x y mask ty))) - -;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule 0 (lower (has_type (ty_scalar_float ty) (fmax_pseudo x y))) - (gen_float_select_pseudo (FloatSelectOP.Max) x y ty)) - -(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fmax_pseudo x y))) - (let ((mask VReg (gen_fcmp_mask ty (FloatCC.LessThan) x y))) - (rv_vmerge_vvm x y mask ty))) - ;;;;; Rules for `stack_addr`;;;;;;;;; (rule (lower (stack_addr ss offset)) diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle index 2b286aeeb2e4..c7bfa335f8b6 100644 --- a/cranelift/codegen/src/isa/s390x/lower.isle +++ b/cranelift/codegen/src/isa/s390x/lower.isle @@ -1134,6 +1134,13 @@ (rule (lower (has_type (vr128_ty ty) (bitselect x y z))) (vec_select ty y z x)) +;; 
Special-case some float-selection instructions for min/max +(rule 3 (lower (has_type (ty_vec128 ty) (bitselect (bitcast _ (fcmp (FloatCC.LessThan) x y)) x y))) + (fmin_pseudo_reg ty y x)) +(rule 4 (lower (has_type (ty_vec128 ty) (bitselect (bitcast _ (fcmp (FloatCC.LessThan) y x)) x y))) + (fmax_pseudo_reg ty y x)) + + ;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1389,20 +1396,6 @@ (fmax_reg ty x y)) -;;;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; Minimum of two registers. -(rule (lower (has_type ty (fmin_pseudo x y))) - (fmin_pseudo_reg ty x y)) - - -;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; Maximum of two registers. -(rule (lower (has_type ty (fmax_pseudo x y))) - (fmax_pseudo_reg ty x y)) - - ;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Copysign of two registers. @@ -3719,6 +3712,12 @@ (select_bool_reg ty (value_nonzero val_cond) (put_in_reg val_true) (put_in_reg val_false))) +;; Special-case some float-selection instructions for min/max +(rule 1 (lower (has_type (ty_scalar_float ty) (select (maybe_uextend (fcmp (FloatCC.LessThan) x y)) x y))) + (fmin_pseudo_reg ty y x)) +(rule 2 (lower (has_type (ty_scalar_float ty) (select (maybe_uextend (fcmp (FloatCC.LessThan) y x)) x y))) + (fmax_pseudo_reg ty y x)) + ;;;; Rules for `select_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 3c36f64c44ac..ee33d56814c6 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1383,6 +1383,21 @@ (decl pure vconst_all_ones_or_all_zeros () Constant) (extern extractor vconst_all_ones_or_all_zeros vconst_all_ones_or_all_zeros) +;; Specializations for floating-point compares to generate a `minp*` or a +;; `maxp*` instruction.
These are equivalent to the wasm `f32x4.{pmin,pmax}` +;; instructions and how they're lowered into CLIF. Note the careful ordering +;; of all the operands here to ensure that the input CLIF matched is implemented +;; by the corresponding x64 instruction. +(rule 2 (lower (has_type $F32X4 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) x y)) x y))) + (x64_minps x y)) +(rule 2 (lower (has_type $F64X2 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) x y)) x y))) + (x64_minpd x y)) + +(rule 3 (lower (has_type $F32X4 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) y x)) x y))) + (x64_maxps x y)) +(rule 3 (lower (has_type $F64X2 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) y x)) x y))) + (x64_maxpd x y)) + ;;;; Rules for `x86_blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $I8X16 @@ -2021,6 +2036,18 @@ (let ((cond_result IcmpCondResult (cmp_zero_i128 (CC.Z) c))) (select_icmp cond_result x y))) +;; Specializations for floating-point compares to generate a `mins*` or a +;; `maxs*` instruction. These are equivalent to the "pseudo-m{in,ax}" +;; specializations for vectors. +(rule 2 (lower (has_type $F32 (select (maybe_uextend (fcmp (FloatCC.LessThan) x y)) x y))) + (x64_minss x y)) +(rule 2 (lower (has_type $F64 (select (maybe_uextend (fcmp (FloatCC.LessThan) x y)) x y))) + (x64_minsd x y)) +(rule 3 (lower (has_type $F32 (select (maybe_uextend (fcmp (FloatCC.LessThan) y x)) x y))) + (x64_maxss x y)) +(rule 3 (lower (has_type $F64 (select (maybe_uextend (fcmp (FloatCC.LessThan) y x)) x y))) + (x64_maxsd x y)) + ;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; If available, we can use a plain lzcnt instruction here. 
Note no @@ -2677,28 +2704,6 @@ (final Xmm (x64_andnpd nan_fraction_mask max_blended_nan_positive))) final)) -;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule (lower (has_type $F32 (fmin_pseudo x y))) - (x64_minss y x)) -(rule (lower (has_type $F64 (fmin_pseudo x y))) - (x64_minsd y x)) -(rule (lower (has_type $F32X4 (fmin_pseudo x y))) - (x64_minps y x)) -(rule (lower (has_type $F64X2 (fmin_pseudo x y))) - (x64_minpd y x)) - -;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule (lower (has_type $F32 (fmax_pseudo x y))) - (x64_maxss y x)) -(rule (lower (has_type $F64 (fmax_pseudo x y))) - (x64_maxsd y x)) -(rule (lower (has_type $F32X4 (fmax_pseudo x y))) - (x64_maxps y x)) -(rule (lower (has_type $F64X2 (fmax_pseudo x y))) - (x64_maxpd y x)) - ;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Base case for fma is to call out to one of two libcalls. For vectors they diff --git a/cranelift/codegen/src/opts/selects.isle b/cranelift/codegen/src/opts/selects.isle index b0a2ca9dde0c..aef9cd7f7865 100644 --- a/cranelift/codegen/src/opts/selects.isle +++ b/cranelift/codegen/src/opts/selects.isle @@ -43,4 +43,3 @@ (rule (simplify (bitselect ty @ (multi_lane _ _) (sge _ x y) y x)) (smin ty x y)) (rule (simplify (bitselect ty @ (multi_lane _ _) (ugt _ x y) y x)) (umin ty x y)) (rule (simplify (bitselect ty @ (multi_lane _ _) (uge _ x y) y x)) (umin ty x y)) - diff --git a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif index 57c26708a67c..c2969661e014 100644 --- a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif +++ b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif @@ -260,60 +260,3 @@ block0(v0: f64, v1: f64): ; dup v6.2d, v1.d[0] ; fmax v0.2d, v5.2d, v6.2d ; ret - -function %f64x2_splat_min_pseudo(f64, f64) -> f64x2 { - gv0 = 
dyn_scale_target_const.f64x2 - dt0 = f64x2*gv0 - -block0(v0: f64, v1: f64): - v2 = splat.dt0 v0 - v3 = splat.dt0 v1 - v4 = fmin_pseudo v2, v3 - v5 = extract_vector v4, 0 - return v5 -} - -; VCode: -; block0: -; dup v6.2d, v0.d[0] -; dup v7.2d, v1.d[0] -; fcmgt v0.2d, v6.2d, v7.2d -; bsl v0.16b, v0.16b, v7.16b, v6.16b -; ret -; -; Disassembled: -; block0: ; offset 0x0 -; dup v6.2d, v0.d[0] -; dup v7.2d, v1.d[0] -; fcmgt v0.2d, v6.2d, v7.2d -; bsl v0.16b, v7.16b, v6.16b -; ret - -function %f64x2_splat_max_pseudo(f64, f64) -> f64x2 { - gv0 = dyn_scale_target_const.f64x2 - dt0 = f64x2*gv0 - -block0(v0: f64, v1: f64): - v2 = splat.dt0 v0 - v3 = splat.dt0 v1 - v4 = fmax_pseudo v2, v3 - v5 = extract_vector v4, 0 - return v5 -} - -; VCode: -; block0: -; dup v6.2d, v0.d[0] -; dup v7.2d, v1.d[0] -; fcmgt v0.2d, v7.2d, v6.2d -; bsl v0.16b, v0.16b, v7.16b, v6.16b -; ret -; -; Disassembled: -; block0: ; offset 0x0 -; dup v6.2d, v0.d[0] -; dup v7.2d, v1.d[0] -; fcmgt v0.2d, v7.2d, v6.2d -; bsl v0.16b, v7.16b, v6.16b -; ret - diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-fmax-pseudo.clif b/cranelift/filetests/filetests/isa/riscv64/simd-fmax-pseudo.clif deleted file mode 100644 index c1b3f21cffb4..000000000000 --- a/cranelift/filetests/filetests/isa/riscv64/simd-fmax-pseudo.clif +++ /dev/null @@ -1,92 +0,0 @@ -test compile precise-output -set unwind_info=false -target riscv64 has_v - -function %fmax_pseudo_f32x4(f32x4, f32x4) -> f32x4 { -block0(v0:f32x4, v1:f32x4): - v2 = fmax_pseudo v0, v1 - return v2 -} - -; VCode: -; add sp,-16 -; sd ra,8(sp) -; sd fp,0(sp) -; mv fp,sp -; block0: -; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vmflt.vv v0,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma) -; vmerge.vvm v8,v1,v3,v0.t #avl=4, #vtype=(e32, m1, ta, ma) -; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; ld ra,8(sp) -; ld fp,0(sp) -; add sp,+16 -; ret -; -; Disassembled: -; block0: ; offset 0x0 -; addi sp, sp, -0x10 
-; sd ra, 8(sp) -; sd s0, 0(sp) -; ori s0, sp, 0 -; block1: ; offset 0x10 -; .byte 0x57, 0x70, 0x08, 0xcc -; addi t6, s0, 0x10 -; .byte 0x87, 0x80, 0x0f, 0x02 -; addi t6, s0, 0x20 -; .byte 0x87, 0x81, 0x0f, 0x02 -; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x57, 0x90, 0x11, 0x6e -; .byte 0x57, 0x84, 0x11, 0x5c -; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x27, 0x04, 0x05, 0x02 -; ld ra, 8(sp) -; ld s0, 0(sp) -; addi sp, sp, 0x10 -; ret - -function %fmax_pseudo_f64x2(f64x2, f64x2) -> f64x2 { -block0(v0:f64x2, v1:f64x2): - v2 = fmax_pseudo v0, v1 - return v2 -} - -; VCode: -; add sp,-16 -; sd ra,8(sp) -; sd fp,0(sp) -; mv fp,sp -; block0: -; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vmflt.vv v0,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma) -; vmerge.vvm v8,v1,v3,v0.t #avl=2, #vtype=(e64, m1, ta, ma) -; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; ld ra,8(sp) -; ld fp,0(sp) -; add sp,+16 -; ret -; -; Disassembled: -; block0: ; offset 0x0 -; addi sp, sp, -0x10 -; sd ra, 8(sp) -; sd s0, 0(sp) -; ori s0, sp, 0 -; block1: ; offset 0x10 -; .byte 0x57, 0x70, 0x08, 0xcc -; addi t6, s0, 0x10 -; .byte 0x87, 0x80, 0x0f, 0x02 -; addi t6, s0, 0x20 -; .byte 0x87, 0x81, 0x0f, 0x02 -; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x90, 0x11, 0x6e -; .byte 0x57, 0x84, 0x11, 0x5c -; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x27, 0x04, 0x05, 0x02 -; ld ra, 8(sp) -; ld s0, 0(sp) -; addi sp, sp, 0x10 -; ret - diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-fmin-pseudo.clif b/cranelift/filetests/filetests/isa/riscv64/simd-fmin-pseudo.clif deleted file mode 100644 index 608ad3767dc9..000000000000 --- a/cranelift/filetests/filetests/isa/riscv64/simd-fmin-pseudo.clif +++ /dev/null @@ -1,92 +0,0 @@ -test compile precise-output -set unwind_info=false -target riscv64 has_v - -function %fmin_pseudo_f32x4(f32x4, f32x4) -> f32x4 { -block0(v0:f32x4, v1:f32x4): - v2 = fmin_pseudo v0, v1 - return v2 -} - -; VCode: -; add sp,-16 -; sd 
ra,8(sp) -; sd fp,0(sp) -; mv fp,sp -; block0: -; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vmflt.vv v0,v3,v1 #avl=4, #vtype=(e32, m1, ta, ma) -; vmerge.vvm v8,v1,v3,v0.t #avl=4, #vtype=(e32, m1, ta, ma) -; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; ld ra,8(sp) -; ld fp,0(sp) -; add sp,+16 -; ret -; -; Disassembled: -; block0: ; offset 0x0 -; addi sp, sp, -0x10 -; sd ra, 8(sp) -; sd s0, 0(sp) -; ori s0, sp, 0 -; block1: ; offset 0x10 -; .byte 0x57, 0x70, 0x08, 0xcc -; addi t6, s0, 0x10 -; .byte 0x87, 0x80, 0x0f, 0x02 -; addi t6, s0, 0x20 -; .byte 0x87, 0x81, 0x0f, 0x02 -; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x57, 0x90, 0x30, 0x6e -; .byte 0x57, 0x84, 0x11, 0x5c -; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x27, 0x04, 0x05, 0x02 -; ld ra, 8(sp) -; ld s0, 0(sp) -; addi sp, sp, 0x10 -; ret - -function %fmin_pseudo_f64x2(f64x2, f64x2) -> f64x2 { -block0(v0:f64x2, v1:f64x2): - v2 = fmin_pseudo v0, v1 - return v2 -} - -; VCode: -; add sp,-16 -; sd ra,8(sp) -; sd fp,0(sp) -; mv fp,sp -; block0: -; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vmflt.vv v0,v3,v1 #avl=2, #vtype=(e64, m1, ta, ma) -; vmerge.vvm v8,v1,v3,v0.t #avl=2, #vtype=(e64, m1, ta, ma) -; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; ld ra,8(sp) -; ld fp,0(sp) -; add sp,+16 -; ret -; -; Disassembled: -; block0: ; offset 0x0 -; addi sp, sp, -0x10 -; sd ra, 8(sp) -; sd s0, 0(sp) -; ori s0, sp, 0 -; block1: ; offset 0x10 -; .byte 0x57, 0x70, 0x08, 0xcc -; addi t6, s0, 0x10 -; .byte 0x87, 0x80, 0x0f, 0x02 -; addi t6, s0, 0x20 -; .byte 0x87, 0x81, 0x0f, 0x02 -; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x90, 0x30, 0x6e -; .byte 0x57, 0x84, 0x11, 0x5c -; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x27, 0x04, 0x05, 0x02 -; ld ra, 8(sp) -; ld s0, 0(sp) -; addi sp, sp, 0x10 -; ret - diff --git a/cranelift/filetests/filetests/isa/s390x/floating-point.clif 
b/cranelift/filetests/filetests/isa/s390x/floating-point.clif index 9e757ba95642..7745374b39a2 100644 --- a/cranelift/filetests/filetests/isa/s390x/floating-point.clif +++ b/cranelift/filetests/filetests/isa/s390x/floating-point.clif @@ -282,8 +282,9 @@ block0(v0: f64, v1: f64): function %fmin_pseudo_f32(f32, f32) -> f32 { block0(v0: f32, v1: f32): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = select v2, v1, v0 + return v3 } ; VCode: @@ -298,8 +299,9 @@ block0(v0: f32, v1: f32): function %fmin_pseudo_f64(f64, f64) -> f64 { block0(v0: f64, v1: f64): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = select v2, v1, v0 + return v3 } ; VCode: @@ -314,8 +316,9 @@ block0(v0: f64, v1: f64): function %fmax_pseudo_f32(f32, f32) -> f32 { block0(v0: f32, v1: f32): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = select v2, v1, v0 + return v3 } ; VCode: @@ -330,8 +333,9 @@ block0(v0: f32, v1: f32): function %fmax_pseudo_f64(f64, f64) -> f64 { block0(v0: f64, v1: f64): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = select v2, v1, v0 + return v3 } ; VCode: diff --git a/cranelift/filetests/filetests/isa/s390x/vec-fp.clif b/cranelift/filetests/filetests/isa/s390x/vec-fp.clif index 4ccb89adf7ba..41b100a77014 100644 --- a/cranelift/filetests/filetests/isa/s390x/vec-fp.clif +++ b/cranelift/filetests/filetests/isa/s390x/vec-fp.clif @@ -319,8 +319,10 @@ block0(v0: f64x2, v1: f64x2): function %fmin_pseudo_f32x4(f32x4, f32x4) -> f32x4 { block0(v0: f32x4, v1: f32x4): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = bitcast.f32x4 v2 + v4 = bitselect v3, v1, v0 + return v4 } ; VCode: @@ -335,8 +337,10 @@ block0(v0: f32x4, v1: f32x4): function %fmin_pseudo_f64x2(f64x2, f64x2) -> f64x2 { block0(v0: f64x2, v1: f64x2): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = bitcast.f64x2 v2 + v4 = bitselect v3, v1, v0 + return v4 } ; VCode: @@ -351,8 +355,10 @@ block0(v0: f64x2, v1: f64x2): 
function %fmax_pseudo_f32x4(f32x4, f32x4) -> f32x4 { block0(v0: f32x4, v1: f32x4): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = bitcast.f32x4 v2 + v4 = bitselect v3, v1, v0 + return v4 } ; VCode: @@ -367,8 +373,10 @@ block0(v0: f32x4, v1: f32x4): function %fmax_pseudo_f64x2(f64x2, f64x2) -> f64x2 { block0(v0: f64x2, v1: f64x2): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = bitcast.f64x2 v2 + v4 = bitselect v3, v1, v0 + return v4 } ; VCode: diff --git a/cranelift/filetests/filetests/isa/x64/float-avx.clif b/cranelift/filetests/filetests/isa/x64/float-avx.clif index fa0e131601ba..948056a03d14 100644 --- a/cranelift/filetests/filetests/isa/x64/float-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/float-avx.clif @@ -203,8 +203,9 @@ block0(v0: f64, v1: f64): function %f32_min(f32, f32) -> f32 { block0(v0: f32, v1: f32): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = select v2, v1, v0 + return v3 } ; VCode: @@ -228,8 +229,9 @@ block0(v0: f32, v1: f32): function %f64_min(f64, f64) -> f64 { block0(v0: f64, v1: f64): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = select v2, v1, v0 + return v3 } ; VCode: @@ -253,8 +255,9 @@ block0(v0: f64, v1: f64): function %f32_max(f32, f32) -> f32 { block0(v0: f32, v1: f32): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = select v2, v1, v0 + return v3 } ; VCode: @@ -278,8 +281,9 @@ block0(v0: f32, v1: f32): function %f64_max(f64, f64) -> f64 { block0(v0: f64, v1: f64): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = select v2, v1, v0 + return v3 } ; VCode: diff --git a/cranelift/filetests/filetests/runtests/dynamic-simd-arithmetic.clif b/cranelift/filetests/filetests/runtests/dynamic-simd-arithmetic.clif index 0bd30a105d33..00bc44f36a14 100644 --- a/cranelift/filetests/filetests/runtests/dynamic-simd-arithmetic.clif +++ b/cranelift/filetests/filetests/runtests/dynamic-simd-arithmetic.clif @@ -299,55 +299,3 @@ 
block0(v0: f64, v1: f64): return v5 } ; run: %f64x2_splat_max(-0x6.6, 0x2.2) == [0x2.2 0x2.2] - -function %f32x4_splat_min_pseudo(f32, f32) -> f32x4 { - gv0 = dyn_scale_target_const.f32x4 - dt0 = f32x4*gv0 - -block0(v0: f32, v1: f32): - v2 = splat.dt0 v0 - v3 = splat.dt0 v1 - v4 = fmin_pseudo v2, v3 - v5 = extract_vector v4, 0 - return v5 -} -; run: %f32x4_splat_min_pseudo(0x6.6, 0x2.2) == [0x2.2 0x2.2 0x2.2 0x2.2] - -function %f64x2_splat_min_pseudo(f64, f64) -> f64x2 { - gv0 = dyn_scale_target_const.f64x2 - dt0 = f64x2*gv0 - -block0(v0: f64, v1: f64): - v2 = splat.dt0 v0 - v3 = splat.dt0 v1 - v4 = fmin_pseudo v2, v3 - v5 = extract_vector v4, 0 - return v5 -} -; run: %f64x2_splat_min_pseudo(-0x6.6, 0x2.2) == [-0x6.6 -0x6.6] - -function %f32x4_splat_max_pseudo(f32, f32) -> f32x4 { - gv0 = dyn_scale_target_const.f32x4 - dt0 = f32x4*gv0 - -block0(v0: f32, v1: f32): - v2 = splat.dt0 v0 - v3 = splat.dt0 v1 - v4 = fmax_pseudo v2, v3 - v5 = extract_vector v4, 0 - return v5 -} -; run: %f32x4_splat_max_pseudo(0x6.6, 0x2.2) == [0x6.6 0x6.6 0x6.6 0x6.6] - -function %f64x2_splat_max_pseudo(f64, f64) -> f64x2 { - gv0 = dyn_scale_target_const.f64x2 - dt0 = f64x2*gv0 - -block0(v0: f64, v1: f64): - v2 = splat.dt0 v0 - v3 = splat.dt0 v1 - v4 = fmax_pseudo v2, v3 - v5 = extract_vector v4, 0 - return v5 -} -; run: %f64x2_splat_max_pseudo(-0x6.6, 0x2.2) == [0x2.2 0x2.2] diff --git a/cranelift/filetests/filetests/runtests/fmax-pseudo.clif b/cranelift/filetests/filetests/runtests/fmax-pseudo.clif index 0a5be21d52a1..aef5612ef55b 100644 --- a/cranelift/filetests/filetests/runtests/fmax-pseudo.clif +++ b/cranelift/filetests/filetests/runtests/fmax-pseudo.clif @@ -8,8 +8,9 @@ target riscv64 function %fmax_p_f32(f32, f32) -> f32 { block0(v0: f32, v1: f32): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = select v2, v1, v0 + return v3 } ; run: %fmax_p_f32(0x1.0, 0x2.0) == 0x2.0 ; run: %fmax_p_f32(0x1.0p10, 0x1.0p11) == 0x1.0p11 @@ -44,10 +45,11 @@ block0(v0: f32, v1: f32): 
function %fmax_is_nan_f32(f32, f32) -> i32 { block0(v0: f32, v1: f32): - v2 = fmax_pseudo v0, v1 - v3 = fcmp ne v2, v2 - v4 = uextend.i32 v3 - return v4 + v2 = fcmp lt v0, v1 + v3 = select v2, v1, v0 + v4 = fcmp ne v3, v3 + v5 = uextend.i32 v4 + return v5 } ; run: %fmax_is_nan_f32(-NaN, 0x0.0) == 1 ; run: %fmax_is_nan_f32(-NaN:0x0, 0x0.0) == 1 @@ -60,8 +62,9 @@ block0(v0: f32, v1: f32): function %fmax_p_f64(f64, f64) -> f64 { block0(v0: f64, v1: f64): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = select v2, v1, v0 + return v3 } ; run: %fmax_p_f64(0x1.0, 0x2.0) == 0x2.0 ; run: %fmax_p_f64(0x1.0p10, 0x1.0p11) == 0x1.0p11 @@ -97,10 +100,11 @@ block0(v0: f64, v1: f64): function %fmax_is_nan_f64(f64, f64) -> i32 { block0(v0: f64, v1: f64): - v2 = fmax_pseudo v0, v1 - v3 = fcmp ne v2, v2 - v4 = uextend.i32 v3 - return v4 + v2 = fcmp lt v0, v1 + v3 = select v2, v1, v0 + v4 = fcmp ne v3, v3 + v5 = uextend.i32 v4 + return v5 } ; run: %fmax_is_nan_f64(-NaN, 0x0.0) == 1 ; run: %fmax_is_nan_f64(-NaN:0x0, 0x0.0) == 1 diff --git a/cranelift/filetests/filetests/runtests/fmin-pseudo.clif b/cranelift/filetests/filetests/runtests/fmin-pseudo.clif index 829dc49901d9..77c15234e856 100644 --- a/cranelift/filetests/filetests/runtests/fmin-pseudo.clif +++ b/cranelift/filetests/filetests/runtests/fmin-pseudo.clif @@ -8,8 +8,9 @@ target riscv64 function %fmin_p_f32(f32, f32) -> f32 { block0(v0: f32, v1: f32): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = select v2, v1, v0 + return v3 } ; run: %fmin_p_f32(0x1.0, 0x2.0) == 0x1.0 ; run: %fmin_p_f32(0x1.0p10, 0x1.0p11) == 0x1.0p10 @@ -44,10 +45,11 @@ block0(v0: f32, v1: f32): function %fmin_is_nan_f32(f32, f32) -> i32 { block0(v0: f32, v1: f32): - v2 = fmin_pseudo v0, v1 - v3 = fcmp ne v2, v2 - v4 = uextend.i32 v3 - return v4 + v2 = fcmp lt v1, v0 + v3 = select v2, v1, v0 + v4 = fcmp ne v3, v3 + v5 = uextend.i32 v4 + return v5 } ; run: %fmin_is_nan_f32(-NaN, 0x0.0) == 1 ; run: 
%fmin_is_nan_f32(-NaN:0x0, 0x0.0) == 1 @@ -60,8 +62,9 @@ block0(v0: f32, v1: f32): function %fmin_p_f64(f64, f64) -> f64 { block0(v0: f64, v1: f64): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = select v2, v1, v0 + return v3 } ; run: %fmin_p_f64(0x1.0, 0x2.0) == 0x1.0 ; run: %fmin_p_f64(0x1.0p10, 0x1.0p11) == 0x1.0p10 @@ -97,10 +100,11 @@ block0(v0: f64, v1: f64): function %fmin_is_nan_f64(f64, f64) -> i32 { block0(v0: f64, v1: f64): - v2 = fmin_pseudo v0, v1 - v3 = fcmp ne v2, v2 - v4 = uextend.i32 v3 - return v4 + v2 = fcmp lt v1, v0 + v3 = select v2, v1, v0 + v4 = fcmp ne v3, v3 + v5 = uextend.i32 v4 + return v5 } ; run: %fmin_is_nan_f64(-NaN, 0x0.0) == 1 ; run: %fmin_is_nan_f64(-NaN:0x0, 0x0.0) == 1 diff --git a/cranelift/filetests/filetests/runtests/issue5569.clif b/cranelift/filetests/filetests/runtests/issue5569.clif index 73b6c8b00cdf..248114a81ea2 100644 --- a/cranelift/filetests/filetests/runtests/issue5569.clif +++ b/cranelift/filetests/filetests/runtests/issue5569.clif @@ -67,7 +67,8 @@ block0(v0: i16, v1: f64, v2: i32, v3: i64, v4: i16, v5: i128, v6: f32): v103 = bor v97, v102 v104 = select v103, v96, v15 ; v96 = 1 v17 = sdiv v15, v104 - v18 = fmax_pseudo v6, v6 + v800 = fcmp lt v6, v6 + v18 = select v800, v6, v6 v105 = iconst.i32 0 v106 = iconst.i32 1 v107 = icmp eq v17, v105 ; v105 = 0 @@ -238,7 +239,8 @@ block0(v0: i16, v1: f64, v2: i32, v3: i64, v4: i16, v5: i128, v6: f32): v253 = bor v247, v252 v254 = select v253, v246, v33 ; v246 = 1 v34 = sdiv v33, v254 - v35 = fmax_pseudo v18, v18 + v801 = fcmp lt v18, v18 + v35 = select v801, v18, v18 v255 = iconst.i32 0 v256 = iconst.i32 1 v257 = icmp eq v34, v255 ; v255 = 0 diff --git a/cranelift/filetests/filetests/runtests/simd-fmin-max-pseudo.clif b/cranelift/filetests/filetests/runtests/simd-fmin-max-pseudo.clif index 5cb46d1ad38d..6932e6c622eb 100644 --- a/cranelift/filetests/filetests/runtests/simd-fmin-max-pseudo.clif +++ 
b/cranelift/filetests/filetests/runtests/simd-fmin-max-pseudo.clif @@ -7,30 +7,38 @@ target riscv64gc has_v function %fmin_pseudo_f32x4(f32x4, f32x4) -> f32x4 { block0(v0:f32x4, v1:f32x4): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = bitcast.f32x4 little v2 + v4 = bitselect v3, v1, v0 + return v4 } ; run: %fmin_pseudo_f32x4([0x1.0 NaN 0x0.1 -0x0.0], [0x2.0 0x2.0 NaN 0x0.0]) == [0x1.0 NaN 0x0.1 -0x0.0] function %fmax_pseudo_f32x4(f32x4, f32x4) -> f32x4 { block0(v0:f32x4, v1:f32x4): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = bitcast.f32x4 little v2 + v4 = bitselect v3, v1, v0 + return v4 } ; run: %fmax_pseudo_f32x4([0x1.0 NaN 0x0.1 -0x0.0], [0x2.0 0x2.0 NaN 0x0.0]) == [0x2.0 NaN 0x0.1 -0x0.0] function %fmin_pseudo_f64x2(f64x2, f64x2) -> f64x2 { block0(v0:f64x2, v1:f64x2): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = bitcast.f64x2 little v2 + v4 = bitselect v3, v1, v0 + return v4 } ; run: %fmin_pseudo_f64x2([0x1.0 NaN], [0x2.0 0x2.0]) == [0x1.0 NaN] ; run: %fmin_pseudo_f64x2([0x0.1 -0x0.0], [NaN 0x0.0]) == [0x0.1 -0x0.0] function %fmax_pseudo_f64x2(f64x2, f64x2) -> f64x2 { block0(v0:f64x2, v1:f64x2): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = bitcast.f64x2 little v2 + v4 = bitselect v3, v1, v0 + return v4 } ; run: %fmax_pseudo_f64x2([0x1.0 NaN], [0x2.0 0x2.0]) == [0x2.0 NaN] ; run: %fmax_pseudo_f64x2([0x0.1 -0x0.0], [NaN 0x0.0]) == [0x0.1 -0x0.0] diff --git a/cranelift/filetests/filetests/wasm/x64-pmin.wat b/cranelift/filetests/filetests/wasm/x64-pmin.wat new file mode 100644 index 000000000000..474009b70a26 --- /dev/null +++ b/cranelift/filetests/filetests/wasm/x64-pmin.wat @@ -0,0 +1,136 @@ +;;! target = "x86_64" +;;! compile = true +;;! 
settings = ["sse42", "opt_level=speed", "has_avx"] + +(module + (func (export "f32.pmin") (param f32 f32) (result f32) + (select + (local.get 1) (local.get 0) + (f32.lt (local.get 1) (local.get 0)))) + (func (export "f32.pmax") (param f32 f32) (result f32) + (select + (local.get 1) (local.get 0) + (f32.lt (local.get 0) (local.get 1)))) + + (func (export "f64.pmin") (param f64 f64) (result f64) + (select + (local.get 1) (local.get 0) + (f64.lt (local.get 1) (local.get 0)))) + (func (export "f64.pmax") (param f64 f64) (result f64) + (select + (local.get 1) (local.get 0) + (f64.lt (local.get 0) (local.get 1)))) + + (func (export "f32x4.pmin") (param v128 v128) (result v128) + (f32x4.pmin (local.get 0) (local.get 1))) + (func (export "f32x4.pmax") (param v128 v128) (result v128) + (f32x4.pmax (local.get 0) (local.get 1))) + + (func (export "f64x2.pmin") (param v128 v128) (result v128) + (f64x2.pmin (local.get 0) (local.get 1))) + (func (export "f64x2.pmax") (param v128 v128) (result v128) + (f64x2.pmax (local.get 0) (local.get 1))) +) +;; function u0:0: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; vminss %xmm1, %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:1: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; vmaxss %xmm1, %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:2: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; vminsd %xmm1, %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq 
%rbp +;; ret +;; +;; function u0:3: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; vmaxsd %xmm1, %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:4: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; vminps %xmm1, %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:5: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; vmaxps %xmm1, %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:6: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; vminpd %xmm1, %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:7: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; vmaxpd %xmm1, %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq %rbp +;; ret diff --git a/cranelift/fuzzgen/src/function_generator.rs b/cranelift/fuzzgen/src/function_generator.rs index a726f424089f..8fdb2694d842 100644 --- a/cranelift/fuzzgen/src/function_generator.rs +++ b/cranelift/fuzzgen/src/function_generator.rs @@ -1170,12 +1170,8 @@ static OPCODE_SIGNATURES: Lazy<Vec<OpcodeSignature>> = Lazy::new(|| { (Opcode::Sshr, &[I64X2, I128], &[I64X2]), (Opcode::Fmin, 
&[F32X4, F32X4], &[F32X4]), (Opcode::Fmin, &[F64X2, F64X2], &[F64X2]), - (Opcode::FminPseudo, &[F32X4, F32X4], &[F32X4]), - (Opcode::FminPseudo, &[F64X2, F64X2], &[F64X2]), (Opcode::Fmax, &[F32X4, F32X4], &[F32X4]), (Opcode::Fmax, &[F64X2, F64X2], &[F64X2]), - (Opcode::FmaxPseudo, &[F32X4, F32X4], &[F32X4]), - (Opcode::FmaxPseudo, &[F64X2, F64X2], &[F64X2]), (Opcode::FcvtToUintSat, &[F32X4], &[I8]), (Opcode::FcvtToUintSat, &[F64X2], &[I8]), (Opcode::FcvtToUintSat, &[F32X4], &[I16]), diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs index 3ee76362c869..f13d2c918400 100644 --- a/cranelift/interpreter/src/step.rs +++ b/cranelift/interpreter/src/step.rs @@ -879,11 +879,6 @@ where (a, b) if a.is_zero()? && b.is_zero()? && b.is_negative()? => b, (a, b) => a.smin(b)?, }), - Opcode::FminPseudo => assign(match (arg(0), arg(1)) { - (a, b) if a.is_nan()? || b.is_nan()? => a, - (a, b) if a.is_zero()? && b.is_zero()? => a, - (a, b) => a.smin(b)?, - }), Opcode::Fmax => assign(match (arg(0), arg(1)) { (a, _) if a.is_nan()? => a, (_, b) if b.is_nan()? => b, @@ -891,11 +886,6 @@ where (a, b) if a.is_zero()? && b.is_zero()? && b.is_negative()? => a, (a, b) => a.smax(b)?, }), - Opcode::FmaxPseudo => assign(match (arg(0), arg(1)) { - (a, b) if a.is_nan()? || b.is_nan()? => a, - (a, b) if a.is_zero()? && b.is_zero()? 
=> a, - (a, b) => a.smax(b)?, - }), Opcode::Ceil => unary(DataValueExt::ceil, arg(0))?, Opcode::Floor => unary(DataValueExt::floor, arg(0))?, Opcode::Trunc => unary(DataValueExt::trunc, arg(0))?, diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index 0d5e78889744..34e8100f7de3 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -1956,12 +1956,29 @@ pub fn translate_operator( state.push1(builder.ins().fmin(a, b)) } Operator::F32x4PMax | Operator::F64x2PMax => { - let (a, b) = pop2_with_bitcast(state, type_of(op), builder); - state.push1(builder.ins().fmax_pseudo(a, b)) + // Note the careful ordering here with respect to `fcmp` and + // `bitselect`. This matches the spec definition of: + // + // fpmax(z1, z2) = + // * If z1 is less than z2 then return z2. + // * Else return z1. + let ty = type_of(op); + let (a, b) = pop2_with_bitcast(state, ty, builder); + let cmp = builder.ins().fcmp(FloatCC::LessThan, a, b); + let cmp = optionally_bitcast_vector(cmp, ty, builder); + state.push1(builder.ins().bitselect(cmp, b, a)) } Operator::F32x4PMin | Operator::F64x2PMin => { - let (a, b) = pop2_with_bitcast(state, type_of(op), builder); - state.push1(builder.ins().fmin_pseudo(a, b)) + // Note the careful ordering here which is similar to `pmax` above: + // + // fpmin(z1, z2) = + // * If z2 is less than z1 then return z2. + // * Else return z1. 
+ let ty = type_of(op); + let (a, b) = pop2_with_bitcast(state, ty, builder); + let cmp = builder.ins().fcmp(FloatCC::LessThan, b, a); + let cmp = optionally_bitcast_vector(cmp, ty, builder); + state.push1(builder.ins().bitselect(cmp, b, a)) } Operator::F32x4Sqrt | Operator::F64x2Sqrt => { let a = pop1_with_bitcast(state, type_of(op), builder); @@ -2243,27 +2260,39 @@ pub fn translate_operator( } Operator::F32x4RelaxedMax | Operator::F64x2RelaxedMax => { - let (a, b) = pop2_with_bitcast(state, type_of(op), builder); + let ty = type_of(op); + let (a, b) = pop2_with_bitcast(state, ty, builder); state.push1( if environ.relaxed_simd_deterministic() || !environ.is_x86() { // Deterministic semantics match the `fmax` instruction, or // the `fAAxBB.max` wasm instruction. builder.ins().fmax(a, b) } else { - builder.ins().fmax_pseudo(a, b) + // Note that this matches the `pmax` translation which has + // careful ordering of its operands to trigger + // pattern-matches in the x86 backend. + let cmp = builder.ins().fcmp(FloatCC::LessThan, a, b); + let cmp = optionally_bitcast_vector(cmp, ty, builder); + builder.ins().bitselect(cmp, b, a) }, ) } Operator::F32x4RelaxedMin | Operator::F64x2RelaxedMin => { - let (a, b) = pop2_with_bitcast(state, type_of(op), builder); + let ty = type_of(op); + let (a, b) = pop2_with_bitcast(state, ty, builder); state.push1( if environ.relaxed_simd_deterministic() || !environ.is_x86() { // Deterministic semantics match the `fmin` instruction, or // the `fAAxBB.min` wasm instruction. builder.ins().fmin(a, b) } else { - builder.ins().fmin_pseudo(a, b) + // Note that this matches the `pmin` translation which has + // careful ordering of its operands to trigger + // pattern-matches in the x86 backend. + let cmp = builder.ins().fcmp(FloatCC::LessThan, b, a); + let cmp = optionally_bitcast_vector(cmp, ty, builder); + builder.ins().bitselect(cmp, b, a) }, ); }