diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index 14e397c16b69..ff5aed499193 100644 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -2931,24 +2931,6 @@ pub(crate) fn define( ]), ); - ig.push( - Inst::new( - "fmin_pseudo", - r#" - Floating point pseudo-minimum, propagating NaNs. This behaves differently from ``fmin``. - See for background. - - The behaviour is defined as ``fmin_pseudo(a, b) = (b < a) ? b : a``, and the behaviour - for zero or NaN inputs follows from the behaviour of ``<`` with such inputs. - "#, - &formats.binary, - ) - .operands_in(vec![Operand::new("x", Float), Operand::new("y", Float)]) - .operands_out(vec![ - Operand::new("a", Float).with_doc("The smaller of ``x`` and ``y``") - ]), - ); - ig.push( Inst::new( "fmax", @@ -2968,24 +2950,6 @@ pub(crate) fn define( ]), ); - ig.push( - Inst::new( - "fmax_pseudo", - r#" - Floating point pseudo-maximum, propagating NaNs. This behaves differently from ``fmax``. - See for background. - - The behaviour is defined as ``fmax_pseudo(a, b) = (a < b) ? b : a``, and the behaviour - for zero or NaN inputs follows from the behaviour of ``<`` with such inputs. 
- "#, - &formats.binary, - ) - .operands_in(vec![Operand::new("x", Float), Operand::new("y", Float)]) - .operands_out(vec![ - Operand::new("a", Float).with_doc("The larger of ``x`` and ``y``") - ]), - ); - ig.push( Inst::new( "ceil", diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index 9d51812f2fc5..80fcec000ec9 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -415,24 +415,6 @@ (rule (lower (has_type (ty_scalar_float ty) (fmax rn rm))) (fpu_rrr (FPUOp2.Max) rn rm (scalar_size ty))) -;;;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule -1 (lower (has_type ty @ (multi_lane _ _) (fmin_pseudo rm rn))) - (bsl ty (vec_rrr (VecALUOp.Fcmgt) rm rn (vector_size ty)) rn rm)) - -(rule (lower (has_type (ty_scalar_float ty) (fmin_pseudo rm rn))) - (with_flags (fpu_cmp (scalar_size ty) rm rn) - (fpu_csel ty (Cond.Gt) rn rm))) - -;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule -1 (lower (has_type ty @ (multi_lane _ _) (fmax_pseudo rm rn))) - (bsl ty (vec_rrr (VecALUOp.Fcmgt) rn rm (vector_size ty)) rn rm)) - -(rule (lower (has_type (ty_scalar_float ty) (fmax_pseudo rm rn))) - (with_flags (fpu_cmp (scalar_size ty) rn rm) - (fpu_csel ty (Cond.Gt) rn rm))) - ;;;; Rules for `sqrt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule -1 (lower (has_type ty @ (multi_lane _ _) (sqrt x))) diff --git a/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle b/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle index 54adb887b87c..a3cd463c9e4f 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle +++ b/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle @@ -35,18 +35,6 @@ (rule -2 (lower (has_type ty @ (dynamic_lane _ _) (fmax x y))) (value_reg (vec_rrr (VecALUOp.Fmax) (put_in_reg x) (put_in_reg y) (vector_size ty)))) -;;;; Rules for 
`fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule -2 (lower (has_type ty @ (dynamic_lane _ _) (fmin_pseudo x y))) - (value_reg (bsl ty - (vec_rrr (VecALUOp.Fcmgt) (put_in_reg x) (put_in_reg y) - (vector_size ty)) (put_in_reg y) (put_in_reg x)))) - -;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule -2 (lower (has_type ty @ (dynamic_lane _ _) (fmax_pseudo x y))) - (value_reg (bsl ty - (vec_rrr (VecALUOp.Fcmgt) (put_in_reg y) (put_in_reg x) - (vector_size ty)) (put_in_reg y) (put_in_reg x)))) - ;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule -2 (lower (has_type (ty_dyn128_int ty) (snarrow x y))) (if-let _ (zero_value y)) diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle index b7576a55317b..3b826ec46c84 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -288,14 +288,6 @@ (rs1 Reg) (rs2 Reg) (ty Type)) - (FloatSelectPseudo - (op FloatSelectOP) - (rd WritableReg) - ;; a integer register - (tmp WritableReg) - (rs1 Reg) - (rs2 Reg) - (ty Type)) ;; popcnt if target doesn't support extension B ;; use iteration to implement. 
@@ -986,15 +978,6 @@ (_ Unit (emit (MInst.FloatRound op rd tmp tmp2 rs ty)))) (writable_reg_to_reg rd))) -(decl gen_float_select_pseudo (FloatSelectOP Reg Reg Type) Reg) -(rule - (gen_float_select_pseudo op x y ty) - (let - ((rd WritableReg (temp_writable_reg ty)) - (tmp WritableXReg (temp_writable_xreg)) - (_ Unit (emit (MInst.FloatSelectPseudo op rd tmp x y ty)))) - (writable_reg_to_reg rd))) - (decl gen_float_select (FloatSelectOP Reg Reg Type) Reg) (rule (gen_float_select op x y ty) diff --git a/cranelift/codegen/src/isa/riscv64/inst/emit.rs b/cranelift/codegen/src/isa/riscv64/inst/emit.rs index 0e750bd608ca..a0a724f5ecf8 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/emit.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/emit.rs @@ -458,7 +458,6 @@ impl Inst { | Inst::DummyUse { .. } | Inst::FloatRound { .. } | Inst::FloatSelect { .. } - | Inst::FloatSelectPseudo { .. } | Inst::Popcnt { .. } | Inst::Rev8 { .. } | Inst::Cltz { .. } @@ -2242,53 +2241,6 @@ impl MachInstEmit for Inst { Inst::gen_move(rd, rs, ty).emit(&[], sink, emit_info, state); sink.bind_label(label_jump_over, &mut state.ctrl_plane); } - &Inst::FloatSelectPseudo { - op, - rd, - tmp, - rs1, - rs2, - ty, - } => { - let rs1 = allocs.next(rs1); - let rs2 = allocs.next(rs2); - let tmp = allocs.next_writable(tmp); - let rd = allocs.next_writable(rd); - let label_rs2 = sink.get_label(); - let label_jump_over = sink.get_label(); - let lt_op = if ty == F32 { - FpuOPRRR::FltS - } else { - FpuOPRRR::FltD - }; - Inst::FpuRRR { - alu_op: lt_op, - frm: None, - rd: tmp, - rs1: if op == FloatSelectOP::Max { rs1 } else { rs2 }, - rs2: if op == FloatSelectOP::Max { rs2 } else { rs1 }, - } - .emit(&[], sink, emit_info, state); - Inst::CondBr { - taken: BranchTarget::Label(label_rs2), - not_taken: BranchTarget::zero(), - kind: IntegerCompare { - kind: IntCC::NotEqual, - rs1: tmp.to_reg(), - rs2: zero_reg(), - }, - } - .emit(&[], sink, emit_info, state); - // here select rs1 as result. 
- Inst::gen_move(rd, rs1, ty).emit(&[], sink, emit_info, state); - Inst::Jal { - dest: BranchTarget::Label(label_jump_over), - } - .emit(&[], sink, emit_info, state); - sink.bind_label(label_rs2, &mut state.ctrl_plane); - Inst::gen_move(rd, rs2, ty).emit(&[], sink, emit_info, state); - sink.bind_label(label_jump_over, &mut state.ctrl_plane); - } &Inst::FloatSelect { op, diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs index 786c958ad4da..a6c90991ec66 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs @@ -636,13 +636,6 @@ fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut Operan collector.reg_early_def(tmp); collector.reg_early_def(rd); } - &Inst::FloatSelectPseudo { - rd, tmp, rs1, rs2, .. - } => { - collector.reg_uses(&[rs1, rs2]); - collector.reg_early_def(tmp); - collector.reg_early_def(rd); - } &Inst::Popcnt { sum, step, rs, tmp, .. } => { @@ -1136,29 +1129,6 @@ impl Inst { ty ) } - &Inst::FloatSelectPseudo { - op, - rd, - tmp, - rs1, - rs2, - ty, - } => { - let rs1 = format_reg(rs1, allocs); - let rs2 = format_reg(rs2, allocs); - let tmp = format_reg(tmp.to_reg(), allocs); - let rd = format_reg(rd.to_reg(), allocs); - format!( - "f{}.{}.pseudo {},{},{}##tmp={} ty={}", - op.op_name(), - if ty == F32 { "s" } else { "d" }, - rd, - rs1, - rs2, - tmp, - ty - ) - } &Inst::FloatSelect { op, rd, diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index 1dec264e41da..e7cb2f5c4b20 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -1300,24 +1300,6 @@ (max VReg (rv_vfmax_vv x y (unmasked) ty))) (rv_vmerge_vvm vec_nan max is_not_nan ty))) -;;;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule 0 (lower (has_type (ty_scalar_float ty) (fmin_pseudo x y))) - (gen_float_select_pseudo (FloatSelectOP.Min) x y 
ty)) - -(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fmin_pseudo x y))) - (let ((mask VReg (gen_fcmp_mask ty (FloatCC.LessThan) y x))) - (rv_vmerge_vvm x y mask ty))) - -;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule 0 (lower (has_type (ty_scalar_float ty) (fmax_pseudo x y))) - (gen_float_select_pseudo (FloatSelectOP.Max) x y ty)) - -(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fmax_pseudo x y))) - (let ((mask VReg (gen_fcmp_mask ty (FloatCC.LessThan) x y))) - (rv_vmerge_vvm x y mask ty))) - ;;;;; Rules for `stack_addr`;;;;;;;;; (rule (lower (stack_addr ss offset)) diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle index 2b286aeeb2e4..c7bfa335f8b6 100644 --- a/cranelift/codegen/src/isa/s390x/lower.isle +++ b/cranelift/codegen/src/isa/s390x/lower.isle @@ -1134,6 +1134,13 @@ (rule (lower (has_type (vr128_ty ty) (bitselect x y z))) (vec_select ty y z x)) +;; Special-case some float-selection instructions for min/max +(rule 3 (lower (has_type (ty_vec128 ty) (bitselect (bitcast _ (fcmp (FloatCC.LessThan) x y)) x y))) + (fmin_pseudo_reg ty y x)) +(rule 4 (lower (has_type (ty_vec128 ty) (bitselect (bitcast _ (fcmp (FloatCC.LessThan) y x)) x y))) + (fmax_pseudo_reg ty y x)) + + ;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1389,20 +1396,6 @@ (fmax_reg ty x y)) -;;;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; Minimum of two registers. -(rule (lower (has_type ty (fmin_pseudo x y))) - (fmin_pseudo_reg ty x y)) - - -;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; Maximum of two registers. -(rule (lower (has_type ty (fmax_pseudo x y))) - (fmax_pseudo_reg ty x y)) - - ;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Copysign of two registers. 
@@ -3719,6 +3712,12 @@ (select_bool_reg ty (value_nonzero val_cond) (put_in_reg val_true) (put_in_reg val_false))) +;; Special-case some float-selection instructions for min/max +(rule 1 (lower (has_type (ty_scalar_float ty) (select (maybe_uextend (fcmp (FloatCC.LessThan) x y)) x y))) + (fmin_pseudo_reg ty y x)) +(rule 2 (lower (has_type (ty_scalar_float ty) (select (maybe_uextend (fcmp (FloatCC.LessThan) y x)) x y))) + (fmax_pseudo_reg ty y x)) + ;;;; Rules for `select_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 3c36f64c44ac..ee33d56814c6 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1383,6 +1383,21 @@ (decl pure vconst_all_ones_or_all_zeros () Constant) (extern extractor vconst_all_ones_or_all_zeros vconst_all_ones_or_all_zeros) +;; Specializations for floating-point compares to generate a `minp*` or a +;; `maxp*` instruction. These are equivalent to the wasm `f32x4.{pmin,pmax}` +;; instructions and how they're lowered into CLIF. Note the careful ordering +;; of all the operands here to ensure that the input CLIF matched is implemented +;; by the corresponding x64 instruction. 
+(rule 2 (lower (has_type $F32X4 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) x y)) x y))) + (x64_minps x y)) +(rule 2 (lower (has_type $F64X2 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) x y)) x y))) + (x64_minpd x y)) + +(rule 3 (lower (has_type $F32X4 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) y x)) x y))) + (x64_maxps x y)) +(rule 3 (lower (has_type $F64X2 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) y x)) x y))) + (x64_maxpd x y)) + ;;;; Rules for `x86_blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $I8X16 @@ -2021,6 +2036,18 @@ (let ((cond_result IcmpCondResult (cmp_zero_i128 (CC.Z) c))) (select_icmp cond_result x y))) +;; Specializations for floating-point compares to generate a `mins*` or a +;; `maxs*` instruction. These are equivalent to the "pseudo-m{in,ax}" +;; specializations for vectors. +(rule 2 (lower (has_type $F32 (select (maybe_uextend (fcmp (FloatCC.LessThan) x y)) x y))) + (x64_minss x y)) +(rule 2 (lower (has_type $F64 (select (maybe_uextend (fcmp (FloatCC.LessThan) x y)) x y))) + (x64_minsd x y)) +(rule 3 (lower (has_type $F32 (select (maybe_uextend (fcmp (FloatCC.LessThan) y x)) x y))) + (x64_maxss x y)) +(rule 3 (lower (has_type $F64 (select (maybe_uextend (fcmp (FloatCC.LessThan) y x)) x y))) + (x64_maxsd x y)) + ;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; If available, we can use a plain lzcnt instruction here. 
Note no @@ -2677,28 +2704,6 @@ (final Xmm (x64_andnpd nan_fraction_mask max_blended_nan_positive))) final)) -;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule (lower (has_type $F32 (fmin_pseudo x y))) - (x64_minss y x)) -(rule (lower (has_type $F64 (fmin_pseudo x y))) - (x64_minsd y x)) -(rule (lower (has_type $F32X4 (fmin_pseudo x y))) - (x64_minps y x)) -(rule (lower (has_type $F64X2 (fmin_pseudo x y))) - (x64_minpd y x)) - -;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule (lower (has_type $F32 (fmax_pseudo x y))) - (x64_maxss y x)) -(rule (lower (has_type $F64 (fmax_pseudo x y))) - (x64_maxsd y x)) -(rule (lower (has_type $F32X4 (fmax_pseudo x y))) - (x64_maxps y x)) -(rule (lower (has_type $F64X2 (fmax_pseudo x y))) - (x64_maxpd y x)) - ;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Base case for fma is to call out to one of two libcalls. For vectors they diff --git a/cranelift/codegen/src/opts/selects.isle b/cranelift/codegen/src/opts/selects.isle index b0a2ca9dde0c..aef9cd7f7865 100644 --- a/cranelift/codegen/src/opts/selects.isle +++ b/cranelift/codegen/src/opts/selects.isle @@ -43,4 +43,3 @@ (rule (simplify (bitselect ty @ (multi_lane _ _) (sge _ x y) y x)) (smin ty x y)) (rule (simplify (bitselect ty @ (multi_lane _ _) (ugt _ x y) y x)) (umin ty x y)) (rule (simplify (bitselect ty @ (multi_lane _ _) (uge _ x y) y x)) (umin ty x y)) - diff --git a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif index 57c26708a67c..c2969661e014 100644 --- a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif +++ b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif @@ -260,60 +260,3 @@ block0(v0: f64, v1: f64): ; dup v6.2d, v1.d[0] ; fmax v0.2d, v5.2d, v6.2d ; ret - -function %f64x2_splat_min_pseudo(f64, f64) -> f64x2 { - gv0 = 
dyn_scale_target_const.f64x2 - dt0 = f64x2*gv0 - -block0(v0: f64, v1: f64): - v2 = splat.dt0 v0 - v3 = splat.dt0 v1 - v4 = fmin_pseudo v2, v3 - v5 = extract_vector v4, 0 - return v5 -} - -; VCode: -; block0: -; dup v6.2d, v0.d[0] -; dup v7.2d, v1.d[0] -; fcmgt v0.2d, v6.2d, v7.2d -; bsl v0.16b, v0.16b, v7.16b, v6.16b -; ret -; -; Disassembled: -; block0: ; offset 0x0 -; dup v6.2d, v0.d[0] -; dup v7.2d, v1.d[0] -; fcmgt v0.2d, v6.2d, v7.2d -; bsl v0.16b, v7.16b, v6.16b -; ret - -function %f64x2_splat_max_pseudo(f64, f64) -> f64x2 { - gv0 = dyn_scale_target_const.f64x2 - dt0 = f64x2*gv0 - -block0(v0: f64, v1: f64): - v2 = splat.dt0 v0 - v3 = splat.dt0 v1 - v4 = fmax_pseudo v2, v3 - v5 = extract_vector v4, 0 - return v5 -} - -; VCode: -; block0: -; dup v6.2d, v0.d[0] -; dup v7.2d, v1.d[0] -; fcmgt v0.2d, v7.2d, v6.2d -; bsl v0.16b, v0.16b, v7.16b, v6.16b -; ret -; -; Disassembled: -; block0: ; offset 0x0 -; dup v6.2d, v0.d[0] -; dup v7.2d, v1.d[0] -; fcmgt v0.2d, v7.2d, v6.2d -; bsl v0.16b, v7.16b, v6.16b -; ret - diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-fmax-pseudo.clif b/cranelift/filetests/filetests/isa/riscv64/simd-fmax-pseudo.clif deleted file mode 100644 index c1b3f21cffb4..000000000000 --- a/cranelift/filetests/filetests/isa/riscv64/simd-fmax-pseudo.clif +++ /dev/null @@ -1,92 +0,0 @@ -test compile precise-output -set unwind_info=false -target riscv64 has_v - -function %fmax_pseudo_f32x4(f32x4, f32x4) -> f32x4 { -block0(v0:f32x4, v1:f32x4): - v2 = fmax_pseudo v0, v1 - return v2 -} - -; VCode: -; add sp,-16 -; sd ra,8(sp) -; sd fp,0(sp) -; mv fp,sp -; block0: -; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vmflt.vv v0,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma) -; vmerge.vvm v8,v1,v3,v0.t #avl=4, #vtype=(e32, m1, ta, ma) -; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; ld ra,8(sp) -; ld fp,0(sp) -; add sp,+16 -; ret -; -; Disassembled: -; block0: ; offset 0x0 -; addi sp, sp, -0x10 
-; sd ra, 8(sp) -; sd s0, 0(sp) -; ori s0, sp, 0 -; block1: ; offset 0x10 -; .byte 0x57, 0x70, 0x08, 0xcc -; addi t6, s0, 0x10 -; .byte 0x87, 0x80, 0x0f, 0x02 -; addi t6, s0, 0x20 -; .byte 0x87, 0x81, 0x0f, 0x02 -; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x57, 0x90, 0x11, 0x6e -; .byte 0x57, 0x84, 0x11, 0x5c -; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x27, 0x04, 0x05, 0x02 -; ld ra, 8(sp) -; ld s0, 0(sp) -; addi sp, sp, 0x10 -; ret - -function %fmax_pseudo_f64x2(f64x2, f64x2) -> f64x2 { -block0(v0:f64x2, v1:f64x2): - v2 = fmax_pseudo v0, v1 - return v2 -} - -; VCode: -; add sp,-16 -; sd ra,8(sp) -; sd fp,0(sp) -; mv fp,sp -; block0: -; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vmflt.vv v0,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma) -; vmerge.vvm v8,v1,v3,v0.t #avl=2, #vtype=(e64, m1, ta, ma) -; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; ld ra,8(sp) -; ld fp,0(sp) -; add sp,+16 -; ret -; -; Disassembled: -; block0: ; offset 0x0 -; addi sp, sp, -0x10 -; sd ra, 8(sp) -; sd s0, 0(sp) -; ori s0, sp, 0 -; block1: ; offset 0x10 -; .byte 0x57, 0x70, 0x08, 0xcc -; addi t6, s0, 0x10 -; .byte 0x87, 0x80, 0x0f, 0x02 -; addi t6, s0, 0x20 -; .byte 0x87, 0x81, 0x0f, 0x02 -; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x90, 0x11, 0x6e -; .byte 0x57, 0x84, 0x11, 0x5c -; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x27, 0x04, 0x05, 0x02 -; ld ra, 8(sp) -; ld s0, 0(sp) -; addi sp, sp, 0x10 -; ret - diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-fmin-pseudo.clif b/cranelift/filetests/filetests/isa/riscv64/simd-fmin-pseudo.clif deleted file mode 100644 index 608ad3767dc9..000000000000 --- a/cranelift/filetests/filetests/isa/riscv64/simd-fmin-pseudo.clif +++ /dev/null @@ -1,92 +0,0 @@ -test compile precise-output -set unwind_info=false -target riscv64 has_v - -function %fmin_pseudo_f32x4(f32x4, f32x4) -> f32x4 { -block0(v0:f32x4, v1:f32x4): - v2 = fmin_pseudo v0, v1 - return v2 -} - -; VCode: -; add sp,-16 -; sd 
ra,8(sp) -; sd fp,0(sp) -; mv fp,sp -; block0: -; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vmflt.vv v0,v3,v1 #avl=4, #vtype=(e32, m1, ta, ma) -; vmerge.vvm v8,v1,v3,v0.t #avl=4, #vtype=(e32, m1, ta, ma) -; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; ld ra,8(sp) -; ld fp,0(sp) -; add sp,+16 -; ret -; -; Disassembled: -; block0: ; offset 0x0 -; addi sp, sp, -0x10 -; sd ra, 8(sp) -; sd s0, 0(sp) -; ori s0, sp, 0 -; block1: ; offset 0x10 -; .byte 0x57, 0x70, 0x08, 0xcc -; addi t6, s0, 0x10 -; .byte 0x87, 0x80, 0x0f, 0x02 -; addi t6, s0, 0x20 -; .byte 0x87, 0x81, 0x0f, 0x02 -; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x57, 0x90, 0x30, 0x6e -; .byte 0x57, 0x84, 0x11, 0x5c -; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x27, 0x04, 0x05, 0x02 -; ld ra, 8(sp) -; ld s0, 0(sp) -; addi sp, sp, 0x10 -; ret - -function %fmin_pseudo_f64x2(f64x2, f64x2) -> f64x2 { -block0(v0:f64x2, v1:f64x2): - v2 = fmin_pseudo v0, v1 - return v2 -} - -; VCode: -; add sp,-16 -; sd ra,8(sp) -; sd fp,0(sp) -; mv fp,sp -; block0: -; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vmflt.vv v0,v3,v1 #avl=2, #vtype=(e64, m1, ta, ma) -; vmerge.vvm v8,v1,v3,v0.t #avl=2, #vtype=(e64, m1, ta, ma) -; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; ld ra,8(sp) -; ld fp,0(sp) -; add sp,+16 -; ret -; -; Disassembled: -; block0: ; offset 0x0 -; addi sp, sp, -0x10 -; sd ra, 8(sp) -; sd s0, 0(sp) -; ori s0, sp, 0 -; block1: ; offset 0x10 -; .byte 0x57, 0x70, 0x08, 0xcc -; addi t6, s0, 0x10 -; .byte 0x87, 0x80, 0x0f, 0x02 -; addi t6, s0, 0x20 -; .byte 0x87, 0x81, 0x0f, 0x02 -; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x90, 0x30, 0x6e -; .byte 0x57, 0x84, 0x11, 0x5c -; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x27, 0x04, 0x05, 0x02 -; ld ra, 8(sp) -; ld s0, 0(sp) -; addi sp, sp, 0x10 -; ret - diff --git a/cranelift/filetests/filetests/isa/s390x/floating-point.clif 
b/cranelift/filetests/filetests/isa/s390x/floating-point.clif index 9e757ba95642..7745374b39a2 100644 --- a/cranelift/filetests/filetests/isa/s390x/floating-point.clif +++ b/cranelift/filetests/filetests/isa/s390x/floating-point.clif @@ -282,8 +282,9 @@ block0(v0: f64, v1: f64): function %fmin_pseudo_f32(f32, f32) -> f32 { block0(v0: f32, v1: f32): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = select v2, v1, v0 + return v3 } ; VCode: @@ -298,8 +299,9 @@ block0(v0: f32, v1: f32): function %fmin_pseudo_f64(f64, f64) -> f64 { block0(v0: f64, v1: f64): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = select v2, v1, v0 + return v3 } ; VCode: @@ -314,8 +316,9 @@ block0(v0: f64, v1: f64): function %fmax_pseudo_f32(f32, f32) -> f32 { block0(v0: f32, v1: f32): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = select v2, v1, v0 + return v3 } ; VCode: @@ -330,8 +333,9 @@ block0(v0: f32, v1: f32): function %fmax_pseudo_f64(f64, f64) -> f64 { block0(v0: f64, v1: f64): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = select v2, v1, v0 + return v3 } ; VCode: diff --git a/cranelift/filetests/filetests/isa/s390x/vec-fp.clif b/cranelift/filetests/filetests/isa/s390x/vec-fp.clif index 4ccb89adf7ba..41b100a77014 100644 --- a/cranelift/filetests/filetests/isa/s390x/vec-fp.clif +++ b/cranelift/filetests/filetests/isa/s390x/vec-fp.clif @@ -319,8 +319,10 @@ block0(v0: f64x2, v1: f64x2): function %fmin_pseudo_f32x4(f32x4, f32x4) -> f32x4 { block0(v0: f32x4, v1: f32x4): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = bitcast.f32x4 v2 + v4 = bitselect v3, v1, v0 + return v4 } ; VCode: @@ -335,8 +337,10 @@ block0(v0: f32x4, v1: f32x4): function %fmin_pseudo_f64x2(f64x2, f64x2) -> f64x2 { block0(v0: f64x2, v1: f64x2): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = bitcast.f64x2 v2 + v4 = bitselect v3, v1, v0 + return v4 } ; VCode: @@ -351,8 +355,10 @@ block0(v0: f64x2, v1: f64x2): 
function %fmax_pseudo_f32x4(f32x4, f32x4) -> f32x4 { block0(v0: f32x4, v1: f32x4): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = bitcast.f32x4 v2 + v4 = bitselect v3, v1, v0 + return v4 } ; VCode: @@ -367,8 +373,10 @@ block0(v0: f32x4, v1: f32x4): function %fmax_pseudo_f64x2(f64x2, f64x2) -> f64x2 { block0(v0: f64x2, v1: f64x2): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = bitcast.f64x2 v2 + v4 = bitselect v3, v1, v0 + return v4 } ; VCode: diff --git a/cranelift/filetests/filetests/isa/x64/float-avx.clif b/cranelift/filetests/filetests/isa/x64/float-avx.clif index fa0e131601ba..948056a03d14 100644 --- a/cranelift/filetests/filetests/isa/x64/float-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/float-avx.clif @@ -203,8 +203,9 @@ block0(v0: f64, v1: f64): function %f32_min(f32, f32) -> f32 { block0(v0: f32, v1: f32): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = select v2, v1, v0 + return v3 } ; VCode: @@ -228,8 +229,9 @@ block0(v0: f32, v1: f32): function %f64_min(f64, f64) -> f64 { block0(v0: f64, v1: f64): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = select v2, v1, v0 + return v3 } ; VCode: @@ -253,8 +255,9 @@ block0(v0: f64, v1: f64): function %f32_max(f32, f32) -> f32 { block0(v0: f32, v1: f32): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = select v2, v1, v0 + return v3 } ; VCode: @@ -278,8 +281,9 @@ block0(v0: f32, v1: f32): function %f64_max(f64, f64) -> f64 { block0(v0: f64, v1: f64): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = select v2, v1, v0 + return v3 } ; VCode: diff --git a/cranelift/filetests/filetests/runtests/dynamic-simd-arithmetic.clif b/cranelift/filetests/filetests/runtests/dynamic-simd-arithmetic.clif index 0bd30a105d33..00bc44f36a14 100644 --- a/cranelift/filetests/filetests/runtests/dynamic-simd-arithmetic.clif +++ b/cranelift/filetests/filetests/runtests/dynamic-simd-arithmetic.clif @@ -299,55 +299,3 @@ 
block0(v0: f64, v1: f64): return v5 } ; run: %f64x2_splat_max(-0x6.6, 0x2.2) == [0x2.2 0x2.2] - -function %f32x4_splat_min_pseudo(f32, f32) -> f32x4 { - gv0 = dyn_scale_target_const.f32x4 - dt0 = f32x4*gv0 - -block0(v0: f32, v1: f32): - v2 = splat.dt0 v0 - v3 = splat.dt0 v1 - v4 = fmin_pseudo v2, v3 - v5 = extract_vector v4, 0 - return v5 -} -; run: %f32x4_splat_min_pseudo(0x6.6, 0x2.2) == [0x2.2 0x2.2 0x2.2 0x2.2] - -function %f64x2_splat_min_pseudo(f64, f64) -> f64x2 { - gv0 = dyn_scale_target_const.f64x2 - dt0 = f64x2*gv0 - -block0(v0: f64, v1: f64): - v2 = splat.dt0 v0 - v3 = splat.dt0 v1 - v4 = fmin_pseudo v2, v3 - v5 = extract_vector v4, 0 - return v5 -} -; run: %f64x2_splat_min_pseudo(-0x6.6, 0x2.2) == [-0x6.6 -0x6.6] - -function %f32x4_splat_max_pseudo(f32, f32) -> f32x4 { - gv0 = dyn_scale_target_const.f32x4 - dt0 = f32x4*gv0 - -block0(v0: f32, v1: f32): - v2 = splat.dt0 v0 - v3 = splat.dt0 v1 - v4 = fmax_pseudo v2, v3 - v5 = extract_vector v4, 0 - return v5 -} -; run: %f32x4_splat_max_pseudo(0x6.6, 0x2.2) == [0x6.6 0x6.6 0x6.6 0x6.6] - -function %f64x2_splat_max_pseudo(f64, f64) -> f64x2 { - gv0 = dyn_scale_target_const.f64x2 - dt0 = f64x2*gv0 - -block0(v0: f64, v1: f64): - v2 = splat.dt0 v0 - v3 = splat.dt0 v1 - v4 = fmax_pseudo v2, v3 - v5 = extract_vector v4, 0 - return v5 -} -; run: %f64x2_splat_max_pseudo(-0x6.6, 0x2.2) == [0x2.2 0x2.2] diff --git a/cranelift/filetests/filetests/runtests/fmax-pseudo.clif b/cranelift/filetests/filetests/runtests/fmax-pseudo.clif index 0a5be21d52a1..aef5612ef55b 100644 --- a/cranelift/filetests/filetests/runtests/fmax-pseudo.clif +++ b/cranelift/filetests/filetests/runtests/fmax-pseudo.clif @@ -8,8 +8,9 @@ target riscv64 function %fmax_p_f32(f32, f32) -> f32 { block0(v0: f32, v1: f32): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = select v2, v1, v0 + return v3 } ; run: %fmax_p_f32(0x1.0, 0x2.0) == 0x2.0 ; run: %fmax_p_f32(0x1.0p10, 0x1.0p11) == 0x1.0p11 @@ -44,10 +45,11 @@ block0(v0: f32, v1: f32): 
function %fmax_is_nan_f32(f32, f32) -> i32 { block0(v0: f32, v1: f32): - v2 = fmax_pseudo v0, v1 - v3 = fcmp ne v2, v2 - v4 = uextend.i32 v3 - return v4 + v2 = fcmp lt v0, v1 + v3 = select v2, v1, v0 + v4 = fcmp ne v3, v3 + v5 = uextend.i32 v4 + return v5 } ; run: %fmax_is_nan_f32(-NaN, 0x0.0) == 1 ; run: %fmax_is_nan_f32(-NaN:0x0, 0x0.0) == 1 @@ -60,8 +62,9 @@ block0(v0: f32, v1: f32): function %fmax_p_f64(f64, f64) -> f64 { block0(v0: f64, v1: f64): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = select v2, v1, v0 + return v3 } ; run: %fmax_p_f64(0x1.0, 0x2.0) == 0x2.0 ; run: %fmax_p_f64(0x1.0p10, 0x1.0p11) == 0x1.0p11 @@ -97,10 +100,11 @@ block0(v0: f64, v1: f64): function %fmax_is_nan_f64(f64, f64) -> i32 { block0(v0: f64, v1: f64): - v2 = fmax_pseudo v0, v1 - v3 = fcmp ne v2, v2 - v4 = uextend.i32 v3 - return v4 + v2 = fcmp lt v0, v1 + v3 = select v2, v1, v0 + v4 = fcmp ne v3, v3 + v5 = uextend.i32 v4 + return v5 } ; run: %fmax_is_nan_f64(-NaN, 0x0.0) == 1 ; run: %fmax_is_nan_f64(-NaN:0x0, 0x0.0) == 1 diff --git a/cranelift/filetests/filetests/runtests/fmin-pseudo.clif b/cranelift/filetests/filetests/runtests/fmin-pseudo.clif index 829dc49901d9..77c15234e856 100644 --- a/cranelift/filetests/filetests/runtests/fmin-pseudo.clif +++ b/cranelift/filetests/filetests/runtests/fmin-pseudo.clif @@ -8,8 +8,9 @@ target riscv64 function %fmin_p_f32(f32, f32) -> f32 { block0(v0: f32, v1: f32): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = select v2, v1, v0 + return v3 } ; run: %fmin_p_f32(0x1.0, 0x2.0) == 0x1.0 ; run: %fmin_p_f32(0x1.0p10, 0x1.0p11) == 0x1.0p10 @@ -44,10 +45,11 @@ block0(v0: f32, v1: f32): function %fmin_is_nan_f32(f32, f32) -> i32 { block0(v0: f32, v1: f32): - v2 = fmin_pseudo v0, v1 - v3 = fcmp ne v2, v2 - v4 = uextend.i32 v3 - return v4 + v2 = fcmp lt v1, v0 + v3 = select v2, v1, v0 + v4 = fcmp ne v3, v3 + v5 = uextend.i32 v4 + return v5 } ; run: %fmin_is_nan_f32(-NaN, 0x0.0) == 1 ; run: 
%fmin_is_nan_f32(-NaN:0x0, 0x0.0) == 1 @@ -60,8 +62,9 @@ block0(v0: f32, v1: f32): function %fmin_p_f64(f64, f64) -> f64 { block0(v0: f64, v1: f64): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = select v2, v1, v0 + return v3 } ; run: %fmin_p_f64(0x1.0, 0x2.0) == 0x1.0 ; run: %fmin_p_f64(0x1.0p10, 0x1.0p11) == 0x1.0p10 @@ -97,10 +100,11 @@ block0(v0: f64, v1: f64): function %fmin_is_nan_f64(f64, f64) -> i32 { block0(v0: f64, v1: f64): - v2 = fmin_pseudo v0, v1 - v3 = fcmp ne v2, v2 - v4 = uextend.i32 v3 - return v4 + v2 = fcmp lt v1, v0 + v3 = select v2, v1, v0 + v4 = fcmp ne v3, v3 + v5 = uextend.i32 v4 + return v5 } ; run: %fmin_is_nan_f64(-NaN, 0x0.0) == 1 ; run: %fmin_is_nan_f64(-NaN:0x0, 0x0.0) == 1 diff --git a/cranelift/filetests/filetests/runtests/issue5569.clif b/cranelift/filetests/filetests/runtests/issue5569.clif index 73b6c8b00cdf..248114a81ea2 100644 --- a/cranelift/filetests/filetests/runtests/issue5569.clif +++ b/cranelift/filetests/filetests/runtests/issue5569.clif @@ -67,7 +67,8 @@ block0(v0: i16, v1: f64, v2: i32, v3: i64, v4: i16, v5: i128, v6: f32): v103 = bor v97, v102 v104 = select v103, v96, v15 ; v96 = 1 v17 = sdiv v15, v104 - v18 = fmax_pseudo v6, v6 + v800 = fcmp lt v6, v6 + v18 = select v800, v6, v6 v105 = iconst.i32 0 v106 = iconst.i32 1 v107 = icmp eq v17, v105 ; v105 = 0 @@ -238,7 +239,8 @@ block0(v0: i16, v1: f64, v2: i32, v3: i64, v4: i16, v5: i128, v6: f32): v253 = bor v247, v252 v254 = select v253, v246, v33 ; v246 = 1 v34 = sdiv v33, v254 - v35 = fmax_pseudo v18, v18 + v801 = fcmp lt v18, v18 + v35 = select v801, v18, v18 v255 = iconst.i32 0 v256 = iconst.i32 1 v257 = icmp eq v34, v255 ; v255 = 0 diff --git a/cranelift/filetests/filetests/runtests/simd-fmin-max-pseudo.clif b/cranelift/filetests/filetests/runtests/simd-fmin-max-pseudo.clif index 5cb46d1ad38d..6932e6c622eb 100644 --- a/cranelift/filetests/filetests/runtests/simd-fmin-max-pseudo.clif +++ 
b/cranelift/filetests/filetests/runtests/simd-fmin-max-pseudo.clif @@ -7,30 +7,38 @@ target riscv64gc has_v function %fmin_pseudo_f32x4(f32x4, f32x4) -> f32x4 { block0(v0:f32x4, v1:f32x4): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = bitcast.f32x4 little v2 + v4 = bitselect v3, v1, v0 + return v4 } ; run: %fmin_pseudo_f32x4([0x1.0 NaN 0x0.1 -0x0.0], [0x2.0 0x2.0 NaN 0x0.0]) == [0x1.0 NaN 0x0.1 -0x0.0] function %fmax_pseudo_f32x4(f32x4, f32x4) -> f32x4 { block0(v0:f32x4, v1:f32x4): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = bitcast.f32x4 little v2 + v4 = bitselect v3, v1, v0 + return v4 } ; run: %fmax_pseudo_f32x4([0x1.0 NaN 0x0.1 -0x0.0], [0x2.0 0x2.0 NaN 0x0.0]) == [0x2.0 NaN 0x0.1 -0x0.0] function %fmin_pseudo_f64x2(f64x2, f64x2) -> f64x2 { block0(v0:f64x2, v1:f64x2): - v2 = fmin_pseudo v0, v1 - return v2 + v2 = fcmp lt v1, v0 + v3 = bitcast.f64x2 little v2 + v4 = bitselect v3, v1, v0 + return v4 } ; run: %fmin_pseudo_f64x2([0x1.0 NaN], [0x2.0 0x2.0]) == [0x1.0 NaN] ; run: %fmin_pseudo_f64x2([0x0.1 -0x0.0], [NaN 0x0.0]) == [0x0.1 -0x0.0] function %fmax_pseudo_f64x2(f64x2, f64x2) -> f64x2 { block0(v0:f64x2, v1:f64x2): - v2 = fmax_pseudo v0, v1 - return v2 + v2 = fcmp lt v0, v1 + v3 = bitcast.f64x2 little v2 + v4 = bitselect v3, v1, v0 + return v4 } ; run: %fmax_pseudo_f64x2([0x1.0 NaN], [0x2.0 0x2.0]) == [0x2.0 NaN] ; run: %fmax_pseudo_f64x2([0x0.1 -0x0.0], [NaN 0x0.0]) == [0x0.1 -0x0.0] diff --git a/cranelift/filetests/filetests/wasm/x64-pmin.wat b/cranelift/filetests/filetests/wasm/x64-pmin.wat new file mode 100644 index 000000000000..474009b70a26 --- /dev/null +++ b/cranelift/filetests/filetests/wasm/x64-pmin.wat @@ -0,0 +1,136 @@ +;;! target = "x86_64" +;;! compile = true +;;! 
settings = ["sse42", "opt_level=speed", "has_avx"] + +(module + (func (export "f32.pmin") (param f32 f32) (result f32) + (select + (local.get 1) (local.get 0) + (f32.lt (local.get 1) (local.get 0)))) + (func (export "f32.pmax") (param f32 f32) (result f32) + (select + (local.get 1) (local.get 0) + (f32.lt (local.get 0) (local.get 1)))) + + (func (export "f64.pmin") (param f64 f64) (result f64) + (select + (local.get 1) (local.get 0) + (f64.lt (local.get 1) (local.get 0)))) + (func (export "f64.pmax") (param f64 f64) (result f64) + (select + (local.get 1) (local.get 0) + (f64.lt (local.get 0) (local.get 1)))) + + (func (export "f32x4.pmin") (param v128 v128) (result v128) + (f32x4.pmin (local.get 0) (local.get 1))) + (func (export "f32x4.pmax") (param v128 v128) (result v128) + (f32x4.pmax (local.get 0) (local.get 1))) + + (func (export "f64x2.pmin") (param v128 v128) (result v128) + (f64x2.pmin (local.get 0) (local.get 1))) + (func (export "f64x2.pmax") (param v128 v128) (result v128) + (f64x2.pmax (local.get 0) (local.get 1))) +) +;; function u0:0: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; vminss %xmm1, %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:1: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; vmaxss %xmm1, %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:2: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; vminsd %xmm1, %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq 
%rbp +;; ret +;; +;; function u0:3: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; vmaxsd %xmm1, %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:4: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; vminps %xmm1, %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:5: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; vmaxps %xmm1, %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:6: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; vminpd %xmm1, %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:7: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; vmaxpd %xmm1, %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq %rbp +;; ret diff --git a/cranelift/fuzzgen/src/function_generator.rs b/cranelift/fuzzgen/src/function_generator.rs index a726f424089f..8fdb2694d842 100644 --- a/cranelift/fuzzgen/src/function_generator.rs +++ b/cranelift/fuzzgen/src/function_generator.rs @@ -1170,12 +1170,8 @@ static OPCODE_SIGNATURES: Lazy> = Lazy::new(|| { (Opcode::Sshr, &[I64X2, I128], &[I64X2]), (Opcode::Fmin, 
&[F32X4, F32X4], &[F32X4]), (Opcode::Fmin, &[F64X2, F64X2], &[F64X2]), - (Opcode::FminPseudo, &[F32X4, F32X4], &[F32X4]), - (Opcode::FminPseudo, &[F64X2, F64X2], &[F64X2]), (Opcode::Fmax, &[F32X4, F32X4], &[F32X4]), (Opcode::Fmax, &[F64X2, F64X2], &[F64X2]), - (Opcode::FmaxPseudo, &[F32X4, F32X4], &[F32X4]), - (Opcode::FmaxPseudo, &[F64X2, F64X2], &[F64X2]), (Opcode::FcvtToUintSat, &[F32X4], &[I8]), (Opcode::FcvtToUintSat, &[F64X2], &[I8]), (Opcode::FcvtToUintSat, &[F32X4], &[I16]), diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs index 3ee76362c869..f13d2c918400 100644 --- a/cranelift/interpreter/src/step.rs +++ b/cranelift/interpreter/src/step.rs @@ -879,11 +879,6 @@ where (a, b) if a.is_zero()? && b.is_zero()? && b.is_negative()? => b, (a, b) => a.smin(b)?, }), - Opcode::FminPseudo => assign(match (arg(0), arg(1)) { - (a, b) if a.is_nan()? || b.is_nan()? => a, - (a, b) if a.is_zero()? && b.is_zero()? => a, - (a, b) => a.smin(b)?, - }), Opcode::Fmax => assign(match (arg(0), arg(1)) { (a, _) if a.is_nan()? => a, (_, b) if b.is_nan()? => b, @@ -891,11 +886,6 @@ where (a, b) if a.is_zero()? && b.is_zero()? && b.is_negative()? => a, (a, b) => a.smax(b)?, }), - Opcode::FmaxPseudo => assign(match (arg(0), arg(1)) { - (a, b) if a.is_nan()? || b.is_nan()? => a, - (a, b) if a.is_zero()? && b.is_zero()? 
=> a, - (a, b) => a.smax(b)?, - }), Opcode::Ceil => unary(DataValueExt::ceil, arg(0))?, Opcode::Floor => unary(DataValueExt::floor, arg(0))?, Opcode::Trunc => unary(DataValueExt::trunc, arg(0))?, diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index 0d5e78889744..34e8100f7de3 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -1956,12 +1956,29 @@ pub fn translate_operator( state.push1(builder.ins().fmin(a, b)) } Operator::F32x4PMax | Operator::F64x2PMax => { - let (a, b) = pop2_with_bitcast(state, type_of(op), builder); - state.push1(builder.ins().fmax_pseudo(a, b)) + // Note the careful ordering here with respect to `fcmp` and + // `bitselect`. This matches the spec definition of: + // + // fpmax(z1, z2) = + // * If z1 is less than z2 then return z2. + // * Else return z1. + let ty = type_of(op); + let (a, b) = pop2_with_bitcast(state, ty, builder); + let cmp = builder.ins().fcmp(FloatCC::LessThan, a, b); + let cmp = optionally_bitcast_vector(cmp, ty, builder); + state.push1(builder.ins().bitselect(cmp, b, a)) } Operator::F32x4PMin | Operator::F64x2PMin => { - let (a, b) = pop2_with_bitcast(state, type_of(op), builder); - state.push1(builder.ins().fmin_pseudo(a, b)) + // Note the careful ordering here which is similar to `pmax` above: + // + // fpmin(z1, z2) = + // * If z2 is less than z1 then return z2. + // * Else return z1. 
+ let ty = type_of(op); + let (a, b) = pop2_with_bitcast(state, ty, builder); + let cmp = builder.ins().fcmp(FloatCC::LessThan, b, a); + let cmp = optionally_bitcast_vector(cmp, ty, builder); + state.push1(builder.ins().bitselect(cmp, b, a)) } Operator::F32x4Sqrt | Operator::F64x2Sqrt => { let a = pop1_with_bitcast(state, type_of(op), builder); @@ -2243,27 +2260,39 @@ pub fn translate_operator( } Operator::F32x4RelaxedMax | Operator::F64x2RelaxedMax => { - let (a, b) = pop2_with_bitcast(state, type_of(op), builder); + let ty = type_of(op); + let (a, b) = pop2_with_bitcast(state, ty, builder); state.push1( if environ.relaxed_simd_deterministic() || !environ.is_x86() { // Deterministic semantics match the `fmax` instruction, or // the `fAAxBB.max` wasm instruction. builder.ins().fmax(a, b) } else { - builder.ins().fmax_pseudo(a, b) + // Note that this matches the `pmax` translation which has + // careful ordering of its operands to trigger + // pattern-matches in the x86 backend. + let cmp = builder.ins().fcmp(FloatCC::LessThan, a, b); + let cmp = optionally_bitcast_vector(cmp, ty, builder); + builder.ins().bitselect(cmp, b, a) }, ) } Operator::F32x4RelaxedMin | Operator::F64x2RelaxedMin => { - let (a, b) = pop2_with_bitcast(state, type_of(op), builder); + let ty = type_of(op); + let (a, b) = pop2_with_bitcast(state, ty, builder); state.push1( if environ.relaxed_simd_deterministic() || !environ.is_x86() { // Deterministic semantics match the `fmin` instruction, or // the `fAAxBB.min` wasm instruction. builder.ins().fmin(a, b) } else { - builder.ins().fmin_pseudo(a, b) + // Note that this matches the `pmin` translation which has + // careful ordering of its operands to trigger + // pattern-matches in the x86 backend. + let cmp = builder.ins().fcmp(FloatCC::LessThan, b, a); + let cmp = optionally_bitcast_vector(cmp, ty, builder); + builder.ins().bitselect(cmp, b, a) }, ); }