diff --git a/circle.yml b/circle.yml
index 46ba66ce..979ab5b7 100644
--- a/circle.yml
+++ b/circle.yml
@@ -74,6 +74,12 @@ commands:
           command: |
             cmake --build ~/build --parallel
             cmake --build ~/build --target package
+      - run:
+          name: "Benchmark"
+          working_directory: ~/build
+          command: |
+            cat /proc/cpuinfo
+            test/intx-bench --benchmark_repetitions=11 --benchmark_filter=reciprocal
       - run:
           name: "Test"
           working_directory: ~/build
diff --git a/test/benchmarks/bench_div.cpp b/test/benchmarks/bench_div.cpp
index 243faade..f6697f2f 100644
--- a/test/benchmarks/bench_div.cpp
+++ b/test/benchmarks/bench_div.cpp
@@ -77,7 +77,9 @@ void reciprocal(benchmark::State& state)
     benchmark::DoNotOptimize(x);
 }
 BENCHMARK(reciprocal);
-BENCHMARK(reciprocal);
+BENCHMARK(reciprocal);
+BENCHMARK(reciprocal);
+BENCHMARK(reciprocal);
 BENCHMARK(reciprocal);
 BENCHMARK(reciprocal);
 BENCHMARK(reciprocal);
diff --git a/test/experimental/div.hpp b/test/experimental/div.hpp
index 82a629a8..3a95d98c 100644
--- a/test/experimental/div.hpp
+++ b/test/experimental/div.hpp
@@ -4,18 +4,51 @@
 namespace intx
 {
-inline uint64_t reciprocal_naive(uint64_t d) noexcept
+inline uint64_t reciprocal_native(uint64_t d) noexcept
 {
-    const auto u = uint128{~uint64_t{0}, ~d};
-    uint64_t v{};
+#ifdef __x86_64__
+    uint64_t _;  // NOLINT(*-init-variables)
+    uint64_t v;  // NOLINT(*-init-variables)
+    asm("divq %4"  // NOLINT(*-no-assembler)
+        : "=d"(_), "=a"(v)
+        : "d"(~d), "a"(~uint64_t{0}), "r"(d));
+    return v;
+#else
+    // Fallback implementation.
+    return (uint128{~uint64_t{0}, ~d} / d)[0];
+#endif
+}
 
-#if __x86_64__
-    uint64_t _{};
-    asm("divq %4" : "=d"(_), "=a"(v) : "d"(u[1]), "a"(u[0]), "g"(d));  // NOLINT(hicpp-no-assembler)
+inline uint64_t reciprocal_builtin_uint128(uint64_t d) noexcept
+{
+#if INTX_HAS_BUILTIN_INT128
+    const auto u = (builtin_uint128{~d} << 64) | ~uint64_t{0};
+    return static_cast<uint64_t>(u / d);
 #else
-    v = (u / d)[0];
+    // Fallback implementation.
+    return (uint128{~uint64_t{0}, ~d} / d)[0];
 #endif
+}
 
-    return v;
+/// The copy of the GMP algorithm from "Improved division by invariant integers".
+constexpr uint64_t reciprocal_gmp(uint64_t d) noexcept
+{
+    INTX_REQUIRE(d & 0x8000000000000000);  // Must be normalized.
+
+    const uint64_t d9 = d >> 55;
+    const uint32_t v0 = internal::reciprocal_table[static_cast<size_t>(d9 - 256)];
+
+    const uint64_t d40 = (d >> 24) + 1;
+    const uint64_t v1 = (v0 << 11) - uint32_t(uint32_t{v0 * v0} * d40 >> 40) - 1;
+
+    const uint64_t v2 = (v1 << 13) + (v1 * (0x1000000000000000 - v1 * d40) >> 47);
+
+    const uint64_t d0 = d & 1;
+    const uint64_t d63 = (d >> 1) + d0;  // ceil(d/2)
+    const uint64_t e = ((v2 >> 1) & (0 - d0)) - (v2 * d63);
+    const uint64_t v3 = (umul(v2, e)[1] >> 1) + (v2 << 31);
+
+    const uint64_t v4 = v3 - (umul(v3, d) + d)[1] - d;
+    return v4;
 }
 
 }  // namespace intx
diff --git a/test/unittests/test_div.cpp b/test/unittests/test_div.cpp
index afda30f4..b15e7edb 100644
--- a/test/unittests/test_div.cpp
+++ b/test/unittests/test_div.cpp
@@ -442,15 +442,19 @@ TEST(div, reciprocal)
 
     constexpr auto d_start = uint64_t{1} << 63;
     for (uint64_t d = d_start; d < d_start + n; ++d)
     {
-        auto v = reciprocal_2by1(d);
-        ASSERT_EQ(v, reciprocal_naive(d)) << d;
+        const auto expected = reciprocal_builtin_uint128(d);
+        ASSERT_EQ(reciprocal_2by1(d), expected) << d;
+        ASSERT_EQ(reciprocal_native(d), expected) << d;
+        ASSERT_EQ(reciprocal_gmp(d), expected) << d;
     }
 
     constexpr auto d_end = ~uint64_t{0};
     for (uint64_t d = d_end; d > d_end - n; --d)
     {
-        auto v = reciprocal_2by1(d);
-        ASSERT_EQ(v, reciprocal_naive(d)) << d;
+        const auto expected = reciprocal_builtin_uint128(d);
+        ASSERT_EQ(reciprocal_2by1(d), expected) << d;
+        ASSERT_EQ(reciprocal_native(d), expected) << d;
+        ASSERT_EQ(reciprocal_gmp(d), expected) << d;
     }
 }