diff --git a/test/benchee/integration/README.md b/test/benchee/integration/README.md
new file mode 100644
index 00000000..aa4e302d
--- /dev/null
+++ b/test/benchee/integration/README.md
@@ -0,0 +1,3 @@
+These tests are split out of `benchee_test.exs` to be able to run more of them in parallel.
+
+These are integration-y tests executing the whole stack.
diff --git a/test/benchee/integration/escript_test.exs b/test/benchee/integration/escript_test.exs
new file mode 100644
index 00000000..b17a32b8
--- /dev/null
+++ b/test/benchee/integration/escript_test.exs
@@ -0,0 +1,21 @@
+defmodule Benchee.EscriptTest do
+  # Cding is global, so can't run in parallel
+  use ExUnit.Case, async: false
+
+  import Benchee.IntegrationHelpers
+
+  describe "escript building" do
+    @working_directory File.cwd!()
+    @sample_project_directory Path.expand("../../fixtures/escript", __DIR__)
+    test "benchee can be built into and used as an escript" do
+      File.cd!(@sample_project_directory)
+      # we don't match the exit_status right now to get better error messages potentially
+      {output, exit_status} = System.cmd("bash", ["test.sh"])
+
+      readme_sample_asserts(output)
+      assert exit_status == 0
+    after
+      File.cd!(@working_directory)
+    end
+  end
+end
diff --git a/test/benchee/integration/evaluation_warning_test.exs b/test/benchee/integration/evaluation_warning_test.exs
new file mode 100644
index 00000000..66006984
--- /dev/null
+++ b/test/benchee/integration/evaluation_warning_test.exs
@@ -0,0 +1,47 @@
+defmodule Benchee.EvaluationWarningTest do
+  use ExUnit.Case, async: true
+
+  describe "warn when functions are evaluated" do
+    test "warns when run in iex" do
+      # test env to avoid repeated compilation on CI
+      port = Port.open({:spawn, "iex -S mix"}, [:binary, env: [{~c"MIX_ENV", ~c"test"}]])
+
+      try do
+        # wait for startup
+        # timeout huge because of CI
+        assert_receive {^port, {:data, "iex(1)> "}}, 20_000
+
+        send(
+          port,
+          {self(),
+           {:command, "Benchee.run(%{\"test\" => fn -> 1 end}, time: 0.001, warmup: 0)\n"}}
+        )
+
+        assert_receive {^port, {:data, "Warning: " <> message}}, 20_000
+        assert message =~ ~r/test.+evaluated.+slower.+compiled.+module.+/is
+
+        # waiting for iex to be ready for input again/the benchmark to be finished
+        # sometimes we get "iex(2)>" as a separate message, sometimes it's attached to the
+        # previous output - hence we gotta doe out own `receive` checking.
+        assert :ok = wait_for_benchmark_finished(port)
+      after
+        # https://elixirforum.com/t/starting-shutting-down-iex-with-a-port-gracefully/60388/2?u=pragtob
+        send(port, {self(), {:command, "\a"}})
+        send(port, {self(), {:command, "q\n"}})
+      end
+    end
+
+    defp wait_for_benchmark_finished(port) do
+      receive do
+        {^port, {:data, output}} ->
+          if String.contains?(output, "iex(2)>") do
+            :ok
+          else
+            wait_for_benchmark_finished(port)
+          end
+      after
+        20_000 -> raise RuntimeError, "Waited too long for iex benchmark to finish/send iex(2)>"
+      end
+    end
+  end
+end
diff --git a/test/benchee/integration/function_overhead_and_outliers_test.exs b/test/benchee/integration/function_overhead_and_outliers_test.exs
new file mode 100644
index 00000000..c8a935fd
--- /dev/null
+++ b/test/benchee/integration/function_overhead_and_outliers_test.exs
@@ -0,0 +1,100 @@
+defmodule Benchee.FunctionOverheadAndOutliersTest do
+  use ExUnit.Case, async: true
+
+  import ExUnit.CaptureIO
+  import Benchee.IntegrationHelpers
+  import Benchee.TestHelpers
+  @test_configuration Benchee.IntegrationHelpers.test_configuration()
+
+  describe "function call overhead measurement" do
+    @overhead_output_regex ~r/function call overhead.*\d+/i
+    test "by default it is off" do
+      output =
+        capture_io(fn ->
+          Benchee.run(
+            %{"sleeps" => fn -> sleep_safe_time() end},
+            @test_configuration
+          )
+        end)
+
+      refute output =~ @overhead_output_regex
+    end
+
+    test "can be turned on" do
+      output =
+        capture_io(fn ->
+          Benchee.run(
+            %{"sleeps" => fn -> sleep_safe_time() end},
+            Keyword.merge(@test_configuration, measure_function_call_overhead: true)
+          )
+        end)
+
+      assert output =~ @overhead_output_regex
+    end
+  end
+
+  describe "exclude_outliers" do
+    test "even with it the high level README example still passes its asserts" do
+      output =
+        capture_io(fn ->
+          list = Enum.to_list(1..10_000)
+          map_fun = fn i -> [i, i * i] end
+
+          Benchee.run(
+            %{
+              "flat_map" => fn -> Enum.flat_map(list, map_fun) end,
+              "map.flatten" => fn -> list |> Enum.map(map_fun) |> List.flatten() end
+            },
+            Keyword.merge(@test_configuration, exclude_outliers: true)
+          )
+        end)
+
+      readme_sample_asserts(output)
+    end
+
+    # The easiest way to create an outlier is to just run something once
+    # and then take a "stable" measurement like reductions or memory
+    test "removes outliers" do
+      {:ok, agent} = Agent.start(fn -> 0 end)
+
+      output =
+        capture_io(fn ->
+          suite =
+            Benchee.run(
+              %{
+                "flawed" => fn ->
+                  # Produce some garbage but only once
+                  # can't use process dictionary as it's a different process every time
+                  if Agent.get(agent, & &1) < 1 do
+                    Enum.reduce(1..10_000, 0, &+/2)
+                    Agent.update(agent, &(&1 + 1))
+                  end
+                end
+              },
+              time: 0,
+              warmup: 0,
+              reduction_time: 0.05,
+              exclude_outliers: true
+            )
+
+          %{scenarios: [%{reductions_data: %{samples: samples, statistics: stats}}]} = suite
+
+          assert [outlier] = stats.outliers
+          assert outlier >= stats.upper_outlier_bound
+          # since the outlier is removed, all values are the same
+          assert stats.std_dev == 0
+          assert stats.minimum == stats.maximum
+
+          # It's a big outlier!
+          assert 10 * stats.average < outlier
+
+          # The outlier is only removed from the stats, but not from the samples
+          assert outlier in samples
+        end)
+
+      # As the outlier is removed, all measurements are the same
+      assert output =~ ~r/all.*reduction.*same/i
+      assert output =~ ~r/exclud.*outlier.*true/i
+    end
+  end
+end
diff --git a/test/benchee/integration/hooks_test.exs b/test/benchee/integration/hooks_test.exs
new file mode 100644
index 00000000..87f35267
--- /dev/null
+++ b/test/benchee/integration/hooks_test.exs
@@ -0,0 +1,65 @@
+defmodule Benchee.HooksTest do
+  use ExUnit.Case, async: true
+
+  import ExUnit.CaptureIO
+  import Benchee.TestHelpers
+  @test_configuration Benchee.IntegrationHelpers.test_configuration()
+
+  describe "hooks" do
+    test "it runs all of them" do
+      capture_io(fn ->
+        myself = self()
+
+        Benchee.run(
+          %{
+            "sleeper" =>
+              {fn -> sleep_safe_time() end,
+               before_each: fn input ->
+                 send(myself, :local_before)
+                 input
+               end,
+               after_each: fn _ -> send(myself, :local_after) end,
+               before_scenario: fn input ->
+                 send(myself, :local_before_scenario)
+                 input
+               end,
+               after_scenario: fn _ -> send(myself, :local_after_scenario) end},
+            "sleeper 2" => fn -> sleep_safe_time() end
+          },
+          Keyword.merge(
+            @test_configuration,
+            time: 0.0001,
+            warmup: 0,
+            before_each: fn input ->
+              send(myself, :global_before)
+              input
+            end,
+            after_each: fn _ -> send(myself, :global_after) end,
+            before_scenario: fn input ->
+              send(myself, :global_before_scenario)
+              input
+            end,
+            after_scenario: fn _ -> send(myself, :global_after_scenario) end
+          )
+        )
+      end)
+
+      assert_received_exactly([
+        # first job with all those local hooks
+        :global_before_scenario,
+        :local_before_scenario,
+        :global_before,
+        :local_before,
+        :local_after,
+        :global_after,
+        :local_after_scenario,
+        :global_after_scenario,
+        # second job that only runs global hooks
+        :global_before_scenario,
+        :global_before,
+        :global_after,
+        :global_after_scenario
+      ])
+    end
+  end
+end
diff --git a/test/benchee/integration/max_sample_size_test.exs b/test/benchee/integration/max_sample_size_test.exs
new file mode 100644
index 00000000..9ef4dab5
--- /dev/null
+++ b/test/benchee/integration/max_sample_size_test.exs
@@ -0,0 +1,39 @@
+defmodule Benchee.MaxSampleSizeTest do
+  use ExUnit.Case, async: true
+
+  import ExUnit.CaptureIO
+  import Benchee.Configuration
+
+  alias Benchee.{Configuration, Suite}
+
+  @default_max_sample_size 1_000_000
+
+  test "max_sample_size defaults to 1 million samples" do
+    assert %Suite{configuration: %Configuration{max_sample_size: 1_000_000}} = init()
+  end
+
+  # Normally we do not test defaults this way, but this covers the full process
+  # using the default `max_sample_size` with formatters enabled.
+  @tag :performance
+  test "max_sample_size by default is set to 1 Million" do
+    capture_io(fn ->
+      suite =
+        Benchee.run(
+          %{
+            "fast" => fn -> :fast end
+          },
+          time: 1,
+          warmup: 0
+        )
+
+      Enum.each(suite.scenarios, fn scenario ->
+        assert scenario.run_time_data.statistics.sample_size <= @default_max_sample_size
+      end)
+    end)
+  end
+
+  test "max_sample_size can be disabled" do
+    assert %Suite{configuration: %Configuration{max_sample_size: nil}} =
+             init(max_sample_size: nil)
+  end
+end
diff --git a/test/benchee/integration/measurement_integration_test.exs b/test/benchee/integration/measurement_integration_test.exs
new file mode 100644
index 00000000..737d72ca
--- /dev/null
+++ b/test/benchee/integration/measurement_integration_test.exs
@@ -0,0 +1,105 @@
+defmodule Benchee.MeasurementIntegrationTest do
+  use ExUnit.Case, async: true
+
+  import ExUnit.CaptureIO
+
+  @test_configuration Benchee.IntegrationHelpers.test_configuration()
+
+  describe "memory measurement" do
+    @describetag :memory_measure
+
+    test "measures memory usage when instructed to do so" do
+      output =
+        capture_io(fn ->
+          Benchee.run(
+            %{"To List" => fn -> Enum.to_list(1..100) end},
+            Keyword.merge(
+              @test_configuration,
+              memory_time: 0.001
+            )
+          )
+        end)
+
+      assert output =~ ~r/Memory usage statistics:/
+      assert output =~ ~r/To List\s+[0-9.]{3,} K*B{1}/
+    end
+
+    test "does not blow up when only measuring memory times" do
+      output =
+        capture_io(fn ->
+          Benchee.run(
+            %{
+              "something" => fn -> Enum.map(1..100, fn i -> i + 1 end) end
+            },
+            Keyword.merge(
+              @test_configuration,
+              time: 0,
+              warmup: 0,
+              memory_time: 0.001
+            )
+          )
+        end)
+
+      # no runtime statistics displayed
+      refute output =~ ~r/ips/i
+      assert output =~ ~r/memory.+statistics/i
+    end
+
+    test "the micro keyword list code from Michal does not break memory measurements #213" do
+      benches = %{
+        "delete old" => fn {kv, key} -> BenchKeyword.delete_v0(kv, key) end,
+        "delete reverse" => fn {kv, key} -> BenchKeyword.delete_v2(kv, key) end,
+        "delete keymember reverse" => fn {kv, key} -> BenchKeyword.delete_v3(kv, key) end,
+        "delete throw" => fn {kv, key} -> BenchKeyword.delete_v1(kv, key) end
+      }
+
+      inputs = %{
+        "large miss" => {Enum.map(1..100, &{:"k#{&1}", &1}), :k101},
+        "large hit" => {Enum.map(1..100, &{:"k#{&1}", &1}), :k100},
+        "small miss" => {Enum.map(1..10, &{:"k#{&1}", &1}), :k11}
+      }
+
+      output =
+        capture_io(fn ->
+          Benchee.run(
+            benches,
+            Keyword.merge(
+              @test_configuration,
+              inputs: inputs,
+              print: [fast_warning: false],
+              memory_time: 0.001,
+              warmup: 0,
+              time: 0
+            )
+          )
+        end)
+
+      refute output =~ "N/A"
+      refute output =~ ~r/warning/i
+      assert output =~ "large hit"
+      # Byte
+      assert output =~ "B"
+
+      assert output =~ ~r/1\.0\dx memory/
+
+      assert output =~ "∞ x memo"
+    end
+  end
+
+  describe "reduction measurement" do
+    test "measures reduction count when instructed to do so" do
+      output =
+        capture_io(fn ->
+          Benchee.run(
+            %{"To List" => fn -> Enum.to_list(1..100) end},
+            Keyword.merge(
+              @test_configuration,
+              reduction_time: 0.1
+            )
+          )
+        end)
+
+      assert output =~ ~r/Reduction count statistics:/
+    end
+  end
+end
diff --git a/test/benchee/integration/profiling_integration_test.exs b/test/benchee/integration/profiling_integration_test.exs
new file mode 100644
index 00000000..5e63c0fa
--- /dev/null
+++ b/test/benchee/integration/profiling_integration_test.exs
@@ -0,0 +1,123 @@
+defmodule Benchee.ProfilingIntegrationTest do
+  use ExUnit.Case, async: true
+
+  import ExUnit.CaptureIO
+  import Benchee.TestHelpers
+
+  @test_configuration Benchee.IntegrationHelpers.test_configuration()
+
+  alias Benchee.{Profile, UnknownProfilerError}
+
+  describe "profiling" do
+    test "integration profiling defaults to no profile" do
+      output =
+        capture_io(fn ->
+          Benchee.run(%{"Sleeps" => fn -> :timer.sleep(10) end}, @test_configuration)
+        end)
+
+      refute output =~ ~r/Profiling.+with/i
+      refute output =~ ~r/Profile done/i
+    end
+
+    test "integration profiling `profile_after: true` runs default profiler" do
+      output =
+        capture_io(fn ->
+          Benchee.run(
+            %{"Sleeps" => fn -> :timer.sleep(10) end},
+            Keyword.merge(
+              @test_configuration,
+              profile_after: true
+            )
+          )
+        end)
+
+      assert output =~ profiling_regex("Sleeps", Profile.default_profiler())
+      assert output =~ end_of_profiling_regex(Profile.default_profiler())
+    end
+
+    for profiler <- Benchee.Profile.builtin_profilers() do
+      @profiler profiler
+      test "integration profiling runs #{inspect(@profiler)} profiler" do
+        output =
+          capture_io(fn ->
+            Benchee.run(
+              %{"Sleeps" => fn -> :timer.sleep(10) end},
+              Keyword.merge(
+                @test_configuration,
+                profile_after: @profiler
+              )
+            )
+          end)
+
+        assert output =~ profiling_regex("Sleeps", @profiler)
+        assert output =~ end_of_profiling_regex(@profiler)
+      end
+    end
+
+    test "integration profiling a wrong profiler raises exception" do
+      assert_raise UnknownProfilerError, fn ->
+        capture_io(fn ->
+          Benchee.run(
+            %{"Sleeps" => fn -> :timer.sleep(10) end},
+            Keyword.merge(
+              @test_configuration,
+              profile_after: :unknown_profiler
+            )
+          )
+        end)
+      end
+    end
+
+    test "profiling and hooks work together" do
+      output =
+        capture_io(fn ->
+          Benchee.run(
+            %{"Sleeps" => fn _arg -> :timer.sleep(10) end},
+            Keyword.merge(
+              @test_configuration,
+              profile_after: true,
+              # the value here isn't too important, it just forces the function to take
+              # an argument which is what can make it break
+              before_each: fn _ -> nil end
+            )
+          )
+        end)
+
+      assert output =~ profiling_regex("Sleeps", Profile.default_profiler())
+      assert output =~ end_of_profiling_regex(Profile.default_profiler())
+    end
+
+    test "profiling and inputs work together" do
+      output =
+        capture_io(fn ->
+          Benchee.run(
+            %{"Sleeps" => fn sleep_time -> :timer.sleep(sleep_time) end},
+            Keyword.merge(
+              @test_configuration,
+              profile_after: true,
+              # the value here isn't too important, it just forces the function to take
+              # an argument which is what can make it break
+              inputs: %{"sleep time" => safe_sleep_time()}
+            )
+          )
+        end)
+
+      assert output =~ profiling_regex("Sleeps", Profile.default_profiler())
+      assert output =~ end_of_profiling_regex(Profile.default_profiler())
+    end
+
+    defp profiling_regex(benchmark_name, profiler) do
+      ~r/Profiling #{benchmark_name} with #{profiler}/
+    end
+
+    # :fprof is the only profiler who doesn't have at the end of its output:
+    # "Profile done over X matching functions"
+    defp end_of_profiling_regex(:fprof) do
+      ~r/CNT.+ACC \(ms\).+OWN \(ms\)/
+    end
+
+    defp end_of_profiling_regex(_profiler) do
+      ~r/Profile done/
+    end
+  end
+end
diff --git a/test/benchee/integration/save_load_test.exs b/test/benchee/integration/save_load_test.exs
new file mode 100644
index 00000000..cdcf7ef5
--- /dev/null
+++ b/test/benchee/integration/save_load_test.exs
@@ -0,0 +1,117 @@
+defmodule Benchee.SaveLoadTest do
+  use ExUnit.Case, async: true
+
+  import ExUnit.CaptureIO
+  import Benchee.IntegrationHelpers
+  import Benchee.TestHelpers
+  @test_configuration Benchee.IntegrationHelpers.test_configuration()
+
+  describe "save & load" do
+    test "saving the suite to disk and restoring it" do
+      save = [save: [path: "save.benchee", tag: "main"]]
+      expected_file = "save.benchee"
+
+      try do
+        configuration = Keyword.merge(@test_configuration, save)
+        map_fun = fn i -> [i, i * i] end
+        list = Enum.to_list(1..10_000)
+
+        capture_io(fn ->
+          suite =
+            Benchee.run(
+              %{
+                "flat_map" => fn -> Enum.flat_map(list, map_fun) end,
+                "map.flatten" => fn -> list |> Enum.map(map_fun) |> List.flatten() end
+              },
+              configuration
+            )
+
+          content = File.read!(expected_file)
+
+          untagged_suite =
+            content
+            |> :erlang.binary_to_term()
+            |> suite_without_scenario_tags()
+
+          assert untagged_suite == without_functions_and_inputs(suite)
+        end)
+
+        loaded_output =
+          capture_io(fn ->
+            Benchee.run(%{}, Keyword.merge(@test_configuration, load: expected_file))
+          end)
+
+        readme_sample_asserts(loaded_output, tag_string: " (main)")
+
+        comparison_output =
+          capture_io(fn ->
+            Benchee.run(
+              %{
+                "too fast" => fn -> nil end
+              },
+              Keyword.merge(@test_configuration, load: expected_file)
+            )
+          end)
+
+        assert comparison_output =~ ~r/^too fast\s+\d+(\.\d+)?.*+$/m
+
+        assert comparison_output =~
+                 ~r/^flat_map \(main\)\s+\d+(\.\d+)?.*- \d+.+x slower \+\d+.+s$/m
+
+        assert comparison_output =~
+                 ~r/^map\.flatten \(main\)\s+\d+(\.\d+)?.*- \d+.+x slower \+\d+.+s$/m
+      after
+        if File.exists?(expected_file) do
+          File.rm!(expected_file)
+        end
+      end
+    end
+
+    # function and input provide no real benefit for the envisioned use case of comparing outputs
+    # what it does is balloon the file size written out and take performance to the groun
+    defp without_functions_and_inputs(suite) do
+      update_in(suite.scenarios, fn scenarios ->
+        Enum.map(scenarios, fn %Benchee.Scenario{} = scenario ->
+          %Benchee.Scenario{scenario | function: nil, input: nil}
+        end)
+      end)
+    end
+
+    test " report/1 raises without providing at least a load option" do
+      assert_raise(ArgumentError, ~r/load/i, fn -> Benchee.report([]) end)
+    end
+
+    test "report/1 saving first and then reporting on it" do
+      save = [save: [path: "save.benchee", tag: nil]]
+      expected_file = "save.benchee"
+
+      try do
+        configuration = Keyword.merge(@test_configuration, save)
+        map_fun = fn i -> [i, i * i] end
+        list = Enum.to_list(1..10_000)
+
+        capture_io(fn ->
+          Benchee.run(
+            %{
+              "flat_map" => fn -> Enum.flat_map(list, map_fun) end,
+              "map.flatten" => fn -> list |> Enum.map(map_fun) |> List.flatten() end
+            },
+            configuration
+          )
+        end)
+
+        report_output =
+          capture_io(fn ->
+            Benchee.report(Keyword.merge(@test_configuration, load: expected_file))
+          end)
+
+        # no system information, benchmarking config or progress for omitted steps is printed out
+        readme_sample_asserts(report_output, benchmarking_prints: false)
+      after
+        if File.exists?(expected_file) do
+          File.rm!(expected_file)
+        end
+      end
+    end
+  end
+end
diff --git a/test/benchee_test.exs b/test/benchee_test.exs
index 47bd5405..3e858efa 100644
--- a/test/benchee_test.exs
+++ b/test/benchee_test.exs
@@ -5,16 +5,18 @@ defmodule BencheeTest do
     Conversion.Duration,
     Formatter,
     Formatters.Console,
-    Profile,
     Statistics,
     Suite,
-    Test.FakeFormatter,
-    UnknownProfilerError
+    Test.FakeFormatter
   }
 
   import ExUnit.CaptureIO
+  import Benchee.IntegrationHelpers
   import Benchee.TestHelpers
 
+  # Several integration tests are split out of this file into their
+  # own files so more tests can run in parallel
+
   @header_regex ~r/^Name.+ips.+average.+deviation.+median.+99th %$/m
   @test_configuration [time: 0.01, warmup: 0.005]
 
@@ -517,28 +519,6 @@ defmodule BencheeTest do
     end)
   end
 
-  # normally I wouldn't test defaults like this, but since we need a test that there will be more
-  # when setting max_sample_size to `nil` and that the whole process works, this is needed
-  @default_max_sample_size 1_000_000
-  @tag :performance
-  test "max_sample_size by default is set to 1 Million" do
-    # not disabling the formatters as I want the formatters to run through with that as well
-    capture_io(fn ->
-      suite =
-        Benchee.run(
-          %{
-            "fast" => fn -> :fast end
-          },
-          time: 1,
-          warmup: 0
-        )
-
-      Enum.each(suite.scenarios, fn scenario ->
-        assert scenario.run_time_data.statistics.sample_size <= @default_max_sample_size
-      end)
-    end)
-  end
-
   test ".run returns the suite intact" do
     capture_io(fn ->
       suite =
@@ -679,567 +659,4 @@ defmodule BencheeTest do
       refute output =~ "Benchmarking"
     end
   end
-
-  describe "save & load" do
-    test "saving the suite to disk and restoring it" do
-      save = [save: [path: "save.benchee", tag: "main"]]
-      expected_file = "save.benchee"
-
-      try do
-        configuration = Keyword.merge(@test_configuration, save)
-        map_fun = fn i -> [i, i * i] end
-        list = Enum.to_list(1..10_000)
-
-        capture_io(fn ->
-          suite =
-            Benchee.run(
-              %{
-                "flat_map" => fn -> Enum.flat_map(list, map_fun) end,
-                "map.flatten" => fn -> list |> Enum.map(map_fun) |> List.flatten() end
-              },
-              configuration
-            )
-
-          content = File.read!(expected_file)
-
-          untagged_suite =
-            content
-            |> :erlang.binary_to_term()
-            |> suite_without_scenario_tags()
-
-          assert untagged_suite == without_functions_and_inputs(suite)
-        end)
-
-        loaded_output =
-          capture_io(fn ->
-            Benchee.run(%{}, Keyword.merge(@test_configuration, load: expected_file))
-          end)
-
-        readme_sample_asserts(loaded_output, tag_string: " (main)")
-
-        comparison_output =
-          capture_io(fn ->
-            Benchee.run(
-              %{
-                "too fast" => fn -> nil end
-              },
-              Keyword.merge(@test_configuration, load: expected_file)
-            )
-          end)
-
-        assert comparison_output =~ ~r/^too fast\s+\d+(\.\d+)?.*+$/m
-
-        assert comparison_output =~
-                 ~r/^flat_map \(main\)\s+\d+(\.\d+)?.*- \d+.+x slower \+\d+.+s$/m
-
-        assert comparison_output =~
-                 ~r/^map\.flatten \(main\)\s+\d+(\.\d+)?.*- \d+.+x slower \+\d+.+s$/m
-      after
-        if File.exists?(expected_file) do
-          File.rm!(expected_file)
-        end
-      end
-    end
-
-    # function and input provide no real benefit for the envisioned use case of comparing outputs
-    # what it does is balloon the file size written out and take performance to the groun
-    defp without_functions_and_inputs(suite) do
-      update_in(suite.scenarios, fn scenarios ->
-        Enum.map(scenarios, fn %Benchee.Scenario{} = scenario ->
-          %Benchee.Scenario{scenario | function: nil, input: nil}
-        end)
-      end)
-    end
-
-    test " report/1 raises without providing at least a load option" do
-      assert_raise(ArgumentError, ~r/load/i, fn -> Benchee.report([]) end)
-    end
-
-    test "report/1 saving first and then reporting on it" do
-      save = [save: [path: "save.benchee", tag: nil]]
-      expected_file = "save.benchee"
-
-      try do
-        configuration = Keyword.merge(@test_configuration, save)
-        map_fun = fn i -> [i, i * i] end
-        list = Enum.to_list(1..10_000)
-
-        capture_io(fn ->
-          Benchee.run(
-            %{
-              "flat_map" => fn -> Enum.flat_map(list, map_fun) end,
-              "map.flatten" => fn -> list |> Enum.map(map_fun) |> List.flatten() end
-            },
-            configuration
-          )
-        end)
-
-        report_output =
-          capture_io(fn ->
-            Benchee.report(Keyword.merge(@test_configuration, load: expected_file))
-          end)
-
-        # no system information, benchmarking config or progress for omitted steps is printed out
-        readme_sample_asserts(report_output, benchmarking_prints: false)
-      after
-        if File.exists?(expected_file) do
-          File.rm!(expected_file)
-        end
-      end
-    end
-  end
-
-  describe "hooks" do
-    test "it runs all of them" do
-      capture_io(fn ->
-        myself = self()
-
-        Benchee.run(
-          %{
-            "sleeper" =>
-              {fn -> sleep_safe_time() end,
-               before_each: fn input ->
-                 send(myself, :local_before)
-                 input
-               end,
-               after_each: fn _ -> send(myself, :local_after) end,
-               before_scenario: fn input ->
-                 send(myself, :local_before_scenario)
-                 input
-               end,
-               after_scenario: fn _ -> send(myself, :local_after_scenario) end},
-            "sleeper 2" => fn -> sleep_safe_time() end
-          },
-          Keyword.merge(
-            @test_configuration,
-            time: 0.0001,
-            warmup: 0,
-            before_each: fn input ->
-              send(myself, :global_before)
-              input
-            end,
-            after_each: fn _ -> send(myself, :global_after) end,
-            before_scenario: fn input ->
-              send(myself, :global_before_scenario)
-              input
-            end,
-            after_scenario: fn _ -> send(myself, :global_after_scenario) end
-          )
-        )
-      end)
-
-      assert_received_exactly([
-        # first job with all those local hooks
-        :global_before_scenario,
-        :local_before_scenario,
-        :global_before,
-        :local_before,
-        :local_after,
-        :global_after,
-        :local_after_scenario,
-        :global_after_scenario,
-        # second job that only runs global hooks
-        :global_before_scenario,
-        :global_before,
-        :global_after,
-        :global_after_scenario
-      ])
-    end
-  end
-
-  describe "memory measurement" do
-    @describetag :memory_measure
-
-    test "measures memory usage when instructed to do so" do
-      output =
-        capture_io(fn ->
-          Benchee.run(
-            %{"To List" => fn -> Enum.to_list(1..100) end},
-            Keyword.merge(
-              @test_configuration,
-              memory_time: 0.001
-            )
-          )
-        end)
-
-      assert output =~ ~r/Memory usage statistics:/
-      assert output =~ ~r/To List\s+[0-9.]{3,} K*B{1}/
-    end
-
-    test "does not blow up when only measuring memory times" do
-      output =
-        capture_io(fn ->
-          Benchee.run(
-            %{
-              "something" => fn -> Enum.map(1..100, fn i -> i + 1 end) end
-            },
-            Keyword.merge(
-              @test_configuration,
-              time: 0,
-              warmup: 0,
-              memory_time: 0.001
-            )
-          )
-        end)
-
-      # no runtime statistics displayed
-      refute output =~ ~r/ips/i
-      assert output =~ ~r/memory.+statistics/i
-    end
-
-    test "the micro keyword list code from Michal does not break memory measurements #213" do
-      benches = %{
-        "delete old" => fn {kv, key} -> BenchKeyword.delete_v0(kv, key) end,
-        "delete reverse" => fn {kv, key} -> BenchKeyword.delete_v2(kv, key) end,
-        "delete keymember reverse" => fn {kv, key} -> BenchKeyword.delete_v3(kv, key) end,
-        "delete throw" => fn {kv, key} -> BenchKeyword.delete_v1(kv, key) end
-      }
-
-      inputs = %{
-        "large miss" => {Enum.map(1..100, &{:"k#{&1}", &1}), :k101},
-        "large hit" => {Enum.map(1..100, &{:"k#{&1}", &1}), :k100},
-        "small miss" => {Enum.map(1..10, &{:"k#{&1}", &1}), :k11}
-      }
-
-      output =
-        capture_io(fn ->
-          Benchee.run(
-            benches,
-            Keyword.merge(
-              @test_configuration,
-              inputs: inputs,
-              print: [fast_warning: false],
-              memory_time: 0.001,
-              warmup: 0,
-              time: 0
-            )
-          )
-        end)
-
-      refute output =~ "N/A"
-      refute output =~ ~r/warning/i
-      assert output =~ "large hit"
-      # Byte
-      assert output =~ "B"
-
-      assert output =~ ~r/1\.0\dx memory/
-
-      assert output =~ "∞ x memo"
-    end
-  end
-
-  describe "reduction measurement" do
-    test "measures reduction count when instructed to do so" do
-      output =
-        capture_io(fn ->
-          Benchee.run(
-            %{"To List" => fn -> Enum.to_list(1..100) end},
-            Keyword.merge(
-              @test_configuration,
-              reduction_time: 0.1
-            )
-          )
-        end)
-
-      assert output =~ ~r/Reduction count statistics:/
-    end
-  end
-
-  describe "profiling" do
-    test "integration profiling defaults to no profile" do
-      output =
-        capture_io(fn ->
-          Benchee.run(%{"Sleeps" => fn -> :timer.sleep(10) end}, @test_configuration)
-        end)
-
-      refute output =~ ~r/Profiling.+with/i
-      refute output =~ ~r/Profile done/i
-    end
-
-    test "integration profiling `profile_after: true` runs default profiler" do
-      output =
-        capture_io(fn ->
-          Benchee.run(
-            %{"Sleeps" => fn -> :timer.sleep(10) end},
-            Keyword.merge(
-              @test_configuration,
-              profile_after: true
-            )
-          )
-        end)
-
-      assert output =~ profiling_regex("Sleeps", Profile.default_profiler())
-      assert output =~ end_of_profiling_regex(Profile.default_profiler())
-    end
-
-    for profiler <- Benchee.Profile.builtin_profilers() do
-      @profiler profiler
-      test "integration profiling runs #{inspect(@profiler)} profiler" do
-        output =
-          capture_io(fn ->
-            Benchee.run(
-              %{"Sleeps" => fn -> :timer.sleep(10) end},
-              Keyword.merge(
-                @test_configuration,
-                profile_after: @profiler
-              )
-            )
-          end)
-
-        assert output =~ profiling_regex("Sleeps", @profiler)
-        assert output =~ end_of_profiling_regex(@profiler)
-      end
-    end
-
-    test "integration profiling a wrong profiler raises exception" do
-      assert_raise UnknownProfilerError, fn ->
-        capture_io(fn ->
-          Benchee.run(
-            %{"Sleeps" => fn -> :timer.sleep(10) end},
-            Keyword.merge(
-              @test_configuration,
-              profile_after: :unknown_profiler
-            )
-          )
-        end)
-      end
-    end
-
-    test "profiling and hooks work together" do
-      output =
-        capture_io(fn ->
-          Benchee.run(
-            %{"Sleeps" => fn _arg -> :timer.sleep(10) end},
-            Keyword.merge(
-              @test_configuration,
-              profile_after: true,
-              # the value here isn't too important, it just forces the function to take
-              # an argument which is what can make it break
-              before_each: fn _ -> nil end
-            )
-          )
-        end)
-
-      assert output =~ profiling_regex("Sleeps", Profile.default_profiler())
-      assert output =~ end_of_profiling_regex(Profile.default_profiler())
-    end
-
-    test "profiling and inputs work together" do
-      output =
-        capture_io(fn ->
-          Benchee.run(
-            %{"Sleeps" => fn sleep_time -> :timer.sleep(sleep_time) end},
-            Keyword.merge(
-              @test_configuration,
-              profile_after: true,
-              # the value here isn't too important, it just forces the function to take
-              # an argument which is what can make it break
-              inputs: %{"sleep time" => safe_sleep_time()}
-            )
-          )
-        end)
-
-      assert output =~ profiling_regex("Sleeps", Profile.default_profiler())
-      assert output =~ end_of_profiling_regex(Profile.default_profiler())
-    end
-
-    defp profiling_regex(benchmark_name, profiler) do
-      ~r/Profiling #{benchmark_name} with #{profiler}/
-    end
-
-    # :fprof is the only profiler who doesn't have at the end of its output:
-    # "Profile done over X matching functions"
-    defp end_of_profiling_regex(:fprof) do
-      ~r/CNT.+ACC \(ms\).+OWN \(ms\)/
-    end
-
-    defp end_of_profiling_regex(_profiler) do
-      ~r/Profile done/
-    end
-  end
-
-  describe "function call overhead measurement" do
-    @overhead_output_regex ~r/function call overhead.*\d+/i
-    test "by default it is off" do
-      output =
-        capture_io(fn ->
-          Benchee.run(
-            %{"sleeps" => fn -> sleep_safe_time() end},
-            @test_configuration
-          )
-        end)
-
-      refute output =~ @overhead_output_regex
-    end
-
-    test "can be turned on" do
-      output =
-        capture_io(fn ->
-          Benchee.run(
-            %{"sleeps" => fn -> sleep_safe_time() end},
-            Keyword.merge(@test_configuration, measure_function_call_overhead: true)
-          )
-        end)
-
-      assert output =~ @overhead_output_regex
-    end
-  end
-
-  describe "exclude_outliers" do
-    test "even with it the high level README example still passes its asserts" do
-      output =
-        capture_io(fn ->
-          list = Enum.to_list(1..10_000)
-          map_fun = fn i -> [i, i * i] end
-
-          Benchee.run(
-            %{
-              "flat_map" => fn -> Enum.flat_map(list, map_fun) end,
-              "map.flatten" => fn -> list |> Enum.map(map_fun) |> List.flatten() end
-            },
-            Keyword.merge(@test_configuration, exclude_outliers: true)
-          )
-        end)
-
-      readme_sample_asserts(output)
-    end
-
-    # The easiest way to create an outlier is to just run something once
-    # and then take a "stable" measurement like reductions or memory
-    test "removes outliers" do
-      {:ok, agent} = Agent.start(fn -> 0 end)
-
-      output =
-        capture_io(fn ->
-          suite =
-            Benchee.run(
-              %{
-                "flawed" => fn ->
-                  # Produce some garbage but only once
-                  # can't use process dictionary as it's a different process every time
-                  if Agent.get(agent, & &1) < 1 do
-                    Enum.map(1..100, fn i -> "garbage #{i}" end)
-                    Agent.update(agent, &(&1 + 1))
-                  end
-                end
-              },
-              time: 0,
-              warmup: 0,
-              reduction_time: 0.005,
-              exclude_outliers: true
-            )
-
-          %{scenarios: [%{reductions_data: %{samples: samples, statistics: stats}}]} = suite
-
-          assert [outlier] = stats.outliers
-          assert outlier >= stats.upper_outlier_bound
-          # since the outlier is removed, all values are the same
-          assert stats.std_dev == 0
-          assert stats.minimum == stats.maximum
-
-          # It's a big outlier!
-          assert 10 * stats.average < outlier
-
-          # The outlier is only removed from the stats, but not from the samples
-          assert outlier in samples
-        end)
-
-      # As the outlier is removed, all measurements are the same
-      assert output =~ ~r/all.*reduction.*same/i
-      assert output =~ ~r/exclud.*outlier.*true/i
-    end
-  end
-
-  describe "warn when functions are evaluated" do
-    test "warns when run in iex" do
-      # test env to avoid repeated compilation on CI
-      port = Port.open({:spawn, "iex -S mix"}, [:binary, env: [{~c"MIX_ENV", ~c"test"}]])
-
-      try do
-        # wait for startup
-        # timeout huge because of CI
-        assert_receive {^port, {:data, "iex(1)> "}}, 20_000
-
-        send(
-          port,
-          {self(),
-           {:command, "Benchee.run(%{\"test\" => fn -> 1 end}, time: 0.001, warmup: 0)\n"}}
-        )
-
-        assert_receive {^port, {:data, "Warning: " <> message}}, 20_000
-        assert message =~ ~r/test.+evaluated.+slower.+compiled.+module.+/is
-
-        # waiting for iex to be ready for input again/the benchmark to be finished
-        # sometimes we get "iex(2)>" as a separate message, sometimes it's attached to the
-        # previous output - hence we gotta doe out own `receive` checking.
-        assert :ok = wait_for_benchmark_finished(port)
-      after
-        # https://elixirforum.com/t/starting-shutting-down-iex-with-a-port-gracefully/60388/2?u=pragtob
-        send(port, {self(), {:command, "\a"}})
-        send(port, {self(), {:command, "q\n"}})
-      end
-    end
-
-    defp wait_for_benchmark_finished(port) do
-      receive do
-        {^port, {:data, output}} ->
-          if String.contains?(output, "iex(2)>") do
-            :ok
-          else
-            wait_for_benchmark_finished(port)
-          end
-      after
-        20_000 -> raise RuntimeError, "Waited too long for iex benchmark to finish/send iex(2)>"
-      end
-    end
-  end
-
-  describe "escript building" do
-    @working_directory File.cwd!()
-    @sample_project_directory Path.expand("fixtures/escript", __DIR__)
-    test "benchee can be built into and used as an escript" do
-      File.cd!(@sample_project_directory)
-      # we don't match the exit_status right now to get better error messages potentially
-      {output, exit_status} = System.cmd("bash", ["test.sh"])
-
-      readme_sample_asserts(output)
-      assert exit_status == 0
-    after
-      File.cd!(@working_directory)
-    end
-  end
-
-  @slower_regex "\\s+- \\d+\\.\\d+x slower \\+\\d+(\\.\\d+)?.+"
-  defp readme_sample_asserts(output, opts \\ [tag_string: "", benchmarking_prints: true]) do
-    if Access.get(opts, :benchmarking_prints) do
-      assert output =~ "warmup: 5 ms"
-      assert output =~ "time: 10 ms"
-      assert output =~ ~r/calculat.*statistics/i
-    end
-
-    tag_string = Access.get(opts, :tag_string, "")
-
-    tag_regex = Regex.escape(tag_string)
-    assert output =~ @header_regex
-    assert output =~ body_regex("flat_map", tag_regex)
-    assert output =~ body_regex("map.flatten", tag_regex)
-    assert output =~ ~r/Comparison/, output
-    assert output =~ ~r/^map.flatten#{tag_regex}\s+\d+(\.\d+)?\s*.?(#{@slower_regex})?$/m
-    assert output =~ ~r/^flat_map#{tag_regex}\s+\d+(\.\d+)?\s*.?(#{@slower_regex})?$/m
-    assert output =~ ~r/#{@slower_regex}/m
-
-    # In windows time resolution seems to be milliseconds, hence even
-    # standard examples produce a fast warning.
-    # So we skip this "basically everything is going fine" test on windows
-    if !windows?(), do: refute(output =~ ~r/fast/i)
-  end
-
-  defp body_regex(benchmark_name, tag_regex \\ "") do
-    ~r/^#{benchmark_name}#{tag_regex}\s+\d+.+\s+\d+\.?\d*.+\s+.+\d+\.?\d*.+\s+\d+\.?\d*.+/m
-  end
-
-  defp windows? do
-    {_, os} = :os.type()
-    os == :nt
-  end
 end
diff --git a/test/support/integration_helpers.ex b/test/support/integration_helpers.ex
new file mode 100644
index 00000000..650cf0db
--- /dev/null
+++ b/test/support/integration_helpers.ex
@@ -0,0 +1,44 @@
+defmodule Benchee.IntegrationHelpers do
+  @moduledoc false
+
+  import ExUnit.Assertions
+
+  @header_regex ~r/^Name.+ips.+average.+deviation.+median.+99th %$/m
+  @slower_regex "\\s+- \\d+\\.\\d+x slower \\+\\d+(\\.\\d+)?.+"
+  @test_configuration [time: 0.01, warmup: 0.005]
+
+  def test_configuration, do: @test_configuration
+
+  def readme_sample_asserts(output, opts \\ [tag_string: "", benchmarking_prints: true]) do
+    if Access.get(opts, :benchmarking_prints) do
+      assert output =~ "warmup: 5 ms"
+      assert output =~ "time: 10 ms"
+      assert output =~ ~r/calculat.*statistics/i
+    end
+
+    tag_string = Access.get(opts, :tag_string, "")
+
+    tag_regex = Regex.escape(tag_string)
+    assert output =~ @header_regex
+    assert output =~ body_regex("flat_map", tag_regex)
+    assert output =~ body_regex("map.flatten", tag_regex)
+    assert output =~ ~r/Comparison/, output
+    assert output =~ ~r/^map.flatten#{tag_regex}\s+\d+(\.\d+)?\s*.?(#{@slower_regex})?$/m
+    assert output =~ ~r/^flat_map#{tag_regex}\s+\d+(\.\d+)?\s*.?(#{@slower_regex})?$/m
+    assert output =~ ~r/#{@slower_regex}/m
+
+    # In windows time resolution seems to be milliseconds, hence even
+    # standard examples produce a fast warning.
+    # So we skip this "basically everything is going fine" test on windows
+    if !windows?(), do: refute(output =~ ~r/fast/i)
+  end
+
+  def body_regex(benchmark_name, tag_regex \\ "") do
+    ~r/^#{benchmark_name}#{tag_regex}\s+\d+.+\s+\d+\.?\d*.+\s+.+\d+\.?\d*.+\s+\d+\.?\d*.+/m
+  end
+
+  def windows? do
+    {_, os} = :os.type()
+    os == :nt
+  end
+end