From 2023b4cf27f6036637807c906ee9fbe82797f71f Mon Sep 17 00:00:00 2001
From: XYenChi <oriachiuan@gmail.com>
Date: Sun, 19 Apr 2026 23:32:28 +0800
Subject: [PATCH 01/12] Add RISC-V Blocklist (#1)

* Add RISC-V 64 BLOCK_LIST

* Mark a long-running test_linalg test case as @slowTest
---
 test/run_test.py    | 47 +++++++++++++++++++++++++++++++++++++++++++++
 test/test_linalg.py |  1 +
 2 files changed, 48 insertions(+)

diff --git a/test/run_test.py b/test/run_test.py
index ff2a617a6eb78..0246e5e761175 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -108,6 +108,7 @@ def upload_adhoc_failure_json(*args, **kwargs):
 INDUCTOR_TEST_PREFIX = "inductor"
 IS_SLOW = "slow" in TEST_CONFIG or "slow" in BUILD_ENVIRONMENT
 IS_S390X = platform.machine() == "s390x"
+IS_RISCV64 = platform.machine() == "riscv64"
 
 
 # Note [ROCm parallel CI testing]
@@ -281,6 +282,45 @@ def __contains__(self, item):
     "test_xpu",
 ]
 
+RISCV64_BLOCKLIST = [
+    # disable distributed related test
+    "inductor/test_distributed_patterns"
+    "fx/test_dce_pass"
+    "export/test_cpp_serdes"
+    "export/test_export"
+    "export/test_export_strict"
+    "export/test_export_training_ir_to_run_decomp"
+    "export/test_retraceability"
+    "export/test_serdes"
+    "export/test_strict_export_v2"
+    "test_public_bindings"
+    # quantized engine NoQEngine is not supported
+    "test_torch"
+    "ao/sparsity/test_composability"
+    # QNNPACK is not supported
+    "export/test_converter"
+    # record_contex_cpp is not support on non-linux non-x86_64 platforms
+    "torch_np/numpy_tests/core/test_numeric"
+    # Failed to import torch.distributed.run: cannot import name 'Store' from 'torch.distributed'
+    "test_testing"
+    # TODO:L1 cache size = 0, need to fix
+    "inductor/test_cpu_select_algorithm"
+    "inductor/test_aot_inductor_arrayref"
+    "inductor/test_cpu_repro"
+    # TODO:scalar value not equal, need to fix
+    "profiler/test_profiler"
+    # TODO precision
+    "test_binary_ufuncs"
+    "test_decomp"
+    # TODO no CUDA related module
+    "quantization/core/test_workflow_module"  # TestFakeQuantize.test_fq_module_per_channel
+    "quantization/core/test_workflow_ops"
+    "quantization/core/test_quantized_op"
+    # z3-solver build fail
+    "test_proxy_tensor"
+]
+
+
 # The tests inside these files should never be run in parallel with each other
 RUN_PARALLEL_BLOCKLIST = [
     "test_extension_utils",
@@ -1822,6 +1862,13 @@ def get_selected_tests(options) -> list[str]:
             selected_tests,
             "Skip distributed tests on s390x",
         )
+    elif IS_RISCV64:
+        selected_tests = exclude_tests(RISCV64_BLOCKLIST, selected_tests, "on riscv64")
+        selected_tests = exclude_tests(
+            DISTRIBUTED_TESTS,
+            selected_tests,
+            "Skip distributed tests on riscv64",
+        )
 
     # skip all distributed tests if distributed package is not available.
     if not dist.is_available():
diff --git a/test/test_linalg.py b/test/test_linalg.py
index e9461bf83796f..cb1e67e10e598 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -8954,6 +8954,7 @@ def test_matrix_exp_backward_input_validation(self, device, dtype):
         with self.assertRaisesRegex(RuntimeError, "must be batches of square matrices"):
             torch.ops.aten.matrix_exp_backward(non_square, grad_non_square)
 
+    @slowTest
     @skipCUDAIfNoMagma
     @skipCPUIfNoLapack
     @dtypes(torch.float, torch.double, torch.complex64, torch.complex128)

From 5a20e9ebaa320b870879b6bd6aae7aac7b1d3712 Mon Sep 17 00:00:00 2001
From: Bo YU <tsu.yubo@gmail.com>
Date: Wed, 22 Apr 2026 08:10:52 +0000
Subject: [PATCH 02/12] add riscv64 ci

---
 .github/workflows/ci-riscv64.yml | 99 ++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 .github/workflows/ci-riscv64.yml

diff --git a/.github/workflows/ci-riscv64.yml b/.github/workflows/ci-riscv64.yml
new file mode 100644
index 0000000000000..ffc859265b3c9
--- /dev/null
+++ b/.github/workflows/ci-riscv64.yml
@@ -0,0 +1,99 @@
+# Note: this runner is provided externally, so we minimize its access to
+# secrets.
+on:
+  push:
+    branches: [riscv]
+
+  pull_request_target:
+    types: [opened, synchronize, reopened]
+  
+
+name: CI (riscv64)
+
+permissions:
+  contents: read
+  # No permissions to secrets.
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+# FIXME: Drop this
+env:
+  RUSTFLAGS: -D warnings
+  CARGO_TERM_COLOR: always
+
+jobs:
+  build:
+    name: Build and test
+    runs-on: [self-hosted, linux, amd64]
+    # This is in its own separate environment.
+    environment: riscv64
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0   # merge-base
+
+      - name: Extract PR info
+        run: |
+          echo "BASE_SHA=${{ github.event.pull_request.base.sha }}" >> $GITHUB_ENV
+          echo "HEAD_SHA=${{ github.event.pull_request.head.sha }}" >> GITHUB_ENV
+          echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> GITHUB_ENV
+
+      - name: Diff base and head
+        run: |
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+          echo "Push PR build"
+            BASE_REF="${{ github.base_ref }}"
+            HEAD_REF="${{ github.head_ref }}"
+
+            echo "Base ref: $BASE_REF"
+            echo "Head ref: $HEAD_REF"
+
+            # Hard constraint: the PR must target the riscv branch
+            if [ "$BASE_REF" != "riscv" ]; then
+              echo "ERROR: PR must target 'riscv' branch, got '$BASE_REF'"
+              exit 1
+            fi
+
+            BASE="$BASE_SHA"
+            HEAD="$HEAD_SHA"
+          else
+            echo "Push to riscv"
+            # 统一用 riscv 作为 baseline
+            git fetch origin riscv
+
+            BASE=$(git merge-base HEAD origin/main)
+            HEAD=$(git rev-parse HEAD)
+
+          fi
+
+          echo "BASE_COMMIT=$BASE" >> $GITHUB_ENV
+          echo "HEAD_COMMIT=$HEAD" >> $GITHUB_ENV
+
+          echo "Base: $BASE"
+          echo "Head: $HEAD"
+
+      - name: Generate patch
+        run: |
+          echo "Generating patch..."
+
+          git diff $BASE_COMMIT $HEAD_COMMIT > patch.diff
+
+          echo "Patch size:"
+          wc -l patch.diff
+          cat patch.diff
+
+          # 可选：避免空 patch
+          if [ ! -s patch.diff ]; then
+            echo "Warning: empty patch"
+          fi
+
+      - name: Trigger Jenkins Job
+        run: |
+          export BASE_COMMIT=${BASE_COMMIT}
+          export PATCH_FILE=$(pwd)/patch.diff
+          export GITHUB_PR=${PR_NUMBER:-0}
+
+          #bash /home/jenkins/scripts/jenkins-run.sh

From 8cf1653f9caa96bb645573044661812a39c912f3 Mon Sep 17 00:00:00 2001
From: vimer <yuzibode@126.com>
Date: Fri, 24 Apr 2026 14:58:31 +0800
Subject: [PATCH 03/12] Test ci with PR (#8)

* Add riscv64 ci with PR
---
 .github/workflows/ci-riscv64.yml | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci-riscv64.yml b/.github/workflows/ci-riscv64.yml
index ffc859265b3c9..c14ce1e4c3efa 100644
--- a/.github/workflows/ci-riscv64.yml
+++ b/.github/workflows/ci-riscv64.yml
@@ -6,7 +6,6 @@ on:
 
   pull_request_target:
     types: [opened, synchronize, reopened]
-  
 
 name: CI (riscv64)
 
@@ -33,7 +32,7 @@ jobs:
       - name: Checkout repo
         uses: actions/checkout@v4
         with:
-          fetch-depth: 0   # merge-base
+          fetch-depth: 3000   # shadow clone?
 
       - name: Extract PR info
         run: |
@@ -44,7 +43,7 @@ jobs:
       - name: Diff base and head
         run: |
           if [ "${{ github.event_name }}" = "pull_request" ]; then
-          echo "Push PR build"
+            echo "Push PR build"
             BASE_REF="${{ github.base_ref }}"
             HEAD_REF="${{ github.head_ref }}"
 
@@ -57,8 +56,12 @@ jobs:
               exit 1
             fi
 
-            BASE="$BASE_SHA"
-            HEAD="$HEAD_SHA"
+            // need to get contents of the PR
+            git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-head
+            git fetch origin pull/${{ github.event.pull_request.number }}/base:pr-base
+
+            BASE=$(git merge-base pr-base pr-head)
+            HEAD=pr-head
           else
             echo "Push to riscv"
             # 统一用 riscv 作为 baseline

From 4657c78e6841d314d6df8902e9c68419d9cfe4d3 Mon Sep 17 00:00:00 2001
From: Bo YU <tsu.yubo@gmail.com>
Date: Fri, 24 Apr 2026 07:06:46 +0000
Subject: [PATCH 04/12] Fix missing origin/main branch issue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

⭐ Run Main Diff base and head
Push to riscv
From https://github.com/RuyiAI-Stack/pytorch
 * branch              riscv      -> FETCH_HEAD
fatal: Not a valid object name origin/main
Error:   ❌  Failure - Main Diff base and head
Error: exit status 128
---
 .github/workflows/ci-riscv64.yml | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/ci-riscv64.yml b/.github/workflows/ci-riscv64.yml
index c14ce1e4c3efa..3b3ddd23483e7 100644
--- a/.github/workflows/ci-riscv64.yml
+++ b/.github/workflows/ci-riscv64.yml
@@ -33,16 +33,16 @@ jobs:
         uses: actions/checkout@v4
         with:
           fetch-depth: 3000   # shadow clone?
+          ref: ${{ github.sha }} # including latest sha
 
       - name: Extract PR info
         run: |
           echo "BASE_SHA=${{ github.event.pull_request.base.sha }}" >> $GITHUB_ENV
           echo "HEAD_SHA=${{ github.event.pull_request.head.sha }}" >> GITHUB_ENV
-          echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> GITHUB_ENV
-
+         
       - name: Diff base and head
         run: |
-          if [ "${{ github.event_name }}" = "pull_request" ]; then
+          if [[ "${{ github.event_name }}" = "pull_request" || "${{ github.event_name }}" == "pull_request_target" ]]; then
             echo "Push PR build"
             BASE_REF="${{ github.base_ref }}"
             HEAD_REF="${{ github.head_ref }}"
@@ -56,19 +56,19 @@ jobs:
               exit 1
             fi
 
-            // need to get contents of the PR
+            # need to get contents of the PR
             git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-head
-            git fetch origin pull/${{ github.event.pull_request.number }}/base:pr-base
-
-            BASE=$(git merge-base pr-base pr-head)
+            git fetch origin main
+            BASE=$(git merge-base pr-head origin/main)
             HEAD=pr-head
           else
             echo "Push to riscv"
             # 统一用 riscv 作为 baseline
-            git fetch origin riscv
+            git fetch origin main
+            #git fetch origin riscv
 
-            BASE=$(git merge-base HEAD origin/main)
-            HEAD=$(git rev-parse HEAD)
+            BASE=$(git merge-base ${{ github.sha }} origin/main) # The latest commit
+            HEAD=${{ github.sha }}
 
           fi
 
@@ -97,6 +97,5 @@ jobs:
         run: |
           export BASE_COMMIT=${BASE_COMMIT}
           export PATCH_FILE=$(pwd)/patch.diff
-          export GITHUB_PR=${PR_NUMBER:-0}
 
-          #bash /home/jenkins/scripts/jenkins-run.sh
+          bash /home/jenkins/scripts/jenkins-run.sh

From 9b03457b179ff1430f47c642b9853d99669ebb89 Mon Sep 17 00:00:00 2001
From: Bo YU <tsu.yubo@gmail.com>
Date: Sat, 25 Apr 2026 14:21:06 +0000
Subject: [PATCH 05/12] move the patch to dest

---
 .github/workflows/ci-riscv64.yml | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/ci-riscv64.yml b/.github/workflows/ci-riscv64.yml
index 3b3ddd23483e7..dec68a6d12c16 100644
--- a/.github/workflows/ci-riscv64.yml
+++ b/.github/workflows/ci-riscv64.yml
@@ -39,7 +39,7 @@ jobs:
         run: |
           echo "BASE_SHA=${{ github.event.pull_request.base.sha }}" >> $GITHUB_ENV
           echo "HEAD_SHA=${{ github.event.pull_request.head.sha }}" >> GITHUB_ENV
-         
+
       - name: Diff base and head
         run: |
           if [[ "${{ github.event_name }}" = "pull_request" || "${{ github.event_name }}" == "pull_request_target" ]]; then
@@ -82,20 +82,20 @@ jobs:
         run: |
           echo "Generating patch..."
 
-          git diff $BASE_COMMIT $HEAD_COMMIT > patch.diff
+          SHORT_HEAD=${HEAD_COMMIT:0:7}
+          PATCH_NAME="patch_${SHORT_HEAD}.patch"
+
+          git diff $BASE_COMMIT $HEAD_COMMIT > $PATCH_NAME
 
           echo "Patch size:"
-          wc -l patch.diff
-          cat patch.diff
+          wc -l $PATCH_NAME
 
-          # 可选：避免空 patch
-          if [ ! -s patch.diff ]; then
-            echo "Warning: empty patch"
-          fi
+          cp $PATCH_NAME /home/jenkins/patch/
+          cat /home/jenkins/patch/$PATCH_NAME
 
       - name: Trigger Jenkins Job
         run: |
           export BASE_COMMIT=${BASE_COMMIT}
           export PATCH_FILE=$(pwd)/patch.diff
 
-          bash /home/jenkins/scripts/jenkins-run.sh
+          #bash /home/jenkins/scripts/jenkins-run.sh

From 3f8607eae5bfa51bb897dc848c178447b022cdd4 Mon Sep 17 00:00:00 2001
From: XYenChi <oriachiuan@gmail.com>
Date: Mon, 27 Apr 2026 10:59:07 +0800
Subject: [PATCH 06/12] Fix block list format and remove
 test_cpu_select_algorithm (#4)

* mkldnn is unavailable on RISC-V

* Remove test_cpu_select_algorithm from block_list

* Fix block list format
---
 test/inductor/test_cpu_select_algorithm.py |  2 +
 test/run_test.py                           | 52 +++++++++++-----------
 2 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/test/inductor/test_cpu_select_algorithm.py b/test/inductor/test_cpu_select_algorithm.py
index 6c0fdb84da02e..b554b62eb94bb 100644
--- a/test/inductor/test_cpu_select_algorithm.py
+++ b/test/inductor/test_cpu_select_algorithm.py
@@ -1559,6 +1559,7 @@ def forward(self, x):
         vec_amx = VecAMX()
         self._check_amx_counter(vec_amx)
 
+    @unittest.skipIf(not torch._C._has_mkldnn, "MKLDNN is not enabled")
     @inductor_config.patch({"freezing": True})
     @patches
     @torch.no_grad
@@ -1672,6 +1673,7 @@ def forward(self, x, scale):
             vec_amx = VecAMX()
             self._check_amx_counter(vec_amx)
 
+    @unittest.skipIf(not torch._C._has_mkldnn, "MKLDNN is not enabled")
     @inductor_config.patch({"freezing": True, "cpp.enable_concat_linear": True})
     @patches
     @torch.no_grad
diff --git a/test/run_test.py b/test/run_test.py
index 0246e5e761175..cfb0e5a3fe74a 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -284,40 +284,40 @@ def __contains__(self, item):
 
 RISCV64_BLOCKLIST = [
     # disable distributed related test
-    "inductor/test_distributed_patterns"
-    "fx/test_dce_pass"
-    "export/test_cpp_serdes"
-    "export/test_export"
-    "export/test_export_strict"
-    "export/test_export_training_ir_to_run_decomp"
-    "export/test_retraceability"
-    "export/test_serdes"
-    "export/test_strict_export_v2"
-    "test_public_bindings"
+    "inductor/test_distributed_patterns",
+    "fx/test_dce_pass",
+    "export/test_cpp_serdes",
+    "export/test_export",
+    "export/test_export_strict",
+    "export/test_export_training_ir_to_run_decomp",
+    "export/test_retraceability",
+    "export/test_serdes",
+    "export/test_strict_export_v2",
+    "test_public_bindings",
     # quantized engine NoQEngine is not supported
-    "test_torch"
-    "ao/sparsity/test_composability"
+    "test_torch",
+    "ao/sparsity/test_composability",
     # QNNPACK is not supported
-    "export/test_converter"
+    "export/test_converter",
     # record_contex_cpp is not support on non-linux non-x86_64 platforms
-    "torch_np/numpy_tests/core/test_numeric"
+    "torch_np/numpy_tests/core/test_numeric",
     # Failed to import torch.distributed.run: cannot import name 'Store' from 'torch.distributed'
-    "test_testing"
-    # TODO:L1 cache size = 0, need to fix
-    "inductor/test_cpu_select_algorithm"
-    "inductor/test_aot_inductor_arrayref"
-    "inductor/test_cpu_repro"
+    "test_testing",
+    "inductor/test_aot_inductor_arrayref",
+    "inductor/test_cpu_repro",
+    # TODO: mkldnn not available, shape guard failures on RISC-V
+    "inductor/test_cpu_select_algorithm",
     # TODO:scalar value not equal, need to fix
-    "profiler/test_profiler"
+    "profiler/test_profiler",
     # TODO precision
-    "test_binary_ufuncs"
-    "test_decomp"
+    "test_binary_ufuncs",
+    "test_decomp",
     # TODO no CUDA related module
-    "quantization/core/test_workflow_module"  # TestFakeQuantize.test_fq_module_per_channel
-    "quantization/core/test_workflow_ops"
-    "quantization/core/test_quantized_op"
+    "quantization/core/test_workflow_module",  # TestFakeQuantize.test_fq_module_per_channel
+    "quantization/core/test_workflow_ops",
+    "quantization/core/test_quantized_op",
     # z3-solver build fail
-    "test_proxy_tensor"
+    "test_proxy_tensor",
 ]
 
 

From 714e262658ee624ed1984453b7e395b674f74b31 Mon Sep 17 00:00:00 2001
From: Bo YU <tsu.yubo@gmail.com>
Date: Sat, 25 Apr 2026 14:21:06 +0000
Subject: [PATCH 07/12] move the patch to dest

---
 .github/workflows/ci-riscv64.yml | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/ci-riscv64.yml b/.github/workflows/ci-riscv64.yml
index dec68a6d12c16..d8cd1ecdd8de3 100644
--- a/.github/workflows/ci-riscv64.yml
+++ b/.github/workflows/ci-riscv64.yml
@@ -57,14 +57,14 @@ jobs:
             fi
 
             # need to get contents of the PR
-            git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-head
-            git fetch origin main
+            git fetch --quiet origin pull/${{ github.event.pull_request.number }}/head:pr-head
+            git fetch --quiet origin main
             BASE=$(git merge-base pr-head origin/main)
             HEAD=pr-head
           else
             echo "Push to riscv"
             # 统一用 riscv 作为 baseline
-            git fetch origin main
+            git fetch --quiet origin main
             #git fetch origin riscv
 
             BASE=$(git merge-base ${{ github.sha }} origin/main) # The latest commit
@@ -93,9 +93,8 @@ jobs:
           cp $PATCH_NAME /home/jenkins/patch/
           cat /home/jenkins/patch/$PATCH_NAME
 
+          echo "PATCH_FILE=$PATCH_NAME" >> $GITHUB_ENV
+
       - name: Trigger Jenkins Job
         run: |
-          export BASE_COMMIT=${BASE_COMMIT}
-          export PATCH_FILE=$(pwd)/patch.diff
-
-          #bash /home/jenkins/scripts/jenkins-run.sh
+          bash /home/jenkins/scripts/jenkins-run.sh $BASE_COMMIT  $PATCH_FILE

From 41c36ec91388f2fdb81942e78083aea8e3edfc89 Mon Sep 17 00:00:00 2001
From: Yixuan Chen <chenyixuan@iscas.ac.cn>
Date: Sun, 26 Apr 2026 00:49:38 +0800
Subject: [PATCH 08/12] Fix bytes_to_scalar for float/complex on RISC-V

bytes_to_scalar previously round-tripped raw bytes through Python
float/complex values (via ctypes) before constructing the tensor. This
loses NaN bit patterns on architectures (such as RISC-V) that
canonicalize NaNs in floating-point loads/conversions, causing
test_bytes_to_scalar_cpu_{float32,float64,complex64,complex128} to
fail with mismatched storage bytes.

Construct the scalar tensor by writing the raw bytes directly into its
untyped storage so all input bit patterns (including NaN payloads) are
preserved exactly.
---
 torch/testing/_internal/common_utils.py | 37 +++++++++++--------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 45d7b0b253a30..9c78401637347 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -5457,27 +5457,22 @@ def check_bytes(byte_list):
             if not (0 <= byte <= 255):
                 raise AssertionError(f"byte value out of range: expected 0 <= byte <= 255, got {byte}")
 
-    if dtype.is_complex:
-        if len(byte_list) != (num_bytes * 2):
-            raise AssertionError(
-                f"expected len(byte_list) == {num_bytes * 2} for complex dtype, got {len(byte_list)}"
-            )
-        check_bytes(byte_list)
-        real = ctype.from_buffer((ctypes.c_byte * num_bytes)(
-            *byte_list[:num_bytes])).value
-        imag = ctype.from_buffer((ctypes.c_byte * num_bytes)(
-            *byte_list[num_bytes:])).value
-        res = real + 1j * imag
-    else:
-        if len(byte_list) != num_bytes:
-            raise AssertionError(
-                f"expected len(byte_list) == {num_bytes}, got {len(byte_list)}"
-            )
-        check_bytes(byte_list)
-        res = ctype.from_buffer((ctypes.c_byte * num_bytes)(
-            *byte_list)).value
-
-    return torch.tensor(res, device=device, dtype=dtype)
+    expected_len = num_bytes * 2 if dtype.is_complex else num_bytes
+    if len(byte_list) != expected_len:
+        raise AssertionError(
+            f"expected len(byte_list) == {expected_len}"
+            f"{' for complex dtype' if dtype.is_complex else ''}, got {len(byte_list)}"
+        )
+    check_bytes(byte_list)
+
+    # Write bytes directly into storage to preserve exact bit patterns
+    # (e.g. NaN payloads, which are not preserved when round-tripping through
+    # Python float/complex, especially on architectures like RISC-V that
+    # canonicalize NaNs).
+    res = torch.empty((), dtype=dtype, device=device)
+    src = torch.tensor(byte_list, dtype=torch.uint8, device=device)
+    res.untyped_storage().copy_(src.untyped_storage())
+    return res
 
 
 def copy_func(f):

From a0c8bd82b927571cc8cc1ae08a74cd5f8d0f31d3 Mon Sep 17 00:00:00 2001
From: vimer <yuzibode@126.com>
Date: Sun, 3 May 2026 09:05:59 +0800
Subject: [PATCH 09/12] [blacklist]: update it (#11)

These test files are too slow on riscv64, so simply add them to the blocklist.

Drop test_torch from the list because it is a core test case.
---
 test/run_test.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/test/run_test.py b/test/run_test.py
index cfb0e5a3fe74a..fbd293f973e8a 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -294,8 +294,6 @@ def __contains__(self, item):
     "export/test_serdes",
     "export/test_strict_export_v2",
     "test_public_bindings",
-    # quantized engine NoQEngine is not supported
-    "test_torch",
     "ao/sparsity/test_composability",
     # QNNPACK is not supported
     "export/test_converter",
@@ -318,6 +316,15 @@ def __contains__(self, item):
     "quantization/core/test_quantized_op",
     # z3-solver build fail
     "test_proxy_tensor",
+    # too slow on riscv64
+    # 53013.55 s
+    "functorch/test_aotdispatch",
+    # 25069 s
+    "functorch/test_ops",
+    # 17528 s
+    "test_transformers",
+    # 10897 s
+    "functorch/test_vmap",
 ]
 
 

From bbbab668cf8b38dc832ffb9e8bc02b05a5801879 Mon Sep 17 00:00:00 2001
From: vimer <yuzibode@126.com>
Date: Thu, 7 May 2026 12:32:52 +0800
Subject: [PATCH 10/12] Use commit sha on PR workflow (#12)

---
 .github/workflows/ci-riscv64.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci-riscv64.yml b/.github/workflows/ci-riscv64.yml
index d8cd1ecdd8de3..44cd575a01e01 100644
--- a/.github/workflows/ci-riscv64.yml
+++ b/.github/workflows/ci-riscv64.yml
@@ -60,7 +60,7 @@ jobs:
             git fetch --quiet origin pull/${{ github.event.pull_request.number }}/head:pr-head
             git fetch --quiet origin main
             BASE=$(git merge-base pr-head origin/main)
-            HEAD=pr-head
+            HEAD=$(git rev-parse pr-head)
           else
             echo "Push to riscv"
             # 统一用 riscv 作为 baseline

From 9cac52bf8baa487739b8571bf448f5f43070bff4 Mon Sep 17 00:00:00 2001
From: Yixuan Chen <chenyixuan@iscas.ac.cn>
Date: Sun, 26 Apr 2026 00:49:38 +0800
Subject: [PATCH 11/12] Fix bytes_to_scalar for float/complex on RISC-V

bytes_to_scalar previously round-tripped raw bytes through Python
float/complex values (via ctypes) before constructing the tensor. This
loses NaN bit patterns on architectures (such as RISC-V) that
canonicalize NaNs in floating-point loads/conversions, causing
test_bytes_to_scalar_cpu_{float32,float64,complex64,complex128} to
fail with mismatched storage bytes.

Construct the scalar tensor by writing the raw bytes directly into its
untyped storage so all input bit patterns (including NaN payloads) are
preserved exactly.
---
 torch/testing/_internal/common_utils.py | 37 +++++++++++--------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 45d7b0b253a30..9c78401637347 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -5457,27 +5457,22 @@ def check_bytes(byte_list):
             if not (0 <= byte <= 255):
                 raise AssertionError(f"byte value out of range: expected 0 <= byte <= 255, got {byte}")
 
-    if dtype.is_complex:
-        if len(byte_list) != (num_bytes * 2):
-            raise AssertionError(
-                f"expected len(byte_list) == {num_bytes * 2} for complex dtype, got {len(byte_list)}"
-            )
-        check_bytes(byte_list)
-        real = ctype.from_buffer((ctypes.c_byte * num_bytes)(
-            *byte_list[:num_bytes])).value
-        imag = ctype.from_buffer((ctypes.c_byte * num_bytes)(
-            *byte_list[num_bytes:])).value
-        res = real + 1j * imag
-    else:
-        if len(byte_list) != num_bytes:
-            raise AssertionError(
-                f"expected len(byte_list) == {num_bytes}, got {len(byte_list)}"
-            )
-        check_bytes(byte_list)
-        res = ctype.from_buffer((ctypes.c_byte * num_bytes)(
-            *byte_list)).value
-
-    return torch.tensor(res, device=device, dtype=dtype)
+    expected_len = num_bytes * 2 if dtype.is_complex else num_bytes
+    if len(byte_list) != expected_len:
+        raise AssertionError(
+            f"expected len(byte_list) == {expected_len}"
+            f"{' for complex dtype' if dtype.is_complex else ''}, got {len(byte_list)}"
+        )
+    check_bytes(byte_list)
+
+    # Write bytes directly into storage to preserve exact bit patterns
+    # (e.g. NaN payloads, which are not preserved when round-tripping through
+    # Python float/complex, especially on architectures like RISC-V that
+    # canonicalize NaNs).
+    res = torch.empty((), dtype=dtype, device=device)
+    src = torch.tensor(byte_list, dtype=torch.uint8, device=device)
+    res.untyped_storage().copy_(src.untyped_storage())
+    return res
 
 
 def copy_func(f):

From 87ac9b22d6ac4e136d31bdde5ab0e0d26e0f57e6 Mon Sep 17 00:00:00 2001
From: Yixuan Chen <chenyixuan@iscas.ac.cn>
Date: Thu, 23 Apr 2026 21:07:25 +0800
Subject: [PATCH 12/12] Add NoQEngine fallback for quantized ops on RISC-V

Implement PackedLinearWeightNoQEngine and PackedConvWeightNoQEngine
classes that dequantize inputs, run float computation, and requantize
outputs. This provides a working fallback when no hardware-specific
quantized engine (FBGEMM, QNNPACK, ONEDNN) is available.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../native/quantized/cpu/conv_serialization.h |  12 +
 .../native/quantized/cpu/fbgemm_utils.cpp     | 277 ++++++++++++++++++
 .../ATen/native/quantized/cpu/fbgemm_utils.h  | 140 +++++++++
 aten/src/ATen/native/quantized/cpu/qconv.cpp  |  12 +
 .../native/quantized/cpu/qconv_prepack.cpp    |  12 +
 .../src/ATen/native/quantized/cpu/qlinear.cpp |  19 ++
 .../native/quantized/cpu/qlinear_prepack.cpp  |   8 +
 .../ATen/native/quantized/qconv_unpack.cpp    |  10 +
 8 files changed, 490 insertions(+)

diff --git a/aten/src/ATen/native/quantized/cpu/conv_serialization.h b/aten/src/ATen/native/quantized/cpu/conv_serialization.h
index 3edd398fa789a..3e45a8062b0d8 100644
--- a/aten/src/ATen/native/quantized/cpu/conv_serialization.h
+++ b/aten/src/ATen/native/quantized/cpu/conv_serialization.h
@@ -410,6 +410,18 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> deserialize_conv(
     );
   }
 #endif // AT_MKLDNN_ENABLED()
+  if (ctx.qEngine() == at::QEngine::NoQEngine) {
+    return PackedConvWeightNoQEngine<kSpatialDim>::prepack(
+      std::move(weight.value()),
+      std::move(bias),
+      stride,
+      padding,
+      output_padding,
+      dilation,
+      groups,
+      transpose
+    );
+  }
 TORCH_CHECK(
   false,
   "Didn't find engine for when deserializing ConvPackedParams: ",
diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
index 1e4d2b9960d02..a45824b3485af 100644
--- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
+++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
@@ -25,6 +25,16 @@
 #include <ATen/Functions.h>
 #else
 #include <ATen/ops/cat.h>
+#include <ATen/ops/conv1d.h>
+#include <ATen/ops/conv2d.h>
+#include <ATen/ops/conv3d.h>
+#include <ATen/ops/conv_transpose1d.h>
+#include <ATen/ops/conv_transpose2d.h>
+#include <ATen/ops/conv_transpose3d.h>
+#include <ATen/ops/dequantize.h>
+#include <ATen/ops/linear.h>
+#include <ATen/ops/quantize_per_tensor.h>
+#include <ATen/ops/relu.h>
 
 #include <utility>
 #endif
@@ -365,6 +375,270 @@ Tensor ConvertConvWeightsToChannelLastTensor<3>(
 
 #endif // USE_FBGEMM
 
+// NoQEngine packed weight implementations: dequantize, float compute, quantize.
+// Used as a fallback when no hardware-specific quantized engine is available.
+
+// No real packing here: the quantized weight and optional bias are kept as-is.
+c10::intrusive_ptr<LinearPackedParamsBase> PackedLinearWeightNoQEngine::prepack(
+    at::Tensor weight, std::optional<at::Tensor> bias) {
+  using Packed = PackedLinearWeightNoQEngine;
+  return c10::make_intrusive<Packed>(std::move(weight), std::move(bias));
+}
+
+// Reference linear: dequantize input and weight, run the fp32 linear, then
+// requantize with the requested qparams. The output keeps the quantized dtype
+// of `input`; kQInt8 is only the fallback when `input` is not quantized.
+at::Tensor PackedLinearWeightNoQEngine::apply(
+    at::Tensor input, double output_scale, int64_t output_zero_point) {
+  const auto out_dtype = input.is_quantized() ? input.scalar_type() : c10::kQInt8;
+  at::Tensor output_fp =
+      at::linear(at::dequantize(input), at::dequantize(weight_), bias_);
+  return at::quantize_per_tensor(output_fp, output_scale, output_zero_point, out_dtype);
+}
+
+// ReLU-fused reference linear: dequantize -> linear -> relu -> requantize to
+// the quantized dtype of `input` (kQInt8 only when `input` is not quantized).
+at::Tensor PackedLinearWeightNoQEngine::apply_relu(
+    at::Tensor input, double output_scale, int64_t output_zero_point) {
+  const auto out_dtype = input.is_quantized() ? input.scalar_type() : c10::kQInt8;
+  at::Tensor output_fp =
+      at::linear(at::dequantize(input), at::dequantize(weight_), bias_);
+  at::Tensor relu_out = at::relu(output_fp);
+  return at::quantize_per_tensor(
+      relu_out, output_scale, output_zero_point, out_dtype);
+}
+
+// Out-variant: requantizes to the dtype of `output`, since copy_ between
+// quantized tensors needs matching dtypes (a fixed kQInt8 broke quint8).
+at::Tensor& PackedLinearWeightNoQEngine::apply_out(
+    const at::Tensor& input,
+    double output_scale,
+    int64_t output_zero_point,
+    at::Tensor& output) {
+  at::Tensor output_fp =
+      at::linear(at::dequantize(input), at::dequantize(weight_), bias_);
+  output.copy_(at::quantize_per_tensor(
+      output_fp, output_scale, output_zero_point, output.scalar_type()));
+  return output;
+}
+
+// ReLU-fused out-variant: requantizes to the dtype of `output`, since copy_
+// between quantized tensors needs matching dtypes.
+at::Tensor& PackedLinearWeightNoQEngine::apply_relu_out(
+    const at::Tensor& input,
+    double output_scale,
+    int64_t output_zero_point,
+    at::Tensor& output) {
+  at::Tensor output_fp =
+      at::linear(at::dequantize(input), at::dequantize(weight_), bias_);
+  at::Tensor relu_out = at::relu(output_fp);
+  output.copy_(at::quantize_per_tensor(
+      relu_out, output_scale, output_zero_point, output.scalar_type()));
+  return output;
+}
+
+// fp32-output linear. A float `input` goes through quantize(quint8)->dequantize
+// with the given qparams first; an already-quantized one is just dequantized.
+at::Tensor PackedLinearWeightNoQEngine::apply_with_input_q_dq_qweight_dq_output_fp32(
+    at::Tensor input, double input_scale, int64_t input_zero_point) {
+  at::Tensor input_fp = at::dequantize(input.is_quantized() ? input
+      : at::quantize_per_tensor(input, input_scale, input_zero_point, c10::kQUInt8));
+  return at::linear(input_fp, at::dequantize(weight_), bias_);
+}
+
+// ReLU variant of the fp32-output linear. A float `input` is first quantized
+// (quint8) and dequantized with the given qparams; quantized input is accepted.
+at::Tensor PackedLinearWeightNoQEngine::apply_with_input_q_dq_qweight_dq_relu_output_fp32(
+    at::Tensor input, double input_scale, int64_t input_zero_point) {
+  at::Tensor input_fp = at::dequantize(input.is_quantized() ? input
+      : at::quantize_per_tensor(input, input_scale, input_zero_point, c10::kQUInt8));
+  at::Tensor output_fp = at::linear(input_fp, at::dequantize(weight_), bias_);
+  return at::relu(output_fp);
+}
+
+// Dynamic linear fallback: `input` times the dequantized weight. The input
+// is not dynamically quantized here, so `reduce_range` has no effect.
+at::Tensor PackedLinearWeightNoQEngine::apply_dynamic(
+    at::Tensor input, bool /*reduce_range*/) {
+  return at::linear(input, at::dequantize(weight_), bias_);
+}
+
+// ReLU-fused dynamic linear fallback; `input` is used as given, so
+// `reduce_range` has no effect.
+at::Tensor PackedLinearWeightNoQEngine::apply_dynamic_relu(
+    at::Tensor input, bool /*reduce_range*/) {
+  const at::Tensor weight_fp = at::dequantize(weight_);
+  return at::relu(at::linear(input, weight_fp, bias_));
+}
+
+// Returns the stored quantized weight and optional bias unchanged.
+std::tuple<at::Tensor, std::optional<at::Tensor>> PackedLinearWeightNoQEngine::unpack() {
+  return {weight_, bias_};
+}
+
+// Builds the NoQEngine conv params object. Nothing is repacked: the weight,
+// bias and convolution hyper-parameters are stored exactly as passed in.
+template <int kSpatialDim>
+c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>>
+PackedConvWeightNoQEngine<kSpatialDim>::prepack(
+    at::Tensor weight,
+    std::optional<at::Tensor> bias,
+    torch::List<int64_t> stride,
+    torch::List<int64_t> padding,
+    torch::List<int64_t> output_padding,
+    torch::List<int64_t> dilation,
+    int64_t groups,
+    bool transpose) {
+  using Packed = PackedConvWeightNoQEngine<kSpatialDim>;
+  return c10::make_intrusive<Packed>(
+      std::move(weight), std::move(bias),
+      std::move(stride), std::move(padding),
+      std::move(output_padding), std::move(dilation),
+      groups,
+      transpose);
+}
+
+// Reference quantized convolution: dequantize input and weight, run the fp32
+// (transposed) convolution, then requantize with the requested qparams. The
+// output keeps the quantized dtype of `input`; kQInt8 is only the fallback
+// when `input` is not quantized.
+template <int kSpatialDim>
+at::Tensor PackedConvWeightNoQEngine<kSpatialDim>::apply(
+    const at::Tensor& input,
+    double output_scale,
+    int64_t output_zero_point) {
+  const auto out_dtype =
+      input.is_quantized() ? input.scalar_type() : c10::kQInt8;
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  auto stride = stride_.vec();
+  auto padding = padding_.vec();
+  auto dilation = dilation_.vec();
+  at::Tensor output_fp;
+  if (transpose_) {
+    auto output_padding = output_padding_.vec();
+    if constexpr (kSpatialDim == 1) {
+      output_fp = at::conv_transpose1d(input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    } else if constexpr (kSpatialDim == 2) {
+      output_fp = at::conv_transpose2d(input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    } else if constexpr (kSpatialDim == 3) {
+      output_fp = at::conv_transpose3d(input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    }
+  } else {
+    if constexpr (kSpatialDim == 1) {
+      output_fp = at::conv1d(input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    } else if constexpr (kSpatialDim == 2) {
+      output_fp = at::conv2d(input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    } else if constexpr (kSpatialDim == 3) {
+      output_fp = at::conv3d(input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    }
+  }
+  return at::quantize_per_tensor(
+      output_fp, output_scale, output_zero_point, out_dtype);
+}
+
+// ReLU-fused reference quantized convolution: dequantize input and weight,
+// run the fp32 (transposed) convolution, apply ReLU, then requantize. The
+// output keeps the quantized dtype of `input`; kQInt8 is only the fallback
+// when `input` is not quantized.
+template <int kSpatialDim>
+at::Tensor PackedConvWeightNoQEngine<kSpatialDim>::apply_relu(
+    const at::Tensor& input,
+    double output_scale,
+    int64_t output_zero_point) {
+  const auto out_dtype =
+      input.is_quantized() ? input.scalar_type() : c10::kQInt8;
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  auto stride = stride_.vec();
+  auto padding = padding_.vec();
+  auto dilation = dilation_.vec();
+  at::Tensor output_fp;
+  if (transpose_) {
+    auto output_padding = output_padding_.vec();
+    if constexpr (kSpatialDim == 1) {
+      output_fp = at::conv_transpose1d(input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    } else if constexpr (kSpatialDim == 2) {
+      output_fp = at::conv_transpose2d(input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    } else if constexpr (kSpatialDim == 3) {
+      output_fp = at::conv_transpose3d(input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    }
+  } else {
+    if constexpr (kSpatialDim == 1) {
+      output_fp = at::conv1d(input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    } else if constexpr (kSpatialDim == 2) {
+      output_fp = at::conv2d(input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    } else if constexpr (kSpatialDim == 3) {
+      output_fp = at::conv3d(input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    }
+  }
+  at::Tensor relu_out = at::relu(output_fp);
+  return at::quantize_per_tensor(
+      relu_out, output_scale, output_zero_point, out_dtype);
+}
+
+// Dynamic-conv fallback: convolves `input` as given (it is not dequantized or
+// dynamically quantized here) with the dequantized weight and returns the
+// result of the at:: convolution directly, so `reduce_range` is ignored.
+template <int kSpatialDim>
+at::Tensor PackedConvWeightNoQEngine<kSpatialDim>::apply_dynamic(
+    const at::Tensor& input,
+    bool /*reduce_range*/) {
+  const at::Tensor w_fp = at::dequantize(weight_);
+  const auto strides = stride_.vec();
+  const auto pads = padding_.vec();
+  const auto dilations = dilation_.vec();
+
+  if (!transpose_) {
+    // Regular convolution.
+    if constexpr (kSpatialDim == 1) {
+      return at::conv1d(
+          input, w_fp, bias_, strides, pads, dilations, groups_);
+    } else if constexpr (kSpatialDim == 2) {
+      return at::conv2d(
+          input, w_fp, bias_, strides, pads, dilations, groups_);
+    } else {
+      return at::conv3d(
+          input, w_fp, bias_, strides, pads, dilations, groups_);
+    }
+  }
+
+  // Transposed convolution: groups comes before dilation in these calls.
+  const auto out_pads = output_padding_.vec();
+  if constexpr (kSpatialDim == 1) {
+    return at::conv_transpose1d(
+        input, w_fp, bias_, strides, pads, out_pads, groups_, dilations);
+  } else if constexpr (kSpatialDim == 2) {
+    return at::conv_transpose2d(
+        input, w_fp, bias_, strides, pads, out_pads, groups_, dilations);
+  } else {
+    return at::conv_transpose3d(
+        input, w_fp, bias_, strides, pads, out_pads, groups_, dilations);
+  }
+}
+
+// Returns the stored quantized weight and optional bias unchanged.
+template <int kSpatialDim>
+std::tuple<at::Tensor, std::optional<at::Tensor>> PackedConvWeightNoQEngine<kSpatialDim>::unpack() {
+  return {weight_, bias_};
+}
+
+template struct PackedConvWeightNoQEngine<1>;  // explicit instantiations so the
+template struct PackedConvWeightNoQEngine<2>;  // out-of-line member definitions
+template struct PackedConvWeightNoQEngine<3>;  // above are emitted for 1d/2d/3d
+
 namespace {
   // This is really terrible, but couldn't figure out a better way to constexpr convert int to
   // string and then perform string concatenation on/with it
@@ -469,6 +743,9 @@ int register_linear_params() {
                   return std::apply(PackedLinearWeightsOnednn::prepack, std::move(state));
                 }
 #endif // #if AT_MKLDNN_ENABLED()
+                if (at::globalContext().qEngine() == at::QEngine::NoQEngine) {
+                  return std::apply(PackedLinearWeightNoQEngine::prepack, std::move(state));
+                }
                 TORCH_CHECK(false, "Unknown qengine");
               })
               .def("bias", [](const c10::intrusive_ptr<LinearPackedParamsBase>& self) {
diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
index a1139be833f87..5d5acd8b4950c 100644
--- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
+++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
@@ -352,6 +352,146 @@ Tensor ConvertConvWeightsToChannelLastTensor(
 
 #endif // USE_FBGEMM
 
+// Packed linear parameters used when the active engine is QEngine::NoQEngine.
+// The quantized weight and optional bias are stored untouched; every apply*
+// method is implemented out of line in fbgemm_utils.cpp.
+struct TORCH_API PackedLinearWeightNoQEngine : public LinearPackedParamsBase {
+  PackedLinearWeightNoQEngine(at::Tensor weight, std::optional<at::Tensor> bias)
+      : weight_(std::move(weight)), bias_(std::move(bias)) {}
+
+  // Quantized weight exactly as handed to the constructor.
+  at::Tensor weight_;
+  // Optional bias; passed through to at::linear by the apply* methods.
+  std::optional<at::Tensor> bias_;
+
+  // --- quantized-output variants (plain and ReLU-fused) ---
+  at::Tensor apply(
+      at::Tensor input, double output_scale, int64_t output_zero_point) override;
+
+  at::Tensor apply_relu(
+      at::Tensor input, double output_scale, int64_t output_zero_point) override;
+
+  // --- out variants: the result is copied into `output`, which is returned ---
+  at::Tensor& apply_out(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point,
+      at::Tensor& output) override;
+
+  at::Tensor& apply_relu_out(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point,
+      at::Tensor& output) override;
+
+  // --- fp32-output variants ---
+  at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32(
+      at::Tensor input, double input_scale, int64_t input_zero_point) override;
+
+  at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32(
+      at::Tensor input, double input_scale, int64_t input_zero_point) override;
+
+  // --- dynamic variants ---
+  at::Tensor apply_dynamic(
+      at::Tensor input, bool reduce_range = false) override;
+
+  at::Tensor apply_dynamic_relu(
+      at::Tensor input, bool reduce_range = false) override;
+
+  // Returns the stored (weight, bias) pair.
+  std::tuple<at::Tensor, std::optional<at::Tensor>> unpack() override;
+
+  std::optional<at::Tensor> bias() override {
+    return bias_;
+  }
+
+  // Factory: wraps the given weight and bias in a new instance.
+  static c10::intrusive_ptr<LinearPackedParamsBase> prepack(
+      at::Tensor weight,
+      std::optional<at::Tensor> bias);
+};
+
+// Packed convolution parameters used when the active engine is
+// QEngine::NoQEngine. The weight, bias and convolution hyper-parameters are
+// stored untouched; apply*/unpack/prepack are implemented out of line in
+// fbgemm_utils.cpp, which explicitly instantiates kSpatialDim = 1, 2 and 3.
+template <int kSpatialDim = 2>
+struct TORCH_API PackedConvWeightNoQEngine
+    : public ConvPackedParamsBase<kSpatialDim> {
+  PackedConvWeightNoQEngine(
+      at::Tensor weight,
+      std::optional<at::Tensor> bias,
+      torch::List<int64_t> stride,
+      torch::List<int64_t> padding,
+      torch::List<int64_t> output_padding,
+      torch::List<int64_t> dilation,
+      int64_t groups,
+      bool transpose)
+      : weight_(std::move(weight)),
+        bias_(std::move(bias)),
+        stride_(std::move(stride)),
+        padding_(std::move(padding)),
+        output_padding_(std::move(output_padding)),
+        dilation_(std::move(dilation)),
+        groups_(groups),
+        transpose_(transpose) {}
+
+  // State captured by the constructor. The accessors further down return
+  // the hyper-parameters as-is.
+  at::Tensor weight_;
+  std::optional<at::Tensor> bias_;
+  torch::List<int64_t> stride_;
+  torch::List<int64_t> padding_;
+  torch::List<int64_t> output_padding_;
+  torch::List<int64_t> dilation_;
+  int64_t groups_;
+  bool transpose_;
+
+  // Quantized-output convolution, plain and ReLU-fused.
+  at::Tensor apply(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_relu(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  // Convolves `input` with the dequantized weight without requantizing.
+  at::Tensor apply_dynamic(
+      const at::Tensor& input,
+      bool reduce_range) override;
+
+  // Returns the stored (weight, bias) pair.
+  std::tuple<at::Tensor, std::optional<at::Tensor>> unpack() override;
+
+  // Accessors for the stored convolution hyper-parameters.
+  torch::List<int64_t> stride() const override { return stride_; }
+
+  torch::List<int64_t> padding() const override { return padding_; }
+
+  torch::List<int64_t> output_padding() const override { return output_padding_; }
+
+  torch::List<int64_t> dilation() const override { return dilation_; }
+
+  int64_t groups() const override { return groups_; }
+
+  bool transpose() const override { return transpose_; }
+
+  // Factory: stores the arguments in a new instance; the returned pointer is
+  // typed as the ConvPackedParamsBase<kSpatialDim> base.
+  static c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> prepack(
+      at::Tensor weight,
+      std::optional<at::Tensor> bias,
+      torch::List<int64_t> stride,
+      torch::List<int64_t> padding,
+      torch::List<int64_t> output_padding,
+      torch::List<int64_t> dilation,
+      int64_t groups,
+      bool transpose);
+};
+
 struct TORCH_API PackedEmbeddingBagWeight : public EmbeddingPackedParamsBase {
   PackedEmbeddingBagWeight(
       at::Tensor packed_w,
diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp
index 94ac6350aeb0e..9853d49da6866 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp
@@ -30,6 +30,7 @@
 #include <ATen/ops/_empty_affine_quantized.h>
 #include <ATen/ops/_empty_affine_quantized_native.h>
 #include <ATen/ops/_empty_per_channel_affine_quantized_native.h>
+#include <ATen/ops/dequantize.h>
 #include <ATen/ops/empty.h>
 #include <ATen/ops/quantize_per_channel_native.h>
 #include <ATen/ops/quantize_per_tensor_native.h>
@@ -2147,6 +2148,17 @@ class QConvAddInt8 final {
       }
     }
 #endif
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      // Float fallback; requantize to act's dtype rather than a fixed kQInt8.
+      at::Tensor act_fp = at::dequantize(act);
+      at::Tensor output_fp = packed_weight->apply_dynamic(act_fp, false);
+      output_fp = output_fp + at::dequantize(accum);
+      if (kReluFused) {
+        output_fp = at::relu(output_fp);
+      }
+      return at::native::quantize_per_tensor(output_fp, output_scale, output_zero_point,
+          act.is_quantized() ? act.scalar_type() : c10::kQInt8);
+    }
     TORCH_CHECK(
     false,
     "Didn't find engine for operation quantized::conv2d_add.",
diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
index b7b2c5ca8d30e..66b3ec8db7700 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
@@ -718,6 +718,12 @@ class QConvPackWeightInt8 final {
     }
 #endif
 
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      return PackedConvWeightNoQEngine<kSpatialDim>::prepack(
+          std::move(weight), std::move(bias), stride, padding,
+          output_padding, dilation, groups, transpose);
+    }
+
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::conv2d_prepack ",
@@ -814,6 +820,12 @@ class QConv1dPackWeightInt8 final {
     }
 #endif
 
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      return PackedConvWeightNoQEngine<2>::prepack(
+          std::move(weight), std::move(bias), stride, padding,
+          output_padding, dilation, groups, transpose);
+    }
+
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::conv1d_prepack ",
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp
index 1f726b3ee1c3e..47b015452497f 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp
@@ -25,6 +25,7 @@
 #include <ATen/ops/_empty_affine_quantized_native.h>  // for empty_affine_qu...
 #include <ATen/ops/empty.h>                           // for empty
 #include <ATen/ops/quantize_per_channel_native.h>     // for quantize_per_ch...
+#include <ATen/ops/dequantize.h>
 #include <ATen/ops/quantize_per_tensor_native.h>      // for quantize_per_te...
 #include <ATen/ops/zeros.h>
 #include <ATen/ops/_weight_int4pack_mm_for_cpu.h>
@@ -1526,6 +1527,15 @@ class QLinearLeakyReluInt8 final {
           std::move(input), output_scale, output_zero_point, negative_slope);
     }
 #endif
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      // Float fallback; requantize to input's dtype rather than a fixed kQInt8.
+      auto [weight, bias] = packed_weight->unpack();
+      at::Tensor output_fp =
+          at::linear(at::dequantize(input), at::dequantize(weight), bias);
+      at::Tensor lr_out = at::leaky_relu(output_fp, negative_slope);
+      return at::native::quantize_per_tensor(lr_out, output_scale, output_zero_point,
+          input.is_quantized() ? input.scalar_type() : c10::kQInt8);
+    }
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::linear_leaky_relu ",
@@ -1550,6 +1560,15 @@ class QLinearTanhInt8 final {
           std::move(input), output_scale, output_zero_point);
     }
 #endif
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      // Float fallback; requantize to input's dtype rather than a fixed kQInt8.
+      auto [weight, bias] = packed_weight->unpack();
+      at::Tensor output_fp =
+          at::linear(at::dequantize(input), at::dequantize(weight), bias);
+      at::Tensor tanh_out = at::tanh(output_fp);
+      return at::native::quantize_per_tensor(tanh_out, output_scale, output_zero_point,
+          input.is_quantized() ? input.scalar_type() : c10::kQInt8);
+    }
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::linear_tanh ",
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
index b4ae4e677bcd2..95ab2fa06c9b5 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
@@ -606,6 +606,10 @@ class QLinearPackWeightInt8 final {
       return PackedLinearWeightsOnednn::prepack(std::move(weight), std::move(bias));
     }
 #endif // #if AT_MKLDNN_ENABLED()
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      return PackedLinearWeightNoQEngine::prepack(
+          std::move(weight), std::move(bias));
+    }
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::linear_prepack ",
@@ -645,6 +649,10 @@ class QLinearPackWeightFp16 final {
           "not supported by ONEDNN");
     }
 #endif // #if AT_MKLDNN_ENABLED()
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      return PackedLinearWeightNoQEngine::prepack(
+          std::move(weight), std::move(bias));
+    }
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::linear_prepack_fp16 ",
diff --git a/aten/src/ATen/native/quantized/qconv_unpack.cpp b/aten/src/ATen/native/quantized/qconv_unpack.cpp
index 4c2352a396177..53d351698fbdc 100644
--- a/aten/src/ATen/native/quantized/qconv_unpack.cpp
+++ b/aten/src/ATen/native/quantized/qconv_unpack.cpp
@@ -70,6 +70,10 @@ class QConvUnpackWeightsInt8 final {
     }
 #endif
 
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      return packed_weight->unpack();
+    }
+
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::conv2d_unpack ",
@@ -111,6 +115,12 @@ class QConv1dUnpackWeightsInt8 final {
     }
 #endif
 
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      // unpack() hands back the packed weight itself: squeeze out-of-place.
+      std::tie(weight, bias) = packed_weight->unpack();
+      return std::make_tuple(weight.squeeze(quant_utils::kConv1dSqueezeDim + 2), bias);
+    }
+
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::conv1d_unpack ",