From 2023b4cf27f6036637807c906ee9fbe82797f71f Mon Sep 17 00:00:00 2001 From: XYenChi Date: Sun, 19 Apr 2026 23:32:28 +0800 Subject: [PATCH 01/12] Add RISC-V Blocklist (#1) * Add RISC-V 64 BLOCK_LIST * Skip long time testcase --- test/run_test.py | 47 +++++++++++++++++++++++++++++++++++++++++++++ test/test_linalg.py | 1 + 2 files changed, 48 insertions(+) diff --git a/test/run_test.py b/test/run_test.py index ff2a617a6eb78..0246e5e761175 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -108,6 +108,7 @@ def upload_adhoc_failure_json(*args, **kwargs): INDUCTOR_TEST_PREFIX = "inductor" IS_SLOW = "slow" in TEST_CONFIG or "slow" in BUILD_ENVIRONMENT IS_S390X = platform.machine() == "s390x" +IS_RISCV64 = platform.machine() == "riscv64" # Note [ROCm parallel CI testing] @@ -281,6 +282,45 @@ def __contains__(self, item): "test_xpu", ] +RISCV64_BLOCKLIST = [ + # disable distributed related test + "inductor/test_distributed_patterns" + "fx/test_dce_pass" + "export/test_cpp_serdes" + "export/test_export" + "export/test_export_strict" + "export/test_export_training_ir_to_run_decomp" + "export/test_retraceability" + "export/test_serdes" + "export/test_strict_export_v2" + "test_public_bindings" + # quantized engine NoQEngine is not supported + "test_torch" + "ao/sparsity/test_composability" + # QNNPACK is not supported + "export/test_converter" + # record_contex_cpp is not support on non-linux non-x86_64 platforms + "torch_np/numpy_tests/core/test_numeric" + # Failed to import torch.distributed.run: cannot import name 'Store' from 'torch.distributed' + "test_testing" + # TODO:L1 cache size = 0, need to fix + "inductor/test_cpu_select_algorithm" + "inductor/test_aot_inductor_arrayref" + "inductor/test_cpu_repro" + # TODO:scalar value not equal, need to fix + "profiler/test_profiler" + # TODO precision + "test_binary_ufuncs" + "test_decomp" + # TODO no CUDA related module + "quantization/core/test_workflow_module" # TestFakeQuantize.test_fq_module_per_channel + "quantization/core/test_workflow_ops" + "quantization/core/test_quantized_op" + # z3-solver build fail + "test_proxy_tensor" +] + + # The tests inside these files should never be run in parallel with each other RUN_PARALLEL_BLOCKLIST = [ "test_extension_utils", @@ -1822,6 +1862,13 @@ def get_selected_tests(options) -> list[str]: selected_tests, "Skip distributed tests on s390x", ) + elif IS_RISCV64: + selected_tests = exclude_tests(RISCV64_BLOCKLIST, selected_tests, "on riscv64") + selected_tests = exclude_tests( + DISTRIBUTED_TESTS, + selected_tests, + "Skip distributed tests on riscv64", + ) # skip all distributed tests if distributed package is not available. if not dist.is_available(): diff --git a/test/test_linalg.py b/test/test_linalg.py index e9461bf83796f..cb1e67e10e598 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -8954,6 +8954,7 @@ def test_matrix_exp_backward_input_validation(self, device, dtype): with self.assertRaisesRegex(RuntimeError, "must be batches of square matrices"): torch.ops.aten.matrix_exp_backward(non_square, grad_non_square) + @slowTest @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) From 5a20e9ebaa320b870879b6bd6aae7aac7b1d3712 Mon Sep 17 00:00:00 2001 From: Bo YU Date: Wed, 22 Apr 2026 08:10:52 +0000 Subject: [PATCH 02/12] add riscv64 ci --- .github/workflows/ci-riscv64.yml | 99 ++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 .github/workflows/ci-riscv64.yml diff --git a/.github/workflows/ci-riscv64.yml b/.github/workflows/ci-riscv64.yml new file mode 100644 index 0000000000000..ffc859265b3c9 --- /dev/null +++ b/.github/workflows/ci-riscv64.yml @@ -0,0 +1,99 @@ +# Note: this runner is provided externally, so we minimize its access to +# secrets. +on: + push: + branches: [riscv] + + pull_request_target: + types: [opened, synchronize, reopened] + + +name: CI (riscv64) + +permissions: + contents: read + # No permissions to secrets. + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +# FIXME: Drop this +env: + RUSTFLAGS: -D warnings + CARGO_TERM_COLOR: always + +jobs: + build: + name: Build and test + runs-on: [self-hosted, linux, amd64] + # This is in its own separate environment. + environment: riscv64 + steps: + - name: Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 # merge-base + + - name: Extract PR info + run: | + echo "BASE_SHA=${{ github.event.pull_request.base.sha }}" >> $GITHUB_ENV + echo "HEAD_SHA=${{ github.event.pull_request.head.sha }}" >> GITHUB_ENV + echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> GITHUB_ENV + + - name: Diff base and head + run: | + if [ "${{ github.event_name }}" = "pull_request" ]; then + echo "Push PR build" + BASE_REF="${{ github.base_ref }}" + HEAD_REF="${{ github.head_ref }}" + + echo "Base ref: $BASE_REF" + echo "Head ref: $HEAD_REF" + + # 强约束: PR 必须基于 riscv + if [ "$BASE_REF" != "riscv" ]; then + echo "ERROR: PR must target 'riscv' branch, got '$BASE_REF'" + exit 1 + fi + + BASE="$BASE_SHA" + HEAD="$HEAD_SHA" + else + echo "Push to riscv" + # 统一用 riscv 作为 baseline + git fetch origin riscv + + BASE=$(git merge-base HEAD origin/main) + HEAD=$(git rev-parse HEAD) + + fi + + echo "BASE_COMMIT=$BASE" >> $GITHUB_ENV + echo "HEAD_COMMIT=$HEAD" >> $GITHUB_ENV + + echo "Base: $BASE" + echo "Head: $HEAD" + + - name: Generate patch + run: | + echo "Generating patch..." + + git diff $BASE_COMMIT $HEAD_COMMIT > patch.diff + + echo "Patch size:" + wc -l patch.diff + cat patch.diff + + # 可选:避免空 patch + if [ ! -s patch.diff ]; then + echo "Warning: empty patch" + fi + + - name: Trigger Jenkins Job + run: | + export BASE_COMMIT=${BASE_COMMIT} + export PATCH_FILE=$(pwd)/patch.diff + export GITHUB_PR=${PR_NUMBER:-0} + + #bash /home/jenkins/scripts/jenkins-run.sh From 8cf1653f9caa96bb645573044661812a39c912f3 Mon Sep 17 00:00:00 2001 From: vimer Date: Fri, 24 Apr 2026 14:58:31 +0800 Subject: [PATCH 03/12] Test ci with PR (#8) * Add riscv64 ci with PR --- .github/workflows/ci-riscv64.yml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci-riscv64.yml b/.github/workflows/ci-riscv64.yml index ffc859265b3c9..c14ce1e4c3efa 100644 --- a/.github/workflows/ci-riscv64.yml +++ b/.github/workflows/ci-riscv64.yml @@ -6,7 +6,6 @@ on: pull_request_target: types: [opened, synchronize, reopened] - name: CI (riscv64) @@ -33,7 +32,7 @@ jobs: - name: Checkout repo uses: actions/checkout@v4 with: - fetch-depth: 0 # merge-base + fetch-depth: 3000 # shadow clone? - name: Extract PR info run: | @@ -44,7 +43,7 @@ jobs: - name: Diff base and head run: | if [ "${{ github.event_name }}" = "pull_request" ]; then - echo "Push PR build" + echo "Push PR build" BASE_REF="${{ github.base_ref }}" HEAD_REF="${{ github.head_ref }}" @@ -57,8 +56,12 @@ jobs: exit 1 fi - BASE="$BASE_SHA" - HEAD="$HEAD_SHA" + // need to get contents of the PR + git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-head + git fetch origin pull/${{ github.event.pull_request.number }}/base:pr-base + + BASE=$(git merge-base pr-base pr-head) + HEAD=pr-head else echo "Push to riscv" # 统一用 riscv 作为 baseline From 4657c78e6841d314d6df8902e9c68419d9cfe4d3 Mon Sep 17 00:00:00 2001 From: Bo YU Date: Fri, 24 Apr 2026 07:06:46 +0000 Subject: [PATCH 04/12] Fix no main brach issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ⭐ Run Main Diff base and head Push to riscv From https://github.com/RuyiAI-Stack/pytorch * branch riscv -> FETCH_HEAD fatal: Not a valid object name origin/main Error: ❌ Failure - Main Diff base and head Error: exit status 128 --- .github/workflows/ci-riscv64.yml | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci-riscv64.yml b/.github/workflows/ci-riscv64.yml index c14ce1e4c3efa..3b3ddd23483e7 100644 --- a/.github/workflows/ci-riscv64.yml +++ b/.github/workflows/ci-riscv64.yml @@ -33,16 +33,16 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 3000 # shadow clone? + ref: ${{ github.sha }} # including latest sha - name: Extract PR info run: | echo "BASE_SHA=${{ github.event.pull_request.base.sha }}" >> $GITHUB_ENV echo "HEAD_SHA=${{ github.event.pull_request.head.sha }}" >> GITHUB_ENV - echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> GITHUB_ENV - + - name: Diff base and head run: | - if [ "${{ github.event_name }}" = "pull_request" ]; then + if [[ "${{ github.event_name }}" = "pull_request" || "${{ github.event_name }}" == "pull_request_target" ]]; then echo "Push PR build" BASE_REF="${{ github.base_ref }}" HEAD_REF="${{ github.head_ref }}" @@ -56,19 +56,19 @@ jobs: exit 1 fi - // need to get contents of the PR + # need to get contents of the PR git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-head - git fetch origin pull/${{ github.event.pull_request.number }}/base:pr-base - - BASE=$(git merge-base pr-base pr-head) + git fetch origin main + BASE=$(git merge-base pr-head origin/main) HEAD=pr-head else echo "Push to riscv" # 统一用 riscv 作为 baseline - git fetch origin riscv + git fetch origin main + #git fetch origin riscv - BASE=$(git merge-base HEAD origin/main) - HEAD=$(git rev-parse HEAD) + BASE=$(git merge-base ${{ github.sha }} origin/main) # The latest commit + HEAD=${{ github.sha }} fi @@ -97,6 +97,5 @@ jobs: run: | export BASE_COMMIT=${BASE_COMMIT} export PATCH_FILE=$(pwd)/patch.diff - export GITHUB_PR=${PR_NUMBER:-0} - #bash /home/jenkins/scripts/jenkins-run.sh + bash /home/jenkins/scripts/jenkins-run.sh From 9b03457b179ff1430f47c642b9853d99669ebb89 Mon Sep 17 00:00:00 2001 From: Bo YU Date: Sat, 25 Apr 2026 14:21:06 +0000 Subject: [PATCH 05/12] move the patch to dest --- .github/workflows/ci-riscv64.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci-riscv64.yml b/.github/workflows/ci-riscv64.yml index 3b3ddd23483e7..dec68a6d12c16 100644 --- a/.github/workflows/ci-riscv64.yml +++ b/.github/workflows/ci-riscv64.yml @@ -39,7 +39,7 @@ jobs: run: | echo "BASE_SHA=${{ github.event.pull_request.base.sha }}" >> $GITHUB_ENV echo "HEAD_SHA=${{ github.event.pull_request.head.sha }}" >> GITHUB_ENV - + - name: Diff base and head run: | if [[ "${{ github.event_name }}" = "pull_request" || "${{ github.event_name }}" == "pull_request_target" ]]; then @@ -82,20 +82,20 @@ jobs: run: | echo "Generating patch..." - git diff $BASE_COMMIT $HEAD_COMMIT > patch.diff + SHORT_HEAD=${HEAD_COMMIT:0:7} + PATCH_NAME="patch_${SHORT_HEAD}.patch" + + git diff $BASE_COMMIT $HEAD_COMMIT > $PATCH_NAME echo "Patch size:" - wc -l patch.diff - cat patch.diff + wc -l $PATCH_NAME - # 可选:避免空 patch - if [ ! -s patch.diff ]; then - echo "Warning: empty patch" - fi + cp $PATCH_NAME /home/jenkins/patch/ + cat /home/jenkins/patch/$PATCH_NAME - name: Trigger Jenkins Job run: | export BASE_COMMIT=${BASE_COMMIT} export PATCH_FILE=$(pwd)/patch.diff - bash /home/jenkins/scripts/jenkins-run.sh + #bash /home/jenkins/scripts/jenkins-run.sh From 3f8607eae5bfa51bb897dc848c178447b022cdd4 Mon Sep 17 00:00:00 2001 From: XYenChi Date: Mon, 27 Apr 2026 10:59:07 +0800 Subject: [PATCH 06/12] Fix block list format and remove test_cpu_select_algorithm (#4) * mklnn is unavailable on RISC-V * Remove test_cpu_select_algorithm from block_list * Fix block list format --- test/inductor/test_cpu_select_algorithm.py | 2 + test/run_test.py | 52 +++++++++++----------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/test/inductor/test_cpu_select_algorithm.py b/test/inductor/test_cpu_select_algorithm.py index 6c0fdb84da02e..b554b62eb94bb 100644 --- a/test/inductor/test_cpu_select_algorithm.py +++ b/test/inductor/test_cpu_select_algorithm.py @@ -1559,6 +1559,7 @@ def forward(self, x): vec_amx = VecAMX() self._check_amx_counter(vec_amx) + @unittest.skipIf(not torch._C._has_mkldnn, "MKLDNN is not enabled") @inductor_config.patch({"freezing": True}) @patches @torch.no_grad @@ -1672,6 +1673,7 @@ def forward(self, x, scale): vec_amx = VecAMX() self._check_amx_counter(vec_amx) + @unittest.skipIf(not torch._C._has_mkldnn, "MKLDNN is not enabled") @inductor_config.patch({"freezing": True, "cpp.enable_concat_linear": True}) @patches @torch.no_grad diff --git a/test/run_test.py b/test/run_test.py index 0246e5e761175..cfb0e5a3fe74a 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -284,40 +284,40 @@ def __contains__(self, item): RISCV64_BLOCKLIST = [ # disable distributed related test - "inductor/test_distributed_patterns" - "fx/test_dce_pass" - "export/test_cpp_serdes" - "export/test_export" - "export/test_export_strict" - "export/test_export_training_ir_to_run_decomp" - "export/test_retraceability" - "export/test_serdes" - "export/test_strict_export_v2" - "test_public_bindings" + "inductor/test_distributed_patterns", + "fx/test_dce_pass", + "export/test_cpp_serdes", + "export/test_export", + "export/test_export_strict", + "export/test_export_training_ir_to_run_decomp", + "export/test_retraceability", + "export/test_serdes", + "export/test_strict_export_v2", + "test_public_bindings", # quantized engine NoQEngine is not supported - "test_torch" - "ao/sparsity/test_composability" + "test_torch", + "ao/sparsity/test_composability", # QNNPACK is not supported - "export/test_converter" + "export/test_converter", # record_contex_cpp is not support on non-linux non-x86_64 platforms - "torch_np/numpy_tests/core/test_numeric" + "torch_np/numpy_tests/core/test_numeric", # Failed to import torch.distributed.run: cannot import name 'Store' from 'torch.distributed' - "test_testing" - # TODO:L1 cache size = 0, need to fix - "inductor/test_cpu_select_algorithm" - "inductor/test_aot_inductor_arrayref" - "inductor/test_cpu_repro" + "test_testing", + "inductor/test_aot_inductor_arrayref", + "inductor/test_cpu_repro", + # TODO: mkldnn not available, shape guard failures on RISC-V + "inductor/test_cpu_select_algorithm", # TODO:scalar value not equal, need to fix - "profiler/test_profiler" + "profiler/test_profiler", # TODO precision - "test_binary_ufuncs" - "test_decomp" + "test_binary_ufuncs", + "test_decomp", # TODO no CUDA related module - "quantization/core/test_workflow_module" # TestFakeQuantize.test_fq_module_per_channel - "quantization/core/test_workflow_ops" - "quantization/core/test_quantized_op" + "quantization/core/test_workflow_module", # TestFakeQuantize.test_fq_module_per_channel + "quantization/core/test_workflow_ops", + "quantization/core/test_quantized_op", # z3-solver build fail - "test_proxy_tensor" + "test_proxy_tensor", ] From 714e262658ee624ed1984453b7e395b674f74b31 Mon Sep 17 00:00:00 2001 From: Bo YU Date: Sat, 25 Apr 2026 14:21:06 +0000 Subject: [PATCH 07/12] move the patch to dest --- .github/workflows/ci-riscv64.yml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci-riscv64.yml b/.github/workflows/ci-riscv64.yml index dec68a6d12c16..d8cd1ecdd8de3 100644 --- a/.github/workflows/ci-riscv64.yml +++ b/.github/workflows/ci-riscv64.yml @@ -57,14 +57,14 @@ jobs: fi # need to get contents of the PR - git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-head - git fetch origin main + git fetch --quiet origin pull/${{ github.event.pull_request.number }}/head:pr-head + git fetch --quiet origin main BASE=$(git merge-base pr-head origin/main) HEAD=pr-head else echo "Push to riscv" # 统一用 riscv 作为 baseline - git fetch origin main + git fetch --quiet origin main #git fetch origin riscv BASE=$(git merge-base ${{ github.sha }} origin/main) # The latest commit @@ -93,9 +93,8 @@ jobs: cp $PATCH_NAME /home/jenkins/patch/ cat /home/jenkins/patch/$PATCH_NAME + echo "PATCH_FILE=$PATCH_NAME" >> $GITHUB_ENV + - name: Trigger Jenkins Job run: | - export BASE_COMMIT=${BASE_COMMIT} - export PATCH_FILE=$(pwd)/patch.diff - - #bash /home/jenkins/scripts/jenkins-run.sh + bash /home/jenkins/scripts/jenkins-run.sh $BASE_COMMIT $PATCH_FILE From 41c36ec91388f2fdb81942e78083aea8e3edfc89 Mon Sep 17 00:00:00 2001 From: Yixuan Chen Date: Sun, 26 Apr 2026 00:49:38 +0800 Subject: [PATCH 08/12] Fix bytes_to_scalar for float/complex on RISC-V bytes_to_scalar previously round-tripped raw bytes through Python float/complex values (via ctypes) before constructing the tensor. This loses NaN bit patterns on architectures (such as RISC-V) that canonicalize NaNs in floating-point loads/conversions, causing test_bytes_to_scalar_cpu_{float32,float64,complex64,complex128} to fail with mismatched storage bytes. Construct the scalar tensor by writing the raw bytes directly into its untyped storage so all input bit patterns (including NaN payloads) are preserved exactly. --- torch/testing/_internal/common_utils.py | 37 +++++++++++-------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 45d7b0b253a30..9c78401637347 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -5457,27 +5457,22 @@ def check_bytes(byte_list): if not (0 <= byte <= 255): raise AssertionError(f"byte value out of range: expected 0 <= byte <= 255, got {byte}") - if dtype.is_complex: - if len(byte_list) != (num_bytes * 2): - raise AssertionError( - f"expected len(byte_list) == {num_bytes * 2} for complex dtype, got {len(byte_list)}" - ) - check_bytes(byte_list) - real = ctype.from_buffer((ctypes.c_byte * num_bytes)( - *byte_list[:num_bytes])).value - imag = ctype.from_buffer((ctypes.c_byte * num_bytes)( - *byte_list[num_bytes:])).value - res = real + 1j * imag - else: - if len(byte_list) != num_bytes: - raise AssertionError( - f"expected len(byte_list) == {num_bytes}, got {len(byte_list)}" - ) - check_bytes(byte_list) - res = ctype.from_buffer((ctypes.c_byte * num_bytes)( - *byte_list)).value - - return torch.tensor(res, device=device, dtype=dtype) + expected_len = num_bytes * 2 if dtype.is_complex else num_bytes + if len(byte_list) != expected_len: + raise AssertionError( + f"expected len(byte_list) == {expected_len}" + f"{' for complex dtype' if dtype.is_complex else ''}, got {len(byte_list)}" + ) + check_bytes(byte_list) + + # Write bytes directly into storage to preserve exact bit patterns + # (e.g. NaN payloads, which are not preserved when round-tripping through + # Python float/complex, especially on architectures like RISC-V that + # canonicalize NaNs). + res = torch.empty((), dtype=dtype, device=device) + src = torch.tensor(byte_list, dtype=torch.uint8, device=device) + res.untyped_storage().copy_(src.untyped_storage()) + return res def copy_func(f): From a0c8bd82b927571cc8cc1ae08a74cd5f8d0f31d3 Mon Sep 17 00:00:00 2001 From: vimer Date: Sun, 3 May 2026 09:05:59 +0800 Subject: [PATCH 09/12] [blacklist]: update it (#11) These cases are too slow on riscv64, adding them to here simply Drop test_torch from the list because it is one core case --- test/run_test.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/run_test.py b/test/run_test.py index cfb0e5a3fe74a..fbd293f973e8a 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -294,8 +294,6 @@ def __contains__(self, item): "export/test_serdes", "export/test_strict_export_v2", "test_public_bindings", - # quantized engine NoQEngine is not supported - "test_torch", "ao/sparsity/test_composability", # QNNPACK is not supported "export/test_converter", @@ -318,6 +316,15 @@ def __contains__(self, item): "quantization/core/test_quantized_op", # z3-solver build fail "test_proxy_tensor", + # too slow on riscv64 + # 53013.55 s + "functorch/test_aotdispatch", + # 25069 s + "functorch/test_ops", + # 17528 s + "test_transformers", + # 10897 s + "functorch/test_vmap", ] From bbbab668cf8b38dc832ffb9e8bc02b05a5801879 Mon Sep 17 00:00:00 2001 From: vimer Date: Thu, 7 May 2026 12:32:52 +0800 Subject: [PATCH 10/12] Use commit sha on PR workflow (#12) --- .github/workflows/ci-riscv64.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-riscv64.yml b/.github/workflows/ci-riscv64.yml index d8cd1ecdd8de3..44cd575a01e01 100644 --- a/.github/workflows/ci-riscv64.yml +++ b/.github/workflows/ci-riscv64.yml @@ -60,7 +60,7 @@ jobs: git fetch --quiet origin pull/${{ github.event.pull_request.number }}/head:pr-head git fetch --quiet origin main BASE=$(git merge-base pr-head origin/main) - HEAD=pr-head + HEAD=$(git rev-parse pr-head) else echo "Push to riscv" # 统一用 riscv 作为 baseline From 9cac52bf8baa487739b8571bf448f5f43070bff4 Mon Sep 17 00:00:00 2001 From: Yixuan Chen Date: Sun, 26 Apr 2026 00:49:38 +0800 Subject: [PATCH 11/12] Fix bytes_to_scalar for float/complex on RISC-V bytes_to_scalar previously round-tripped raw bytes through Python float/complex values (via ctypes) before constructing the tensor. This loses NaN bit patterns on architectures (such as RISC-V) that canonicalize NaNs in floating-point loads/conversions, causing test_bytes_to_scalar_cpu_{float32,float64,complex64,complex128} to fail with mismatched storage bytes. Construct the scalar tensor by writing the raw bytes directly into its untyped storage so all input bit patterns (including NaN payloads) are preserved exactly. --- torch/testing/_internal/common_utils.py | 37 +++++++++++-------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 45d7b0b253a30..9c78401637347 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -5457,27 +5457,22 @@ def check_bytes(byte_list): if not (0 <= byte <= 255): raise AssertionError(f"byte value out of range: expected 0 <= byte <= 255, got {byte}") - if dtype.is_complex: - if len(byte_list) != (num_bytes * 2): - raise AssertionError( - f"expected len(byte_list) == {num_bytes * 2} for complex dtype, got {len(byte_list)}" - ) - check_bytes(byte_list) - real = ctype.from_buffer((ctypes.c_byte * num_bytes)( - *byte_list[:num_bytes])).value - imag = ctype.from_buffer((ctypes.c_byte * num_bytes)( - *byte_list[num_bytes:])).value - res = real + 1j * imag - else: - if len(byte_list) != num_bytes: - raise AssertionError( - f"expected len(byte_list) == {num_bytes}, got {len(byte_list)}" - ) - check_bytes(byte_list) - res = ctype.from_buffer((ctypes.c_byte * num_bytes)( - *byte_list)).value - - return torch.tensor(res, device=device, dtype=dtype) + expected_len = num_bytes * 2 if dtype.is_complex else num_bytes + if len(byte_list) != expected_len: + raise AssertionError( + f"expected len(byte_list) == {expected_len}" + f"{' for complex dtype' if dtype.is_complex else ''}, got {len(byte_list)}" + ) + check_bytes(byte_list) + + # Write bytes directly into storage to preserve exact bit patterns + # (e.g. NaN payloads, which are not preserved when round-tripping through + # Python float/complex, especially on architectures like RISC-V that + # canonicalize NaNs). + res = torch.empty((), dtype=dtype, device=device) + src = torch.tensor(byte_list, dtype=torch.uint8, device=device) + res.untyped_storage().copy_(src.untyped_storage()) + return res def copy_func(f): From 87ac9b22d6ac4e136d31bdde5ab0e0d26e0f57e6 Mon Sep 17 00:00:00 2001 From: Yixuan Chen Date: Thu, 23 Apr 2026 21:07:25 +0800 Subject: [PATCH 12/12] Add NoQEngine fallback for quantized ops on RISC-V Implement PackedLinearWeightNoQEngine and PackedConvWeightNoQEngine classes that dequantize inputs, run float computation, and requantize outputs. This provides a working fallback when no hardware-specific quantized engine (FBGEMM, QNNPACK, ONEDNN) is available. Co-Authored-By: Claude Opus 4.7 --- .../native/quantized/cpu/conv_serialization.h | 12 + .../native/quantized/cpu/fbgemm_utils.cpp | 277 ++++++++++++++++++ .../ATen/native/quantized/cpu/fbgemm_utils.h | 140 +++++++++ aten/src/ATen/native/quantized/cpu/qconv.cpp | 12 + .../native/quantized/cpu/qconv_prepack.cpp | 12 + .../src/ATen/native/quantized/cpu/qlinear.cpp | 19 ++ .../native/quantized/cpu/qlinear_prepack.cpp | 8 + .../ATen/native/quantized/qconv_unpack.cpp | 10 + 8 files changed, 490 insertions(+) diff --git a/aten/src/ATen/native/quantized/cpu/conv_serialization.h b/aten/src/ATen/native/quantized/cpu/conv_serialization.h index 3edd398fa789a..3e45a8062b0d8 100644 --- a/aten/src/ATen/native/quantized/cpu/conv_serialization.h +++ b/aten/src/ATen/native/quantized/cpu/conv_serialization.h @@ -410,6 +410,18 @@ c10::intrusive_ptr> deserialize_conv( ); } #endif // AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::NoQEngine) { + return PackedConvWeightNoQEngine::prepack( + std::move(weight.value()), + std::move(bias), + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } TORCH_CHECK( false, "Didn't find engine for when deserializing ConvPackedParams: ", diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp index 1e4d2b9960d02..a45824b3485af 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp @@ -25,6 +25,16 @@ #include #else #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #endif @@ -365,6 +375,270 @@ Tensor ConvertConvWeightsToChannelLastTensor<3>( #endif // USE_FBGEMM +// NoQEngine packed weight implementations: dequantize, float compute, quantize. +// Used as a fallback when no hardware-specific quantized engine is available. + +c10::intrusive_ptr PackedLinearWeightNoQEngine::prepack( + at::Tensor weight, + std::optional bias) { + return c10::make_intrusive( + std::move(weight), std::move(bias)); +} + +at::Tensor PackedLinearWeightNoQEngine::apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) { + at::Tensor input_fp = at::dequantize(input); + at::Tensor weight_fp = at::dequantize(weight_); + at::Tensor output_fp = at::linear(input_fp, weight_fp, bias_); + return at::quantize_per_tensor( + output_fp, output_scale, output_zero_point, c10::kQInt8); +} + +at::Tensor PackedLinearWeightNoQEngine::apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) { + at::Tensor input_fp = at::dequantize(input); + at::Tensor weight_fp = at::dequantize(weight_); + at::Tensor output_fp = at::linear(input_fp, weight_fp, bias_); + at::Tensor relu_out = at::relu(output_fp); + return at::quantize_per_tensor( + relu_out, output_scale, output_zero_point, c10::kQInt8); +} + +at::Tensor& PackedLinearWeightNoQEngine::apply_out( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point, + at::Tensor& output) { + at::Tensor input_fp = at::dequantize(input); + at::Tensor weight_fp = at::dequantize(weight_); + at::Tensor output_fp = at::linear(input_fp, weight_fp, bias_); + at::Tensor q_out = at::quantize_per_tensor( + output_fp, output_scale, output_zero_point, c10::kQInt8); + output.copy_(q_out); + return output; +} + +at::Tensor& PackedLinearWeightNoQEngine::apply_relu_out( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point, + at::Tensor& output) { + at::Tensor input_fp = at::dequantize(input); + at::Tensor weight_fp = at::dequantize(weight_); + at::Tensor output_fp = at::linear(input_fp, weight_fp, bias_); + at::Tensor relu_out = at::relu(output_fp); + at::Tensor q_out = at::quantize_per_tensor( + relu_out, output_scale, output_zero_point, c10::kQInt8); + output.copy_(q_out); + return output; +} + +at::Tensor PackedLinearWeightNoQEngine::apply_with_input_q_dq_qweight_dq_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) { + at::Tensor input_fp = at::dequantize(input); + at::Tensor weight_fp = at::dequantize(weight_); + return at::linear(input_fp, weight_fp, bias_); +} + +at::Tensor PackedLinearWeightNoQEngine::apply_with_input_q_dq_qweight_dq_relu_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) { + at::Tensor input_fp = at::dequantize(input); + at::Tensor weight_fp = at::dequantize(weight_); + at::Tensor output_fp = at::linear(input_fp, weight_fp, bias_); + return at::relu(output_fp); +} + +at::Tensor PackedLinearWeightNoQEngine::apply_dynamic( + at::Tensor input, + bool reduce_range) { + at::Tensor weight_fp = at::dequantize(weight_); + return at::linear(input, weight_fp, bias_); +} + +at::Tensor PackedLinearWeightNoQEngine::apply_dynamic_relu( + at::Tensor input, + bool reduce_range) { + at::Tensor weight_fp = at::dequantize(weight_); + at::Tensor output_fp = at::linear(input, weight_fp, bias_); + return at::relu(output_fp); +} + +std::tuple> +PackedLinearWeightNoQEngine::unpack() { + return std::make_tuple(weight_, bias_); +} + +template +c10::intrusive_ptr> +PackedConvWeightNoQEngine::prepack( + at::Tensor weight, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose) { + return c10::make_intrusive>( + std::move(weight), + std::move(bias), + std::move(stride), + std::move(padding), + std::move(output_padding), + std::move(dilation), + groups, + transpose); +} + +template +at::Tensor PackedConvWeightNoQEngine::apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) { + at::Tensor input_fp = at::dequantize(input); + at::Tensor weight_fp = at::dequantize(weight_); + auto stride = stride_.vec(); + auto padding = padding_.vec(); + auto dilation = dilation_.vec(); + at::Tensor output_fp; + if (transpose_) { + auto output_padding = output_padding_.vec(); + if constexpr (kSpatialDim == 1) { + output_fp = at::conv_transpose1d( + input_fp, weight_fp, bias_, + stride, padding, output_padding, groups_, dilation); + } else if constexpr (kSpatialDim == 2) { + output_fp = at::conv_transpose2d( + input_fp, weight_fp, bias_, + stride, padding, output_padding, groups_, dilation); + } else if constexpr (kSpatialDim == 3) { + output_fp = at::conv_transpose3d( + input_fp, weight_fp, bias_, + stride, padding, output_padding, groups_, dilation); + } + } else { + if constexpr (kSpatialDim == 1) { + output_fp = at::conv1d( + input_fp, weight_fp, bias_, + stride, padding, dilation, groups_); + } else if constexpr (kSpatialDim == 2) { + output_fp = at::conv2d( + input_fp, weight_fp, bias_, + stride, padding, dilation, groups_); + } else if constexpr (kSpatialDim == 3) { + output_fp = at::conv3d( + input_fp, weight_fp, bias_, + stride, padding, dilation, groups_); + } + } + return at::quantize_per_tensor( + output_fp, output_scale, output_zero_point, c10::kQInt8); +} + +template +at::Tensor PackedConvWeightNoQEngine::apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) { + at::Tensor input_fp = at::dequantize(input); + at::Tensor weight_fp = at::dequantize(weight_); + auto stride = stride_.vec(); + auto padding = padding_.vec(); + auto dilation = dilation_.vec(); + at::Tensor output_fp; + if (transpose_) { + auto output_padding = output_padding_.vec(); + if constexpr (kSpatialDim == 1) { + output_fp = at::conv_transpose1d( + input_fp, weight_fp, bias_, + stride, padding, output_padding, groups_, dilation); + } else if constexpr (kSpatialDim == 2) { + output_fp = at::conv_transpose2d( + input_fp, weight_fp, bias_, + stride, padding, output_padding, groups_, dilation); + } else if constexpr (kSpatialDim == 3) { + output_fp = at::conv_transpose3d( + input_fp, weight_fp, bias_, + stride, padding, output_padding, groups_, dilation); + } + } else { + if constexpr (kSpatialDim == 1) { + output_fp = at::conv1d( + input_fp, weight_fp, bias_, + stride, padding, dilation, groups_); + } else if constexpr (kSpatialDim == 2) { + output_fp = at::conv2d( + input_fp, weight_fp, bias_, + stride, padding, dilation, groups_); + } else if constexpr (kSpatialDim == 3) { + output_fp = at::conv3d( + input_fp, weight_fp, bias_, + stride, padding, dilation, groups_); + } + } + at::Tensor relu_out = at::relu(output_fp); + return at::quantize_per_tensor( + relu_out, output_scale, output_zero_point, c10::kQInt8); +} + +template +at::Tensor PackedConvWeightNoQEngine::apply_dynamic( + const at::Tensor& input, + bool reduce_range) { + at::Tensor weight_fp = at::dequantize(weight_); + auto stride = stride_.vec(); + auto padding = padding_.vec(); + auto dilation = dilation_.vec(); + if (transpose_) { + auto output_padding = output_padding_.vec(); + if constexpr (kSpatialDim == 1) { + return at::conv_transpose1d( + input, weight_fp, bias_, + stride, padding, output_padding, groups_, dilation); + } else if constexpr (kSpatialDim == 2) { + return at::conv_transpose2d( + input, weight_fp, bias_, + stride, padding, output_padding, groups_, dilation); + } else { + return at::conv_transpose3d( + input, weight_fp, bias_, + stride, padding, output_padding, groups_, dilation); + } + } else { + if constexpr (kSpatialDim == 1) { + return at::conv1d( + input, weight_fp, bias_, + stride, padding, dilation, groups_); + } else if constexpr (kSpatialDim == 2) { + return at::conv2d( + input, weight_fp, bias_, + stride, padding, dilation, groups_); + } else { + return at::conv3d( + input, weight_fp, bias_, + stride, padding, dilation, groups_); + } + } +} + +template +std::tuple> +PackedConvWeightNoQEngine::unpack() { + return std::make_tuple(weight_, bias_); +} + +template struct PackedConvWeightNoQEngine<1>; +template struct PackedConvWeightNoQEngine<2>; +template struct PackedConvWeightNoQEngine<3>; + namespace { // This is really terrible, but couldn't figure out a better way to constexpr convert int to // string and then perform string concatenation on/with it @@ -469,6 +743,9 @@ int register_linear_params() { return std::apply(PackedLinearWeightsOnednn::prepack, std::move(state)); } #endif // #if AT_MKLDNN_ENABLED() + if (at::globalContext().qEngine() == at::QEngine::NoQEngine) { + return std::apply(PackedLinearWeightNoQEngine::prepack, std::move(state)); + } TORCH_CHECK(false, "Unknown qengine"); }) .def("bias", [](const c10::intrusive_ptr& self) { diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h index a1139be833f87..5d5acd8b4950c 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h @@ -352,6 +352,146 @@ Tensor ConvertConvWeightsToChannelLastTensor( #endif // USE_FBGEMM +struct TORCH_API PackedLinearWeightNoQEngine : public LinearPackedParamsBase { + PackedLinearWeightNoQEngine( + at::Tensor weight, + std::optional bias) + : weight_(std::move(weight)), + bias_(std::move(bias)) {} + + at::Tensor weight_; + std::optional bias_; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor& apply_out( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point, + at::Tensor& output) override; + + at::Tensor& apply_relu_out( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point, + at::Tensor& output) override; + + at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) override; + + at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false) + override; + + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) + override; + + std::tuple> unpack() override; + + std::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + std::optional bias); +}; + +template +struct TORCH_API PackedConvWeightNoQEngine + : public ConvPackedParamsBase { + PackedConvWeightNoQEngine( + at::Tensor weight, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose) + : weight_(std::move(weight)), + bias_(std::move(bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose) {} + + at::Tensor weight_; + std::optional bias_; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + bool transpose_; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) override; + + std::tuple> unpack() override; + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return transpose_; + } + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); +}; + struct TORCH_API PackedEmbeddingBagWeight : public EmbeddingPackedParamsBase { PackedEmbeddingBagWeight( at::Tensor packed_w, diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index 94ac6350aeb0e..9853d49da6866 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -2147,6 +2148,17 @@ class QConvAddInt8 final { } } #endif + if (ctx.qEngine() == at::QEngine::NoQEngine) { + at::Tensor act_fp = at::dequantize(act); + at::Tensor accum_fp = at::dequantize(accum); + at::Tensor output_fp = packed_weight->apply_dynamic(act_fp, false); + output_fp = output_fp + accum_fp; + if (kReluFused) { + output_fp = at::relu(output_fp); + } + return at::native::quantize_per_tensor( + output_fp, output_scale, output_zero_point, c10::kQInt8); + } TORCH_CHECK( false, "Didn't find engine for operation quantized::conv2d_add.", diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp index b7b2c5ca8d30e..66b3ec8db7700 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp @@ -718,6 +718,12 @@ class QConvPackWeightInt8 final { } #endif + if (ctx.qEngine() == at::QEngine::NoQEngine) { + return PackedConvWeightNoQEngine::prepack( + std::move(weight), std::move(bias), stride, padding, + output_padding, dilation, groups, transpose); + } + TORCH_CHECK( false, "Didn't find engine for operation quantized::conv2d_prepack ", @@ -814,6 +820,12 @@ class QConv1dPackWeightInt8 final { } #endif + if (ctx.qEngine() == at::QEngine::NoQEngine) { + return PackedConvWeightNoQEngine<2>::prepack( + std::move(weight), std::move(bias), stride, padding, + output_padding, dilation, groups, transpose); + } + TORCH_CHECK( false, "Didn't find engine for operation quantized::conv1d_prepack ", diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index 1f726b3ee1c3e..47b015452497f 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -25,6 +25,7 @@ #include // for empty_affine_qu... #include // for empty #include // for quantize_per_ch... +#include #include // for quantize_per_te... #include #include @@ -1526,6 +1527,15 @@ class QLinearLeakyReluInt8 final { std::move(input), output_scale, output_zero_point, negative_slope); } #endif + if (ctx.qEngine() == at::QEngine::NoQEngine) { + auto [weight, bias] = packed_weight->unpack(); + at::Tensor input_fp = at::dequantize(input); + at::Tensor weight_fp = at::dequantize(weight); + at::Tensor output_fp = at::linear(input_fp, weight_fp, bias); + at::Tensor lr_out = at::leaky_relu(output_fp, negative_slope); + return at::native::quantize_per_tensor( + lr_out, output_scale, output_zero_point, c10::kQInt8); + } TORCH_CHECK( false, "Didn't find engine for operation quantized::linear_leaky_relu ", @@ -1550,6 +1560,15 @@ class QLinearTanhInt8 final { std::move(input), output_scale, output_zero_point); } #endif + if (ctx.qEngine() == at::QEngine::NoQEngine) { + auto [weight, bias] = packed_weight->unpack(); + at::Tensor input_fp = at::dequantize(input); + at::Tensor weight_fp = at::dequantize(weight); + at::Tensor output_fp = at::linear(input_fp, weight_fp, bias); + at::Tensor tanh_out = at::tanh(output_fp); + return at::native::quantize_per_tensor( + tanh_out, output_scale, output_zero_point, c10::kQInt8); + } TORCH_CHECK( false, "Didn't find engine for operation quantized::linear_tanh ", diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp index b4ae4e677bcd2..95ab2fa06c9b5 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp @@ -606,6 +606,10 @@ class QLinearPackWeightInt8 final { return PackedLinearWeightsOnednn::prepack(std::move(weight), std::move(bias)); } #endif // #if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::NoQEngine) { + return PackedLinearWeightNoQEngine::prepack( + std::move(weight), std::move(bias)); + } TORCH_CHECK( false, "Didn't find engine for operation quantized::linear_prepack ", @@ -645,6 +649,10 @@ class QLinearPackWeightFp16 final { "not supported by ONEDNN"); } #endif // #if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::NoQEngine) { + return PackedLinearWeightNoQEngine::prepack( + std::move(weight), std::move(bias)); + } TORCH_CHECK( false, "Didn't find engine for operation quantized::linear_prepack_fp16 ", diff --git a/aten/src/ATen/native/quantized/qconv_unpack.cpp b/aten/src/ATen/native/quantized/qconv_unpack.cpp index 4c2352a396177..53d351698fbdc 100644 --- a/aten/src/ATen/native/quantized/qconv_unpack.cpp +++ b/aten/src/ATen/native/quantized/qconv_unpack.cpp @@ -70,6 +70,10 @@ class QConvUnpackWeightsInt8 final { } #endif + if (ctx.qEngine() == at::QEngine::NoQEngine) { + return packed_weight->unpack(); + } + TORCH_CHECK( false, "Didn't find engine for operation quantized::conv2d_unpack ", @@ -111,6 +115,12 @@ class QConv1dUnpackWeightsInt8 final { } #endif + if (ctx.qEngine() == at::QEngine::NoQEngine) { + std::tie(weight, bias) = packed_weight->unpack(); + weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); + return std::tuple>(weight, bias); + } + TORCH_CHECK( false, "Didn't find engine for operation quantized::conv1d_unpack ",