diff --git a/.github/workflows/ci-riscv64.yml b/.github/workflows/ci-riscv64.yml
new file mode 100644
index 0000000000000..44cd575a01e01
--- /dev/null
+++ b/.github/workflows/ci-riscv64.yml
@@ -0,0 +1,100 @@
+# Note: this runner is provided externally, so we minimize its access to
+# secrets.
+on:
+  push:
+    branches: [riscv]
+
+  pull_request_target:
+    types: [opened, synchronize, reopened]
+
+name: CI (riscv64)
+
+permissions:
+  contents: read
+  # No permissions to secrets.
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+# FIXME: Drop this
+env:
+  RUSTFLAGS: -D warnings
+  CARGO_TERM_COLOR: always
+
+jobs:
+  build:
+    name: Build and test
+    runs-on: [self-hosted, linux, amd64]
+    # This is in its own separate environment.
+    environment: riscv64
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 3000   # shadow clone?
+          ref: ${{ github.sha }} # including latest sha
+
+      - name: Extract PR info
+        run: |
+          echo "BASE_SHA=${{ github.event.pull_request.base.sha }}" >> $GITHUB_ENV
+          echo "HEAD_SHA=${{ github.event.pull_request.head.sha }}" >> GITHUB_ENV
+
+      - name: Diff base and head
+        run: |
+          if [[ "${{ github.event_name }}" = "pull_request" || "${{ github.event_name }}" == "pull_request_target" ]]; then
+            echo "Push PR build"
+            BASE_REF="${{ github.base_ref }}"
+            HEAD_REF="${{ github.head_ref }}"
+
+            echo "Base ref: $BASE_REF"
+            echo "Head ref: $HEAD_REF"
+
+            # 强约束: PR 必须基于 riscv
+            if [ "$BASE_REF" != "riscv" ]; then
+              echo "ERROR: PR must target 'riscv' branch, got '$BASE_REF'"
+              exit 1
+            fi
+
+            # need to get contents of the PR
+            git fetch --quiet origin pull/${{ github.event.pull_request.number }}/head:pr-head
+            git fetch --quiet origin main
+            BASE=$(git merge-base pr-head origin/main)
+            HEAD=$(git rev-parse pr-head)
+          else
+            echo "Push to riscv"
+            # 统一用 riscv 作为 baseline
+            git fetch --quiet origin main
+            #git fetch origin riscv
+
+            BASE=$(git merge-base ${{ github.sha }} origin/main) # The latest commit
+            HEAD=${{ github.sha }}
+
+          fi
+
+          echo "BASE_COMMIT=$BASE" >> $GITHUB_ENV
+          echo "HEAD_COMMIT=$HEAD" >> $GITHUB_ENV
+
+          echo "Base: $BASE"
+          echo "Head: $HEAD"
+
+      - name: Generate patch
+        run: |
+          echo "Generating patch..."
+
+          SHORT_HEAD=${HEAD_COMMIT:0:7}
+          PATCH_NAME="patch_${SHORT_HEAD}.patch"
+
+          git diff $BASE_COMMIT $HEAD_COMMIT > $PATCH_NAME
+
+          echo "Patch size:"
+          wc -l $PATCH_NAME
+
+          cp $PATCH_NAME /home/jenkins/patch/
+          cat /home/jenkins/patch/$PATCH_NAME
+
+          echo "PATCH_FILE=$PATCH_NAME" >> $GITHUB_ENV
+
+      - name: Trigger Jenkins Job
+        run: |
+          bash /home/jenkins/scripts/jenkins-run.sh $BASE_COMMIT  $PATCH_FILE
diff --git a/aten/src/ATen/native/quantized/cpu/conv_serialization.h b/aten/src/ATen/native/quantized/cpu/conv_serialization.h
index 3edd398fa789a..3e45a8062b0d8 100644
--- a/aten/src/ATen/native/quantized/cpu/conv_serialization.h
+++ b/aten/src/ATen/native/quantized/cpu/conv_serialization.h
@@ -410,6 +410,18 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> deserialize_conv(
     );
   }
 #endif // AT_MKLDNN_ENABLED()
+  if (ctx.qEngine() == at::QEngine::NoQEngine) {
+    return PackedConvWeightNoQEngine<kSpatialDim>::prepack(
+      std::move(weight.value()),
+      std::move(bias),
+      stride,
+      padding,
+      output_padding,
+      dilation,
+      groups,
+      transpose
+    );
+  }
 TORCH_CHECK(
   false,
   "Didn't find engine for when deserializing ConvPackedParams: ",
diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
index 1e4d2b9960d02..a45824b3485af 100644
--- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
+++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
@@ -25,6 +25,16 @@
 #include <ATen/Functions.h>
 #else
 #include <ATen/ops/cat.h>
+#include <ATen/ops/conv1d.h>
+#include <ATen/ops/conv2d.h>
+#include <ATen/ops/conv3d.h>
+#include <ATen/ops/conv_transpose1d.h>
+#include <ATen/ops/conv_transpose2d.h>
+#include <ATen/ops/conv_transpose3d.h>
+#include <ATen/ops/dequantize.h>
+#include <ATen/ops/linear.h>
+#include <ATen/ops/quantize_per_tensor.h>
+#include <ATen/ops/relu.h>
 
 #include <utility>
 #endif
@@ -365,6 +375,270 @@ Tensor ConvertConvWeightsToChannelLastTensor<3>(
 
 #endif // USE_FBGEMM
 
+// NoQEngine packed weight implementations: dequantize, float compute, quantize.
+// Used as a fallback when no hardware-specific quantized engine is available.
+
+c10::intrusive_ptr<LinearPackedParamsBase> PackedLinearWeightNoQEngine::prepack(
+    at::Tensor weight,
+    std::optional<at::Tensor> bias) {
+  return c10::make_intrusive<PackedLinearWeightNoQEngine>(
+      std::move(weight), std::move(bias));
+}
+
+at::Tensor PackedLinearWeightNoQEngine::apply(
+    at::Tensor input,
+    double output_scale,
+    int64_t output_zero_point) {
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  at::Tensor output_fp = at::linear(input_fp, weight_fp, bias_);
+  return at::quantize_per_tensor(
+      output_fp, output_scale, output_zero_point, c10::kQInt8);
+}
+
+at::Tensor PackedLinearWeightNoQEngine::apply_relu(
+    at::Tensor input,
+    double output_scale,
+    int64_t output_zero_point) {
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  at::Tensor output_fp = at::linear(input_fp, weight_fp, bias_);
+  at::Tensor relu_out = at::relu(output_fp);
+  return at::quantize_per_tensor(
+      relu_out, output_scale, output_zero_point, c10::kQInt8);
+}
+
+at::Tensor& PackedLinearWeightNoQEngine::apply_out(
+    const at::Tensor& input,
+    double output_scale,
+    int64_t output_zero_point,
+    at::Tensor& output) {
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  at::Tensor output_fp = at::linear(input_fp, weight_fp, bias_);
+  at::Tensor q_out = at::quantize_per_tensor(
+      output_fp, output_scale, output_zero_point, c10::kQInt8);
+  output.copy_(q_out);
+  return output;
+}
+
+at::Tensor& PackedLinearWeightNoQEngine::apply_relu_out(
+    const at::Tensor& input,
+    double output_scale,
+    int64_t output_zero_point,
+    at::Tensor& output) {
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  at::Tensor output_fp = at::linear(input_fp, weight_fp, bias_);
+  at::Tensor relu_out = at::relu(output_fp);
+  at::Tensor q_out = at::quantize_per_tensor(
+      relu_out, output_scale, output_zero_point, c10::kQInt8);
+  output.copy_(q_out);
+  return output;
+}
+
+at::Tensor PackedLinearWeightNoQEngine::apply_with_input_q_dq_qweight_dq_output_fp32(
+    at::Tensor input,
+    double input_scale,
+    int64_t input_zero_point) {
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  return at::linear(input_fp, weight_fp, bias_);
+}
+
+at::Tensor PackedLinearWeightNoQEngine::apply_with_input_q_dq_qweight_dq_relu_output_fp32(
+    at::Tensor input,
+    double input_scale,
+    int64_t input_zero_point) {
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  at::Tensor output_fp = at::linear(input_fp, weight_fp, bias_);
+  return at::relu(output_fp);
+}
+
+at::Tensor PackedLinearWeightNoQEngine::apply_dynamic(
+    at::Tensor input,
+    bool reduce_range) {
+  at::Tensor weight_fp = at::dequantize(weight_);
+  return at::linear(input, weight_fp, bias_);
+}
+
+at::Tensor PackedLinearWeightNoQEngine::apply_dynamic_relu(
+    at::Tensor input,
+    bool reduce_range) {
+  at::Tensor weight_fp = at::dequantize(weight_);
+  at::Tensor output_fp = at::linear(input, weight_fp, bias_);
+  return at::relu(output_fp);
+}
+
+std::tuple<at::Tensor, std::optional<at::Tensor>>
+PackedLinearWeightNoQEngine::unpack() {
+  return std::make_tuple(weight_, bias_);
+}
+
+template <int kSpatialDim>
+c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>>
+PackedConvWeightNoQEngine<kSpatialDim>::prepack(
+    at::Tensor weight,
+    std::optional<at::Tensor> bias,
+    torch::List<int64_t> stride,
+    torch::List<int64_t> padding,
+    torch::List<int64_t> output_padding,
+    torch::List<int64_t> dilation,
+    int64_t groups,
+    bool transpose) {
+  return c10::make_intrusive<PackedConvWeightNoQEngine<kSpatialDim>>(
+      std::move(weight),
+      std::move(bias),
+      std::move(stride),
+      std::move(padding),
+      std::move(output_padding),
+      std::move(dilation),
+      groups,
+      transpose);
+}
+
+template <int kSpatialDim>
+at::Tensor PackedConvWeightNoQEngine<kSpatialDim>::apply(
+    const at::Tensor& input,
+    double output_scale,
+    int64_t output_zero_point) {
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  auto stride = stride_.vec();
+  auto padding = padding_.vec();
+  auto dilation = dilation_.vec();
+  at::Tensor output_fp;
+  if (transpose_) {
+    auto output_padding = output_padding_.vec();
+    if constexpr (kSpatialDim == 1) {
+      output_fp = at::conv_transpose1d(
+          input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    } else if constexpr (kSpatialDim == 2) {
+      output_fp = at::conv_transpose2d(
+          input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    } else if constexpr (kSpatialDim == 3) {
+      output_fp = at::conv_transpose3d(
+          input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    }
+  } else {
+    if constexpr (kSpatialDim == 1) {
+      output_fp = at::conv1d(
+          input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    } else if constexpr (kSpatialDim == 2) {
+      output_fp = at::conv2d(
+          input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    } else if constexpr (kSpatialDim == 3) {
+      output_fp = at::conv3d(
+          input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    }
+  }
+  return at::quantize_per_tensor(
+      output_fp, output_scale, output_zero_point, c10::kQInt8);
+}
+
+template <int kSpatialDim>
+at::Tensor PackedConvWeightNoQEngine<kSpatialDim>::apply_relu(
+    const at::Tensor& input,
+    double output_scale,
+    int64_t output_zero_point) {
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  auto stride = stride_.vec();
+  auto padding = padding_.vec();
+  auto dilation = dilation_.vec();
+  at::Tensor output_fp;
+  if (transpose_) {
+    auto output_padding = output_padding_.vec();
+    if constexpr (kSpatialDim == 1) {
+      output_fp = at::conv_transpose1d(
+          input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    } else if constexpr (kSpatialDim == 2) {
+      output_fp = at::conv_transpose2d(
+          input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    } else if constexpr (kSpatialDim == 3) {
+      output_fp = at::conv_transpose3d(
+          input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    }
+  } else {
+    if constexpr (kSpatialDim == 1) {
+      output_fp = at::conv1d(
+          input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    } else if constexpr (kSpatialDim == 2) {
+      output_fp = at::conv2d(
+          input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    } else if constexpr (kSpatialDim == 3) {
+      output_fp = at::conv3d(
+          input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    }
+  }
+  at::Tensor relu_out = at::relu(output_fp);
+  return at::quantize_per_tensor(
+      relu_out, output_scale, output_zero_point, c10::kQInt8);
+}
+
+template <int kSpatialDim>
+at::Tensor PackedConvWeightNoQEngine<kSpatialDim>::apply_dynamic(
+    const at::Tensor& input,
+    bool reduce_range) {
+  at::Tensor weight_fp = at::dequantize(weight_);
+  auto stride = stride_.vec();
+  auto padding = padding_.vec();
+  auto dilation = dilation_.vec();
+  if (transpose_) {
+    auto output_padding = output_padding_.vec();
+    if constexpr (kSpatialDim == 1) {
+      return at::conv_transpose1d(
+          input, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    } else if constexpr (kSpatialDim == 2) {
+      return at::conv_transpose2d(
+          input, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    } else {
+      return at::conv_transpose3d(
+          input, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    }
+  } else {
+    if constexpr (kSpatialDim == 1) {
+      return at::conv1d(
+          input, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    } else if constexpr (kSpatialDim == 2) {
+      return at::conv2d(
+          input, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    } else {
+      return at::conv3d(
+          input, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    }
+  }
+}
+
+template <int kSpatialDim>
+std::tuple<at::Tensor, std::optional<at::Tensor>>
+PackedConvWeightNoQEngine<kSpatialDim>::unpack() {
+  return std::make_tuple(weight_, bias_);
+}
+
+template struct PackedConvWeightNoQEngine<1>;
+template struct PackedConvWeightNoQEngine<2>;
+template struct PackedConvWeightNoQEngine<3>;
+
 namespace {
   // This is really terrible, but couldn't figure out a better way to constexpr convert int to
   // string and then perform string concatenation on/with it
@@ -469,6 +743,9 @@ int register_linear_params() {
                   return std::apply(PackedLinearWeightsOnednn::prepack, std::move(state));
                 }
 #endif // #if AT_MKLDNN_ENABLED()
+                if (at::globalContext().qEngine() == at::QEngine::NoQEngine) {
+                  return std::apply(PackedLinearWeightNoQEngine::prepack, std::move(state));
+                }
                 TORCH_CHECK(false, "Unknown qengine");
               })
               .def("bias", [](const c10::intrusive_ptr<LinearPackedParamsBase>& self) {
diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
index a1139be833f87..5d5acd8b4950c 100644
--- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
+++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
@@ -352,6 +352,146 @@ Tensor ConvertConvWeightsToChannelLastTensor(
 
 #endif // USE_FBGEMM
 
+struct TORCH_API PackedLinearWeightNoQEngine : public LinearPackedParamsBase {
+  PackedLinearWeightNoQEngine(
+      at::Tensor weight,
+      std::optional<at::Tensor> bias)
+      : weight_(std::move(weight)),
+        bias_(std::move(bias)) {}
+
+  at::Tensor weight_;
+  std::optional<at::Tensor> bias_;
+
+  at::Tensor apply(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_relu(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor& apply_out(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point,
+      at::Tensor& output) override;
+
+  at::Tensor& apply_relu_out(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point,
+      at::Tensor& output) override;
+
+  at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32(
+      at::Tensor input,
+      double input_scale,
+      int64_t input_zero_point) override;
+
+  at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32(
+      at::Tensor input,
+      double input_scale,
+      int64_t input_zero_point) override;
+
+  at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false)
+      override;
+
+  at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false)
+      override;
+
+  std::tuple<at::Tensor, std::optional<at::Tensor>> unpack() override;
+
+  std::optional<at::Tensor> bias() override {
+    return bias_;
+  }
+
+  static c10::intrusive_ptr<LinearPackedParamsBase> prepack(
+      at::Tensor weight,
+      std::optional<at::Tensor> bias);
+};
+
+template <int kSpatialDim = 2>
+struct TORCH_API PackedConvWeightNoQEngine
+    : public ConvPackedParamsBase<kSpatialDim> {
+  PackedConvWeightNoQEngine(
+      at::Tensor weight,
+      std::optional<at::Tensor> bias,
+      torch::List<int64_t> stride,
+      torch::List<int64_t> padding,
+      torch::List<int64_t> output_padding,
+      torch::List<int64_t> dilation,
+      int64_t groups,
+      bool transpose)
+      : weight_(std::move(weight)),
+        bias_(std::move(bias)),
+        stride_(std::move(stride)),
+        padding_(std::move(padding)),
+        output_padding_(std::move(output_padding)),
+        dilation_(std::move(dilation)),
+        groups_(groups),
+        transpose_(transpose) {}
+
+  at::Tensor weight_;
+  std::optional<at::Tensor> bias_;
+  torch::List<int64_t> stride_;
+  torch::List<int64_t> padding_;
+  torch::List<int64_t> output_padding_;
+  torch::List<int64_t> dilation_;
+  int64_t groups_;
+  bool transpose_;
+
+  at::Tensor apply(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_relu(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_dynamic(
+      const at::Tensor& input,
+      bool reduce_range) override;
+
+  std::tuple<at::Tensor, std::optional<at::Tensor>> unpack() override;
+
+  torch::List<int64_t> stride() const override {
+    return stride_;
+  }
+
+  torch::List<int64_t> padding() const override {
+    return padding_;
+  }
+
+  torch::List<int64_t> output_padding() const override {
+    return output_padding_;
+  }
+
+  torch::List<int64_t> dilation() const override {
+    return dilation_;
+  }
+
+  int64_t groups() const override {
+    return groups_;
+  }
+
+  bool transpose() const override {
+    return transpose_;
+  }
+
+  static c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> prepack(
+      at::Tensor weight,
+      std::optional<at::Tensor> bias,
+      torch::List<int64_t> stride,
+      torch::List<int64_t> padding,
+      torch::List<int64_t> output_padding,
+      torch::List<int64_t> dilation,
+      int64_t groups,
+      bool transpose);
+};
+
 struct TORCH_API PackedEmbeddingBagWeight : public EmbeddingPackedParamsBase {
   PackedEmbeddingBagWeight(
       at::Tensor packed_w,
diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp
index 94ac6350aeb0e..9853d49da6866 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp
@@ -30,6 +30,7 @@
 #include <ATen/ops/_empty_affine_quantized.h>
 #include <ATen/ops/_empty_affine_quantized_native.h>
 #include <ATen/ops/_empty_per_channel_affine_quantized_native.h>
+#include <ATen/ops/dequantize.h>
 #include <ATen/ops/empty.h>
 #include <ATen/ops/quantize_per_channel_native.h>
 #include <ATen/ops/quantize_per_tensor_native.h>
@@ -2147,6 +2148,17 @@ class QConvAddInt8 final {
       }
     }
 #endif
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      at::Tensor act_fp = at::dequantize(act);
+      at::Tensor accum_fp = at::dequantize(accum);
+      at::Tensor output_fp = packed_weight->apply_dynamic(act_fp, false);
+      output_fp = output_fp + accum_fp;
+      if (kReluFused) {
+        output_fp = at::relu(output_fp);
+      }
+      return at::native::quantize_per_tensor(
+          output_fp, output_scale, output_zero_point, c10::kQInt8);
+    }
     TORCH_CHECK(
     false,
     "Didn't find engine for operation quantized::conv2d_add.",
diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
index b7b2c5ca8d30e..66b3ec8db7700 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
@@ -718,6 +718,12 @@ class QConvPackWeightInt8 final {
     }
 #endif
 
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      return PackedConvWeightNoQEngine<kSpatialDim>::prepack(
+          std::move(weight), std::move(bias), stride, padding,
+          output_padding, dilation, groups, transpose);
+    }
+
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::conv2d_prepack ",
@@ -814,6 +820,12 @@ class QConv1dPackWeightInt8 final {
     }
 #endif
 
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      return PackedConvWeightNoQEngine<2>::prepack(
+          std::move(weight), std::move(bias), stride, padding,
+          output_padding, dilation, groups, transpose);
+    }
+
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::conv1d_prepack ",
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp
index 1f726b3ee1c3e..47b015452497f 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp
@@ -25,6 +25,7 @@
 #include <ATen/ops/_empty_affine_quantized_native.h>  // for empty_affine_qu...
 #include <ATen/ops/empty.h>                           // for empty
 #include <ATen/ops/quantize_per_channel_native.h>     // for quantize_per_ch...
+#include <ATen/ops/dequantize.h>
 #include <ATen/ops/quantize_per_tensor_native.h>      // for quantize_per_te...
 #include <ATen/ops/zeros.h>
 #include <ATen/ops/_weight_int4pack_mm_for_cpu.h>
@@ -1526,6 +1527,15 @@ class QLinearLeakyReluInt8 final {
           std::move(input), output_scale, output_zero_point, negative_slope);
     }
 #endif
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      auto [weight, bias] = packed_weight->unpack();
+      at::Tensor input_fp = at::dequantize(input);
+      at::Tensor weight_fp = at::dequantize(weight);
+      at::Tensor output_fp = at::linear(input_fp, weight_fp, bias);
+      at::Tensor lr_out = at::leaky_relu(output_fp, negative_slope);
+      return at::native::quantize_per_tensor(
+          lr_out, output_scale, output_zero_point, c10::kQInt8);
+    }
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::linear_leaky_relu ",
@@ -1550,6 +1560,15 @@ class QLinearTanhInt8 final {
           std::move(input), output_scale, output_zero_point);
     }
 #endif
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      auto [weight, bias] = packed_weight->unpack();
+      at::Tensor input_fp = at::dequantize(input);
+      at::Tensor weight_fp = at::dequantize(weight);
+      at::Tensor output_fp = at::linear(input_fp, weight_fp, bias);
+      at::Tensor tanh_out = at::tanh(output_fp);
+      return at::native::quantize_per_tensor(
+          tanh_out, output_scale, output_zero_point, c10::kQInt8);
+    }
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::linear_tanh ",
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
index b4ae4e677bcd2..95ab2fa06c9b5 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
@@ -606,6 +606,10 @@ class QLinearPackWeightInt8 final {
       return PackedLinearWeightsOnednn::prepack(std::move(weight), std::move(bias));
     }
 #endif // #if AT_MKLDNN_ENABLED()
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      return PackedLinearWeightNoQEngine::prepack(
+          std::move(weight), std::move(bias));
+    }
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::linear_prepack ",
@@ -645,6 +649,10 @@ class QLinearPackWeightFp16 final {
           "not supported by ONEDNN");
     }
 #endif // #if AT_MKLDNN_ENABLED()
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      return PackedLinearWeightNoQEngine::prepack(
+          std::move(weight), std::move(bias));
+    }
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::linear_prepack_fp16 ",
diff --git a/aten/src/ATen/native/quantized/qconv_unpack.cpp b/aten/src/ATen/native/quantized/qconv_unpack.cpp
index 4c2352a396177..53d351698fbdc 100644
--- a/aten/src/ATen/native/quantized/qconv_unpack.cpp
+++ b/aten/src/ATen/native/quantized/qconv_unpack.cpp
@@ -70,6 +70,10 @@ class QConvUnpackWeightsInt8 final {
     }
 #endif
 
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      return packed_weight->unpack();
+    }
+
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::conv2d_unpack ",
@@ -111,6 +115,12 @@ class QConv1dUnpackWeightsInt8 final {
     }
 #endif
 
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      std::tie(weight, bias) = packed_weight->unpack();
+      weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2);
+      return std::tuple<at::Tensor, std::optional<at::Tensor>>(weight, bias);
+    }
+
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::conv1d_unpack ",
diff --git a/test/inductor/test_cpu_select_algorithm.py b/test/inductor/test_cpu_select_algorithm.py
index 6c0fdb84da02e..b554b62eb94bb 100644
--- a/test/inductor/test_cpu_select_algorithm.py
+++ b/test/inductor/test_cpu_select_algorithm.py
@@ -1559,6 +1559,7 @@ def forward(self, x):
         vec_amx = VecAMX()
         self._check_amx_counter(vec_amx)
 
+    @unittest.skipIf(not torch._C._has_mkldnn, "MKLDNN is not enabled")
     @inductor_config.patch({"freezing": True})
     @patches
     @torch.no_grad
@@ -1672,6 +1673,7 @@ def forward(self, x, scale):
             vec_amx = VecAMX()
             self._check_amx_counter(vec_amx)
 
+    @unittest.skipIf(not torch._C._has_mkldnn, "MKLDNN is not enabled")
     @inductor_config.patch({"freezing": True, "cpp.enable_concat_linear": True})
     @patches
     @torch.no_grad
diff --git a/test/run_test.py b/test/run_test.py
index ff2a617a6eb78..fbd293f973e8a 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -108,6 +108,7 @@ def upload_adhoc_failure_json(*args, **kwargs):
 INDUCTOR_TEST_PREFIX = "inductor"
 IS_SLOW = "slow" in TEST_CONFIG or "slow" in BUILD_ENVIRONMENT
 IS_S390X = platform.machine() == "s390x"
+IS_RISCV64 = platform.machine() == "riscv64"
 
 
 # Note [ROCm parallel CI testing]
@@ -281,6 +282,52 @@ def __contains__(self, item):
     "test_xpu",
 ]
 
+RISCV64_BLOCKLIST = [
+    # disable distributed related test
+    "inductor/test_distributed_patterns",
+    "fx/test_dce_pass",
+    "export/test_cpp_serdes",
+    "export/test_export",
+    "export/test_export_strict",
+    "export/test_export_training_ir_to_run_decomp",
+    "export/test_retraceability",
+    "export/test_serdes",
+    "export/test_strict_export_v2",
+    "test_public_bindings",
+    "ao/sparsity/test_composability",
+    # QNNPACK is not supported
+    "export/test_converter",
+    # record_contex_cpp is not support on non-linux non-x86_64 platforms
+    "torch_np/numpy_tests/core/test_numeric",
+    # Failed to import torch.distributed.run: cannot import name 'Store' from 'torch.distributed'
+    "test_testing",
+    "inductor/test_aot_inductor_arrayref",
+    "inductor/test_cpu_repro",
+    # TODO: mkldnn not available, shape guard failures on RISC-V
+    "inductor/test_cpu_select_algorithm",
+    # TODO:scalar value not equal, need to fix
+    "profiler/test_profiler",
+    # TODO precision
+    "test_binary_ufuncs",
+    "test_decomp",
+    # TODO no CUDA related module
+    "quantization/core/test_workflow_module",  # TestFakeQuantize.test_fq_module_per_channel
+    "quantization/core/test_workflow_ops",
+    "quantization/core/test_quantized_op",
+    # z3-solver build fail
+    "test_proxy_tensor",
+    # too slow on riscv64
+    # 53013.55 s
+    "functorch/test_aotdispatch",
+    # 25069 s
+    "functorch/test_ops",
+    # 17528 s
+    "test_transformers",
+    # 10897 s
+    "functorch/test_vmap",
+]
+
+
 # The tests inside these files should never be run in parallel with each other
 RUN_PARALLEL_BLOCKLIST = [
     "test_extension_utils",
@@ -1822,6 +1869,13 @@ def get_selected_tests(options) -> list[str]:
             selected_tests,
             "Skip distributed tests on s390x",
         )
+    elif IS_RISCV64:
+        selected_tests = exclude_tests(RISCV64_BLOCKLIST, selected_tests, "on riscv64")
+        selected_tests = exclude_tests(
+            DISTRIBUTED_TESTS,
+            selected_tests,
+            "Skip distributed tests on riscv64",
+        )
 
     # skip all distributed tests if distributed package is not available.
     if not dist.is_available():
diff --git a/test/test_linalg.py b/test/test_linalg.py
index e9461bf83796f..cb1e67e10e598 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -8954,6 +8954,7 @@ def test_matrix_exp_backward_input_validation(self, device, dtype):
         with self.assertRaisesRegex(RuntimeError, "must be batches of square matrices"):
             torch.ops.aten.matrix_exp_backward(non_square, grad_non_square)
 
+    @slowTest
     @skipCUDAIfNoMagma
     @skipCPUIfNoLapack
     @dtypes(torch.float, torch.double, torch.complex64, torch.complex128)
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 45d7b0b253a30..9c78401637347 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -5457,27 +5457,22 @@ def check_bytes(byte_list):
             if not (0 <= byte <= 255):
                 raise AssertionError(f"byte value out of range: expected 0 <= byte <= 255, got {byte}")
 
-    if dtype.is_complex:
-        if len(byte_list) != (num_bytes * 2):
-            raise AssertionError(
-                f"expected len(byte_list) == {num_bytes * 2} for complex dtype, got {len(byte_list)}"
-            )
-        check_bytes(byte_list)
-        real = ctype.from_buffer((ctypes.c_byte * num_bytes)(
-            *byte_list[:num_bytes])).value
-        imag = ctype.from_buffer((ctypes.c_byte * num_bytes)(
-            *byte_list[num_bytes:])).value
-        res = real + 1j * imag
-    else:
-        if len(byte_list) != num_bytes:
-            raise AssertionError(
-                f"expected len(byte_list) == {num_bytes}, got {len(byte_list)}"
-            )
-        check_bytes(byte_list)
-        res = ctype.from_buffer((ctypes.c_byte * num_bytes)(
-            *byte_list)).value
-
-    return torch.tensor(res, device=device, dtype=dtype)
+    expected_len = num_bytes * 2 if dtype.is_complex else num_bytes
+    if len(byte_list) != expected_len:
+        raise AssertionError(
+            f"expected len(byte_list) == {expected_len}"
+            f"{' for complex dtype' if dtype.is_complex else ''}, got {len(byte_list)}"
+        )
+    check_bytes(byte_list)
+
+    # Write bytes directly into storage to preserve exact bit patterns
+    # (e.g. NaN payloads, which are not preserved when round-tripping through
+    # Python float/complex, especially on architectures like RISC-V that
+    # canonicalize NaNs).
+    res = torch.empty((), dtype=dtype, device=device)
+    src = torch.tensor(byte_list, dtype=torch.uint8, device=device)
+    res.untyped_storage().copy_(src.untyped_storage())
+    return res
 
 
 def copy_func(f):