diff --git a/.github/workflows/ci-riscv64.yml b/.github/workflows/ci-riscv64.yml
new file mode 100644
index 0000000000000..44cd575a01e01
--- /dev/null
+++ b/.github/workflows/ci-riscv64.yml
@@ -0,0 +1,107 @@
+# Note: this runner is provided externally, so we minimize its access to
+# secrets.
+on:
+  push:
+    branches: [riscv]
+
+  pull_request_target:
+    types: [opened, synchronize, reopened]
+
+name: CI (riscv64)
+
+permissions:
+  contents: read
+  # No permissions to secrets.
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+# FIXME: Drop this
+env:
+  RUSTFLAGS: -D warnings
+  CARGO_TERM_COLOR: always
+
+jobs:
+  build:
+    name: Build and test
+    runs-on: [self-hosted, linux, amd64]
+    # This is in its own separate environment.
+    environment: riscv64
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 3000 # shallow clone
+          ref: ${{ github.sha }} # including latest sha
+
+      - name: Extract PR info
+        run: |
+          echo "BASE_SHA=${{ github.event.pull_request.base.sha }}" >> $GITHUB_ENV
+          echo "HEAD_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV
+
+      - name: Diff base and head
+        # Event fields such as the PR branch name are attacker-controlled under
+        # pull_request_target, so hand them to the script through env instead of
+        # expanding them into the shell source.
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          BASE_REF: ${{ github.base_ref }}
+          HEAD_REF: ${{ github.head_ref }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          COMMIT_SHA: ${{ github.sha }}
+        run: |
+          if [[ "$EVENT_NAME" == "pull_request" || "$EVENT_NAME" == "pull_request_target" ]]; then
+            echo "Push PR build"
+
+            echo "Base ref: $BASE_REF"
+            echo "Head ref: $HEAD_REF"
+
+            # Hard requirement: the PR must target the riscv branch
+            if [ "$BASE_REF" != "riscv" ]; then
+              echo "ERROR: PR must target 'riscv' branch, got '$BASE_REF'"
+              exit 1
+            fi
+
+            # need to get contents of the PR
+            git fetch --quiet origin "pull/${PR_NUMBER}/head:pr-head"
+            git fetch --quiet origin main
+            BASE=$(git merge-base pr-head origin/main)
+            HEAD=$(git rev-parse pr-head)
+          else
+            echo "Push to riscv"
+            # Baseline is the merge-base with main (the riscv fetch below is disabled)
+            git fetch --quiet origin main
+            #git fetch origin riscv
+
+            BASE=$(git merge-base "$COMMIT_SHA" origin/main) # The latest commit
+            HEAD="$COMMIT_SHA"
+
+          fi
+
+          echo "BASE_COMMIT=$BASE" >> $GITHUB_ENV
+          echo "HEAD_COMMIT=$HEAD" >> $GITHUB_ENV
+
+          echo "Base: $BASE"
+          echo "Head: $HEAD"
+
+      - name: Generate patch
+        run: |
+          echo "Generating patch..."
+
+          SHORT_HEAD=${HEAD_COMMIT:0:7}
+          PATCH_NAME="patch_${SHORT_HEAD}.patch"
+
+          git diff "$BASE_COMMIT" "$HEAD_COMMIT" > "$PATCH_NAME"
+
+          echo "Patch size:"
+          wc -l "$PATCH_NAME"
+
+          cp "$PATCH_NAME" /home/jenkins/patch/
+          cat "/home/jenkins/patch/$PATCH_NAME"
+
+          echo "PATCH_FILE=$PATCH_NAME" >> $GITHUB_ENV
+
+      - name: Trigger Jenkins Job
+        run: |
+          bash /home/jenkins/scripts/jenkins-run.sh "$BASE_COMMIT" "$PATCH_FILE"
diff --git a/aten/src/ATen/native/quantized/cpu/conv_serialization.h b/aten/src/ATen/native/quantized/cpu/conv_serialization.h
index 3edd398fa789a..3e45a8062b0d8 100644
--- a/aten/src/ATen/native/quantized/cpu/conv_serialization.h
+++ b/aten/src/ATen/native/quantized/cpu/conv_serialization.h
@@ -410,6 +410,19 @@ c10::intrusive_ptr> deserialize_conv(
     );
   }
 #endif // AT_MKLDNN_ENABLED()
+  // No optimized backend is selected: keep the raw weight in the NoQEngine fallback.
+  if (ctx.qEngine() == at::QEngine::NoQEngine) {
+    return PackedConvWeightNoQEngine::prepack(
+      std::move(weight.value()),
+      std::move(bias),
+      stride,
+      padding,
+      output_padding,
+      dilation,
+      groups,
+      transpose
+    );
+  }
   TORCH_CHECK(
     false,
     "Didn't find engine for when deserializing ConvPackedParams: ",
diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
index 1e4d2b9960d02..a45824b3485af 100644
--- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
+++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
@@ -25,6 +25,16 @@
 #include
 #else
 #include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
 #endif

@@ -365,6 +375,270 @@ Tensor ConvertConvWeightsToChannelLastTensor<3>(

 #endif // USE_FBGEMM

+// NoQEngine packed weight implementations: dequantize, float compute, quantize.
+// Used as a fallback when no hardware-specific quantized engine is available.
+
+c10::intrusive_ptr PackedLinearWeightNoQEngine::prepack(
+    at::Tensor weight,
+    std::optional bias) {
+  return c10::make_intrusive(
+      std::move(weight), std::move(bias));
+}
+
+at::Tensor PackedLinearWeightNoQEngine::apply(
+    at::Tensor input,
+    double output_scale,
+    int64_t output_zero_point) {
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  at::Tensor output_fp = at::linear(input_fp, weight_fp, bias_);
+  return at::quantize_per_tensor(
+      output_fp, output_scale, output_zero_point, input.scalar_type());
+}
+
+at::Tensor PackedLinearWeightNoQEngine::apply_relu(
+    at::Tensor input,
+    double output_scale,
+    int64_t output_zero_point) {
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  at::Tensor output_fp = at::linear(input_fp, weight_fp, bias_);
+  at::Tensor relu_out = at::relu(output_fp);
+  return at::quantize_per_tensor(
+      relu_out, output_scale, output_zero_point, input.scalar_type());
+}
+
+at::Tensor& PackedLinearWeightNoQEngine::apply_out(
+    const at::Tensor& input,
+    double output_scale,
+    int64_t output_zero_point,
+    at::Tensor& output) {
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  at::Tensor output_fp = at::linear(input_fp, weight_fp, bias_);
+  at::Tensor q_out = at::quantize_per_tensor(
+      output_fp, output_scale, output_zero_point, input.scalar_type());
+  output.copy_(q_out);
+  return output;
+}
+
+at::Tensor& PackedLinearWeightNoQEngine::apply_relu_out(
+    const at::Tensor& input,
+    double output_scale,
+    int64_t output_zero_point,
+    at::Tensor& output) {
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  at::Tensor output_fp = at::linear(input_fp, weight_fp, bias_);
+  at::Tensor relu_out = at::relu(output_fp);
+  at::Tensor q_out = at::quantize_per_tensor(
+      relu_out, output_scale, output_zero_point, input.scalar_type());
+  output.copy_(q_out);
+  return output;
+}
+
+at::Tensor PackedLinearWeightNoQEngine::apply_with_input_q_dq_qweight_dq_output_fp32(
+    at::Tensor input,
+    double input_scale,
+    int64_t input_zero_point) {
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  return at::linear(input_fp, weight_fp, bias_);
+}
+
+at::Tensor PackedLinearWeightNoQEngine::apply_with_input_q_dq_qweight_dq_relu_output_fp32(
+    at::Tensor input,
+    double input_scale,
+    int64_t input_zero_point) {
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  at::Tensor output_fp = at::linear(input_fp, weight_fp, bias_);
+  return at::relu(output_fp);
+}
+
+at::Tensor PackedLinearWeightNoQEngine::apply_dynamic(
+    at::Tensor input,
+    bool reduce_range) {
+  at::Tensor weight_fp = at::dequantize(weight_);
+  return at::linear(input, weight_fp, bias_);
+}
+
+at::Tensor PackedLinearWeightNoQEngine::apply_dynamic_relu(
+    at::Tensor input,
+    bool reduce_range) {
+  at::Tensor weight_fp = at::dequantize(weight_);
+  at::Tensor output_fp = at::linear(input, weight_fp, bias_);
+  return at::relu(output_fp);
+}
+
+std::tuple>
+PackedLinearWeightNoQEngine::unpack() {
+  return std::make_tuple(weight_, bias_);
+}
+
+template
+c10::intrusive_ptr>
+PackedConvWeightNoQEngine::prepack(
+    at::Tensor weight,
+    std::optional bias,
+    torch::List stride,
+    torch::List padding,
+    torch::List output_padding,
+    torch::List dilation,
+    int64_t groups,
+    bool transpose) {
+  return c10::make_intrusive>(
+      std::move(weight),
+      std::move(bias),
+      std::move(stride),
+      std::move(padding),
+      std::move(output_padding),
+      std::move(dilation),
+      groups,
+      transpose);
+}
+
+template
+at::Tensor PackedConvWeightNoQEngine::apply(
+    const at::Tensor& input,
+    double output_scale,
+    int64_t output_zero_point) {
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  auto stride = stride_.vec();
+  auto padding = padding_.vec();
+  auto dilation = dilation_.vec();
+  at::Tensor output_fp;
+  if (transpose_) {
+    auto output_padding = output_padding_.vec();
+    if constexpr (kSpatialDim == 1) {
+      output_fp = at::conv_transpose1d(
+          input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    } else if constexpr (kSpatialDim == 2) {
+      output_fp = at::conv_transpose2d(
+          input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    } else if constexpr (kSpatialDim == 3) {
+      output_fp = at::conv_transpose3d(
+          input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    }
+  } else {
+    if constexpr (kSpatialDim == 1) {
+      output_fp = at::conv1d(
+          input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    } else if constexpr (kSpatialDim == 2) {
+      output_fp = at::conv2d(
+          input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    } else if constexpr (kSpatialDim == 3) {
+      output_fp = at::conv3d(
+          input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    }
+  }
+  return at::quantize_per_tensor(
+      output_fp, output_scale, output_zero_point, input.scalar_type());
+}
+
+template
+at::Tensor PackedConvWeightNoQEngine::apply_relu(
+    const at::Tensor& input,
+    double output_scale,
+    int64_t output_zero_point) {
+  at::Tensor input_fp = at::dequantize(input);
+  at::Tensor weight_fp = at::dequantize(weight_);
+  auto stride = stride_.vec();
+  auto padding = padding_.vec();
+  auto dilation = dilation_.vec();
+  at::Tensor output_fp;
+  if (transpose_) {
+    auto output_padding = output_padding_.vec();
+    if constexpr (kSpatialDim == 1) {
+      output_fp = at::conv_transpose1d(
+          input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    } else if constexpr (kSpatialDim == 2) {
+      output_fp = at::conv_transpose2d(
+          input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    } else if constexpr (kSpatialDim == 3) {
+      output_fp = at::conv_transpose3d(
+          input_fp, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    }
+  } else {
+    if constexpr (kSpatialDim == 1) {
+      output_fp = at::conv1d(
+          input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    } else if constexpr (kSpatialDim == 2) {
+      output_fp = at::conv2d(
+          input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    } else if constexpr (kSpatialDim == 3) {
+      output_fp = at::conv3d(
+          input_fp, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    }
+  }
+  at::Tensor relu_out = at::relu(output_fp);
+  return at::quantize_per_tensor(
+      relu_out, output_scale, output_zero_point, input.scalar_type());
+}
+
+template
+at::Tensor PackedConvWeightNoQEngine::apply_dynamic(
+    const at::Tensor& input,
+    bool reduce_range) {
+  at::Tensor weight_fp = at::dequantize(weight_);
+  auto stride = stride_.vec();
+  auto padding = padding_.vec();
+  auto dilation = dilation_.vec();
+  if (transpose_) {
+    auto output_padding = output_padding_.vec();
+    if constexpr (kSpatialDim == 1) {
+      return at::conv_transpose1d(
+          input, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    } else if constexpr (kSpatialDim == 2) {
+      return at::conv_transpose2d(
+          input, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    } else {
+      return at::conv_transpose3d(
+          input, weight_fp, bias_,
+          stride, padding, output_padding, groups_, dilation);
+    }
+  } else {
+    if constexpr (kSpatialDim == 1) {
+      return at::conv1d(
+          input, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    } else if constexpr (kSpatialDim == 2) {
+      return at::conv2d(
+          input, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    } else {
+      return at::conv3d(
+          input, weight_fp, bias_,
+          stride, padding, dilation, groups_);
+    }
+  }
+}
+
+template
+std::tuple>
+PackedConvWeightNoQEngine::unpack() {
+  return std::make_tuple(weight_, bias_);
+}
+
+template struct PackedConvWeightNoQEngine<1>;
+template struct PackedConvWeightNoQEngine<2>;
+template struct PackedConvWeightNoQEngine<3>;
+
 namespace {
 // This is really terrible, but couldn't figure out a better way to constexpr convert int to
 // string and then perform string concatenation on/with it
@@ -469,6 +743,9 @@ int register_linear_params() {
                   return std::apply(PackedLinearWeightsOnednn::prepack, std::move(state));
                 }
 #endif // #if AT_MKLDNN_ENABLED()
+                if (at::globalContext().qEngine() == at::QEngine::NoQEngine) {
+                  return std::apply(PackedLinearWeightNoQEngine::prepack, std::move(state));
+                }
                 TORCH_CHECK(false, "Unknown qengine");
               })
           .def("bias", [](const c10::intrusive_ptr& self) {
diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
index a1139be833f87..5d5acd8b4950c 100644
--- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
+++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
@@ -352,6 +352,153 @@ Tensor ConvertConvWeightsToChannelLastTensor(

 #endif // USE_FBGEMM

+// Linear packed params for the NoQEngine fallback. Holds the weight and the
+// optional bias exactly as passed to prepack(); the apply* methods dequantize,
+// run at::linear in floating point and, for the quantized entry points,
+// re-quantize the result with the requested output scale / zero point.
+struct TORCH_API PackedLinearWeightNoQEngine : public LinearPackedParamsBase {
+  PackedLinearWeightNoQEngine(
+      at::Tensor weight,
+      std::optional bias)
+      : weight_(std::move(weight)),
+        bias_(std::move(bias)) {}
+
+  at::Tensor weight_;
+  std::optional bias_;
+
+  at::Tensor apply(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_relu(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor& apply_out(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point,
+      at::Tensor& output) override;
+
+  at::Tensor& apply_relu_out(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point,
+      at::Tensor& output) override;
+
+  at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32(
+      at::Tensor input,
+      double input_scale,
+      int64_t input_zero_point) override;
+
+  at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32(
+      at::Tensor input,
+      double input_scale,
+      int64_t input_zero_point) override;
+
+  at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false)
+      override;
+
+  at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false)
+      override;
+
+  std::tuple> unpack() override;
+
+  std::optional bias() override {
+    return bias_;
+  }
+
+  static c10::intrusive_ptr prepack(
+      at::Tensor weight,
+      std::optional bias);
+};
+
+// Convolution counterpart of the NoQEngine fallback. Stores the weight, the
+// optional bias and the convolution hyper-parameters unchanged, and evaluates
+// via dequantize -> float conv / conv_transpose -> quantize.
+template
+struct TORCH_API PackedConvWeightNoQEngine
+    : public ConvPackedParamsBase {
+  PackedConvWeightNoQEngine(
+      at::Tensor weight,
+      std::optional bias,
+      torch::List stride,
+      torch::List padding,
+      torch::List output_padding,
+      torch::List dilation,
+      int64_t groups,
+      bool transpose)
+      : weight_(std::move(weight)),
+        bias_(std::move(bias)),
+        stride_(std::move(stride)),
+        padding_(std::move(padding)),
+        output_padding_(std::move(output_padding)),
+        dilation_(std::move(dilation)),
+        groups_(groups),
+        transpose_(transpose) {}
+
+  at::Tensor weight_;
+  std::optional bias_;
+  torch::List stride_;
+  torch::List padding_;
+  torch::List output_padding_;
+  torch::List dilation_;
+  int64_t groups_;
+  bool transpose_;
+
+  at::Tensor apply(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_relu(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_dynamic(
+      const at::Tensor& input,
+      bool reduce_range) override;
+
+  std::tuple> unpack() override;
+
+  torch::List stride() const override {
+    return stride_;
+  }
+
+  torch::List padding() const override {
+    return padding_;
+  }
+
+  torch::List output_padding() const override {
+    return output_padding_;
+  }
+
+  torch::List dilation() const override {
+    return dilation_;
+  }
+
+  int64_t groups() const override {
+    return groups_;
+  }
+
+  bool transpose() const override {
+    return transpose_;
+  }
+
+  static c10::intrusive_ptr> prepack(
+      at::Tensor weight,
+      std::optional bias,
+      torch::List stride,
+      torch::List padding,
+      torch::List output_padding,
+      torch::List dilation,
+      int64_t groups,
+      bool transpose);
+};
+
 struct TORCH_API PackedEmbeddingBagWeight : public EmbeddingPackedParamsBase {
   PackedEmbeddingBagWeight(
       at::Tensor packed_w,
diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp
index 94ac6350aeb0e..9853d49da6866 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -2147,6 +2148,17 @@ class QConvAddInt8 final {
       }
     }
 #endif
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      at::Tensor act_fp = at::dequantize(act);
+      at::Tensor accum_fp = at::dequantize(accum);
+      at::Tensor output_fp = packed_weight->apply_dynamic(act_fp, false);
+      output_fp = output_fp + accum_fp;
+      if (kReluFused) {
+        output_fp = at::relu(output_fp);
+      }
+      return at::native::quantize_per_tensor(
+          output_fp, output_scale, output_zero_point, act.scalar_type());
+    }
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::conv2d_add.",
diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
index b7b2c5ca8d30e..66b3ec8db7700 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
@@ -718,6 +718,12 @@ class QConvPackWeightInt8 final {
     }
 #endif

+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      return PackedConvWeightNoQEngine::prepack(
+          std::move(weight), std::move(bias), stride, padding,
+          output_padding, dilation, groups, transpose);
+    }
+
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::conv2d_prepack ",
@@ -814,6 +820,12 @@ class QConv1dPackWeightInt8 final {
     }
 #endif

+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      return PackedConvWeightNoQEngine<2>::prepack(
+          std::move(weight), std::move(bias), stride, padding,
+          output_padding, dilation, groups, transpose);
+    }
+
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::conv1d_prepack ",
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp
index 1f726b3ee1c3e..47b015452497f 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp
@@ -25,6 +25,7 @@
 #include // for empty_affine_qu...
 #include // for empty
 #include // for quantize_per_ch...
+#include
 #include // for quantize_per_te...
 #include
 #include
@@ -1526,6 +1527,15 @@ class QLinearLeakyReluInt8 final {
           std::move(input), output_scale, output_zero_point, negative_slope);
     }
 #endif
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      auto [weight, bias] = packed_weight->unpack();
+      at::Tensor input_fp = at::dequantize(input);
+      at::Tensor weight_fp = at::dequantize(weight);
+      at::Tensor output_fp = at::linear(input_fp, weight_fp, bias);
+      at::Tensor lr_out = at::leaky_relu(output_fp, negative_slope);
+      return at::native::quantize_per_tensor(
+          lr_out, output_scale, output_zero_point, input.scalar_type());
+    }
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::linear_leaky_relu ",
@@ -1550,6 +1560,15 @@ class QLinearTanhInt8 final {
           std::move(input), output_scale, output_zero_point);
     }
 #endif
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      auto [weight, bias] = packed_weight->unpack();
+      at::Tensor input_fp = at::dequantize(input);
+      at::Tensor weight_fp = at::dequantize(weight);
+      at::Tensor output_fp = at::linear(input_fp, weight_fp, bias);
+      at::Tensor tanh_out = at::tanh(output_fp);
+      return at::native::quantize_per_tensor(
+          tanh_out, output_scale, output_zero_point, input.scalar_type());
+    }
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::linear_tanh ",
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
index b4ae4e677bcd2..95ab2fa06c9b5 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
@@ -606,6 +606,10 @@ class QLinearPackWeightInt8 final {
       return PackedLinearWeightsOnednn::prepack(std::move(weight), std::move(bias));
     }
 #endif // #if AT_MKLDNN_ENABLED()
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      return PackedLinearWeightNoQEngine::prepack(
+          std::move(weight), std::move(bias));
+    }
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::linear_prepack ",
@@ -645,6 +649,10 @@ class QLinearPackWeightFp16 final {
           "not supported by ONEDNN");
     }
 #endif // #if AT_MKLDNN_ENABLED()
+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      return PackedLinearWeightNoQEngine::prepack(
+          std::move(weight), std::move(bias));
+    }
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::linear_prepack_fp16 ",
diff --git a/aten/src/ATen/native/quantized/qconv_unpack.cpp b/aten/src/ATen/native/quantized/qconv_unpack.cpp
index 4c2352a396177..53d351698fbdc 100644
--- a/aten/src/ATen/native/quantized/qconv_unpack.cpp
+++ b/aten/src/ATen/native/quantized/qconv_unpack.cpp
@@ -70,6 +70,10 @@ class QConvUnpackWeightsInt8 final {
     }
 #endif

+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      return packed_weight->unpack();
+    }
+
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::conv2d_unpack ",
@@ -111,6 +115,12 @@ class QConv1dUnpackWeightsInt8 final {
     }
 #endif

+    if (ctx.qEngine() == at::QEngine::NoQEngine) {
+      std::tie(weight, bias) = packed_weight->unpack();
+      weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2);
+      return std::tuple>(weight, bias);
+    }
+
     TORCH_CHECK(
         false,
         "Didn't find engine for operation quantized::conv1d_unpack ",
diff --git a/test/inductor/test_cpu_select_algorithm.py b/test/inductor/test_cpu_select_algorithm.py
index 6c0fdb84da02e..b554b62eb94bb 100644
--- a/test/inductor/test_cpu_select_algorithm.py
+++ b/test/inductor/test_cpu_select_algorithm.py
@@ -1559,6 +1559,7 @@ def forward(self, x):
         vec_amx = VecAMX()
         self._check_amx_counter(vec_amx)

+    @unittest.skipIf(not torch._C._has_mkldnn, "MKLDNN is not enabled")
     @inductor_config.patch({"freezing": True})
     @patches
     @torch.no_grad
@@ -1672,6 +1673,7 @@ def forward(self, x, scale):
         vec_amx = VecAMX()
         self._check_amx_counter(vec_amx)

+    @unittest.skipIf(not torch._C._has_mkldnn, "MKLDNN is not enabled")
     @inductor_config.patch({"freezing": True, "cpp.enable_concat_linear": True})
     @patches
     @torch.no_grad
diff --git a/test/run_test.py b/test/run_test.py
index ff2a617a6eb78..fbd293f973e8a 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -108,6 +108,7 @@ def upload_adhoc_failure_json(*args, **kwargs):
 INDUCTOR_TEST_PREFIX = "inductor"
 IS_SLOW = "slow" in TEST_CONFIG or "slow" in BUILD_ENVIRONMENT
 IS_S390X = platform.machine() == "s390x"
+IS_RISCV64 = platform.machine() == "riscv64"


 # Note [ROCm parallel CI testing]
@@ -281,6 +282,52 @@ def __contains__(self, item):
     "test_xpu",
 ]

+RISCV64_BLOCKLIST = [
+    # disable distributed-related tests
+    "inductor/test_distributed_patterns",
+    "fx/test_dce_pass",
+    "export/test_cpp_serdes",
+    "export/test_export",
+    "export/test_export_strict",
+    "export/test_export_training_ir_to_run_decomp",
+    "export/test_retraceability",
+    "export/test_serdes",
+    "export/test_strict_export_v2",
+    "test_public_bindings",
+    "ao/sparsity/test_composability",
+    # QNNPACK is not supported
+    "export/test_converter",
+    # record_context_cpp is not supported on non-linux non-x86_64 platforms
+    "torch_np/numpy_tests/core/test_numeric",
+    # Failed to import torch.distributed.run: cannot import name 'Store' from 'torch.distributed'
+    "test_testing",
+    "inductor/test_aot_inductor_arrayref",
+    "inductor/test_cpu_repro",
+    # TODO: mkldnn not available, shape guard failures on RISC-V
+    "inductor/test_cpu_select_algorithm",
+    # TODO: scalar values not equal, needs a fix
+    "profiler/test_profiler",
+    # TODO: precision
+    "test_binary_ufuncs",
+    "test_decomp",
+    # TODO: no CUDA-related module
+    "quantization/core/test_workflow_module",  # TestFakeQuantize.test_fq_module_per_channel
+    "quantization/core/test_workflow_ops",
+    "quantization/core/test_quantized_op",
+    # z3-solver build fails
+    "test_proxy_tensor",
+    # too slow on riscv64
+    # 53013.55 s
+    "functorch/test_aotdispatch",
+    # 25069 s
+    "functorch/test_ops",
+    # 17528 s
+    "test_transformers",
+    # 10897 s
+    "functorch/test_vmap",
+]
+
+
 # The tests inside these files should never be run in parallel with each other
 RUN_PARALLEL_BLOCKLIST = [
     "test_extension_utils",
@@ -1822,6 +1869,13 @@ def get_selected_tests(options) -> list[str]:
             selected_tests,
             "Skip distributed tests on s390x",
         )
+    elif IS_RISCV64:
+        selected_tests = exclude_tests(RISCV64_BLOCKLIST, selected_tests, "on riscv64")
+        selected_tests = exclude_tests(
+            DISTRIBUTED_TESTS,
+            selected_tests,
+            "Skip distributed tests on riscv64",
+        )

     # skip all distributed tests if distributed package is not available.
     if not dist.is_available():
diff --git a/test/test_linalg.py b/test/test_linalg.py
index e9461bf83796f..cb1e67e10e598 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -8954,6 +8954,7 @@ def test_matrix_exp_backward_input_validation(self, device, dtype):
         with self.assertRaisesRegex(RuntimeError, "must be batches of square matrices"):
             torch.ops.aten.matrix_exp_backward(non_square, grad_non_square)

+    @slowTest
     @skipCUDAIfNoMagma
     @skipCPUIfNoLapack
     @dtypes(torch.float, torch.double, torch.complex64, torch.complex128)
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 45d7b0b253a30..9c78401637347 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -5457,27 +5457,22 @@ def check_bytes(byte_list):
             if not (0 <= byte <= 255):
                 raise AssertionError(f"byte value out of range: expected 0 <= byte <= 255, got {byte}")

-    if dtype.is_complex:
-        if len(byte_list) != (num_bytes * 2):
-            raise AssertionError(
-                f"expected len(byte_list) == {num_bytes * 2} for complex dtype, got {len(byte_list)}"
-            )
-        check_bytes(byte_list)
-        real = ctype.from_buffer((ctypes.c_byte * num_bytes)(
-            *byte_list[:num_bytes])).value
-        imag = ctype.from_buffer((ctypes.c_byte * num_bytes)(
-            *byte_list[num_bytes:])).value
-        res = real + 1j * imag
-    else:
-        if len(byte_list) != num_bytes:
-            raise AssertionError(
-                f"expected len(byte_list) == {num_bytes}, got {len(byte_list)}"
-            )
-        check_bytes(byte_list)
-        res = ctype.from_buffer((ctypes.c_byte * num_bytes)(
-            *byte_list)).value
-
-    return torch.tensor(res, device=device, dtype=dtype)
+    expected_len = num_bytes * 2 if dtype.is_complex else num_bytes
+    if len(byte_list) != expected_len:
+        raise AssertionError(
+            f"expected len(byte_list) == {expected_len}"
+            f"{' for complex dtype' if dtype.is_complex else ''}, got {len(byte_list)}"
+        )
+    check_bytes(byte_list)
+
+    # Write bytes directly into storage to preserve exact bit patterns
+    # (e.g. NaN payloads, which are not preserved when round-tripping through
+    # Python float/complex, especially on architectures like RISC-V that
+    # canonicalize NaNs).
+    res = torch.empty((), dtype=dtype, device=device)
+    src = torch.tensor(byte_list, dtype=torch.uint8, device=device)
+    res.untyped_storage().copy_(src.untyped_storage())
+    return res


 def copy_func(f):