From 3ac0c7e5a7d44d32fef806cca166256f7daf7080 Mon Sep 17 00:00:00 2001 From: Absolutely-Daisy Date: Fri, 26 Jun 2026 19:25:23 +0800 Subject: [PATCH] =?UTF-8?q?feat(OpenMP):=20=E7=BB=9F=E4=B8=80=E5=90=91?= =?UTF-8?q?=E9=87=8F=E8=BF=90=E7=AE=97=E5=B9=B6=E8=A1=8C=E5=8C=96=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=EF=BC=8C=E8=A1=A5=E5=85=85=20Davidson/CG=20=E6=B1=82?= =?UTF-8?q?=E8=A7=A3=E5=99=A8=E9=81=97=E6=BC=8F=E7=9A=84=20OpenMP=20?= =?UTF-8?q?=E5=B9=B6=E8=A1=8C=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit math_kernel_op_vec.cpp 里几个向量运算的 OpenMP 还用的是裸 #pragma omp parallel for, 项目里 memory_op 早换成 OMP_PARALLEL+BLOCK_TASK_DIST_1D 了,顺手统一了一下。 Davidson 里两段 O(n^2) 的矩阵拷贝循环也补了并行化。 diago_cg.cpp 加了 tool_threading.h 以备后续使用。 --- .../kernels/math_kernel_op_vec.cpp | 92 ++++++++++--------- source/source_hsolver/diago_cg.cpp | 1 + source/source_hsolver/diago_dav_subspace.cpp | 49 ++++++---- 3 files changed, 80 insertions(+), 62 deletions(-) diff --git a/source/source_base/kernels/math_kernel_op_vec.cpp b/source/source_base/kernels/math_kernel_op_vec.cpp index 8353b82660a..cf85e61d357 100644 --- a/source/source_base/kernels/math_kernel_op_vec.cpp +++ b/source/source_base/kernels/math_kernel_op_vec.cpp @@ -1,5 +1,7 @@ #include "source_base/kernels/math_kernel_op.h" #include "source_base/module_external/blas_connector.h" +#include "source_base/parallel_reduce.h" +#include "source_base/tool_threading.h" namespace ModuleBase @@ -23,13 +25,14 @@ struct vector_mul_real_op using Real = typename GetTypeReal::type; void operator()(const int dim, T* result, const T* vector, const Real constant) { -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector[i] * constant; - } + ModuleBase::OMP_PARALLEL([&](int num_thread, int thread_id) { + int beg = 0, len = 0; + ModuleBase::BLOCK_TASK_DIST_1D(num_thread, thread_id, dim, (int)(4096 / sizeof(T)), beg, len); + for (int i = beg; i < beg + len; i++) + { + result[i] = vector[i] * constant; + } + }); } }; @@ -41,23 +44,25 @@ struct vector_mul_vector_op { if (add) { -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] += vector1[i] * vector2[i]; - } + ModuleBase::OMP_PARALLEL([&](int num_thread, int thread_id) { + int beg = 0, len = 0; + ModuleBase::BLOCK_TASK_DIST_1D(num_thread, thread_id, dim, (int)(4096 / sizeof(T)), beg, len); + for (int i = beg; i < beg + len; i++) + { + result[i] += vector1[i] * vector2[i]; + } + }); } else { -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector1[i] * vector2[i]; - } + ModuleBase::OMP_PARALLEL([&](int num_thread, int thread_id) { + int beg = 0, len = 0; + ModuleBase::BLOCK_TASK_DIST_1D(num_thread, thread_id, dim, (int)(4096 / sizeof(T)), beg, len); + for (int i = beg; i < beg + len; i++) + { + result[i] = vector1[i] * vector2[i]; + } + }); } } }; @@ -68,13 +73,14 @@ struct vector_div_constant_op using Real = typename GetTypeReal::type; void operator()(const int& dim, T* result, const T* vector, const Real constant) { -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector[i] / constant; - } + ModuleBase::OMP_PARALLEL([&](int num_thread, int thread_id) { + int beg = 0, len = 0; + ModuleBase::BLOCK_TASK_DIST_1D(num_thread, thread_id, dim, (int)(4096 / sizeof(T)), beg, len); + for (int i = beg; i < beg + len; i++) + { + result[i] = vector[i] / constant; + } + }); } }; @@ -84,13 +90,14 @@ struct vector_div_vector_op using Real = typename GetTypeReal::type; void operator()(const int& dim, T* result, const T* vector1, const Real* vector2) { -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector1[i] / vector2[i]; - } + ModuleBase::OMP_PARALLEL([&](int num_thread, int thread_id) { + int beg = 0, len = 0; + ModuleBase::BLOCK_TASK_DIST_1D(num_thread, thread_id, dim, (int)(4096 / sizeof(T)), beg, len); + for (int i = beg; i < beg + len; i++) + { + result[i] = vector1[i] / vector2[i]; + } + }); } }; @@ -120,13 +127,14 @@ struct vector_add_vector_op const T* vector2, const Real constant2) { -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 8192 / sizeof(T)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector1[i] * constant1 + vector2[i] * constant2; - } + ModuleBase::OMP_PARALLEL([&](int num_thread, int thread_id) { + int beg = 0, len = 0; + ModuleBase::BLOCK_TASK_DIST_1D(num_thread, thread_id, dim, (int)(4096 / sizeof(T)), beg, len); + for (int i = beg; i < beg + len; i++) + { + result[i] = vector1[i] * constant1 + vector2[i] * constant2; + } + }); } }; diff --git a/source/source_hsolver/diago_cg.cpp b/source/source_hsolver/diago_cg.cpp index 0c8f5b81e75..082ad8351d0 100644 --- a/source/source_hsolver/diago_cg.cpp +++ b/source/source_hsolver/diago_cg.cpp @@ -9,6 +9,7 @@ #include #include #include // ModuleBase::TITLE +#include #include // ModuleBase::GlobalFunc::NOTE #include diff --git a/source/source_hsolver/diago_dav_subspace.cpp b/source/source_hsolver/diago_dav_subspace.cpp index 4402fad9f8a..34cbe335992 100644 --- a/source/source_hsolver/diago_dav_subspace.cpp +++ b/source/source_hsolver/diago_dav_subspace.cpp @@ -4,6 +4,7 @@ #include "source_base/memory.h" #include "source_base/module_device/device.h" #include "source_base/timer.h" +#include "source_base/tool_threading.h" #include "source_hsolver/kernels/dngvd_op.h" #include "source_base/kernels/math_kernel_op.h" #include "source_hsolver/kernels/bpcg_kernel_op.h" // normalize_op, precondition_op, apply_eigenvalues_op @@ -604,14 +605,18 @@ void Diago_DavSubspace::diag_zhegvx(const int& nbase, std::vector> h_diag(nbase, std::vector(nbase, *this->zero)); std::vector> s_diag(nbase, std::vector(nbase, *this->zero)); - for (size_t i = 0; i < nbase; i++) - { - for (size_t j = 0; j < nbase; j++) + ModuleBase::OMP_PARALLEL([&](int num_thread, int thread_id) { + int beg = 0, len = 0; + ModuleBase::BLOCK_TASK_DIST_1D(num_thread, thread_id, (int)nbase, 8, beg, len); + for (int i = beg; i < beg + len; i++) { - h_diag[i][j] = hcc[i * this->nbase_x + j]; - s_diag[i][j] = scc[i * this->nbase_x + j]; + for (size_t j = 0; j < nbase; j++) + { + h_diag[i][j] = hcc[i * this->nbase_x + j]; + s_diag[i][j] = scc[i * this->nbase_x + j]; + } } - } + }); dngvx_op()(this->ctx, nbase, this->nbase_x, @@ -621,22 +626,26 @@ void Diago_DavSubspace::diag_zhegvx(const int& nbase, (*eigenvalue_iter).data(), this->vcc); // reset: - for (size_t i = 0; i < nbase; i++) - { - for (size_t j = 0; j < nbase; j++) + ModuleBase::OMP_PARALLEL([&](int num_thread, int thread_id) { + int beg = 0, len = 0; + ModuleBase::BLOCK_TASK_DIST_1D(num_thread, thread_id, (int)nbase, 8, beg, len); + for (int i = beg; i < beg + len; i++) { - hcc[i * this->nbase_x + j] = h_diag[i][j]; - scc[i * this->nbase_x + j] = s_diag[i][j]; + for (size_t j = 0; j < nbase; j++) + { + hcc[i * this->nbase_x + j] = h_diag[i][j]; + scc[i * this->nbase_x + j] = s_diag[i][j]; + } + + for (size_t j = nbase; j < this->nbase_x; j++) + { + hcc[i * this->nbase_x + j] = *this->zero; + hcc[j * this->nbase_x + i] = *this->zero; + scc[i * this->nbase_x + j] = *this->zero; + scc[j * this->nbase_x + i] = *this->zero; + } } - - for (size_t j = nbase; j < this->nbase_x; j++) - { - hcc[i * this->nbase_x + j] = *this->zero; - hcc[j * this->nbase_x + i] = *this->zero; - scc[i * this->nbase_x + j] = *this->zero; - scc[j * this->nbase_x + i] = *this->zero; - } - } + }); } } else