OpenMP — 并行计算
OpenMP 是编译器指令级并行框架,通过
#pragma omp 注解让串行代码快速并行化,是科学计算和数值计算的标准工具。
启用 OpenMP
bash
# GCC/Clang
g++ -fopenmp main.cpp -o main
clang++ -fopenmp main.cpp -o main
# CMake
find_package(OpenMP REQUIRED)
target_link_libraries(myapp PRIVATE OpenMP::OpenMP_CXX)
基本并行
cpp
#include <omp.h>
#include <iostream>
#include <vector>
// Parallel region: the enclosed block is executed once by every thread.
#pragma omp parallel
{
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
std::cout << "线程 " << tid << "/" << nthreads << "\n";
}
// Parallel for loop (most common form): iterations are divided among threads.
std::vector<double> a(1000000), b(1000000), c(1000000);
#pragma omp parallel for
for (int i = 0; i < 1000000; ++i) {
c[i] = a[i] + b[i];
}
// Explicit thread count, applied to this region only.
#pragma omp parallel for num_threads(4)
for (int i = 0; i < n; ++i) {
process(i);
}
// Runtime queries about the OpenMP environment.
std::cout << "CPU 核数: " << omp_get_num_procs() << "\n";
std::cout << "最大线程数: " << omp_get_max_threads() << "\n";
omp_set_num_threads(8); // 设置线程数
归约(Reduction)
cpp
// Parallel sum: the reduction clause gives each thread a private copy of
// `sum` and combines them at the end, so no race / no manual locking.
double sum = 0.0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < n; ++i) {
sum += data[i];
}
// Multiple reductions in one loop (min/max reductions need OpenMP 3.1+).
double min_val = DBL_MAX, max_val = -DBL_MAX;
#pragma omp parallel for reduction(min:min_val) reduction(max:max_val)
for (int i = 0; i < n; ++i) {
min_val = std::min(min_val, data[i]);
max_val = std::max(max_val, data[i]);
}
// 支持的归约操作:+, *, -, &, |, ^, &&, ||, min, max
调度策略
cpp
// static: fixed assignment — each thread gets a contiguous chunk (the default).
#pragma omp parallel for schedule(static)
for (int i = 0; i < n; ++i) { /* ... */ }
// dynamic: chunks handed out on demand — suits loops with uneven workloads.
#pragma omp parallel for schedule(dynamic, 100) // 100 iterations per chunk
for (int i = 0; i < n; ++i) {
variable_work(i); // per-iteration cost varies
}
// guided: adaptive chunk size, shrinking from large to small.
#pragma omp parallel for schedule(guided)
for (int i = 0; i < n; ++i) { /* ... */ }
同步
cpp
// critical: mutual exclusion — only one thread at a time runs the section.
#pragma omp parallel for
for (int i = 0; i < n; ++i) {
#pragma omp critical
{
shared_list.push_back(compute(i));
}
}
// atomic: hardware atomic update on a single scalar (cheaper than critical).
int counter = 0;
#pragma omp parallel for
for (int i = 0; i < n; ++i) {
#pragma omp atomic
counter++;
}
// barrier: every thread must arrive before any may continue.
#pragma omp parallel
{
phase1();
#pragma omp barrier // wait until all threads have finished phase1
phase2();
}
// single: exactly one thread executes the section.
#pragma omp parallel
{
#pragma omp single
{
std::cout << "只执行一次\n";
}
// the other threads wait here (implicit barrier at the end of single)
}
任务并行
cpp
// task: dynamic tasking — suited to recursion and irregular parallelism.
// One thread (single) creates the tasks; all threads in the team run them.
#pragma omp parallel
{
#pragma omp single
{
for (auto& item : work_list) {
#pragma omp task firstprivate(item)
{
process(item);
}
}
}
}
// Recursive parallel quicksort: after partitioning, each half of the range
// becomes an OpenMP task; taskwait joins both halves before returning.
void parallel_quicksort(int* arr, int n) {
    if (n >= 1000) {
        // Large range: partition, then sort the two halves concurrently.
        const int pivot = partition(arr, n);
        #pragma omp task
        parallel_quicksort(arr, pivot);
        #pragma omp task
        parallel_quicksort(arr + pivot + 1, n - pivot - 1);
        #pragma omp taskwait // join both child tasks
    } else {
        // Small range: task overhead dominates — fall back to serial sort.
        std::sort(arr, arr + n);
    }
}
// Tasks require an active parallel region; `single` ensures only one thread
// makes the top-level call — the recursion then spawns tasks for the team.
#pragma omp parallel
{
#pragma omp single
parallel_quicksort(data.data(), data.size());
}
SIMD 向量化
cpp
// simd: ask the compiler to vectorize this loop explicitly.
#pragma omp simd
for (int i = 0; i < n; ++i) {
c[i] = a[i] * b[i] + d[i];
}
// parallel for simd: thread-level parallelism plus SIMD within each thread.
#pragma omp parallel for simd
for (int i = 0; i < n; ++i) {
c[i] = std::sqrt(a[i] * a[i] + b[i] * b[i]);
}
}
关键认知
OpenMP 是最低侵入性的并行化方式,只需添加 #pragma 注解。reduction 子句是并行归约的正确方式,不要手动加锁。schedule(dynamic) 适合负载不均的循环,schedule(static) 适合均匀负载。科学计算首选 OpenMP,通用并行首选 TBB。