Skip to content

OpenMP — 并行计算

OpenMP 是编译器指令级并行框架,通过 #pragma omp 注解让串行代码快速并行化,是科学计算和数值计算的标准工具。

启用 OpenMP

bash
# GCC/Clang: one flag enables both the #pragma handling and the OpenMP runtime
g++ -fopenmp main.cpp -o main
clang++ -fopenmp main.cpp -o main

# CMake: find the package and link the imported target,
# which propagates the compile flags and runtime library automatically
find_package(OpenMP REQUIRED)
target_link_libraries(myapp PRIVATE OpenMP::OpenMP_CXX)

基本并行

cpp
#include <omp.h>
#include <iostream>
#include <vector>

// Parallel region: the enclosed block is executed once by every thread in the team
#pragma omp parallel
{
    int tid = omp_get_thread_num();       // this thread's id, 0..nthreads-1
    int nthreads = omp_get_num_threads(); // team size inside the region
    std::cout << "线程 " << tid << "/" << nthreads << "\n";
}

// Parallel for loop (the most common construct):
// iterations are divided among the threads of the team
std::vector<double> a(1000000), b(1000000), c(1000000);

#pragma omp parallel for
for (int i = 0; i < 1000000; ++i) {
    c[i] = a[i] + b[i];
}

// Request a specific team size for this one construct only
#pragma omp parallel for num_threads(4)
for (int i = 0; i < n; ++i) {
    process(i);
}

// Runtime queries
std::cout << "CPU 核数: " << omp_get_num_procs() << "\n";
std::cout << "最大线程数: " << omp_get_max_threads() << "\n";
omp_set_num_threads(8);  // set the default team size for later parallel regions

归约(Reduction)

cpp
// Parallel sum: the reduction clause gives each thread a private copy of `sum`
// and combines them at the end — no data race, no manual locking needed
double sum = 0.0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < n; ++i) {
    sum += data[i];
}

// Multiple reductions in one loop (min/max require OpenMP 3.1 or later);
// DBL_MAX comes from <cfloat>
double min_val = DBL_MAX, max_val = -DBL_MAX;
#pragma omp parallel for reduction(min:min_val) reduction(max:max_val)
for (int i = 0; i < n; ++i) {
    min_val = std::min(min_val, data[i]);
    max_val = std::max(max_val, data[i]);
}

// Supported reduction operators: +, *, -, &, |, ^, &&, ||, min, max
// (NOTE: the `-` operator behaves like `+` and is deprecated in OpenMP 5.2)

调度策略

cpp
// static: contiguous chunks are assigned up front, one per thread (the default);
// lowest overhead — best when every iteration costs about the same
#pragma omp parallel for schedule(static)
for (int i = 0; i < n; ++i) { /* ... */ }

// dynamic: threads grab the next chunk as they finish — suits uneven workloads
#pragma omp parallel for schedule(dynamic, 100)  // chunk size: 100 iterations
for (int i = 0; i < n; ++i) {
    variable_work(i);  // per-iteration cost varies
}

// guided: adaptive chunk size, starting large and shrinking over time
#pragma omp parallel for schedule(guided)
for (int i = 0; i < n; ++i) { /* ... */ }

同步

cpp
// critical: mutual exclusion — only one thread executes the section at a time.
// Keep the critical section minimal: run the expensive compute(i) OUTSIDE the
// lock (in parallel), otherwise the whole loop is effectively serialized.
#pragma omp parallel for
for (int i = 0; i < n; ++i) {
    auto value = compute(i);  // parallel: no lock held here
    #pragma omp critical
    {
        shared_list.push_back(std::move(value));  // serialized: mutation only
    }
}

// atomic: hardware-level atomic update (cheaper than critical for simple ops)
int counter = 0;
#pragma omp parallel for
for (int i = 0; i < n; ++i) {
    #pragma omp atomic
    counter++;
}

// barrier: synchronization point — every thread must arrive before any continues
#pragma omp parallel
{
    phase1();
    #pragma omp barrier  // wait until all threads have finished phase1
    phase2();
}

// single: exactly one (unspecified) thread executes the block
#pragma omp parallel
{
    #pragma omp single
    {
        std::cout << "只执行一次\n";
    }
    // implicit barrier at the end of `single`: the other threads wait here
}

任务并行

cpp
// task: dynamic tasking (suits recursion and irregular parallelism).
// The `single` region makes one thread create the tasks; all threads of
// the surrounding parallel region then execute them.
#pragma omp parallel
{
    #pragma omp single
    {
        for (auto& item : work_list) {
            // firstprivate(item): each task gets its own copy of the
            // current item, so the loop can advance safely
            #pragma omp task firstprivate(item)
            {
                process(item);
            }
        }
    }
}

// Recursive parallel quicksort: each half of the partition is sorted by a
// child OpenMP task; taskwait joins both halves before returning.
// Must be first invoked from inside a parallel region (see the launch below).
void parallel_quicksort(int* arr, int n) {
    if (n >= 1000) {
        const int split = partition(arr, n);
        // Left half [0, split) and right half (split, n) run as independent tasks
        #pragma omp task
        parallel_quicksort(arr, split);
        #pragma omp task
        parallel_quicksort(arr + split + 1, n - split - 1);
        #pragma omp taskwait  // both children done before this frame returns
    } else {
        // Below the cutoff, task overhead dominates — sort serially instead
        std::sort(arr, arr + n);
    }
}

// Launch: tasks must be created from inside a parallel region; `single`
// lets one thread start the top-level recursion while the rest of the
// team picks up the tasks it spawns.
#pragma omp parallel
{
    #pragma omp single
    // NOTE(review): data.size() is size_t — narrowed to int here; fine only
    // while the element count stays below INT_MAX
    parallel_quicksort(data.data(), data.size());
}

SIMD 向量化

cpp
// simd: explicitly ask the compiler to vectorize this loop
#pragma omp simd
for (int i = 0; i < n; ++i) {
    c[i] = a[i] * b[i] + d[i];
}

// parallel for simd: split iterations across threads AND vectorize each chunk
#pragma omp parallel for simd
for (int i = 0; i < n; ++i) {
    c[i] = std::sqrt(a[i] * a[i] + b[i] * b[i]);
}

关键认知

OpenMP 是最低侵入性的并行化方式,只需添加 #pragma 注解。reduction 子句是并行归约的正确方式,不要手动加锁。schedule(dynamic) 适合负载不均的循环,schedule(static) 适合均匀负载。科学计算首选 OpenMP,通用并行首选 TBB。

系统学习 C++ 生态,深入底层架构