Skip to content

第9篇:异步编程与并行计算在CANN中的应用

本文深入解析CANN(Compute Architecture for Neural Networks)中的异步编程模型与并行计算架构。通过分析源码实现,详细阐述CANN如何通过事件驱动机制、流水线并行、多级并行架构等技术,最大化发挥昇腾AI处理器的硬件性能。文章将探讨CANN受限的异步编程设计理念、死锁避免机制、以及在实际算子开发中的最佳实践。

CANN采用了一种独特的受限异步编程模型,其设计哲学基于以下几个核心原则:

  • 性能优先:异步机制主要为性能优化服务,而非提供通用的并发编程能力
  • 硬件协同:异步模型紧密围绕昇腾AI处理器的硬件架构设计
  • 可预测性:受限的模型确保执行时序的可预测性,便于性能分析和优化
  • 简化同步:通过预定义的同步点,减少复杂的同步逻辑

CANN的异步执行基于Stream概念,类似CUDA的stream机制:

// Stream生命周期管理
class StreamManager {
private:
aclrtStream stream_;
bool initialized_ = false;
public:
void Initialize() {
CHECK_ACL(aclrtCreateStream(&stream_));
initialized_ = true;
}
void Synchronize() {
if (initialized_) {
CHECK_ACL(aclrtSynchronizeStream(stream_));
}
}
void Destroy() {
if (initialized_) {
CHECK_ACL(aclrtDestroyStream(stream_));
initialized_ = false;
}
}
aclrtStream GetStream() const { return stream_; }
};

CANN支持异步核函数启动,实现Host与Device的并行执行:

// 异步核函数启动示例
extern "C" __global__ __aicore__ void async_kernel_example(
GM_ADDR input, GM_ADDR output, GM_ADDR workspace) {
// 异步执行准备
TPipe pipe;
pipe.Init();
// 异步数据搬运
DataCopy(local_buffer, global_input, data_size);
PipeBarrier<PIPE_V>();
// 异步计算
ProcessData(local_buffer);
// 异步结果写回
DataCopy(global_output, local_buffer, result_size);
}
// Host侧异步调用
void LaunchAsyncKernel(void* input, void* output, void* workspace,
aclrtStream stream) {
async_kernel_example<<<grid, block, shared_mem, stream>>>(
reinterpret_cast<GM_ADDR>(input),
reinterpret_cast<GM_ADDR>(output),
reinterpret_cast<GM_ADDR>(workspace));
}

CANN实现了一套基于HardEvent的同步机制,使用预定义的事件类型进行协调:

// 事件类型定义
enum class HardEvent : uint32_t {
// 标量到矩阵引擎
S_MTE2 = 0,
// 矩阵引擎2到向量
MTE2_V = 1,
// 向量到矩阵引擎3
V_MTE3 = 2,
// 矩阵引擎3到标量
MTE3_S = 3,
// 矩阵引擎3到矩阵引擎2
MTE3_MTE2 = 4,
// 矩阵引擎2到矩阵引擎3
MTE2_MTE3 = 5,
// 向量内部同步
V_V = 6
};
// 事件管理器
class EventManager {
public:
static event_t FetchEvent(HardEvent event_type) {
return static_cast<event_t>(GetTPipePtr()->FetchEventID(event_type));
}
template <HardEvent Event>
static void SetFlag(event_t event_id) {
GetTPipePtr()->SetFlag(static_cast<int32_t>(event_id));
}
template <HardEvent Event>
static void WaitFlag(event_t event_id) {
GetTPipePtr()->WaitFlag(static_cast<int32_t>(event_id));
}
};
// 生产者设置事件
template <typename T>
__aicore__ void ProducerStage(T* output) {
// 计算完成
ComputeData(output);
// 设置完成事件
event_t done_event = EventManager::FetchEvent(HardEvent::MTE2_V);
EventManager::SetFlag<HardEvent::MTE2_V>(done_event);
}
// 消费者等待事件
template <typename T>
__aicore__ void ConsumerStage(T* input) {
// 等待数据准备就绪
event_t ready_event = EventManager::FetchEvent(HardEvent::MTE2_V);
EventManager::WaitFlag<HardEvent::MTE2_V>(ready_event);
// 使用数据
ProcessData(input);
}
// 双向同步示例
template <typename T>
__aicore__ void BidirectionalSync(T* data_a, T* data_b) {
event_t event_a2b = EventManager::FetchEvent(HardEvent::MTE2_MTE3);
event_t event_b2a = EventManager::FetchEvent(HardEvent::MTE3_MTE2);
// A阶段处理
ProcessStageA(data_a);
EventManager::SetFlag<HardEvent::MTE2_MTE3>(event_a2b);
// 等待B阶段完成
EventManager::WaitFlag<HardEvent::MTE3_MTE2>(event_b2a);
// A阶段继续处理
ContinueStageA(data_a);
}

管道屏障提供更粗粒度的同步机制:

// 管道屏障使用示例
class PipelineBarrier {
public:
template <PipeType pipe>
static void Sync() {
PipeBarrier<pipe>();
}
// 同步示例
static void SyncAll() {
Sync<PIPE_V>(); // 向量管道同步
Sync<PIPE_MTE2>(); // 矩阵引擎2同步
Sync<PIPE_MTE3>(); // 矩阵引擎3同步
}
};
// 在计算中使用
template <typename T>
__aicore__ void ComputeWithSync(T* input, T* output) {
// 数据加载
DataCopy(local_buffer, input, size);
PipelineBarrier::Sync<PIPE_V>();
// 计算
ComputeKernel(local_buffer);
PipelineBarrier::Sync<PIPE_V>();
// 结果存储
DataCopy(output, local_buffer, size);
PipelineBarrier::Sync<PIPE_V>();
}

Kernel级并行通过Tiling策略实现,支持多种并行模式:

// Tiling策略定义
#define TILINGKEY_SPLIT_H_DROPLESS 200001 // 水平分割无填充
#define TILINGKEY_SPLIT_H_DROP_PAD 200002 // 水平分割填充丢弃
#define TILINGKEY_SPLIT_H_ACTIVE 200003 // 水平分割激活模式
#define TILINGKEY_FULL_LOAD_DROPLESS 300001 // 全加载无填充
#define TILINGKEY_FULL_LOAD_DROP_PAD 300002 // 全加载填充丢弃
// 动态Tiling选择
template <typename T>
class TilingSelector {
public:
static uint32_t SelectOptimalTiling(const ShapeInfo& shape,
const WorkspaceInfo& ws) {
// 根据数据形状和工作空间选择最优Tiling
if (shape.IsSplitFriendly() && ws.IsLargeEnough()) {
return TILINGKEY_SPLIT_H_DROPLESS;
} else if (shape.NeedsPadding()) {
return TILINGKEY_SPLIT_H_DROP_PAD;
} else {
return TILINGKEY_FULL_LOAD_DROPLESS;
}
}
};

Block级并行将数据分配到多个计算单元:

// Block级并行调度
template <typename ComputeClass>
class BlockScheduler {
public:
struct BlockConfig {
uint32_t block_num; // Block数量
uint32_t unit_per_core; // 每核处理单元数
uint32_t more_unit_cores; // 需要额外单元的核心数
uint32_t tail_units; // 尾巴单元数
};
static BlockConfig CalculateBlockConfig(const ShapeInfo& shape) {
BlockConfig config;
// 获取硬件资源
uint32_t available_cores = GetAicoreNum();
uint64_t total_units = shape.GetTotalUnits();
// 计算基本分配
config.unit_per_core = std::max(
total_units / available_cores / MIN_UNITS_PER_CORE,
MIN_UNITS_PER_CORE
);
config.block_num = std::min(
available_cores,
total_units / config.unit_per_core
);
// 处理余数
uint32_t remainder = total_units % config.unit_per_core;
config.more_unit_cores = remainder;
config.tail_units = 0;
return config;
}
};

Core级并行利用AI Core的多核执行能力:

// Core级并行执行
extern "C" __global__ __aicore__ void multi_core_kernel(
GM_ADDR input, GM_ADDR output, uint32_t block_id) {
// 获取Block配置
auto block_config = BlockScheduler<ComputeType>::GetConfig();
// 计算当前core的处理范围
uint32_t start_unit = block_id * block_config.unit_per_core;
uint32_t units_count = block_config.unit_per_core;
// 处理余数分配
if (block_id < block_config.more_unit_cores) {
units_count++;
}
// 获取最后一个block
if (block_id == block_config.block_num - 1) {
units_count += block_config.tail_units;
}
// 执行计算
ProcessUnits(input, output, start_unit, units_count);
}

CANN实现了Load-Compute-Store三级流水线:

// 流水线阶段定义
enum class PipelineStage {
LOAD, // 数据加载阶段
COMPUTE, // 计算阶段
STORE // 结果存储阶段
};
// 流水线管理器
template <typename T>
class PipelineManager {
private:
static constexpr int PIPELINE_DEPTH = 3;
TPipe pipe_;
LocalTensor<T> buffers_[PIPELINE_DEPTH];
int current_stage_ = 0;
public:
void Initialize() {
pipe_.Init();
for (int i = 0; i < PIPELINE_DEPTH; ++i) {
buffers_[i] = pipe_.AllocTensor<T>();
}
}
void Process(T* input, T* output, size_t total_size) {
size_t tile_size = GetOptimalTileSize();
size_t num_tiles = (total_size + tile_size - 1) / tile_size;
// 流水线预热
for (int i = 0; i < PIPELINE_DEPTH && i < num_tiles; ++i) {
LoadStage(input + i * tile_size, buffers_[i],
std::min(tile_size, total_size - i * tile_size));
}
// 流水线执行
int load_idx = PIPELINE_DEPTH;
int compute_idx = 0;
int store_idx = 0;
while (store_idx < num_tiles) {
// Load阶段
if (load_idx < num_tiles) {
LoadStage(input + load_idx * tile_size,
buffers_[load_idx % PIPELINE_DEPTH],
std::min(tile_size, total_size - load_idx * tile_size));
load_idx++;
}
// Compute阶段
if (compute_idx < num_tiles) {
ComputeStage(buffers_[compute_idx % PIPELINE_DEPTH],
buffers_[(compute_idx + 1) % PIPELINE_DEPTH],
tile_size);
compute_idx++;
}
// Store阶段
if (store_idx < num_tiles) {
StoreStage(buffers_[(store_idx + 1) % PIPELINE_DEPTH],
output + store_idx * tile_size,
std::min(tile_size, total_size - store_idx * tile_size));
store_idx++;
}
}
}
private:
void LoadStage(T* src, LocalTensor<T> dst, size_t size) {
DataCopy(dst, src, size);
PipeBarrier<PIPE_V>();
}
void ComputeStage(LocalTensor<T> input, LocalTensor<T> output,
size_t size) {
ComputeKernel(input, output, size);
PipeBarrier<PIPE_V>();
}
void StoreStage(LocalTensor<T> src, T* dst, size_t size) {
DataCopy(dst, src, size);
PipeBarrier<PIPE_V>();
}
};
// 事件驱动流水线
class EventDrivenPipeline {
private:
event_t load_done_;
event_t compute_done_;
event_t store_done_;
public:
void Initialize() {
load_done_ = EventManager::FetchEvent(HardEvent::MTE2_V);
compute_done_ = EventManager::FetchEvent(HardEvent::V_MTE3);
store_done_ = EventManager::FetchEvent(HardEvent::MTE3_S);
}
template <typename T>
void ProcessAsync(T* input, T* output, LocalTensor<T> workspace) {
// 异步Load
AsyncLoad(input, workspace);
// 异步Compute
AsyncCompute(workspace);
// 异步Store
AsyncStore(workspace, output);
}
private:
template <typename T>
void AsyncLoad(T* input, LocalTensor<T> workspace) {
// 启动异步加载
DataCopyAsync(workspace, input, tile_size_);
// 设置完成事件
EventManager::SetFlag<HardEvent::MTE2_V>(load_done_);
}
void AsyncCompute(LocalTensor<T> workspace) {
// 等待加载完成
EventManager::WaitFlag<HardEvent::MTE2_V>(load_done_);
// 启动异步计算
ComputeAsync(workspace);
// 设置完成事件
EventManager::SetFlag<HardEvent::V_MTE3>(compute_done_);
}
template <typename T>
void AsyncStore(LocalTensor<T> workspace, T* output) {
// 等待计算完成
EventManager::WaitFlag<HardEvent::V_MTE3>(compute_done_);
// 启动异步存储
DataCopyAsync(output, workspace, tile_size_);
// 设置完成事件
EventManager::SetFlag<HardEvent::MTE3_S>(store_done_);
}
};
// 动态流水线调度器
class DynamicPipelineScheduler {
private:
struct PipelineMetrics {
float load_efficiency_;
float compute_efficiency_;
float store_efficiency_;
float overall_throughput_;
};
public:
template <typename T>
PipelineMetrics OptimizePipeline(T* input, T* output,
const ShapeInfo& shape) {
PipelineMetrics metrics;
// 测试不同tile大小
std::vector<size_t> tile_sizes = {512, 1024, 2048, 4096, 8192};
float best_throughput = 0.0f;
size_t best_tile_size = 1024;
for (size_t tile_size : tile_sizes) {
auto current_metrics = BenchmarkPipeline(input, output,
shape, tile_size);
if (current_metrics.overall_throughput_ > best_throughput) {
best_throughput = current_metrics.overall_throughput_;
best_tile_size = tile_size;
metrics = current_metrics;
}
}
// 应用最优配置
ApplyOptimalConfig(best_tile_size);
return metrics;
}
private:
template <typename T>
PipelineMetrics BenchmarkPipeline(T* input, T* output,
const ShapeInfo& shape,
size_t tile_size) {
PipelineMetrics metrics;
auto start = std::chrono::high_resolution_clock::now();
// 执行流水线
PipelineManager<T> pipeline;
pipeline.Process(input, output, shape.GetTotalSize());
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(
end - start).count();
// 计算指标
float total_data_mb = shape.GetTotalSize() * sizeof(T) / (1024.0f * 1024.0f);
metrics.overall_throughput_ = total_data_mb / (duration / 1000.0f);
return metrics;
}
};

CANN通过严格的操作顺序避免死锁:

// 安全的事件操作模式
class SafeEventPattern {
public:
template <typename T>
static void SafeProducerConsumer(T* producer_data, T* consumer_data) {
event_t sync_event = EventManager::FetchEvent(HardEvent::MTE2_V);
// 生产者:先设置再等待
EventManager::SetFlag<HardEvent::MTE2_V>(sync_event);
ProducerWork(producer_data);
EventManager::WaitFlag<HardEvent::MTE2_V>(sync_event);
// 消费者:先等待再设置
EventManager::WaitFlag<HardEvent::MTE2_V>(sync_event);
ConsumerWork(consumer_data);
EventManager::SetFlag<HardEvent::MTE2_V>(sync_event);
}
};
// 事件超时保护
class EventTimeoutGuard {
private:
static constexpr uint32_t MAX_WAIT_CYCLES = 1000000; // 最大等待周期
public:
template <HardEvent Event>
static bool WaitFlagWithTimeout(event_t event_id) {
uint32_t wait_cycles = 0;
while (!IsEventSet<Event>(event_id) &&
wait_cycles < MAX_WAIT_CYCLES) {
wait_cycles++;
}
if (wait_cycles >= MAX_WAIT_CYCLES) {
// 超时处理
HandleTimeout<Event>(event_id);
return false;
}
return true;
}
private:
template <HardEvent Event>
static void HandleTimeout(event_t event_id) {
// 记录错误日志
LogError("Event timeout: " + std::to_string(static_cast<int>(Event)));
// 强制重置事件状态
ResetEvent<Event>(event_id);
}
};
// 资源池避免资源竞争
class ResourcePool {
private:
struct Resource {
void* ptr;
bool in_use;
uint32_t last_used_cycle;
};
std::vector<Resource> resources_;
uint32_t current_cycle_ = 0;
public:
void* Allocate(size_t size) {
current_cycle_++;
// 查找可用资源
for (auto& res : resources_) {
if (!res.in_use &&
GetResourceSize(res.ptr) >= size &&
(current_cycle_ - res.last_used_cycle) > MIN_REUSE_GAP) {
res.in_use = true;
res.last_used_cycle = current_cycle_;
return res.ptr;
}
}
// 分配新资源
return AllocateNewResource(size);
}
void Release(void* ptr) {
for (auto& res : resources_) {
if (res.ptr == ptr) {
res.in_use = false;
break;
}
}
}
private:
static constexpr uint32_t MIN_REUSE_GAP = 100; // 最小复用间隔
};
// CPU-GPU异步重叠
class AsyncOverlapOptimizer {
public:
template <typename T>
void OptimizeHostDeviceOverlap(T* host_input, T* device_input,
T* device_output, T* host_output,
size_t size) {
// 创建stream
aclrtStream compute_stream, transfer_stream;
aclrtCreateStream(&compute_stream);
aclrtCreateStream(&transfer_stream);
// 异步数据传输
CHECK_ACL(aclrtMemcpyAsync(device_input, host_input,
size * sizeof(T),
ACL_MEMCPY_HOST_TO_DEVICE,
transfer_stream));
// 异步计算(依赖数据传输)
LaunchComputeKernel(device_input, device_output,
size, compute_stream);
// 等待计算完成
aclrtSynchronizeStream(compute_stream);
// 异步数据传回
CHECK_ACL(aclrtMemcpyAsync(host_output, device_output,
size * sizeof(T),
ACL_MEMCPY_DEVICE_TO_HOST,
transfer_stream));
// 清理资源
aclrtDestroyStream(compute_stream);
aclrtDestroyStream(transfer_stream);
}
};
// 内存访问模式优化
class MemoryAccessOptimizer {
public:
template <typename T>
void OptimizeAccessPattern(T* data, size_t size) {
// 合并内存访问
if (IsCoalescedAccess(data, size)) {
ProcessCoalesced(data, size);
} else {
// 重新组织数据以提高合并度
ReorganizeForCoalescing(data, size);
}
// 避免bank冲突
if (HasBankConflict(data)) {
RemapToAvoidBankConflict(data);
}
}
private:
template <typename T>
void ProcessCoalesced(T* data, size_t size) {
// 使用向量化加载
using VectorType = typename VectorTypeTraits<T>::Type;
T* vec_data = reinterpret_cast<VectorType*>(data);
size_t vec_size = size / (sizeof(VectorType) / sizeof(T));
for (size_t i = 0; i < vec_size; ++i) {
ProcessVector(vec_data[i]);
}
}
};
// 动态负载均衡
class DynamicLoadBalancer {
private:
struct CoreWorkload {
uint32_t core_id;
uint32_t assigned_units;
float execution_time;
};
public:
std::vector<CoreWorkload> BalanceLoad(
const std::vector<uint32_t>& work_units,
uint32_t num_cores) {
std::vector<CoreWorkload> workloads(num_cores);
// 初始化
for (uint32_t i = 0; i < num_cores; ++i) {
workloads[i] = {i, 0, 0.0f};
}
// 基于历史性能数据分配
for (uint32_t unit : work_units) {
uint32_t best_core = FindBestCore(workloads, unit);
workloads[best_core].assigned_units++;
workloads[best_core].execution_time +=
EstimateExecutionTime(unit, best_core);
}
return workloads;
}
private:
uint32_t FindBestCore(const std::vector<CoreWorkload>& workloads,
uint32_t unit) {
uint32_t best_core = 0;
float min_time = workloads[0].execution_time +
EstimateExecutionTime(unit, 0);
for (size_t i = 1; i < workloads.size(); ++i) {
float total_time = workloads[i].execution_time +
EstimateExecutionTime(unit, i);
if (total_time < min_time) {
min_time = total_time;
best_core = i;
}
}
return best_core;
}
};
// MoE路由算子的异步并行实现
class MoERouterAsync {
public:
template <typename T>
void RouteAsync(T* input, T* output, int32_t* routing_map,
const MoEConfig& config) {
// 创建多个stream
std::vector<aclrtStream> streams(config.num_experts);
for (auto& stream : streams) {
aclrtCreateStream(&stream);
}
// 异步路由决策
auto decision_stream = streams[0];
LaunchRoutingDecision(input, routing_map, config, decision_stream);
// 等待路由决策完成
aclrtSynchronizeStream(decision_stream);
// 根据路由结果异步分发数据
for (int i = 0; i < config.num_experts; ++i) {
LaunchDataDispatch(input, output + i * config.expert_capacity,
routing_map, i, streams[i]);
}
// 同步所有专家
for (auto& stream : streams) {
aclrtSynchronizeStream(stream);
}
// 清理资源
for (auto& stream : streams) {
aclrtDestroyStream(stream);
}
}
private:
void LaunchRoutingDecision(void* input, int32_t* routing_map,
const MoEConfig& config,
aclrtStream stream);
void LaunchDataDispatch(void* input, void* expert_input,
int32_t* routing_map, int expert_id,
aclrtStream stream);
};
// 注意力机制的流水线实现
class AttentionPipeline {
private:
enum Stage {
QKV_PROJECTION,
SCORE_COMPUTATION,
ATTENTION_WEIGHTS,
VALUE_AGGREGATION,
OUTPUT_PROJECTION
};
public:
template <typename T>
void ComputeAttention(T* input, T* output,
const AttentionConfig& config) {
// 创建流水线
TPipe pipe;
pipe.Init();
// 分配缓冲区
auto qkv_buffer = pipe.template AllocTensor<T>();
auto score_buffer = pipe.template AllocTensor<T>();
auto weight_buffer = pipe.template AllocTensor<T>();
auto context_buffer = pipe.template AllocTensor<T>();
// 流水线执行
PipelineExecutor executor(pipe);
// Stage 1: QKV投影
executor.AddStage([=]() {
QKVProjection(input, qkv_buffer, config);
});
// Stage 2: 注意力分数计算
executor.AddStage([=]() {
ComputeAttentionScores(qkv_buffer, score_buffer, config);
});
// Stage 3: 注意力权重
executor.AddStage([=]() {
Softmax(score_buffer, weight_buffer, config);
});
// Stage 4: Value聚合
executor.AddStage([=]() {
AggregateValues(qkv_buffer, weight_buffer, context_buffer, config);
});
// Stage 5: 输出投影
executor.AddStage([=]() {
OutputProjection(context_buffer, output, config);
});
// 执行流水线
executor.Run();
// 清理资源
pipe.Destroy();
}
};
  1. 最小化同步点:减少不必要的同步操作
  2. 最大化重叠:CPU-GPU、计算-传输重叠
  3. 避免死锁:遵循事件操作的顺序一致性
  4. 资源复用:合理规划资源生命周期
  1. Tile大小优化:根据硬件特性选择最优tile
  2. 内存访问优化:提高缓存命中率,减少bank冲突
  3. 负载均衡:动态分配任务,避免核心空闲
  4. 流水线深度:平衡延迟和吞吐量
// 性能分析器
class AsyncPerformanceProfiler {
private:
struct ProfileData {
std::string stage_name;
uint64_t start_time;
uint64_t end_time;
float duration_ms;
};
std::vector<ProfileData> profile_data_;
public:
void BeginStage(const std::string& stage_name) {
ProfileData data;
data.stage_name = stage_name;
data.start_time = GetCurrentCycle();
profile_data_.push_back(data);
}
void EndStage(const std::string& stage_name) {
auto& data = profile_data_.back();
data.end_time = GetCurrentCycle();
data.duration_ms = (data.end_time - data.start_time) * CYCLE_TO_MS;
}
void PrintReport() {
float total_time = 0.0f;
std::cout << "=== Async Performance Report ===" << std::endl;
for (const auto& data : profile_data_) {
std::cout << data.stage_name << ": "
<< data.duration_ms << " ms" << std::endl;
total_time += data.duration_ms;
}
std::cout << "Total time: " << total_time << " ms" << std::endl;
std::cout << "Pipeline efficiency: "
<< CalculateEfficiency() * 100 << "%" << std::endl;
}
private:
float CalculateEfficiency() {
float compute_time = 0.0f;
float transfer_time = 0.0f;
for (const auto& data : profile_data_) {
if (data.stage_name.find("Compute") != std::string::npos) {
compute_time += data.duration_ms;
} else if (data.stage_name.find("Transfer") != std::string::npos) {
transfer_time += data.duration_ms;
}
}
return compute_time / (compute_time + transfer_time);
}
};
  1. 创新的异步模型:受限但高效的异步编程模型
  2. 完善的同步机制:基于事件驱动的同步系统
  3. 多级并行架构:Kernel-Block-Core三级并行
  4. 高效流水线:Load-Compute-Store流水线优化
  • 性能提升:充分利用硬件并行能力
  • 开发效率:简化的异步编程模型
  • 可维护性:清晰的同步机制
  • 可扩展性:灵活的并行策略
  1. 更智能的调度:AI驱动的动态调度
  2. 自适应优化:基于工作负载的自适应优化
  3. 跨设备协同:多设备间的异步协同
  4. 更细粒度并行:指令级并行优化

通过深入理解CANN的异步编程与并行计算机制,开发者可以充分发挥昇腾AI处理器的性能潜力,构建高性能的AI应用。



本文基于CANN 7.0版本编写,深入解析了异步编程与并行计算在CANN中的实现和应用。