仅对英特尔可见 — GUID: mub1476294593345
Ixiasoft
仅对英特尔可见 — GUID: mub1476294593345
Ixiasoft
2.7. 基于HTML Report中的信息优化OpenCL设计实例
执行matrix square(矩阵平方)AxA的OpenCL设计实例:
// performs matrix square A*A
// A is a square LEN*LEN matrix
// A = [ r0 : [ c[0], c[1], ... c[LEN-1] ],
// r1 : [ c[0], ... ],
// ... ],
// r[LEN-1] : [ ] ]
// LEN = 100
// Baseline version: both multiplicands are loaded from global memory
// inside the unrolled inner loop — this is why the area report shows
// high FF/RAM estimates and the dot-product loop has high latency.
kernel void matrix_square (global float* restrict A, global float* restrict out)
{
// loop over every element of the LEN*LEN output matrix
for( unsigned oi = 0 ; oi < LEN*LEN ; oi++ )
{
float sum = 0.f;
// derive (row, col) from the flat output index
int row = oi / LEN;
int col = oi % LEN;
// fully unrolled dot product of row `row` with column `col` of A
#pragma unroll
for ( int stride = 0 ; stride < LEN ; stride++ )
{
unsigned i = (row * LEN) + stride;   // element (row, stride)
unsigned j = (stride * LEN) + col;   // element (stride, col)
sum += A[i] * A[j];
}
out[oi] = sum;
}
}
内核matrix_square的面积分析表明触发器(FF)和RAM的估计使用率很高。

在System查看器中对block 2的进一步检查,表明Block2也具有高的延迟值。

导致这些性能瓶颈的原因是系统在循环内从全局存储器加载数据。因此,您可以采取的第一个优化步骤是将数据预加载到局部存储器中,如以下修改后的代码所示:
// 1. preload the data into local memory
// Optimization 1: copy A from global memory into the on-chip local
// buffer cache_a once, so the unrolled dot-product loop reads local
// memory instead of issuing 2*LEN global loads per output element.
kernel void matrix_square_v1 (global float* restrict A, global float* restrict out)
{
local float cache_a[LEN*LEN];
// preload pass: one global-memory read per element of A
for( unsigned k = 0 ; k < LEN*LEN ; k++ )
{
cache_a[k] = A[k];
}
for( unsigned oi = 0 ; oi < LEN*LEN ; oi++ )
{
float sum = 0.f;
// derive (row, col) from the flat index; the / and % still cost area
int row = oi / LEN;
int col = oi % LEN;
#pragma unroll
for( unsigned stride = 0 ; stride < LEN ; stride++ )
{
unsigned i = (row * LEN) + stride;   // element (row, stride)
unsigned j = (stride * LEN) + col;   // element (stride, col)
sum += cache_a[i] * cache_a[j];
}
out[oi] = sum;
}
}


如果您移除模数(模运算)并以列计数器替换,如修改后的内核matrix_square_v2中所示,您可以减少自适应查找表(ALUT)和FF的使用量。
// 1. preload the data into local memory
// 2. remove the modulus computation
// Optimization 2: replace the per-iteration / and % that derived
// (row, col) from oi with a running column counter that wraps at the
// end of each row — this reduces ALUT and FF usage.
kernel void matrix_square_v2 (global float* restrict A, global float* restrict out)
{
local float cache_a[LEN*LEN];
// preload pass: one global-memory read per element of A
for( unsigned k = 0 ; k < LEN*LEN ; k++ )
{
cache_a[k] = A[k];
}
unsigned row = 0;
unsigned col = 0;
for( unsigned oi = 0 ; oi < LEN*LEN ; oi++ )
{
float sum = 0.f;
#pragma unroll
for( unsigned stride = 0 ; stride < LEN ; stride++ )
{
unsigned i = (row * LEN) + stride;
unsigned j = (stride * LEN) + col;
sum += cache_a[i] * cache_a[j];
}
out[oi] = sum;
// keep a column counter to know when to increment row.
// BUGFIX: advance the counters AFTER the dot product. The original
// updated them first, so out[oi] was computed for the (row, col) of
// oi+1 (out[0] used col == 1), giving results shifted by one element
// relative to the correct matrix_square_v1.
if( col == LEN - 1 )
{
col = 0;
row += 1;
}
else
{
col += 1;
}
}
}


进一步检查matrix_square_v2的面积报告,可发现索引i和j(即,分别是unsigned i = (row * LEN) + stride和unsigned j = (stride * LEN) + col)的计算具有非常不同的ALUT用法估算。
有一种方法可以优化索引计算对DSP和RAM块的使用:删除乘法计算,只追踪加法,如以下修改后的内核matrix_square_v3所示。
// 1. preload the data into local memory
// 2. remove the modulus computation
// 3. remove DSP and RAM blocks for index calculation helps reduce the latency
// Optimization 3: drop the row*LEN and stride*LEN multiplies. Keep
// running base offsets (row_i == row*LEN, row_j == col) and advance
// i by 1 and j by LEN inside the dot product, so the index hardware
// needs adders only — no DSP multipliers.
kernel void matrix_square_v3 (global float* restrict A, global float* restrict out)
{
local float cache_a[LEN*LEN];
// preload pass: one global-memory read per element of A
for( unsigned k = 0 ; k < LEN*LEN ; k++ )
{
cache_a[k] = A[k];
}
unsigned row_i = 0;   // row * LEN for the current output row
unsigned row_j = 0;   // current output column
for( unsigned oi = 0 ; oi < LEN*LEN ; oi++ )
{
// initialize i and j for this output element
unsigned i = row_i;
unsigned j = row_j;
float sum = 0.f;
#pragma unroll
for( unsigned stride = 0 ; stride < LEN ; stride++ )
{
// BUGFIX: read first, then step. The original incremented i and j
// BEFORE the loads, so the first product used elements (row, 1)
// and (1*LEN + col) instead of (row, 0) and (0*LEN + col), and the
// last read ran one element past the intended row/column.
sum += cache_a[i] * cache_a[j];
i += 1;   // walks the row:    row*LEN, row*LEN+1, ...
j += LEN; // walks the column: col, LEN+col, 2*LEN+col, ...
}
out[oi] = sum;
// keep a column base counter to know when to increment the row base.
// BUGFIX: advance AFTER the dot product so oi == 0 uses (0, 0); the
// original bumped the counters first, shifting every result by one.
if( row_j == LEN - 1 )
{
row_i += LEN;
row_j = 0;
}
else
{
row_j += 1;
}
}
}
通过删除乘法步骤,您可以减少DSP的使用,如以下面积报告所示。此外,该修改有助于减少延迟。


为了解决循环携带(loop-carried)的依赖关系,请展开(unroll)sum-product(和-积)以获得完整并行度(parallelism),并创建寄存器来避免生成cache_a的多个副本,如以下修改后的内核matrix_square_v4中的代码所示。
// 1. preload the data into local memory
// 2. remove the modulus computation
// 3. remove DSP and RAM blocks for index calculation helps reduce the latency
// 4. unroll the sum-product for full parallelism, create registers to avoid many copies of cache_a
// Optimization 4: gather the row and column operands into the register
// buffers r_buf/c_buf first, then fully unroll the multiply-accumulate
// over registers only; with -fp-relaxed the compiler can balance the
// additions into a dot-product tree instead of a serial chain.
kernel void matrix_square_v4 (global float* restrict A, global float* restrict out)
{
local float cache_a[LEN*LEN];
// preload pass: one global-memory read per element of A
for( unsigned k = 0 ; k < LEN*LEN ; k++ )
{
cache_a[k] = A[k];
}
unsigned row_i = 0;   // row * LEN for the current output row
unsigned row_j = 0;   // current output column
for( unsigned oi = 0 ; oi < LEN*LEN ; oi++ )
{
// initialize i and j for this output element
unsigned i = row_i;
unsigned j = row_j;
float r_buf[LEN];   // row `row` of A, staged in registers
float c_buf[LEN];   // column `col` of A, staged in registers
for( int stride = 0 ; stride < LEN ; stride++ )
{
// BUGFIX: read first, then step. The original pre-incremented i
// and j, skipping the first element of the row and column and
// reading one element past the intended range.
r_buf[stride] = cache_a[i];
c_buf[stride] = cache_a[j];
i += 1;   // next element of the row
j += LEN; // next element of the column
}
// register-only dot product; balanced tree when -fp-relaxed is used
float sum = 0.f;
#pragma unroll
for(unsigned idx = 0; idx < LEN; idx++)
{
sum += r_buf[idx] * c_buf[idx];
}
out[oi] = sum;
// keep a column base counter to know when to increment the row base.
// BUGFIX: advance AFTER computing out[oi] so oi == 0 maps to (0, 0);
// the original updated the counters first, shifting every result.
if( row_j == LEN - 1 )
{
row_i += LEN;
row_j = 0;
}
else
{
row_j += 1;
}
}
}
如以下集群视图结果所示,通过分解计算步骤,您可以获得更高的吞吐量,但以增加面积使用为代价。该修改还会将延迟减少50%。

以下集群视图提供一种替代方法,通过使用-fp-relaxed来显示点积(dot product)而非链式。

以下表格提供全部5个内核版本的资源使用和延迟比较:
内核 | ALUTs | FFs | RAM | MLAB | DSPs | Dot Product Loop Latency |
---|---|---|---|---|---|---|
matrix_square | 81806 (10%) | 302792 (18%) | 1989 (73%) | 408 (1%) | 100 (7%) | 637 |
matrix_square_v1 | 20094 (2%) | 38814 (2%) | 1619 (60%) | 248 (1%) | 100 (7%) | 380 |
matrix_square_v2 | 15487 (2%) | 51813 (3%) | 1110 (41%) | 298 (1%) | 100 (7%) | 364 |
matrix_square_v3 | 18279 (2%) | 37554 (2%) | 1618 (60%) | 244 (1%) | 100 (7%) | 362 |
matrix_square_v4 (-fp-relaxed) |
9681 (1%) | 22409 (1%) | 257 (9%) | 67 (0%) | 103 (7%) | 37 |