========================================================
AN INTERNAL KERNEL BUILD ERROR OCCURRED!
device name = gfx1010
error = -11
memory pattern = Cached global memory based trmm, computing kernel generator
Subproblem dimensions: dims[0].itemY = 32, dims[0].itemX = 8, dims[0].y = 32, dims[0].x = 8, dims[0].bwidth = 32; ; dims[1].itemY = 4, dims[1].itemX = 4, dims[1].y = 4, dims[1].x = 4, dims[1].bwidth = 8; ;
Parallelism granularity: pgran->wgDim = 1, pgran->wgSize[0] = 64, pgran->wgSize[1] = 1, pgran->wfSize = 64
Kernel extra flags: 942700465
Source:
typedef union GPtr {
__global float *f;
__global float2 *f2v;
__global float4 *f4v;
__global float8 *f8v;
__global float16 *f16v;
} GPtr;
typedef union LPtr {
__local float *f;
__local float2 *f2v;
__local float4 *f4v;
__local float8 *f8v;
__local float16 *f16v;
} LPtr;
typedef union PPtr {
float *f;
float2 *f2v;
float4 *f4v;
float8 *f8v;
float16 *f16v;
} PPtr;
__attribute__((reqd_work_group_size(64, 1, 1)))
void __kernel
strmmSubgroup(
uint M,
uint N,
float alpha,
const __global float *restrict A,
uint lda,
const __global float *restrict B,
__global float *C,
uint ldb)
{
float8 a0, a1, a2, a3;
float8 b0, b1, b2, b3;
float4 c0, c1, c2, c3;
uint currM, currN;
uint4 coord = 0; /* contains coordB, coordA, k */
const int lid = get_local_id(0);
const int gid = get_global_id(0) / 64;
int2 itemId;
int2 subgCoord;
itemId.x = get_local_id(0)%4;
itemId.y = get_local_id(0)/4;
subgCoord.x = itemId.y/8;
subgCoord.y = itemId.y%8;
currN = gid * 8;
currM = (M - 1) / 32 * 32;
GPtr Ag = {A};
GPtr Bg = {B};
coord.x = currN + subgCoord.x*4;
for (uint m0 = 0; m0 < M; m0 += 32) {
uint kBegin = 0;
coord.z = kBegin;
coord.y = currM + subgCoord.y*4;
c0 = 0;
c1 = 0;
c2 = 0;
c3 = 0;
if ((coord.x < N) && (coord.y < M)) {
if (coord.y + 4 > M) {
coord.y -= 4 - M % 4;
}
if (coord.x + 4 > N) {
coord.x -= 4 - N % 4;
}
uint k0;
uint kMax;
kMax = currM - currM%8;
for( k0 = 0; k0 < kMax; k0 += 32 ) {
coord.z=(k0+itemId.x*8+64*gid)%kMax;
/* -- Tiles multiplier -- */
const uint4 bx = {mad24(coord.x % N, ldb, 0u), mad24((coord.x + 1) % N, ldb, 0u), mad24((coord.x + 2) % N, ldb, 0u),
mad24((coord.x + 3) % N, ldb, 0u)};
const uint8 bk = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + coord.z) % M;
b0.s0 = (Bg).f[bx.s0 + bk.s0];
b0.s1 = (Bg).f[bx.s0 + bk.s1];
b0.s2 = (Bg).f[bx.s0 + bk.s2];
b0.s3 = (Bg).f[bx.s0 + bk.s3];
b0.s4 = (Bg).f[bx.s0 + bk.s4];
b0.s5 = (Bg).f[bx.s0 + bk.s5];
b0.s6 = (Bg).f[bx.s0 + bk.s6];
b0.s7 = (Bg).f[bx.s0 + bk.s7];
b1.s0 = (Bg).f[bx.s1 + bk.s0];
b1.s1 = (Bg).f[bx.s1 + bk.s1];
b1.s2 = (Bg).f[bx.s1 + bk.s2];
b1.s3 = (Bg).f[bx.s1 + bk.s3];
b1.s4 = (Bg).f[bx.s1 + bk.s4];
b1.s5 = (Bg).f[bx.s1 + bk.s5];
b1.s6 = (Bg).f[bx.s1 + bk.s6];
b1.s7 = (Bg).f[bx.s1 + bk.s7];
b2.s0 = (Bg).f[bx.s2 + bk.s0];
b2.s1 = (Bg).f[bx.s2 + bk.s1];
b2.s2 = (Bg).f[bx.s2 + bk.s2];
b2.s3 = (Bg).f[bx.s2 + bk.s3];
b2.s4 = (Bg).f[bx.s2 + bk.s4];
b2.s5 = (Bg).f[bx.s2 + bk.s5];
b2.s6 = (Bg).f[bx.s2 + bk.s6];
b2.s7 = (Bg).f[bx.s2 + bk.s7];
b3.s0 = (Bg).f[bx.s3 + bk.s0];
b3.s1 = (Bg).f[bx.s3 + bk.s1];
b3.s2 = (Bg).f[bx.s3 + bk.s2];
b3.s3 = (Bg).f[bx.s3 + bk.s3];
b3.s4 = (Bg).f[bx.s3 + bk.s4];
b3.s5 = (Bg).f[bx.s3 + bk.s5];
b3.s6 = (Bg).f[bx.s3 + bk.s6];
b3.s7 = (Bg).f[bx.s3 + bk.s7];
const uint4 ay = {mad24(coord.y % M, lda, 0u), mad24((coord.y + 1) % M, lda, 0u), mad24((coord.y + 2) % M, lda, 0u),
mad24((coord.y + 3) % M, lda, 0u)};
const uint8 ak = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + coord.z) % M;
a0.s0 = (Ag).f[ay.s0 + ak.s0];
a0.s1 = (Ag).f[ay.s0 + ak.s1];
a0.s2 = (Ag).f[ay.s0 + ak.s2];
a0.s3 = (Ag).f[ay.s0 + ak.s3];
a0.s4 = (Ag).f[ay.s0 + ak.s4];
a0.s5 = (Ag).f[ay.s0 + ak.s5];
a0.s6 = (Ag).f[ay.s0 + ak.s6];
a0.s7 = (Ag).f[ay.s0 + ak.s7];
a1.s0 = (Ag).f[ay.s1 + ak.s0];
a1.s1 = (Ag).f[ay.s1 + ak.s1];
a1.s2 = (Ag).f[ay.s1 + ak.s2];
a1.s3 = (Ag).f[ay.s1 + ak.s3];
a1.s4 = (Ag).f[ay.s1 + ak.s4];
a1.s5 = (Ag).f[ay.s1 + ak.s5];
a1.s6 = (Ag).f[ay.s1 + ak.s6];
a1.s7 = (Ag).f[ay.s1 + ak.s7];
a2.s0 = (Ag).f[ay.s2 + ak.s0];
a2.s1 = (Ag).f[ay.s2 + ak.s1];
a2.s2 = (Ag).f[ay.s2 + ak.s2];
a2.s3 = (Ag).f[ay.s2 + ak.s3];
a2.s4 = (Ag).f[ay.s2 + ak.s4];
a2.s5 = (Ag).f[ay.s2 + ak.s5];
a2.s6 = (Ag).f[ay.s2 + ak.s6];
a2.s7 = (Ag).f[ay.s2 + ak.s7];
a3.s0 = (Ag).f[ay.s3 + ak.s0];
a3.s1 = (Ag).f[ay.s3 + ak.s1];
a3.s2 = (Ag).f[ay.s3 + ak.s2];
a3.s3 = (Ag).f[ay.s3 + ak.s3];
a3.s4 = (Ag).f[ay.s3 + ak.s4];
a3.s5 = (Ag).f[ay.s3 + ak.s5];
a3.s6 = (Ag).f[ay.s3 + ak.s6];
a3.s7 = (Ag).f[ay.s3 + ak.s7];
c0.s0 = mad(a0.s0, b0.s0, c0.s0);
c0.s0 = mad(a0.s1, b0.s1, c0.s0);
c0.s0 = mad(a0.s2, b0.s2, c0.s0);
c0.s0 = mad(a0.s3, b0.s3, c0.s0);
c0.s0 = mad(a0.s4, b0.s4, c0.s0);
c0.s0 = mad(a0.s5, b0.s5, c0.s0);
c0.s0 = mad(a0.s6, b0.s6, c0.s0);
c0.s0 = mad(a0.s7, b0.s7, c0.s0);
c1.s0 = mad(a0.s0, b1.s0, c1.s0);
c1.s0 = mad(a0.s1, b1.s1, c1.s0);
c1.s0 = mad(a0.s2, b1.s2, c1.s0);
c1.s0 = mad(a0.s3, b1.s3, c1.s0);
c1.s0 = mad(a0.s4, b1.s4, c1.s0);
c1.s0 = mad(a0.s5, b1.s5, c1.s0);
c1.s0 = mad(a0.s6, b1.s6, c1.s0);
c1.s0 = mad(a0.s7, b1.s7, c1.s0);
c2.s0 = mad(a0.s0, b2.s0, c2.s0);
c2.s0 = mad(a0.s1, b2.s1, c2.s0);
c2.s0 = mad(a0.s2, b2.s2, c2.s0);
c2.s0 = mad(a0.s3, b2.s3, c2.s0);
c2.s0 = mad(a0.s4, b2.s4, c2.s0);
c2.s0 = mad(a0.s5, b2.s5, c2.s0);
c2.s0 = mad(a0.s6, b2.s6, c2.s0);
c2.s0 = mad(a0.s7, b2.s7, c2.s0);
c3.s0 = mad(a0.s0, b3.s0, c3.s0);
c3.s0 = mad(a0.s1, b3.s1, c3.s0);
c3.s0 = mad(a0.s2, b3.s2, c3.s0);
c3.s0 = mad(a0.s3, b3.s3, c3.s0);
c3.s0 = mad(a0.s4, b3.s4, c3.s0);
c3.s0 = mad(a0.s5, b3.s5, c3.s0);
c3.s0 = mad(a0.s6, b3.s6, c3.s0);
c3.s0 = mad(a0.s7, b3.s7, c3.s0);
c0.s1 = mad(a1.s0, b0.s0, c0.s1);
c0.s1 = mad(a1.s1, b0.s1, c0.s1);
c0.s1 = mad(a1.s2, b0.s2, c0.s1);
c0.s1 = mad(a1.s3, b0.s3, c0.s1);
c0.s1 = mad(a1.s4, b0.s4, c0.s1);
c0.s1 = mad(a1.s5, b0.s5, c0.s1);
c0.s1 = mad(a1.s6, b0.s6, c0.s1);
c0.s1 = mad(a1.s7, b0.s7, c0.s1);
c1.s1 = mad(a1.s0, b1.s0, c1.s1);
c1.s1 = mad(a1.s1, b1.s1, c1.s1);
c1.s1 = mad(a1.s2, b1.s2, c1.s1);
c1.s1 = mad(a1.s3, b1.s3, c1.s1);
c1.s1 = mad(a1.s4, b1.s4, c1.s1);
c1.s1 = mad(a1.s5, b1.s5, c1.s1);
c1.s1 = mad(a1.s6, b1.s6, c1.s1);
c1.s1 = mad(a1.s7, b1.s7, c1.s1);
c2.s1 = mad(a1.s0, b2.s0, c2.s1);
c2.s1 = mad(a1.s1, b2.s1, c2.s1);
c2.s1 = mad(a1.s2, b2.s2, c2.s1);
c2.s1 = mad(a1.s3, b2.s3, c2.s1);
c2.s1 = mad(a1.s4, b2.s4, c2.s1);
c2.s1 = mad(a1.s5, b2.s5, c2.s1);
c2.s1 = mad(a1.s6, b2.s6, c2.s1);
c2.s1 = mad(a1.s7, b2.s7, c2.s1);
c3.s1 = mad(a1.s0, b3.s0, c3.s1);
c3.s1 = mad(a1.s1, b3.s1, c3.s1);
c3.s1 = mad(a1.s2, b3.s2, c3.s1);
c3.s1 = mad(a1.s3, b3.s3, c3.s1);
c3.s1 = mad(a1.s4, b3.s4, c3.s1);
c3.s1 = mad(a1.s5, b3.s5, c3.s1);
c3.s1 = mad(a1.s6, b3.s6, c3.s1);
c3.s1 = mad(a1.s7, b3.s7, c3.s1);
c0.s2 = mad(a2.s0, b0.s0, c0.s2);
c0.s2 = mad(a2.s1, b0.s1, c0.s2);
c0.s2 = mad(a2.s2, b0.s2, c0.s2);
c0.s2 = mad(a2.s3, b0.s3, c0.s2);
c0.s2 = mad(a2.s4, b0.s4, c0.s2);
c0.s2 = mad(a2.s5, b0.s5, c0.s2);
c0.s2 = mad(a2.s6, b0.s6, c0.s2);
c0.s2 = mad(a2.s7, b0.s7, c0.s2);
c1.s2 = mad(a2.s0, b1.s0, c1.s2);
c1.s2 = mad(a2.s1, b1.s1, c1.s2);
c1.s2 = mad(a2.s2, b1.s2, c1.s2);
c1.s2 = mad(a2.s3, b1.s3, c1.s2);
c1.s2 = mad(a2.s4, b1.s4, c1.s2);
c1.s2 = mad(a2.s5, b1.s5, c1.s2);
c1.s2 = mad(a2.s6, b1.s6, c1.s2);
c1.s2 = mad(a2.s7, b1.s7, c1.s2);
c2.s2 = mad(a2.s0, b2.s0, c2.s2);
c2.s2 = mad(a2.s1, b2.s1, c2.s2);
c2.s2 = mad(a2.s2, b2.s2, c2.s2);
c2.s2 = mad(a2.s3, b2.s3, c2.s2);
c2.s2 = mad(a2.s4, b2.s4, c2.s2);
c2.s2 = mad(a2.s5, b2.s5, c2.s2);
c2.s2 = mad(a2.s6, b2.s6, c2.s2);
c2.s2 = mad(a2.s7, b2.s7, c2.s2);
c3.s2 = mad(a2.s0, b3.s0, c3.s2);
c3.s2 = mad(a2.s1, b3.s1, c3.s2);
c3.s2 = mad(a2.s2, b3.s2, c3.s2);
c3.s2 = mad(a2.s3, b3.s3, c3.s2);
c3.s2 = mad(a2.s4, b3.s4, c3.s2);
c3.s2 = mad(a2.s5, b3.s5, c3.s2);
c3.s2 = mad(a2.s6, b3.s6, c3.s2);
c3.s2 = mad(a2.s7, b3.s7, c3.s2);
c0.s3 = mad(a3.s0, b0.s0, c0.s3);
c0.s3 = mad(a3.s1, b0.s1, c0.s3);
c0.s3 = mad(a3.s2, b0.s2, c0.s3);
c0.s3 = mad(a3.s3, b0.s3, c0.s3);
c0.s3 = mad(a3.s4, b0.s4, c0.s3);
c0.s3 = mad(a3.s5, b0.s5, c0.s3);
c0.s3 = mad(a3.s6, b0.s6, c0.s3);
c0.s3 = mad(a3.s7, b0.s7, c0.s3);
c1.s3 = mad(a3.s0, b1.s0, c1.s3);
c1.s3 = mad(a3.s1, b1.s1, c1.s3);
c1.s3 = mad(a3.s2, b1.s2, c1.s3);
c1.s3 = mad(a3.s3, b1.s3, c1.s3);
c1.s3 = mad(a3.s4, b1.s4, c1.s3);
c1.s3 = mad(a3.s5, b1.s5, c1.s3);
c1.s3 = mad(a3.s6, b1.s6, c1.s3);
c1.s3 = mad(a3.s7, b1.s7, c1.s3);
c2.s3 = mad(a3.s0, b2.s0, c2.s3);
c2.s3 = mad(a3.s1, b2.s1, c2.s3);
c2.s3 = mad(a3.s2, b2.s2, c2.s3);
c2.s3 = mad(a3.s3, b2.s3, c2.s3);
c2.s3 = mad(a3.s4, b2.s4, c2.s3);
c2.s3 = mad(a3.s5, b2.s5, c2.s3);
c2.s3 = mad(a3.s6, b2.s6, c2.s3);
c2.s3 = mad(a3.s7, b2.s7, c2.s3);
c3.s3 = mad(a3.s0, b3.s0, c3.s3);
c3.s3 = mad(a3.s1, b3.s1, c3.s3);
c3.s3 = mad(a3.s2, b3.s2, c3.s3);
c3.s3 = mad(a3.s3, b3.s3, c3.s3);
c3.s3 = mad(a3.s4, b3.s4, c3.s3);
c3.s3 = mad(a3.s5, b3.s5, c3.s3);
c3.s3 = mad(a3.s6, b3.s6, c3.s3);
c3.s3 = mad(a3.s7, b3.s7, c3.s3);
/* ---------------------- */
}
if( itemId.x == 0 ) {
for( k0 = kMax; (k0 < currM+32)&&(k0 < M); k0 += 1 ) {
coord.z=k0;
/* -- Tiles multiplier -- */
const uint bk = coord.z % M;
b0.s0 = (Bg).f[mad24(coord.x % N, ldb, bk)];
b1.s0 = (Bg).f[mad24((coord.x + 1) % N, ldb, bk)];
b2.s0 = (Bg).f[mad24((coord.x + 2) % N, ldb, bk)];
b3.s0 = (Bg).f[mad24((coord.x + 3) % N, ldb, bk)];
b0.s0 = (coord.z < M) ? b0.s0 : 0;
b1.s0 = (coord.z < M) ? b1.s0 : 0;
b2.s0 = (coord.z < M) ? b2.s0 : 0;
b3.s0 = (coord.z < M) ? b3.s0 : 0;
const uint ak = coord.z % M;
a0.s0 = (Ag).f[mad24(coord.y % M, lda, ak)];
a1.s0 = (Ag).f[mad24((coord.y + 1) % M, lda, ak)];
a2.s0 = (Ag).f[mad24((coord.y + 2) % M, lda, ak)];
a3.s0 = (Ag).f[mad24((coord.y + 3) % M, lda, ak)];
a0.s0 = (coord.z < M) ? a0.s0 : 0;
a1.s0 = (coord.z < M) ? a1.s0 : 0;
a2.s0 = (coord.z < M) ? a2.s0 : 0;
a3.s0 = (coord.z < M) ? a3.s0 : 0;
// post fetch A
{
uint zy = coord.y;
a0.s0 = zy < coord.z ? 0 : a0.s0;
a0.s0 = zy == coord.z ? 1 : a0.s0;
zy++;
a1.s0 = zy < coord.z ? 0 : a1.s0;
a1.s0 = zy == coord.z ? 1 : a1.s0;
zy++;
a2.s0 = zy < coord.z ? 0 : a2.s0;
a2.s0 = zy == coord.z ? 1 : a2.s0;
zy++;
a3.s0 = zy < coord.z ? 0 : a3.s0;
a3.s0 = zy == coord.z ? 1 : a3.s0;
}
c0.s0 = mad(a0.s0, b0.s0, c0.s0);
c1.s0 = mad(a0.s0, b1.s0, c1.s0);
c2.s0 = mad(a0.s0, b2.s0, c2.s0);
c3.s0 = mad(a0.s0, b3.s0, c3.s0);
c0.s1 = mad(a1.s0, b0.s0, c0.s1);
c1.s1 = mad(a1.s0, b1.s0, c1.s1);
c2.s1 = mad(a1.s0, b2.s0, c2.s1);
c3.s1 = mad(a1.s0, b3.s0, c3.s1);
c0.s2 = mad(a2.s0, b0.s0, c0.s2);
c1.s2 = mad(a2.s0, b1.s0, c1.s2);
c2.s2 = mad(a2.s0, b2.s0, c2.s2);
c3.s2 = mad(a2.s0, b3.s0, c3.s2);
c0.s3 = mad(a3.s0, b0.s0, c0.s3);
c1.s3 = mad(a3.s0, b1.s0, c1.s3);
c2.s3 = mad(a3.s0, b2.s0, c2.s3);
c3.s3 = mad(a3.s0, b3.s0, c3.s3);
/* ---------------------- */
}
}
}
barrier(CLK_GLOBAL_MEM_FENCE);
if ((coord.y + 4 == M) && (M % 4)) {
coord.y += 4 - M % 4;
}
if ((coord.x + 4 == N) && (N % 4)) {
coord.x += 4 - N % 4;
}
//-----MergeUpdateResult
// veclenC scratch[SUBG_ITEMS*MSTEP_SUBG*vecNumC]
__local float4 ascratch[4*16*4];
__local float4 *scratch = ascratch;
//LDS block has the same vectorization as C matrix block
//VNUM_C*((get_local_id(1)%MSTEP_SUBG)*SUBG_ITEMS +get_local_id(0) );
scratch += 4*((itemId.y%16)*4 +itemId.x );
for( uint mstep = 0; mstep < 16; mstep += 16 ) {
if( (itemId.y >= mstep)&&(itemId.y < (mstep+16)) ) {
scratch[0] = c0;
scratch[1] = c1;
scratch[2] = c2;
scratch[3] = c3;
c0 = 0;
c1 = 0;
c2 = 0;
c3 = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
if( (itemId.y >= mstep)&&(itemId.y < (mstep+16)) ) {
if ( 0 == itemId.x ) {
for(uint k = 0; k < 4 * 4; k += 4) {
c0 += scratch[0];
c1 += scratch[1];
c2 += scratch[2];
c3 += scratch[3];
//Adding the LDS block size in vectors
scratch += 4;
}
if ((coord.y < M) && (coord.x < N)) {
uint y = min(4u, M - (uint)coord.y);
uint x = min(4u, N - (uint)coord.x);
if ((y == 4) && (x == 4)) {
GPtr uC;
uC.f = C + coord.x * ldb + coord.y;
__global float *pC = uC.f;
float4 tempC0, tempC1, tempC2, tempC3;
tempC0 = mad(c0, alpha, 0);
tempC1 = mad(c1, alpha, 0);
tempC2 = mad(c2, alpha, 0);
tempC3 = mad(c3, alpha, 0);
pC[0] = tempC0.s0;
pC[1] = tempC0.s1;
pC[2] = tempC0.s2;
pC[3] = tempC0.s3;
pC[ldb] = tempC1.s0;
pC[ldb + 1] = tempC1.s1;
pC[ldb + 2] = tempC1.s2;
pC[ldb + 3] = tempC1.s3;
pC[(ldb << 1)] = tempC2.s0;
pC[mad24(2u, ldb, 1u)] = tempC2.s1;
pC[mad24(2u, ldb, 2u)] = tempC2.s2;
pC[mad24(2u, ldb, 3u)] = tempC2.s3;
pC[mad24(3u, ldb, 0u)] = tempC3.s0;
pC[mad24(3u, ldb, 1u)] = tempC3.s1;
pC[mad24(3u, ldb, 2u)] = tempC3.s2;
pC[mad24(3u, ldb, 3u)] = tempC3.s3;
}
else {
GPtr uC;
int i, j;
PPtr res;
uC.f = C + coord.x * ldb + coord.y;
uC.f += (x-1) * ldb;
if (x) {
switch (y) {
case 4:
uC.f[(y+0) % 4] = c3.s0 * alpha;
case 3:
uC.f[(y+1) % 4] = c3.s1 * alpha;
case 2:
uC.f[(y+2) % 4] = c3.s2 * alpha;
case 1:
uC.f[(y+3) % 4] = c3.s3 * alpha;
}
uC.f -= ldb;
x--;
}
if (x) {
switch (y) {
case 4:
uC.f[(y+0) % 4] = c2.s0 * alpha;
case 3:
uC.f[(y+1) % 4] = c2.s1 * alpha;
case 2:
uC.f[(y+2) % 4] = c2.s2 * alpha;
case 1:
uC.f[(y+3) % 4] = c2.s3 * alpha;
}
uC.f -= ldb;
x--;
}
if (x) {
switch (y) {
case 4:
uC.f[(y+0) % 4] = c1.s0 * alpha;
case 3:
uC.f[(y+1) % 4] = c1.s1 * alpha;
case 2:
uC.f[(y+2) % 4] = c1.s2 * alpha;
case 1:
uC.f[(y+3) % 4] = c1.s3 * alpha;
}
uC.f -= ldb;
x--;
}
if (x) {
switch (y) {
case 4:
uC.f[(y+0) % 4] = c0.s0 * alpha;
case 3:
uC.f[(y+1) % 4] = c0.s1 * alpha;
case 2:
uC.f[(y+2) % 4] = c0.s2 * alpha;
case 1:
uC.f[(y+3) % 4] = c0.s3 * alpha;
}
uC.f -= ldb;
x--;
}
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
currM -= 32;
}
}
--------------------------------------------------------
Build log:
/tmp/comgr-a1a18b/input/CompileCLSource:56:16: warning: initializing '__global float *' with an expression of type 'const __global float *restrict' discards qualifiers
GPtr Ag = {A};
^
/tmp/comgr-a1a18b/input/CompileCLSource:57:16: warning: initializing '__global float *' with an expression of type 'const __global float *restrict' discards qualifiers
GPtr Bg = {B};
^
/tmp/comgr-a1a18b/input/CompileCLSource:366:24: error: variables in the local address space can only be declared in the outermost scope of a kernel function
__local float4 ascratch[4*16*4];
^
2 warnings and 1 error generated.
Error: Failed to compile opencl source (from CL to LLVM IR).
========================================================
Segmentation fault (core dumped)
Additionally, several tests fail, e.g.
[ RUN ] ColumnMajor_SmallRange_BigLDA_OffSet/GEMM.zgemm/39
m : 6 n: 63
/home/robin/dev/clBLAS/src/tests/include/matrix.h:472: Failure
The difference between ((ref).s[0]) and ((clresult).s[0]) is 58230133, which exceeds delta, where
((ref).s[0]) evaluates to -270497230451976,
((clresult).s[0]) evaluates to -270497288682109, and
delta evaluates to 0.
clblasColumnMajor, clblasTrans, clblasTrans, M = 128, N = 128, K = 128, offA = 1, offB = 0, offC = 0, lda = 500, ldb = 501, ldc = 502
seed = 12345, queues = 1, [ FAILED ] ColumnMajor_SmallRange_BigLDA_OffSet/GEMM.zgemm/39, where GetParam() = (1, 1, 1, 128, 128, 128, 48-byte object <F4-01 00-00 00-00 00-00 F5-01 00-00 00-00 00-00 F6-01 00-00 00-00 00-00 01-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00>, 1) (9 ms)