-- vecSum: sum of a vector, expressed as a composition pipeline of data-parallel
-- combinators (read right-to-left). The structure visible here:
--   * `split blockSize` partitions the input into blockSize-sized chunks, one
--     per workgroup (`mapWorkgroup`).
--   * Inside each workgroup: `reorder (stride 128)` / `split (blockSize / 128)`
--     feed a first `reduceSeq (+) 0` stage whose result is placed via `toLocal`;
--     a second stage uses `reorder (stride 64)` / `split 2`.
--   * `split 64` then hands 64-element pieces to `mapWarp`; six `mapLane`
--     stages, each `split 2` with strides 32, 16, 8, 4, 2, 1, halve the data
--     per step — the shape of a binary-tree reduction.
--   * `toGlobal (mapLocal (mapSeq id))` writes each workgroup's result out;
--     the outer `reduce (+) 0` combines the per-workgroup partial sums.
-- NOTE(review): the exact semantics of mapWorkgroup/mapWarp/mapLane/toLocal/
-- toGlobal/reorder (which OpenCL memory space and thread hierarchy each maps
-- to) are defined elsewhere — confirm against the combinator library before
-- relying on the description above.
-- NOTE(review): assumes blockSize is divisible by 128 (see `split (blockSize / 128)`).
vecSum = reduce (+) 0 . join . mapWorkgroup ( join . toGlobal (mapLocal (mapSeq id)) . split 1 . join . mapWarp ( join . mapLane (reduceSeq (+) 0) . split 2 . reorder (stride 1) . join . mapLane (reduceSeq (+) 0) . split 2 . reorder (stride 2) . join . mapLane (reduceSeq (+) 0) . split 2 . reorder (stride 4) . join . mapLane (reduceSeq (+) 0) . split 2 . reorder (stride 8) . join . mapLane (reduceSeq (+) 0) . split 2 . reorder (stride 16) . join . mapLane (reduceSeq (+) 0) . split 2 . reorder (stride 32) ) . split 64 . join . mapLocal (reduceSeq (+) 0) . split 2 . reorder (stride 64) . join . toLocal (mapLocal (reduceSeq (+) 0)) . split (blockSize / 128) . reorder (stride 128) ) . split blockSize
/*
 * reduce6: tree-based parallel sum reduction (port of the NVIDIA SDK
 * "reduce6" kernel to OpenCL).
 *
 * Each work-group reduces a strided slice of g_idata into local memory
 * and writes one partial sum to g_odata[group_id]; the host (or a second
 * kernel launch) combines the per-group partial sums.
 *
 * Parameters:
 *   g_idata - input array of n floats (global memory)
 *   g_odata - output array, one partial sum per work-group
 *   n       - number of input elements
 *   l_data  - local scratch buffer, at least WG_SIZE floats; declared
 *             volatile so the unrolled intra-warp steps below re-read
 *             local memory on every access
 *
 * WG_SIZE is a compile-time constant; it is assumed to equal
 * get_local_size(0) (the code mixes both — TODO confirm at call site).
 */
kernel void reduce6(global float* g_idata, global float* g_odata,
                    unsigned int n, local volatile float* l_data) {
    unsigned int tid = get_local_id(0);

    /* Each work-item starts at its slot inside a 2*WG_SIZE-wide tile:
     * every loop iteration below consumes TWO elements per work-item
     * (g_idata[i] and g_idata[i + WG_SIZE]). */
    unsigned int i = get_group_id(0) * (get_local_size(0) * 2) + get_local_id(0);

    /* BUG FIX: the grid stride must advance by 2*WG_SIZE per group per
     * iteration, matching the two loads per work-item above (the NVIDIA
     * reference uses blockSize*2*gridDim.x). The previous stride of
     * WG_SIZE * get_num_groups(0) made consecutive iterations overlap,
     * so the upper half of each tile was summed twice. */
    unsigned int gridSize = WG_SIZE * 2 * get_num_groups(0);

    /* Grid-wide sequential accumulation into local memory. */
    l_data[tid] = 0;
    while (i < n) {
        l_data[tid] += g_idata[i];
        /* Guard the second load: the last tile may be only half-full. */
        if (i + WG_SIZE < n) l_data[tid] += g_idata[i + WG_SIZE];
        i += gridSize;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Tree reduction in local memory; the WG_SIZE tests are compile-time
     * constant, so dead branches are eliminated. */
    if (WG_SIZE >= 256) {
        if (tid < 128) { l_data[tid] += l_data[tid + 128]; }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (WG_SIZE >= 128) {
        if (tid < 64) { l_data[tid] += l_data[tid + 64]; }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (tid < 32) {
        /* Final steps without barriers. NOTE(review): this relies on the
         * first 32 work-items executing in lock-step (warp-synchronous
         * programming) plus the volatile qualifier on l_data; the OpenCL
         * spec does not guarantee lock-step execution on all devices —
         * confirm the target hardware before reusing this kernel. */
        if (WG_SIZE >= 64) { l_data[tid] += l_data[tid + 32]; }
        if (WG_SIZE >= 32) { l_data[tid] += l_data[tid + 16]; }
        if (WG_SIZE >= 16) { l_data[tid] += l_data[tid + 8]; }
        if (WG_SIZE >= 8)  { l_data[tid] += l_data[tid + 4]; }
        if (WG_SIZE >= 4)  { l_data[tid] += l_data[tid + 2]; }
        if (WG_SIZE >= 2)  { l_data[tid] += l_data[tid + 1]; }
    }

    /* One partial sum per work-group. */
    if (tid == 0) g_odata[get_group_id(0)] = l_data[0];
}
-- matMult: multiply matrices A and B. For every row of A, pair it with
-- every column of B (obtained via `transpose B`) and take their dot
-- product, yielding the result matrix row by row.
matMult = λ A B .
  map (λ aRow .
         map (λ bCol . dotProduct aRow bCol)
             (transpose B))
      A