volatile register uint32_t *pItem = (uint32_t *)mRMT_mem_ptr;
for (register int i = 0; i < PULSES_PER_FILL/8; i++) {
if (mCur < mSize) {
register uint32_t tmp1, tmp2, tmp3, tmp4;
#if 1
register uint8_t pData = mPixelData[mCur];
register rmt_item32_t *bitTablePtr = &bitTable[0][0];
#if 1
// Scalar path: expand one byte (p) into eight 32-bit words of RMT memory.
// Each nibble of p selects a 16-byte (four-word) row of the table; the
// upper nibble's row is written first, then the lower nibble's row.
// (Presumably one table word per data bit -- confirm against bitTable.)
//
// All four loads of a row are issued before the first store. This is a
// slight speedup: the stores do not have to wait out the load stall
// cycles, because by the time the last load has issued the first loaded
// value should be ready. Stores aren't as critical due to a write buffer,
// but you can't immediately store after loading without losing
// a couple of cycles due to pipeline stalls.
__asm__ __volatile__(
// Get upper nibble of color and multiply it by 16 to get its byte
// offset in the table: (p >> 4) << 4.
// NOTE(review): this does not mask bits 8..31 of the register holding
// p; it relies on them being zero for the uint8_t operand -- confirm.
" srli %[tmp], %[p], 4 \n"
" slli %[tmp], %[tmp], 4 \n"
// Add the table base to the offset; tmp now holds the row address
" add.n %[tmp], %[tmp], %[bitTable] \n"
// Load 4 words from table
" l32i %[tmp1], %[tmp],0 \n"
" l32i %[tmp2], %[tmp],4 \n"
" l32i %[tmp3], %[tmp],8 \n"
" l32i %[tmp4], %[tmp],12 \n"
// Store 4 words to RMT memory (byte offsets 0x00..0x0c)
" s32i %[tmp1], %[pRmtMem], 0x0 \n"
" s32i %[tmp2], %[pRmtMem], 0x4 \n"
" s32i %[tmp3], %[pRmtMem], 0x8 \n"
" s32i %[tmp4], %[pRmtMem], 0xc \n"
// Take the lower nibble (extract 4 bits starting at bit 0) and
// multiply it by 16 to get its byte offset in the table
" extui %[tmp], %[p], 0, 4 \n"
" slli %[tmp], %[tmp], 4 \n"
// Add the table base to the offset; tmp now holds the row address
" add.n %[tmp], %[tmp], %[bitTable] \n"
// Load 4 words from table
" l32i %[tmp1], %[tmp],0 \n"
" l32i %[tmp2], %[tmp],4 \n"
" l32i %[tmp3], %[tmp],8 \n"
" l32i %[tmp4], %[tmp],12 \n"
// Store 4 words to RMT memory (byte offsets 0x10..0x1c)
" s32i %[tmp1], %[pRmtMem], 0x10 \n"
" s32i %[tmp2], %[pRmtMem], 0x14 \n"
" s32i %[tmp3], %[pRmtMem], 0x18 \n"
" s32i %[tmp4], %[pRmtMem], 0x1c \n"
// Advance the RMT memory pointer past the 8 words (0x20 bytes) just
// written
" addi.n %[pRmtMem], %[pRmtMem], 0x20\n"
// Memory barrier: the stores above are performed before any later
// memory access
" memw \n"
// Outputs: tmp and tmp1..tmp4 are early-clobber scratch registers, so
// they never share a register with an input. pRmtMem is read-write and
// bound to pItem, so the advanced pointer is carried out of the asm.
: [tmp] "=&r"(tmp), [pRmtMem] "+r"(pItem),
[tmp1] "=&r"(tmp1),[tmp2] "=&r"(tmp2),
[tmp3] "=&r"(tmp3), [tmp4] "=&r"(tmp4)
// Inputs: table base address and the byte being expanded.
: [bitTable] "r"(bitTablePtr), [p] "r"(pData)
// NOTE(review): the clobber list is empty. The asm reads the table and
// writes through pRmtMem, but declares no memory clobber, so the
// compiler is not told about those accesses -- confirm it cannot move
// surrounding memory operations across this statement.
: );
#else
// Disabled alternative: this attempts to do the same thing as the scalar
// path but with the 128-bit vector load instructions, fetching each
// 16-byte table row with a single ee.vld.128.ip.
// Known broken: it crashes when the ee.vld.128.ip instruction executes.
// The cause is unknown; alignment has been ruled out, yet the ee
// instruction still faults on the address.
__asm__ __volatile__(
// Row address for the upper nibble: ((p >> 4) << 4) + table base
" srli %[tmp], %[p], 4 \n"
" slli %[tmp], %[tmp], 4 \n"
" add.n %[tmp], %[tmp], %[bitTable] \n"
// a15 receives the row address and a14 the first word of the row.
// Neither is read again in this asm -- presumably left over from
// debugging the crash (confirm before removing).
" mov.n a15, %[tmp] \n"
" l32i a14, %[tmp], 0 \n"
// Vector-load the upper nibble's row into q0 (post-increment of 0)
" ee.vld.128.ip q0,%[tmp],0 \n"
// Row address for the lower nibble: ((p & 0xf) << 4) + table base
" extui %[tmp], %[p], 0, 4 \n"
" slli %[tmp], %[tmp], 4 \n"
" add.n %[tmp], %[tmp], %[bitTable] \n"
// Vector-load the lower nibble's row into q1
" ee.vld.128.ip q1,%[tmp],0 \n"
// Move each 32-bit lane of q0 through tmp into RMT memory.
// NOTE(review): lanes are written in the order 3,2,1,0, whereas the
// scalar path writes table words 0,1,2,3. If lane 0 holds the
// lowest-addressed word, the output order differs -- confirm.
" ee.movi.32.a q0, %[tmp], 3 \n"
" s32i %[tmp], %[pRmtMem], 0x0 \n"
" ee.movi.32.a q0, %[tmp], 2 \n"
" s32i %[tmp], %[pRmtMem], 0x4 \n"
" ee.movi.32.a q0, %[tmp], 1 \n"
" s32i %[tmp], %[pRmtMem], 0x8 \n"
" ee.movi.32.a q0, %[tmp], 0 \n"
" s32i %[tmp], %[pRmtMem], 0xc \n"
// Same for q1, written to byte offsets 0x10..0x1c
" ee.movi.32.a q1, %[tmp], 3 \n"
" s32i %[tmp], %[pRmtMem], 0x10 \n"
" ee.movi.32.a q1, %[tmp], 2 \n"
" s32i %[tmp], %[pRmtMem], 0x14 \n"
" ee.movi.32.a q1, %[tmp], 1 \n"
" s32i %[tmp], %[pRmtMem], 0x18 \n"
" ee.movi.32.a q1, %[tmp], 0 \n"
" s32i %[tmp], %[pRmtMem], 0x1c \n"
// Advance the RMT memory pointer by 0x20 bytes, then memory barrier
" addi %[pRmtMem],%[pRmtMem], 0x20 \n"
" memw \n"
// Outputs: tmp is an early-clobber scratch register; pRmtMem is
// read-write and bound to pItem.
: [tmp] "=&r"(tmp), [pRmtMem] "+r"(pItem)
// Inputs: table base address and the byte being expanded.
: [bitTable] "r"(bitTablePtr), [p] "r"(pData)
// Clobbers: the two hard-coded address registers used above.
// NOTE(review): q0 and q1 are overwritten but not declared here, and
// there is no memory clobber -- confirm whether either matters.
: "a14", "a15");
#endif
mCur++;