- entry 0x403c98d8
- [ 83][D][esp32-hal-cpu.c:244] setCpuFrequencyMhz(): PLL: 480 / 6 = 80 Mhz, APB: 80000000 Hz
- Guru Meditation Error: Core 1 panic'ed (LoadStoreError). Exception was unhandled.
- Core 1 register dump:
- PC : 0x403752b8 PS : 0x00060830 A0 : 0x80375558 A1 : 0x3fce2bc0
- A2 : 0x3fc94088 A3 : 0x40378d40 A4 : 0x00000030 A5 : 0x00000000
- A6 : 0x02ce3644 A7 : 0x00ffffff A8 : 0x60016800 A9 : 0x00000120
- A10 : 0x00000000 A11 : 0x00000000 A12 : 0x40378d40 A13 : 0x00000000
- A14 : 0x0028800a A15 : 0x40378d40 SAR : 0x00000010 EXCCAUSE: 0x00000003
- EXCVADDR: 0x40378d40 LBEG : 0x400570e8 LEND : 0x400570f3 LCOUNT : 0xffffffff
My goal is to speed up the FastLED code by using a lookup table for each nibble rather than doing a comparison for each bit. Benchmarking my current method shows a speedup of over 25%, but I want to see how far I can take it. The ee.vld.128.ip instruction looks ideal, since I need to load 16 bytes of data and then perform four word writes to the RMT memory buffer.
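What am I doing wrong here? The l32i instruction immediately before the failing ee.vld.128.ip accesses the exact same address (0x40378d40, the EXCVADDR in the register dump above) without faulting and loads the expected value.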
I have also added code to set the wur.sar_byte and wur.accx_x registers to 0, but this made no difference, and I don't see why it would be necessary anyway.
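My code up to the point of the crash is shown below: first the inline-assembly source, then the corresponding disassembly (the faulting PC is 0x403752b8), and finally the complete code block: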
- " srli %[tmp], %[p], 4 \n"
- " slli %[tmp], %[tmp], 4 \n"
- " add.n %[tmp], %[tmp], %[bitTable] \n"
- " mov.n a15, %[tmp] \n"
- " l32i a14, %[tmp], 0 \n"
- " ee.vld.128.ip q0,%[tmp],0 \n"
- 403752a2: 72b8 l32i.n a11, a2, 28
- 403752a4: fc6131 l32r a3, 40374428 <_iram_text_start+0x8>
- 403752a7: bbaa add.n a11, a11, a10
- 403752a9: 000bb2 l8ui a11, a11, 0
- 403752ac: 41c4b0 srli a12, a11, 4
- 403752af: 11ccc0 slli a12, a12, 4
- 403752b2: cc3a add.n a12, a12, a3
- 403752b4: 0cfd mov.n a15, a12
- 403752b6: 0ce8 l32i.n a14, a12, 0
- 403752b8: 8300c4 ee.vld.128.ip q0, a12, 0
- register uint8_t pData = mPixelData[mCur];
- register rmt_item32_t *bitTablePtr = &bitTable[0][0];
- __asm__ __volatile__(
- " srli %[tmp], %[p], 4 \n"
- " slli %[tmp], %[tmp], 4 \n"
- " add.n %[tmp], %[tmp], %[bitTable] \n"
- " mov.n a15, %[tmp] \n"
- " l32i a14, %[tmp], 0 \n"
- " ee.vld.128.ip q0,%[tmp],0 \n"
- " extui %[tmp], %[p], 0, 4 \n"
- " slli %[tmp], %[tmp], 4 \n"
- " add.n %[tmp], %[tmp], %[bitTable] \n"
- " ee.vld.128.ip q1,%[tmp],0 \n"
- " ee.movi.32.a q0, %[tmp], 3 \n"
- " s32i %[tmp], %[pRmtMem], 0x0 \n"
- " ee.movi.32.a q0, %[tmp], 2 \n"
- " s32i %[tmp], %[pRmtMem], 0x4 \n"
- " ee.movi.32.a q0, %[tmp], 1 \n"
- " s32i %[tmp], %[pRmtMem], 0x8 \n"
- " ee.movi.32.a q0, %[tmp], 0 \n"
- " s32i %[tmp], %[pRmtMem], 0xc \n"
- " ee.movi.32.a q1, %[tmp], 3 \n"
- " s32i %[tmp], %[pRmtMem], 0x10 \n"
- " ee.movi.32.a q1, %[tmp], 2 \n"
- " s32i %[tmp], %[pRmtMem], 0x14 \n"
- " ee.movi.32.a q1, %[tmp], 1 \n"
- " s32i %[tmp], %[pRmtMem], 0x18 \n"
- " ee.movi.32.a q1, %[tmp], 0 \n"
- " s32i %[tmp], %[pRmtMem], 0x1c \n"
- " addi %[pRmtMem],%[pRmtMem], 0x20 \n"
- " memw \n"
- : [tmp] "=&r"(tmp), [pRmtMem] "+r"(pItem)
- : [bitTable] "r"(bitTablePtr), [p] "r"(pData)
- : "a14", "a15");
- mCur++;
Any help would be appreciated.
-Aaron