ULP RISCV SPI Simulation Using Bit-banging

Honzik321
Posts: 52
Joined: Mon Apr 06, 2020 11:17 pm
Location: Czech Republic

Re: ULP RISCV SPI Simulation Using Bit-banging

Postby Honzik321 » Sun Feb 02, 2025 10:34 am

I encountered another major problem. Assigning a single byte to an array element takes about 7 µs! Is there a more optimal way to store a 27-byte block faster?

Code: Select all

/* Destination for the 27 bytes clocked in by the loop below. */
uint8_t read_buffer[27] = {0};

		/*
		 * One iteration per received byte. The eight clock/sample steps are
		 * unrolled by hand; each step writes the CLK_PIN bit to the W1TS (set)
		 * register and then to the W1TC (clear) register, reads RTC_GPIO_IN_REG,
		 * masks the DATA_PIN bit and ORs it into byte1.
		 *
		 * The /4, /2, *1, *2 ... *32 factors move the masked bit to positions
		 * 0..7 in turn. NOTE(review): that only works if BIT(DATA_PIN) == 4,
		 * i.e. this version assumes DATA_PIN is 2 - confirm.
		 *
		 * NOTE(review): byte1 is neither declared nor cleared inside this
		 * snippet; because bits are OR-ed in, it presumably has to be reset for
		 * every byte - confirm against the full source.
		 */
		for(int i = 0; i < 27; i++) {
			
			REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
			byte1 |= (uint8_t)(((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN))  / 4);
			REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
			byte1 |= (uint8_t)(((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN))  / 2);
			REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
			byte1 |= (uint8_t)(((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)));
			REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
			byte1 |= (uint8_t)(((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) * 2);
			REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
			byte1 |= (uint8_t)(((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN))  * 4);
			REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
			byte1 |= (uint8_t)(((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) * 8);
			REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
			byte1 |= (uint8_t)(((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) * 16);
			REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
			byte1 |= (uint8_t)(((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) * 32);
			
			/* Store the assembled byte at the loop index. */
			read_buffer[i] = byte1;
		}
Attachments
Without assigning a value to an array.png
Without assigning a value to an array.png (12.13 KiB) Viewed 3997 times
Pause caused by assigning a value to an array.png
Pause caused by assigning a value to an array.png (9.38 KiB) Viewed 3997 times

MicroController
Posts: 2045
Joined: Mon Oct 17, 2022 7:38 pm
Location: Europe, Germany

Re: ULP RISCV SPI Simulation Using Bit-banging

Postby MicroController » Sun Feb 02, 2025 11:54 am

Use one 32-bit store instead of four 8-bit stores.
Also worth trying: Iterating over the array via a pointer.

Honzik321
Posts: 52
Joined: Mon Apr 06, 2020 11:17 pm
Location: Czech Republic

Re: ULP RISCV SPI Simulation Using Bit-banging

Postby Honzik321 » Sun Feb 02, 2025 2:07 pm

This code gives almost the same time for reading 28 bytes of data (even a few microseconds more than the previous code). I have no more ideas :-( I need to read 28 bytes of data under 500 µs, which is very close—right now, it takes about 530 µs.

Code: Select all

	
	/*
	 * Read 28 bytes as seven 32-bit words, so that only one memory store is
	 * issued per four received bytes.
	 * NOTE(review): only elements 0..6 are written below; the 28-element size
	 * looks like a leftover from the byte-array version - confirm before
	 * shrinking it.
	 */
	uint32_t read_buffer1[28] = {0};

	for(uint8_t i = 0; i < 7; i++) {

		/* Accumulates four received bytes; byte j lands in bits [8*j+7 : 8*j]. */
		uint32_t byte4 = 0;

		for(uint8_t j = 0; j < 4; j++) {

			uint8_t byte1 = 0;

			/*
			 * Eight hand-unrolled clock/sample steps: write the CLK_PIN bit to
			 * W1TS then W1TC, then read the input register and mask DATA_PIN.
			 * The first sample goes to bit 0, the last to bit 7.
			 * NOTE(review): the plain left shifts assume BIT(DATA_PIN) == 1,
			 * i.e. DATA_PIN is 0 - confirm.
			 */
			REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
			byte1 |= (uint8_t)((((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) );
			REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
			byte1 |= (uint8_t)((((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 1);
			REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
			byte1 |= (uint8_t)((((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 2);
			REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
			byte1 |= (uint8_t)((((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 3);
			REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
			byte1 |= (uint8_t)((((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 4);
			REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
			byte1 |= (uint8_t)((((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 5);
			REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
			byte1 |= (uint8_t)((((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 6);
			REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
			byte1 |= (uint8_t)((((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 7);

			/* Place byte j of this group into its lane of the 32-bit word. */
			byte4 |= (((uint32_t)byte1) << (j * 8));
		}

		/*
		 * Store into the array declared above. The earlier version wrote
		 * through `read_buffer`, which is not declared in this snippet.
		 */
		read_buffer1[i] = byte4;
	}

MicroController
Posts: 2045
Joined: Mon Oct 17, 2022 7:38 pm
Location: Europe, Germany

Re: ULP RISCV SPI Simulation Using Bit-banging

Postby MicroController » Sun Feb 02, 2025 6:33 pm

The fastest way to iterate over an array "via a pointer" is usually:

Code: Select all

/* Element count derived from the array itself, so it follows the declaration.
 * (The stray closing parenthesis after the sizeof expression was removed.) */
static const unsigned int MY_ARRAY_NUM_ELEMENTS = sizeof(my_array)/sizeof(my_array[0]);

uint32_t* ptr = &my_array[0];
/* One-past-the-end pointer; used only for the loop comparison, never dereferenced. */
uint32_t* const end = ptr + MY_ARRAY_NUM_ELEMENTS;

/* do/while: the body runs before the first bounds check. */
do {
  ...
  *ptr = ...;
  ptr += 1;
} while (ptr < end);
(Though gcc sometimes decides to "optimize" this back to a for-loop with an explicit index.)
Unrolling the loop and using literals for subscripts is still faster.

And collecting (up to) 32 bits is fastest like

Code: Select all

uint32_t bits = 0;
do {
...
  bits = (bits << 1) | nextBit;
...
} while(...);

Honzik321
Posts: 52
Joined: Mon Apr 06, 2020 11:17 pm
Location: Czech Republic

Re: ULP RISCV SPI Simulation Using Bit-banging

Postby Honzik321 » Tue Feb 11, 2025 7:39 pm

Thank you for the tips! I tested all possibilities and combinations of data reading by uint8_t, uint16_t, and uint32_t types. Unfortunately, I couldn't achieve faster reading. Using a 32-bit array can speed up the process by approximately 10%, but at the cost of irregular bit lengths. I decided to stick with a uint8_t array and load data in 8-bit chunks. There is always a gap of approximately 9 µs between individual bytes, in which case the CLK for each byte is regular with a frequency of approximately 600 kHz.

DrMickeyLauer
Posts: 179
Joined: Sun May 22, 2022 2:42 pm

Re: ULP RISCV SPI Simulation Using Bit-banging

Postby DrMickeyLauer » Wed Feb 12, 2025 1:02 pm

Given the immense constraints of the RISCV ULP, if this works, I think this is quite an achievement! Is your code published anywhere? I'd like to take a look.

ESP_Sprite
Posts: 9985
Joined: Thu Nov 26, 2015 4:08 am

Re: ULP RISCV SPI Simulation Using Bit-banging

Postby ESP_Sprite » Thu Feb 13, 2025 1:58 am

At this level, you'd probably want to look at the assembly output as well; it will probably give you some more clues on how to optimize things.

Honzik321
Posts: 52
Joined: Mon Apr 06, 2020 11:17 pm
Location: Czech Republic

Re: ULP RISCV SPI Simulation Using Bit-banging

Postby Honzik321 » Sun Feb 23, 2025 10:41 am

Sorry for the late reply. At the moment, the achieved SPI transfer speed using bit-banging is sufficient for reading data from two SPI slave devices every 2 ms (500 samples per second). My original goal was 1 kSPS, but this will probably not be achievable.

Below, I am sending the complete code for the ULP coprocessor (created from the example project ULP => interrupts). I shift the bits immediately when reading a byte, and the DATA (MISO) line is on pin 0, as this solution gave me the best results. It also didn’t matter whether I used a pointer or assigned data to an array using an index. The optimization level didn't matter either; `-Os` or `-O2` gives the same result. The result is reading from two SPI slave devices into two ping-pong buffers of 1350 bytes each, allowing one buffer to be processed while the other is being filled with data.

/*
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Unlicense OR CC0-1.0
*/
/* ULP RISC-V interrupts example

This example code is in the Public Domain (or CC0 licensed, at your option.)

Unless required by applicable law or agreed to in writing, this
software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS OF ANY KIND, either express or implied.

This code runs on ULP RISC-V coprocessor
*/

Code: Select all

#include "ulp_riscv_utils.h"
#include "ulp_riscv_gpio.h"

/* Pin numbers passed to the ulp_riscv_gpio_* calls and used as BIT() positions
 * in the RTC GPIO register accesses below. */
#define DRDY_PIN 6   /* input; its falling edge triggers gpio_int_handler */
#define CLK_PIN 2    /* output; pulsed once per sampled bit */
#define DATA_PIN 0   /* input; sampled from RTC_GPIO_IN_REG */
#define CS1_PIN 3    /* output; chip select for device 1 */
#define CS2_PIN 4    /* output; chip select for device 2 */

/* Two equally sized buffers filled alternately by gpio_int_handler.
 * Each interrupt appends 2 x 27 = 54 bytes, so 1350 bytes hold 25 interrupts. */
uint8_t read_buffer1[1350] = {0};
uint8_t read_buffer2[1350] = {0};
// uint8_t *p_buffer = read_buffer1;
// uint16_t buf_i = 0;

/* Set to true by gpio_int_handler each time a buffer has been filled.
 * NOTE(review): nothing in this file clears it; presumably the main CPU does - confirm. */
volatile bool buffer_ready = false;
/* Points at the buffer that was most recently filled (NULL until the first one is complete). */
volatile uint8_t *p_buffer_global = NULL;

/* SW Interrupt Handler */
//void sw_int_handler(void *arg)
//{
//    sw_int_cnt++;
//}

/*
 * GPIO Interrupt Handler
 *
 * Registered in main() for a negative edge on DRDY_PIN. Each invocation:
 *   1. writes the CS1 bit to the W1TC register, clocks in 27 bytes, then
 *      writes the CS1 bit to the W1TS register;
 *   2. does the same for CS2;
 *   3. appends all 54 bytes to the buffer currently pointed to by p_buffer.
 * Once buf_i reaches 1350 the filled buffer is published through
 * p_buffer_global, p_buffer is switched to the other buffer, buffer_ready is
 * set and the main processor is woken.
 *
 * Bits are assembled MSB first: the first sample of a byte is shifted to bit 7
 * and the last one to bit 0.
 *
 * arg: unused.
 *
 * NOTE(review): the fill level is checked only after both devices have been
 * read, so this relies on 1350 being a multiple of 54 to stay inside the
 * buffer.
 */
void gpio_int_handler(void *arg)
{
	// static uint8_t read_buffer1[1350] = {0};
	// static uint8_t read_buffer2[1350] = {0};
	/* Buffer currently being filled, and the next write position within it;
	 * both persist across interrupts. */
	static uint8_t *p_buffer = read_buffer1;
	static uint16_t buf_i = 0;
	
	/* * * * * * * * * * * * * * * * * * * * * * DEVICE ONE  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
	
	// Activate Device 1
	REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CS1_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));

	for(uint8_t i = 0; i < 27; i++) {
		
		uint8_t byte1 = 0;
		byte1 = 0; /* repeats the initializer above */
		/* Eight hand-unrolled steps: write CLK bit to W1TS then W1TC, then
		 * read the input register, mask DATA_PIN and shift into place. */
		REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
		byte1 |= (((uint8_t)((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 7);
		REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
		byte1 |= (((uint8_t)((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 6);
		REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
		byte1 |= (((uint8_t)((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 5);
		REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
		byte1 |= (((uint8_t)((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 4);
		REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
		byte1 |= (((uint8_t)((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 3);
		REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
		byte1 |= (((uint8_t)((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 2);
		REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
		byte1 |= (((uint8_t)((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 1);
		REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
		byte1 |= (((uint8_t)((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 0);

		/* Append the byte and advance the write position. */
		p_buffer[buf_i++] = byte1;
	}
	
	// Deactivate Device 1
	REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CS1_PIN) << RTC_GPIO_OUT_DATA_W1TS_S));

	/* * * * * * * * * * * * * * * * * * * * * * DEVICE TWO  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

	// Activate Device 2
	REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CS2_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));

	for(uint8_t i = 0; i < 27; i++) {

		uint8_t byte1 = 0;
		byte1 = 0; /* repeats the initializer above */
		/* Same eight clock/sample steps as for device 1. */
		REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
		byte1 |= (((uint8_t)((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 7);
		REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
		byte1 |= (((uint8_t)((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 6);
		REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
		byte1 |= (((uint8_t)((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 5);
		REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
		byte1 |= (((uint8_t)((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 4);
		REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
		byte1 |= (((uint8_t)((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 3);
		REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
		byte1 |= (((uint8_t)((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 2);
		REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
		byte1 |= (((uint8_t)((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 1);
		REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TS_S)); REG_WRITE(RTC_GPIO_OUT_W1TC_REG, (BIT(CLK_PIN) << RTC_GPIO_OUT_DATA_W1TC_S));
		byte1 |= (((uint8_t)((REG_READ(RTC_GPIO_IN_REG) >> RTC_GPIO_IN_NEXT_S) & BIT(DATA_PIN)) ) << 0);

		/* Device 2 data follows device 1 data in the same buffer. */
		p_buffer[buf_i++] = byte1;
	}

	// Deactivate Device 2
	REG_WRITE(RTC_GPIO_OUT_W1TS_REG, (BIT(CS2_PIN) << RTC_GPIO_OUT_DATA_W1TS_S));

    // Switch buffers when the current one is full
    if (buf_i >= 1350) {
        buf_i = 0;
        /* Publish the buffer that was just filled before switching to the other one. */
        p_buffer_global = p_buffer;
        p_buffer = (p_buffer == read_buffer1) ? read_buffer2 : read_buffer1;
        buffer_ready = true;

        ulp_riscv_wakeup_main_processor();
    }
}

/*
 * ULP program entry point.
 *
 * Configures DRDY and DATA as inputs and CLK, CS1 and CS2 as outputs, drives
 * both chip-select lines to level 1, registers gpio_int_handler for a negative
 * edge on DRDY_PIN, and then idles in a delay loop; all data transfer happens
 * inside the interrupt handler.
 *
 * Returns: never returns in practice (the idle loop is infinite).
 */
int main(void) {

    /* Configure GPIO in input mode for interrupt */
    ulp_riscv_gpio_init(DRDY_PIN);
    ulp_riscv_gpio_input_enable(DRDY_PIN);

    /* Data line sampled by the handler */
    ulp_riscv_gpio_init(DATA_PIN);
    ulp_riscv_gpio_input_enable(DATA_PIN);

    /* Clock line pulsed by the handler */
    ulp_riscv_gpio_init(CLK_PIN);
    ulp_riscv_gpio_output_enable(CLK_PIN);

    /* Chip select for device 1, initial level 1 */
    ulp_riscv_gpio_init(CS1_PIN);
    ulp_riscv_gpio_output_enable(CS1_PIN);
    ulp_riscv_gpio_output_level(CS1_PIN, 1);

    /* Chip select for device 2, initial level 1.
     * The level is written to CS2_PIN here; the earlier version wrote CS1_PIN
     * a second time, which left CS2 at whatever level it had after init. */
    ulp_riscv_gpio_init(CS2_PIN);
    ulp_riscv_gpio_output_enable(CS2_PIN);
    ulp_riscv_gpio_output_level(CS2_PIN, 1);

    /* Register GPIO interrupt handler */
    ulp_riscv_gpio_isr_register(DRDY_PIN, ULP_RISCV_GPIO_INTR_NEGEDGE, gpio_int_handler, NULL);

    /* Nothing to do in the foreground; wait in 1000 ms steps. */
    while (1) {
        ulp_riscv_delay_cycles(1000 * ULP_RISCV_CYCLES_PER_MS);
    }

    return 0;
}

MicroController
Posts: 2045
Joined: Mon Oct 17, 2022 7:38 pm
Location: Europe, Germany

Re: ULP RISCV SPI Simulation Using Bit-banging

Postby MicroController » Mon Feb 24, 2025 9:51 am

ESP_Sprite wrote:
Thu Feb 13, 2025 1:58 am
At this level, you'd probably want to look at the assembly output as well; it will probably give you some more clues on how to optimize things.
Did that now. Generated assembly looks pretty optimal, as in: I could not find anything that could be further optimized (right+left shifts are properly combined, casting to uint8_t is ignored, the for loop is turned into a pointer loop,...). Interestingly, gcc seems to try and make use of as many registers as possible, which makes it first read the 8 input values from the IO into 8 registers as fast as possible before transforming+combining the 8 values into one byte; this explains why there is a longer pause in the clock signal between bytes.

Turning

Code: Select all

static uint16_t buf_i = 0;
into

Code: Select all

static uint32_t buf_i = 0;
saves two (unnecessary) shift instructions handling a potential overflow of the uint16_t.
-> General advice: For best performance (on 32-bit CPUs), it's often beneficial to use 32-bit data types in calculations when you don't actually need the overflow behavior of a smaller type. Loading+storing a 32-bit (local) variable from/to a 16- or 8-bit value in memory is usually just as fast as loading/storing to/from a 16- or 8-bit local variable.

RandomInternetGuy
Posts: 59
Joined: Fri Aug 11, 2023 4:56 am

Re: ULP RISCV SPI Simulation Using Bit-banging

Postby RandomInternetGuy » Wed Feb 26, 2025 7:56 am

Compiler-weenie piping in.

> The fastest way to iterate over an array "via a pointer" is usually:

Code: Select all

uint32_t* ptr = &my_array[0];
uint32_t* const end = ptr + MY_ARRAY_NUM_ELEMENTS; 
do {
  ...
  *ptr = ...;
  ptr += 1;
} while (ptr < end);
As you observed, that was often true in the older era of optimizers. The reason that's a coin toss now depends upon whether the optimizer can determine with absolute certainty that the store through *ptr will never change end, which is a loop control. Since it's adjacent in memory to the last byte of my_array[] - which IS stored into - you can see why it might be timid[1] about assuming that end isn't changing, as it's actually pointing to approximately itself. It can also cause it to forcibly reload a loop control on each iteration, when YOU know that's not being changed, but it can't prove that.


> (Though gcc sometimes decides to "optimize" this back to a for-loop with an explicit index.)
> Unrolling the loop and using literals for subscripts is still faster.

...and this is why. It knows if it's doing stores like:

ptr[0] = 0xaa;
ptr[1] = 0xaa;
ptr[2] = 0xaa;

...which it can reduce down to index + offset addressing (it can rewrite ptr[1] to ptr + 1), at least for RISC-V keeping ptr in a register and scaling it by the index is the winner, it's able to see that the store is never going to modify end as long as you don't walk off either end of ptr which, of course, you're not supposed to do and is UB if you do.

This is why the "Fortran style" of indexed arrays often performs ever so slightly better than raw pointers these days - especially if there are a bunch of nested loops with stores going everywhere. During store analysis, it can prove that end won't get clobbered in the indexed-array case, but not in the case of a store through a pointer.

This is all down to the hairy, ugly edge of these things and is subject to change between compiler revs and a lot of other things, so don't bank on it. That's just the explanation of why they like indexed array STORES better than pointer stores. If you're writing FFTs or something, measure, spend time with Godbolt, and work on the algorithms FIRST before you fret over things like x++ vs ++x for scalars and so on.

[1] Alias analysis has been a leading cause of brain injury by spontaneous aneurysm in compiler students in recent decades.

Who is online

Users browsing this forum: No registered users and 82 guests