123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325 |
- #include "include.h"
- #include "asr.h"
- #if ASR_PREFETCH_EN && ASR_EN
- ///x0,sum矩阵只可以使用ram0(0x50000~0x54000)和ram1(0x54000~0x59000),y0矩阵可以使用weight0-ram1(0x40000~~0x59000),且均要4字节对齐
- ///并且x0和y0不能使用同一块ram
- void matrix_hw(int32_t* sum, int8_t* x0, int8_t *y0, s16 loop);
- void npu_memcpy_int16(int16_t *c, int16_t *a, int32_t a_size);
- void npu_matrix_kick_wait(void);
- #if NPU_CONTIN_CAL_EN
- AT(.ws_asr.ram0.sum)
- int32_t sum_buffer[MAX_NPU_MATRIX * 16];
- #else
- AT(.ws_asr.ram0.sum.s)
- int sum_buffer[0x1e8];
- AT(.ws_asr.ram0.sum)
- int32_t matrix_sum ;
- #endif
- AT(.ws_asr.offset)
- int8_t asr_offset[12]; //让asr库里数组对齐16字节
- AT(.npu_matrix.ram0.x)
- int8_t npu_matrix_x[256 * MAX_NPU_MATRIX];
- typedef struct {
- u16 all_frame;
- u16 frame_len;
- u8 load_frame;
- } tdnn_prenum_t;
- AT(.com_rodata.tdnn)
- const tdnn_prenum_t tdnn_prenum[7] = {
- {128, 120, 68},
- {128, 256, 32},
- {128, 256, 32},
- {128, 256, 32},
- {128, 256, 32},
- {128, 128, 64},
- {488, 128, 64},
- };
- typedef struct {
- u8 load_frame;
- u8 *buf;
- } tdnn_cache_t;
- typedef struct {
- u16 all_frame;
- u8 index;
- u8 toggle;
- u32 load_addr;
- tdnn_cache_t cache[2];
- } tdnn_pretetch_t;
- extern const int16_t tdnn_mean_1[128];
- extern const int16_t tdnn_bias_1[128];
- extern const float tdnn_var_1[128];
- extern const int16_t tdnn_mean_2[128];
- extern const int16_t tdnn_bias_2[128];
- extern const float tdnn_var_2[128];
- extern const int16_t tdnn_mean_3[128];
- extern const int16_t tdnn_bias_3[128];
- extern const float tdnn_var_3[128];
- extern const int16_t tdnn_mean_4[128];
- extern const int16_t tdnn_bias_4[128];
- extern const float tdnn_var_4[128];
- extern const int16_t tdnn_mean_5[128];
- extern const int16_t tdnn_bias_5[128];
- extern const float tdnn_var_5[128];
- extern const int16_t tdnn_mean_6[128];
- extern const int16_t tdnn_bias_6[128];
- extern const float tdnn_var_6[128];
- extern const int16_t tdnn_bias_7[488];
- extern const float tdnn_scale_7[488];
- int16_t tdnn_ram_mean1[128] AT(.ws_asr.test);
- int16_t tdnn_ram_bias1[128] AT(.ws_asr.test);
- float tdnn_ram_var1[128] AT(.ws_asr.test);
- int16_t tdnn_ram_mean2[128] AT(.ws_asr.test);
- int16_t tdnn_ram_bias2[128] AT(.ws_asr.test);
- float tdnn_ram_var2[128] AT(.ws_asr.test);
- int16_t tdnn_ram_mean3[128] AT(.ws_asr.test);
- int16_t tdnn_ram_bias3[128] AT(.ws_asr.test);
- float tdnn_ram_var3[128] AT(.ws_asr.test);
- int16_t tdnn_ram_mean4[128] AT(.ws_asr.test);
- int16_t tdnn_ram_bias4[128] AT(.ws_asr.test);
- float tdnn_ram_var4[128] AT(.ws_asr.test);
- int16_t tdnn_ram_mean5[128] AT(.ws_asr.test);
- int16_t tdnn_ram_bias5[128] AT(.ws_asr.test);
- float tdnn_ram_var5[128] AT(.ws_asr.test);
- int16_t tdnn_ram_mean6[128] AT(.ws_asr.test);
- int16_t tdnn_ram_bias6[128] AT(.ws_asr.test);
- float tdnn_ram_var6[128] AT(.ws_asr.test);
- int16_t tdnn_ram_bias7[488] AT(.ws_asr.test);
- float tdnn_ram_scale7[488] AT(.ws_asr.test);
- u8 tdnn_buffer[2][0x2000] AT(.npu_matrix.ram0.tdnn);
- tdnn_pretetch_t tdnn_pretetch AT(.ws_asr.test);
- u8 asr_prefetch_kisck;
- AT(.com_text.tdnn)
- void tdnn_compute_float_cal(float *out_buf, const int16_t *tdnn_mean, const float *tdnn_scale,
- const float *tdnn_var, const int16_t *tdnn_bias, float in_scale, u16 matrix_cnt_t, u16 float_cnt_t, int last_layer)
- {
- int32_t sum_curr = 0, z = 0;
- for (int cnt = float_cnt_t; cnt < (matrix_cnt_t + float_cnt_t); cnt++) {
- z = sum_buffer[(sum_curr<<4)] * in_scale + tdnn_bias[cnt];
- if (!last_layer) {
- if (z <= 0) {
- z = 0;
- }
- out_buf[cnt] = (z - tdnn_mean[cnt]) * tdnn_var[cnt];
- } else {
- out_buf[cnt] = z * tdnn_scale[cnt];
- }
- sum_curr++;
- }
- }
- AT(.com_text.tdnn)
- void tdnn_compute(int8_t *in_buf, float in_scale, int in_dim, int out_dim,
- const int8_t *tdnn_weight, const int16_t *tdnn_bias,
- const int16_t *tdnn_mean, const float *tdnn_var, const float *tdnn_scale,
- float *out_buf, int last_layer) {
- int i;
- if (asr_prefetch_kisck) {
- return;
- }
- tdnn_pretetch_t *t = &tdnn_pretetch;
- const tdnn_prenum_t *p = &tdnn_prenum[t->index];
- u32 frame_len = p->frame_len * p->load_frame;
- // printf("tdnn_weight:%x out_dim:%d in_dim:%d index:%d frame_len:%d\n", tdnn_weight, out_dim, in_dim, t->index, frame_len);
- if (tdnn_mean == tdnn_mean_1) {
- tdnn_mean = tdnn_ram_mean1;
- tdnn_bias = tdnn_ram_bias1;
- tdnn_var = tdnn_ram_var1;
- } else if (tdnn_mean == tdnn_mean_2) {
- tdnn_mean = tdnn_ram_mean2;
- tdnn_bias = tdnn_ram_bias2;
- tdnn_var = tdnn_ram_var2;
- } else if (tdnn_mean == tdnn_mean_3) {
- tdnn_mean = tdnn_ram_mean3;
- tdnn_bias = tdnn_ram_bias3;
- tdnn_var = tdnn_ram_var3;
- } else if (tdnn_mean == tdnn_mean_4) {
- tdnn_mean = tdnn_ram_mean4;
- tdnn_bias = tdnn_ram_bias4;
- tdnn_var = tdnn_ram_var4;
- } else if (tdnn_mean == tdnn_mean_5) {
- tdnn_mean = tdnn_ram_mean5;
- tdnn_bias = tdnn_ram_bias5;
- tdnn_var = tdnn_ram_var5;
- } else if (tdnn_mean == tdnn_mean_6) {
- tdnn_mean = tdnn_ram_mean6;
- tdnn_bias = tdnn_ram_bias6;
- tdnn_var = tdnn_ram_var6;
- } else {
- tdnn_scale = tdnn_ram_scale7;
- tdnn_bias = tdnn_ram_bias7;
- }
- // GPIOBSET |= BIT(4);
- spiflash_lock();
- // GPIOBSET = BIT(1);///
- #if NPU_CONTIN_CAL_EN
- u8 matrix_cnt = 0;
- u16 float_cnt = 0;
- memset(sum_buffer, 0, 4 * MAX_NPU_MATRIX * 16);
- #endif
- for (i = 0; i < out_dim; i++) {
- u16 offset = (i * in_dim) % frame_len;
- //load下一帧
- if (offset == 0) {
- u16 load_frame = 0;
- t->toggle ^= 1;
- if (t->all_frame == 0) {
- t->index++;
- if (t->index >= 7) {
- t->index = 0;
- }
- p = &tdnn_prenum[t->index];
- t->all_frame = p->all_frame;
- load_frame = p->load_frame;
- } else {
- if (t->all_frame >= p->load_frame) {
- load_frame = p->load_frame;
- } else {
- load_frame = t->all_frame;
- }
- }
- u32 load_len = load_frame*p->frame_len;
- spiflash_read_wait();
- // GPIOESET |= BIT(4);///
- spiflash_read_kick(t->cache[t->toggle ^ 1].buf, t->load_addr, load_len);
- // GPIOECLR |= BIT(4);///
- t->all_frame -= load_frame;
- t->load_addr += load_len;
- if (t->load_addr >= (ASR_BASE_ADDR + ASR_BASE_LEN)) {
- t->load_addr = ASR_BASE_ADDR;
- }
- }
- int8_t *tdnn_buffer_t = (int8_t *)t->cache[t->toggle].buf + offset;
- #if !NPU_CONTIN_CAL_EN
- matrix_sum = 0;
- u8 mod = ((int)tdnn_buffer_t % 16);
- if (mod) { //地址对齐16bytes
- memcpy(npu_matrix_x, tdnn_buffer_t, in_dim);
- matrix_hw(&matrix_sum, npu_matrix_x, in_buf, in_dim);
- } else {
- matrix_hw(&matrix_sum, tdnn_buffer_t, in_buf, in_dim);
- }
- npu_matrix_kick_wait();
- sum_buffer[i] = matrix_sum;
- #else
- #if NPU_MEMCPY_EN
- u8 mod = ((int)tdnn_buffer_t % 16);
- if (mod) { //地址对齐16bytes
- memcpy(&npu_matrix_x[matrix_cnt * 256], tdnn_buffer_t, in_dim);
- } else {
- npu_memcpy_int16((int16_t*)&npu_matrix_x[matrix_cnt * 256], (int16_t*)tdnn_buffer_t, 128);
- }
- #else
- memcpy(&npu_matrix_x[matrix_cnt * 256], tdnn_buffer_t, in_dim);
- #endif
- matrix_hw(&sum_buffer[matrix_cnt * 16], &npu_matrix_x[matrix_cnt * 256], in_buf, in_dim);
- matrix_cnt++;
- if (out_dim == i + 1) {
- npu_matrix_kick_wait();
- tdnn_compute_float_cal(out_buf, tdnn_mean, tdnn_scale, tdnn_var, tdnn_bias, in_scale, matrix_cnt, float_cnt, last_layer);
- float_cnt += matrix_cnt;
- memset(sum_buffer, 0, 4 * MAX_NPU_MATRIX * 16);
- matrix_cnt = 0;
- } else {
- if (matrix_cnt == MAX_NPU_MATRIX) {
- npu_matrix_kick_wait();
- tdnn_compute_float_cal(out_buf, tdnn_mean, tdnn_scale, tdnn_var, tdnn_bias, in_scale, matrix_cnt, float_cnt, last_layer);
- float_cnt += matrix_cnt;
- memset(sum_buffer, 0, 4 * MAX_NPU_MATRIX * 16);
- matrix_cnt = 0;
- }
- }
- #endif
- }
- spiflash_read_wait();
- spiflash_unlock();
- #if !NPU_CONTIN_CAL_EN
- for (i = 0; i < out_dim; i++) {
- matrix_sum = sum_buffer[i];
- int32_t z = matrix_sum * in_scale + tdnn_bias[i];
- if (!last_layer) {
- if (z <= 0) {
- z = 0;
- }
- out_buf[i] = (z - tdnn_mean[i]) * tdnn_var[i];
- } else {
- out_buf[i] = z * tdnn_scale[i];
- }
- }
- #endif
- }
- ALIGNED(512)
- void asr_prefetch_init_do(void)
- {
- tdnn_pretetch_t *t = &tdnn_pretetch;
- memset(t, 0, sizeof(tdnn_pretetch_t));
- const tdnn_prenum_t *p = &tdnn_prenum[t->index];
- t->load_addr = ASR_BASE_ADDR;
- t->cache[0].buf = tdnn_buffer[0];
- t->cache[1].buf = tdnn_buffer[1];
- //先读
- u32 load_len = p->load_frame*p->frame_len;
- spiflash_lock();
- spiflash_read_kick(t->cache[0].buf, t->load_addr, load_len);
- spiflash_read_wait();
- t->cache[0].load_frame = p->load_frame;
- t->all_frame = p->all_frame - p->load_frame; //已经load了一帧,减掉
- t->load_addr += load_len;
- spiflash_unlock();
- memcpy(tdnn_ram_mean1, tdnn_mean_1, sizeof(tdnn_mean_1));
- memcpy(tdnn_ram_bias1, tdnn_bias_1, sizeof(tdnn_bias_1));
- memcpy(tdnn_ram_var1, tdnn_var_1, sizeof(tdnn_var_1));
- memcpy(tdnn_ram_mean2, tdnn_mean_2, sizeof(tdnn_mean_2));
- memcpy(tdnn_ram_bias2, tdnn_bias_2, sizeof(tdnn_bias_2));
- memcpy(tdnn_ram_var2, tdnn_var_2, sizeof(tdnn_var_2));
- memcpy(tdnn_ram_mean3, tdnn_mean_3, sizeof(tdnn_mean_3));
- memcpy(tdnn_ram_bias3, tdnn_bias_3, sizeof(tdnn_bias_3));
- memcpy(tdnn_ram_var3, tdnn_var_3, sizeof(tdnn_var_3));
- memcpy(tdnn_ram_mean4, tdnn_mean_4, sizeof(tdnn_mean_4));
- memcpy(tdnn_ram_bias4, tdnn_bias_4, sizeof(tdnn_bias_4));
- memcpy(tdnn_ram_var4, tdnn_var_4, sizeof(tdnn_var_4));
- memcpy(tdnn_ram_mean5, tdnn_mean_5, sizeof(tdnn_mean_5));
- memcpy(tdnn_ram_bias5, tdnn_bias_5, sizeof(tdnn_bias_5));
- memcpy(tdnn_ram_var5, tdnn_var_5, sizeof(tdnn_var_5));
- memcpy(tdnn_ram_mean6, tdnn_mean_6, sizeof(tdnn_mean_6));
- memcpy(tdnn_ram_bias6, tdnn_bias_6, sizeof(tdnn_bias_6));
- memcpy(tdnn_ram_var6, tdnn_var_6, sizeof(tdnn_var_6));
- memcpy(tdnn_ram_bias7, tdnn_bias_7, sizeof(tdnn_bias_7));
- memcpy(tdnn_ram_scale7, tdnn_scale_7, sizeof(tdnn_scale_7));
- }
- void asr_prefetch_init(void)
- {
- memset(asr_offset,0,12);
- asr_prefetch_init_do();
- asr_prefetch_kisck = 5;
- }
- #endif
|