api_math.h

#ifndef _API_MATH_H
#define _API_MATH_H

typedef struct {
    u32 outdat; //A = outdat / 2^31
    u32 outexp; //B = 2^outexp
} math_out_t;   //result = A * B

void log2_hw(u32 X, u8 Q, math_out_t *output);
void log10_hw(u32 X, u8 Q, math_out_t *output);
void ln_hw(u32 X, u8 Q, math_out_t *output);
void pow2_hw(u32 X, u8 Q, math_out_t *output);
void powe_hw(u32 X, u8 Q, math_out_t *output);
void sqrt_hw(u32 X, u8 Q, math_out_t *output);
int sqrt64_hw(int64_t x);
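/*
 * Usage sketch (not from the original header): recovering the numeric value
 * of a math_out_t on the host side, using only the encoding documented
 * above (result = (outdat / 2^31) * 2^outexp). x and q are hypothetical
 * inputs, the meaning of Q (input fixed-point format) is assumed, and
 * ldexp needs <math.h>.
 *
 *   math_out_t r;
 *   log2_hw(x, q, &r);                          //x: input, q: its Q format
 *   double a = (double)r.outdat / 2147483648.0; //A = outdat / 2^31
 *   double val = ldexp(a, (int)r.outexp);       //A * 2^outexp
 */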
typedef enum {
    RDFT_128 = 0,
    RDFT_256,
    RDFT_512,
} RDFT_LEN;
typedef struct {
    void *in_addr;
    void *out_addr;
    RDFT_LEN size;      //size: 0 (128), 1 (256), 2 (512)
    u8 window_en :1;    //only valid for 512-point FFT
    u8 input_type :1;   //input type: 0, half word; 1, word
    u8 isr_en :1;       //enable interrupt
} fft_cfg_t;

void fft_hw(fft_cfg_t *cfg);
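/*
 * Usage sketch (in_buf/out_buf are hypothetical, suitably placed buffers;
 * placement and alignment rules are not documented in this header): a
 * 512-point forward transform with windowing and polled completion.
 *
 *   fft_cfg_t cfg = {0};
 *   cfg.in_addr    = in_buf;
 *   cfg.out_addr   = out_buf;
 *   cfg.size       = RDFT_512;
 *   cfg.window_en  = 1;   //windowing is only valid at 512 points
 *   cfg.input_type = 1;   //1 = word (32-bit) input samples
 *   cfg.isr_en     = 0;   //no interrupt; caller polls
 *   fft_hw(&cfg);
 */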
typedef struct {
    void *in_addr;
    void *out_addr;
    RDFT_LEN size;      //size: 0 (128), 1 (256), 2 (512)
    u8 window_en :1;    //only valid for 512-point IFFT
    u8 output_type :1;  //output type: 0, half word; 1, word
    u8 overlap_en :1;
    u8 overlap_len :1;  //OLA length: 0 = 240, 1 = 320 (valid at 512 points only)
    u8 isr_en :1;       //enable interrupt
} ifft_cfg_t;

void ifft_hw(ifft_cfg_t *cfg);
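/*
 * Companion sketch for the inverse path (spec_buf/pcm_buf hypothetical):
 * a 512-point IFFT with overlap-add. Per the field comments above,
 * overlap_len = 1 selects a 320-sample OLA, valid only at 512 points.
 *
 *   ifft_cfg_t icfg = {0};
 *   icfg.in_addr     = spec_buf;
 *   icfg.out_addr    = pcm_buf;
 *   icfg.size        = RDFT_512;
 *   icfg.window_en   = 1;
 *   icfg.output_type = 0;   //0 = half-word output
 *   icfg.overlap_en  = 1;
 *   icfg.overlap_len = 1;   //320-sample OLA
 *   ifft_hw(&icfg);
 */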
void dct_hw(int *dct_int, int *dct_out);
void xcorr_hw(const int *xcorr_x, const int *xcorr_y, int *xcorr_out, int len, int max_pitch, int lshft);
typedef struct {
    int *in;
    int *coef;
    int *cache;
    int *out;
    u16 len;
    u16 order;
    s16 lshft;
    u16 index;  //maintained by hardware; software need not set it
} fir_cfg_t;

void fir_hw(fir_cfg_t *cfg);
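/*
 * Usage sketch (buffer names hypothetical; the len/order/lshft meanings are
 * inferred from the field names as block length, tap count, and output left
 * shift, and should be checked against the hardware manual): reuse one
 * fir_cfg_t across calls so `cache` and the hardware-maintained `index`
 * carry filter state from block to block.
 *
 *   static fir_cfg_t fir = {
 *       .in = in_block, .coef = taps, .cache = state, .out = out_block,
 *       .len = 128, .order = 32, .lshft = 0,
 *   };
 *   fir_hw(&fir);
 */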
///The x0 and sum matrices may only use ram0 (0x50000~0x54000) and ram1 (0x54000~0x59000);
///the y0 matrix may use weight0 through ram1 (0x40000~0x59000). All buffers must be 4-byte
///aligned, and x0 and y0 must not be placed in the same RAM block.
void npu_matrix_init(void);
void npu_matrix_exit(void);
void matrix_hw(int32_t* sum, int8_t* x0, int8_t *y0, s16 loop); //for (int i = 0; i < loop; i++) {sum += x0[i] * y0[i];}
void matrix_hw_1(int32_t* sum, int8_t* x0, int8_t *y0, s16 loop1, s16 loop2); //for (int i = 0; i < loop1; i++) {sum_temp = 0; for (int j = 0; j < loop2; j++) {sum_temp += x0[i * loop2 + j] * y0[j];} sum[i] = sum_temp;}
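/*
 * Worked sketch of matrix_hw_1, following its comment exactly: a 16x64
 * int8 matrix times a 64-element int8 vector, producing 16 int32 sums.
 * sum/x0/y0 are hypothetical buffers that must obey the RAM-placement
 * and 4-byte-alignment rules above; how they get placed there is
 * target-specific and assumed here.
 *
 *   npu_matrix_init();
 *   matrix_hw_1(sum, x0, y0, 16, 64);  //sum[i] = sum_j x0[i*64 + j] * y0[j]
 *   npu_matrix_exit();
 */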
ALWAYS_INLINE u64 addu (u64 a, u32 b) {return __builtin_addu(a, b);}  //a + b
ALWAYS_INLINE s64 adds (s64 a, s32 b) {return __builtin_adds(a, b);}  //a + b
ALWAYS_INLINE u64 addlu (u64 a, u32 b) {return __builtin_addlu(a, b);}  //(a << 1) + b
ALWAYS_INLINE s64 addls (s64 a, s32 b) {return __builtin_addls(a, b);}  //(a << 1) + b
ALWAYS_INLINE u64 addru (u64 a, u32 b) {return __builtin_addru(a, b);}  //(a >> 1) + b
ALWAYS_INLINE s64 addrs (s64 a, s32 b) {return __builtin_addrs(a, b);}  //(a >> 1) + b
ALWAYS_INLINE u64 subu (u64 a, u32 b) {return __builtin_subu(a, b);}  //a - b
ALWAYS_INLINE s64 subs (s64 a, s32 b) {return __builtin_subs(a, b);}  //a - b
ALWAYS_INLINE u64 sublu (u64 a, u32 b) {return __builtin_sublu(a, b);}  //(a << 1) - b
ALWAYS_INLINE s64 subls (s64 a, s32 b) {return __builtin_subls(a, b);}  //(a << 1) - b
ALWAYS_INLINE u64 subru (u64 a, u32 b) {return __builtin_subru(a, b);}  //(a >> 1) - b
ALWAYS_INLINE s64 subrs (s64 a, s32 b) {return __builtin_subrs(a, b);}  //(a >> 1) - b
ALWAYS_INLINE u64 mulu (u32 a, u32 b) {return __builtin_mulu(a, b);}  //a * b
ALWAYS_INLINE s64 muls (s32 a, s32 b) {return __builtin_muls(a, b);}  //a * b
ALWAYS_INLINE s64 mulus (u32 a, s32 b) {return __builtin_mulus(a, b);}  //a * b
ALWAYS_INLINE u64 macu (u64 c, u32 a, u32 b) {return __builtin_macu(c, a, b);}  //c + a * b
ALWAYS_INLINE s64 macs (s64 c, s32 a, s32 b) {return __builtin_macs(c, a, b);}  //c + a * b
ALWAYS_INLINE s64 macus (s64 c, u32 a, s32 b) {return __builtin_macus(c, a, b);}  //c + a * b
ALWAYS_INLINE u64 msbu (u64 c, u32 a, u32 b) {return __builtin_msbu(c, a, b);}  //c - a * b
ALWAYS_INLINE s64 msbs (s64 c, s32 a, s32 b) {return __builtin_msbs(c, a, b);}  //c - a * b
ALWAYS_INLINE s64 msbus (s64 c, u32 a, s32 b) {return __builtin_msbus(c, a, b);}  //c - a * b
ALWAYS_INLINE u64 mulu_shift16 (u32 a, u32 b) {return __builtin_mulu_shift16(a, b);}  //(a * b) >> 16
ALWAYS_INLINE s64 muls_shift16 (s32 a, s32 b) {return __builtin_muls_shift16(a, b);}  //(a * b) >> 16
ALWAYS_INLINE s64 muls_shift15 (s32 a, s32 b) {return __builtin_muls_shift15(a, b);}  //(a * b) >> 15
ALWAYS_INLINE s64 mulus_shift16 (u32 a, s32 b) {return __builtin_mulus_shift16(a, b);}  //(a * b) >> 16
ALWAYS_INLINE u64 macu_shift16 (u64 c, u32 a, u32 b) {return __builtin_macu_shift16(c, a, b);}  //c + ((a * b) >> 16)
ALWAYS_INLINE s64 macs_shift16 (s64 c, s32 a, s32 b) {return __builtin_macs_shift16(c, a, b);}  //c + ((a * b) >> 16)
ALWAYS_INLINE s64 macs_shift15 (s64 c, s32 a, s32 b) {return __builtin_macs_shift15(c, a, b);}  //c + ((a * b) >> 15)
ALWAYS_INLINE s64 macus_shift16 (s64 c, u32 a, s32 b) {return __builtin_macus_shift16(c, a, b);}  //c + ((a * b) >> 16)
ALWAYS_INLINE u64 msbu_shift16 (u64 c, u32 a, u32 b) {return __builtin_msbu_shift16(c, a, b);}  //c - ((a * b) >> 16)
ALWAYS_INLINE s64 msbs_shift16 (s64 c, s32 a, s32 b) {return __builtin_msbs_shift16(c, a, b);}  //c - ((a * b) >> 16)
ALWAYS_INLINE s64 msbs_shift15 (s64 c, s32 a, s32 b) {return __builtin_msbs_shift15(c, a, b);}  //c - ((a * b) >> 15)
ALWAYS_INLINE s64 msbus_shift16 (s64 c, u32 a, s32 b) {return __builtin_msbus_shift16(c, a, b);}  //c - ((a * b) >> 16)
ALWAYS_INLINE u64 mulu_shift24 (u32 a, u32 b) {return __builtin_mulu_shift24(a, b);}  //(a * b) >> 24
ALWAYS_INLINE s64 muls_shift24 (s32 a, s32 b) {return __builtin_muls_shift24(a, b);}  //(a * b) >> 24
ALWAYS_INLINE s64 muls_shift23 (s32 a, s32 b) {return __builtin_muls_shift23(a, b);}  //(a * b) >> 23
ALWAYS_INLINE s64 mulus_shift24 (u32 a, s32 b) {return __builtin_mulus_shift24(a, b);}  //(a * b) >> 24
ALWAYS_INLINE u64 macu_shift24 (u64 c, u32 a, u32 b) {return __builtin_macu_shift24(c, a, b);}  //c + ((a * b) >> 24)
ALWAYS_INLINE s64 macs_shift24 (s64 c, s32 a, s32 b) {return __builtin_macs_shift24(c, a, b);}  //c + ((a * b) >> 24)
ALWAYS_INLINE s64 macs_shift23 (s64 c, s32 a, s32 b) {return __builtin_macs_shift23(c, a, b);}  //c + ((a * b) >> 23)
ALWAYS_INLINE s64 macus_shift24 (s64 c, u32 a, s32 b) {return __builtin_macus_shift24(c, a, b);}  //c + ((a * b) >> 24)
ALWAYS_INLINE u64 msbu_shift24 (u64 c, u32 a, u32 b) {return __builtin_msbu_shift24(c, a, b);}  //c - ((a * b) >> 24)
ALWAYS_INLINE s64 msbs_shift24 (s64 c, s32 a, s32 b) {return __builtin_msbs_shift24(c, a, b);}  //c - ((a * b) >> 24)
ALWAYS_INLINE s64 msbs_shift23 (s64 c, s32 a, s32 b) {return __builtin_msbs_shift23(c, a, b);}  //c - ((a * b) >> 23)
ALWAYS_INLINE s64 msbus_shift24 (s64 c, u32 a, s32 b) {return __builtin_msbus_shift24(c, a, b);}  //c - ((a * b) >> 24)
ALWAYS_INLINE u64 mulu_shift32 (u32 a, u32 b) {return __builtin_mulu_shift32(a, b);}  //(a * b) >> 32
ALWAYS_INLINE s64 muls_shift32 (s32 a, s32 b) {return __builtin_muls_shift32(a, b);}  //(a * b) >> 32
ALWAYS_INLINE s64 muls_shift31 (s32 a, s32 b) {return __builtin_muls_shift31(a, b);}  //(a * b) >> 31
ALWAYS_INLINE s64 mulus_shift32 (u32 a, s32 b) {return __builtin_mulus_shift32(a, b);}  //(a * b) >> 32
ALWAYS_INLINE u64 macu_shift32 (u64 c, u32 a, u32 b) {return __builtin_macu_shift32(c, a, b);}  //c + ((a * b) >> 32)
ALWAYS_INLINE s64 macs_shift32 (s64 c, s32 a, s32 b) {return __builtin_macs_shift32(c, a, b);}  //c + ((a * b) >> 32)
ALWAYS_INLINE s64 macs_shift31 (s64 c, s32 a, s32 b) {return __builtin_macs_shift31(c, a, b);}  //c + ((a * b) >> 31)
ALWAYS_INLINE s64 macus_shift32 (s64 c, u32 a, s32 b) {return __builtin_macus_shift32(c, a, b);}  //c + ((a * b) >> 32)
ALWAYS_INLINE u64 msbu_shift32 (u64 c, u32 a, u32 b) {return __builtin_msbu_shift32(c, a, b);}  //c - ((a * b) >> 32)
ALWAYS_INLINE s64 msbs_shift32 (s64 c, s32 a, s32 b) {return __builtin_msbs_shift32(c, a, b);}  //c - ((a * b) >> 32)
ALWAYS_INLINE s64 msbs_shift31 (s64 c, s32 a, s32 b) {return __builtin_msbs_shift31(c, a, b);}  //c - ((a * b) >> 31)
ALWAYS_INLINE s64 msbus_shift32 (s64 c, u32 a, s32 b) {return __builtin_msbus_shift32(c, a, b);}  //c - ((a * b) >> 32)
ALWAYS_INLINE u64 lshift64 (u64 a, u32 b) {return __builtin_lshift64(a, b);}  //a << b
ALWAYS_INLINE u64 rshift64 (u64 a, u32 b) {return __builtin_rshift64(a, b);}  //a >> b (logical)
ALWAYS_INLINE u64 ashift64 (u64 a, u32 b) {return __builtin_ashift64(a, b);}  //a >> b (arithmetic)
ALWAYS_INLINE u64 clip64u (u64 a, u32 b) {return __builtin_clip64u(a, b);}  //clip(a >> b)
ALWAYS_INLINE s64 clip64s (s64 a, u32 b) {return __builtin_clip64s(a, b);}  //clip(a >> b)
ALWAYS_INLINE u64 round64 (u64 a, u32 b) {return __builtin_round64(a, b);}  //round(a >> b)
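/*
 * Fixed-point sketch built on the builtins above (the Q1.31 format is an
 * assumption for illustration, not mandated by this header): a saturating
 * Q31 multiply. muls_shift31 keeps Q31 scaling ((a * b) >> 31), and
 * clip64s with a zero shift is assumed to saturate the 64-bit product to
 * the s32 range, which the one-line //clip(a >> b) comment leaves implicit.
 *
 *   ALWAYS_INLINE s32 q31_mul(s32 a, s32 b)
 *   {
 *       return (s32)clip64s(muls_shift31(a, b), 0);  //Q31 * Q31 -> Q31, saturated
 *   }
 */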
//count of leading bits equal to the sign bit (p.clb)
ALWAYS_INLINE s32 clb(s32 x0)
{
    s32 y0;
    __asm__ volatile("p.clb %0, %1" : "=r"(y0) : "r"(x0));
    return y0;
}

//signed maximum (p.max)
ALWAYS_INLINE s32 max(s32 x0, s32 x1)
{
    s32 y0;
    __asm__ volatile("p.max %0, %1, %2" : "=r"(y0) : "r"(x0), "r"(x1));
    return y0;
}

//unsigned maximum (p.maxu)
ALWAYS_INLINE u32 maxu(u32 x0, u32 x1)
{
    u32 y0;
    __asm__ volatile("p.maxu %0, %1, %2" : "=r"(y0) : "r"(x0), "r"(x1));
    return y0;
}

//signed minimum (p.min)
ALWAYS_INLINE s32 min(s32 x0, s32 x1)
{
    s32 y0;
    __asm__ volatile("p.min %0, %1, %2" : "=r"(y0) : "r"(x0), "r"(x1));
    return y0;
}

//unsigned minimum (p.minu)
ALWAYS_INLINE u32 minu(u32 x0, u32 x1)
{
    u32 y0;
    __asm__ volatile("p.minu %0, %1, %2" : "=r"(y0) : "r"(x0), "r"(x1));
    return y0;
}

//saturate x0 to the signed range selected by the immediate x1 (p.clip)
ALWAYS_INLINE s32 clip(s32 x0, const s32 x1)
{
    s32 y0;
    __asm__ volatile("p.clip %0, %1, %2" : "=r"(y0) : "r"(x0), "i"(x1));
    return y0;
}

//saturate x0 to the unsigned range selected by the immediate x1 (p.clipu)
ALWAYS_INLINE u32 clipu(u32 x0, const u32 x1)
{
    u32 y0;
    __asm__ volatile("p.clipu %0, %1, %2" : "=r"(y0) : "r"(x0), "i"(x1));
    return y0;
}

//absolute value (p.abs)
ALWAYS_INLINE s32 abs_s(s32 x0)
{
    s32 y0;
    __asm__ volatile("p.abs %0, %1" : "=r"(y0) : "r"(x0));
    return y0;
}

//bit index of the most significant set bit (p.fl1, "find last 1")
ALWAYS_INLINE int fl1(s32 x0)
{
    s32 y0;
    __asm__ volatile("p.fl1 %0, %1" : "=r"(y0) : "r"(x0));
    return y0;
}
#define NORM_U32(v) max(31 - fl1(v), 0)   //left shift that normalizes an unsigned value (MSB to bit 31)
#define NORM_S32(v) max((clb(v) - 1), 0)  //left shift that normalizes a signed value (drops redundant sign bits)
//64-bit counterpart of NORM_S32: redundant sign bits in a signed 64-bit value
ALWAYS_INLINE s32 clb64s(s64 x0)
{
    s32 y0, hi;
    u32 lo;
    hi = (s32)ashift64(x0, 32);
    lo = (u32)x0;
    if (x0 == 0) {                              //x0 = 0x0000 0000 0000 0000
        y0 = 0;
    } else if (hi == 0) {                       //x0 = 0x0000 0000 xxxx xxxx
        y0 = 31 + NORM_U32(lo);
    } else if (hi == -1 && lo >= 0x80000000) {  //x0 = 0xffff ffff [8-f]xxx xxxx
        y0 = 32 + NORM_S32(lo);
    } else {                                    //x0 = 0xffff ffff [0-7]xxx xxxx, or high word not a pure sign extension
        y0 = NORM_S32(hi);
    }
    return y0;
}
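/*
 * Sketch: normalizing a 64-bit accumulator with clb64s (x and y are
 * hypothetical s32 inputs, and the "take the high word" step is one common
 * convention, not something this header prescribes).
 *
 *   s64 acc  = macs(0, x, y);                //a MAC-chain result
 *   s32 n    = clb64s(acc);                  //redundant sign bits in acc
 *   s64 norm = (s64)lshift64((u64)acc, n);   //MSB now at bit 62
 *   s32 top  = (s32)ashift64((u64)norm, 32); //significant high word
 */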
#endif //_API_MATH_H