/*
 * Loongson SIMD utils
 *
 * Copyright (c) 2016 Loongson Technology Corporation Limited
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#ifndef AVUTIL_MIPS_MMIUTILS_H
#define AVUTIL_MIPS_MMIUTILS_H

#include "config.h"                   /* HAVE_LOONGSON2 / HAVE_LOONGSON3 build flags */
#include "libavutil/mem_internal.h"   /* LOCAL_ALIGNED_16, used by BACKUP_REG */
#include "libavutil/mips/asmdefs.h"   /* PTR_ADDU, mips_reg */
#if HAVE_LOONGSON2
/*
 * Loongson-2 lacks the Loongson-EXT (gs*) load/store instructions, so
 * indexed, unaligned and 128-bit accesses are synthesized from basic
 * MIPS instructions.  Some helpers need scratch C variables bound to
 * named asm operands: the caller declares them with DECLARE_VAR_* and
 * adds them to the asm output-operand list with RESTRICT_ASM_*
 * (note the trailing comma in each RESTRICT_ASM_* expansion).
 */
#define DECLARE_VAR_LOW32       int32_t low32
#define RESTRICT_ASM_LOW32      [low32]"=&r"(low32),
#define DECLARE_VAR_ALL64       int64_t all64
#define RESTRICT_ASM_ALL64      [all64]"=&r"(all64),
#define DECLARE_VAR_ADDRT       mips_reg addrt
#define RESTRICT_ASM_ADDRT      [addrt]"=&r"(addrt),

/* Indexed GPR load/store: reg <-> mem[addr + stride + bias].  The base
 * address is computed into the %[addrt] scratch register first
 * (requires DECLARE_VAR_ADDRT / RESTRICT_ASM_ADDRT in the caller). */
#define MMI_LWX(reg, addr, stride, bias) \
    PTR_ADDU "%[addrt], "#addr", "#stride" \n\t" \
    "lw "#reg", "#bias"(%[addrt]) \n\t"
#define MMI_SWX(reg, addr, stride, bias) \
    PTR_ADDU "%[addrt], "#addr", "#stride" \n\t" \
    "sw "#reg", "#bias"(%[addrt]) \n\t"
#define MMI_LDX(reg, addr, stride, bias) \
    PTR_ADDU "%[addrt], "#addr", "#stride" \n\t" \
    "ld "#reg", "#bias"(%[addrt]) \n\t"
#define MMI_SDX(reg, addr, stride, bias) \
    PTR_ADDU "%[addrt], "#addr", "#stride" \n\t" \
    "sd "#reg", "#bias"(%[addrt]) \n\t"

/* Aligned 32-bit load into an FP register. */
#define MMI_LWC1(fp, addr, bias) \
    "lwc1 "#fp", "#bias"("#addr") \n\t"

/* Unaligned 32-bit load into an FP register, bounced through the low32
 * GPR scratch (requires DECLARE_VAR_LOW32 / RESTRICT_ASM_LOW32). */
#define MMI_ULWC1(fp, addr, bias) \
    "ulw %[low32], "#bias"("#addr") \n\t" \
    "mtc1 %[low32], "#fp" \n\t"

/* Indexed 32-bit FP load (uses the addrt scratch). */
#define MMI_LWXC1(fp, addr, stride, bias) \
    PTR_ADDU "%[addrt], "#addr", "#stride" \n\t" \
    MMI_LWC1(fp, %[addrt], bias)

/* Aligned 32-bit store from an FP register. */
#define MMI_SWC1(fp, addr, bias) \
    "swc1 "#fp", "#bias"("#addr") \n\t"

/* Unaligned 32-bit FP store via the low32 GPR scratch. */
#define MMI_USWC1(fp, addr, bias) \
    "mfc1 %[low32], "#fp" \n\t" \
    "usw %[low32], "#bias"("#addr") \n\t"

/* Indexed 32-bit FP store (uses the addrt scratch). */
#define MMI_SWXC1(fp, addr, stride, bias) \
    PTR_ADDU "%[addrt], "#addr", "#stride" \n\t" \
    MMI_SWC1(fp, %[addrt], bias)

/* Aligned 64-bit load into an FP register. */
#define MMI_LDC1(fp, addr, bias) \
    "ldc1 "#fp", "#bias"("#addr") \n\t"

/* Unaligned 64-bit FP load via the all64 GPR scratch
 * (requires DECLARE_VAR_ALL64 / RESTRICT_ASM_ALL64). */
#define MMI_ULDC1(fp, addr, bias) \
    "uld %[all64], "#bias"("#addr") \n\t" \
    "dmtc1 %[all64], "#fp" \n\t"

/* Indexed 64-bit FP load (uses the addrt scratch). */
#define MMI_LDXC1(fp, addr, stride, bias) \
    PTR_ADDU "%[addrt], "#addr", "#stride" \n\t" \
    MMI_LDC1(fp, %[addrt], bias)

/* Aligned 64-bit store from an FP register. */
#define MMI_SDC1(fp, addr, bias) \
    "sdc1 "#fp", "#bias"("#addr") \n\t"

/* Unaligned 64-bit FP store via the all64 GPR scratch. */
#define MMI_USDC1(fp, addr, bias) \
    "dmfc1 %[all64], "#fp" \n\t" \
    "usd %[all64], "#bias"("#addr") \n\t"

/* Indexed 64-bit FP store (uses the addrt scratch). */
#define MMI_SDXC1(fp, addr, stride, bias) \
    PTR_ADDU "%[addrt], "#addr", "#stride" \n\t" \
    MMI_SDC1(fp, %[addrt], bias)

/* 128-bit (quadword) load/store, emulated as two aligned 64-bit
 * accesses: reg1/fp1 <-> mem[bias], reg2/fp2 <-> mem[bias + 8]. */
#define MMI_LQ(reg1, reg2, addr, bias) \
    "ld "#reg1", "#bias"("#addr") \n\t" \
    "ld "#reg2", 8+"#bias"("#addr") \n\t"
#define MMI_SQ(reg1, reg2, addr, bias) \
    "sd "#reg1", "#bias"("#addr") \n\t" \
    "sd "#reg2", 8+"#bias"("#addr") \n\t"
#define MMI_LQC1(fp1, fp2, addr, bias) \
    "ldc1 "#fp1", "#bias"("#addr") \n\t" \
    "ldc1 "#fp2", 8+"#bias"("#addr") \n\t"
#define MMI_SQC1(fp1, fp2, addr, bias) \
    "sdc1 "#fp1", "#bias"("#addr") \n\t" \
    "sdc1 "#fp2", 8+"#bias"("#addr") \n\t"
#elif HAVE_LOONGSON3 /* !HAVE_LOONGSON2 */
/*
 * Loongson-3 provides the Loongson-EXT (gs*) instructions, which handle
 * indexed, unaligned and 128-bit accesses in a single instruction, so
 * no address/GPR scratch variables are needed — the DECLARE/RESTRICT
 * macros expand to nothing (except the O32 workaround below).
 */
#define DECLARE_VAR_ALL64
#define RESTRICT_ASM_ALL64
#define DECLARE_VAR_ADDRT
#define RESTRICT_ASM_ADDRT

/* Indexed GPR load/store: reg <-> mem[addr + stride + bias]. */
#define MMI_LWX(reg, addr, stride, bias) \
    "gslwx "#reg", "#bias"("#addr", "#stride") \n\t"
#define MMI_SWX(reg, addr, stride, bias) \
    "gsswx "#reg", "#bias"("#addr", "#stride") \n\t"
#define MMI_LDX(reg, addr, stride, bias) \
    "gsldx "#reg", "#bias"("#addr", "#stride") \n\t"
#define MMI_SDX(reg, addr, stride, bias) \
    "gssdx "#reg", "#bias"("#addr", "#stride") \n\t"

/* Aligned 32-bit load into an FP register. */
#define MMI_LWC1(fp, addr, bias) \
    "lwc1 "#fp", "#bias"("#addr") \n\t"

#if _MIPS_SIM == _ABIO32 /* workaround for 3A2000 gslwlc1 bug */
/* On O32, gslwlc1 is avoided (buggy on the 3A2000): do the unaligned
 * 32-bit FP load through a GPR scratch instead, which needs the low32
 * variable declared/bound by the caller. */
#define DECLARE_VAR_LOW32       int32_t low32
#define RESTRICT_ASM_LOW32      [low32]"=&r"(low32),
#define MMI_ULWC1(fp, addr, bias) \
    "ulw %[low32], "#bias"("#addr") \n\t" \
    "mtc1 %[low32], "#fp" \n\t"
#else /* _MIPS_SIM != _ABIO32 */
#define DECLARE_VAR_LOW32
#define RESTRICT_ASM_LOW32
/* Unaligned 32-bit FP load via the left/right instruction pair. */
#define MMI_ULWC1(fp, addr, bias) \
    "gslwlc1 "#fp", 3+"#bias"("#addr") \n\t" \
    "gslwrc1 "#fp", "#bias"("#addr") \n\t"
#endif /* _MIPS_SIM != _ABIO32 */

/* Indexed 32-bit FP load. */
#define MMI_LWXC1(fp, addr, stride, bias) \
    "gslwxc1 "#fp", "#bias"("#addr", "#stride") \n\t"

/* Aligned 32-bit store from an FP register. */
#define MMI_SWC1(fp, addr, bias) \
    "swc1 "#fp", "#bias"("#addr") \n\t"

/* Unaligned 32-bit FP store via the left/right instruction pair. */
#define MMI_USWC1(fp, addr, bias) \
    "gsswlc1 "#fp", 3+"#bias"("#addr") \n\t" \
    "gsswrc1 "#fp", "#bias"("#addr") \n\t"

/* Indexed 32-bit FP store. */
#define MMI_SWXC1(fp, addr, stride, bias) \
    "gsswxc1 "#fp", "#bias"("#addr", "#stride") \n\t"

/* Aligned 64-bit load into an FP register. */
#define MMI_LDC1(fp, addr, bias) \
    "ldc1 "#fp", "#bias"("#addr") \n\t"

/* Unaligned 64-bit FP load via the left/right instruction pair. */
#define MMI_ULDC1(fp, addr, bias) \
    "gsldlc1 "#fp", 7+"#bias"("#addr") \n\t" \
    "gsldrc1 "#fp", "#bias"("#addr") \n\t"

/* Indexed 64-bit FP load. */
#define MMI_LDXC1(fp, addr, stride, bias) \
    "gsldxc1 "#fp", "#bias"("#addr", "#stride") \n\t"

/* Aligned 64-bit store from an FP register. */
#define MMI_SDC1(fp, addr, bias) \
    "sdc1 "#fp", "#bias"("#addr") \n\t"

/* Unaligned 64-bit FP store via the left/right instruction pair. */
#define MMI_USDC1(fp, addr, bias) \
    "gssdlc1 "#fp", 7+"#bias"("#addr") \n\t" \
    "gssdrc1 "#fp", "#bias"("#addr") \n\t"

/* Indexed 64-bit FP store. */
#define MMI_SDXC1(fp, addr, stride, bias) \
    "gssdxc1 "#fp", "#bias"("#addr", "#stride") \n\t"

/* Native 128-bit (quadword) load/store, single instruction. */
#define MMI_LQ(reg1, reg2, addr, bias) \
    "gslq "#reg1", "#reg2", "#bias"("#addr") \n\t"
#define MMI_SQ(reg1, reg2, addr, bias) \
    "gssq "#reg1", "#reg2", "#bias"("#addr") \n\t"
#define MMI_LQC1(fp1, fp2, addr, bias) \
    "gslqc1 "#fp1", "#fp2", "#bias"("#addr") \n\t"
#define MMI_SQC1(fp1, fp2, addr, bias) \
    "gssqc1 "#fp1", "#fp2", "#bias"("#addr") \n\t"
#endif /* HAVE_LOONGSON2 */
/**
 * Save the FP registers that the surrounding MMI code clobbers into a
 * 16-byte-aligned on-stack buffer, using quadword stores (gssqc1).
 * Declares `temp_backup_reg` in the enclosing scope; RECOVER_REG reads
 * it back, so both macros must be used in the same C block.
 * Under _ABI64 the pairs $f24..$f31 are saved; otherwise the even
 * registers $f20..$f30 (O32-style FP register pairing).
 * NOTE(review): gssqc1 is a Loongson-EXT instruction — presumably this
 * macro is only reached on Loongson-3 builds; verify against callers.
 */
#define BACKUP_REG \
    LOCAL_ALIGNED_16(double, temp_backup_reg, [8]);             \
    if (_MIPS_SIM == _ABI64)                                    \
        __asm__ volatile (                                      \
            "gssqc1 $f25, $f24, 0x00(%[temp]) \n\t"             \
            "gssqc1 $f27, $f26, 0x10(%[temp]) \n\t"             \
            "gssqc1 $f29, $f28, 0x20(%[temp]) \n\t"             \
            "gssqc1 $f31, $f30, 0x30(%[temp]) \n\t"             \
            :                                                   \
            : [temp]"r"(temp_backup_reg)                        \
            : "memory"                                          \
        );                                                      \
    else                                                        \
        __asm__ volatile (                                      \
            "gssqc1 $f22, $f20, 0x00(%[temp]) \n\t"             \
            "gssqc1 $f26, $f24, 0x10(%[temp]) \n\t"             \
            "gssqc1 $f30, $f28, 0x20(%[temp]) \n\t"             \
            :                                                   \
            : [temp]"r"(temp_backup_reg)                        \
            : "memory"                                          \
        );
/**
 * Restore the FP registers previously saved by BACKUP_REG from the
 * `temp_backup_reg` buffer it declared (must run in the same C block,
 * with a matching _MIPS_SIM branch).  Mirrors BACKUP_REG exactly:
 * quadword loads (gslqc1) of $f24..$f31 under _ABI64, otherwise the
 * even registers $f20..$f30.
 */
#define RECOVER_REG \
    if (_MIPS_SIM == _ABI64)                                    \
        __asm__ volatile (                                      \
            "gslqc1 $f25, $f24, 0x00(%[temp]) \n\t"             \
            "gslqc1 $f27, $f26, 0x10(%[temp]) \n\t"             \
            "gslqc1 $f29, $f28, 0x20(%[temp]) \n\t"             \
            "gslqc1 $f31, $f30, 0x30(%[temp]) \n\t"             \
            :                                                   \
            : [temp]"r"(temp_backup_reg)                        \
            : "memory"                                          \
        );                                                      \
    else                                                        \
        __asm__ volatile (                                      \
            "gslqc1 $f22, $f20, 0x00(%[temp]) \n\t"             \
            "gslqc1 $f26, $f24, 0x10(%[temp]) \n\t"             \
            "gslqc1 $f30, $f28, 0x20(%[temp]) \n\t"             \
            :                                                   \
            : [temp]"r"(temp_backup_reg)                        \
            : "memory"                                          \
        );
/**
 * Transpose a 2x2 matrix of 32-bit words (one row per 64-bit register).
 * fr_i0, fr_i1: src rows.
 * fr_o0, fr_o1: dst rows — must be distinct registers from the sources,
 *               since fr_i0/fr_i1 are read again by the second insn.
 */
#define TRANSPOSE_2W(fr_i0, fr_i1, fr_o0, fr_o1) \
    "punpcklwd "#fr_o0", "#fr_i0", "#fr_i1" \n\t" \
    "punpckhwd "#fr_o1", "#fr_i0", "#fr_i1" \n\t"
/**
 * Transpose a 4x4 matrix of 16-bit elements (one row per 64-bit reg),
 * in place: interleave halfwords into the temporaries, then interleave
 * words back into the inputs.
 * fr_i0..fr_i3: src & dst rows.
 * fr_t0..fr_t3: temporaries (clobbered; must be distinct from inputs).
 */
#define TRANSPOSE_4H(fr_i0, fr_i1, fr_i2, fr_i3, \
                     fr_t0, fr_t1, fr_t2, fr_t3) \
    "punpcklhw "#fr_t0", "#fr_i0", "#fr_i1" \n\t" \
    "punpckhhw "#fr_t1", "#fr_i0", "#fr_i1" \n\t" \
    "punpcklhw "#fr_t2", "#fr_i2", "#fr_i3" \n\t" \
    "punpckhhw "#fr_t3", "#fr_i2", "#fr_i3" \n\t" \
    "punpcklwd "#fr_i0", "#fr_t0", "#fr_t2" \n\t" \
    "punpckhwd "#fr_i1", "#fr_t0", "#fr_t2" \n\t" \
    "punpcklwd "#fr_i2", "#fr_t1", "#fr_t3" \n\t" \
    "punpckhwd "#fr_i3", "#fr_t1", "#fr_t3" \n\t"
/**
 * Transpose an 8x8 matrix of bytes (one row per 64-bit register),
 * in place.  Only four temporaries are available, so intermediate
 * results are rotated through the input registers across the three
 * interleave stages (byte -> halfword -> word granularity).
 * fr_i0..fr_i7: src & dst rows.
 * fr_t0..fr_t3: temporaries (clobbered; must be distinct from inputs).
 */
#define TRANSPOSE_8B(fr_i0, fr_i1, fr_i2, fr_i3, fr_i4, fr_i5, \
                     fr_i6, fr_i7, fr_t0, fr_t1, fr_t2, fr_t3) \
    "punpcklbh "#fr_t0", "#fr_i0", "#fr_i1" \n\t" \
    "punpckhbh "#fr_t1", "#fr_i0", "#fr_i1" \n\t" \
    "punpcklbh "#fr_t2", "#fr_i2", "#fr_i3" \n\t" \
    "punpckhbh "#fr_t3", "#fr_i2", "#fr_i3" \n\t" \
    "punpcklbh "#fr_i0", "#fr_i4", "#fr_i5" \n\t" \
    "punpckhbh "#fr_i1", "#fr_i4", "#fr_i5" \n\t" \
    "punpcklbh "#fr_i2", "#fr_i6", "#fr_i7" \n\t" \
    "punpckhbh "#fr_i3", "#fr_i6", "#fr_i7" \n\t" \
    "punpcklhw "#fr_i4", "#fr_t0", "#fr_t2" \n\t" \
    "punpckhhw "#fr_i5", "#fr_t0", "#fr_t2" \n\t" \
    "punpcklhw "#fr_i6", "#fr_t1", "#fr_t3" \n\t" \
    "punpckhhw "#fr_i7", "#fr_t1", "#fr_t3" \n\t" \
    "punpcklhw "#fr_t0", "#fr_i0", "#fr_i2" \n\t" \
    "punpckhhw "#fr_t1", "#fr_i0", "#fr_i2" \n\t" \
    "punpcklhw "#fr_t2", "#fr_i1", "#fr_i3" \n\t" \
    "punpckhhw "#fr_t3", "#fr_i1", "#fr_i3" \n\t" \
    "punpcklwd "#fr_i0", "#fr_i4", "#fr_t0" \n\t" \
    "punpckhwd "#fr_i1", "#fr_i4", "#fr_t0" \n\t" \
    "punpcklwd "#fr_i2", "#fr_i5", "#fr_t1" \n\t" \
    "punpckhwd "#fr_i3", "#fr_i5", "#fr_t1" \n\t" \
    "punpcklwd "#fr_i4", "#fr_i6", "#fr_t2" \n\t" \
    "punpckhwd "#fr_i5", "#fr_i6", "#fr_t2" \n\t" \
    "punpcklwd "#fr_i6", "#fr_i7", "#fr_t3" \n\t" \
    "punpckhwd "#fr_i7", "#fr_i7", "#fr_t3" \n\t"
/**
 * Parallel arithmetic shift right of 8 packed bytes.
 * Each byte is widened into the high byte of a halfword (punpck*bh),
 * shifted arithmetically as a halfword, then repacked with signed
 * saturation (packsshb).
 * fr_i0: src bytes.
 * fr_i1: halfword shift count = desired byte shift + 8 (the extra 8
 *        discards the don't-care low byte brought in from fr_t0/fr_t1).
 * fr_t0, fr_t1: temporaries; their incoming contents are irrelevant.
 * fr_d0: dst.
 */
#define PSRAB_MMI(fr_i0, fr_i1, fr_t0, fr_t1, fr_d0) \
    "punpcklbh "#fr_t0", "#fr_t0", "#fr_i0" \n\t" \
    "punpckhbh "#fr_t1", "#fr_t1", "#fr_i0" \n\t" \
    "psrah "#fr_t0", "#fr_t0", "#fr_i1" \n\t" \
    "psrah "#fr_t1", "#fr_t1", "#fr_i1" \n\t" \
    "packsshb "#fr_d0", "#fr_t0", "#fr_t1" \n\t"
/**
 * Parallel logical shift right of 8 packed bytes.
 * Same widening scheme as PSRAB_MMI but with a logical halfword shift
 * (psrlh), so results are zero-extended.
 * fr_i0: src bytes.
 * fr_i1: halfword shift count = desired byte shift + 8.
 * fr_t0, fr_t1: temporaries; incoming contents are irrelevant.
 * fr_d0: dst.
 * NOTE(review): the final pack uses signed saturation (packsshb), so a
 * logically-shifted result >= 128 (possible only with a byte shift of
 * 0) would clamp to 127 — presumably callers always shift by >= 1;
 * verify against call sites.
 */
#define PSRLB_MMI(fr_i0, fr_i1, fr_t0, fr_t1, fr_d0) \
    "punpcklbh "#fr_t0", "#fr_t0", "#fr_i0" \n\t" \
    "punpckhbh "#fr_t1", "#fr_t1", "#fr_i0" \n\t" \
    "psrlh "#fr_t0", "#fr_t0", "#fr_i1" \n\t" \
    "psrlh "#fr_t1", "#fr_t1", "#fr_i1" \n\t" \
    "packsshb "#fr_d0", "#fr_t0", "#fr_t1" \n\t"
/* Arithmetic shift right of packed halfwords in four registers,
 * in place, by the count held in `shift`. */
#define PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift) \
    "psrah "#fp1", "#fp1", "#shift" \n\t" \
    "psrah "#fp2", "#fp2", "#shift" \n\t" \
    "psrah "#fp3", "#fp3", "#shift" \n\t" \
    "psrah "#fp4", "#fp4", "#shift" \n\t"
/* Arithmetic shift right of packed halfwords in eight registers,
 * in place — two PSRAH_4_MMI applications. */
#define PSRAH_8_MMI(fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8, shift) \
    PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift) \
    PSRAH_4_MMI(fp5, fp6, fp7, fp8, shift)
/**
 * Per-32-bit-lane rounded right shift:
 *     fr_i0 = (fr_i0 + (1 << (n - 1))) >> n   (arithmetic shift)
 * where n is the per-lane shift count in fr_i1.
 * Sequence: load 1 into the GPR, move it to fr_t0 and duplicate it into
 * both word lanes; fr_t1 = n - 1; fr_t1 = 1 << (n - 1); add the
 * rounding bias; arithmetic shift right by n.
 * fr_i0: src & dst.
 * fr_i1: shift count n.
 * fr_t0, fr_t1: FP temporaries (clobbered).
 * gr_t0: GPR temporary (clobbered).
 */
#define ROUND_POWER_OF_TWO_MMI(fr_i0, fr_i1, fr_t0, fr_t1, gr_t0) \
    "li "#gr_t0", 0x01 \n\t" \
    "dmtc1 "#gr_t0", "#fr_t0" \n\t" \
    "punpcklwd "#fr_t0", "#fr_t0", "#fr_t0" \n\t" \
    "psubw "#fr_t1", "#fr_i1", "#fr_t0" \n\t" \
    "psllw "#fr_t1", "#fr_t0", "#fr_t1" \n\t" \
    "paddw "#fr_i0", "#fr_i0", "#fr_t1" \n\t" \
    "psraw "#fr_i0", "#fr_i0", "#fr_i1" \n\t"
#endif /* AVUTIL_MIPS_MMIUTILS_H */