You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

986 lines
55KB

  1. /*
  2. * Loongson SIMD optimized h264pred
  3. *
  4. * Copyright (c) 2015 Loongson Technology Corporation Limited
  5. * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
  6. * Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
  7. *
  8. * This file is part of FFmpeg.
  9. *
  10. * FFmpeg is free software; you can redistribute it and/or
  11. * modify it under the terms of the GNU Lesser General Public
  12. * License as published by the Free Software Foundation; either
  13. * version 2.1 of the License, or (at your option) any later version.
  14. *
  15. * FFmpeg is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. * Lesser General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU Lesser General Public
  21. * License along with FFmpeg; if not, write to the Free Software
  22. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. */
  24. #include "h264pred_mips.h"
  25. #include "libavcodec/bit_depth_template.c"
  26. #include "libavutil/mips/mmiutils.h"
  27. #include "constants.h"
  /*
   * 16x16 vertical intra prediction, 8-bit samples.
   *
   * Reads the 16 bytes of the row directly above the block (src - stride)
   * and writes them to each of the 16 rows starting at src.
   *
   * src:    address of the top-left sample of the block to fill
   * stride: byte distance between two consecutive rows
   *
   * MMI_LDC1 / MMI_SDC1 are macros defined outside this file; they are used
   * here to move 8 bytes between memory and an ftmp register.
   * NOTE(review): presumably they require 8-byte aligned addresses - confirm
   * against mmiutils.h.
   */
  28. void ff_pred16x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
  29. {
  30. double ftmp[2];
  31. uint64_t tmp[1];
  32. DECLARE_VAR_ALL64;
  33. __asm__ volatile (
       /* tmp0 = loop counter: 8 iterations, two rows written per iteration */
  34. "dli %[tmp0], 0x08 \n\t"
       /* fetch the 16 top-neighbour bytes once (two 8-byte halves) */
  35. MMI_LDC1(%[ftmp0], %[srcA], 0x00)
  36. MMI_LDC1(%[ftmp1], %[srcA], 0x08)
  37. "1: \n\t"
       /* first row of the pair */
  38. MMI_SDC1(%[ftmp0], %[src], 0x00)
  39. MMI_SDC1(%[ftmp1], %[src], 0x08)
  40. PTR_ADDU "%[src], %[src], %[stride] \n\t"
       /* second row of the pair */
  41. MMI_SDC1(%[ftmp0], %[src], 0x00)
  42. MMI_SDC1(%[ftmp1], %[src], 0x08)
  43. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  44. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  45. "bnez %[tmp0], 1b \n\t"
  46. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  47. [tmp0]"=&r"(tmp[0]),
  48. RESTRICT_ASM_ALL64
       /* src is advanced inside the asm, hence read-write */
  49. [src]"+&r"(src)
       /* srcA = address of the row above the block */
  50. : [stride]"r"((mips_reg)stride), [srcA]"r"((mips_reg)(src-stride))
  51. : "memory"
  52. );
  53. }
  /*
   * 16x16 horizontal intra prediction, 8-bit samples.
   *
   * For each of the 16 rows, reads the byte immediately left of the row
   * (src[-1 + y*stride]) and fills the 16 bytes of that row with it.
   *
   * src:    address of the top-left sample of the block to fill
   * stride: byte distance between two consecutive rows
   *
   * ff_pb_1 is defined outside this file; presumably it holds 0x01 in every
   * byte so that the multiply replicates the pixel into every byte - confirm
   * against constants.h.
   * NOTE(review): swl/swr are 32-bit partial-word stores. Used with offsets
   * 7/0 and 15/8 they appear to cover the full 16 bytes only when the row
   * address is 4-byte aligned - confirm the alignment guarantee of callers.
   */
  54. void ff_pred16x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
  55. {
  56. uint64_t tmp[3];
  57. mips_reg addr[2];
  58. __asm__ volatile (
       /* addr0 walks the left-neighbour column, addr1 walks the output rows */
  59. PTR_ADDI "%[addr0], %[src], -0x01 \n\t"
  60. PTR_ADDU "%[addr1], %[src], $0 \n\t"
       /* tmp2 = loop counter: 8 iterations, two rows per iteration */
  61. "dli %[tmp2], 0x08 \n\t"
  62. "1: \n\t"
       /* first row of the pair: load left pixel, replicate, store */
  63. "lbu %[tmp0], 0x00(%[addr0]) \n\t"
  64. "dmul %[tmp1], %[tmp0], %[ff_pb_1] \n\t"
  65. "swl %[tmp1], 0x07(%[addr1]) \n\t"
  66. "swr %[tmp1], 0x00(%[addr1]) \n\t"
  67. "swl %[tmp1], 0x0f(%[addr1]) \n\t"
  68. "swr %[tmp1], 0x08(%[addr1]) \n\t"
  69. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  70. PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
       /* second row of the pair */
  71. "lbu %[tmp0], 0x00(%[addr0]) \n\t"
  72. "dmul %[tmp1], %[tmp0], %[ff_pb_1] \n\t"
  73. "swl %[tmp1], 0x07(%[addr1]) \n\t"
  74. "swr %[tmp1], 0x00(%[addr1]) \n\t"
  75. "swl %[tmp1], 0x0f(%[addr1]) \n\t"
  76. "swr %[tmp1], 0x08(%[addr1]) \n\t"
  77. "daddi %[tmp2], %[tmp2], -0x01 \n\t"
  78. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  79. PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
  80. "bnez %[tmp2], 1b \n\t"
  81. : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  82. [tmp2]"=&r"(tmp[2]),
  83. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
  84. : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride),
  85. [ff_pb_1]"r"(ff_pb_1)
  86. : "memory"
  87. );
  88. }
  /*
   * 16x16 DC intra prediction, 8-bit samples.
   *
   * Sums the 16 bytes of the column left of the block and the 16 bytes of
   * the row above it, computes (sum + 16) >> 5, and fills all 16 rows of
   * the block with that value.
   *
   * src:    address of the top-left sample of the block to fill
   * stride: byte distance between two consecutive rows
   *
   * ff_pb_1 is defined outside this file; presumably it holds 0x01 in every
   * byte so that the multiply replicates the DC value - confirm.
   * NOTE(review): the addr1 operand is declared in the output list but is
   * not referenced anywhere in the asm template.
   * NOTE(review): the fill uses 32-bit swl/swr partial-word stores at
   * offsets 7/0 and 15/8; full coverage of each row appears to rely on the
   * row address being 4-byte aligned - confirm.
   */
  89. void ff_pred16x16_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
  90. {
  91. uint64_t tmp[4];
  92. mips_reg addr[2];
  93. __asm__ volatile (
       /* pass 1: accumulate the 16 left neighbours into tmp3 */
  94. PTR_ADDI "%[addr0], %[src], -0x01 \n\t"
  95. "dli %[tmp0], 0x08 \n\t"
       /* tmp3 = 0 (running sum) */
  96. "xor %[tmp3], %[tmp3], %[tmp3] \n\t"
  97. "1: \n\t"
  98. "lbu %[tmp1], 0x00(%[addr0]) \n\t"
  99. "daddu %[tmp3], %[tmp3], %[tmp1] \n\t"
  100. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  101. "lbu %[tmp1], 0x00(%[addr0]) \n\t"
  102. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  103. "daddu %[tmp3], %[tmp3], %[tmp1] \n\t"
  104. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  105. "bnez %[tmp0], 1b \n\t"
       /* pass 2: accumulate the 16 top neighbours, starting at src - stride */
  106. "dli %[tmp0], 0x08 \n\t"
  107. PTR_SUBU "%[addr0], %[src], %[stride] \n\t"
  108. "2: \n\t"
  109. "lbu %[tmp1], 0x00(%[addr0]) \n\t"
  110. "daddu %[tmp3], %[tmp3], %[tmp1] \n\t"
  111. PTR_ADDIU "%[addr0], %[addr0], 0x01 \n\t"
  112. "lbu %[tmp1], 0x00(%[addr0]) \n\t"
  113. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  114. "daddu %[tmp3], %[tmp3], %[tmp1] \n\t"
  115. PTR_ADDIU "%[addr0], %[addr0], 0x01 \n\t"
  116. "bnez %[tmp0], 2b \n\t"
       /* dc = (sum + 16) >> 5, then replicated through the ff_pb_1 multiply */
  117. "daddiu %[tmp3], %[tmp3], 0x10 \n\t"
  118. "dsra %[tmp3], 0x05 \n\t"
  119. "dmul %[tmp2], %[tmp3], %[ff_pb_1] \n\t"
       /* pass 3: write the replicated value to 16 rows, two rows per iteration */
  120. PTR_ADDU "%[addr0], %[src], $0 \n\t"
  121. "dli %[tmp0], 0x08 \n\t"
  122. "3: \n\t"
  123. "swl %[tmp2], 0x07(%[addr0]) \n\t"
  124. "swr %[tmp2], 0x00(%[addr0]) \n\t"
  125. "swl %[tmp2], 0x0f(%[addr0]) \n\t"
  126. "swr %[tmp2], 0x08(%[addr0]) \n\t"
  127. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  128. "swl %[tmp2], 0x07(%[addr0]) \n\t"
  129. "swr %[tmp2], 0x00(%[addr0]) \n\t"
  130. "swl %[tmp2], 0x0f(%[addr0]) \n\t"
  131. "swr %[tmp2], 0x08(%[addr0]) \n\t"
  132. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  133. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  134. "bnez %[tmp0], 3b \n\t"
  135. : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  136. [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
  137. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
  138. : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride),
  139. [ff_pb_1]"r"(ff_pb_1)
  140. : "memory"
  141. );
  142. }
  /*
   * 8x8 luma "top DC" intra prediction, 8-bit samples.
   *
   * Stage 1 (first asm): loads three 8-byte windows of the row above the
   * block, starting at src-stride-1, src-stride and src-stride+1, widens
   * them to halfwords and computes per lane (A + 2*B + C + 2) >> 2, where
   * B is the src-stride window. The eight filtered bytes are summed, and
   * dc = ((sum + 4) >> 3) * ff_pb_1.
   * Stage 2 (second asm): fills the 8 rows of the block with dc.
   *
   * src:          address of the top-left sample of the block to fill
   * has_topleft:  when zero, halfword 0 of the src-stride-1 window is
   *               patched from the src-stride window before filtering
   * has_topright: when zero, halfword 3 of the upper half of the
   *               src-stride+1 window is patched from the upper half of the
   *               src-stride window before filtering
   * stride:       byte distance between two consecutive rows
   *
   * ff_pb_1 / ff_pw_2 are defined outside this file; presumably 0x01 in
   * every byte and 2 in every halfword respectively - confirm.
   * NOTE(review): pinsrh_N is understood to insert the LOWEST halfword of
   * its last operand. If so, the has_topright == 0 path inserts the sample
   * at column 4 rather than column 7 - confirm with the Loongson MMI manual.
   * NOTE(review): tmp[] has three elements but only tmp[0] and tmp[1] are
   * bound to operands.
   */
  143. void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft,
  144. int has_topright, ptrdiff_t stride)
  145. {
  146. uint32_t dc;
  147. double ftmp[11];
  148. mips_reg tmp[3];
  149. DECLARE_VAR_ALL64;
  150. DECLARE_VAR_ADDRT;
  151. __asm__ volatile (
       /* ftmp0 = 0, used as the zero source when widening bytes to halfwords */
  152. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
       /* three overlapping windows of the top row (offsets -1, 0, +1) */
  153. MMI_ULDC1(%[ftmp10], %[srcA], 0x00)
  154. MMI_ULDC1(%[ftmp9], %[src0], 0x00)
  155. MMI_ULDC1(%[ftmp8], %[src1], 0x00)
       /* widen each window into a low and a high group of four halfwords */
  156. "punpcklbh %[ftmp7], %[ftmp10], %[ftmp0] \n\t"
  157. "punpckhbh %[ftmp6], %[ftmp10], %[ftmp0] \n\t"
  158. "punpcklbh %[ftmp5], %[ftmp9], %[ftmp0] \n\t"
  159. "punpckhbh %[ftmp4], %[ftmp9], %[ftmp0] \n\t"
  160. "punpcklbh %[ftmp3], %[ftmp8], %[ftmp0] \n\t"
  161. "punpckhbh %[ftmp2], %[ftmp8], %[ftmp0] \n\t"
       /* edge patch-ups, each skipped when the corresponding flag is non-zero */
  162. "bnez %[has_topleft], 1f \n\t"
  163. "pinsrh_0 %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  164. "1: \n\t"
  165. "bnez %[has_topright], 2f \n\t"
  166. "pinsrh_3 %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  167. "2: \n\t"
       /* ftmp1 = shift count 2 */
  168. "dli %[tmp0], 0x02 \n\t"
  169. "mtc1 %[tmp0], %[ftmp1] \n\t"
       /* (A + 2*B + C + 2) >> 2 on both halfword groups */
  170. "pmullh %[ftmp5], %[ftmp5], %[ff_pw_2] \n\t"
  171. "pmullh %[ftmp4], %[ftmp4], %[ff_pw_2] \n\t"
  172. "paddh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  173. "paddh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
  174. "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
  175. "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
  176. "paddh %[ftmp7], %[ftmp7], %[ff_pw_2] \n\t"
  177. "paddh %[ftmp6], %[ftmp6], %[ff_pw_2] \n\t"
  178. "psrah %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
  179. "psrah %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
       /* back to 8 bytes, then horizontal byte sum */
  180. "packushb %[ftmp9], %[ftmp7], %[ftmp6] \n\t"
  181. "biadd %[ftmp10], %[ftmp9] \n\t"
  182. "mfc1 %[tmp1], %[ftmp10] \n\t"
       /* rounded average of the 8 filtered samples, then replicate */
  183. "addiu %[tmp1], %[tmp1], 0x04 \n\t"
  184. "srl %[tmp1], %[tmp1], 0x03 \n\t"
  185. "mul %[dc], %[tmp1], %[ff_pb_1] \n\t"
  186. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  187. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  188. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  189. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  190. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  191. [ftmp10]"=&f"(ftmp[10]),
  192. [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  193. RESTRICT_ASM_ALL64
  194. [dc]"=r"(dc)
  195. : [srcA]"r"((mips_reg)(src-stride-1)),
  196. [src0]"r"((mips_reg)(src-stride)),
  197. [src1]"r"((mips_reg)(src-stride+1)),
  198. [has_topleft]"r"(has_topleft), [has_topright]"r"(has_topright),
  199. [ff_pb_1]"r"(ff_pb_1), [ff_pw_2]"f"(ff_pw_2)
  200. : "memory"
  201. );
       /* Stage 2: fill 8 rows; 2 iterations, 4 rows per iteration. */
  202. __asm__ volatile (
  203. "dli %[tmp0], 0x02 \n\t"
       /* duplicate the 32-bit dc into both halves of a 64-bit register */
  204. "punpcklwd %[ftmp0], %[dc], %[dc] \n\t"
  205. "1: \n\t"
       /* rows y and y+1 (MMI_SDXC1: presumably an indexed store at src + stride) */
  206. MMI_SDC1(%[ftmp0], %[src], 0x00)
  207. MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
  208. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  209. PTR_ADDU "%[src], %[src], %[stride] \n\t"
       /* rows y+2 and y+3 */
  210. MMI_SDC1(%[ftmp0], %[src], 0x00)
  211. MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
  212. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  213. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  214. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  215. "bnez %[tmp0], 1b \n\t"
  216. : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
  217. RESTRICT_ASM_ALL64
  218. RESTRICT_ASM_ADDRT
  219. [src]"+&r"(src)
  220. : [dc]"f"(dc), [stride]"r"((mips_reg)stride)
  221. : "memory"
  222. );
  223. }
  /*
   * 8x8 luma DC intra prediction, 8-bit samples.
   *
   * The left neighbour column is filtered in C (l0..l7), the top neighbour
   * row is filtered and summed in the first asm statement (dc2). The block
   * is then filled with ((sum(l0..l7) + dc2 + 8) >> 4) replicated into
   * every byte (second asm statement).
   *
   * src:          address of the top-left sample of the block to fill
   * has_topleft:  selects whether src[-1 - stride] or src[-1] is used as
   *               the first tap of l0, and (when zero) triggers a patch of
   *               halfword 0 of the src-stride-1 window in the asm
   * has_topright: when zero, an extra patch of halfword 3 is applied in
   *               the asm before filtering
   * stride:       byte distance between two consecutive rows
   */
  224. void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
  225. ptrdiff_t stride)
  226. {
  227. uint32_t dc, dc1, dc2;
  228. double ftmp[14];
  229. mips_reg tmp[1];
       /* 3-tap (1,2,1) filter down the left column; l7 reuses row 7 as its
        * lower tap because there is no row 8 neighbour in the expression */
  230. const int l0 = ((has_topleft ? src[-1+-1*stride] : src[-1+0*stride]) + 2*src[-1+0*stride] + src[-1+1*stride] + 2) >> 2;
  231. const int l1 = (src[-1+0*stride] + 2*src[-1+1*stride] + src[-1+2*stride] + 2) >> 2;
  232. const int l2 = (src[-1+1*stride] + 2*src[-1+2*stride] + src[-1+3*stride] + 2) >> 2;
  233. const int l3 = (src[-1+2*stride] + 2*src[-1+3*stride] + src[-1+4*stride] + 2) >> 2;
  234. const int l4 = (src[-1+3*stride] + 2*src[-1+4*stride] + src[-1+5*stride] + 2) >> 2;
  235. const int l5 = (src[-1+4*stride] + 2*src[-1+5*stride] + src[-1+6*stride] + 2) >> 2;
  236. const int l6 = (src[-1+5*stride] + 2*src[-1+6*stride] + src[-1+7*stride] + 2) >> 2;
  237. const int l7 = (src[-1+6*stride] + 2*src[-1+7*stride] + src[-1+7*stride] + 2) >> 2;
  238. DECLARE_VAR_ALL64;
  239. DECLARE_VAR_ADDRT;
  240. __asm__ volatile (
       /* three overlapping windows of the top row (offsets -1, 0, +1) */
  241. MMI_ULDC1(%[ftmp4], %[srcA], 0x00)
  242. MMI_ULDC1(%[ftmp5], %[src0], 0x00)
  243. MMI_ULDC1(%[ftmp6], %[src1], 0x00)
       /* ftmp0 = 0 for byte-to-halfword widening; ftmp1 = 3 (pshufh selector) */
  244. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  245. "dli %[tmp0], 0x03 \n\t"
  246. "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t"
  247. "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
  248. "mtc1 %[tmp0], %[ftmp1] \n\t"
  249. "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t"
  250. "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t"
  251. "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t"
  252. "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t"
       /* NOTE(review): the shuffle/insert pairs below appear to exchange
        * halfword 3 between the upper halves of the -1 and +1 windows;
        * since both are later added into the same accumulator the sum is
        * unchanged - confirm against the Loongson MMI manual */
  253. "pshufh %[ftmp3], %[ftmp8], %[ftmp1] \n\t"
  254. "pshufh %[ftmp13], %[ftmp12], %[ftmp1] \n\t"
  255. "pinsrh_3 %[ftmp8], %[ftmp8], %[ftmp13] \n\t"
  256. "pinsrh_3 %[ftmp12], %[ftmp12], %[ftmp3] \n\t"
       /* edge patch-ups, each skipped when the corresponding flag is non-zero */
  257. "bnez %[has_topleft], 1f \n\t"
  258. "pinsrh_0 %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
  259. "1: \n\t"
  260. "bnez %[has_topright], 2f \n\t"
  261. "pshufh %[ftmp13], %[ftmp10], %[ftmp1] \n\t"
  262. "pinsrh_3 %[ftmp8], %[ftmp8], %[ftmp13] \n\t"
  263. "2: \n\t"
       /* ftmp1 = 2 (shift count); ftmp2 = ftmp1 shuffled with an all-zero
        * selector, used both as the x2 multiplier and the +2 rounding term
        * (presumably 2 in every halfword - confirm) */
  264. "dli %[tmp0], 0x02 \n\t"
  265. "mtc1 %[tmp0], %[ftmp1] \n\t"
  266. "pshufh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
       /* (A + 2*B + C + 2) >> 2 on both halfword groups */
  267. "pmullh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
  268. "pmullh %[ftmp10], %[ftmp10], %[ftmp2] \n\t"
  269. "paddh %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
  270. "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
  271. "paddh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
  272. "paddh %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
  273. "paddh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
  274. "paddh %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
  275. "psrah %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
  276. "psrah %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
       /* pack to bytes and sum them; the sum is returned in dc2 */
  277. "packushb %[ftmp5], %[ftmp7], %[ftmp8] \n\t"
  278. "biadd %[ftmp4], %[ftmp5] \n\t"
  279. "mfc1 %[dc2], %[ftmp4] \n\t"
  280. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  281. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  282. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  283. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  284. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  285. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  286. [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
  287. [tmp0]"=&r"(tmp[0]),
  288. RESTRICT_ASM_ALL64
  289. [dc2]"=r"(dc2)
  290. : [srcA]"r"((mips_reg)(src-stride-1)),
  291. [src0]"r"((mips_reg)(src-stride)),
  292. [src1]"r"((mips_reg)(src-stride+1)),
  293. [has_topleft]"r"(has_topleft), [has_topright]"r"(has_topright)
  294. : "memory"
  295. );
       /* dc1 = filtered left sum; average over 16 samples, replicate to 4 bytes */
  296. dc1 = l0+l1+l2+l3+l4+l5+l6+l7;
  297. dc = ((dc1+dc2+8)>>4)*0x01010101U;
       /* fill 8 rows; 2 iterations, 4 rows per iteration */
  298. __asm__ volatile (
  299. "dli %[tmp0], 0x02 \n\t"
       /* duplicate the 32-bit dc into both halves of a 64-bit register */
  300. "punpcklwd %[ftmp0], %[dc], %[dc] \n\t"
  301. "1: \n\t"
  302. MMI_SDC1(%[ftmp0], %[src], 0x00)
  303. MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
  304. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  305. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  306. MMI_SDC1(%[ftmp0], %[src], 0x00)
  307. MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
  308. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  309. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  310. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  311. "bnez %[tmp0], 1b \n\t"
  312. : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
  313. RESTRICT_ASM_ALL64
  314. RESTRICT_ASM_ADDRT
  315. [src]"+&r"(src)
  316. : [dc]"f"(dc), [stride]"r"((mips_reg)stride)
  317. : "memory"
  318. );
  319. }
  320. void ff_pred8x8l_vertical_8_mmi(uint8_t *src, int has_topleft,
  321. int has_topright, ptrdiff_t stride)
  322. {
  323. double ftmp[12];
  324. mips_reg tmp[1];
  325. DECLARE_VAR_ALL64;
  326. __asm__ volatile (
  327. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  328. MMI_LDC1(%[ftmp3], %[srcA], 0x00)
  329. MMI_LDC1(%[ftmp4], %[src0], 0x00)
  330. MMI_LDC1(%[ftmp5], %[src1], 0x00)
  331. "punpcklbh %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
  332. "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
  333. "punpcklbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
  334. "punpckhbh %[ftmp9], %[ftmp4], %[ftmp0] \n\t"
  335. "punpcklbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t"
  336. "punpckhbh %[ftmp11], %[ftmp5], %[ftmp0] \n\t"
  337. "bnez %[has_topleft], 1f \n\t"
  338. "pinsrh_0 %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
  339. "1: \n\t"
  340. "bnez %[has_topright], 2f \n\t"
  341. "pinsrh_3 %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
  342. "2: \n\t"
  343. "dli %[tmp0], 0x02 \n\t"
  344. "mtc1 %[tmp0], %[ftmp1] \n\t"
  345. "pshufh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
  346. "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
  347. "pmullh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
  348. "paddh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
  349. "paddh %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
  350. "paddh %[ftmp6], %[ftmp6], %[ftmp10] \n\t"
  351. "paddh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
  352. "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
  353. "paddh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
  354. "psrah %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
  355. "psrah %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
  356. "packushb %[ftmp4], %[ftmp6], %[ftmp7] \n\t"
  357. MMI_SDC1(%[ftmp4], %[src], 0x00)
  358. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  359. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  360. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  361. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  362. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  363. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  364. [tmp0]"=&r"(tmp[0]),
  365. RESTRICT_ASM_ALL64
  366. [src]"=r"(src)
  367. : [srcA]"r"((mips_reg)(src-stride-1)),
  368. [src0]"r"((mips_reg)(src-stride)),
  369. [src1]"r"((mips_reg)(src-stride+1)),
  370. [has_topleft]"r"(has_topleft), [has_topright]"r"(has_topright)
  371. : "memory"
  372. );
  373. __asm__ volatile (
  374. "dli %[tmp0], 0x02 \n\t"
  375. "1: \n\t"
  376. MMI_SDC1(%[ftmp0], %[src], 0x00)
  377. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  378. MMI_SDC1(%[ftmp0], %[src], 0x00)
  379. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  380. MMI_SDC1(%[ftmp0], %[src], 0x00)
  381. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  382. MMI_SDC1(%[ftmp0], %[src], 0x00)
  383. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  384. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  385. "bnez %[tmp0], 1b \n\t"
  386. : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
  387. RESTRICT_ASM_ALL64
  388. [src]"+&r"(src)
  389. : [stride]"r"((mips_reg)stride)
  390. : "memory"
  391. );
  392. }
  /*
   * 4x4 DC intra prediction, 8-bit samples.
   *
   * dc is the rounded average of the four samples above the block and the
   * four samples left of it: (sum + 4) >> 3. Each of the four rows of the
   * block is then written with dc multiplied by ff_pb_1.
   *
   * src:      address of the top-left sample of the block to fill
   * topright: not referenced in this function
   * stride:   byte distance between two consecutive rows
   *
   * ff_pb_1 is defined outside this file; presumably it holds 0x01 in every
   * byte so that the multiply replicates dc - confirm.
   * MMI_SWX is defined outside this file; it is used here as a store of
   * tmp1 at src + addr0 (presumably 32 bits wide - confirm).
   */
  393. void ff_pred4x4_dc_8_mmi(uint8_t *src, const uint8_t *topright,
  394. ptrdiff_t stride)
  395. {
  396. const int dc = (src[-stride] + src[1-stride] + src[2-stride]
  397. + src[3-stride] + src[-1+0*stride] + src[-1+1*stride]
  398. + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
  399. uint64_t tmp[2];
  400. mips_reg addr[1];
  401. DECLARE_VAR_ADDRT;
  402. __asm__ volatile (
       /* tmp0 = dc; tmp1 = dc replicated via the ff_pb_1 multiply */
  403. PTR_ADDU "%[tmp0], %[dc], $0 \n\t"
  404. "dmul %[tmp1], %[tmp0], %[ff_pb_1] \n\t"
       /* addr0 = running row offset, starts at 0 and grows by stride */
  405. "xor %[addr0], %[addr0], %[addr0] \n\t"
  406. MMI_SWX(%[tmp1], %[src], %[addr0], 0x00)
  407. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  408. MMI_SWX(%[tmp1], %[src], %[addr0], 0x00)
  409. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  410. MMI_SWX(%[tmp1], %[src], %[addr0], 0x00)
  411. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  412. MMI_SWX(%[tmp1], %[src], %[addr0], 0x00)
  413. : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  414. RESTRICT_ASM_ADDRT
  415. [addr0]"=&r"(addr[0])
  416. : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride),
  417. [dc]"r"(dc), [ff_pb_1]"r"(ff_pb_1)
  418. : "memory"
  419. );
  420. }
  /*
   * 8x8 vertical intra prediction, 8-bit samples.
   *
   * Loads the 8 bytes of the row above the block (src - stride) with an
   * ldl/ldr pair and writes them to each of the 8 rows starting at src with
   * sdl/sdr pairs, so no alignment of src is assumed by the instructions.
   *
   * src:    address of the top-left sample of the block to fill
   * stride: byte distance between two consecutive rows
   */
  421. void ff_pred8x8_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
  422. {
  423. uint64_t tmp[2];
  424. mips_reg addr[2];
  425. __asm__ volatile (
       /* addr0 = row above the block, addr1 = current output row */
  426. PTR_SUBU "%[addr0], %[src], %[stride] \n\t"
  427. PTR_ADDU "%[addr1], %[src], $0 \n\t"
       /* tmp0 = the 8 top-neighbour bytes */
  428. "ldl %[tmp0], 0x07(%[addr0]) \n\t"
  429. "ldr %[tmp0], 0x00(%[addr0]) \n\t"
       /* tmp1 = loop counter: 4 iterations, two rows per iteration */
  430. "dli %[tmp1], 0x04 \n\t"
  431. "1: \n\t"
  432. "sdl %[tmp0], 0x07(%[addr1]) \n\t"
  433. "sdr %[tmp0], 0x00(%[addr1]) \n\t"
  434. PTR_ADDU "%[addr1], %[stride] \n\t"
  435. "sdl %[tmp0], 0x07(%[addr1]) \n\t"
  436. "sdr %[tmp0], 0x00(%[addr1]) \n\t"
  437. "daddi %[tmp1], -0x01 \n\t"
  438. PTR_ADDU "%[addr1], %[stride] \n\t"
  439. "bnez %[tmp1], 1b \n\t"
  440. : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  441. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
  442. : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride)
  443. : "memory"
  444. );
  445. }
  /*
   * 8x8 horizontal intra prediction, 8-bit samples.
   *
   * For each of the 8 rows, reads the byte immediately left of the row
   * (src[-1 + y*stride]) and fills the 8 bytes of that row with it.
   *
   * src:    address of the top-left sample of the block to fill
   * stride: byte distance between two consecutive rows
   *
   * ff_pb_1 is defined outside this file; presumably it holds 0x01 in every
   * byte so that the multiply replicates the pixel - confirm.
   * NOTE(review): swl/swr are 32-bit partial-word stores. Used with offsets
   * 7/0 they appear to cover all 8 bytes only when the row address is
   * 4-byte aligned - confirm the alignment guarantee of callers.
   */
  446. void ff_pred8x8_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
  447. {
  448. uint64_t tmp[3];
  449. mips_reg addr[2];
  450. __asm__ volatile (
       /* addr0 walks the left-neighbour column, addr1 walks the output rows */
  451. PTR_ADDI "%[addr0], %[src], -0x01 \n\t"
  452. PTR_ADDU "%[addr1], %[src], $0 \n\t"
       /* tmp0 = loop counter: 4 iterations, two rows per iteration */
  453. "dli %[tmp0], 0x04 \n\t"
  454. "1: \n\t"
       /* first row of the pair: load left pixel, replicate, store */
  455. "lbu %[tmp1], 0x00(%[addr0]) \n\t"
  456. "dmul %[tmp2], %[tmp1], %[ff_pb_1] \n\t"
  457. "swl %[tmp2], 0x07(%[addr1]) \n\t"
  458. "swr %[tmp2], 0x00(%[addr1]) \n\t"
  459. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  460. PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
       /* second row of the pair */
  461. "lbu %[tmp1], 0x00(%[addr0]) \n\t"
  462. "dmul %[tmp2], %[tmp1], %[ff_pb_1] \n\t"
  463. "swl %[tmp2], 0x07(%[addr1]) \n\t"
  464. "swr %[tmp2], 0x00(%[addr1]) \n\t"
  465. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  466. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  467. PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
  468. "bnez %[tmp0], 1b \n\t"
  469. : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  470. [tmp2]"=&r"(tmp[2]),
  471. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
  472. : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride),
  473. [ff_pb_1]"r"(ff_pb_1)
  474. : "memory"
  475. );
  476. }
  /*
   * 8x8 "top DC" intra prediction, 8-bit samples.
   *
   * Loads the 8 bytes above the block, sums the first four and the last
   * four separately, turns each sum into (sum + 2) >> 2, and writes 8 rows
   * in which columns 0-3 hold the first average and columns 4-7 the second.
   *
   * src:    address of the top-left sample of the block to fill
   * stride: byte distance between two consecutive rows
   *
   * NOTE(review): the lane behaviour described in the inline comments
   * (pshufh with an all-zero selector replicating halfword 0, biadd summing
   * bytes) follows the usual Loongson MMI semantics - confirm against the
   * ISA manual.
   */
  477. void ff_pred8x8_top_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
  478. {
  479. double ftmp[4];
  480. uint64_t tmp[1];
  481. mips_reg addr[1];
  482. DECLARE_VAR_ALL64;
  483. __asm__ volatile (
       /* tmp0 = 2: used both as rounding term and as shift count */
  484. "dli %[tmp0], 0x02 \n\t"
       /* ftmp0 = 0: widening source and pshufh selector */
  485. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
       /* ftmp1 = 8 top-neighbour bytes at src - stride */
  486. PTR_SUBU "%[addr0], %[src], %[stride] \n\t"
  487. MMI_LDC1(%[ftmp1], %[addr0], 0x00)
       /* split into low/high groups of four and sum each group */
  488. "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
  489. "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
  490. "biadd %[ftmp2], %[ftmp2] \n\t"
  491. "biadd %[ftmp3], %[ftmp3] \n\t"
       /* replicate both sums and the constant 2 across the halfword lanes */
  492. "mtc1 %[tmp0], %[ftmp1] \n\t"
  493. "pshufh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  494. "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  495. "pshufh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
       /* (sum + 2) >> 2; ftmp1 is reloaded with the scalar 2 as shift count */
  496. "paddush %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
  497. "paddush %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
  498. "mtc1 %[tmp0], %[ftmp1] \n\t"
  499. "psrlh %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
  500. "psrlh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
       /* ftmp1 = output row: 4 bytes of the first average, 4 of the second */
  501. "packushb %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
       /* eight unrolled row stores, src advanced by stride between them */
  502. MMI_SDC1(%[ftmp1], %[src], 0x00)
  503. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  504. MMI_SDC1(%[ftmp1], %[src], 0x00)
  505. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  506. MMI_SDC1(%[ftmp1], %[src], 0x00)
  507. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  508. MMI_SDC1(%[ftmp1], %[src], 0x00)
  509. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  510. MMI_SDC1(%[ftmp1], %[src], 0x00)
  511. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  512. MMI_SDC1(%[ftmp1], %[src], 0x00)
  513. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  514. MMI_SDC1(%[ftmp1], %[src], 0x00)
  515. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  516. MMI_SDC1(%[ftmp1], %[src], 0x00)
  517. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  518. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  519. [tmp0]"=&r"(tmp[0]),
  520. RESTRICT_ASM_ALL64
  521. [addr0]"=&r"(addr[0]),
  522. [src]"+&r"(src)
  523. : [stride]"r"((mips_reg)stride)
  524. : "memory"
  525. );
  526. }
  /*
   * 8x8 DC intra prediction with one DC value per 4x4 quadrant, 8-bit.
   *
   * Four partial sums are gathered with scalar byte loads:
   *   T0 = top[0..3], T1 = top[4..7], L0 = left[0..3], L1 = left[4..7]
   * and four DC values are derived from them:
   *   rows 0-3, cols 0-3: (T0 + L0 + 4) >> 3
   *   rows 0-3, cols 4-7: (T1 + 2) >> 2
   *   rows 4-7, cols 0-3: (L1 + 2) >> 2
   *   rows 4-7, cols 4-7: (T1 + L1 + 4) >> 3
   *
   * src:    address of the top-left sample of the block to fill
   * stride: byte distance between two consecutive rows
   *
   * The addr[] registers are used as general integer scratch here (both for
   * addresses and for the running sums).
   */
  527. void ff_pred8x8_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
  528. {
  529. double ftmp[5];
  530. mips_reg addr[7];
  531. __asm__ volatile (
       /* addr0 = src - stride (top row), addr1 = addr0 + 4 (its second half) */
  532. "negu %[addr0], %[stride] \n\t"
  533. PTR_ADDU "%[addr0], %[addr0], %[src] \n\t"
  534. PTR_ADDIU "%[addr1], %[addr0], 0x04 \n\t"
       /* interleaved accumulation: addr3 = T0, addr4 = T1 */
  535. "lbu %[addr2], 0x00(%[addr0]) \n\t"
  536. PTR_ADDU "%[addr3], $0, %[addr2] \n\t"
  537. PTR_ADDIU "%[addr0], 0x01 \n\t"
  538. "lbu %[addr2], 0x00(%[addr1]) \n\t"
  539. PTR_ADDU "%[addr4], $0, %[addr2] \n\t"
  540. PTR_ADDIU "%[addr1], 0x01 \n\t"
  541. "lbu %[addr2], 0x00(%[addr0]) \n\t"
  542. PTR_ADDU "%[addr3], %[addr3], %[addr2] \n\t"
  543. PTR_ADDIU "%[addr0], 0x01 \n\t"
  544. "lbu %[addr2], 0x00(%[addr1]) \n\t"
  545. PTR_ADDU "%[addr4], %[addr4], %[addr2] \n\t"
  546. PTR_ADDIU "%[addr1], 0x01 \n\t"
  547. "lbu %[addr2], 0x00(%[addr0]) \n\t"
  548. PTR_ADDU "%[addr3], %[addr3], %[addr2] \n\t"
  549. PTR_ADDIU "%[addr0], 0x01 \n\t"
  550. "lbu %[addr2], 0x00(%[addr1]) \n\t"
  551. PTR_ADDU "%[addr4], %[addr4], %[addr2] \n\t"
  552. PTR_ADDIU "%[addr1], 0x01 \n\t"
  553. "lbu %[addr2], 0x00(%[addr0]) \n\t"
  554. PTR_ADDU "%[addr3], %[addr3], %[addr2] \n\t"
  555. PTR_ADDIU "%[addr0], 0x01 \n\t"
  556. "lbu %[addr2], 0x00(%[addr1]) \n\t"
  557. PTR_ADDU "%[addr4], %[addr4], %[addr2] \n\t"
  558. PTR_ADDIU "%[addr1], 0x01 \n\t"
       /* addr2 = src - 1 (left column); addr5 = L0 over rows 0..3 */
  559. "dli %[addr2], -0x01 \n\t"
  560. PTR_ADDU "%[addr2], %[addr2], %[src] \n\t"
  561. "lbu %[addr1], 0x00(%[addr2]) \n\t"
  562. PTR_ADDU "%[addr5], $0, %[addr1] \n\t"
  563. PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
  564. "lbu %[addr1], 0x00(%[addr2]) \n\t"
  565. PTR_ADDU "%[addr5], %[addr5], %[addr1] \n\t"
  566. PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
  567. "lbu %[addr1], 0x00(%[addr2]) \n\t"
  568. PTR_ADDU "%[addr5], %[addr5], %[addr1] \n\t"
  569. PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
  570. "lbu %[addr1], 0x00(%[addr2]) \n\t"
  571. PTR_ADDU "%[addr5], %[addr5], %[addr1] \n\t"
  572. PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
       /* addr6 = L1 over rows 4..7 */
  573. "lbu %[addr1], 0x00(%[addr2]) \n\t"
  574. PTR_ADDU "%[addr6], $0, %[addr1] \n\t"
  575. PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
  576. "lbu %[addr1], 0x00(%[addr2]) \n\t"
  577. PTR_ADDU "%[addr6], %[addr6], %[addr1] \n\t"
  578. PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
  579. "lbu %[addr1], 0x00(%[addr2]) \n\t"
  580. PTR_ADDU "%[addr6], %[addr6], %[addr1] \n\t"
  581. PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
  582. "lbu %[addr1], 0x00(%[addr2]) \n\t"
  583. PTR_ADDU "%[addr6], %[addr6], %[addr1] \n\t"
       /* addr3 = T0 + L0 + 4; addr4 = T1 + 2; addr1 = L1 + 2;
        * addr2 = addr4 + addr1 = T1 + L1 + 4 (computed before the shifts) */
  584. PTR_ADDU "%[addr3], %[addr3], %[addr5] \n\t"
  585. PTR_ADDIU "%[addr3], %[addr3], 0x04 \n\t"
  586. PTR_ADDIU "%[addr4], %[addr4], 0x02 \n\t"
  587. PTR_ADDIU "%[addr1], %[addr6], 0x02 \n\t"
  588. PTR_ADDU "%[addr2], %[addr4], %[addr1] \n\t"
       /* normalise: 8-sample sums >> 3, 4-sample sums >> 2 */
  589. PTR_SRL "%[addr3], 0x03 \n\t"
  590. PTR_SRL "%[addr4], 0x02 \n\t"
  591. PTR_SRL "%[addr1], 0x02 \n\t"
  592. PTR_SRL "%[addr2], 0x03 \n\t"
       /* move each DC to an FP register and shuffle it with an all-zero
        * selector (presumably replicating it into all four halfwords) */
  593. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  594. "dmtc1 %[addr3], %[ftmp1] \n\t"
  595. "pshufh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  596. "dmtc1 %[addr4], %[ftmp2] \n\t"
  597. "pshufh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  598. "dmtc1 %[addr1], %[ftmp3] \n\t"
  599. "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  600. "dmtc1 %[addr2], %[ftmp4] \n\t"
  601. "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
       /* ftmp1 = row pattern for rows 0-3, ftmp2 = row pattern for rows 4-7 */
  602. "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  603. "packushb %[ftmp2], %[ftmp3], %[ftmp4] \n\t"
       /* write rows 0-3 */
  604. PTR_ADDU "%[addr0], $0, %[src] \n\t"
  605. MMI_SDC1(%[ftmp1], %[addr0], 0x00)
  606. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  607. MMI_SDC1(%[ftmp1], %[addr0], 0x00)
  608. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  609. MMI_SDC1(%[ftmp1], %[addr0], 0x00)
  610. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  611. MMI_SDC1(%[ftmp1], %[addr0], 0x00)
       /* write rows 4-7 */
  612. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  613. MMI_SDC1(%[ftmp2], %[addr0], 0x00)
  614. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  615. MMI_SDC1(%[ftmp2], %[addr0], 0x00)
  616. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  617. MMI_SDC1(%[ftmp2], %[addr0], 0x00)
  618. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  619. MMI_SDC1(%[ftmp2], %[addr0], 0x00)
  620. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  621. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  622. [ftmp4]"=&f"(ftmp[4]),
  623. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
  624. [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
  625. [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
  626. [addr6]"=&r"(addr[6])
  627. : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride)
  628. : "memory"
  629. );
  630. }
  /*
   * 8x16 vertical intra prediction, 8-bit samples.
   *
   * Reads the 8 bytes of the row directly above the block (src - stride)
   * and writes them to each of the 16 rows starting at src.
   *
   * src:    address of the top-left sample of the block to fill
   * stride: byte distance between two consecutive rows
   */
  631. void ff_pred8x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
  632. {
  633. double ftmp[1];
  634. uint64_t tmp[1];
  635. DECLARE_VAR_ALL64;
  636. __asm__ volatile (
       /* ftmp0 = the 8 top-neighbour bytes */
  637. MMI_LDC1(%[ftmp0], %[srcA], 0x00)
       /* tmp0 = loop counter: 4 iterations, four rows per iteration */
  638. "dli %[tmp0], 0x04 \n\t"
  639. "1: \n\t"
  640. MMI_SDC1(%[ftmp0], %[src], 0x00)
  641. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  642. MMI_SDC1(%[ftmp0], %[src], 0x00)
  643. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  644. MMI_SDC1(%[ftmp0], %[src], 0x00)
  645. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  646. MMI_SDC1(%[ftmp0], %[src], 0x00)
  647. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  648. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  649. "bnez %[tmp0], 1b \n\t"
  650. : [ftmp0]"=&f"(ftmp[0]),
  651. [tmp0]"=&r"(tmp[0]),
  652. RESTRICT_ASM_ALL64
       /* src is advanced inside the asm, hence read-write */
  653. [src]"+&r"(src)
       /* srcA = address of the row above the block */
  654. : [stride]"r"((mips_reg)stride), [srcA]"r"((mips_reg)(src-stride))
  655. : "memory"
  656. );
  657. }
  /*
   * 8x16 horizontal intra prediction, 8-bit samples.
   *
   * For each of the 16 rows, reads the byte immediately left of the row
   * (src[-1 + y*stride]) and fills the 8 bytes of that row with it.
   *
   * src:    address of the top-left sample of the block to fill
   * stride: byte distance between two consecutive rows
   *
   * ff_pb_1 is defined outside this file; presumably it holds 0x01 in every
   * byte so that the multiply replicates the pixel - confirm.
   * NOTE(review): swl/swr are 32-bit partial-word stores. Used with offsets
   * 7/0 they appear to cover all 8 bytes only when the row address is
   * 4-byte aligned - confirm the alignment guarantee of callers.
   */
  658. void ff_pred8x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
  659. {
  660. uint64_t tmp[3];
  661. mips_reg addr[2];
  662. __asm__ volatile (
       /* addr0 walks the left-neighbour column, addr1 walks the output rows */
  663. PTR_ADDI "%[addr0], %[src], -0x01 \n\t"
  664. PTR_ADDU "%[addr1], %[src], $0 \n\t"
       /* tmp0 = loop counter: 8 iterations, two rows per iteration */
  665. "dli %[tmp0], 0x08 \n\t"
  666. "1: \n\t"
       /* first row of the pair: load left pixel, replicate, store */
  667. "lbu %[tmp1], 0x00(%[addr0]) \n\t"
  668. "dmul %[tmp2], %[tmp1], %[ff_pb_1] \n\t"
  669. "swl %[tmp2], 0x07(%[addr1]) \n\t"
  670. "swr %[tmp2], 0x00(%[addr1]) \n\t"
  671. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  672. PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
       /* second row of the pair */
  673. "lbu %[tmp1], 0x00(%[addr0]) \n\t"
  674. "dmul %[tmp2], %[tmp1], %[ff_pb_1] \n\t"
  675. "swl %[tmp2], 0x07(%[addr1]) \n\t"
  676. "swr %[tmp2], 0x00(%[addr1]) \n\t"
  677. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  678. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  679. PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
  680. "bnez %[tmp0], 1b \n\t"
  681. : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  682. [tmp2]"=&r"(tmp[2]),
  683. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
  684. : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride),
  685. [ff_pb_1]"r"(ff_pb_1)
  686. : "memory"
  687. );
  688. }
  689. static inline void pred16x16_plane_compat_mmi(uint8_t *src, int stride,
  690. const int svq3, const int rv40)
  691. {
  692. double ftmp[11];
  693. uint64_t tmp[6];
  694. mips_reg addr[1];
  695. DECLARE_VAR_ALL64;
  696. __asm__ volatile(
  697. PTR_SUBU "%[addr0], %[src], %[stride] \n\t"
  698. "dli %[tmp0], 0x20 \n\t"
  699. "dmtc1 %[tmp0], %[ftmp4] \n\t"
  700. MMI_ULDC1(%[ftmp0], %[addr0], -0x01)
  701. MMI_ULDC1(%[ftmp2], %[addr0], 0x08)
  702. "dsrl %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
  703. "dsrl %[ftmp3], %[ftmp2], %[ftmp4] \n\t"
  704. "xor %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
  705. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  706. "punpcklbh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
  707. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  708. "punpcklbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
  709. "pmullh %[ftmp0], %[ftmp0], %[ff_pw_m8tom5] \n\t"
  710. "pmullh %[ftmp1], %[ftmp1], %[ff_pw_m4tom1] \n\t"
  711. "pmullh %[ftmp2], %[ftmp2], %[ff_pw_1to4] \n\t"
  712. "pmullh %[ftmp3], %[ftmp3], %[ff_pw_5to8] \n\t"
  713. "paddsh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  714. "paddsh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  715. "paddsh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  716. "dli %[tmp0], 0x0e \n\t"
  717. "dmtc1 %[tmp0], %[ftmp4] \n\t"
  718. "pshufh %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
  719. "paddsh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  720. "dli %[tmp0], 0x01 \n\t"
  721. "dmtc1 %[tmp0], %[ftmp4] \n\t"
  722. "pshufh %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
  723. "paddsh %[ftmp5], %[ftmp0], %[ftmp1] \n\t"
  724. PTR_ADDIU "%[addr0], %[src], -0x01 \n\t"
  725. PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t"
  726. "lbu %[tmp2], 0x00(%[addr0]) \n\t"
  727. "lbu %[tmp5], 0x10(%[addr0]) \n\t"
  728. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  729. "lbu %[tmp3], 0x00(%[addr0]) \n\t"
  730. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  731. "lbu %[tmp4], 0x00(%[addr0]) \n\t"
  732. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  733. "lbu %[tmp0], 0x00(%[addr0]) \n\t"
  734. "dsll %[tmp3], %[tmp3], 0x10 \n\t"
  735. "dsll %[tmp4], %[tmp4], 0x20 \n\t"
  736. "dsll %[tmp0], %[tmp0], 0x30 \n\t"
  737. "or %[tmp4], %[tmp4], %[tmp0] \n\t"
  738. "or %[tmp2], %[tmp2], %[tmp3] \n\t"
  739. "or %[tmp2], %[tmp2], %[tmp4] \n\t"
  740. "dmtc1 %[tmp2], %[ftmp0] \n\t"
  741. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  742. "lbu %[tmp2], 0x00(%[addr0]) \n\t"
  743. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  744. "lbu %[tmp3], 0x00(%[addr0]) \n\t"
  745. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  746. "lbu %[tmp4], 0x00(%[addr0]) \n\t"
  747. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  748. "lbu %[tmp0], 0x00(%[addr0]) \n\t"
  749. "dsll %[tmp3], %[tmp3], 0x10 \n\t"
  750. "dsll %[tmp4], %[tmp4], 0x20 \n\t"
  751. "dsll %[tmp0], %[tmp0], 0x30 \n\t"
  752. "or %[tmp4], %[tmp4], %[tmp0] \n\t"
  753. "or %[tmp2], %[tmp2], %[tmp3] \n\t"
  754. "or %[tmp2], %[tmp2], %[tmp4] \n\t"
  755. "dmtc1 %[tmp2], %[ftmp1] \n\t"
  756. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  757. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  758. "lbu %[tmp2], 0x00(%[addr0]) \n\t"
  759. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  760. "lbu %[tmp3], 0x00(%[addr0]) \n\t"
  761. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  762. "lbu %[tmp4], 0x00(%[addr0]) \n\t"
  763. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  764. "lbu %[tmp0], 0x00(%[addr0]) \n\t"
  765. "dsll %[tmp3], %[tmp3], 0x10 \n\t"
  766. "dsll %[tmp4], %[tmp4], 0x20 \n\t"
  767. "dsll %[tmp0], %[tmp0], 0x30 \n\t"
  768. "or %[tmp4], %[tmp4], %[tmp0] \n\t"
  769. "or %[tmp2], %[tmp2], %[tmp3] \n\t"
  770. "or %[tmp2], %[tmp2], %[tmp4] \n\t"
  771. "dmtc1 %[tmp2], %[ftmp2] \n\t"
  772. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  773. "lbu %[tmp2], 0x00(%[addr0]) \n\t"
  774. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  775. "lbu %[tmp3], 0x00(%[addr0]) \n\t"
  776. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  777. "lbu %[tmp4], 0x00(%[addr0]) \n\t"
  778. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  779. "lbu %[tmp0], 0x00(%[addr0]) \n\t"
  780. "daddu %[tmp5], %[tmp5], %[tmp0] \n\t"
  781. "daddiu %[tmp5], %[tmp5], 0x01 \n\t"
  782. "dsll %[tmp5], %[tmp5], 0x04 \n\t"
  783. "dsll %[tmp3], %[tmp3], 0x10 \n\t"
  784. "dsll %[tmp4], %[tmp4], 0x20 \n\t"
  785. "dsll %[tmp0], %[tmp0], 0x30 \n\t"
  786. "or %[tmp4], %[tmp4], %[tmp0] \n\t"
  787. "or %[tmp2], %[tmp2], %[tmp3] \n\t"
  788. "or %[tmp2], %[tmp2], %[tmp4] \n\t"
  789. "dmtc1 %[tmp2], %[ftmp3] \n\t"
  790. "pmullh %[ftmp0], %[ftmp0], %[ff_pw_m8tom5] \n\t"
  791. "pmullh %[ftmp1], %[ftmp1], %[ff_pw_m4tom1] \n\t"
  792. "pmullh %[ftmp2], %[ftmp2], %[ff_pw_1to4] \n\t"
  793. "pmullh %[ftmp3], %[ftmp3], %[ff_pw_5to8] \n\t"
  794. "paddsh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  795. "paddsh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  796. "paddsh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  797. "dli %[tmp0], 0x0e \n\t"
  798. "dmtc1 %[tmp0], %[ftmp4] \n\t"
  799. "pshufh %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
  800. "paddsh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  801. "dli %[tmp0], 0x01 \n\t"
  802. "dmtc1 %[tmp0], %[ftmp4] \n\t"
  803. "pshufh %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
  804. "paddsh %[ftmp6], %[ftmp0], %[ftmp1] \n\t"
  805. "dmfc1 %[tmp0], %[ftmp5] \n\t"
  806. "dsll %[tmp0], %[tmp0], 0x30 \n\t"
  807. "dsra %[tmp0], %[tmp0], 0x30 \n\t"
  808. "dmfc1 %[tmp1], %[ftmp6] \n\t"
  809. "dsll %[tmp1], %[tmp1], 0x30 \n\t"
  810. "dsra %[tmp1], %[tmp1], 0x30 \n\t"
  811. "beqz %[svq3], 1f \n\t"
  812. "dli %[tmp2], 0x04 \n\t"
  813. "ddiv %[tmp0], %[tmp0], %[tmp2] \n\t"
  814. "ddiv %[tmp1], %[tmp1], %[tmp2] \n\t"
  815. "dli %[tmp2], 0x05 \n\t"
  816. "dmul %[tmp0], %[tmp0], %[tmp2] \n\t"
  817. "dmul %[tmp1], %[tmp1], %[tmp2] \n\t"
  818. "dli %[tmp2], 0x10 \n\t"
  819. "ddiv %[tmp0], %[tmp0], %[tmp2] \n\t"
  820. "ddiv %[tmp1], %[tmp1], %[tmp2] \n\t"
  821. "daddu %[tmp2], %[tmp0], $0 \n\t"
  822. "daddu %[tmp0], %[tmp1], $0 \n\t"
  823. "daddu %[tmp1], %[tmp2], $0 \n\t"
  824. "b 2f \n\t"
  825. "1: \n\t"
  826. "beqz %[rv40], 1f \n\t"
  827. "dsra %[tmp2], %[tmp0], 0x02 \n\t"
  828. "daddu %[tmp0], %[tmp0], %[tmp2] \n\t"
  829. "dsra %[tmp2], %[tmp1], 0x02 \n\t"
  830. "daddu %[tmp1], %[tmp1], %[tmp2] \n\t"
  831. "dsra %[tmp0], %[tmp0], 0x04 \n\t"
  832. "dsra %[tmp1], %[tmp1], 0x04 \n\t"
  833. "b 2f \n\t"
  834. "1: \n\t"
  835. "dli %[tmp2], 0x05 \n\t"
  836. "dmul %[tmp0], %[tmp0], %[tmp2] \n\t"
  837. "dmul %[tmp1], %[tmp1], %[tmp2] \n\t"
  838. "daddiu %[tmp0], %[tmp0], 0x20 \n\t"
  839. "daddiu %[tmp1], %[tmp1], 0x20 \n\t"
  840. "dsra %[tmp0], %[tmp0], 0x06 \n\t"
  841. "dsra %[tmp1], %[tmp1], 0x06 \n\t"
  842. "2: \n\t"
  843. "daddu %[tmp3], %[tmp0], %[tmp1] \n\t"
  844. "dli %[tmp2], 0x07 \n\t"
  845. "dmul %[tmp3], %[tmp3], %[tmp2] \n\t"
  846. "dsubu %[tmp5], %[tmp5], %[tmp3] \n\t"
  847. "xor %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
  848. "dmtc1 %[tmp0], %[ftmp0] \n\t"
  849. "pshufh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  850. "dmtc1 %[tmp1], %[ftmp5] \n\t"
  851. "pshufh %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
  852. "dmtc1 %[tmp5], %[ftmp6] \n\t"
  853. "pshufh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
  854. "dli %[tmp0], 0x05 \n\t"
  855. "dmtc1 %[tmp0], %[ftmp7] \n\t"
  856. "pmullh %[ftmp1], %[ff_pw_0to3], %[ftmp0] \n\t"
  857. "dmtc1 %[ff_pw_4to7], %[ftmp2] \n\t"
  858. "pmullh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  859. "dmtc1 %[ff_pw_8tob], %[ftmp3] \n\t"
  860. "pmullh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  861. "dmtc1 %[ff_pw_ctof], %[ftmp4] \n\t"
  862. "pmullh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  863. "dli %[tmp0], 0x10 \n\t"
  864. PTR_ADDU "%[addr0], %[src], $0 \n\t"
  865. "1: \n\t"
  866. "paddsh %[ftmp8], %[ftmp1], %[ftmp6] \n\t"
  867. "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  868. "paddsh %[ftmp9], %[ftmp2], %[ftmp6] \n\t"
  869. "psrah %[ftmp9], %[ftmp9], %[ftmp7] \n\t"
  870. "packushb %[ftmp0], %[ftmp8], %[ftmp9] \n\t"
  871. MMI_SDC1(%[ftmp0], %[addr0], 0x00)
  872. "paddsh %[ftmp8], %[ftmp3], %[ftmp6] \n\t"
  873. "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  874. "paddsh %[ftmp9], %[ftmp4], %[ftmp6] \n\t"
  875. "psrah %[ftmp9], %[ftmp9], %[ftmp7] \n\t"
  876. "packushb %[ftmp0], %[ftmp8], %[ftmp9] \n\t"
  877. MMI_SDC1(%[ftmp0], %[addr0], 0x08)
  878. "paddsh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
  879. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  880. "daddiu %[tmp0], %[tmp0], -0x01 \n\t"
  881. "bnez %[tmp0], 1b \n\t"
  882. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  883. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  884. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  885. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  886. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  887. [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  888. [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
  889. [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]),
  890. RESTRICT_ASM_ALL64
  891. [addr0]"=&r"(addr[0])
  892. : [src]"r"(src), [stride]"r"((mips_reg)stride),
  893. [svq3]"r"(svq3), [rv40]"r"(rv40),
  894. [ff_pw_m8tom5]"f"(ff_pw_m8tom5), [ff_pw_m4tom1]"f"(ff_pw_m4tom1),
  895. [ff_pw_1to4]"f"(ff_pw_1to4), [ff_pw_5to8]"f"(ff_pw_5to8),
  896. [ff_pw_0to3]"f"(ff_pw_0to3), [ff_pw_4to7]"r"(ff_pw_4to7),
  897. [ff_pw_8tob]"r"(ff_pw_8tob), [ff_pw_ctof]"r"(ff_pw_ctof)
  898. : "memory"
  899. );
  900. }
  /**
   * H.264 entry point for the MMI 16x16 plane predictor.
   *
   * Forwards to pred16x16_plane_compat_mmi() with both trailing flag
   * arguments set to 0. With both flags zero the helper skips its two
   * flag-guarded branches and falls through to its final scaling path.
   *
   * NOTE(review): the helper's asm operands are named svq3 and rv40; the
   * locals below assume those are its 3rd and 4th parameters, in that
   * order -- confirm against the helper's signature.
   *
   * @param src    passed through unchanged to the helper
   * @param stride passed through unchanged to the helper
   */
  void ff_pred16x16_plane_h264_8_mmi(uint8_t *src, ptrdiff_t stride)
  {
      const int svq3_flag = 0;
      const int rv40_flag = 0;

      pred16x16_plane_compat_mmi(src, stride, svq3_flag, rv40_flag);
  }
  /**
   * SVQ3 entry point for the MMI 16x16 plane predictor.
   *
   * Forwards to pred16x16_plane_compat_mmi() with the first trailing flag
   * argument set to 1 and the second set to 0.
   *
   * NOTE(review): the helper's asm operands are named svq3 and rv40; the
   * locals below assume those are its 3rd and 4th parameters, in that
   * order -- confirm against the helper's signature.
   *
   * @param src    passed through unchanged to the helper
   * @param stride passed through unchanged to the helper
   */
  void ff_pred16x16_plane_svq3_8_mmi(uint8_t *src, ptrdiff_t stride)
  {
      const int svq3_flag = 1;
      const int rv40_flag = 0;

      pred16x16_plane_compat_mmi(src, stride, svq3_flag, rv40_flag);
  }
  /**
   * RV40 entry point for the MMI 16x16 plane predictor.
   *
   * Forwards to pred16x16_plane_compat_mmi() with the first trailing flag
   * argument set to 0 and the second set to 1.
   *
   * NOTE(review): the helper's asm operands are named svq3 and rv40; the
   * locals below assume those are its 3rd and 4th parameters, in that
   * order -- confirm against the helper's signature.
   *
   * @param src    passed through unchanged to the helper
   * @param stride passed through unchanged to the helper
   */
  void ff_pred16x16_plane_rv40_8_mmi(uint8_t *src, ptrdiff_t stride)
  {
      const int svq3_flag = 0;
      const int rv40_flag = 1;

      pred16x16_plane_compat_mmi(src, stride, svq3_flag, rv40_flag);
  }