You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

993 lines
60KB

/*
 * Loongson SIMD optimized h264pred
 *
 * Copyright (c) 2015 Loongson Technology Corporation Limited
 * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
  24. #include "h264pred_mips.h"
  25. #include "libavcodec/bit_depth_template.c"
  26. #include "libavutil/mips/asmdefs.h"
  27. #include "constants.h"
  28. void ff_pred16x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
  29. {
  30. double ftmp[2];
  31. uint64_t tmp[1];
  32. __asm__ volatile (
  33. "dli %[tmp0], 0x08 \n\t"
  34. "gsldlc1 %[ftmp0], 0x07(%[srcA]) \n\t"
  35. "gsldrc1 %[ftmp0], 0x00(%[srcA]) \n\t"
  36. "gsldlc1 %[ftmp1], 0x0f(%[srcA]) \n\t"
  37. "gsldrc1 %[ftmp1], 0x08(%[srcA]) \n\t"
  38. "1: \n\t"
  39. "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t"
  40. "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t"
  41. "gssdlc1 %[ftmp1], 0x0f(%[src]) \n\t"
  42. "gssdrc1 %[ftmp1], 0x08(%[src]) \n\t"
  43. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  44. "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t"
  45. "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t"
  46. "gssdlc1 %[ftmp1], 0x0f(%[src]) \n\t"
  47. "gssdrc1 %[ftmp1], 0x08(%[src]) \n\t"
  48. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  49. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  50. "bnez %[tmp0], 1b \n\t"
  51. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  52. [tmp0]"=&r"(tmp[0]),
  53. [src]"+&r"(src)
  54. : [stride]"r"((mips_reg)stride), [srcA]"r"((mips_reg)(src-stride))
  55. : "memory"
  56. );
  57. }
  58. void ff_pred16x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
  59. {
  60. uint64_t tmp[3];
  61. mips_reg addr[2];
  62. __asm__ volatile (
  63. PTR_ADDI "%[addr0], %[src], -0x01 \n\t"
  64. PTR_ADDU "%[addr1], %[src], $0 \n\t"
  65. "dli %[tmp2], 0x08 \n\t"
  66. "1: \n\t"
  67. "lbu %[tmp0], 0x00(%[addr0]) \n\t"
  68. "dmul %[tmp1], %[tmp0], %[ff_pb_1] \n\t"
  69. "swl %[tmp1], 0x07(%[addr1]) \n\t"
  70. "swr %[tmp1], 0x00(%[addr1]) \n\t"
  71. "swl %[tmp1], 0x0f(%[addr1]) \n\t"
  72. "swr %[tmp1], 0x08(%[addr1]) \n\t"
  73. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  74. PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
  75. "lbu %[tmp0], 0x00(%[addr0]) \n\t"
  76. "dmul %[tmp1], %[tmp0], %[ff_pb_1] \n\t"
  77. "swl %[tmp1], 0x07(%[addr1]) \n\t"
  78. "swr %[tmp1], 0x00(%[addr1]) \n\t"
  79. "swl %[tmp1], 0x0f(%[addr1]) \n\t"
  80. "swr %[tmp1], 0x08(%[addr1]) \n\t"
  81. "daddi %[tmp2], %[tmp2], -0x01 \n\t"
  82. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  83. PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
  84. "bnez %[tmp2], 1b \n\t"
  85. : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  86. [tmp2]"=&r"(tmp[2]),
  87. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
  88. : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride),
  89. [ff_pb_1]"r"(ff_pb_1)
  90. : "memory"
  91. );
  92. }
  93. void ff_pred16x16_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
  94. {
  95. uint64_t tmp[4];
  96. mips_reg addr[2];
  97. __asm__ volatile (
  98. PTR_ADDI "%[addr0], %[src], -0x01 \n\t"
  99. "dli %[tmp0], 0x08 \n\t"
  100. "xor %[tmp3], %[tmp3], %[tmp3] \n\t"
  101. "1: \n\t"
  102. "lbu %[tmp1], 0x00(%[addr0]) \n\t"
  103. "daddu %[tmp3], %[tmp3], %[tmp1] \n\t"
  104. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  105. "lbu %[tmp1], 0x00(%[addr0]) \n\t"
  106. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  107. "daddu %[tmp3], %[tmp3], %[tmp1] \n\t"
  108. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  109. "bnez %[tmp0], 1b \n\t"
  110. "dli %[tmp0], 0x08 \n\t"
  111. PTR_SUBU "%[addr0], %[src], %[stride] \n\t"
  112. "2: \n\t"
  113. "lbu %[tmp1], 0x00(%[addr0]) \n\t"
  114. "daddu %[tmp3], %[tmp3], %[tmp1] \n\t"
  115. PTR_ADDIU "%[addr0], %[addr0], 0x01 \n\t"
  116. "lbu %[tmp1], 0x00(%[addr0]) \n\t"
  117. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  118. "daddu %[tmp3], %[tmp3], %[tmp1] \n\t"
  119. PTR_ADDIU "%[addr0], %[addr0], 0x01 \n\t"
  120. "bnez %[tmp0], 2b \n\t"
  121. "daddiu %[tmp3], %[tmp3], 0x10 \n\t"
  122. "dsra %[tmp3], 0x05 \n\t"
  123. "dmul %[tmp2], %[tmp3], %[ff_pb_1] \n\t"
  124. PTR_ADDU "%[addr0], %[src], $0 \n\t"
  125. "dli %[tmp0], 0x08 \n\t"
  126. "3: \n\t"
  127. "swl %[tmp2], 0x07(%[addr0]) \n\t"
  128. "swr %[tmp2], 0x00(%[addr0]) \n\t"
  129. "swl %[tmp2], 0x0f(%[addr0]) \n\t"
  130. "swr %[tmp2], 0x08(%[addr0]) \n\t"
  131. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  132. "swl %[tmp2], 0x07(%[addr0]) \n\t"
  133. "swr %[tmp2], 0x00(%[addr0]) \n\t"
  134. "swl %[tmp2], 0x0f(%[addr0]) \n\t"
  135. "swr %[tmp2], 0x08(%[addr0]) \n\t"
  136. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  137. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  138. "bnez %[tmp0], 3b \n\t"
  139. : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  140. [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
  141. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
  142. : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride),
  143. [ff_pb_1]"r"(ff_pb_1)
  144. : "memory"
  145. );
  146. }
  147. void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft,
  148. int has_topright, ptrdiff_t stride)
  149. {
  150. uint32_t dc;
  151. double ftmp[11];
  152. mips_reg tmp[3];
  153. __asm__ volatile (
  154. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  155. "gsldlc1 %[ftmp10], 0x07(%[srcA]) \n\t"
  156. "gsldrc1 %[ftmp10], 0x00(%[srcA]) \n\t"
  157. "gsldlc1 %[ftmp9], 0x07(%[src0]) \n\t"
  158. "gsldrc1 %[ftmp9], 0x00(%[src0]) \n\t"
  159. "gsldlc1 %[ftmp8], 0x07(%[src1]) \n\t"
  160. "gsldrc1 %[ftmp8], 0x00(%[src1]) \n\t"
  161. "punpcklbh %[ftmp7], %[ftmp10], %[ftmp0] \n\t"
  162. "punpckhbh %[ftmp6], %[ftmp10], %[ftmp0] \n\t"
  163. "punpcklbh %[ftmp5], %[ftmp9], %[ftmp0] \n\t"
  164. "punpckhbh %[ftmp4], %[ftmp9], %[ftmp0] \n\t"
  165. "punpcklbh %[ftmp3], %[ftmp8], %[ftmp0] \n\t"
  166. "punpckhbh %[ftmp2], %[ftmp8], %[ftmp0] \n\t"
  167. "bnez %[has_topleft], 1f \n\t"
  168. "pinsrh_0 %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  169. "1: \n\t"
  170. "bnez %[has_topright], 2f \n\t"
  171. "pinsrh_3 %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  172. "2: \n\t"
  173. "dli %[tmp0], 0x02 \n\t"
  174. "mtc1 %[tmp0], %[ftmp1] \n\t"
  175. "pmullh %[ftmp5], %[ftmp5], %[ff_pw_2] \n\t"
  176. "pmullh %[ftmp4], %[ftmp4], %[ff_pw_2] \n\t"
  177. "paddh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  178. "paddh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
  179. "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
  180. "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
  181. "paddh %[ftmp7], %[ftmp7], %[ff_pw_2] \n\t"
  182. "paddh %[ftmp6], %[ftmp6], %[ff_pw_2] \n\t"
  183. "psrah %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
  184. "psrah %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
  185. "packushb %[ftmp9], %[ftmp7], %[ftmp6] \n\t"
  186. "biadd %[ftmp10], %[ftmp9] \n\t"
  187. "mfc1 %[tmp1], %[ftmp10] \n\t"
  188. "addiu %[tmp1], %[tmp1], 0x04 \n\t"
  189. "srl %[tmp1], %[tmp1], 0x03 \n\t"
  190. "mul %[dc], %[tmp1], %[ff_pb_1] \n\t"
  191. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  192. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  193. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  194. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  195. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  196. [ftmp10]"=&f"(ftmp[10]),
  197. [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  198. [dc]"=r"(dc)
  199. : [srcA]"r"((mips_reg)(src-stride-1)),
  200. [src0]"r"((mips_reg)(src-stride)),
  201. [src1]"r"((mips_reg)(src-stride+1)),
  202. [has_topleft]"r"(has_topleft), [has_topright]"r"(has_topright),
  203. [ff_pb_1]"r"(ff_pb_1), [ff_pw_2]"f"(ff_pw_2)
  204. : "memory"
  205. );
  206. __asm__ volatile (
  207. "dli %[tmp0], 0x02 \n\t"
  208. "punpcklwd %[ftmp0], %[dc], %[dc] \n\t"
  209. "1: \n\t"
  210. "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t"
  211. "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t"
  212. "gssdxc1 %[ftmp0], 0x00(%[src], %[stride]) \n\t"
  213. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  214. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  215. "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t"
  216. "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t"
  217. "gssdxc1 %[ftmp0], 0x00(%[src], %[stride]) \n\t"
  218. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  219. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  220. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  221. "bnez %[tmp0], 1b \n\t"
  222. : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
  223. [src]"+&r"(src)
  224. : [dc]"f"(dc), [stride]"r"((mips_reg)stride)
  225. : "memory"
  226. );
  227. }
  228. void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
  229. ptrdiff_t stride)
  230. {
  231. uint32_t dc, dc1, dc2;
  232. double ftmp[14];
  233. mips_reg tmp[1];
  234. const int l0 = ((has_topleft ? src[-1+-1*stride] : src[-1+0*stride]) + 2*src[-1+0*stride] + src[-1+1*stride] + 2) >> 2;
  235. const int l1 = (src[-1+0*stride] + 2*src[-1+1*stride] + src[-1+2*stride] + 2) >> 2;
  236. const int l2 = (src[-1+1*stride] + 2*src[-1+2*stride] + src[-1+3*stride] + 2) >> 2;
  237. const int l3 = (src[-1+2*stride] + 2*src[-1+3*stride] + src[-1+4*stride] + 2) >> 2;
  238. const int l4 = (src[-1+3*stride] + 2*src[-1+4*stride] + src[-1+5*stride] + 2) >> 2;
  239. const int l5 = (src[-1+4*stride] + 2*src[-1+5*stride] + src[-1+6*stride] + 2) >> 2;
  240. const int l6 = (src[-1+5*stride] + 2*src[-1+6*stride] + src[-1+7*stride] + 2) >> 2;
  241. const int l7 = (src[-1+6*stride] + 2*src[-1+7*stride] + src[-1+7*stride] + 2) >> 2;
  242. __asm__ volatile (
  243. "gsldlc1 %[ftmp4], 0x07(%[srcA]) \n\t"
  244. "gsldrc1 %[ftmp4], 0x00(%[srcA]) \n\t"
  245. "gsldlc1 %[ftmp5], 0x07(%[src0]) \n\t"
  246. "gsldrc1 %[ftmp5], 0x00(%[src0]) \n\t"
  247. "gsldlc1 %[ftmp6], 0x07(%[src1]) \n\t"
  248. "gsldrc1 %[ftmp6], 0x00(%[src1]) \n\t"
  249. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  250. "dli %[tmp0], 0x03 \n\t"
  251. "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t"
  252. "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
  253. "mtc1 %[tmp0], %[ftmp1] \n\t"
  254. "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t"
  255. "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t"
  256. "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t"
  257. "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t"
  258. "pshufh %[ftmp3], %[ftmp8], %[ftmp1] \n\t"
  259. "pshufh %[ftmp13], %[ftmp12], %[ftmp1] \n\t"
  260. "pinsrh_3 %[ftmp8], %[ftmp8], %[ftmp13] \n\t"
  261. "pinsrh_3 %[ftmp12], %[ftmp12], %[ftmp3] \n\t"
  262. "bnez %[has_topleft], 1f \n\t"
  263. "pinsrh_0 %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
  264. "1: \n\t"
  265. "bnez %[has_topright], 2f \n\t"
  266. "pshufh %[ftmp13], %[ftmp10], %[ftmp1] \n\t"
  267. "pinsrh_3 %[ftmp8], %[ftmp8], %[ftmp13] \n\t"
  268. "2: \n\t"
  269. "dli %[tmp0], 0x02 \n\t"
  270. "mtc1 %[tmp0], %[ftmp1] \n\t"
  271. "pshufh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
  272. "pmullh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
  273. "pmullh %[ftmp10], %[ftmp10], %[ftmp2] \n\t"
  274. "paddh %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
  275. "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
  276. "paddh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
  277. "paddh %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
  278. "paddh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
  279. "paddh %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
  280. "psrah %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
  281. "psrah %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
  282. "packushb %[ftmp5], %[ftmp7], %[ftmp8] \n\t"
  283. "biadd %[ftmp4], %[ftmp5] \n\t"
  284. "mfc1 %[dc2], %[ftmp4] \n\t"
  285. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  286. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  287. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  288. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  289. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  290. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  291. [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
  292. [tmp0]"=&r"(tmp[0]), [dc2]"=r"(dc2)
  293. : [srcA]"r"((mips_reg)(src-stride-1)),
  294. [src0]"r"((mips_reg)(src-stride)),
  295. [src1]"r"((mips_reg)(src-stride+1)),
  296. [has_topleft]"r"(has_topleft), [has_topright]"r"(has_topright)
  297. : "memory"
  298. );
  299. dc1 = l0+l1+l2+l3+l4+l5+l6+l7;
  300. dc = ((dc1+dc2+8)>>4)*0x01010101U;
  301. __asm__ volatile (
  302. "dli %[tmp0], 0x02 \n\t"
  303. "punpcklwd %[ftmp0], %[dc], %[dc] \n\t"
  304. "1: \n\t"
  305. "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t"
  306. "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t"
  307. "gssdxc1 %[ftmp0], 0x00(%[src], %[stride]) \n\t"
  308. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  309. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  310. "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t"
  311. "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t"
  312. "gssdxc1 %[ftmp0], 0x00(%[src], %[stride]) \n\t"
  313. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  314. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  315. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  316. "bnez %[tmp0], 1b \n\t"
  317. : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
  318. [src]"+&r"(src)
  319. : [dc]"f"(dc), [stride]"r"((mips_reg)stride)
  320. : "memory"
  321. );
  322. }
  323. void ff_pred8x8l_vertical_8_mmi(uint8_t *src, int has_topleft,
  324. int has_topright, ptrdiff_t stride)
  325. {
  326. double ftmp[12];
  327. mips_reg tmp[1];
  328. __asm__ volatile (
  329. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  330. "gsldlc1 %[ftmp3], 0x07(%[srcA]) \n\t"
  331. "gsldrc1 %[ftmp3], 0x00(%[srcA]) \n\t"
  332. "gsldlc1 %[ftmp4], 0x07(%[src0]) \n\t"
  333. "gsldrc1 %[ftmp4], 0x00(%[src0]) \n\t"
  334. "gsldlc1 %[ftmp5], 0x07(%[src1]) \n\t"
  335. "gsldrc1 %[ftmp5], 0x00(%[src1]) \n\t"
  336. "punpcklbh %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
  337. "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
  338. "punpcklbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
  339. "punpckhbh %[ftmp9], %[ftmp4], %[ftmp0] \n\t"
  340. "punpcklbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t"
  341. "punpckhbh %[ftmp11], %[ftmp5], %[ftmp0] \n\t"
  342. "bnez %[has_topleft], 1f \n\t"
  343. "pinsrh_0 %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
  344. "1: \n\t"
  345. "bnez %[has_topright], 2f \n\t"
  346. "pinsrh_3 %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
  347. "2: \n\t"
  348. "dli %[tmp0], 0x02 \n\t"
  349. "mtc1 %[tmp0], %[ftmp1] \n\t"
  350. "pshufh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
  351. "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
  352. "pmullh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
  353. "paddh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
  354. "paddh %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
  355. "paddh %[ftmp6], %[ftmp6], %[ftmp10] \n\t"
  356. "paddh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
  357. "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
  358. "paddh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
  359. "psrah %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
  360. "psrah %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
  361. "packushb %[ftmp4], %[ftmp6], %[ftmp7] \n\t"
  362. "sdc1 %[ftmp4], 0x00(%[src]) \n\t"
  363. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  364. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  365. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  366. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  367. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  368. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  369. [tmp0]"=&r"(tmp[0]),
  370. [src]"=r"(src)
  371. : [srcA]"r"((mips_reg)(src-stride-1)),
  372. [src0]"r"((mips_reg)(src-stride)),
  373. [src1]"r"((mips_reg)(src-stride+1)),
  374. [has_topleft]"r"(has_topleft), [has_topright]"r"(has_topright)
  375. : "memory"
  376. );
  377. __asm__ volatile (
  378. "dli %[tmp0], 0x02 \n\t"
  379. "1: \n\t"
  380. "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t"
  381. "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t"
  382. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  383. "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t"
  384. "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t"
  385. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  386. "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t"
  387. "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t"
  388. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  389. "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t"
  390. "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t"
  391. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  392. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  393. "bnez %[tmp0], 1b \n\t"
  394. : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
  395. [src]"+&r"(src)
  396. : [stride]"r"((mips_reg)stride)
  397. : "memory"
  398. );
  399. }
  400. void ff_pred4x4_dc_8_mmi(uint8_t *src, const uint8_t *topright,
  401. ptrdiff_t stride)
  402. {
  403. const int dc = (src[-stride] + src[1-stride] + src[2-stride]
  404. + src[3-stride] + src[-1+0*stride] + src[-1+1*stride]
  405. + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
  406. uint64_t tmp[2];
  407. mips_reg addr[1];
  408. __asm__ volatile (
  409. PTR_ADDU "%[tmp0], %[dc], $0 \n\t"
  410. "dmul %[tmp1], %[tmp0], %[ff_pb_1] \n\t"
  411. "xor %[addr0], %[addr0], %[addr0] \n\t"
  412. "gsswx %[tmp1], 0x00(%[src], %[addr0]) \n\t"
  413. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  414. "gsswx %[tmp1], 0x00(%[src], %[addr0]) \n\t"
  415. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  416. "gsswx %[tmp1], 0x00(%[src], %[addr0]) \n\t"
  417. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  418. "gsswx %[tmp1], 0x00(%[src], %[addr0]) \n\t"
  419. : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  420. [addr0]"=&r"(addr[0])
  421. : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride),
  422. [dc]"r"(dc), [ff_pb_1]"r"(ff_pb_1)
  423. : "memory"
  424. );
  425. }
  426. void ff_pred8x8_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
  427. {
  428. uint64_t tmp[2];
  429. mips_reg addr[2];
  430. __asm__ volatile (
  431. PTR_SUBU "%[addr0], %[src], %[stride] \n\t"
  432. PTR_ADDU "%[addr1], %[src], $0 \n\t"
  433. "ldl %[tmp0], 0x07(%[addr0]) \n\t"
  434. "ldr %[tmp0], 0x00(%[addr0]) \n\t"
  435. "dli %[tmp1], 0x04 \n\t"
  436. "1: \n\t"
  437. "sdl %[tmp0], 0x07(%[addr1]) \n\t"
  438. "sdr %[tmp0], 0x00(%[addr1]) \n\t"
  439. PTR_ADDU "%[addr1], %[stride] \n\t"
  440. "sdl %[tmp0], 0x07(%[addr1]) \n\t"
  441. "sdr %[tmp0], 0x00(%[addr1]) \n\t"
  442. "daddi %[tmp1], -0x01 \n\t"
  443. PTR_ADDU "%[addr1], %[stride] \n\t"
  444. "bnez %[tmp1], 1b \n\t"
  445. : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  446. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
  447. : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride)
  448. : "memory"
  449. );
  450. }
  451. void ff_pred8x8_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
  452. {
  453. uint64_t tmp[3];
  454. mips_reg addr[2];
  455. __asm__ volatile (
  456. PTR_ADDI "%[addr0], %[src], -0x01 \n\t"
  457. PTR_ADDU "%[addr1], %[src], $0 \n\t"
  458. "dli %[tmp0], 0x04 \n\t"
  459. "1: \n\t"
  460. "lbu %[tmp1], 0x00(%[addr0]) \n\t"
  461. "dmul %[tmp2], %[tmp1], %[ff_pb_1] \n\t"
  462. "swl %[tmp2], 0x07(%[addr1]) \n\t"
  463. "swr %[tmp2], 0x00(%[addr1]) \n\t"
  464. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  465. PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
  466. "lbu %[tmp1], 0x00(%[addr0]) \n\t"
  467. "dmul %[tmp2], %[tmp1], %[ff_pb_1] \n\t"
  468. "swl %[tmp2], 0x07(%[addr1]) \n\t"
  469. "swr %[tmp2], 0x00(%[addr1]) \n\t"
  470. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  471. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  472. PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
  473. "bnez %[tmp0], 1b \n\t"
  474. : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  475. [tmp2]"=&r"(tmp[2]),
  476. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
  477. : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride),
  478. [ff_pb_1]"r"(ff_pb_1)
  479. : "memory"
  480. );
  481. }
  482. void ff_pred8x8_top_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
  483. {
  484. double ftmp[4];
  485. uint64_t tmp[1];
  486. mips_reg addr[1];
  487. __asm__ volatile (
  488. "dli %[tmp0], 0x02 \n\t"
  489. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  490. PTR_SUBU "%[addr0], %[src], %[stride] \n\t"
  491. "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
  492. "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
  493. "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
  494. "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
  495. "biadd %[ftmp2], %[ftmp2] \n\t"
  496. "biadd %[ftmp3], %[ftmp3] \n\t"
  497. "mtc1 %[tmp0], %[ftmp1] \n\t"
  498. "pshufh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  499. "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  500. "pshufh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  501. "paddush %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
  502. "paddush %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
  503. "mtc1 %[tmp0], %[ftmp1] \n\t"
  504. "psrlh %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
  505. "psrlh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
  506. "packushb %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
  507. "gssdlc1 %[ftmp1], 0x07(%[src]) \n\t"
  508. "gssdrc1 %[ftmp1], 0x00(%[src]) \n\t"
  509. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  510. "gssdlc1 %[ftmp1], 0x07(%[src]) \n\t"
  511. "gssdrc1 %[ftmp1], 0x00(%[src]) \n\t"
  512. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  513. "gssdlc1 %[ftmp1], 0x07(%[src]) \n\t"
  514. "gssdrc1 %[ftmp1], 0x00(%[src]) \n\t"
  515. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  516. "gssdlc1 %[ftmp1], 0x07(%[src]) \n\t"
  517. "gssdrc1 %[ftmp1], 0x00(%[src]) \n\t"
  518. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  519. "gssdlc1 %[ftmp1], 0x07(%[src]) \n\t"
  520. "gssdrc1 %[ftmp1], 0x00(%[src]) \n\t"
  521. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  522. "gssdlc1 %[ftmp1], 0x07(%[src]) \n\t"
  523. "gssdrc1 %[ftmp1], 0x00(%[src]) \n\t"
  524. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  525. "gssdlc1 %[ftmp1], 0x07(%[src]) \n\t"
  526. "gssdrc1 %[ftmp1], 0x00(%[src]) \n\t"
  527. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  528. "gssdlc1 %[ftmp1], 0x07(%[src]) \n\t"
  529. "gssdrc1 %[ftmp1], 0x00(%[src]) \n\t"
  530. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  531. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  532. [tmp0]"=&r"(tmp[0]),
  533. [addr0]"=&r"(addr[0]),
  534. [src]"+&r"(src)
  535. : [stride]"r"((mips_reg)stride)
  536. : "memory"
  537. );
  538. }
  539. void ff_pred8x8_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
  540. {
  541. double ftmp[5];
  542. mips_reg addr[7];
  543. __asm__ volatile (
  544. "negu %[addr0], %[stride] \n\t"
  545. PTR_ADDU "%[addr0], %[addr0], %[src] \n\t"
  546. PTR_ADDIU "%[addr1], %[addr0], 0x04 \n\t"
  547. "lbu %[addr2], 0x00(%[addr0]) \n\t"
  548. PTR_ADDU "%[addr3], $0, %[addr2] \n\t"
  549. PTR_ADDIU "%[addr0], 0x01 \n\t"
  550. "lbu %[addr2], 0x00(%[addr1]) \n\t"
  551. PTR_ADDU "%[addr4], $0, %[addr2] \n\t"
  552. PTR_ADDIU "%[addr1], 0x01 \n\t"
  553. "lbu %[addr2], 0x00(%[addr0]) \n\t"
  554. PTR_ADDU "%[addr3], %[addr3], %[addr2] \n\t"
  555. PTR_ADDIU "%[addr0], 0x01 \n\t"
  556. "lbu %[addr2], 0x00(%[addr1]) \n\t"
  557. PTR_ADDU "%[addr4], %[addr4], %[addr2] \n\t"
  558. PTR_ADDIU "%[addr1], 0x01 \n\t"
  559. "lbu %[addr2], 0x00(%[addr0]) \n\t"
  560. PTR_ADDU "%[addr3], %[addr3], %[addr2] \n\t"
  561. PTR_ADDIU "%[addr0], 0x01 \n\t"
  562. "lbu %[addr2], 0x00(%[addr1]) \n\t"
  563. PTR_ADDU "%[addr4], %[addr4], %[addr2] \n\t"
  564. PTR_ADDIU "%[addr1], 0x01 \n\t"
  565. "lbu %[addr2], 0x00(%[addr0]) \n\t"
  566. PTR_ADDU "%[addr3], %[addr3], %[addr2] \n\t"
  567. PTR_ADDIU "%[addr0], 0x01 \n\t"
  568. "lbu %[addr2], 0x00(%[addr1]) \n\t"
  569. PTR_ADDU "%[addr4], %[addr4], %[addr2] \n\t"
  570. PTR_ADDIU "%[addr1], 0x01 \n\t"
  571. "dli %[addr2], -0x01 \n\t"
  572. PTR_ADDU "%[addr2], %[addr2], %[src] \n\t"
  573. "lbu %[addr1], 0x00(%[addr2]) \n\t"
  574. PTR_ADDU "%[addr5], $0, %[addr1] \n\t"
  575. PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
  576. "lbu %[addr1], 0x00(%[addr2]) \n\t"
  577. PTR_ADDU "%[addr5], %[addr5], %[addr1] \n\t"
  578. PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
  579. "lbu %[addr1], 0x00(%[addr2]) \n\t"
  580. PTR_ADDU "%[addr5], %[addr5], %[addr1] \n\t"
  581. PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
  582. "lbu %[addr1], 0x00(%[addr2]) \n\t"
  583. PTR_ADDU "%[addr5], %[addr5], %[addr1] \n\t"
  584. PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
  585. "lbu %[addr1], 0x00(%[addr2]) \n\t"
  586. PTR_ADDU "%[addr6], $0, %[addr1] \n\t"
  587. PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
  588. "lbu %[addr1], 0x00(%[addr2]) \n\t"
  589. PTR_ADDU "%[addr6], %[addr6], %[addr1] \n\t"
  590. PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
  591. "lbu %[addr1], 0x00(%[addr2]) \n\t"
  592. PTR_ADDU "%[addr6], %[addr6], %[addr1] \n\t"
  593. PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
  594. "lbu %[addr1], 0x00(%[addr2]) \n\t"
  595. PTR_ADDU "%[addr6], %[addr6], %[addr1] \n\t"
  596. PTR_ADDU "%[addr3], %[addr3], %[addr5] \n\t"
  597. PTR_ADDIU "%[addr3], %[addr3], 0x04 \n\t"
  598. PTR_ADDIU "%[addr4], %[addr4], 0x02 \n\t"
  599. PTR_ADDIU "%[addr1], %[addr6], 0x02 \n\t"
  600. PTR_ADDU "%[addr2], %[addr4], %[addr1] \n\t"
  601. PTR_SRL "%[addr3], 0x03 \n\t"
  602. PTR_SRL "%[addr4], 0x02 \n\t"
  603. PTR_SRL "%[addr1], 0x02 \n\t"
  604. PTR_SRL "%[addr2], 0x03 \n\t"
  605. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  606. "dmtc1 %[addr3], %[ftmp1] \n\t"
  607. "pshufh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  608. "dmtc1 %[addr4], %[ftmp2] \n\t"
  609. "pshufh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  610. "dmtc1 %[addr1], %[ftmp3] \n\t"
  611. "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  612. "dmtc1 %[addr2], %[ftmp4] \n\t"
  613. "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  614. "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  615. "packushb %[ftmp2], %[ftmp3], %[ftmp4] \n\t"
  616. PTR_ADDU "%[addr0], $0, %[src] \n\t"
  617. "sdc1 %[ftmp1], 0x00(%[addr0]) \n\t"
  618. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  619. "sdc1 %[ftmp1], 0x00(%[addr0]) \n\t"
  620. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  621. "sdc1 %[ftmp1], 0x00(%[addr0]) \n\t"
  622. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  623. "sdc1 %[ftmp1], 0x00(%[addr0]) \n\t"
  624. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  625. "sdc1 %[ftmp2], 0x00(%[addr0]) \n\t"
  626. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  627. "sdc1 %[ftmp2], 0x00(%[addr0]) \n\t"
  628. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  629. "sdc1 %[ftmp2], 0x00(%[addr0]) \n\t"
  630. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  631. "sdc1 %[ftmp2], 0x00(%[addr0]) \n\t"
  632. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  633. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  634. [ftmp4]"=&f"(ftmp[4]),
  635. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
  636. [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
  637. [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
  638. [addr6]"=&r"(addr[6])
  639. : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride)
  640. : "memory"
  641. );
  642. }
  643. void ff_pred8x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
  644. {
  645. double ftmp[1];
  646. uint64_t tmp[1];
  647. __asm__ volatile (
  648. "gsldlc1 %[ftmp0], 0x07(%[srcA]) \n\t"
  649. "gsldrc1 %[ftmp0], 0x00(%[srcA]) \n\t"
  650. "dli %[tmp0], 0x04 \n\t"
  651. "1: \n\t"
  652. "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t"
  653. "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t"
  654. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  655. "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t"
  656. "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t"
  657. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  658. "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t"
  659. "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t"
  660. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  661. "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t"
  662. "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t"
  663. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  664. PTR_ADDU "%[src], %[src], %[stride] \n\t"
  665. "bnez %[tmp0], 1b \n\t"
  666. : [ftmp0]"=&f"(ftmp[0]),
  667. [tmp0]"=&r"(tmp[0]),
  668. [src]"+&r"(src)
  669. : [stride]"r"((mips_reg)stride), [srcA]"r"((mips_reg)(src-stride))
  670. : "memory"
  671. );
  672. }
  673. void ff_pred8x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
  674. {
  675. uint64_t tmp[3];
  676. mips_reg addr[2];
  677. __asm__ volatile (
  678. PTR_ADDI "%[addr0], %[src], -0x01 \n\t"
  679. PTR_ADDU "%[addr1], %[src], $0 \n\t"
  680. "dli %[tmp0], 0x08 \n\t"
  681. "1: \n\t"
  682. "lbu %[tmp1], 0x00(%[addr0]) \n\t"
  683. "dmul %[tmp2], %[tmp1], %[ff_pb_1] \n\t"
  684. "swl %[tmp2], 0x07(%[addr1]) \n\t"
  685. "swr %[tmp2], 0x00(%[addr1]) \n\t"
  686. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  687. PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
  688. "lbu %[tmp1], 0x00(%[addr0]) \n\t"
  689. "dmul %[tmp2], %[tmp1], %[ff_pb_1] \n\t"
  690. "swl %[tmp2], 0x07(%[addr1]) \n\t"
  691. "swr %[tmp2], 0x00(%[addr1]) \n\t"
  692. "daddi %[tmp0], %[tmp0], -0x01 \n\t"
  693. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  694. PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
  695. "bnez %[tmp0], 1b \n\t"
  696. : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  697. [tmp2]"=&r"(tmp[2]),
  698. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
  699. : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride),
  700. [ff_pb_1]"r"(ff_pb_1)
  701. : "memory"
  702. );
  703. }
  704. static inline void pred16x16_plane_compat_mmi(uint8_t *src, int stride,
  705. const int svq3, const int rv40)
  706. {
  707. double ftmp[11];
  708. uint64_t tmp[7];
  709. mips_reg addr[1];
  710. __asm__ volatile(
  711. PTR_SUBU "%[addr0], %[src], %[stride] \n\t"
  712. "dli %[tmp2], 0x20 \n\t"
  713. "dmtc1 %[tmp2], %[ftmp4] \n\t"
  714. "gsldlc1 %[ftmp0], 0x06(%[addr0]) \n\t"
  715. "gsldlc1 %[ftmp2], 0x0f(%[addr0]) \n\t"
  716. "gsldrc1 %[ftmp0], -0x01(%[addr0]) \n\t"
  717. "gsldrc1 %[ftmp2], 0x08(%[addr0]) \n\t"
  718. "dsrl %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
  719. "dsrl %[ftmp3], %[ftmp2], %[ftmp4] \n\t"
  720. "xor %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
  721. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  722. "punpcklbh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
  723. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  724. "punpcklbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
  725. "pmullh %[ftmp0], %[ftmp0], %[ff_pw_m8tom5] \n\t"
  726. "pmullh %[ftmp1], %[ftmp1], %[ff_pw_m4tom1] \n\t"
  727. "pmullh %[ftmp2], %[ftmp2], %[ff_pw_1to4] \n\t"
  728. "pmullh %[ftmp3], %[ftmp3], %[ff_pw_5to8] \n\t"
  729. "paddsh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  730. "paddsh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  731. "paddsh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  732. "dli %[tmp2], 0x0e \n\t"
  733. "dmtc1 %[tmp2], %[ftmp4] \n\t"
  734. "pshufh %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
  735. "paddsh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  736. "dli %[tmp2], 0x01 \n\t"
  737. "dmtc1 %[tmp2], %[ftmp4] \n\t"
  738. "pshufh %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
  739. "paddsh %[ftmp5], %[ftmp0], %[ftmp1] \n\t"
  740. PTR_ADDIU "%[addr0], %[src], -0x01 \n\t"
  741. PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t"
  742. "lbu %[tmp2], 0x00(%[addr0]) \n\t"
  743. "lbu %[tmp6], 0x10(%[addr0]) \n\t"
  744. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  745. "lbu %[tmp3], 0x00(%[addr0]) \n\t"
  746. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  747. "lbu %[tmp4], 0x00(%[addr0]) \n\t"
  748. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  749. "lbu %[tmp5], 0x00(%[addr0]) \n\t"
  750. "dsll %[tmp3], %[tmp3], 0x10 \n\t"
  751. "dsll %[tmp4], %[tmp4], 0x20 \n\t"
  752. "dsll %[tmp5], %[tmp5], 0x30 \n\t"
  753. "or %[tmp4], %[tmp4], %[tmp5] \n\t"
  754. "or %[tmp2], %[tmp2], %[tmp3] \n\t"
  755. "or %[tmp2], %[tmp2], %[tmp4] \n\t"
  756. "dmtc1 %[tmp2], %[ftmp0] \n\t"
  757. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  758. "lbu %[tmp2], 0x00(%[addr0]) \n\t"
  759. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  760. "lbu %[tmp3], 0x00(%[addr0]) \n\t"
  761. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  762. "lbu %[tmp4], 0x00(%[addr0]) \n\t"
  763. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  764. "lbu %[tmp5], 0x00(%[addr0]) \n\t"
  765. "dsll %[tmp3], %[tmp3], 0x10 \n\t"
  766. "dsll %[tmp4], %[tmp4], 0x20 \n\t"
  767. "dsll %[tmp5], %[tmp5], 0x30 \n\t"
  768. "or %[tmp4], %[tmp4], %[tmp5] \n\t"
  769. "or %[tmp2], %[tmp2], %[tmp3] \n\t"
  770. "or %[tmp2], %[tmp2], %[tmp4] \n\t"
  771. "dmtc1 %[tmp2], %[ftmp1] \n\t"
  772. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  773. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  774. "lbu %[tmp2], 0x00(%[addr0]) \n\t"
  775. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  776. "lbu %[tmp3], 0x00(%[addr0]) \n\t"
  777. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  778. "lbu %[tmp4], 0x00(%[addr0]) \n\t"
  779. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  780. "lbu %[tmp5], 0x00(%[addr0]) \n\t"
  781. "dsll %[tmp3], %[tmp3], 0x10 \n\t"
  782. "dsll %[tmp4], %[tmp4], 0x20 \n\t"
  783. "dsll %[tmp5], %[tmp5], 0x30 \n\t"
  784. "or %[tmp4], %[tmp4], %[tmp5] \n\t"
  785. "or %[tmp2], %[tmp2], %[tmp3] \n\t"
  786. "or %[tmp2], %[tmp2], %[tmp4] \n\t"
  787. "dmtc1 %[tmp2], %[ftmp2] \n\t"
  788. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  789. "lbu %[tmp2], 0x00(%[addr0]) \n\t"
  790. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  791. "lbu %[tmp3], 0x00(%[addr0]) \n\t"
  792. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  793. "lbu %[tmp4], 0x00(%[addr0]) \n\t"
  794. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  795. "lbu %[tmp5], 0x00(%[addr0]) \n\t"
  796. "daddu %[tmp6], %[tmp6], %[tmp5] \n\t"
  797. "daddiu %[tmp6], %[tmp6], 0x01 \n\t"
  798. "dsll %[tmp6], %[tmp6], 0x04 \n\t"
  799. "dsll %[tmp3], %[tmp3], 0x10 \n\t"
  800. "dsll %[tmp4], %[tmp4], 0x20 \n\t"
  801. "dsll %[tmp5], %[tmp5], 0x30 \n\t"
  802. "or %[tmp4], %[tmp4], %[tmp5] \n\t"
  803. "or %[tmp2], %[tmp2], %[tmp3] \n\t"
  804. "or %[tmp2], %[tmp2], %[tmp4] \n\t"
  805. "dmtc1 %[tmp2], %[ftmp3] \n\t"
  806. "pmullh %[ftmp0], %[ftmp0], %[ff_pw_m8tom5] \n\t"
  807. "pmullh %[ftmp1], %[ftmp1], %[ff_pw_m4tom1] \n\t"
  808. "pmullh %[ftmp2], %[ftmp2], %[ff_pw_1to4] \n\t"
  809. "pmullh %[ftmp3], %[ftmp3], %[ff_pw_5to8] \n\t"
  810. "paddsh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  811. "paddsh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  812. "paddsh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  813. "dli %[tmp2], 0x0e \n\t"
  814. "dmtc1 %[tmp2], %[ftmp4] \n\t"
  815. "pshufh %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
  816. "paddsh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  817. "dli %[tmp2], 0x01 \n\t"
  818. "dmtc1 %[tmp2], %[ftmp4] \n\t"
  819. "pshufh %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
  820. "paddsh %[ftmp6], %[ftmp0], %[ftmp1] \n\t"
  821. "dmfc1 %[tmp0], %[ftmp5] \n\t"
  822. "dsll %[tmp0], %[tmp0], 0x30 \n\t"
  823. "dsra %[tmp0], %[tmp0], 0x30 \n\t"
  824. "dmfc1 %[tmp1], %[ftmp6] \n\t"
  825. "dsll %[tmp1], %[tmp1], 0x30 \n\t"
  826. "dsra %[tmp1], %[tmp1], 0x30 \n\t"
  827. "beqz %[svq3], 1f \n\t"
  828. "dli %[tmp2], 0x04 \n\t"
  829. "ddiv %[tmp0], %[tmp0], %[tmp2] \n\t"
  830. "ddiv %[tmp1], %[tmp1], %[tmp2] \n\t"
  831. "dli %[tmp2], 0x05 \n\t"
  832. "dmul %[tmp0], %[tmp0], %[tmp2] \n\t"
  833. "dmul %[tmp1], %[tmp1], %[tmp2] \n\t"
  834. "dli %[tmp2], 0x10 \n\t"
  835. "ddiv %[tmp0], %[tmp0], %[tmp2] \n\t"
  836. "ddiv %[tmp1], %[tmp1], %[tmp2] \n\t"
  837. "daddu %[tmp2], %[tmp0], $0 \n\t"
  838. "daddu %[tmp0], %[tmp1], $0 \n\t"
  839. "daddu %[tmp1], %[tmp2], $0 \n\t"
  840. "b 2f \n\t"
  841. "1: \n\t"
  842. "beqz %[rv40], 1f \n\t"
  843. "dsra %[tmp2], %[tmp0], 0x02 \n\t"
  844. "daddu %[tmp0], %[tmp0], %[tmp2] \n\t"
  845. "dsra %[tmp2], %[tmp1], 0x02 \n\t"
  846. "daddu %[tmp1], %[tmp1], %[tmp2] \n\t"
  847. "dsra %[tmp0], %[tmp0], 0x04 \n\t"
  848. "dsra %[tmp1], %[tmp1], 0x04 \n\t"
  849. "b 2f \n\t"
  850. "1: \n\t"
  851. "dli %[tmp2], 0x05 \n\t"
  852. "dmul %[tmp0], %[tmp0], %[tmp2] \n\t"
  853. "dmul %[tmp1], %[tmp1], %[tmp2] \n\t"
  854. "daddiu %[tmp0], %[tmp0], 0x20 \n\t"
  855. "daddiu %[tmp1], %[tmp1], 0x20 \n\t"
  856. "dsra %[tmp0], %[tmp0], 0x06 \n\t"
  857. "dsra %[tmp1], %[tmp1], 0x06 \n\t"
  858. "2: \n\t"
  859. "daddu %[tmp3], %[tmp0], %[tmp1] \n\t"
  860. "dli %[tmp2], 0x07 \n\t"
  861. "dmul %[tmp3], %[tmp3], %[tmp2] \n\t"
  862. "dsubu %[tmp6], %[tmp6], %[tmp3] \n\t"
  863. "xor %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
  864. "dmtc1 %[tmp0], %[ftmp0] \n\t"
  865. "pshufh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  866. "dmtc1 %[tmp1], %[ftmp5] \n\t"
  867. "pshufh %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
  868. "dmtc1 %[tmp6], %[ftmp6] \n\t"
  869. "pshufh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
  870. "dli %[tmp2], 0x05 \n\t"
  871. "dmtc1 %[tmp2], %[ftmp7] \n\t"
  872. "pmullh %[ftmp1], %[ff_pw_0to3], %[ftmp0] \n\t"
  873. "dmtc1 %[ff_pw_4to7], %[ftmp2] \n\t"
  874. "pmullh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  875. "dmtc1 %[ff_pw_8tob], %[ftmp3] \n\t"
  876. "pmullh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  877. "dmtc1 %[ff_pw_ctof], %[ftmp4] \n\t"
  878. "pmullh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  879. "dli %[tmp0], 0x10 \n\t"
  880. PTR_ADDU "%[addr0], %[src], $0 \n\t"
  881. "1: \n\t"
  882. "paddsh %[ftmp8], %[ftmp1], %[ftmp6] \n\t"
  883. "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  884. "paddsh %[ftmp9], %[ftmp2], %[ftmp6] \n\t"
  885. "psrah %[ftmp9], %[ftmp9], %[ftmp7] \n\t"
  886. "packushb %[ftmp0], %[ftmp8], %[ftmp9] \n\t"
  887. "gssdlc1 %[ftmp0], 0x07(%[addr0]) \n\t"
  888. "gssdrc1 %[ftmp0], 0x00(%[addr0]) \n\t"
  889. "paddsh %[ftmp8], %[ftmp3], %[ftmp6] \n\t"
  890. "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  891. "paddsh %[ftmp9], %[ftmp4], %[ftmp6] \n\t"
  892. "psrah %[ftmp9], %[ftmp9], %[ftmp7] \n\t"
  893. "packushb %[ftmp0], %[ftmp8], %[ftmp9] \n\t"
  894. "gssdlc1 %[ftmp0], 0x0f(%[addr0]) \n\t"
  895. "gssdrc1 %[ftmp0], 0x08(%[addr0]) \n\t"
  896. "paddsh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
  897. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  898. "daddiu %[tmp0], %[tmp0], -0x01 \n\t"
  899. "bnez %[tmp0], 1b \n\t"
  900. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  901. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  902. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  903. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  904. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  905. [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  906. [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
  907. [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]),
  908. [tmp6]"=&r"(tmp[6]),
  909. [addr0]"=&r"(addr[0])
  910. : [src]"r"(src), [stride]"r"((mips_reg)stride),
  911. [svq3]"r"(svq3), [rv40]"r"(rv40),
  912. [ff_pw_m8tom5]"f"(ff_pw_m8tom5), [ff_pw_m4tom1]"f"(ff_pw_m4tom1),
  913. [ff_pw_1to4]"f"(ff_pw_1to4), [ff_pw_5to8]"f"(ff_pw_5to8),
  914. [ff_pw_0to3]"f"(ff_pw_0to3), [ff_pw_4to7]"r"(ff_pw_4to7),
  915. [ff_pw_8tob]"r"(ff_pw_8tob), [ff_pw_ctof]"r"(ff_pw_ctof)
  916. : "memory"
  917. );
  918. }
  /*
   * H.264 entry point for the 16x16 plane predictor.
   *
   * Thin wrapper: forwards src and stride unchanged to
   * pred16x16_plane_compat_mmi() with both trailing selector arguments
   * set to zero.
   *
   * NOTE(review): the helper's signature is outside this view; the
   * selectors are assumed to be (svq3, rv40) in that order - confirm.
   */
  void ff_pred16x16_plane_h264_8_mmi(uint8_t *src, ptrdiff_t stride)
  {
      const int svq3_sel = 0;
      const int rv40_sel = 0;

      pred16x16_plane_compat_mmi(src, stride, svq3_sel, rv40_sel);
  }
  /*
   * SVQ3 entry point for the 16x16 plane predictor.
   *
   * Thin wrapper: forwards src and stride unchanged to
   * pred16x16_plane_compat_mmi() with the first trailing selector set
   * to one and the second set to zero.
   *
   * NOTE(review): the helper's signature is outside this view; the
   * selectors are assumed to be (svq3, rv40) in that order - confirm.
   */
  void ff_pred16x16_plane_svq3_8_mmi(uint8_t *src, ptrdiff_t stride)
  {
      const int svq3_sel = 1;
      const int rv40_sel = 0;

      pred16x16_plane_compat_mmi(src, stride, svq3_sel, rv40_sel);
  }
  /*
   * RV40 entry point for the 16x16 plane predictor.
   *
   * Thin wrapper: forwards src and stride unchanged to
   * pred16x16_plane_compat_mmi() with the first trailing selector set
   * to zero and the second set to one.
   *
   * NOTE(review): the helper's signature is outside this view; the
   * selectors are assumed to be (svq3, rv40) in that order - confirm.
   */
  void ff_pred16x16_plane_rv40_8_mmi(uint8_t *src, ptrdiff_t stride)
  {
      const int svq3_sel = 0;
      const int rv40_sel = 1;

      pred16x16_plane_compat_mmi(src, stride, svq3_sel, rv40_sel);
  }