You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

2733 lines
164KB

  1. /*
  2. * Loongson SIMD optimized h264dsp
  3. *
  4. * Copyright (c) 2015 Loongson Technology Corporation Limited
  5. * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
  6. * Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
  7. * Heiher <r@hev.cc>
  8. *
  9. * This file is part of FFmpeg.
  10. *
  11. * FFmpeg is free software; you can redistribute it and/or
  12. * modify it under the terms of the GNU Lesser General Public
  13. * License as published by the Free Software Foundation; either
  14. * version 2.1 of the License, or (at your option) any later version.
  15. *
  16. * FFmpeg is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  19. * Lesser General Public License for more details.
  20. *
  21. * You should have received a copy of the GNU Lesser General Public
  22. * License along with FFmpeg; if not, write to the Free Software
  23. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24. */
  25. #include "libavcodec/bit_depth_template.c"
  26. #include "h264dsp_mips.h"
  27. #include "libavutil/mips/mmiutils.h"
  28. void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
  29. {
  30. double ftmp[9];
  31. DECLARE_VAR_LOW32;
  32. DECLARE_VAR_ALL64;
  33. __asm__ volatile (
  34. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  35. MMI_LDC1(%[ftmp1], %[src], 0x00)
  36. MMI_LDC1(%[ftmp2], %[src], 0x08)
  37. MMI_LDC1(%[ftmp3], %[src], 0x10)
  38. MMI_LDC1(%[ftmp4], %[src], 0x18)
  39. MMI_ULWC1(%[ftmp5], %[dst0], 0x00)
  40. MMI_ULWC1(%[ftmp6], %[dst1], 0x00)
  41. MMI_ULWC1(%[ftmp7], %[dst2], 0x00)
  42. MMI_ULWC1(%[ftmp8], %[dst3], 0x00)
  43. "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  44. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  45. "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  46. "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
  47. "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  48. "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
  49. "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
  50. "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
  51. "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  52. "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  53. "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  54. "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  55. MMI_SWC1(%[ftmp1], %[dst0], 0x00)
  56. MMI_SWC1(%[ftmp2], %[dst1], 0x00)
  57. MMI_SWC1(%[ftmp3], %[dst2], 0x00)
  58. MMI_SWC1(%[ftmp4], %[dst3], 0x00)
  59. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  60. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  61. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  62. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  63. RESTRICT_ASM_LOW32
  64. RESTRICT_ASM_ALL64
  65. [ftmp8]"=&f"(ftmp[8])
  66. : [dst0]"r"(dst), [dst1]"r"(dst+stride),
  67. [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
  68. [src]"r"(src)
  69. : "memory"
  70. );
  71. memset(src, 0, 32);
  72. }
  73. void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
  74. {
  75. double ftmp[12];
  76. uint64_t tmp[1];
  77. DECLARE_VAR_LOW32;
  78. DECLARE_VAR_ALL64;
  79. DECLARE_VAR_ADDRT;
  80. __asm__ volatile (
  81. "dli %[tmp0], 0x01 \n\t"
  82. MMI_LDC1(%[ftmp0], %[block], 0x00)
  83. "mtc1 %[tmp0], %[ftmp8] \n\t"
  84. MMI_LDC1(%[ftmp1], %[block], 0x08)
  85. "dli %[tmp0], 0x06 \n\t"
  86. MMI_LDC1(%[ftmp2], %[block], 0x10)
  87. "mtc1 %[tmp0], %[ftmp9] \n\t"
  88. "psrah %[ftmp4], %[ftmp1], %[ftmp8] \n\t"
  89. MMI_LDC1(%[ftmp3], %[block], 0x18)
  90. "psrah %[ftmp5], %[ftmp3], %[ftmp8] \n\t"
  91. "psubh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
  92. "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
  93. "paddh %[ftmp10], %[ftmp2], %[ftmp0] \n\t"
  94. "psubh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  95. "paddh %[ftmp11], %[ftmp5], %[ftmp10] \n\t"
  96. "psubh %[ftmp2], %[ftmp10], %[ftmp5] \n\t"
  97. "paddh %[ftmp10], %[ftmp4], %[ftmp0] \n\t"
  98. "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  99. "punpckhhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t"
  100. "punpcklhw %[ftmp5], %[ftmp11], %[ftmp10] \n\t"
  101. "punpckhhw %[ftmp4], %[ftmp0], %[ftmp2] \n\t"
  102. "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  103. "punpckhwd %[ftmp2], %[ftmp5], %[ftmp0] \n\t"
  104. "punpcklwd %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  105. "punpcklwd %[ftmp10], %[ftmp1], %[ftmp4] \n\t"
  106. "punpckhwd %[ftmp0], %[ftmp1], %[ftmp4] \n\t"
  107. "paddh %[ftmp5], %[ftmp5], %[ff_pw_32] \n\t"
  108. "psrah %[ftmp4], %[ftmp2], %[ftmp8] \n\t"
  109. "psrah %[ftmp3], %[ftmp0], %[ftmp8] \n\t"
  110. "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  111. "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
  112. "paddh %[ftmp1], %[ftmp10], %[ftmp5] \n\t"
  113. "psubh %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
  114. "paddh %[ftmp10], %[ftmp3], %[ftmp1] \n\t"
  115. "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  116. "paddh %[ftmp11], %[ftmp4], %[ftmp5] \n\t"
  117. "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  118. "psubh %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
  119. MMI_SDC1(%[ftmp7], %[block], 0x00)
  120. MMI_SDC1(%[ftmp7], %[block], 0x08)
  121. MMI_SDC1(%[ftmp7], %[block], 0x10)
  122. MMI_SDC1(%[ftmp7], %[block], 0x18)
  123. MMI_ULWC1(%[ftmp2], %[dst], 0x00)
  124. "psrah %[ftmp3], %[ftmp10], %[ftmp9] \n\t"
  125. MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
  126. "psrah %[ftmp4], %[ftmp11], %[ftmp9] \n\t"
  127. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
  128. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
  129. "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  130. "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  131. "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
  132. "packushb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
  133. MMI_SWC1(%[ftmp2], %[dst], 0x00)
  134. MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
  135. PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
  136. PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
  137. MMI_ULWC1(%[ftmp2], %[dst], 0x00)
  138. "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
  139. MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
  140. "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
  141. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
  142. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
  143. "paddh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
  144. "paddh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  145. "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
  146. MMI_SWC1(%[ftmp2], %[dst], 0x00)
  147. "packushb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
  148. MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
  149. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  150. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  151. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  152. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  153. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  154. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  155. RESTRICT_ASM_LOW32
  156. RESTRICT_ASM_ALL64
  157. RESTRICT_ASM_ADDRT
  158. [tmp0]"=&r"(tmp[0])
  159. : [dst]"r"(dst), [block]"r"(block),
  160. [stride]"r"((mips_reg)stride), [ff_pw_32]"f"(ff_pw_32)
  161. : "memory"
  162. );
  163. memset(block, 0, 32);
  164. }
  165. void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
  166. {
  167. double ftmp[16];
  168. uint64_t tmp[7];
  169. mips_reg addr[1];
  170. DECLARE_VAR_LOW32;
  171. DECLARE_VAR_ALL64;
  172. DECLARE_VAR_ADDRT;
  173. __asm__ volatile (
  174. "lhu %[tmp0], 0x00(%[block]) \n\t"
  175. PTR_ADDI "$29, $29, -0x20 \n\t"
  176. PTR_ADDIU "%[tmp0], %[tmp0], 0x20 \n\t"
  177. MMI_LDC1(%[ftmp1], %[block], 0x10)
  178. "sh %[tmp0], 0x00(%[block]) \n\t"
  179. MMI_LDC1(%[ftmp2], %[block], 0x20)
  180. "dli %[tmp0], 0x01 \n\t"
  181. MMI_LDC1(%[ftmp3], %[block], 0x30)
  182. "mtc1 %[tmp0], %[ftmp8] \n\t"
  183. MMI_LDC1(%[ftmp5], %[block], 0x50)
  184. MMI_LDC1(%[ftmp6], %[block], 0x60)
  185. MMI_LDC1(%[ftmp7], %[block], 0x70)
  186. "mov.d %[ftmp0], %[ftmp1] \n\t"
  187. "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
  188. "psrah %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
  189. "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  190. "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  191. "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  192. "paddh %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
  193. "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  194. "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  195. "psubh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
  196. "psubh %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
  197. "psrah %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
  198. "paddh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
  199. "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  200. "psrah %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
  201. "psubh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
  202. "dli %[tmp0], 0x02 \n\t"
  203. "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  204. "mtc1 %[tmp0], %[ftmp9] \n\t"
  205. "mov.d %[ftmp7], %[ftmp1] \n\t"
  206. "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
  207. "psrah %[ftmp3], %[ftmp4], %[ftmp9] \n\t"
  208. "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  209. "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
  210. "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  211. "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
  212. "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  213. "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  214. "mov.d %[ftmp5], %[ftmp6] \n\t"
  215. "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
  216. "psrah %[ftmp4], %[ftmp2], %[ftmp8] \n\t"
  217. "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
  218. "psubh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  219. MMI_LDC1(%[ftmp2], %[block], 0x00)
  220. MMI_LDC1(%[ftmp5], %[block], 0x40)
  221. "paddh %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
  222. "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
  223. "paddh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
  224. "psubh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
  225. "paddh %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  226. "paddh %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
  227. "psubh %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  228. "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
  229. "paddh %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
  230. "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  231. "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  232. "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  233. "psubh %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
  234. "paddh %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
  235. "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
  236. "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  237. "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
  238. "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  239. "psubh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  240. "paddh %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  241. MMI_SDC1(%[ftmp6], %[block], 0x00)
  242. "psubh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
  243. "punpckhhw %[ftmp6], %[ftmp7], %[ftmp0] \n\t"
  244. "punpcklhw %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  245. "punpckhhw %[ftmp0], %[ftmp3], %[ftmp1] \n\t"
  246. "punpcklhw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
  247. "punpckhwd %[ftmp1], %[ftmp7], %[ftmp3] \n\t"
  248. "punpcklwd %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
  249. "punpckhwd %[ftmp3], %[ftmp6], %[ftmp0] \n\t"
  250. "punpcklwd %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  251. MMI_LDC1(%[ftmp0], %[block], 0x00)
  252. MMI_SDC1(%[ftmp7], $29, 0x00)
  253. MMI_SDC1(%[ftmp1], $29, 0x10)
  254. "dmfc1 %[tmp1], %[ftmp6] \n\t"
  255. "dmfc1 %[tmp3], %[ftmp3] \n\t"
  256. "punpckhhw %[ftmp3], %[ftmp5], %[ftmp2] \n\t"
  257. "punpcklhw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
  258. "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t"
  259. "punpcklhw %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  260. "punpckhwd %[ftmp0], %[ftmp5], %[ftmp4] \n\t"
  261. "punpcklwd %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
  262. "punpckhwd %[ftmp4], %[ftmp3], %[ftmp2] \n\t"
  263. "punpcklwd %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
  264. MMI_SDC1(%[ftmp5], $29, 0x08)
  265. MMI_SDC1(%[ftmp0], $29, 0x18)
  266. "dmfc1 %[tmp2], %[ftmp3] \n\t"
  267. "dmfc1 %[tmp4], %[ftmp4] \n\t"
  268. MMI_LDC1(%[ftmp1], %[block], 0x18)
  269. MMI_LDC1(%[ftmp6], %[block], 0x28)
  270. MMI_LDC1(%[ftmp2], %[block], 0x38)
  271. MMI_LDC1(%[ftmp0], %[block], 0x58)
  272. MMI_LDC1(%[ftmp3], %[block], 0x68)
  273. MMI_LDC1(%[ftmp4], %[block], 0x78)
  274. "mov.d %[ftmp7], %[ftmp1] \n\t"
  275. "psrah %[ftmp5], %[ftmp0], %[ftmp8] \n\t"
  276. "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
  277. "paddh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  278. "paddh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
  279. "paddh %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
  280. "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  281. "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  282. "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  283. "psubh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
  284. "psubh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  285. "psrah %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
  286. "paddh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
  287. "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  288. "psrah %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
  289. "psubh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
  290. "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  291. "mov.d %[ftmp4], %[ftmp1] \n\t"
  292. "psrah %[ftmp2], %[ftmp5], %[ftmp9] \n\t"
  293. "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
  294. "paddh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
  295. "psrah %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
  296. "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  297. "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
  298. "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  299. "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  300. "mov.d %[ftmp0], %[ftmp3] \n\t"
  301. "psrah %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
  302. "psrah %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
  303. "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
  304. "psubh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  305. MMI_LDC1(%[ftmp6], %[block], 0x08)
  306. MMI_LDC1(%[ftmp0], %[block], 0x48)
  307. "paddh %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
  308. "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  309. "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  310. "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  311. "paddh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  312. "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  313. "psubh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
  314. "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  315. "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
  316. "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
  317. "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
  318. "paddh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  319. "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
  320. "paddh %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  321. "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
  322. "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  323. "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  324. "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  325. "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
  326. "paddh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  327. MMI_SDC1(%[ftmp3], %[block], 0x08)
  328. "psubh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  329. "punpckhhw %[ftmp3], %[ftmp4], %[ftmp7] \n\t"
  330. "punpcklhw %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
  331. "punpckhhw %[ftmp7], %[ftmp2], %[ftmp1] \n\t"
  332. "punpcklhw %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
  333. "punpckhwd %[ftmp1], %[ftmp4], %[ftmp2] \n\t"
  334. "punpcklwd %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
  335. "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t"
  336. "punpcklwd %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
  337. MMI_LDC1(%[ftmp7], %[block], 0x08)
  338. "dmfc1 %[tmp5], %[ftmp4] \n\t"
  339. "mov.d %[ftmp10], %[ftmp1] \n\t"
  340. "mov.d %[ftmp12], %[ftmp3] \n\t"
  341. "mov.d %[ftmp14], %[ftmp2] \n\t"
  342. "punpckhhw %[ftmp2], %[ftmp0], %[ftmp6] \n\t"
  343. "punpcklhw %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
  344. "punpckhhw %[ftmp6], %[ftmp5], %[ftmp7] \n\t"
  345. "punpcklhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  346. "punpckhwd %[ftmp7], %[ftmp0], %[ftmp5] \n\t"
  347. "punpcklwd %[ftmp0], %[ftmp0], %[ftmp5] \n\t"
  348. "punpckhwd %[ftmp5], %[ftmp2], %[ftmp6] \n\t"
  349. "punpcklwd %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
  350. "dmfc1 %[tmp6], %[ftmp0] \n\t"
  351. "mov.d %[ftmp11], %[ftmp7] \n\t"
  352. "mov.d %[ftmp13], %[ftmp2] \n\t"
  353. "mov.d %[ftmp15], %[ftmp5] \n\t"
  354. PTR_ADDIU "%[addr0], %[dst], 0x04 \n\t"
  355. "mov.d %[ftmp7], %[ftmp10] \n\t"
  356. "dmtc1 %[tmp3], %[ftmp6] \n\t"
  357. MMI_LDC1(%[ftmp1], $29, 0x10)
  358. "dmtc1 %[tmp1], %[ftmp3] \n\t"
  359. "mov.d %[ftmp4], %[ftmp1] \n\t"
  360. "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
  361. "psrah %[ftmp0], %[ftmp7], %[ftmp8] \n\t"
  362. "paddh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
  363. "paddh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
  364. "paddh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
  365. "paddh %[ftmp0], %[ftmp0], %[ftmp14] \n\t"
  366. "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
  367. "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  368. "psubh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  369. "psubh %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
  370. "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
  371. "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
  372. "psubh %[ftmp7], %[ftmp7], %[ftmp14] \n\t"
  373. "psrah %[ftmp5], %[ftmp14], %[ftmp8] \n\t"
  374. "psubh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  375. "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  376. "mov.d %[ftmp5], %[ftmp1] \n\t"
  377. "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
  378. "psrah %[ftmp6], %[ftmp0], %[ftmp9] \n\t"
  379. "paddh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
  380. "paddh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
  381. "psrah %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
  382. "psrah %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
  383. "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  384. "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  385. "mov.d %[ftmp7], %[ftmp12] \n\t"
  386. "psrah %[ftmp2], %[ftmp12], %[ftmp8] \n\t"
  387. "psrah %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
  388. "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  389. "psubh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
  390. MMI_LDC1(%[ftmp3], $29, 0x00)
  391. "dmtc1 %[tmp5], %[ftmp7] \n\t"
  392. "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
  393. "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
  394. "paddh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
  395. "psubh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
  396. "paddh %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  397. "paddh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
  398. "psubh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
  399. "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
  400. "paddh %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
  401. "psubh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  402. "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
  403. "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  404. "psubh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
  405. "paddh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  406. "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
  407. "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  408. "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
  409. "paddh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
  410. "psubh %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
  411. "paddh %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  412. MMI_SDC1(%[ftmp3], $29, 0x00)
  413. "psubh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
  414. MMI_SDC1(%[ftmp0], $29, 0x10)
  415. "dmfc1 %[tmp1], %[ftmp2] \n\t"
  416. "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
  417. MMI_SDC1(%[ftmp2], %[block], 0x00)
  418. MMI_SDC1(%[ftmp2], %[block], 0x08)
  419. MMI_SDC1(%[ftmp2], %[block], 0x10)
  420. MMI_SDC1(%[ftmp2], %[block], 0x18)
  421. MMI_SDC1(%[ftmp2], %[block], 0x20)
  422. MMI_SDC1(%[ftmp2], %[block], 0x28)
  423. MMI_SDC1(%[ftmp2], %[block], 0x30)
  424. MMI_SDC1(%[ftmp2], %[block], 0x38)
  425. MMI_SDC1(%[ftmp2], %[block], 0x40)
  426. MMI_SDC1(%[ftmp2], %[block], 0x48)
  427. MMI_SDC1(%[ftmp2], %[block], 0x50)
  428. MMI_SDC1(%[ftmp2], %[block], 0x58)
  429. MMI_SDC1(%[ftmp2], %[block], 0x60)
  430. MMI_SDC1(%[ftmp2], %[block], 0x68)
  431. MMI_SDC1(%[ftmp2], %[block], 0x70)
  432. MMI_SDC1(%[ftmp2], %[block], 0x78)
  433. "dli %[tmp3], 0x06 \n\t"
  434. "mtc1 %[tmp3], %[ftmp10] \n\t"
  435. MMI_ULWC1(%[ftmp3], %[dst], 0x00)
  436. MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
  437. "psrah %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
  438. "psrah %[ftmp4], %[ftmp4], %[ftmp10] \n\t"
  439. "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
  440. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  441. "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
  442. "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  443. "packushb %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
  444. "packushb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  445. MMI_SWC1(%[ftmp3], %[dst], 0x00)
  446. MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
  447. PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
  448. PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
  449. MMI_ULWC1(%[ftmp3], %[dst], 0x00)
  450. MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
  451. "psrah %[ftmp6], %[ftmp6], %[ftmp10] \n\t"
  452. "psrah %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
  453. "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
  454. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  455. "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
  456. "paddh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  457. "packushb %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
  458. "packushb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  459. MMI_SWC1(%[ftmp3], %[dst], 0x00)
  460. MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
  461. MMI_LDC1(%[ftmp5], $29, 0x00)
  462. MMI_LDC1(%[ftmp4], $29, 0x10)
  463. "dmtc1 %[tmp1], %[ftmp6] \n\t"
  464. PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
  465. PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
  466. MMI_ULWC1(%[ftmp3], %[dst], 0x00)
  467. MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
  468. "psrah %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
  469. "psrah %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
  470. "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
  471. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  472. "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
  473. "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t"
  474. "packushb %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
  475. "packushb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  476. MMI_SWC1(%[ftmp3], %[dst], 0x00)
  477. MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
  478. PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
  479. PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
  480. MMI_ULWC1(%[ftmp3], %[dst], 0x00)
  481. MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
  482. "psrah %[ftmp4], %[ftmp4], %[ftmp10] \n\t"
  483. "psrah %[ftmp6], %[ftmp6], %[ftmp10] \n\t"
  484. "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
  485. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  486. "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
  487. "paddh %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
  488. "packushb %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
  489. "packushb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  490. MMI_SWC1(%[ftmp3], %[dst], 0x00)
  491. MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
  492. "dmtc1 %[tmp4], %[ftmp1] \n\t"
  493. "dmtc1 %[tmp2], %[ftmp6] \n\t"
  494. MMI_LDC1(%[ftmp4], $29, 0x18)
  495. "mov.d %[ftmp5], %[ftmp4] \n\t"
  496. "psrah %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
  497. "psrah %[ftmp7], %[ftmp11], %[ftmp8] \n\t"
  498. "paddh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
  499. "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  500. "paddh %[ftmp7], %[ftmp7], %[ftmp15] \n\t"
  501. "paddh %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
  502. "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  503. "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
  504. "psubh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
  505. "psubh %[ftmp3], %[ftmp11], %[ftmp1] \n\t"
  506. "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
  507. "paddh %[ftmp5], %[ftmp5], %[ftmp15] \n\t"
  508. "psubh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
  509. "psrah %[ftmp2], %[ftmp15], %[ftmp8] \n\t"
  510. "psubh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
  511. "psubh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
  512. "mov.d %[ftmp2], %[ftmp4] \n\t"
  513. "psrah %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
  514. "psrah %[ftmp1], %[ftmp7], %[ftmp9] \n\t"
  515. "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
  516. "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  517. "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
  518. "psrah %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
  519. "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  520. "psubh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  521. "mov.d %[ftmp3], %[ftmp13] \n\t"
  522. "psrah %[ftmp0], %[ftmp13], %[ftmp8] \n\t"
  523. "psrah %[ftmp7], %[ftmp6], %[ftmp8] \n\t"
  524. "paddh %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
  525. "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
  526. MMI_LDC1(%[ftmp6], $29, 0x08)
  527. "dmtc1 %[tmp6], %[ftmp3] \n\t"
  528. "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
  529. "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  530. "paddh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
  531. "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
  532. "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
  533. "paddh %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
  534. "psubh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  535. "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  536. "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  537. "psubh %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
  538. "paddh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  539. "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  540. "psubh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  541. "paddh %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  542. "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
  543. "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  544. "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  545. "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
  546. "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
  547. "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
  548. MMI_SDC1(%[ftmp6], $29, 0x08)
  549. "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
  550. MMI_SDC1(%[ftmp7], $29, 0x18)
  551. "dmfc1 %[tmp2], %[ftmp0] \n\t"
  552. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  553. MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
  554. MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
  555. "psrah %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
  556. "psrah %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
  557. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  558. "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  559. "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
  560. "paddh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  561. "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  562. "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  563. MMI_SWC1(%[ftmp6], %[addr0], 0x00)
  564. MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
  565. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  566. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  567. MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
  568. MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
  569. "psrah %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
  570. "psrah %[ftmp4], %[ftmp4], %[ftmp10] \n\t"
  571. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  572. "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  573. "paddh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
  574. "paddh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
  575. "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  576. "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  577. MMI_SWC1(%[ftmp6], %[addr0], 0x00)
  578. MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
  579. MMI_LDC1(%[ftmp2], $29, 0x08)
  580. MMI_LDC1(%[ftmp5], $29, 0x18)
  581. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  582. "dmtc1 %[tmp2], %[ftmp1] \n\t"
  583. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  584. MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
  585. MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
  586. "psrah %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
  587. "psrah %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
  588. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  589. "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  590. "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
  591. "paddh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
  592. "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  593. "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  594. MMI_SWC1(%[ftmp6], %[addr0], 0x00)
  595. MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
  596. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  597. PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
  598. MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
  599. MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
  600. "psrah %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
  601. "psrah %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
  602. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  603. "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  604. "paddh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
  605. "paddh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
  606. "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  607. "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  608. MMI_SWC1(%[ftmp6], %[addr0], 0x00)
  609. MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
  610. PTR_ADDIU "$29, $29, 0x20 \n\t"
  611. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  612. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  613. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  614. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  615. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  616. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  617. [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
  618. [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
  619. [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  620. [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
  621. [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]),
  622. [tmp6]"=&r"(tmp[6]),
  623. RESTRICT_ASM_LOW32
  624. RESTRICT_ASM_ALL64
  625. RESTRICT_ASM_ADDRT
  626. [addr0]"=&r"(addr[0])
  627. : [dst]"r"(dst), [block]"r"(block),
  628. [stride]"r"((mips_reg)stride)
  629. : "$29","memory"
  630. );
  631. memset(block, 0, 128);
  632. }
  633. void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
  634. {
  635. int dc = (block[0] + 32) >> 6;
  636. double ftmp[6];
  637. DECLARE_VAR_LOW32;
  638. block[0] = 0;
  639. __asm__ volatile (
  640. "mtc1 %[dc], %[ftmp5] \n\t"
  641. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  642. "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  643. MMI_ULWC1(%[ftmp1], %[dst0], 0x00)
  644. MMI_ULWC1(%[ftmp2], %[dst1], 0x00)
  645. MMI_ULWC1(%[ftmp3], %[dst2], 0x00)
  646. MMI_ULWC1(%[ftmp4], %[dst3], 0x00)
  647. "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  648. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  649. "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  650. "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  651. "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  652. "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
  653. "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
  654. "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  655. "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  656. "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  657. "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  658. "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  659. MMI_SWC1(%[ftmp1], %[dst0], 0x00)
  660. MMI_SWC1(%[ftmp2], %[dst1], 0x00)
  661. MMI_SWC1(%[ftmp3], %[dst2], 0x00)
  662. MMI_SWC1(%[ftmp4], %[dst3], 0x00)
  663. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  664. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  665. [ftmp4]"=&f"(ftmp[4]),
  666. RESTRICT_ASM_LOW32
  667. [ftmp5]"=&f"(ftmp[5])
  668. : [dst0]"r"(dst), [dst1]"r"(dst+stride),
  669. [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
  670. [dc]"r"(dc)
  671. : "memory"
  672. );
  673. }
  /*
   * Add the rounded DC coefficient of an 8x8 block to an 8x8 pixel area.
   *
   * dc is (block[0] + 32) >> 6; block[0] is cleared afterwards.  The eight
   * rows start at dst and are stride bytes apart.  Each row is loaded as one
   * 64-bit value, widened to 16-bit lanes, has dc added with signed
   * saturation (paddsh) and is packed back to bytes with unsigned saturation
   * (packushb) before being stored to the same address.
   *
   * The rows are processed in two batches of four (dst0..dst3, dst4..dst7).
   */
  674. void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
  675. {
  676. int dc = (block[0] + 32) >> 6;
  677. double ftmp[10];
  678. DECLARE_VAR_ALL64;
  679. block[0] = 0;
  680. __asm__ volatile (
  /* ftmp0 = 0; ftmp5 = dc replicated into all four 16-bit lanes
   * (pshufh with an all-zero selector picks lane 0 everywhere). */
  681. "mtc1 %[dc], %[ftmp5] \n\t"
  682. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  683. "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  /* rows 0..3: load */
  684. MMI_LDC1(%[ftmp1], %[dst0], 0x00)
  685. MMI_LDC1(%[ftmp2], %[dst1], 0x00)
  686. MMI_LDC1(%[ftmp3], %[dst2], 0x00)
  687. MMI_LDC1(%[ftmp4], %[dst3], 0x00)
  /* zero-extend bytes to halfwords: high halves go to ftmp6..9,
   * low halves stay in ftmp1..4 */
  688. "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
  689. "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  690. "punpckhbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
  691. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  692. "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t"
  693. "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  694. "punpckhbh %[ftmp9], %[ftmp4], %[ftmp0] \n\t"
  695. "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  /* add dc to every lane */
  696. "paddsh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
  697. "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  698. "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  699. "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
  700. "paddsh %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
  701. "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
  702. "paddsh %[ftmp9], %[ftmp9], %[ftmp5] \n\t"
  703. "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  /* recombine low/high halves into 8 clamped bytes per row and store */
  704. "packushb %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
  705. "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
  706. "packushb %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
  707. "packushb %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
  708. MMI_SDC1(%[ftmp1], %[dst0], 0x00)
  709. MMI_SDC1(%[ftmp2], %[dst1], 0x00)
  710. MMI_SDC1(%[ftmp3], %[dst2], 0x00)
  711. MMI_SDC1(%[ftmp4], %[dst3], 0x00)
  /* rows 4..7: same sequence, reusing ftmp0 (zero) and ftmp5 (dc) */
  712. MMI_LDC1(%[ftmp1], %[dst4], 0x00)
  713. MMI_LDC1(%[ftmp2], %[dst5], 0x00)
  714. MMI_LDC1(%[ftmp3], %[dst6], 0x00)
  715. MMI_LDC1(%[ftmp4], %[dst7], 0x00)
  716. "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
  717. "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  718. "punpckhbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
  719. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  720. "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t"
  721. "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  722. "punpckhbh %[ftmp9], %[ftmp4], %[ftmp0] \n\t"
  723. "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  724. "paddsh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
  725. "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  726. "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  727. "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
  728. "paddsh %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
  729. "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
  730. "paddsh %[ftmp9], %[ftmp9], %[ftmp5] \n\t"
  731. "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  732. "packushb %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
  733. "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
  734. "packushb %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
  735. "packushb %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
  736. MMI_SDC1(%[ftmp1], %[dst4], 0x00)
  737. MMI_SDC1(%[ftmp2], %[dst5], 0x00)
  738. MMI_SDC1(%[ftmp3], %[dst6], 0x00)
  739. MMI_SDC1(%[ftmp4], %[dst7], 0x00)
  /* outputs: ten early-clobber FP scratch registers */
  740. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  741. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  742. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  743. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  744. [ftmp8]"=&f"(ftmp[8]),
  745. RESTRICT_ASM_ALL64
  746. [ftmp9]"=&f"(ftmp[9])
  /* inputs: the eight row addresses and the DC value */
  747. : [dst0]"r"(dst), [dst1]"r"(dst+stride),
  748. [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
  749. [dst4]"r"(dst+4*stride), [dst5]"r"(dst+5*stride),
  750. [dst6]"r"(dst+6*stride), [dst7]"r"(dst+7*stride),
  751. [dc]"r"(dc)
  /* pixels are read and written through the row pointers */
  752. : "memory"
  753. );
  754. }
  755. void ff_h264_idct_add16_8_mmi(uint8_t *dst, const int *block_offset,
  756. int16_t *block, int stride, const uint8_t nnzc[15*8])
  757. {
  758. int i;
  759. for(i=0; i<16; i++){
  760. int nnz = nnzc[ scan8[i] ];
  761. if(nnz){
  762. if(nnz==1 && ((int16_t*)block)[i*16])
  763. ff_h264_idct_dc_add_8_mmi(dst + block_offset[i], block + i*16,
  764. stride);
  765. else
  766. ff_h264_idct_add_8_mmi(dst + block_offset[i], block + i*16,
  767. stride);
  768. }
  769. }
  770. }
  771. void ff_h264_idct_add16intra_8_mmi(uint8_t *dst, const int *block_offset,
  772. int16_t *block, int stride, const uint8_t nnzc[15*8])
  773. {
  774. int i;
  775. for(i=0; i<16; i++){
  776. if(nnzc[ scan8[i] ])
  777. ff_h264_idct_add_8_mmi(dst + block_offset[i], block + i*16, stride);
  778. else if(((int16_t*)block)[i*16])
  779. ff_h264_idct_dc_add_8_mmi(dst + block_offset[i], block + i*16,
  780. stride);
  781. }
  782. }
  783. void ff_h264_idct8_add4_8_mmi(uint8_t *dst, const int *block_offset,
  784. int16_t *block, int stride, const uint8_t nnzc[15*8])
  785. {
  786. int i;
  787. for(i=0; i<16; i+=4){
  788. int nnz = nnzc[ scan8[i] ];
  789. if(nnz){
  790. if(nnz==1 && ((int16_t*)block)[i*16])
  791. ff_h264_idct8_dc_add_8_mmi(dst + block_offset[i],
  792. block + i*16, stride);
  793. else
  794. ff_h264_idct8_add_8_mmi(dst + block_offset[i], block + i*16,
  795. stride);
  796. }
  797. }
  798. }
  799. void ff_h264_idct_add8_8_mmi(uint8_t **dest, const int *block_offset,
  800. int16_t *block, int stride, const uint8_t nnzc[15*8])
  801. {
  802. int i, j;
  803. for(j=1; j<3; j++){
  804. for(i=j*16; i<j*16+4; i++){
  805. if(nnzc[ scan8[i] ])
  806. ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i],
  807. block + i*16, stride);
  808. else if(((int16_t*)block)[i*16])
  809. ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i],
  810. block + i*16, stride);
  811. }
  812. }
  813. }
  814. void ff_h264_idct_add8_422_8_mmi(uint8_t **dest, const int *block_offset,
  815. int16_t *block, int stride, const uint8_t nnzc[15*8])
  816. {
  817. int i, j;
  818. for(j=1; j<3; j++){
  819. for(i=j*16; i<j*16+4; i++){
  820. if(nnzc[ scan8[i] ])
  821. ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i],
  822. block + i*16, stride);
  823. else if(((int16_t*)block)[i*16])
  824. ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i],
  825. block + i*16, stride);
  826. }
  827. }
  828. for(j=1; j<3; j++){
  829. for(i=j*16+4; i<j*16+8; i++){
  830. if(nnzc[ scan8[i+4] ])
  831. ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i+4],
  832. block + i*16, stride);
  833. else if(((int16_t*)block)[i*16])
  834. ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i+4],
  835. block + i*16, stride);
  836. }
  837. }
  838. }
  /*
   * Inverse-transform and dequantize the 16 luma DC coefficients.
   *
   * Four 64-bit rows (16 int16_t) are loaded from input, run through an
   * add/subtract butterfly pass, transposed, and run through a second
   * butterfly pass.  Every result is then scaled by qmul with rounding and
   * written as a halfword to output at byte offsets 0x00, 0x20, ..., 0x1e0,
   * i.e. one value every 16 int16_t.
   *
   * Two scaling paths exist:
   *  - qmul <= 0x7fff: (x * qmul + 128) >> 8
   *  - qmul >  0x7fff (label 1): qmul is pre-shifted right by 7 and the
   *    final shift is 1, see the notes at label 1.
   *
   * The asm runs under ".set noreorder": the instruction after a branch or
   * jump is its delay slot and executes whether or not the branch is taken.
   * input and qmul are "+&r" operands; once the rows are loaded the input
   * register is reused as integer scratch, and qmul is modified in place.
   */
  839. void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input,
  840. int qmul)
  841. {
  842. double ftmp[10];
  843. uint64_t tmp[2];
  844. DECLARE_VAR_ALL64;
  845. __asm__ volatile (
  846. ".set noreorder \n\t"
  /* ftmp8 = 8 (shift for the small-qmul path), ftmp9 = 32 (used to bring
   * the upper 32 bits of a result down); interleaved with the row loads */
  847. "dli %[tmp0], 0x08 \n\t"
  848. MMI_LDC1(%[ftmp3], %[input], 0x18)
  849. "mtc1 %[tmp0], %[ftmp8] \n\t"
  850. MMI_LDC1(%[ftmp2], %[input], 0x10)
  851. "dli %[tmp0], 0x20 \n\t"
  852. MMI_LDC1(%[ftmp1], %[input], 0x08)
  853. "mtc1 %[tmp0], %[ftmp9] \n\t"
  854. MMI_LDC1(%[ftmp0], %[input], 0x00)
  /* first butterfly pass across the four rows (ftmp4 is the temporary) */
  855. "mov.d %[ftmp4], %[ftmp3] \n\t"
  856. "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
  857. "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  858. "mov.d %[ftmp4], %[ftmp1] \n\t"
  859. "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  860. "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  861. "mov.d %[ftmp4], %[ftmp3] \n\t"
  862. "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
  863. "psubh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
  864. "mov.d %[ftmp4], %[ftmp2] \n\t"
  865. "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  866. "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  /* 4x4 transpose of the halfword matrix via halfword/word interleaves */
  867. "mov.d %[ftmp4], %[ftmp3] \n\t"
  868. "punpcklhw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
  869. "punpckhhw %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
  870. "punpckhhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
  871. "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  872. "punpckhwd %[ftmp2], %[ftmp3], %[ftmp0] \n\t"
  873. "punpcklwd %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  874. "mov.d %[ftmp0], %[ftmp4] \n\t"
  875. "punpcklwd %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
  876. "punpckhwd %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  /* second butterfly pass (ftmp1 is the temporary) */
  877. "mov.d %[ftmp1], %[ftmp0] \n\t"
  878. "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  879. "psubh %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
  880. "mov.d %[ftmp1], %[ftmp2] \n\t"
  881. "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  882. "psubh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
  883. "mov.d %[ftmp1], %[ftmp0] \n\t"
  884. "paddh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  885. "psubh %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
  886. "mov.d %[ftmp1], %[ftmp4] \n\t"
  /* tmp0 = qmul - 0x7fff; branch to 1: when qmul > 0x7fff.  The psubh
   * after bgtz sits in the delay slot and finishes the butterfly on both
   * paths. */
  887. "daddi %[tmp0], %[qmul], -0x7fff \n\t"
  888. "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
  889. "bgtz %[tmp0], 1f \n\t"
  890. "psubh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
  /* ---- small qmul path ----
   * qmul += 128 << 16, so the register holds the halfword pair
   * (qmul, 128).  Each value is interleaved with a halfword of ff_pw_1
   * (presumably four 16-bit ones -- defined outside this chunk), so
   * pmaddhw yields x * qmul + 128 per 32-bit lane; psraw by ftmp8 (8)
   * finishes the scaling and packsswh narrows back to int16 with signed
   * saturation. */
  891. "ori %[tmp0], $0, 0x80 \n\t"
  892. "dsll %[tmp0], %[tmp0], 0x10 \n\t"
  893. "punpckhhw %[ftmp1], %[ftmp0], %[ff_pw_1] \n\t"
  894. "daddu %[qmul], %[qmul], %[tmp0] \n\t"
  895. "punpcklhw %[ftmp0], %[ftmp0], %[ff_pw_1] \n\t"
  896. "punpckhhw %[ftmp5], %[ftmp2], %[ff_pw_1] \n\t"
  897. "punpcklhw %[ftmp2], %[ftmp2], %[ff_pw_1] \n\t"
  /* ftmp7 = the 32-bit (qmul, 128) pair duplicated into both words */
  898. "mtc1 %[qmul], %[ftmp7] \n\t"
  899. "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  900. "pmaddhw %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
  901. "pmaddhw %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
  902. "pmaddhw %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
  903. "pmaddhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  904. "psraw %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
  905. "psraw %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
  906. "psraw %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
  907. "psraw %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
  908. "packsswh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  909. "packsswh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
  /* scatter the four halfwords of ftmp0: lanes 0/1 via tmp1, lanes 2/3
   * via the upper 32 bits moved into the input register */
  910. "dmfc1 %[tmp1], %[ftmp0] \n\t"
  911. "dsrl %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
  912. "mfc1 %[input], %[ftmp0] \n\t"
  913. "sh %[tmp1], 0x00(%[output]) \n\t"
  914. "sh %[input], 0x80(%[output]) \n\t"
  915. "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
  916. PTR_SRL "%[input], %[input], 0x10 \n\t"
  917. "sh %[tmp1], 0x20(%[output]) \n\t"
  918. "sh %[input], 0xa0(%[output]) \n\t"
  /* same for ftmp2 */
  919. "dmfc1 %[tmp1], %[ftmp2] \n\t"
  920. "dsrl %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
  921. "mfc1 %[input], %[ftmp2] \n\t"
  922. "sh %[tmp1], 0x40(%[output]) \n\t"
  923. "sh %[input], 0xc0(%[output]) \n\t"
  924. "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
  925. PTR_SRL "%[input], %[input], 0x10 \n\t"
  926. "sh %[tmp1], 0x60(%[output]) \n\t"
  927. "sh %[input], 0xe0(%[output]) \n\t"
  /* second half: scale ftmp3 / ftmp4 the same way */
  928. "punpckhhw %[ftmp1], %[ftmp3], %[ff_pw_1] \n\t"
  929. "punpcklhw %[ftmp3], %[ftmp3], %[ff_pw_1] \n\t"
  930. "punpckhhw %[ftmp5], %[ftmp4], %[ff_pw_1] \n\t"
  931. "punpcklhw %[ftmp4], %[ftmp4], %[ff_pw_1] \n\t"
  932. "mtc1 %[qmul], %[ftmp7] \n\t"
  933. "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  934. "pmaddhw %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
  935. "pmaddhw %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
  936. "pmaddhw %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
  937. "pmaddhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  938. "psraw %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
  939. "psraw %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
  940. "psraw %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
  941. "psraw %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
  942. "packsswh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
  943. "packsswh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  944. "dmfc1 %[tmp1], %[ftmp3] \n\t"
  945. "dsrl %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
  946. "mfc1 %[input], %[ftmp3] \n\t"
  947. "sh %[tmp1], 0x100(%[output]) \n\t"
  948. "sh %[input], 0x180(%[output]) \n\t"
  949. "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
  950. PTR_SRL "%[input], %[input], 0x10 \n\t"
  951. "sh %[tmp1], 0x120(%[output]) \n\t"
  952. "sh %[input], 0x1a0(%[output]) \n\t"
  953. "dmfc1 %[tmp1], %[ftmp4] \n\t"
  954. "dsrl %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
  955. "mfc1 %[input], %[ftmp4] \n\t"
  956. "sh %[tmp1], 0x140(%[output]) \n\t"
  957. "sh %[input], 0x1c0(%[output]) \n\t"
  958. "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
  959. PTR_SRL "%[input], %[input], 0x10 \n\t"
  960. "sh %[tmp1], 0x160(%[output]) \n\t"
  /* jump to the exit; the final store sits in the delay slot of the jump */
  961. "j 2f \n\t"
  962. "sh %[input], 0x1e0(%[output]) \n\t"
  /* ---- large qmul path (qmul > 0x7fff) ----
   * tmp1 = 31 - clz(qmul), then tmp0 = tmp1 - 7 and movn replaces tmp1
   * with 7 whenever tmp0 is non-zero.  tmp1 is therefore 7 after the movn
   * in every case, so qmul (with 128 << 16 added) is shifted right by 7
   * and the final arithmetic shift count placed in ftmp6 is 8 - 7 = 1.
   *
   * NOTE(review): when HAVE_LOONGSON3 is 0 no instruction writes tmp1
   * before it is read (the HAVE_LOONGSON2 branch is empty).  Because of
   * the movn above the computed shift does not depend on that value, but
   * the register is still read uninitialized -- confirm this is intended.
   */
  963. "1: \n\t"
  964. "ori %[tmp0], $0, 0x1f \n\t"
  965. #if HAVE_LOONGSON3
  966. "clz %[tmp1], %[qmul] \n\t"
  967. #elif HAVE_LOONGSON2
  968. #endif
  969. "ori %[input], $0, 0x07 \n\t"
  970. "dsubu %[tmp1], %[tmp0], %[tmp1] \n\t"
  971. "ori %[tmp0], $0, 0x80 \n\t"
  972. "dsll %[tmp0], %[tmp0], 0x10 \n\t"
  973. "daddu %[qmul], %[qmul], %[tmp0] \n\t"
  974. "dsubu %[tmp0], %[tmp1], %[input] \n\t"
  975. "movn %[tmp1], %[input], %[tmp0] \n\t"
  976. PTR_ADDIU "%[input], %[input], 0x01 \n\t"
  977. "andi %[tmp0], %[tmp1], 0xff \n\t"
  978. "srlv %[qmul], %[qmul], %[tmp0] \n\t"
  979. PTR_SUBU "%[input], %[input], %[tmp1] \n\t"
  /* ftmp6 = final shift count for this path */
  980. "mtc1 %[input], %[ftmp6] \n\t"
  /* scale ftmp0 / ftmp2 with the pre-shifted multiplier */
  981. "punpckhhw %[ftmp1], %[ftmp0], %[ff_pw_1] \n\t"
  982. "punpcklhw %[ftmp0], %[ftmp0], %[ff_pw_1] \n\t"
  983. "punpckhhw %[ftmp5], %[ftmp2], %[ff_pw_1] \n\t"
  984. "punpcklhw %[ftmp2], %[ftmp2], %[ff_pw_1] \n\t"
  985. "mtc1 %[qmul], %[ftmp7] \n\t"
  986. "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  987. "pmaddhw %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
  988. "pmaddhw %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
  989. "pmaddhw %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
  990. "pmaddhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  991. "psraw %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
  992. "psraw %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
  993. "psraw %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
  994. "psraw %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  995. "packsswh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  996. "packsswh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
  /* scatter ftmp0 / ftmp2 to the same offsets as the small-qmul path
   * (the instruction order differs, the destinations do not) */
  997. "dmfc1 %[tmp1], %[ftmp0] \n\t"
  998. "dsrl %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
  999. "sh %[tmp1], 0x00(%[output]) \n\t"
  1000. "mfc1 %[input], %[ftmp0] \n\t"
  1001. "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
  1002. "sh %[input], 0x80(%[output]) \n\t"
  1003. "sh %[tmp1], 0x20(%[output]) \n\t"
  1004. PTR_SRL "%[input], %[input], 0x10 \n\t"
  1005. "dmfc1 %[tmp1], %[ftmp2] \n\t"
  1006. "sh %[input], 0xa0(%[output]) \n\t"
  1007. "dsrl %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
  1008. "sh %[tmp1], 0x40(%[output]) \n\t"
  1009. "mfc1 %[input], %[ftmp2] \n\t"
  1010. "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
  1011. "sh %[input], 0xc0(%[output]) \n\t"
  1012. "sh %[tmp1], 0x60(%[output]) \n\t"
  1013. PTR_SRL "%[input], %[input], 0x10 \n\t"
  1014. "sh %[input], 0xe0(%[output]) \n\t"
  /* second half: ftmp3 / ftmp4 */
  1015. "punpckhhw %[ftmp1], %[ftmp3], %[ff_pw_1] \n\t"
  1016. "punpcklhw %[ftmp3], %[ftmp3], %[ff_pw_1] \n\t"
  1017. "punpckhhw %[ftmp5], %[ftmp4], %[ff_pw_1] \n\t"
  1018. "punpcklhw %[ftmp4], %[ftmp4], %[ff_pw_1] \n\t"
  1019. "mtc1 %[qmul], %[ftmp7] \n\t"
  1020. "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  1021. "pmaddhw %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
  1022. "pmaddhw %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
  1023. "pmaddhw %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
  1024. "pmaddhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1025. "psraw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
  1026. "psraw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  1027. "psraw %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
  1028. "psraw %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  1029. "packsswh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
  1030. "packsswh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  1031. "dmfc1 %[tmp1], %[ftmp3] \n\t"
  1032. "dsrl %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
  1033. "mfc1 %[input], %[ftmp3] \n\t"
  1034. "sh %[tmp1], 0x100(%[output]) \n\t"
  1035. "sh %[input], 0x180(%[output]) \n\t"
  1036. "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
  1037. PTR_SRL "%[input], %[input], 0x10 \n\t"
  1038. "sh %[tmp1], 0x120(%[output]) \n\t"
  1039. "sh %[input], 0x1a0(%[output]) \n\t"
  1040. "dmfc1 %[tmp1], %[ftmp4] \n\t"
  1041. "dsrl %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
  1042. "mfc1 %[input], %[ftmp4] \n\t"
  1043. "sh %[tmp1], 0x140(%[output]) \n\t"
  1044. "sh %[input], 0x1c0(%[output]) \n\t"
  1045. "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
  1046. PTR_SRL "%[input], %[input], 0x10 \n\t"
  1047. "sh %[tmp1], 0x160(%[output]) \n\t"
  1048. "sh %[input], 0x1e0(%[output]) \n\t"
  /* common exit */
  1049. "2: \n\t"
  1050. ".set reorder \n\t"
  /* outputs: FP and integer scratch, plus the three read-write arguments */
  1051. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  1052. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  1053. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  1054. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  1055. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  1056. [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  1057. RESTRICT_ASM_ALL64
  1058. [output]"+&r"(output), [input]"+&r"(input),
  1059. [qmul]"+&r"(qmul)
  1060. : [ff_pw_1]"f"(ff_pw_1)
  1061. : "memory"
  1062. );
  1063. }
  /**
   * Inverse-transform and dequantize the eight 4:2:2 chroma DC coefficients
   * in place.
   *
   * The coefficients live at block[0], block[16], ..., block[112] and are
   * treated as four rows of two values (row r: block[32*r], block[32*r+16]).
   * Each row is reduced to a sum and a difference, the four sums and the four
   * differences are each run through a 4-point add/subtract butterfly, and
   * every result is scaled as (x * qmul + 128) >> 8.
   *
   * All intermediate arithmetic is done in unsigned int so that a large
   * qmul wraps modulo 2^32 instead of triggering signed-overflow undefined
   * behaviour; for inputs that do not overflow the results are unchanged.
   *
   * @param block coefficient buffer, read and written at the offsets above
   * @param qmul  dequantization multiplier
   */
  void ff_h264_chroma422_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
  {
      const unsigned scale = (unsigned)qmul;
      unsigned pair[8];
      int row, col;

      /* horizontal stage: pair[2*r] = sum, pair[2*r+1] = difference */
      for (row = 0; row < 4; row++) {
          const unsigned left  = (unsigned)block[32 * row];
          const unsigned right = (unsigned)block[32 * row + 16];

          pair[2 * row]     = left + right;
          pair[2 * row + 1] = left - right;
      }

      /* vertical stage: column 0 consumes the sums and writes block[0],
       * [32], [64], [96]; column 1 consumes the differences and writes
       * block[16], [48], [80], [112] */
      for (col = 0; col < 2; col++) {
          const int base = 16 * col;
          const unsigned z0 = pair[col]     + pair[4 + col];
          const unsigned z1 = pair[col]     - pair[4 + col];
          const unsigned z2 = pair[2 + col] - pair[6 + col];
          const unsigned z3 = pair[2 + col] + pair[6 + col];

          block[base]      = (int)((z0 + z3) * scale + 128) >> 8;
          block[base + 32] = (int)((z1 + z2) * scale + 128) >> 8;
          block[base + 64] = (int)((z1 - z2) * scale + 128) >> 8;
          block[base + 96] = (int)((z0 - z3) * scale + 128) >> 8;
      }
  }
  /**
   * Inverse-transform and dequantize the four chroma DC coefficients in
   * place.
   *
   * The coefficients are read from block[0], block[16], block[32] and
   * block[48], combined with a 2x2 add/subtract butterfly, multiplied by
   * qmul and shifted right by 7, then written back to the same positions.
   *
   * The arithmetic is carried out in unsigned int so that a large qmul wraps
   * modulo 2^32 instead of triggering signed-overflow undefined behaviour;
   * for inputs that do not overflow the results are unchanged.
   *
   * @param block coefficient buffer, read and written at the offsets above
   * @param qmul  dequantization multiplier
   */
  void ff_h264_chroma_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
  {
      const unsigned scale = (unsigned)qmul;
      const unsigned c00 = (unsigned)block[0];
      const unsigned c01 = (unsigned)block[16];
      const unsigned c10 = (unsigned)block[32];
      const unsigned c11 = (unsigned)block[48];

      /* row-wise sums and differences */
      const unsigned top_sum  = c00 + c01;
      const unsigned top_diff = c00 - c01;
      const unsigned bot_sum  = c10 + c11;
      const unsigned bot_diff = c10 - c11;

      block[0]  = (int)((top_sum  + bot_sum)  * scale) >> 7;
      block[16] = (int)((top_diff + bot_diff) * scale) >> 7;
      block[32] = (int)((top_sum  - bot_sum)  * scale) >> 7;
      block[48] = (int)((top_diff - bot_diff) * scale) >> 7;
  }
  1105. void ff_h264_weight_pixels16_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
  1106. int log2_denom, int weight, int offset)
  1107. {
  1108. int y;
  1109. double ftmp[8];
  1110. DECLARE_VAR_ALL64;
  1111. offset <<= log2_denom;
  1112. if (log2_denom)
  1113. offset += 1 << (log2_denom - 1);
  1114. for (y=0; y<height; y++, block+=stride) {
  1115. __asm__ volatile (
  1116. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  1117. MMI_LDC1(%[ftmp1], %[block0], 0x00)
  1118. MMI_LDC1(%[ftmp2], %[block1], 0x00)
  1119. "mtc1 %[weight], %[ftmp3] \n\t"
  1120. "mtc1 %[offset], %[ftmp4] \n\t"
  1121. "mtc1 %[log2_denom], %[ftmp5] \n\t"
  1122. "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  1123. "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  1124. "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
  1125. "punpckhbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
  1126. "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  1127. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  1128. "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
  1129. "pmullh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
  1130. "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  1131. "pmullh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  1132. "paddsh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
  1133. "paddsh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
  1134. "paddsh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
  1135. "paddsh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  1136. "psrah %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
  1137. "psrah %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  1138. "psrah %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  1139. "psrah %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
  1140. "packushb %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
  1141. "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
  1142. MMI_SDC1(%[ftmp1], %[block0], 0x00)
  1143. MMI_SDC1(%[ftmp2], %[block1], 0x00)
  1144. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  1145. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  1146. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  1147. [ftmp6]"=&f"(ftmp[6]),
  1148. RESTRICT_ASM_ALL64
  1149. [ftmp7]"=&f"(ftmp[7])
  1150. : [block0]"r"(block), [block1]"r"(block+8),
  1151. [weight]"r"(weight), [offset]"r"(offset),
  1152. [log2_denom]"r"(log2_denom)
  1153. : "memory"
  1154. );
  1155. }
  1156. }
  1157. void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src,
  1158. ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
  1159. int offset)
  1160. {
  1161. int y;
  1162. double ftmp[9];
  1163. DECLARE_VAR_ALL64;
  1164. offset = ((offset + 1) | 1) << log2_denom;
  1165. for (y=0; y<height; y++, dst+=stride, src+=stride) {
  1166. __asm__ volatile (
  1167. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  1168. MMI_LDC1(%[ftmp1], %[src0], 0x00)
  1169. MMI_LDC1(%[ftmp2], %[dst0], 0x00)
  1170. "mtc1 %[weights], %[ftmp3] \n\t"
  1171. "mtc1 %[weightd], %[ftmp4] \n\t"
  1172. "mtc1 %[offset], %[ftmp5] \n\t"
  1173. "mtc1 %[log2_denom], %[ftmp6] \n\t"
  1174. "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  1175. "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  1176. "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  1177. "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t"
  1178. "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
  1179. "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  1180. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  1181. "pmullh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
  1182. "pmullh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
  1183. "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  1184. "pmullh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  1185. "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  1186. "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  1187. "paddsh %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
  1188. "paddsh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  1189. "psrah %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
  1190. "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
  1191. "packushb %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
  1192. MMI_SDC1(%[ftmp1], %[dst0], 0x00)
  1193. MMI_LDC1(%[ftmp1], %[src1], 0x00)
  1194. MMI_LDC1(%[ftmp2], %[dst1], 0x00)
  1195. "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t"
  1196. "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
  1197. "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  1198. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  1199. "pmullh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
  1200. "pmullh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
  1201. "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  1202. "pmullh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  1203. "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  1204. "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  1205. "paddsh %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
  1206. "paddsh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  1207. "psrah %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
  1208. "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
  1209. "packushb %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
  1210. MMI_SDC1(%[ftmp1], %[dst1], 0x00)
  1211. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  1212. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  1213. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  1214. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  1215. RESTRICT_ASM_ALL64
  1216. [ftmp8]"=&f"(ftmp[8])
  1217. : [dst0]"r"(dst), [dst1]"r"(dst+8),
  1218. [src0]"r"(src), [src1]"r"(src+8),
  1219. [weights]"r"(weights), [weightd]"r"(weightd),
  1220. [offset]"r"(offset), [log2_denom]"r"(log2_denom+1)
  1221. : "memory"
  1222. );
  1223. }
  1224. }
  1225. void ff_h264_weight_pixels8_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
  1226. int log2_denom, int weight, int offset)
  1227. {
  1228. int y;
  1229. double ftmp[6];
  1230. DECLARE_VAR_ALL64;
  1231. offset <<= log2_denom;
  1232. if (log2_denom)
  1233. offset += 1 << (log2_denom - 1);
  1234. for (y=0; y<height; y++, block+=stride) {
  1235. __asm__ volatile (
  1236. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  1237. MMI_LDC1(%[ftmp1], %[block], 0x00)
  1238. "mtc1 %[weight], %[ftmp2] \n\t"
  1239. "mtc1 %[offset], %[ftmp3] \n\t"
  1240. "mtc1 %[log2_denom], %[ftmp5] \n\t"
  1241. "pshufh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  1242. "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  1243. "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
  1244. "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  1245. "pmullh %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
  1246. "pmullh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  1247. "paddsh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
  1248. "paddsh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  1249. "psrah %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  1250. "psrah %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  1251. "packushb %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
  1252. MMI_SDC1(%[ftmp1], %[block], 0x00)
  1253. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  1254. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  1255. [ftmp4]"=&f"(ftmp[4]),
  1256. RESTRICT_ASM_ALL64
  1257. [ftmp5]"=&f"(ftmp[5])
  1258. : [block]"r"(block), [weight]"r"(weight),
  1259. [offset]"r"(offset), [log2_denom]"r"(log2_denom)
  1260. : "memory"
  1261. );
  1262. }
  1263. }
  1264. void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src,
  1265. ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
  1266. int offset)
  1267. {
  1268. int y;
  1269. double ftmp[9];
  1270. DECLARE_VAR_ALL64;
  1271. offset = ((offset + 1) | 1) << log2_denom;
  1272. for (y=0; y<height; y++, dst+=stride, src+=stride) {
  1273. __asm__ volatile (
  1274. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  1275. MMI_LDC1(%[ftmp1], %[src], 0x00)
  1276. MMI_LDC1(%[ftmp2], %[dst], 0x00)
  1277. "mtc1 %[weights], %[ftmp3] \n\t"
  1278. "mtc1 %[weightd], %[ftmp4] \n\t"
  1279. "mtc1 %[offset], %[ftmp5] \n\t"
  1280. "mtc1 %[log2_denom], %[ftmp6] \n\t"
  1281. "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  1282. "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  1283. "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  1284. "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t"
  1285. "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
  1286. "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  1287. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  1288. "pmullh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
  1289. "pmullh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
  1290. "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  1291. "pmullh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  1292. "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  1293. "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  1294. "paddsh %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
  1295. "paddsh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  1296. "psrah %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
  1297. "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
  1298. "packushb %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
  1299. MMI_SDC1(%[ftmp1], %[dst], 0x00)
  1300. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  1301. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  1302. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  1303. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  1304. RESTRICT_ASM_ALL64
  1305. [ftmp8]"=&f"(ftmp[8])
  1306. : [dst]"r"(dst), [src]"r"(src),
  1307. [weights]"r"(weights), [weightd]"r"(weightd),
  1308. [offset]"r"(offset), [log2_denom]"r"(log2_denom+1)
  1309. : "memory"
  1310. );
  1311. }
  1312. }
  1313. void ff_h264_weight_pixels4_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
  1314. int log2_denom, int weight, int offset)
  1315. {
  1316. int y;
  1317. double ftmp[5];
  1318. DECLARE_VAR_LOW32;
  1319. offset <<= log2_denom;
  1320. if (log2_denom)
  1321. offset += 1 << (log2_denom - 1);
  1322. for (y=0; y<height; y++, block+=stride) {
  1323. __asm__ volatile (
  1324. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  1325. MMI_ULWC1(%[ftmp1], %[block], 0x00)
  1326. "mtc1 %[weight], %[ftmp2] \n\t"
  1327. "mtc1 %[offset], %[ftmp3] \n\t"
  1328. "mtc1 %[log2_denom], %[ftmp4] \n\t"
  1329. "pshufh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  1330. "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  1331. "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  1332. "pmullh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  1333. "paddsh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  1334. "psrah %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
  1335. "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  1336. MMI_SWC1(%[ftmp1], %[block], 0x00)
  1337. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  1338. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  1339. RESTRICT_ASM_LOW32
  1340. [ftmp4]"=&f"(ftmp[4])
  1341. : [block]"r"(block), [weight]"r"(weight),
  1342. [offset]"r"(offset), [log2_denom]"r"(log2_denom)
  1343. : "memory"
  1344. );
  1345. }
  1346. }
  1347. void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src,
  1348. ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
  1349. int offset)
  1350. {
  1351. int y;
  1352. double ftmp[7];
  1353. DECLARE_VAR_LOW32;
  1354. offset = ((offset + 1) | 1) << log2_denom;
  1355. for (y=0; y<height; y++, dst+=stride, src+=stride) {
  1356. __asm__ volatile (
  1357. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  1358. MMI_ULWC1(%[ftmp1], %[src], 0x00)
  1359. MMI_ULWC1(%[ftmp2], %[dst], 0x00)
  1360. "mtc1 %[weight], %[ftmp3] \n\t"
  1361. "mtc1 %[weightd], %[ftmp4] \n\t"
  1362. "mtc1 %[offset], %[ftmp5] \n\t"
  1363. "mtc1 %[log2_denom], %[ftmp6] \n\t"
  1364. "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  1365. "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  1366. "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  1367. "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  1368. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  1369. "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  1370. "pmullh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  1371. "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  1372. "paddsh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  1373. "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
  1374. "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  1375. MMI_SWC1(%[ftmp1], %[dst], 0x00)
  1376. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  1377. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  1378. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  1379. RESTRICT_ASM_LOW32
  1380. [ftmp6]"=&f"(ftmp[6])
  1381. : [dst]"r"(dst), [src]"r"(src),
  1382. [weight]"r"(weights), [weightd]"r"(weightd),
  1383. [offset]"r"(offset), [log2_denom]"r"(log2_denom+1)
  1384. : "memory"
  1385. );
  1386. }
  1387. }
  1388. void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
  1389. int8_t *tc0)
  1390. {
  1391. double ftmp[12];
  1392. mips_reg addr[2];
  1393. DECLARE_VAR_LOW32;
  1394. DECLARE_VAR_ALL64;
  1395. DECLARE_VAR_ADDRT;
  1396. __asm__ volatile (
  1397. PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
  1398. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  1399. PTR_ADDU "%[addr1], %[stride], %[addr0] \n\t"
  1400. "addi %[alpha], %[alpha], -0x01 \n\t"
  1401. PTR_SUBU "%[addr1], $0, %[addr1] \n\t"
  1402. "addi %[beta], %[beta], -0x01 \n\t"
  1403. PTR_ADDU "%[addr1], %[addr1], %[pix] \n\t"
  1404. MMI_LDC1(%[ftmp3], %[pix], 0x00)
  1405. MMI_LDXC1(%[ftmp1], %[addr1], %[stride], 0x00)
  1406. MMI_LDXC1(%[ftmp2], %[addr1], %[addr0], 0x00)
  1407. MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
  1408. "mtc1 %[alpha], %[ftmp5] \n\t"
  1409. "mtc1 %[beta], %[ftmp6] \n\t"
  1410. "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  1411. "pshufh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  1412. "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  1413. "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  1414. "psubusb %[ftmp7], %[ftmp3], %[ftmp2] \n\t"
  1415. "psubusb %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
  1416. "or %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  1417. "psubusb %[ftmp7], %[ftmp2], %[ftmp1] \n\t"
  1418. "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
  1419. "psubusb %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
  1420. "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1421. "psubusb %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
  1422. "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  1423. "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
  1424. "psubusb %[ftmp5], %[ftmp4], %[ftmp3] \n\t"
  1425. "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1426. "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  1427. "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
  1428. "pcmpeqb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
  1429. "pcmpeqb %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
  1430. MMI_ULWC1(%[ftmp5], %[tc0], 0x00)
  1431. "punpcklbh %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  1432. "punpcklbh %[ftmp9], %[ftmp5], %[ftmp5] \n\t"
  1433. "pcmpgtb %[ftmp5], %[ftmp9], %[ftmp4] \n\t"
  1434. MMI_LDC1(%[ftmp4], %[addr1], 0x00)
  1435. "and %[ftmp10], %[ftmp5], %[ftmp8] \n\t"
  1436. "psubusb %[ftmp8], %[ftmp4], %[ftmp2] \n\t"
  1437. "psubusb %[ftmp7], %[ftmp2], %[ftmp4] \n\t"
  1438. "psubusb %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
  1439. "psubusb %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
  1440. "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
  1441. "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
  1442. "and %[ftmp5], %[ftmp10], %[ftmp9] \n\t"
  1443. "psubb %[ftmp8], %[ftmp5], %[ftmp7] \n\t"
  1444. "and %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  1445. "pavgb %[ftmp5], %[ftmp2], %[ftmp3] \n\t"
  1446. MMI_LDC1(%[ftmp11], %[addr1], 0x00)
  1447. "pavgb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  1448. "xor %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
  1449. "and %[ftmp5], %[ftmp5], %[ff_pb_1] \n\t"
  1450. "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  1451. "psubusb %[ftmp5], %[ftmp1], %[ftmp7] \n\t"
  1452. "paddusb %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
  1453. "pmaxub %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  1454. "pminub %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
  1455. MMI_SDXC1(%[ftmp4], %[addr1], %[stride], 0x00)
  1456. MMI_LDXC1(%[ftmp5], %[pix], %[addr0], 0x00)
  1457. "psubusb %[ftmp4], %[ftmp5], %[ftmp3] \n\t"
  1458. "psubusb %[ftmp7], %[ftmp3], %[ftmp5] \n\t"
  1459. "psubusb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  1460. "psubusb %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
  1461. "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
  1462. "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
  1463. "psubb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  1464. "and %[ftmp6], %[ftmp9], %[ftmp7] \n\t"
  1465. MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
  1466. "pavgb %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
  1467. MMI_LDXC1(%[ftmp11], %[pix], %[addr0], 0x00)
  1468. "pavgb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1469. "xor %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
  1470. "and %[ftmp7], %[ftmp7], %[ff_pb_1] \n\t"
  1471. "psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1472. "psubusb %[ftmp7], %[ftmp4], %[ftmp6] \n\t"
  1473. "paddusb %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
  1474. "pmaxub %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1475. "pminub %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  1476. MMI_SDXC1(%[ftmp5], %[pix], %[stride], 0x00)
  1477. "xor %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
  1478. "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  1479. "and %[ftmp6], %[ftmp6], %[ff_pb_1] \n\t"
  1480. "xor %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  1481. "xor %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
  1482. "pavgb %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
  1483. "pavgb %[ftmp4], %[ftmp4], %[ff_pb_3] \n\t"
  1484. "pavgb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
  1485. "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  1486. "paddusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  1487. "psubusb %[ftmp7], %[ff_pb_A1], %[ftmp4] \n\t"
  1488. "psubusb %[ftmp4], %[ftmp4], %[ff_pb_A1] \n\t"
  1489. "pminub %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
  1490. "pminub %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
  1491. "psubusb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
  1492. "psubusb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
  1493. "paddusb %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  1494. "paddusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
  1495. MMI_SDXC1(%[ftmp2], %[addr1], %[addr0], 0x00)
  1496. MMI_SDC1(%[ftmp3], %[pix], 0x00)
  1497. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  1498. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  1499. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  1500. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  1501. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  1502. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  1503. RESTRICT_ASM_LOW32
  1504. RESTRICT_ASM_ALL64
  1505. RESTRICT_ASM_ADDRT
  1506. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
  1507. : [pix]"r"(pix), [stride]"r"((mips_reg)stride),
  1508. [alpha]"r"((mips_reg)alpha), [beta]"r"((mips_reg)beta),
  1509. [tc0]"r"(tc0), [ff_pb_1]"f"(ff_pb_1),
  1510. [ff_pb_3]"f"(ff_pb_3), [ff_pb_A1]"f"(ff_pb_A1)
  1511. : "memory"
  1512. );
  1513. }
  1514. static void deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
  1515. int beta)
  1516. {
  1517. DECLARE_ALIGNED(8, const uint64_t, stack[0x0a]);
  1518. double ftmp[16];
  1519. uint64_t tmp[1];
  1520. mips_reg addr[3];
  1521. DECLARE_VAR_ALL64;
  1522. DECLARE_VAR_ADDRT;
  1523. __asm__ volatile (
  1524. "ori %[tmp0], $0, 0x01 \n\t"
  1525. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  1526. "mtc1 %[tmp0], %[ftmp9] \n\t"
  1527. PTR_SLL "%[addr0], %[stride], 0x02 \n\t"
  1528. PTR_ADDU "%[addr2], %[stride], %[stride] \n\t"
  1529. PTR_ADDIU "%[alpha], %[alpha], -0x01 \n\t"
  1530. PTR_SLL "%[ftmp11], %[ftmp9], %[ftmp9] \n\t"
  1531. "bltz %[alpha], 1f \n\t"
  1532. PTR_ADDU "%[addr1], %[addr2], %[stride] \n\t"
  1533. PTR_ADDIU "%[beta], %[beta], -0x01 \n\t"
  1534. "bltz %[beta], 1f \n\t"
  1535. PTR_SUBU "%[addr0], $0, %[addr0] \n\t"
  1536. PTR_ADDU "%[addr0], %[addr0], %[pix] \n\t"
  1537. MMI_LDC1(%[ftmp3], %[pix], 0x00)
  1538. MMI_LDXC1(%[ftmp1], %[addr0], %[addr2], 0x00)
  1539. MMI_LDXC1(%[ftmp2], %[addr0], %[addr1], 0x00)
  1540. MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
  1541. "mtc1 %[alpha], %[ftmp5] \n\t"
  1542. "mtc1 %[beta], %[ftmp6] \n\t"
  1543. "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  1544. "pshufh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  1545. "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  1546. "psubusb %[ftmp7], %[ftmp3], %[ftmp2] \n\t"
  1547. "psubusb %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
  1548. "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  1549. "or %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  1550. MMI_SDC1(%[ftmp5], %[stack], 0x10)
  1551. "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
  1552. "psubusb %[ftmp7], %[ftmp2], %[ftmp1] \n\t"
  1553. "psubusb %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
  1554. "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1555. "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  1556. "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
  1557. "psubusb %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
  1558. "psubusb %[ftmp5], %[ftmp4], %[ftmp3] \n\t"
  1559. "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1560. "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  1561. "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
  1562. "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  1563. MMI_LDC1(%[ftmp5], %[stack], 0x10)
  1564. "pcmpeqb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  1565. "ldc1 %[ftmp10], %[ff_pb_1] \n\t"
  1566. MMI_SDC1(%[ftmp8], %[stack], 0x20)
  1567. "pavgb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  1568. "psubusb %[ftmp8], %[ftmp3], %[ftmp2] \n\t"
  1569. "pavgb %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
  1570. "psubusb %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
  1571. "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
  1572. "psubusb %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  1573. MMI_LDC1(%[ftmp15], %[stack], 0x20)
  1574. "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
  1575. "and %[ftmp7], %[ftmp7], %[ftmp15] \n\t"
  1576. MMI_LDXC1(%[ftmp15], %[addr0], %[stride], 0x00)
  1577. "psubusb %[ftmp8], %[ftmp15], %[ftmp2] \n\t"
  1578. "psubusb %[ftmp5], %[ftmp2], %[ftmp15] \n\t"
  1579. "psubusb %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
  1580. "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  1581. "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
  1582. "and %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1583. MMI_LDXC1(%[ftmp14], %[pix], %[addr2], 0x00)
  1584. MMI_SDC1(%[ftmp5], %[stack], 0x30)
  1585. "psubusb %[ftmp8], %[ftmp14], %[ftmp3] \n\t"
  1586. "psubusb %[ftmp5], %[ftmp3], %[ftmp14] \n\t"
  1587. "psubusb %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
  1588. "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  1589. "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
  1590. "and %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1591. MMI_SDC1(%[ftmp5], %[stack], 0x40)
  1592. "pavgb %[ftmp5], %[ftmp15], %[ftmp1] \n\t"
  1593. "pavgb %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
  1594. "pavgb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  1595. MMI_SDC1(%[ftmp6], %[stack], 0x10)
  1596. "paddb %[ftmp7], %[ftmp15], %[ftmp1] \n\t"
  1597. "paddb %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
  1598. "paddb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
  1599. "mov.d %[ftmp8], %[ftmp7] \n\t"
  1600. MMI_SDC1(%[ftmp7], %[stack], 0x00)
  1601. "psrlh %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
  1602. "pavgb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  1603. "xor %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  1604. "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
  1605. "psubb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1606. "pavgb %[ftmp6], %[ftmp15], %[ftmp4] \n\t"
  1607. "psubb %[ftmp7], %[ftmp15], %[ftmp4] \n\t"
  1608. "paddb %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
  1609. "psubb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  1610. "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
  1611. "psubb %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
  1612. MMI_LDC1(%[ftmp13], %[stack], 0x10)
  1613. "pavgb %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
  1614. "psrlh %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
  1615. "pavgb %[ftmp6], %[ftmp6], %[ftmp13] \n\t"
  1616. "pavgb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
  1617. "xor %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
  1618. "and %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
  1619. "psubb %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
  1620. "xor %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
  1621. "pavgb %[ftmp7], %[ftmp2], %[ftmp4] \n\t"
  1622. "and %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
  1623. "psubb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
  1624. MMI_LDC1(%[ftmp13], %[stack], 0x30)
  1625. "pavgb %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
  1626. MMI_LDC1(%[ftmp12], %[stack], 0x20)
  1627. "xor %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
  1628. "xor %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
  1629. "and %[ftmp6], %[ftmp6], %[ftmp13] \n\t"
  1630. "and %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
  1631. "xor %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
  1632. "xor %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
  1633. MMI_SDXC1(%[ftmp6], %[addr0], %[addr1], 0x00)
  1634. MMI_LDC1(%[ftmp6], %[addr0], 0x00)
  1635. "paddb %[ftmp7], %[ftmp15], %[ftmp6] \n\t"
  1636. "pavgb %[ftmp6], %[ftmp6], %[ftmp15] \n\t"
  1637. MMI_LDC1(%[ftmp12], %[stack], 0x00)
  1638. "pavgb %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
  1639. "paddb %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  1640. "paddb %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
  1641. "psrlh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
  1642. "pavgb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  1643. "xor %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
  1644. "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
  1645. MMI_LDC1(%[ftmp12], %[stack], 0x30)
  1646. "psubb %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
  1647. "xor %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
  1648. "xor %[ftmp6], %[ftmp6], %[ftmp15] \n\t"
  1649. "and %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
  1650. "and %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
  1651. "xor %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
  1652. "xor %[ftmp6], %[ftmp6], %[ftmp15] \n\t"
  1653. MMI_SDXC1(%[ftmp5], %[addr0], %[addr2], 0x00)
  1654. MMI_SDXC1(%[ftmp6], %[addr0], %[stride], 0x00)
  1655. "pavgb %[ftmp5], %[ftmp14], %[ftmp4] \n\t"
  1656. "pavgb %[ftmp6], %[ftmp3], %[ftmp2] \n\t"
  1657. "pavgb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  1658. MMI_SDC1(%[ftmp6], %[stack], 0x10)
  1659. "paddb %[ftmp7], %[ftmp14], %[ftmp4] \n\t"
  1660. "paddb %[ftmp8], %[ftmp3], %[ftmp2] \n\t"
  1661. "paddb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
  1662. "mov.d %[ftmp8], %[ftmp7] \n\t"
  1663. MMI_SDC1(%[ftmp7], %[stack], 0x00)
  1664. "psrlh %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
  1665. "pavgb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  1666. "xor %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  1667. "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
  1668. "psubb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1669. "pavgb %[ftmp6], %[ftmp14], %[ftmp1] \n\t"
  1670. "paddb %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
  1671. "psubb %[ftmp7], %[ftmp14], %[ftmp1] \n\t"
  1672. "psubb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  1673. "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
  1674. "psubb %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
  1675. MMI_LDC1(%[ftmp12], %[stack], 0x10)
  1676. "pavgb %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
  1677. "pavgb %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
  1678. "psrlh %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
  1679. "pavgb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
  1680. "xor %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
  1681. "and %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
  1682. "psubb %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
  1683. "xor %[ftmp8], %[ftmp3], %[ftmp1] \n\t"
  1684. "pavgb %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
  1685. "and %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
  1686. MMI_LDC1(%[ftmp12], %[stack], 0x40)
  1687. "psubb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
  1688. MMI_LDC1(%[ftmp13], %[stack], 0x20)
  1689. "pavgb %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
  1690. "xor %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
  1691. "xor %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
  1692. "and %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
  1693. "and %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
  1694. "xor %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
  1695. "xor %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
  1696. MMI_SDC1(%[ftmp6], %[pix], 0x00)
  1697. MMI_LDXC1(%[ftmp6], %[pix], %[addr1], 0x00)
  1698. "paddb %[ftmp7], %[ftmp14], %[ftmp6] \n\t"
  1699. "pavgb %[ftmp6], %[ftmp6], %[ftmp14] \n\t"
  1700. MMI_LDC1(%[ftmp12], %[stack], 0x00)
  1701. "pavgb %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
  1702. "paddb %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  1703. "paddb %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
  1704. "psrlh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
  1705. "pavgb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  1706. "xor %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
  1707. "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
  1708. MMI_LDC1(%[ftmp12], %[stack], 0x40)
  1709. "psubb %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
  1710. "xor %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
  1711. "xor %[ftmp6], %[ftmp6], %[ftmp14] \n\t"
  1712. "and %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
  1713. "and %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
  1714. "xor %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
  1715. "xor %[ftmp6], %[ftmp6], %[ftmp14] \n\t"
  1716. MMI_SDXC1(%[ftmp5], %[pix], %[stride], 0x00)
  1717. MMI_SDXC1(%[ftmp6], %[pix], %[addr2], 0x00)
  1718. "1: \n\t"
  1719. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  1720. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  1721. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  1722. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  1723. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  1724. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  1725. [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
  1726. [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
  1727. [tmp0]"=&r"(tmp[0]),
  1728. RESTRICT_ASM_ALL64
  1729. RESTRICT_ASM_ADDRT
  1730. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
  1731. [addr2]"=&r"(addr[2]),
  1732. [alpha]"+&r"(alpha), [beta]"+&r"(beta)
  1733. : [pix]"r"(pix), [stride]"r"((mips_reg)stride),
  1734. [stack]"r"(stack), [ff_pb_1]"m"(ff_pb_1)
  1735. : "memory"
  1736. );
  1737. }
  1738. void ff_deblock_v_chroma_8_mmi(uint8_t *pix, ptrdiff_t stride, int alpha,
  1739. int beta, int8_t *tc0)
  1740. {
  1741. double ftmp[9];
  1742. mips_reg addr[1];
  1743. DECLARE_VAR_LOW32;
  1744. DECLARE_VAR_ALL64;
  1745. DECLARE_VAR_ADDRT;
  1746. __asm__ volatile (
  1747. "addi %[alpha], %[alpha], -0x01 \n\t"
  1748. "addi %[beta], %[beta], -0x01 \n\t"
  1749. "or %[addr0], $0, %[pix] \n\t"
  1750. PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t"
  1751. PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t"
  1752. MMI_LDC1(%[ftmp1], %[addr0], 0x00)
  1753. MMI_LDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
  1754. MMI_LDC1(%[ftmp3], %[pix], 0x00)
  1755. MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
  1756. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  1757. "mtc1 %[alpha], %[ftmp5] \n\t"
  1758. "mtc1 %[beta], %[ftmp6] \n\t"
  1759. "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  1760. "pshufh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  1761. "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  1762. "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  1763. "psubusb %[ftmp7], %[ftmp3], %[ftmp2] \n\t"
  1764. "psubusb %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
  1765. "or %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  1766. "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
  1767. "psubusb %[ftmp7], %[ftmp2], %[ftmp1] \n\t"
  1768. "psubusb %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
  1769. "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1770. "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  1771. "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
  1772. "psubusb %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
  1773. "psubusb %[ftmp5], %[ftmp4], %[ftmp3] \n\t"
  1774. "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1775. "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  1776. "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
  1777. "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  1778. "pcmpeqb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  1779. MMI_ULWC1(%[ftmp7], %[tc0], 0x00)
  1780. "punpcklbh %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  1781. "and %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  1782. "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  1783. "xor %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
  1784. "xor %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  1785. "and %[ftmp6], %[ftmp6], %[ff_pb_1] \n\t"
  1786. "pavgb %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
  1787. "xor %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
  1788. "pavgb %[ftmp4], %[ftmp4], %[ff_pb_3] \n\t"
  1789. "pavgb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
  1790. "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  1791. "paddusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  1792. "psubusb %[ftmp7], %[ff_pb_A1], %[ftmp4] \n\t"
  1793. "psubusb %[ftmp4], %[ftmp4], %[ff_pb_A1] \n\t"
  1794. "pminub %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
  1795. "pminub %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
  1796. "psubusb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
  1797. "psubusb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
  1798. "paddusb %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  1799. "paddusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
  1800. MMI_SDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
  1801. MMI_SDC1(%[ftmp3], %[pix], 0x00)
  1802. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  1803. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  1804. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  1805. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  1806. [ftmp8]"=&f"(ftmp[8]),
  1807. RESTRICT_ASM_LOW32
  1808. RESTRICT_ASM_ALL64
  1809. RESTRICT_ASM_ADDRT
  1810. [addr0]"=&r"(addr[0])
  1811. : [pix]"r"(pix), [stride]"r"((mips_reg)stride),
  1812. [alpha]"r"(alpha), [beta]"r"(beta),
  1813. [tc0]"r"(tc0), [ff_pb_1]"f"(ff_pb_1),
  1814. [ff_pb_3]"f"(ff_pb_3), [ff_pb_A1]"f"(ff_pb_A1)
  1815. : "memory"
  1816. );
  1817. }
  1818. void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
  1819. int beta)
  1820. {
  1821. double ftmp[9];
  1822. mips_reg addr[1];
  1823. DECLARE_VAR_ALL64;
  1824. DECLARE_VAR_ADDRT;
  1825. __asm__ volatile (
  1826. "addi %[alpha], %[alpha], -0x01 \n\t"
  1827. "addi %[beta], %[beta], -0x01 \n\t"
  1828. "or %[addr0], $0, %[pix] \n\t"
  1829. PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t"
  1830. PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t"
  1831. MMI_LDC1(%[ftmp1], %[addr0], 0x00)
  1832. MMI_LDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
  1833. MMI_LDC1(%[ftmp3], %[pix], 0x00)
  1834. MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
  1835. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  1836. "mtc1 %[alpha], %[ftmp5] \n\t"
  1837. "mtc1 %[beta], %[ftmp6] \n\t"
  1838. "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  1839. "pshufh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  1840. "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  1841. "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  1842. "psubusb %[ftmp7], %[ftmp3], %[ftmp2] \n\t"
  1843. "psubusb %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
  1844. "or %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  1845. "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
  1846. "psubusb %[ftmp7], %[ftmp2], %[ftmp1] \n\t"
  1847. "psubusb %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
  1848. "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1849. "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  1850. "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
  1851. "psubusb %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
  1852. "psubusb %[ftmp5], %[ftmp4], %[ftmp3] \n\t"
  1853. "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1854. "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
  1855. "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
  1856. "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  1857. "pcmpeqb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  1858. "mov.d %[ftmp6], %[ftmp2] \n\t"
  1859. "mov.d %[ftmp7], %[ftmp3] \n\t"
  1860. "xor %[ftmp5], %[ftmp2], %[ftmp4] \n\t"
  1861. "and %[ftmp5], %[ftmp5], %[ff_pb_1] \n\t"
  1862. "pavgb %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  1863. "psubusb %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
  1864. "pavgb %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
  1865. "xor %[ftmp5], %[ftmp3], %[ftmp1] \n\t"
  1866. "and %[ftmp5], %[ftmp5], %[ff_pb_1] \n\t"
  1867. "pavgb %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
  1868. "psubusb %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
  1869. "pavgb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
  1870. "psubb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
  1871. "psubb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
  1872. "and %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
  1873. "and %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
  1874. "paddb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
  1875. "paddb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
  1876. MMI_SDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
  1877. MMI_SDC1(%[ftmp3], %[pix], 0x00)
  1878. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  1879. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  1880. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  1881. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  1882. [ftmp8]"=&f"(ftmp[8]),
  1883. RESTRICT_ASM_ALL64
  1884. RESTRICT_ASM_ADDRT
  1885. [addr0]"=&r"(addr[0])
  1886. : [pix]"r"(pix), [stride]"r"((mips_reg)stride),
  1887. [alpha]"r"(alpha), [beta]"r"(beta),
  1888. [ff_pb_1]"f"(ff_pb_1)
  1889. : "memory"
  1890. );
  1891. }
  1892. void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
  1893. int8_t *tc0)
  1894. {
  1895. double ftmp[11];
  1896. mips_reg addr[6];
  1897. DECLARE_VAR_LOW32;
  1898. __asm__ volatile (
  1899. "addi %[alpha], %[alpha], -0x01 \n\t"
  1900. "addi %[beta], %[beta], -0x01 \n\t"
  1901. PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
  1902. PTR_ADDI "%[pix], %[pix], -0x02 \n\t"
  1903. PTR_ADDU "%[addr1], %[addr0], %[stride] \n\t"
  1904. PTR_ADDU "%[addr2], %[addr0], %[addr0] \n\t"
  1905. "or %[addr5], $0, %[pix] \n\t"
  1906. PTR_ADDU "%[pix], %[pix], %[addr1] \n\t"
  1907. MMI_ULWC1(%[ftmp0], %[addr5], 0x00)
  1908. PTR_ADDU "%[addr3], %[addr5], %[stride] \n\t"
  1909. MMI_ULWC1(%[ftmp2], %[addr3], 0x00)
  1910. PTR_ADDU "%[addr4], %[addr5], %[addr0] \n\t"
  1911. MMI_ULWC1(%[ftmp1], %[addr4], 0x00)
  1912. MMI_ULWC1(%[ftmp3], %[pix], 0x00)
  1913. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  1914. "punpcklbh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  1915. PTR_ADDU "%[addr3], %[pix], %[stride] \n\t"
  1916. "punpckhhw %[ftmp2], %[ftmp0], %[ftmp1] \n\t"
  1917. "punpcklhw %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  1918. MMI_ULWC1(%[ftmp4], %[addr3], 0x00)
  1919. PTR_ADDU "%[addr4], %[pix], %[addr0] \n\t"
  1920. MMI_ULWC1(%[ftmp6], %[addr4], 0x00)
  1921. PTR_ADDU "%[addr3], %[pix], %[addr1] \n\t"
  1922. MMI_ULWC1(%[ftmp5], %[addr3], 0x00)
  1923. PTR_ADDU "%[addr4], %[pix], %[addr2] \n\t"
  1924. MMI_ULWC1(%[ftmp7], %[addr4], 0x00)
  1925. "punpcklbh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  1926. "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  1927. "mov.d %[ftmp6], %[ftmp4] \n\t"
  1928. "punpcklhw %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  1929. "punpckhhw %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
  1930. "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
  1931. "punpckhwd %[ftmp3], %[ftmp2], %[ftmp6] \n\t"
  1932. "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  1933. "punpcklwd %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
  1934. "mov.d %[ftmp9], %[ftmp0] \n\t"
  1935. "mov.d %[ftmp10], %[ftmp3] \n\t"
  1936. "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
  1937. "mtc1 %[alpha], %[ftmp4] \n\t"
  1938. "mtc1 %[beta], %[ftmp5] \n\t"
  1939. "pshufh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
  1940. "pshufh %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
  1941. "packushb %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
  1942. "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  1943. "psubusb %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
  1944. "psubusb %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
  1945. "or %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
  1946. "psubusb %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
  1947. "psubusb %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
  1948. "psubusb %[ftmp4], %[ftmp0], %[ftmp1] \n\t"
  1949. "or %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  1950. "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  1951. "or %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
  1952. "psubusb %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
  1953. "psubusb %[ftmp4], %[ftmp3], %[ftmp2] \n\t"
  1954. "or %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  1955. "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  1956. "or %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
  1957. "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  1958. "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
  1959. MMI_ULWC1(%[ftmp6], %[tc0], 0x00)
  1960. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  1961. "and %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
  1962. "pcmpeqb %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
  1963. "xor %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
  1964. "xor %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
  1965. "and %[ftmp5], %[ftmp5], %[ff_pb_1] \n\t"
  1966. "pavgb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  1967. "xor %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
  1968. "pavgb %[ftmp3], %[ftmp3], %[ff_pb_3] \n\t"
  1969. "pavgb %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
  1970. "pavgb %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
  1971. "paddusb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
  1972. "psubusb %[ftmp6], %[ff_pb_A1], %[ftmp3] \n\t"
  1973. "psubusb %[ftmp3], %[ftmp3], %[ff_pb_A1] \n\t"
  1974. "pminub %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
  1975. "pminub %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
  1976. "psubusb %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
  1977. "psubusb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  1978. "paddusb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  1979. "paddusb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
  1980. "punpckhwd %[ftmp4], %[ftmp9], %[ftmp9] \n\t"
  1981. "punpckhwd %[ftmp5], %[ftmp1], %[ftmp1] \n\t"
  1982. "punpckhwd %[ftmp6], %[ftmp2], %[ftmp2] \n\t"
  1983. "punpcklbh %[ftmp0], %[ftmp9], %[ftmp1] \n\t"
  1984. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
  1985. "punpcklhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
  1986. "punpckhhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  1987. MMI_USWC1(%[ftmp1], %[addr5], 0x00)
  1988. PTR_ADDU "%[addr3], %[addr5], %[stride] \n\t"
  1989. "punpckhwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
  1990. MMI_USWC1(%[ftmp1], %[addr3], 0x00)
  1991. PTR_ADDU "%[addr4], %[addr5], %[addr0] \n\t"
  1992. MMI_USWC1(%[ftmp0], %[addr4], 0x00)
  1993. "punpckhwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  1994. "punpckhwd %[ftmp3], %[ftmp10], %[ftmp10] \n\t"
  1995. MMI_USWC1(%[ftmp0], %[pix], 0x00)
  1996. "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  1997. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
  1998. PTR_ADDU "%[addr3], %[pix], %[stride] \n\t"
  1999. "punpcklhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t"
  2000. "punpckhhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  2001. MMI_USWC1(%[ftmp5], %[addr3], 0x00)
  2002. "punpckhwd %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  2003. PTR_ADDU "%[addr3], %[pix], %[addr0] \n\t"
  2004. PTR_ADDU "%[addr4], %[pix], %[addr1] \n\t"
  2005. MMI_USWC1(%[ftmp5], %[addr3], 0x00)
  2006. MMI_USWC1(%[ftmp4], %[addr4], 0x00)
  2007. PTR_ADDU "%[addr3], %[pix], %[addr2] \n\t"
  2008. "punpckhwd %[ftmp9], %[ftmp4], %[ftmp4] \n\t"
  2009. MMI_USWC1(%[ftmp9], %[addr3], 0x00)
  2010. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  2011. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  2012. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  2013. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  2014. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  2015. [ftmp10]"=&f"(ftmp[10]),
  2016. RESTRICT_ASM_LOW32
  2017. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
  2018. [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
  2019. [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
  2020. [pix]"+&r"(pix)
  2021. : [alpha]"r"(alpha), [beta]"r"(beta),
  2022. [stride]"r"((mips_reg)stride), [tc0]"r"(tc0),
  2023. [ff_pb_1]"f"(ff_pb_1), [ff_pb_3]"f"(ff_pb_3),
  2024. [ff_pb_A1]"f"(ff_pb_A1)
  2025. : "memory"
  2026. );
  2027. }
  2028. void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
  2029. int beta)
  2030. {
  2031. double ftmp[11];
  2032. mips_reg addr[6];
  2033. DECLARE_VAR_LOW32;
  2034. __asm__ volatile (
  2035. "addi %[alpha], %[alpha], -0x01 \n\t"
  2036. "addi %[beta], %[beta], -0x01 \n\t"
  2037. PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
  2038. PTR_ADDI "%[pix], %[pix], -0x02 \n\t"
  2039. PTR_ADDU "%[addr1], %[addr0], %[stride] \n\t"
  2040. PTR_ADDU "%[addr2], %[addr0], %[addr0] \n\t"
  2041. "or %[addr5], $0, %[pix] \n\t"
  2042. PTR_ADDU "%[pix], %[pix], %[addr1] \n\t"
  2043. MMI_ULWC1(%[ftmp0], %[addr5], 0x00)
  2044. PTR_ADDU "%[addr3], %[addr5], %[stride] \n\t"
  2045. MMI_ULWC1(%[ftmp2], %[addr3], 0x00)
  2046. PTR_ADDU "%[addr4], %[addr5], %[addr0] \n\t"
  2047. MMI_ULWC1(%[ftmp1], %[addr4], 0x00)
  2048. MMI_ULWC1(%[ftmp3], %[pix], 0x00)
  2049. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  2050. "punpcklbh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  2051. PTR_ADDU "%[addr3], %[pix], %[stride] \n\t"
  2052. "punpckhhw %[ftmp2], %[ftmp0], %[ftmp1] \n\t"
  2053. "punpcklhw %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  2054. MMI_ULWC1(%[ftmp4], %[addr3], 0x00)
  2055. PTR_ADDU "%[addr4], %[pix], %[addr0] \n\t"
  2056. MMI_ULWC1(%[ftmp6], %[addr4], 0x00)
  2057. PTR_ADDU "%[addr3], %[pix], %[addr1] \n\t"
  2058. MMI_ULWC1(%[ftmp5], %[addr3], 0x00)
  2059. PTR_ADDU "%[addr4], %[pix], %[addr2] \n\t"
  2060. MMI_ULWC1(%[ftmp7], %[addr4], 0x00)
  2061. "punpcklbh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  2062. "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
  2063. "mov.d %[ftmp6], %[ftmp4] \n\t"
  2064. "punpcklhw %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  2065. "punpckhhw %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
  2066. "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
  2067. "punpckhwd %[ftmp3], %[ftmp2], %[ftmp6] \n\t"
  2068. "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  2069. "punpcklwd %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
  2070. "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
  2071. "mtc1 %[alpha], %[ftmp4] \n\t"
  2072. "mtc1 %[beta], %[ftmp5] \n\t"
  2073. "pshufh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
  2074. "pshufh %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
  2075. "packushb %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
  2076. "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  2077. "psubusb %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
  2078. "psubusb %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
  2079. "or %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
  2080. "psubusb %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
  2081. "psubusb %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
  2082. "psubusb %[ftmp4], %[ftmp0], %[ftmp1] \n\t"
  2083. "or %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  2084. "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  2085. "or %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
  2086. "psubusb %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
  2087. "psubusb %[ftmp4], %[ftmp3], %[ftmp2] \n\t"
  2088. "or %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  2089. "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  2090. "or %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
  2091. "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  2092. "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
  2093. "mov.d %[ftmp5], %[ftmp1] \n\t"
  2094. "mov.d %[ftmp6], %[ftmp2] \n\t"
  2095. "xor %[ftmp4], %[ftmp1], %[ftmp3] \n\t"
  2096. "and %[ftmp4], %[ftmp4], %[ff_pb_1] \n\t"
  2097. "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  2098. "psubusb %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
  2099. "pavgb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  2100. "xor %[ftmp4], %[ftmp2], %[ftmp0] \n\t"
  2101. "and %[ftmp4], %[ftmp4], %[ff_pb_1] \n\t"
  2102. "pavgb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  2103. "psubusb %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  2104. "pavgb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  2105. "psubb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  2106. "psubb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
  2107. "and %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
  2108. "and %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
  2109. "paddb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  2110. "paddb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
  2111. "punpckhwd %[ftmp4], %[ftmp0], %[ftmp0] \n\t"
  2112. "punpckhwd %[ftmp5], %[ftmp1], %[ftmp1] \n\t"
  2113. "punpckhwd %[ftmp6], %[ftmp2], %[ftmp2] \n\t"
  2114. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  2115. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  2116. "punpcklhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
  2117. "punpckhhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  2118. MMI_USWC1(%[ftmp1], %[addr5], 0x00)
  2119. PTR_ADDU "%[addr3], %[addr5], %[stride] \n\t"
  2120. "punpckhwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
  2121. PTR_ADDU "%[addr4], %[addr5], %[addr0] \n\t"
  2122. MMI_USWC1(%[ftmp1], %[addr3], 0x00)
  2123. MMI_USWC1(%[ftmp0], %[addr4], 0x00)
  2124. "punpckhwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  2125. "punpckhwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
  2126. MMI_USWC1(%[ftmp0], %[pix], 0x00)
  2127. "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  2128. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
  2129. PTR_ADDU "%[addr3], %[pix], %[stride] \n\t"
  2130. "punpcklhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t"
  2131. "punpckhhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  2132. MMI_USWC1(%[ftmp5], %[addr3], 0x00)
  2133. "punpckhwd %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  2134. PTR_ADDU "%[addr3], %[pix], %[addr0] \n\t"
  2135. PTR_ADDU "%[addr4], %[pix], %[addr1] \n\t"
  2136. MMI_USWC1(%[ftmp5], %[addr3], 0x00)
  2137. PTR_ADDU "%[addr3], %[pix], %[addr2] \n\t"
  2138. MMI_USWC1(%[ftmp4], %[addr4], 0x00)
  2139. "punpckhwd %[ftmp9], %[ftmp4], %[ftmp4] \n\t"
  2140. MMI_USWC1(%[ftmp9], %[addr3], 0x00)
  2141. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  2142. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  2143. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  2144. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  2145. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  2146. [ftmp10]"=&f"(ftmp[10]),
  2147. RESTRICT_ASM_LOW32
  2148. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
  2149. [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
  2150. [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
  2151. [pix]"+&r"(pix)
  2152. : [alpha]"r"(alpha), [beta]"r"(beta),
  2153. [stride]"r"((mips_reg)stride), [ff_pb_1]"f"(ff_pb_1)
  2154. : "memory"
  2155. );
  2156. }
  /*
   * Runs ff_deblock_v8_luma_8_mmi() on the two 8-byte-wide halves located at
   * pix and pix + 8.  Each half uses its own pair of tc0 entries (tc0[0..1],
   * then tc0[2..3]); a half is skipped only when the bitwise AND of its pair
   * is negative, i.e. when both entries are negative.
   */
  void ff_deblock_v_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
          int8_t *tc0)
  {
      int half;

      for (half = 0; half < 2; half++) {
          int8_t *tc = tc0 + 2 * half;

          /* Both entries negative: nothing to do for this half. */
          if ((tc[0] & tc[1]) < 0)
              continue;

          ff_deblock_v8_luma_8_mmi(pix + 8 * half, stride, alpha, beta, tc);
      }
  }
  /*
   * Applies deblock_v8_luma_intra_8_mmi() unconditionally to the two
   * 8-byte-wide halves at pix and pix + 8, left half first.
   */
  void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
          int beta)
  {
      int offset;

      for (offset = 0; offset < 16; offset += 8)
          deblock_v8_luma_intra_8_mmi(pix + offset, stride, alpha, beta);
  }
  2171. void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
  2172. int8_t *tc0)
  2173. {
  2174. DECLARE_ALIGNED(8, const uint64_t, stack[0x0d]);
  2175. double ftmp[9];
  2176. mips_reg addr[8];
  2177. DECLARE_VAR_LOW32;
  2178. DECLARE_VAR_ALL64;
  2179. __asm__ volatile (
  2180. PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
  2181. PTR_ADDI "%[addr1], %[pix], -0x4 \n\t"
  2182. PTR_ADDU "%[addr2], %[stride], %[addr0] \n\t"
  2183. MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
  2184. PTR_ADDU "%[addr3], %[addr1], %[stride] \n\t"
  2185. PTR_ADDU "%[addr4], %[addr1], %[addr2] \n\t"
  2186. MMI_ULDC1(%[ftmp1], %[addr3], 0x00)
  2187. PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
  2188. MMI_ULDC1(%[ftmp2], %[addr5], 0x00)
  2189. MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
  2190. PTR_ADDU "%[addr3], %[addr4], %[stride] \n\t"
  2191. MMI_ULDC1(%[ftmp4], %[addr3], 0x00)
  2192. PTR_ADDU "%[addr5], %[addr4], %[addr0] \n\t"
  2193. MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
  2194. PTR_ADDU "%[addr3], %[addr4], %[addr2] \n\t"
  2195. MMI_ULDC1(%[ftmp6], %[addr3], 0x00)
  2196. PTR_ADDU "%[addr6], %[addr0], %[addr0] \n\t"
  2197. "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
  2198. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  2199. "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
  2200. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  2201. "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
  2202. "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  2203. PTR_ADDU "%[addr3], %[addr4], %[addr6] \n\t"
  2204. MMI_SDC1(%[ftmp1], %[stack], 0x10)
  2205. MMI_ULDC1(%[ftmp8], %[addr3], 0x00)
  2206. PTR_ADDU "%[addr7], %[addr6], %[addr6] \n\t"
  2207. "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
  2208. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
  2209. "punpckhhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
  2210. "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  2211. "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
  2212. "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  2213. MMI_LDC1(%[ftmp8], %[stack], 0x10)
  2214. "punpckhwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  2215. MMI_SDC1(%[ftmp0], %[stack], 0x00)
  2216. "punpckhhw %[ftmp6], %[ftmp7], %[ftmp8] \n\t"
  2217. "punpcklhw %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
  2218. "punpckhhw %[ftmp0], %[ftmp3], %[ftmp5] \n\t"
  2219. "punpcklhw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
  2220. "punpcklwd %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  2221. "punpckhwd %[ftmp5], %[ftmp7], %[ftmp3] \n\t"
  2222. "punpcklwd %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
  2223. "punpckhwd %[ftmp3], %[ftmp1], %[ftmp2] \n\t"
  2224. "punpcklwd %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  2225. MMI_SDC1(%[ftmp1], %[stack], 0x10)
  2226. MMI_SDC1(%[ftmp3], %[stack], 0x20)
  2227. MMI_SDC1(%[ftmp7], %[stack], 0x30)
  2228. MMI_SDC1(%[ftmp5], %[stack], 0x40)
  2229. MMI_SDC1(%[ftmp6], %[stack], 0x50)
  2230. PTR_ADDU "%[addr1], %[addr1], %[addr7] \n\t"
  2231. PTR_ADDU "%[addr4], %[addr4], %[addr7] \n\t"
  2232. MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
  2233. PTR_ADDU "%[addr3], %[addr1], %[stride] \n\t"
  2234. MMI_ULDC1(%[ftmp1], %[addr3], 0x00)
  2235. PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
  2236. MMI_ULDC1(%[ftmp2], %[addr5], 0x00)
  2237. MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
  2238. PTR_ADDU "%[addr3], %[addr4], %[stride] \n\t"
  2239. MMI_ULDC1(%[ftmp4], %[addr3], 0x00)
  2240. PTR_ADDU "%[addr5], %[addr4], %[addr0] \n\t"
  2241. MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
  2242. PTR_ADDU "%[addr3], %[addr4], %[addr2] \n\t"
  2243. MMI_ULDC1(%[ftmp6], %[addr3], 0x00)
  2244. "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
  2245. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  2246. "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
  2247. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  2248. "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
  2249. "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  2250. PTR_ADDU "%[addr3], %[addr4], %[addr6] \n\t"
  2251. MMI_SDC1(%[ftmp1], %[stack], 0x18)
  2252. MMI_ULDC1(%[ftmp8], %[addr3], 0x00)
  2253. "punpckhhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
  2254. "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
  2255. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
  2256. "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  2257. "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
  2258. "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  2259. "punpckhwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  2260. MMI_LDC1(%[ftmp8], %[stack], 0x18)
  2261. MMI_SDC1(%[ftmp0], %[stack], 0x08)
  2262. "punpckhhw %[ftmp6], %[ftmp7], %[ftmp8] \n\t"
  2263. "punpcklhw %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
  2264. "punpckhhw %[ftmp0], %[ftmp3], %[ftmp5] \n\t"
  2265. "punpcklhw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
  2266. "punpckhwd %[ftmp5], %[ftmp7], %[ftmp3] \n\t"
  2267. "punpcklwd %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
  2268. "punpckhwd %[ftmp3], %[ftmp1], %[ftmp2] \n\t"
  2269. "punpcklwd %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  2270. "punpcklwd %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  2271. MMI_SDC1(%[ftmp1], %[stack], 0x18)
  2272. MMI_SDC1(%[ftmp3], %[stack], 0x28)
  2273. MMI_SDC1(%[ftmp7], %[stack], 0x38)
  2274. MMI_SDC1(%[ftmp5], %[stack], 0x48)
  2275. MMI_SDC1(%[ftmp6], %[stack], 0x58)
  2276. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  2277. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  2278. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  2279. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  2280. [ftmp8]"=&f"(ftmp[8]),
  2281. RESTRICT_ASM_ALL64
  2282. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
  2283. [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
  2284. [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
  2285. [addr6]"=&r"(addr[6]), [addr7]"=&r"(addr[7])
  2286. : [pix]"r"(pix), [stride]"r"((mips_reg)stride),
  2287. [stack]"r"(stack)
  2288. : "memory"
  2289. );
  2290. ff_deblock_v_luma_8_mmi((uint8_t *) &stack[6], 0x10, alpha, beta, tc0);
  2291. __asm__ volatile (
  2292. PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
  2293. PTR_ADDI "%[addr1], %[pix], -0x02 \n\t"
  2294. PTR_ADDU "%[addr6], %[addr0], %[addr0] \n\t"
  2295. PTR_ADDU "%[addr2], %[addr0], %[stride] \n\t"
  2296. PTR_ADDU "%[addr7], %[addr6], %[addr6] \n\t"
  2297. PTR_ADDU "%[addr4], %[addr1], %[addr2] \n\t"
  2298. MMI_LDC1(%[ftmp0], %[stack], 0x10)
  2299. MMI_LDC1(%[ftmp1], %[stack], 0x20)
  2300. MMI_LDC1(%[ftmp2], %[stack], 0x30)
  2301. MMI_LDC1(%[ftmp3], %[stack], 0x40)
  2302. "punpckhwd %[ftmp4], %[ftmp0], %[ftmp0] \n\t"
  2303. "punpckhwd %[ftmp5], %[ftmp1], %[ftmp1] \n\t"
  2304. "punpckhwd %[ftmp6], %[ftmp2], %[ftmp2] \n\t"
  2305. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  2306. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  2307. "punpcklhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
  2308. "punpckhhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  2309. MMI_USWC1(%[ftmp1], %[addr1], 0x00)
  2310. PTR_ADDU "%[addr3], %[addr1], %[stride] \n\t"
  2311. "punpckhwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
  2312. PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
  2313. MMI_USWC1(%[ftmp1], %[addr3], 0x00)
  2314. MMI_USWC1(%[ftmp0], %[addr5], 0x00)
  2315. "punpckhwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  2316. "punpckhwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
  2317. MMI_USWC1(%[ftmp0], %[addr4], 0x00)
  2318. "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  2319. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
  2320. "punpcklhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t"
  2321. PTR_ADDU "%[addr3], %[addr4], %[stride] \n\t"
  2322. "punpckhhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  2323. MMI_USWC1(%[ftmp5], %[addr3], 0x00)
  2324. PTR_ADDU "%[addr3], %[addr4], %[addr0] \n\t"
  2325. "punpckhwd %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  2326. PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
  2327. MMI_USWC1(%[ftmp5], %[addr3], 0x00)
  2328. MMI_USWC1(%[ftmp4], %[addr5], 0x00)
  2329. PTR_ADDU "%[addr3], %[addr4], %[addr6] \n\t"
  2330. "punpckhwd %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
  2331. PTR_ADDU "%[addr1], %[addr1], %[addr7] \n\t"
  2332. MMI_USWC1(%[ftmp4], %[addr3], 0x00)
  2333. PTR_ADDU "%[addr4], %[addr4], %[addr7] \n\t"
  2334. MMI_LDC1(%[ftmp0], %[stack], 0x18)
  2335. MMI_LDC1(%[ftmp1], %[stack], 0x28)
  2336. MMI_LDC1(%[ftmp2], %[stack], 0x38)
  2337. MMI_LDC1(%[ftmp3], %[stack], 0x48)
  2338. PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
  2339. "punpckhwd %[ftmp4], %[ftmp0], %[ftmp0] \n\t"
  2340. PTR_ADDU "%[addr6], %[addr0], %[addr0] \n\t"
  2341. "punpckhwd %[ftmp5], %[ftmp1], %[ftmp1] \n\t"
  2342. "punpckhwd %[ftmp6], %[ftmp2], %[ftmp2] \n\t"
  2343. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  2344. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  2345. PTR_ADDU "%[addr3], %[addr1], %[stride] \n\t"
  2346. "punpcklhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
  2347. "punpckhhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  2348. MMI_USWC1(%[ftmp1], %[addr1], 0x00)
  2349. "punpckhwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
  2350. PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
  2351. MMI_USWC1(%[ftmp1], %[addr3], 0x00)
  2352. MMI_USWC1(%[ftmp0], %[addr5], 0x00)
  2353. "punpckhwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  2354. "punpckhwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
  2355. MMI_USWC1(%[ftmp0], %[addr4], 0x00)
  2356. "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  2357. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
  2358. PTR_ADDU "%[addr3], %[addr4], %[stride] \n\t"
  2359. "punpcklhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t"
  2360. "punpckhhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  2361. MMI_USWC1(%[ftmp5], %[addr3], 0x00)
  2362. PTR_ADDU "%[addr3], %[addr4], %[addr0] \n\t"
  2363. "punpckhwd %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  2364. PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
  2365. MMI_USWC1(%[ftmp5], %[addr3], 0x00)
  2366. MMI_USWC1(%[ftmp4], %[addr5], 0x00)
  2367. PTR_ADDU "%[addr3], %[addr4], %[addr6] \n\t"
  2368. "punpckhwd %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
  2369. MMI_USWC1(%[ftmp4], %[addr3], 0x00)
  2370. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  2371. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  2372. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  2373. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  2374. [ftmp8]"=&f"(ftmp[8]),
  2375. RESTRICT_ASM_LOW32
  2376. RESTRICT_ASM_ALL64
  2377. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
  2378. [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
  2379. [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
  2380. [addr6]"=&r"(addr[6]), [addr7]"=&r"(addr[7])
  2381. : [pix]"r"(pix), [stride]"r"((mips_reg)stride),
  2382. [stack]"r"(stack)
  2383. : "memory"
  2384. );
  2385. }
  2386. void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
  2387. int beta)
  2388. {
  2389. DECLARE_ALIGNED(8, const uint64_t, ptmp[0x11]);
  2390. DECLARE_ALIGNED(8, const uint64_t, pdat[0x04]);
  2391. double ftmp[9];
  2392. mips_reg addr[7];
  2393. DECLARE_VAR_ALL64;
  2394. __asm__ volatile (
  2395. PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
  2396. PTR_ADDI "%[addr1], %[pix], -0x04 \n\t"
  2397. PTR_ADDU "%[addr2], %[addr0], %[stride] \n\t"
  2398. PTR_ADDU "%[addr3], %[addr0], %[addr0] \n\t"
  2399. PTR_ADDU "%[addr4], %[addr1], %[addr2] \n\t"
  2400. PTR_ADDU "%[addr5], %[addr1], %[stride] \n\t"
  2401. MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
  2402. PTR_ADDU "%[addr6], %[addr1], %[addr0] \n\t"
  2403. MMI_ULDC1(%[ftmp1], %[addr5], 0x00)
  2404. MMI_ULDC1(%[ftmp2], %[addr6], 0x00)
  2405. PTR_ADDU "%[addr5], %[addr4], %[stride] \n\t"
  2406. MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
  2407. PTR_ADDU "%[addr6], %[addr4], %[addr0] \n\t"
  2408. MMI_ULDC1(%[ftmp4], %[addr5], 0x00)
  2409. PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
  2410. MMI_ULDC1(%[ftmp5], %[addr6], 0x00)
  2411. MMI_ULDC1(%[ftmp6], %[addr5], 0x00)
  2412. PTR_ADDU "%[addr5], %[addr4], %[addr3] \n\t"
  2413. "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
  2414. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  2415. "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
  2416. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  2417. "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
  2418. "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  2419. MMI_ULDC1(%[ftmp8], %[addr5], 0x00)
  2420. "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
  2421. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
  2422. MMI_SDC1(%[ftmp3], %[ptmp], 0x00)
  2423. "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t"
  2424. "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  2425. "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
  2426. "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  2427. "punpckhhw %[ftmp6], %[ftmp7], %[ftmp1] \n\t"
  2428. "punpcklhw %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
  2429. MMI_SDC1(%[ftmp2], %[ptmp], 0x20)
  2430. MMI_LDC1(%[ftmp2], %[ptmp], 0x00)
  2431. "punpckhhw %[ftmp1], %[ftmp2], %[ftmp5] \n\t"
  2432. "punpcklhw %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
  2433. "punpckhwd %[ftmp5], %[ftmp0], %[ftmp4] \n\t"
  2434. "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  2435. "punpckhwd %[ftmp4], %[ftmp7], %[ftmp2] \n\t"
  2436. "punpcklwd %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
  2437. MMI_SDC1(%[ftmp0], %[ptmp], 0x00)
  2438. MMI_SDC1(%[ftmp5], %[ptmp], 0x10)
  2439. MMI_SDC1(%[ftmp7], %[ptmp], 0x40)
  2440. MMI_SDC1(%[ftmp4], %[ptmp], 0x50)
  2441. MMI_LDC1(%[ftmp8], %[ptmp], 0x20)
  2442. "punpckhwd %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
  2443. "punpcklwd %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
  2444. "punpckhwd %[ftmp5], %[ftmp6], %[ftmp1] \n\t"
  2445. "punpcklwd %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
  2446. PTR_ADDU "%[addr5], %[addr3], %[addr3] \n\t"
  2447. MMI_SDC1(%[ftmp3], %[ptmp], 0x20)
  2448. MMI_SDC1(%[ftmp0], %[ptmp], 0x30)
  2449. MMI_SDC1(%[ftmp6], %[ptmp], 0x60)
  2450. MMI_SDC1(%[ftmp5], %[ptmp], 0x70)
  2451. PTR_ADDU "%[addr1], %[addr1], %[addr5] \n\t"
  2452. PTR_ADDU "%[addr4], %[addr4], %[addr5] \n\t"
  2453. PTR_ADDU "%[addr5], %[addr1], %[stride] \n\t"
  2454. MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
  2455. PTR_ADDU "%[addr6], %[addr1], %[addr0] \n\t"
  2456. MMI_ULDC1(%[ftmp1], %[addr5], 0x00)
  2457. MMI_ULDC1(%[ftmp2], %[addr6], 0x00)
  2458. PTR_ADDU "%[addr5], %[addr4], %[stride] \n\t"
  2459. MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
  2460. PTR_ADDU "%[addr6], %[addr4], %[addr0] \n\t"
  2461. MMI_ULDC1(%[ftmp4], %[addr5], 0x00)
  2462. PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
  2463. MMI_ULDC1(%[ftmp5], %[addr6], 0x00)
  2464. MMI_ULDC1(%[ftmp6], %[addr5], 0x00)
  2465. PTR_ADDU "%[addr5], %[addr4], %[addr3] \n\t"
  2466. "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
  2467. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  2468. "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
  2469. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  2470. "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
  2471. "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  2472. MMI_ULDC1(%[ftmp8], %[addr5], 0x00)
  2473. "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
  2474. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
  2475. MMI_SDC1(%[ftmp3], %[ptmp], 0x08)
  2476. "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t"
  2477. "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  2478. "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
  2479. "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  2480. "punpckhhw %[ftmp6], %[ftmp7], %[ftmp1] \n\t"
  2481. "punpcklhw %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
  2482. MMI_SDC1(%[ftmp2], %[ptmp], 0x28)
  2483. MMI_LDC1(%[ftmp2], %[ptmp], 0x08)
  2484. "punpckhhw %[ftmp1], %[ftmp2], %[ftmp5] \n\t"
  2485. "punpcklhw %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
  2486. "punpckhwd %[ftmp5], %[ftmp0], %[ftmp4] \n\t"
  2487. "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  2488. "punpckhwd %[ftmp4], %[ftmp7], %[ftmp2] \n\t"
  2489. "punpcklwd %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
  2490. MMI_SDC1(%[ftmp0], %[ptmp], 0x08)
  2491. MMI_SDC1(%[ftmp5], %[ptmp], 0x18)
  2492. MMI_SDC1(%[ftmp7], %[ptmp], 0x48)
  2493. MMI_SDC1(%[ftmp4], %[ptmp], 0x58)
  2494. MMI_LDC1(%[ftmp8], %[ptmp], 0x28)
  2495. "punpckhwd %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
  2496. "punpcklwd %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
  2497. "punpckhwd %[ftmp5], %[ftmp6], %[ftmp1] \n\t"
  2498. "punpcklwd %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
  2499. MMI_SDC1(%[ftmp3], %[ptmp], 0x28)
  2500. MMI_SDC1(%[ftmp0], %[ptmp], 0x38)
  2501. MMI_SDC1(%[ftmp6], %[ptmp], 0x68)
  2502. MMI_SDC1(%[ftmp5], %[ptmp], 0x78)
  2503. PTR_S "%[addr1], 0x00(%[pdat]) \n\t"
  2504. PTR_S "%[addr2], 0x08(%[pdat]) \n\t"
  2505. PTR_S "%[addr0], 0x10(%[pdat]) \n\t"
  2506. PTR_S "%[addr3], 0x18(%[pdat]) \n\t"
  2507. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  2508. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  2509. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  2510. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  2511. [ftmp8]"=&f"(ftmp[8]),
  2512. RESTRICT_ASM_ALL64
  2513. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
  2514. [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
  2515. [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
  2516. [addr6]"=&r"(addr[6])
  2517. : [pix]"r"(pix), [stride]"r"((mips_reg)stride),
  2518. [ptmp]"r"(ptmp), [pdat]"r"(pdat)
  2519. : "memory"
  2520. );
  2521. ff_deblock_v_luma_intra_8_mmi((uint8_t *) &ptmp[8], 0x10, alpha, beta);
  2522. __asm__ volatile (
  2523. PTR_L "%[addr1], 0x00(%[pdat]) \n\t"
  2524. PTR_L "%[addr2], 0x08(%[pdat]) \n\t"
  2525. PTR_L "%[addr0], 0x10(%[pdat]) \n\t"
  2526. PTR_L "%[addr3], 0x18(%[pdat]) \n\t"
  2527. PTR_ADDU "%[addr4], %[addr1], %[addr2] \n\t"
  2528. MMI_LDC1(%[ftmp0], %[ptmp], 0x08)
  2529. MMI_LDC1(%[ftmp1], %[ptmp], 0x18)
  2530. MMI_LDC1(%[ftmp2], %[ptmp], 0x28)
  2531. MMI_LDC1(%[ftmp3], %[ptmp], 0x38)
  2532. MMI_LDC1(%[ftmp4], %[ptmp], 0x48)
  2533. MMI_LDC1(%[ftmp5], %[ptmp], 0x58)
  2534. MMI_LDC1(%[ftmp6], %[ptmp], 0x68)
  2535. "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
  2536. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  2537. "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
  2538. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  2539. "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
  2540. "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  2541. MMI_LDC1(%[ftmp8], %[ptmp], 0x78)
  2542. "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
  2543. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
  2544. MMI_USDC1(%[ftmp3], %[addr1], 0x00)
  2545. PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
  2546. "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t"
  2547. "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  2548. "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
  2549. "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  2550. "punpckhhw %[ftmp6], %[ftmp7], %[ftmp1] \n\t"
  2551. "punpcklhw %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
  2552. MMI_USDC1(%[ftmp2], %[addr5], 0x00)
  2553. MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
  2554. "punpckhhw %[ftmp1], %[ftmp2], %[ftmp5] \n\t"
  2555. "punpcklhw %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
  2556. "punpckhwd %[ftmp5], %[ftmp0], %[ftmp4] \n\t"
  2557. "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  2558. "punpckhwd %[ftmp4], %[ftmp7], %[ftmp2] \n\t"
  2559. "punpcklwd %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
  2560. PTR_ADDU "%[addr5], %[addr1], %[stride] \n\t"
  2561. MMI_USDC1(%[ftmp0], %[addr1], 0x00)
  2562. PTR_ADDU "%[addr6], %[addr4], %[stride] \n\t"
  2563. MMI_USDC1(%[ftmp5], %[addr5], 0x00)
  2564. PTR_ADDU "%[addr5], %[addr4], %[addr0] \n\t"
  2565. MMI_USDC1(%[ftmp7], %[addr6], 0x00)
  2566. PTR_ADDU "%[addr6], %[addr1], %[addr0] \n\t"
  2567. MMI_USDC1(%[ftmp4], %[addr5], 0x00)
  2568. MMI_ULDC1(%[ftmp8], %[addr6], 0x00)
  2569. PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
  2570. "punpckhwd %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
  2571. "punpcklwd %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
  2572. "punpckhwd %[ftmp5], %[ftmp6], %[ftmp1] \n\t"
  2573. "punpcklwd %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
  2574. MMI_USDC1(%[ftmp3], %[addr5], 0x00)
  2575. PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
  2576. MMI_USDC1(%[ftmp0], %[addr4], 0x00)
  2577. PTR_ADDU "%[addr6], %[addr4], %[addr3] \n\t"
  2578. MMI_USDC1(%[ftmp6], %[addr5], 0x00)
  2579. PTR_ADDU "%[addr5], %[addr3], %[addr3] \n\t"
  2580. MMI_USDC1(%[ftmp5], %[addr6], 0x00)
  2581. PTR_SUBU "%[addr1], %[addr1], %[addr5] \n\t"
  2582. PTR_SUBU "%[addr4], %[addr4], %[addr5] \n\t"
  2583. MMI_LDC1(%[ftmp0], %[ptmp], 0x00)
  2584. MMI_LDC1(%[ftmp1], %[ptmp], 0x10)
  2585. MMI_LDC1(%[ftmp2], %[ptmp], 0x20)
  2586. MMI_LDC1(%[ftmp3], %[ptmp], 0x30)
  2587. MMI_LDC1(%[ftmp4], %[ptmp], 0x40)
  2588. MMI_LDC1(%[ftmp5], %[ptmp], 0x50)
  2589. MMI_LDC1(%[ftmp6], %[ptmp], 0x60)
  2590. "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
  2591. "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  2592. "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
  2593. "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  2594. "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
  2595. "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  2596. MMI_LDC1(%[ftmp8], %[ptmp], 0x70)
  2597. "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
  2598. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
  2599. MMI_USDC1(%[ftmp3], %[addr1], 0x00)
  2600. PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
  2601. "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t"
  2602. "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
  2603. "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
  2604. "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  2605. "punpckhhw %[ftmp6], %[ftmp7], %[ftmp1] \n\t"
  2606. "punpcklhw %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
  2607. MMI_USDC1(%[ftmp2], %[addr5], 0x00)
  2608. MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
  2609. "punpckhhw %[ftmp1], %[ftmp2], %[ftmp5] \n\t"
  2610. "punpcklhw %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
  2611. "punpckhwd %[ftmp5], %[ftmp0], %[ftmp4] \n\t"
  2612. "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  2613. "punpckhwd %[ftmp4], %[ftmp7], %[ftmp2] \n\t"
  2614. "punpcklwd %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
  2615. PTR_ADDU "%[addr5], %[addr1], %[stride] \n\t"
  2616. MMI_USDC1(%[ftmp0], %[addr1], 0x00)
  2617. PTR_ADDU "%[addr6], %[addr4], %[stride] \n\t"
  2618. MMI_USDC1(%[ftmp5], %[addr5], 0x00)
  2619. PTR_ADDU "%[addr5], %[addr4], %[addr0] \n\t"
  2620. MMI_USDC1(%[ftmp7], %[addr6], 0x00)
  2621. PTR_ADDU "%[addr6], %[addr1], %[addr0] \n\t"
  2622. MMI_USDC1(%[ftmp4], %[addr5], 0x00)
  2623. MMI_ULDC1(%[ftmp8], %[addr6], 0x00)
  2624. PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
  2625. "punpckhwd %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
  2626. "punpcklwd %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
  2627. "punpckhwd %[ftmp5], %[ftmp6], %[ftmp1] \n\t"
  2628. "punpcklwd %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
  2629. MMI_USDC1(%[ftmp3], %[addr5], 0x00)
  2630. PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
  2631. MMI_USDC1(%[ftmp0], %[addr4], 0x00)
  2632. PTR_ADDU "%[addr6], %[addr4], %[addr3] \n\t"
  2633. MMI_USDC1(%[ftmp6], %[addr5], 0x00)
  2634. MMI_USDC1(%[ftmp5], %[addr6], 0x00)
  2635. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  2636. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  2637. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  2638. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  2639. [ftmp8]"=&f"(ftmp[8]),
  2640. RESTRICT_ASM_ALL64
  2641. [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
  2642. [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
  2643. [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
  2644. [addr6]"=&r"(addr[6])
  2645. : [pix]"r"(pix), [stride]"r"((mips_reg)stride),
  2646. [ptmp]"r"(ptmp), [pdat]"r"(pdat)
  2647. : "memory"
  2648. );
  2649. }