/*
 * Loongson SIMD optimized hpeldsp
 *
 * Copyright (c) 2016 Loongson Technology Corporation Limited
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "hpeldsp_mips.h"
#include "libavcodec/bit_depth_template.c"
#include "libavutil/mips/mmiutils.h"
#include "constants.h"

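/*
 * ff_put_pixels{4,8,16}_8_mmi copy a 4-, 8- or 16-pixel-wide block of h
 * rows from pixels to block. Each pass of the loop handles four rows and
 * decrements h by 4, so h is assumed to be a multiple of 4. The MMI_UL*C1
 * macros are unaligned 32-/64-bit loads into float registers; MMI_S*C1 and
 * MMI_S*XC1 are the matching direct and indexed stores.
 */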
void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    double ftmp[2];
    mips_reg addr[2];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU "%[addr1], %[line_size], %[line_size] \n\t"
        "1: \n\t"
        PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t"
        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        MMI_SWC1(%[ftmp0], %[block], 0x00)
        MMI_SWXC1(%[ftmp1], %[block], %[line_size], 0x00)
        PTR_ADDU "%[pixels], %[pixels], %[addr1] \n\t"
        PTR_ADDU "%[block], %[block], %[addr1] \n\t"
        PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t"
        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        MMI_SWC1(%[ftmp0], %[block], 0x00)
        MMI_SWXC1(%[ftmp1], %[block], %[line_size], 0x00)
        PTR_ADDU "%[pixels], %[pixels], %[addr1] \n\t"
        PTR_ADDU "%[block], %[block], %[addr1] \n\t"
        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [block]"+&r"(block), [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    double ftmp[2];
    mips_reg addr[3];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        PTR_ADDU "%[addr1], %[line_size], %[line_size] \n\t"
        "1: \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        PTR_ADDU "%[addr2], %[block], %[line_size] \n\t"
        MMI_SDC1(%[ftmp1], %[addr2], 0x00)
        PTR_ADDU "%[pixels], %[pixels], %[addr1] \n\t"
        PTR_ADDU "%[block], %[block], %[addr1] \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        PTR_ADDU "%[addr2], %[block], %[line_size] \n\t"
        MMI_SDC1(%[ftmp1], %[addr2], 0x00)
        PTR_ADDU "%[pixels], %[pixels], %[addr1] \n\t"
        PTR_ADDU "%[block], %[block], %[addr1] \n\t"
        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),
          [block]"+&r"(block), [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    double ftmp[4];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU "%[addr1], %[line_size], %[line_size] \n\t"
        "1: \n\t"
        PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp2], %[pixels], 0x08)
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr0], 0x08)
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
        MMI_SDC1(%[ftmp2], %[block], 0x08)
        MMI_SDXC1(%[ftmp3], %[block], %[line_size], 0x08)
        PTR_ADDU "%[pixels], %[pixels], %[addr1] \n\t"
        PTR_ADDU "%[block], %[block], %[addr1] \n\t"
        PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp2], %[pixels], 0x08)
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr0], 0x08)
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
        MMI_SDC1(%[ftmp2], %[block], 0x08)
        MMI_SDXC1(%[ftmp3], %[block], %[line_size], 0x08)
        PTR_ADDU "%[pixels], %[pixels], %[addr1] \n\t"
        PTR_ADDU "%[block], %[block], %[addr1] \n\t"
        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [block]"+&r"(block), [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

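/*
 * ff_avg_pixels{4,8,16}_8_mmi average the source block into the
 * destination: pavgb is a rounded per-byte average, so each output byte
 * becomes (dst + src + 1) >> 1, again four rows per loop iteration.
 */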
void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    double ftmp[4];
    mips_reg addr[3];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU "%[addr2], %[line_size], %[line_size] \n\t"
        "1: \n\t"
        PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t"
        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU "%[addr1], %[block], %[line_size] \n\t"
        MMI_ULWC1(%[ftmp2], %[block], 0x00)
        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        MMI_SWC1(%[ftmp0], %[block], 0x00)
        MMI_SWXC1(%[ftmp1], %[block], %[line_size], 0x00)
        PTR_ADDU "%[pixels], %[pixels], %[addr2] \n\t"
        PTR_ADDU "%[block], %[block], %[addr2] \n\t"
        PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t"
        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU "%[addr1], %[block], %[line_size] \n\t"
        MMI_ULWC1(%[ftmp2], %[block], 0x00)
        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        MMI_SWC1(%[ftmp0], %[block], 0x00)
        MMI_SWXC1(%[ftmp1], %[block], %[line_size], 0x00)
        PTR_ADDU "%[pixels], %[pixels], %[addr2] \n\t"
        PTR_ADDU "%[block], %[block], %[addr2] \n\t"
        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),
          [block]"+&r"(block), [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    double ftmp[4];
    mips_reg addr[3];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU "%[addr2], %[line_size], %[line_size] \n\t"
        "1: \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU "%[addr1], %[block], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
        PTR_ADDU "%[pixels], %[pixels], %[addr2] \n\t"
        PTR_ADDU "%[block], %[block], %[addr2] \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU "%[addr1], %[block], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
        PTR_ADDU "%[pixels], %[pixels], %[addr2] \n\t"
        PTR_ADDU "%[block], %[block], %[addr2] \n\t"
        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),
          [block]"+&r"(block), [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    double ftmp[8];
    mips_reg addr[3];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU "%[addr2], %[line_size], %[line_size] \n\t"
        "1: \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp4], %[pixels], 0x08)
        PTR_ADDU "%[addr1], %[block], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp6], %[block], 0x08)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "pavgb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
        MMI_SDC1(%[ftmp4], %[block], 0x08)
        MMI_SDXC1(%[ftmp5], %[block], %[line_size], 0x08)
        PTR_ADDU "%[pixels], %[pixels], %[addr2] \n\t"
        PTR_ADDU "%[block], %[block], %[addr2] \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp4], %[pixels], 0x08)
        PTR_ADDU "%[addr1], %[block], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp6], %[block], 0x08)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "pavgb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
        MMI_SDC1(%[ftmp4], %[block], 0x08)
        MMI_SDXC1(%[ftmp5], %[block], %[line_size], 0x08)
        PTR_ADDU "%[pixels], %[pixels], %[addr2] \n\t"
        PTR_ADDU "%[block], %[block], %[addr2] \n\t"
        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),
          [block]"+&r"(block), [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

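/*
 * ff_put_pixels{4,8,16}_l2_8_mmi write the rounded per-byte average of
 * two source blocks to dst. The half-pel wrapper functions further down
 * build all interpolation cases from these by passing shifted views of
 * the same buffer as the two sources.
 */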
inline void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    double ftmp[4];
    mips_reg addr[5];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU "%[addr2], %[src_stride1], %[src_stride1] \n\t"
        PTR_ADDU "%[addr3], %[src_stride2], %[src_stride2] \n\t"
        PTR_ADDU "%[addr4], %[dst_stride], %[dst_stride] \n\t"
        "1: \n\t"
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        MMI_SWC1(%[ftmp0], %[dst], 0x00)
        MMI_SWXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        MMI_SWC1(%[ftmp0], %[dst], 0x00)
        MMI_SWXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"
        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),
          [dst]"+&r"(dst), [src1]"+&r"(src1),
          [src2]"+&r"(src2), [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

inline void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    double ftmp[4];
    mips_reg addr[5];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU "%[addr2], %[src_stride1], %[src_stride1] \n\t"
        PTR_ADDU "%[addr3], %[src_stride2], %[src_stride2] \n\t"
        PTR_ADDU "%[addr4], %[dst_stride], %[dst_stride] \n\t"
        "1: \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"
        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),
          [dst]"+&r"(dst), [src1]"+&r"(src1),
          [src2]"+&r"(src2), [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

inline void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    double ftmp[8];
    mips_reg addr[5];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU "%[addr2], %[src_stride1], %[src_stride1] \n\t"
        PTR_ADDU "%[addr3], %[src_stride2], %[src_stride2] \n\t"
        PTR_ADDU "%[addr4], %[dst_stride], %[dst_stride] \n\t"
        "1: \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULDC1(%[ftmp4], %[src1], 0x08)
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULDC1(%[ftmp6], %[src2], 0x08)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "pavgb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        MMI_SDC1(%[ftmp4], %[dst], 0x08)
        MMI_SDXC1(%[ftmp5], %[dst], %[dst_stride], 0x08)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULDC1(%[ftmp4], %[src1], 0x08)
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULDC1(%[ftmp6], %[src2], 0x08)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "pavgb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        MMI_SDC1(%[ftmp4], %[dst], 0x08)
        MMI_SDXC1(%[ftmp5], %[dst], %[dst_stride], 0x08)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"
        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),
          [dst]"+&r"(dst), [src1]"+&r"(src1),
          [src2]"+&r"(src2), [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

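/*
 * ff_avg_pixels{4,8}_l2_8_mmi first average the two sources, then average
 * that result with what is already in dst, i.e.
 * dst = avg(dst, avg(src1, src2)), with pavgb rounding at both steps.
 */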
inline void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    double ftmp[6];
    mips_reg addr[6];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU "%[addr2], %[src_stride1], %[src_stride1] \n\t"
        PTR_ADDU "%[addr3], %[src_stride2], %[src_stride2] \n\t"
        PTR_ADDU "%[addr4], %[dst_stride], %[dst_stride] \n\t"
        "1: \n\t"
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        PTR_ADDU "%[addr5], %[dst], %[dst_stride] \n\t"
        MMI_ULWC1(%[ftmp4], %[dst], 0x00)
        MMI_ULWC1(%[ftmp5], %[addr5], 0x00)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        MMI_SWC1(%[ftmp0], %[dst], 0x00)
        MMI_SWXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        PTR_ADDU "%[addr5], %[dst], %[dst_stride] \n\t"
        MMI_ULWC1(%[ftmp4], %[dst], 0x00)
        MMI_ULWC1(%[ftmp5], %[addr5], 0x00)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        MMI_SWC1(%[ftmp0], %[dst], 0x00)
        MMI_SWXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"
        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
          [dst]"+&r"(dst), [src1]"+&r"(src1),
          [src2]"+&r"(src2), [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

inline void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    double ftmp[6];
    mips_reg addr[6];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU "%[addr2], %[src_stride1], %[src_stride1] \n\t"
        PTR_ADDU "%[addr3], %[src_stride2], %[src_stride2] \n\t"
        PTR_ADDU "%[addr4], %[dst_stride], %[dst_stride] \n\t"
        "1: \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        PTR_ADDU "%[addr5], %[dst], %[dst_stride] \n\t"
        MMI_ULDC1(%[ftmp4], %[dst], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        PTR_ADDU "%[addr5], %[dst], %[dst_stride] \n\t"
        MMI_ULDC1(%[ftmp4], %[dst], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"
        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
          [dst]"+&r"(dst), [src1]"+&r"(src1),
          [src2]"+&r"(src2), [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

inline void ff_avg_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    ff_avg_pixels8_l2_8_mmi(dst, src1, src2, dst_stride, src_stride1,
            src_stride2, h);
    ff_avg_pixels8_l2_8_mmi(dst + 8, src1 + 8, src2 + 8, dst_stride,
            src_stride1, src_stride2, h);
}

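/*
 * The _x2 variants interpolate halfway between horizontally adjacent
 * pixels: each output pixel is the rounded average of pixels[x] and
 * pixels[x + 1], implemented by feeding the same buffer twice, offset by
 * one byte, into the _l2 helpers above.
 */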
void ff_put_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_put_pixels4_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
            line_size, h);
}

void ff_put_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_put_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
            line_size, h);
}

void ff_put_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_put_pixels16_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
            line_size, h);
}

void ff_avg_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_avg_pixels4_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
            line_size, h);
}

void ff_avg_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
            line_size, h);
}

void ff_avg_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_x2_8_mmi(block, pixels, line_size, h);
    ff_avg_pixels8_x2_8_mmi(block + 8, pixels + 8, line_size, h);
}

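/*
 * "No-rounding" average built on pavgb, which always rounds up:
 * complementing both inputs, averaging, and complementing the result
 * turns the round-up average into a round-down one, since
 * ~((~a + ~b + 1) >> 1) == (a + b) >> 1 for unsigned bytes. ftmp4 holds
 * the all-ones mask used for the complements (xor with 0xff per byte).
 */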
inline void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    double ftmp[5];
    mips_reg addr[5];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        "pcmpeqb %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
        PTR_ADDU "%[addr2], %[src_stride1], %[src_stride1] \n\t"
        PTR_ADDU "%[addr3], %[src_stride2], %[src_stride2] \n\t"
        PTR_ADDU "%[addr4], %[dst_stride], %[dst_stride] \n\t"
        "1: \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        "xor %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "xor %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
        "xor %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
        "xor %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "xor %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "xor %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        "xor %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "xor %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
        "xor %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
        "xor %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "xor %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "xor %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"
        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),
          [dst]"+&r"(dst), [src1]"+&r"(src1),
          [src2]"+&r"(src2), [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

void ff_put_no_rnd_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size,
            line_size, line_size, h);
}

void ff_put_no_rnd_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_x2_8_mmi(block, pixels, line_size, h);
    ff_put_no_rnd_pixels8_x2_8_mmi(block + 8, pixels + 8, line_size, h);
}

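/*
 * The _y2 variants interpolate halfway between vertically adjacent rows:
 * the second source is simply the same buffer advanced by one line.
 */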
void ff_put_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_put_pixels4_l2_8_mmi(block, pixels, pixels + line_size, line_size,
            line_size, line_size, h);
}

void ff_put_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_put_pixels8_l2_8_mmi(block, pixels, pixels + line_size, line_size,
            line_size, line_size, h);
}

void ff_put_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_put_pixels16_l2_8_mmi(block, pixels, pixels + line_size, line_size,
            line_size, line_size, h);
}

void ff_avg_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_avg_pixels4_l2_8_mmi(block, pixels, pixels + line_size, line_size,
            line_size, line_size, h);
}

void ff_avg_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_l2_8_mmi(block, pixels, pixels + line_size, line_size,
            line_size, line_size, h);
}

void ff_avg_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_y2_8_mmi(block, pixels, line_size, h);
    ff_avg_pixels8_y2_8_mmi(block + 8, pixels + 8, line_size, h);
}

void ff_put_no_rnd_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, pixels + line_size,
            line_size, line_size, line_size, h);
}

void ff_put_no_rnd_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_y2_8_mmi(block, pixels, line_size, h);
    ff_put_no_rnd_pixels8_y2_8_mmi(block + 8, pixels + 8, line_size, h);
}

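/*
 * The _xy2 variants average over a 2x2 neighbourhood,
 * (a + b + c + d + 2) >> 2 per pixel. The scalar versions do this four
 * pixels at a time on packed 32-bit words: each byte x is split as
 * x = 4 * (x >> 2) + (x & 3), the high parts (h0/h1) and low parts
 * (l0/l1, which also carry the rounding bias) are summed separately, and
 * (((l0 + l1) >> 2) & 0x0F0F0F0F) re-adds the low-bit contribution
 * without letting carries spill across byte lanes.
 */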
void ff_put_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    int i;
    const uint32_t a = AV_RN32(pixels);
    const uint32_t b = AV_RN32(pixels + 1);
    uint32_t l0 = (a & 0x03030303UL) +
                  (b & 0x03030303UL) +
                  0x02020202UL;
    uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                  ((b & 0xFCFCFCFCUL) >> 2);
    uint32_t l1, h1;

    pixels += line_size;
    for (i = 0; i < h; i += 2) {
        uint32_t a = AV_RN32(pixels);
        uint32_t b = AV_RN32(pixels + 1);
        l1 = (a & 0x03030303UL) +
             (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) +
             ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block  += line_size;
        a = AV_RN32(pixels);
        b = AV_RN32(pixels + 1);
        l0 = (a & 0x03030303UL) +
             (b & 0x03030303UL) +
             0x02020202UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) +
             ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block  += line_size;
    }
}

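/*
 * MMI version of the same 2x2 average for the 8-pixel case: the prologue
 * builds the per-halfword rounding constant 2 in ftmp6 (all-ones shifted
 * right by 15, then left by 1) and the shift count 2 in ftmp9. Each row
 * is widened from bytes to halfwords with punpck{l,h}bh against the zero
 * register ftmp7 and added to its one-byte-shifted copy; consecutive row
 * sums plus the rounding constant are shifted right by 2 and narrowed
 * back to bytes with packushb. The #else branch keeps the scalar
 * fallback.
 */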
void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
#if 1
    double ftmp[10];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
        "dli %[addr0], 0x0f \n\t"
        "pcmpeqw %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
        "dmtc1 %[addr0], %[ftmp8] \n\t"
        "dli %[addr0], 0x01 \n\t"
        "psrlh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
        "dmtc1 %[addr0], %[ftmp8] \n\t"
        "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
        "dli %[addr0], 0x02 \n\t"
        "dmtc1 %[addr0], %[ftmp9] \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp4], %[pixels], 0x01)
        "mov.d %[ftmp1], %[ftmp0] \n\t"
        "mov.d %[ftmp5], %[ftmp4] \n\t"
        "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
        "punpckhbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
        "punpckhbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
        "paddush %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddush %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
        "xor %[addr0], %[addr0], %[addr0] \n\t"
        PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        PTR_ADDU "%[addr1], %[pixels], %[addr0] \n\t"
        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
        MMI_ULDC1(%[ftmp2], %[addr1], 0x01)
        "mov.d %[ftmp1], %[ftmp0] \n\t"
        "mov.d %[ftmp3], %[ftmp2] \n\t"
        "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
        "punpckhbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
        "punpckhbh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "paddush %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "paddush %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "paddush %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
        "paddush %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
        "paddush %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddush %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
        "psrlh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
        "psrlh %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
        MMI_SDXC1(%[ftmp4], %[block], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[line_size] \n\t"
        PTR_ADDU "%[addr1], %[pixels], %[addr0] \n\t"
        MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
        MMI_ULDC1(%[ftmp4], %[addr1], 0x01)
        "mov.d %[ftmp3], %[ftmp2] \n\t"
        "mov.d %[ftmp5], %[ftmp4] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
        "punpckhbh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "punpckhbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
        "paddush %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
        "paddush %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
        "paddush %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
        "paddush %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
        "paddush %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "paddush %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "psrlh %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
        "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
        "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
        MMI_SDXC1(%[ftmp0], %[block], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[line_size] \n\t"
        PTR_ADDU "%[h], %[h], -0x02 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [h]"+&r"(h), [pixels]"+&r"(pixels)
        : [block]"r"(block), [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
#else
    /* FIXME HIGH BIT DEPTH */
    int j;

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = AV_RN32(pixels);
        const uint32_t b = AV_RN32(pixels + 1);
        uint32_t l0 = (a & 0x03030303UL) +
                      (b & 0x03030303UL) +
                      0x02020202UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                      ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = AV_RN32(pixels);
            uint32_t b = AV_RN32(pixels + 1);
            l1 = (a & 0x03030303UL) +
                 (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
            a = AV_RN32(pixels);
            b = AV_RN32(pixels + 1);
            l0 = (a & 0x03030303UL) +
                 (b & 0x03030303UL) +
                 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block  += 4 - line_size * h;
    }
#endif
}

void ff_put_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_put_pixels8_xy2_8_mmi(block, pixels, line_size, h);
    ff_put_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h);
}

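/*
 * The avg_..._xy2 versions compute the same bilinear value and then fold
 * it into the destination with rnd_avg32(), a rounded packed-byte average
 * of two 32-bit words.
 */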
void ff_avg_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    int i;
    const uint32_t a = AV_RN32(pixels);
    const uint32_t b = AV_RN32(pixels + 1);
    uint32_t l0 = (a & 0x03030303UL) +
                  (b & 0x03030303UL) +
                  0x02020202UL;
    uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                  ((b & 0xFCFCFCFCUL) >> 2);
    uint32_t l1, h1;

    pixels += line_size;
    for (i = 0; i < h; i += 2) {
        uint32_t a = AV_RN32(pixels);
        uint32_t b = AV_RN32(pixels + 1);
        l1 = (a & 0x03030303UL) +
             (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) +
             ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
        pixels += line_size;
        block  += line_size;
        a = AV_RN32(pixels);
        b = AV_RN32(pixels + 1);
        l0 = (a & 0x03030303UL) +
             (b & 0x03030303UL) +
             0x02020202UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) +
             ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
        pixels += line_size;
        block  += line_size;
    }
}

void ff_avg_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    int j;

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = AV_RN32(pixels);
        const uint32_t b = AV_RN32(pixels + 1);
        uint32_t l0 = (a & 0x03030303UL) +
                      (b & 0x03030303UL) +
                      0x02020202UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                      ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = AV_RN32(pixels);
            uint32_t b = AV_RN32(pixels + 1);
            l1 = (a & 0x03030303UL) +
                 (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block  += line_size;
            a = AV_RN32(pixels);
            b = AV_RN32(pixels + 1);
            l0 = (a & 0x03030303UL) +
                 (b & 0x03030303UL) +
                 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block  += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block  += 4 - line_size * h;
    }
}

void ff_avg_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_xy2_8_mmi(block, pixels, line_size, h);
    ff_avg_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h);
}

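/*
 * The no-rounding xy2 variant uses 0x01010101 instead of 0x02020202 as
 * the bias, so ties in (a + b + c + d) >> 2 round down rather than up.
 */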
void ff_put_no_rnd_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    int j;

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = AV_RN32(pixels);
        const uint32_t b = AV_RN32(pixels + 1);
        uint32_t l0 = (a & 0x03030303UL) +
                      (b & 0x03030303UL) +
                      0x01010101UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                      ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = AV_RN32(pixels);
            uint32_t b = AV_RN32(pixels + 1);
            l1 = (a & 0x03030303UL) +
                 (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
            a = AV_RN32(pixels);
            b = AV_RN32(pixels + 1);
            l0 = (a & 0x03030303UL) +
                 (b & 0x03030303UL) +
                 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block  += 4 - line_size * h;
    }
}

void ff_put_no_rnd_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_xy2_8_mmi(block, pixels, line_size, h);
    ff_put_no_rnd_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h);
}