You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

507 lines
26KB

  1. /*
  2. * Loongson SIMD optimized mpegvideo
  3. *
  4. * Copyright (c) 2015 Loongson Technology Corporation Limited
  5. * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
  6. * Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
  7. *
  8. * This file is part of FFmpeg.
  9. *
  10. * FFmpeg is free software; you can redistribute it and/or
  11. * modify it under the terms of the GNU Lesser General Public
  12. * License as published by the Free Software Foundation; either
  13. * version 2.1 of the License, or (at your option) any later version.
  14. *
  15. * FFmpeg is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. * Lesser General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU Lesser General Public
  21. * License along with FFmpeg; if not, write to the Free Software
  22. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. */
  24. #include "mpegvideo_mips.h"
  25. #include "libavutil/mips/mmiutils.h"
/*
 * Dequantize an H.263 intra-coded block in place (Loongson MMI SIMD).
 *
 * The DC coefficient (block[0]) is rescaled in scalar C (y/c DC scale, or
 * passed through under AIC) and written back after the asm loop, which
 * also happens to process element 0 -- the final "block[0] = level"
 * overwrites whatever the SIMD loop put there.
 *
 * The loop walks the coefficients with a negative byte index that counts
 * up to 0, handling 8 int16 coefficients (two 8-byte loads) per iteration.
 * Per lane it computes  x*qmul + sign(x)*qadd  while leaving zero lanes
 * untouched: ftmp0 holds -qadd in every lane, pcmpgth builds an "x > 0"
 * mask, the xor/paddh/xor sequence applies qadd with the sign of x, and
 * pcmpeqh/pandn restores lanes whose source value was 0 back to 0.
 */
  26. void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
  27. int n, int qscale)
  28. {
  29. int64_t level, qmul, qadd, nCoeffs;
  30. double ftmp[6];
  31. mips_reg addr[1];
  32. DECLARE_VAR_ALL64;
  33. qmul = qscale << 1;
  34. av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
/* Without AIC the DC term is scaled here in C; n<4 selects luma blocks. */
  35. if (!s->h263_aic) {
  36. if (n<4)
  37. level = block[0] * s->y_dc_scale;
  38. else
  39. level = block[0] * s->c_dc_scale;
  40. qadd = (qscale-1) | 1;
  41. } else {
  42. qadd = 0;
  43. level = block[0];
  44. }
/* With AC prediction all 63 AC coefficients may be non-zero. */
  45. if(s->ac_pred)
  46. nCoeffs = 63;
  47. else
  48. nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
  49. __asm__ volatile (
/* Broadcast qmul/qadd to all four 16-bit lanes; ftmp0 = -qadd per lane. */
  50. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  51. "packsswh %[qmul], %[qmul], %[qmul] \n\t"
  52. "packsswh %[qmul], %[qmul], %[qmul] \n\t"
  53. "packsswh %[qadd], %[qadd], %[qadd] \n\t"
  54. "packsswh %[qadd], %[qadd], %[qadd] \n\t"
  55. "psubh %[ftmp0], %[ftmp0], %[qadd] \n\t"
  56. "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  57. ".p2align 4 \n\t"
  58. "1: \n\t"
/* addr0 = (block+nCoeffs) + running negative byte offset; first pass
 * lands on element 0. */
  59. PTR_ADDU "%[addr0], %[block], %[nCoeffs] \n\t"
  60. MMI_LDC1(%[ftmp1], %[addr0], 0x00)
  61. MMI_LDC1(%[ftmp2], %[addr0], 0x08)
  62. "mov.d %[ftmp3], %[ftmp1] \n\t"
  63. "mov.d %[ftmp4], %[ftmp2] \n\t"
  64. "pmullh %[ftmp1], %[ftmp1], %[qmul] \n\t"
  65. "pmullh %[ftmp2], %[ftmp2], %[qmul] \n\t"
/* ftmp3/ftmp4 become "x > 0" lane masks. */
  66. "pcmpgth %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
  67. "pcmpgth %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
/* Conditional-negate trick: yields x*qmul+qadd for x>0, x*qmul-qadd
 * for x<0, and exactly -qadd for x==0 (caught below). */
  68. "xor %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  69. "xor %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  70. "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  71. "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  72. "xor %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
  73. "xor %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
/* Lanes that still equal -qadd came from zero input; pandn zeroes them. */
  74. "pcmpeqh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  75. "pcmpeqh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  76. "pandn %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  77. "pandn %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  78. PTR_ADDIU "%[nCoeffs], %[nCoeffs], 0x10 \n\t"
  79. MMI_SDC1(%[ftmp1], %[addr0], 0x00)
  80. MMI_SDC1(%[ftmp2], %[addr0], 0x08)
/* Loop while the byte index is still <= 0. */
  81. "blez %[nCoeffs], 1b \n\t"
  82. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  83. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  84. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  85. RESTRICT_ASM_ALL64
  86. [addr0]"=&r"(addr[0])
  87. : [block]"r"((mips_reg)(block+nCoeffs)),
  88. [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
  89. [qmul]"f"(qmul), [qadd]"f"(qadd)
  90. : "memory"
  91. );
/* Restore the separately-computed DC value (truncated to int16_t). */
  92. block[0] = level;
  93. }
/*
 * Dequantize an H.263 inter-coded block in place (Loongson MMI SIMD).
 *
 * Identical SIMD kernel to the intra variant above -- per 16-bit lane it
 * computes  x*qmul + sign(x)*qadd  with zero lanes preserved -- but here
 * the DC coefficient gets no special scalar treatment and qadd is always
 * (qscale-1)|1.  Eight int16 coefficients are processed per iteration.
 */
  94. void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
  95. int n, int qscale)
  96. {
  97. int64_t qmul, qadd, nCoeffs;
  98. double ftmp[6];
  99. mips_reg addr[1];
  100. DECLARE_VAR_ALL64;
  101. qmul = qscale << 1;
  102. qadd = (qscale - 1) | 1;
  103. av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
  104. nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
  105. __asm__ volatile (
/* Broadcast qmul/qadd to all 16-bit lanes; ftmp0 = -qadd per lane. */
  106. "packsswh %[qmul], %[qmul], %[qmul] \n\t"
  107. "packsswh %[qmul], %[qmul], %[qmul] \n\t"
  108. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  109. "packsswh %[qadd], %[qadd], %[qadd] \n\t"
  110. "packsswh %[qadd], %[qadd], %[qadd] \n\t"
  111. "psubh %[ftmp0], %[ftmp0], %[qadd] \n\t"
  112. "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  113. ".p2align 4 \n\t"
  114. "1: \n\t"
/* addr0 = (block+nCoeffs) + negative byte offset counting up to 0. */
  115. PTR_ADDU "%[addr0], %[block], %[nCoeffs] \n\t"
  116. MMI_LDC1(%[ftmp1], %[addr0], 0x00)
  117. MMI_LDC1(%[ftmp2], %[addr0], 0x08)
  118. "mov.d %[ftmp3], %[ftmp1] \n\t"
  119. "mov.d %[ftmp4], %[ftmp2] \n\t"
  120. "pmullh %[ftmp1], %[ftmp1], %[qmul] \n\t"
  121. "pmullh %[ftmp2], %[ftmp2], %[qmul] \n\t"
/* "x > 0" masks, then sign-aware qadd application (see intra variant). */
  122. "pcmpgth %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
  123. "pcmpgth %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
  124. "xor %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  125. "xor %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  126. "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  127. "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  128. "xor %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
  129. "xor %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
/* Zero-input lanes (which ended up as -qadd) are forced back to 0. */
  130. "pcmpeqh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  131. "pcmpeqh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  132. "pandn %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  133. "pandn %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  134. PTR_ADDIU "%[nCoeffs], %[nCoeffs], 0x10 \n\t"
  135. MMI_SDC1(%[ftmp1], %[addr0], 0x00)
  136. MMI_SDC1(%[ftmp2], %[addr0], 0x08)
  137. "blez %[nCoeffs], 1b \n\t"
  138. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  139. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  140. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  141. RESTRICT_ASM_ALL64
  142. [addr0]"=&r"(addr[0])
  143. : [block]"r"((mips_reg)(block+nCoeffs)),
  144. [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
  145. [qmul]"f"(qmul), [qadd]"f"(qadd)
  146. : "memory"
  147. );
  148. }
/*
 * Dequantize an MPEG-1 intra block in place (Loongson MMI SIMD).
 *
 * The DC coefficient is rescaled in scalar C and restored after the asm
 * loop.  For each of the remaining lanes the loop computes
 *     v = (|x| * intra_matrix[i] * qscale) >> 3
 *     v = (v - 1) | 1            (MPEG-1 mismatch-avoidance odd-ification)
 * then restores sign(x) and forces lanes whose input was 0 back to 0.
 * Eight int16 coefficients are handled per iteration; the byte index runs
 * from -2*nCoeffs up toward 0 (bltz loops while it is still negative).
 */
  149. void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
  150. int n, int qscale)
  151. {
  152. int64_t nCoeffs;
  153. const uint16_t *quant_matrix;
  154. int block0;
  155. double ftmp[10];
  156. uint64_t tmp[1];
  157. mips_reg addr[1];
  158. DECLARE_VAR_ALL64;
  159. DECLARE_VAR_ADDRT;
  160. av_assert2(s->block_last_index[n]>=0);
/* +1 so the loop covers the coefficient at raster_end as well. */
  161. nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;
  162. if (n<4)
  163. block0 = block[0] * s->y_dc_scale;
  164. else
  165. block0 = block[0] * s->c_dc_scale;
  166. /* XXX: only mpeg1 */
  167. quant_matrix = s->intra_matrix;
  168. __asm__ volatile (
/* ftmp0 = 0x0001 in each 16-bit lane (all-ones >> 15); ftmp1 = qscale
 * broadcast to all lanes. */
  169. "dli %[tmp0], 0x0f \n\t"
  170. "pcmpeqh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  171. "dmtc1 %[tmp0], %[ftmp4] \n\t"
  172. "dmtc1 %[qscale], %[ftmp1] \n\t"
  173. "psrlh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  174. "packsswh %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
  175. "packsswh %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
  176. "or %[addr0], %[nCoeffs], $0 \n\t"
  177. ".p2align 4 \n\t"
  178. "1: \n\t"
/* Load 8 coefficients and the matching quant-matrix entries (indexed
 * off block+nCoeffs / quant+nCoeffs by the negative byte offset). */
  179. MMI_LDXC1(%[ftmp2], %[addr0], %[block], 0x00)
  180. MMI_LDXC1(%[ftmp3], %[addr0], %[block], 0x08)
  181. "mov.d %[ftmp4], %[ftmp2] \n\t"
  182. "mov.d %[ftmp5], %[ftmp3] \n\t"
  183. MMI_LDXC1(%[ftmp6], %[addr0], %[quant], 0x00)
  184. MMI_LDXC1(%[ftmp7], %[addr0], %[quant], 0x08)
  185. "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
  186. "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
/* Sign masks (0 > x), then |x| via xor/psubh. */
  187. "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
  188. "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
  189. "pcmpgth %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
  190. "pcmpgth %[ftmp9], %[ftmp9], %[ftmp3] \n\t"
  191. "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
  192. "xor %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
  193. "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
  194. "psubh %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
  195. "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
  196. "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
/* ftmp6/ftmp7 become "input was 0" masks from the saved copies. */
  197. "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  198. "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  199. "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
  200. "dli %[tmp0], 0x03 \n\t"
  201. "pcmpeqh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  202. "dmtc1 %[tmp0], %[ftmp4] \n\t"
/* >>3 then (v-1)|1 using the per-lane constant 1 in ftmp0. */
  203. "psrah %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  204. "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
  205. "psubh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  206. "psubh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  207. "or %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  208. "or %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
/* Restore original signs, zero out zero-input lanes, store. */
  209. "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
  210. "xor %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
  211. "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
  212. "psubh %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
  213. "pandn %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
  214. "pandn %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
  215. MMI_SDXC1(%[ftmp6], %[addr0], %[block], 0x00)
  216. MMI_SDXC1(%[ftmp7], %[addr0], %[block], 0x08)
  217. PTR_ADDIU "%[addr0], %[addr0], 0x10 \n\t"
  218. "bltz %[addr0], 1b \n\t"
  219. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  220. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  221. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  222. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  223. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  224. [tmp0]"=&r"(tmp[0]),
  225. RESTRICT_ASM_ALL64
  226. RESTRICT_ASM_ADDRT
  227. [addr0]"=&r"(addr[0])
  228. : [block]"r"((mips_reg)(block+nCoeffs)),
  229. [quant]"r"((mips_reg)(quant_matrix+nCoeffs)),
  230. [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
  231. [qscale]"r"(qscale)
  232. : "memory"
  233. );
/* The loop also touched element 0; put the scalar DC result back. */
  234. block[0] = block0;
  235. }
/*
 * Dequantize an MPEG-1 inter block in place (Loongson MMI SIMD).
 *
 * Per 16-bit lane the loop computes
 *     v = ((2*|x| + 1) * inter_matrix[i] * qscale) >> 4
 *     v = (v - 1) | 1            (MPEG-1 mismatch-avoidance odd-ification)
 * then restores sign(x) and forces lanes whose input was 0 back to 0.
 * Eight coefficients per iteration; the negative byte index counts up
 * toward 0 and bltz loops while it is still negative.
 */
  236. void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
  237. int n, int qscale)
  238. {
  239. int64_t nCoeffs;
  240. const uint16_t *quant_matrix;
  241. double ftmp[10];
  242. uint64_t tmp[1];
  243. mips_reg addr[1];
  244. DECLARE_VAR_ALL64;
  245. DECLARE_VAR_ADDRT;
  246. av_assert2(s->block_last_index[n] >= 0);
/* intra_scantable.raster_end is used for inter blocks too, matching the
 * generic C implementation (the raster_end table is scan-order derived). */
  247. nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;
  248. quant_matrix = s->inter_matrix;
  249. __asm__ volatile (
/* ftmp0 = 0x0001 per lane; ftmp1 = qscale broadcast to all lanes. */
  250. "dli %[tmp0], 0x0f \n\t"
  251. "pcmpeqh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  252. "dmtc1 %[tmp0], %[ftmp4] \n\t"
  253. "dmtc1 %[qscale], %[ftmp1] \n\t"
  254. "psrlh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
  255. "packsswh %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
  256. "packsswh %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
  257. "or %[addr0], %[nCoeffs], $0 \n\t"
  258. ".p2align 4 \n\t"
  259. "1: \n\t"
/* Load 8 coefficients plus matching quant-matrix entries. */
  260. MMI_LDXC1(%[ftmp2], %[addr0], %[block], 0x00)
  261. MMI_LDXC1(%[ftmp3], %[addr0], %[block], 0x08)
  262. "mov.d %[ftmp4], %[ftmp2] \n\t"
  263. "mov.d %[ftmp5], %[ftmp3] \n\t"
  264. MMI_LDXC1(%[ftmp6], %[addr0], %[quant], 0x00)
  265. MMI_LDXC1(%[ftmp7], %[addr0], %[quant], 0x08)
  266. "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
  267. "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
/* Sign masks (0 > x), |x| via xor/psubh. */
  268. "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
  269. "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
  270. "pcmpgth %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
  271. "pcmpgth %[ftmp9], %[ftmp9], %[ftmp3] \n\t"
  272. "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
  273. "xor %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
  274. "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
  275. "psubh %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
/* 2*|x| + 1 (inter reconstruction uses the centered formula). */
  276. "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
  277. "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
  278. "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  279. "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  280. "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
  281. "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
/* ftmp6/ftmp7 become "input was 0" masks from the saved copies. */
  282. "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  283. "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  284. "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
  285. "dli %[tmp0], 0x04 \n\t"
  286. "pcmpeqh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  287. "dmtc1 %[tmp0], %[ftmp4] \n\t"
/* >>4 then (v-1)|1, restore signs, zero the zero-input lanes, store. */
  288. "psrah %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
  289. "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
  290. "psubh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  291. "psubh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  292. "or %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  293. "or %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  294. "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
  295. "xor %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
  296. "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
  297. "psubh %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
  298. "pandn %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
  299. "pandn %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
  300. MMI_SDXC1(%[ftmp6], %[addr0], %[block], 0x00)
  301. MMI_SDXC1(%[ftmp7], %[addr0], %[block], 0x08)
  302. PTR_ADDIU "%[addr0], %[addr0], 0x10 \n\t"
  303. "bltz %[addr0], 1b \n\t"
  304. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  305. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  306. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  307. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  308. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  309. [tmp0]"=&r"(tmp[0]),
  310. RESTRICT_ASM_ALL64
  311. RESTRICT_ASM_ADDRT
  312. [addr0]"=&r"(addr[0])
  313. : [block]"r"((mips_reg)(block+nCoeffs)),
  314. [quant]"r"((mips_reg)(quant_matrix+nCoeffs)),
  315. [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
  316. [qscale]"r"(qscale)
  317. : "memory"
  318. );
  319. }
/*
 * Dequantize an MPEG-2 intra block in place (Loongson MMI SIMD).
 *
 * DC is rescaled in scalar C and restored after the asm loop.  Per lane
 * the loop computes  (|x| * intra_matrix[i] * qscale) >> 3, restores
 * sign(x), and forces zero-input lanes back to 0.  Unlike the MPEG-1
 * paths there is no (v-1)|1 odd-ification (MPEG-2 handles mismatch
 * elsewhere), and nCoeffs is raster_end without the +1.
 *
 * NOTE(review): see the flagged store/branch quirks inside the loop; they
 * differ from every sibling function and look suspect -- verify against a
 * known-good MPEG-2 decode before trusting this path.
 */
  320. void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
  321. int n, int qscale)
  322. {
  323. uint64_t nCoeffs;
  324. const uint16_t *quant_matrix;
  325. int block0;
  326. double ftmp[10];
  327. uint64_t tmp[1];
  328. mips_reg addr[1];
  329. DECLARE_VAR_ALL64;
  330. DECLARE_VAR_ADDRT;
/* NOTE(review): bare assert() here while the sibling functions use
 * av_assert2() -- inconsistent, and <assert.h> is not visibly included. */
  331. assert(s->block_last_index[n]>=0);
  332. if (s->alternate_scan)
  333. nCoeffs = 63;
  334. else
  335. nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
  336. if (n < 4)
  337. block0 = block[0] * s->y_dc_scale;
  338. else
  339. block0 = block[0] * s->c_dc_scale;
  340. quant_matrix = s->intra_matrix;
  341. __asm__ volatile (
/* ftmp0 = 0x0001 per lane (apparently unused by this loop -- presumably
 * left over from the MPEG-1 template); ftmp9 = qscale broadcast. */
  342. "dli %[tmp0], 0x0f \n\t"
  343. "pcmpeqh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  344. "mtc1 %[tmp0], %[ftmp3] \n\t"
  345. "mtc1 %[qscale], %[ftmp9] \n\t"
  346. "psrlh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
  347. "packsswh %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
  348. "packsswh %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
  349. "or %[addr0], %[nCoeffs], $0 \n\t"
  350. ".p2align 4 \n\t"
  351. "1: \n\t"
/* Load 8 coefficients plus matching quant-matrix entries. */
  352. MMI_LDXC1(%[ftmp1], %[addr0], %[block], 0x00)
  353. MMI_LDXC1(%[ftmp2], %[addr0], %[block], 0x08)
  354. "mov.d %[ftmp3], %[ftmp1] \n\t"
  355. "mov.d %[ftmp4], %[ftmp2] \n\t"
  356. MMI_LDXC1(%[ftmp5], %[addr0], %[quant], 0x00)
  357. MMI_LDXC1(%[ftmp6], %[addr0], %[quant], 0x08)
  358. "pmullh %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
  359. "pmullh %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
/* Sign masks (0 > x), |x| via xor/psubh, then multiply by matrix*qscale. */
  360. "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  361. "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
  362. "pcmpgth %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
  363. "pcmpgth %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
  364. "xor %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
  365. "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
  366. "psubh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
  367. "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
  368. "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  369. "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
/* "input was 0" masks from the saved copies. */
  370. "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
  371. "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  372. "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
  373. "dli %[tmp0], 0x03 \n\t"
  374. "pcmpeqh %[ftmp6] , %[ftmp6], %[ftmp4] \n\t"
  375. "mtc1 %[tmp0], %[ftmp3] \n\t"
/* >>3, restore signs, zero the zero-input lanes. */
  376. "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
  377. "psrah %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
  378. "xor %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
  379. "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
  380. "psubh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
  381. "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
  382. "pandn %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
  383. "pandn %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
/* NOTE(review): addr0 is advanced BEFORE these stores, so results are
 * written 16 bytes past where the operands were loaded; and blez (vs.
 * bltz in the mpeg1 loops) runs one extra iteration at addr0 == 0.
 * Both differ from every sibling function -- suspected bug, confirm. */
  384. PTR_ADDIU "%[addr0], %[addr0], 0x10 \n\t"
  385. MMI_SDXC1(%[ftmp5], %[addr0], %[block], 0x00)
  386. MMI_SDXC1(%[ftmp6], %[addr0], %[block], 0x08)
  387. "blez %[addr0], 1b \n\t"
  388. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  389. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  390. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  391. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  392. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  393. [tmp0]"=&r"(tmp[0]),
  394. RESTRICT_ASM_ALL64
  395. RESTRICT_ASM_ADDRT
  396. [addr0]"=&r"(addr[0])
  397. : [block]"r"((mips_reg)(block+nCoeffs)),
  398. [quant]"r"((mips_reg)(quant_matrix+nCoeffs)),
  399. [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
  400. [qscale]"r"(qscale)
  401. : "memory"
  402. );
/* Restore the separately-computed DC value. */
  403. block[0]= block0;
  404. }
/*
 * DCT-domain denoising for the encoder (Loongson MMI SIMD).
 *
 * For each of the 64 coefficients:
 *     a        = |block[i]|
 *     sum[i]  += a                       (error statistics, 32-bit)
 *     block[i] = sign(block[i]) * max(a - offset[i], 0)
 * where the clamp-at-zero comes from the unsigned saturating psubush.
 * Eight int16 coefficients per iteration; block/sum/offset pointers are
 * advanced in the loop and it ends when block reaches block+64 (block1).
 */
  405. void ff_denoise_dct_mmi(MpegEncContext *s, int16_t *block)
  406. {
  407. const int intra = s->mb_intra;
  408. int *sum = s->dct_error_sum[intra];
  409. uint16_t *offset = s->dct_offset[intra];
  410. double ftmp[8];
  411. mips_reg addr[1];
  412. DECLARE_VAR_ALL64;
  413. s->dct_count[intra]++;
  414. __asm__ volatile(
/* ftmp0 stays 0 for zero-extension in the punpck steps below. */
  415. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  416. "1: \n\t"
/* Load 8 coefficients; build sign masks (0 > x) and take |x|. */
  417. MMI_LDC1(%[ftmp1], %[block], 0x00)
  418. "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
  419. MMI_LDC1(%[ftmp3], %[block], 0x08)
  420. "xor %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
  421. "pcmpgth %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
  422. "pcmpgth %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
  423. "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  424. "xor %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
  425. "psubh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  426. "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
/* Save |x| (ftmp5/ftmp7) for the statistics, then subtract offset with
 * unsigned saturation so lanes clamp at 0 instead of going negative. */
  427. MMI_LDC1(%[ftmp6], %[offset], 0x00)
  428. "mov.d %[ftmp5], %[ftmp1] \n\t"
  429. "psubush %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
  430. MMI_LDC1(%[ftmp6], %[offset], 0x08)
  431. "mov.d %[ftmp7], %[ftmp3] \n\t"
  432. "psubush %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
/* Re-apply the original signs and store the denoised coefficients. */
  433. "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  434. "xor %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
  435. "psubh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  436. "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
  437. MMI_SDC1(%[ftmp1], %[block], 0x00)
  438. MMI_SDC1(%[ftmp3], %[block], 0x08)
/* Widen the saved |x| values to 32 bits (zero-extend against ftmp0)
 * and accumulate them into the int32 sum[] array. */
  439. "mov.d %[ftmp1], %[ftmp5] \n\t"
  440. "mov.d %[ftmp3], %[ftmp7] \n\t"
  441. "punpcklhw %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  442. "punpckhhw %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  443. "punpcklhw %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  444. "punpckhhw %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  445. MMI_LDC1(%[ftmp2], %[sum], 0x00)
  446. "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
  447. MMI_LDC1(%[ftmp2], %[sum], 0x08)
  448. "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  449. MMI_LDC1(%[ftmp2], %[sum], 0x10)
  450. "paddw %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
  451. MMI_LDC1(%[ftmp2], %[sum], 0x18)
  452. "paddw %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
  453. MMI_SDC1(%[ftmp5], %[sum], 0x00)
  454. MMI_SDC1(%[ftmp1], %[sum], 0x08)
  455. MMI_SDC1(%[ftmp7], %[sum], 0x10)
  456. MMI_SDC1(%[ftmp3], %[sum], 0x18)
/* Advance block (+16B = 8 coeffs), sum (+32B = 8 ints), offset (+16B);
 * loop while block is still below the block+64 sentinel. */
  457. PTR_ADDIU "%[block], %[block], 0x10 \n\t"
  458. PTR_ADDIU "%[sum], %[sum], 0x20 \n\t"
  459. PTR_SUBU "%[addr0], %[block1], %[block] \n\t"
  460. PTR_ADDIU "%[offset], %[offset], 0x10 \n\t"
  461. "bgtz %[addr0], 1b \n\t"
  462. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  463. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  464. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  465. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  466. RESTRICT_ASM_ALL64
  467. [addr0]"=&r"(addr[0]),
  468. [block]"+&r"(block), [sum]"+&r"(sum),
  469. [offset]"+&r"(offset)
  470. : [block1]"r"(block+64)
  471. : "memory"
  472. );
  473. }