/*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "idctdsp_mips.h"
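
/* In-place 8x8 inverse DCT on block, mirroring FFmpeg's scalar simple_idct:
 * the coefficients are transposed so each vector lane carries one row, run
 * through a first 1D pass with an 11-bit down-shift, transposed back, and
 * run through a second pass with a 20-bit down-shift. */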
static void simple_idct_msa(int16_t *block)
{
    int32_t const_val;
    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 w1, w3, w5, w7;
    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
    v4i32 w2, w4, w6;
    v8i16 select_vec, temp;
    v8i16 zero = { 0 };
    v4i32 const_val0 = __msa_ldi_w(1);
    v4i32 const_val1 = __msa_ldi_w(1);
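
    /* Rounding biases: const_val0 becomes 1 << 10, the first-pass bias;
     * const_val1 is filled with 16383 * ((1 << 19) / 16383), the
     * second-pass bias pre-scaled by the DC weight W4 = 16383 so it can
     * be added after the multiply by W4. */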
    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    const_val0 <<= 10;
    const_val = 16383 * ((1 << 19) / 16383);
    const_val1 = __msa_insert_w(const_val0, 0, const_val);
    const_val1 = __msa_splati_w(const_val1, 0);
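
    /* Transpose so each lane holds one row. select_vec goes all-ones for
     * rows with no nonzero AC coefficient; those rows take the DC-only
     * shortcut (dc << 3), blended in with __msa_bmnz_v after the pass. */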
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    temp = in0 << 3;
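
    /* Even part: widen w2/w4/w6 to 32 bits and fold coefficients
     * 0, 2, 4 and 6 into the four even accumulators a0..a3. */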
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
         temp1_r, temp1_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
         temp2_r, temp2_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
         a1_r, a1_l, a2_r, a2_l);
    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
         a3_r, a3_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
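
    /* Odd part: interleave coefficients 1/3 and 5/7 and form dot products
     * against paired w1/w3/w5/w7 weights, accumulating into b0..b3. */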
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
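
    /* Combine the even and odd halves, shift down by 11, pack back to
     * 16 bits and substitute the DC-only shortcut where select_vec is set. */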
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
                temp2_l, temp2_r, temp3_l, temp3_r,
                temp0_r, temp1_r, temp2_r, temp3_r);
    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
                               (v16u8) select_vec);
    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
                               (v16u8) select_vec);
    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
                               (v16u8) select_vec);
    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
                               (v16u8) select_vec);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
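
    /* Second pass over the transposed data, with the larger bias and a
     * 20-bit down-shift; results are stored back to block. */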
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
         temp1_r, temp1_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
         temp2_r, temp2_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
         a1_r, a1_l, a2_r, a2_l);
    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
         a3_r, a3_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    ST_SW8(temp0_r, temp1_r, temp2_r, temp3_r, a3_r, a2_r, a1_r, a0_r,
           block, 8);
}
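
/* Same transform as simple_idct_msa, but the final rows are clipped to
 * [0, 255], packed to bytes and written to dst instead of back to block. */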
static void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride,
                                int16_t *block)
{
    int32_t const_val;
    uint64_t tmp0, tmp1, tmp2, tmp3;
    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 w1, w3, w5, w7;
    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
    v4i32 w2, w4, w6;
    v8i16 select_vec, temp;
    v8i16 zero = { 0 };
    v4i32 const_val0 = __msa_ldi_w(1);
    v4i32 const_val1 = __msa_ldi_w(1);

    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    const_val0 <<= 10;
    const_val = 16383 * ((1 << 19) / 16383);
    const_val1 = __msa_insert_w(const_val0, 0, const_val);
    const_val1 = __msa_splati_w(const_val1, 0);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    temp = in0 << 3;
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
                temp2_l, temp2_r, temp3_l, temp3_r,
                temp0_r, temp1_r, temp2_r, temp3_r);
    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
                               (v16u8) select_vec);
    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
                               (v16u8) select_vec);
    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
                               (v16u8) select_vec);
    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
                               (v16u8) select_vec);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
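
    /* Second pass with the 20-bit shift; the upper four output rows land
     * in temp0_r..temp3_r and the lower four, in reverse order, in
     * a3_r..a0_r. */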
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
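
    /* Clip to [0, 255], pack the 16-bit rows to bytes and store eight
     * 8-byte rows, four at a time. */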
    temp0_r = (v4i32) CLIP_SH_0_255(temp0_r);
    temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
    temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
    temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
    PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
                temp2_r, temp2_r, temp3_r, temp3_r,
                temp0_r, temp1_r, temp2_r, temp3_r);
    tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1);
    tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
    tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
    tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    dst += 4 * dst_stride;
    a0_r = (v4i32) CLIP_SH_0_255(a0_r);
    a1_r = (v4i32) CLIP_SH_0_255(a1_r);
    a2_r = (v4i32) CLIP_SH_0_255(a2_r);
    a3_r = (v4i32) CLIP_SH_0_255(a3_r);
    PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
                a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
    tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
    tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
    tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
    tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    dst += 4 * dst_stride;
}
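
/* Like simple_idct_put_msa, but the transformed residual is added to the
 * pixels already in dst before clipping and storing. The odd-row constants
 * const0..const7 are computed once up front and reused by both passes. */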
static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
                                int16_t *block)
{
    int32_t const_val;
    uint64_t tmp0, tmp1, tmp2, tmp3;
    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 w1, w3, w5, w7;
    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
    v4i32 temp4_r, temp5_r, temp6_r, temp7_r, temp8_r;
    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
    v4i32 temp4_l, temp5_l, temp6_l, temp7_l, temp8_l;
    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
    v4i32 w2, w4, w6;
    v8i16 select_vec, temp;
    v8i16 zero = { 0 };
    v4i32 const_val0 = __msa_ldi_w(1);
    v4i32 const_val1 = __msa_ldi_w(1);

    const_val0 <<= 10;
    const_val = 16383 * ((1 << 19) / 16383);
    const_val1 = __msa_insert_w(const_val0, 0, const_val);
    const_val1 = __msa_splati_w(const_val1, 0);
    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    UNPCK_SH_SW(in4, temp4_r, temp4_l);
    UNPCK_SH_SW(in6, temp7_r, temp7_l);
    ILVRL_H2_SW(in5, in7, temp8_r, temp8_l);
    temp = in0 << 3;
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp8_r, temp8_r, temp8_r, temp8_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    DPADD_SH4_SW(temp8_l, temp8_l, temp8_l, temp8_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    MUL2(temp4_r, w4, temp4_l, w4, temp4_r, temp4_l);
    MUL2(temp7_r, w2, temp7_l, w2, temp6_r, temp6_l);
    MUL2(temp7_r, w6, temp7_l, w6, temp5_r, temp5_l);
    ADD2(a0_r, temp4_r, a0_l, temp4_l, a0_r, a0_l);
    SUB2(a1_r, temp4_r, a1_l, temp4_l, a1_r, a1_l);
    SUB2(a2_r, temp4_r, a2_l, temp4_l, a2_r, a2_l);
    ADD2(a3_r, temp4_r, a3_l, temp4_l, a3_r, a3_l);
    ADD2(a0_r, temp5_r, a0_l, temp5_l, a0_r, a0_l);
    SUB2(a1_r, temp6_r, a1_l, temp6_l, a1_r, a1_l);
    ADD2(a2_r, temp6_r, a2_l, temp6_l, a2_r, a2_l);
    SUB2(a3_r, temp5_r, a3_l, temp5_l, a3_r, a3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
                temp2_l, temp2_r, temp3_l, temp3_r,
                temp0_r, temp1_r, temp2_r, temp3_r);
    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
                               (v16u8) select_vec);
    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
                               (v16u8) select_vec);
    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
                               (v16u8) select_vec);
    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
                               (v16u8) select_vec);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
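
    /* Second pass, reusing const0..const7 and w2/w4/w6 from above. */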
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
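
    /* Shift, add the prediction pixels loaded from dst and widened to
     * 16 bits, clip to [0, 255] and store the reconstructed rows. */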
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
    LD_SH4(dst, dst_stride, in0, in1, in2, in3);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
    ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3,
               temp0_l, temp1_l, temp2_l, temp3_l);
    temp0_r = (v4i32) ((v8i16) (temp0_r) + (v8i16) (temp0_l));
    temp1_r = (v4i32) ((v8i16) (temp1_r) + (v8i16) (temp1_l));
    temp2_r = (v4i32) ((v8i16) (temp2_r) + (v8i16) (temp2_l));
    temp3_r = (v4i32) ((v8i16) (temp3_r) + (v8i16) (temp3_l));
    temp0_r = (v4i32) CLIP_SH_0_255(temp0_r);
    temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
    temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
    temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
    PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
                temp2_r, temp2_r, temp3_r, temp3_r,
                temp0_r, temp1_r, temp2_r, temp3_r);
    tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1);
    tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
    tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
    tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
    LD_SH4(dst + 4 * dst_stride, dst_stride, in4, in5, in6, in7);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7,
               a3_l, a2_l, a1_l, a0_l);
    a3_r = (v4i32) ((v8i16) (a3_r) + (v8i16) (a3_l));
    a2_r = (v4i32) ((v8i16) (a2_r) + (v8i16) (a2_l));
    a1_r = (v4i32) ((v8i16) (a1_r) + (v8i16) (a1_l));
    a0_r = (v4i32) ((v8i16) (a0_r) + (v8i16) (a0_l));
    a3_r = (v4i32) CLIP_SH_0_255(a3_r);
    a2_r = (v4i32) CLIP_SH_0_255(a2_r);
    a1_r = (v4i32) CLIP_SH_0_255(a1_r);
    a0_r = (v4i32) CLIP_SH_0_255(a0_r);
    PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
                a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
    tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
    tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
    tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
    tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
    SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
}
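
/* Thin exported wrappers around the static implementations above. */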
void ff_simple_idct_msa(int16_t *block)
{
    simple_idct_msa(block);
}

void ff_simple_idct_put_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
{
    simple_idct_put_msa(dst, dst_stride, block);
}

void ff_simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
{
    simple_idct_add_msa(dst, dst_stride, block);
}