/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};
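
/* Note: the two rows above are byte-shuffle patterns consumed by the
 * VSHF_B4_SB() calls below.  The "8 width" row gathers overlapping byte
 * pairs (0,1  1,2  2,3 ...) from a single source row; the "4 width" row
 * uses indices >= 16 to pick bytes from the second shuffle operand, so a
 * pair of 4-pixel rows can be filtered together in one vector. */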

#define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w, \
                                       out0_h, out1_h) \
{ \
    v4i32 in0_r_m, in0_l_m, in1_r_m, in1_l_m; \
 \
    ILVRL_H2_SW(in0_h, in0_h, in0_r_m, in0_l_m); \
    ILVRL_H2_SW(in1_h, in1_h, in1_r_m, in1_l_m); \
    DOTP_SH4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, wgt_w, wgt_w, wgt_w, \
                wgt_w, in0_r_m, in1_r_m, in0_l_m, in1_l_m); \
    SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w); \
    PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h); \
    ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h); \
    CLIP_SH2_0_255_MAX_SATU(out0_h, out1_h); \
}

#define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w, \
                                       offset_h, rnd_w, out0_h, out1_h, \
                                       out2_h, out3_h) \
{ \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w, \
                                   out0_h, out1_h); \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w, \
                                   out2_h, out3_h); \
}
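
/* Per 16-bit input sample the macros above compute, roughly,
 *     out = clip_uint8(((int32_t) in * weight + (1 << (rnd - 1))) >> rnd) + offset
 * i.e. widen to 32 bit, multiply by the weight, apply a rounding right
 * shift by rnd, pack back to 16 bit, add the offset and saturate to 0..255. */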

static void hevc_uniwgt_copy_4w_msa(uint8_t *src,
                                    int32_t src_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight,
                                    int32_t offset,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    v16i8 zero = { 0 };
    v16u8 out0, out1;
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 dst0, dst1, dst2, dst3, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    if (2 == height) {
        v4i32 dst0_r, dst0_l;

        LW2(src, src_stride, tp0, tp1);
        INSERT_W2_SB(tp0, tp1, src0);
        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 <<= 6;
        ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
        DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
        dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        dst0 += offset_vec;
        dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST4x2_UB(out0, dst, dst_stride);
    } else if (4 == height) {
        LW4(src, src_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
                                       rnd_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == (height % 8)) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
            LW4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST4x8_UB(out0, out1, dst, dst_stride);
            dst += 8 * dst_stride;
        }
    }
}
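
/* In the copy (no-filter) paths the 8-bit source pixels are widened and
 * shifted left by 6 so that they sit at the same 14-bit intermediate
 * precision an interpolation filter would produce; the weight / round /
 * offset stage can then be shared between both kinds of input. */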

static void hevc_uniwgt_copy_6w_msa(uint8_t *src,
                                    int32_t src_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight,
                                    int32_t offset,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src2);
        INSERT_D2_SB(tp2, tp3, src3);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST6x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
        ST6x4_UB(out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_uniwgt_copy_8w_msa(uint8_t *src,
                                    int32_t src_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight,
                                    int32_t offset,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    if (2 == height) {
        LD2(src, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src0);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
                                       rnd_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST8x2_UB(out0, dst, dst_stride);
    } else if (4 == height) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += 4 * src_stride;
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD2(src, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src2);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
        ST8x2_UB(out2, dst, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src0);
            INSERT_D2_SB(tp2, tp3, src1);
            LD4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src2);
            INSERT_D2_SB(tp2, tp3, src3);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            ILVRL_B2_SH(zero, src2, dst4, dst5);
            ILVRL_B2_SH(zero, src3, dst6, dst7);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            SLLI_4V(dst4, dst5, dst6, dst7, 6);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                           offset_vec, rnd_vec, dst4, dst5,
                                           dst6, dst7);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
            ST8x4_UB(out0, out1, dst, dst_stride);
            dst += (4 * dst_stride);
            ST8x4_UB(out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void hevc_uniwgt_copy_12w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 offset_vec;
    v16i8 zero = { 0 };
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST12x4_UB(out0, out1, out2, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_uniwgt_copy_16w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_uniwgt_copy_24w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v8i16 dst8, dst9, dst10, dst11;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src4, src5);
        LD_SB4(src + 16, src_stride, src2, src3, src6, src7);
        src += (4 * src_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVRL_B2_SH(zero, src4, dst6, dst7);
        ILVRL_B2_SH(zero, src5, dst8, dst9);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST8x4_UB(out2, out5, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_uniwgt_copy_32w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        LD_SB2(src + 16, src_stride, src2, src3);
        src += (2 * src_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB2(out0, out1, dst, dst_stride);
        ST_UB2(out2, out3, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void hevc_uniwgt_copy_48w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, offset_vec;
    v8i16 dst6, dst7, dst8, dst9, dst10, dst11;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src += src_stride;
        LD_SB3(src, 16, src3, src4, src5);
        src += src_stride;
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        ILVRL_B2_SH(zero, src4, dst8, dst9);
        ILVRL_B2_SH(zero, src5, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
        ST_UB2(out3, out4, dst, 16);
        ST_UB(out5, dst + 32);
        dst += dst_stride;
    }
}

static void hevc_uniwgt_copy_64w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5, out6, out7;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v8i16 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_SB4(src, 16, src4, src5, src6, src7);
        src += src_stride;
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        ILVRL_B2_SH(zero, src4, dst8, dst9);
        ILVRL_B2_SH(zero, src5, dst10, dst11);
        ILVRL_B2_SH(zero, src6, dst12, dst13);
        ILVRL_B2_SH(zero, src7, dst14, dst15);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        SLLI_4V(dst12, dst13, dst14, dst15, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst12, dst13, dst14, dst15, weight_vec,
                                       offset_vec, rnd_vec, dst12, dst13,
                                       dst14, dst15);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
        PCKEV_B2_UB(dst13, dst12, dst15, dst14, out6, out7);
        ST_UB4(out0, out1, out2, out3, dst, 16);
        dst += dst_stride;
        ST_UB4(out4, out5, out6, out7, dst, 16);
        dst += dst_stride;
    }
}

static void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 mask0, mask1, mask2, mask3, vec11, vec12, vec13, vec14, vec15;
    v8i16 filter_vec, dst01, dst23, dst45, dst67;
    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
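
    /* Note on the weight * 128 / rnd_val - 6 folding used in all filter
     * paths: the source bytes are moved to signed range with XORI_B*_128
     * (i.e. 128 is subtracted from every pixel).  The 8-tap filter taps sum
     * to 64, so each filtered sample carries a bias of -128 * 64 = -8192.
     * Adding (weight * 128) >> (rnd_val - 6) == (8192 * weight) >> rnd_val
     * into offset_vec here cancels that bias after the weighted rounding
     * shift. */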

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                  filt3);
        dst45 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                  filt2, filt3);
        dst67 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                  filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST4x8_UB(out0, out1, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                 filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_12w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 filter_vec;
    v8i16 dst01, dst23, dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                 filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);
        VSHF_B4_SB(src4, src5, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src6, src7, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
        dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                  filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst01, dst23, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST8x4_UB(out0, out1, dst, dst_stride);
        ST4x4_UB(out2, out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_16w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                 filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_UB2(out0, out1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_24w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += src_stride;
        LD_SB2(src, 16, src2, src3);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                 filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);
        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst4, dst3, dst5, dst2, out0, out1, out2);
        ST_UB2(out0, out1, dst, dst_stride);
        ST8x2_UB(out2, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_32w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 filter_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB4(src, 8, src0, src1, src2, src3);
        src += src_stride;
        LD_SB4(src, 8, src4, src5, src6, src7);
        src += src_stride;
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                 filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                 filt2, filt3);
        dst7 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
        dst += dst_stride;
    }
}

static void hevc_hz_uniwgt_8t_48w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src3 = LD_SB(src + 40);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                 filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
    }
}

static void hevc_hz_uniwgt_8t_64w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        for (cnt = 2; cnt--;) {
            LD_SB2(src_tmp, 16, src0, src1);
            src2 = LD_SB(src_tmp + 24);
            src_tmp += 32;
            XORI_B3_128_SB(src0, src1, src2);

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec4, vec5, vec6, vec7);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec8, vec9, vec10, vec11);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec12, vec13, vec14, vec15);
            dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1,
                                     filt2, filt3);
            dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                     filt2, filt3);
            dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                     filt2, filt3);

            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);

            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_UB2(out0, out1, dst_tmp, 16);
            dst_tmp += 32;
        }

        src += src_stride;
        dst += dst_stride;
    }
}

static void hevc_vt_uniwgt_8t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 filter_vec, dst01, dst23, dst45, dst67;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src += (8 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);

        dst01 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
                                  filt1, filt2, filt3);
        dst23 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
                                  filt1, filt2, filt3);
        dst45 = HEVC_FILT_8TAP_SH(src6554, src8776, src10998, src12111110,
                                  filt0, filt1, filt2, filt3);
        dst67 = HEVC_FILT_8TAP_SH(src8776, src10998, src12111110, src14131312,
                                  filt0, filt1, filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST4x8_UB(out0, out1, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}
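
/* In the vertical paths, the interleaved row pairs are rotated at the end of
 * each iteration (e.g. src2110 = src10998, src6 = src14 above), so only the
 * newly needed source rows have to be loaded and interleaved on the next
 * pass of the 8-tap window. */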

static void hevc_vt_uniwgt_8t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec;
    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                 filt1, filt2, filt3);
        dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                 filt1, filt2, filt3);
        dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                 filt1, filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                 filt1, filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

static void hevc_vt_uniwgt_8t_12w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 weight_vec_h, offset_vec, denom_vec, filter_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);

        dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                 filt1, filt2, filt3);
        dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                 filt1, filt2, filt3);
        dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                 filt1, filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                 filt1, filt2, filt3);
        dst4 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
                                 filt1, filt2, filt3);
        dst5 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
                                 filt1, filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST8x4_UB(out0, out1, dst, dst_stride);
        ST4x4_UB(out2, out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}
  1373. static void hevc_vt_uniwgt_8t_16multx4mult_msa(uint8_t *src,
  1374. int32_t src_stride,
  1375. uint8_t *dst,
  1376. int32_t dst_stride,
  1377. const int8_t *filter,
  1378. int32_t height,
  1379. int32_t weight,
  1380. int32_t offset,
  1381. int32_t rnd_val,
  1382. int32_t weightmul16)
  1383. {
  1384. uint8_t *src_tmp;
  1385. uint8_t *dst_tmp;
  1386. int32_t loop_cnt, cnt;
  1387. v16u8 out0, out1, out2, out3;
  1388. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  1389. v16i8 src10_r, src32_r, src54_r, src76_r;
  1390. v16i8 src21_r, src43_r, src65_r, src87_r;
  1391. v16i8 src10_l, src32_l, src54_l, src76_l;
  1392. v16i8 src21_l, src43_l, src65_l, src87_l;
  1393. v16i8 src98_r, src109_r, src98_l, src109_l;
  1394. v8i16 filt0, filt1, filt2, filt3;
  1395. v8i16 filter_vec;
  1396. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  1397. v8i16 weight_vec_h, offset_vec, denom_vec;
  1398. v4i32 weight_vec, rnd_vec;
  1399. src -= (3 * src_stride);
  1400. weight_vec = __msa_fill_w(weight);
  1401. rnd_vec = __msa_fill_w(rnd_val);
  1402. weight *= 128;
  1403. rnd_val -= 6;
  1404. weight_vec_h = __msa_fill_h(weight);
  1405. offset_vec = __msa_fill_h(offset);
  1406. denom_vec = __msa_fill_h(rnd_val);
  1407. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  1408. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  1409. filter_vec = LD_SH(filter);
  1410. SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  1411. for (cnt = weightmul16; cnt--;) {
  1412. src_tmp = src;
  1413. dst_tmp = dst;
  1414. LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
  1415. src_tmp += (7 * src_stride);
  1416. XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  1417. for (loop_cnt = (height >> 2); loop_cnt--;) {
  1418. LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
  1419. src_tmp += (4 * src_stride);
  1420. XORI_B4_128_SB(src7, src8, src9, src10);
  1421. ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
  1422. src10_r, src32_r, src54_r, src21_r);
  1423. ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
  1424. ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
  1425. src10_l, src32_l, src54_l, src21_l);
  1426. ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
  1427. ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
  1428. src76_r, src87_r, src98_r, src109_r);
  1429. ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
  1430. src76_l, src87_l, src98_l, src109_l);
  1431. dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
  1432. filt1, filt2, filt3);
  1433. dst1 = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
  1434. filt1, filt2, filt3);
  1435. dst2 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
  1436. filt1, filt2, filt3);
  1437. dst3 = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
  1438. filt1, filt2, filt3);
  1439. dst4 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
  1440. filt1, filt2, filt3);
  1441. dst5 = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
  1442. filt1, filt2, filt3);
  1443. dst6 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
  1444. filt1, filt2, filt3);
  1445. dst7 = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
  1446. filt1, filt2, filt3);
  1447. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
  1448. offset_vec, rnd_vec, dst0, dst1,
  1449. dst2, dst3);
  1450. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
  1451. offset_vec, rnd_vec, dst4, dst5,
  1452. dst6, dst7);
  1453. PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
  1454. PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
  1455. ST_UB4(out0, out1, out2, out3, dst_tmp, dst_stride);
  1456. dst_tmp += (4 * dst_stride);
  1457. src0 = src4;
  1458. src1 = src5;
  1459. src2 = src6;
  1460. src3 = src7;
  1461. src4 = src8;
  1462. src5 = src9;
  1463. src6 = src10;
  1464. }
  1465. src += 16;
  1466. dst += 16;
  1467. }
  1468. }
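/* The fixed-width wrappers below feed the 16-column stripe routine above
 * with weightmul16 = width / 16.  The 24- and 48-wide variants pass
 * hard-coded heights (32 and 64), apparently the only heights those widths
 * are used with here; the leftover 8 columns of the 24-wide case go through
 * hevc_vt_uniwgt_8t_8w_msa.
 */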
  1469. static void hevc_vt_uniwgt_8t_16w_msa(uint8_t *src,
  1470. int32_t src_stride,
  1471. uint8_t *dst,
  1472. int32_t dst_stride,
  1473. const int8_t *filter,
  1474. int32_t height,
  1475. int32_t weight,
  1476. int32_t offset,
  1477. int32_t rnd_val)
  1478. {
  1479. hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
  1480. filter, height, weight,
  1481. offset, rnd_val, 1);
  1482. }
  1483. static void hevc_vt_uniwgt_8t_24w_msa(uint8_t *src,
  1484. int32_t src_stride,
  1485. uint8_t *dst,
  1486. int32_t dst_stride,
  1487. const int8_t *filter,
  1488. int32_t height,
  1489. int32_t weight,
  1490. int32_t offset,
  1491. int32_t rnd_val)
  1492. {
  1493. hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
  1494. filter, 32, weight,
  1495. offset, rnd_val, 1);
  1496. hevc_vt_uniwgt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
  1497. filter, 32, weight, offset, rnd_val);
  1498. }
  1499. static void hevc_vt_uniwgt_8t_32w_msa(uint8_t *src,
  1500. int32_t src_stride,
  1501. uint8_t *dst,
  1502. int32_t dst_stride,
  1503. const int8_t *filter,
  1504. int32_t height,
  1505. int32_t weight,
  1506. int32_t offset,
  1507. int32_t rnd_val)
  1508. {
  1509. hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
  1510. filter, height, weight,
  1511. offset, rnd_val, 2);
  1512. }
  1513. static void hevc_vt_uniwgt_8t_48w_msa(uint8_t *src,
  1514. int32_t src_stride,
  1515. uint8_t *dst,
  1516. int32_t dst_stride,
  1517. const int8_t *filter,
  1518. int32_t height,
  1519. int32_t weight,
  1520. int32_t offset,
  1521. int32_t rnd_val)
  1522. {
  1523. hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
  1524. filter, 64, weight,
  1525. offset, rnd_val, 3);
  1526. }
  1527. static void hevc_vt_uniwgt_8t_64w_msa(uint8_t *src,
  1528. int32_t src_stride,
  1529. uint8_t *dst,
  1530. int32_t dst_stride,
  1531. const int8_t *filter,
  1532. int32_t height,
  1533. int32_t weight,
  1534. int32_t offset,
  1535. int32_t rnd_val)
  1536. {
  1537. hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
  1538. filter, height, weight,
  1539. offset, rnd_val, 4);
  1540. }
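/* Horizontal-plus-vertical (2-D separable) 8-tap filtering with
 * uni-directional weighting.  The horizontal pass produces 16-bit
 * intermediates, the vertical pass accumulates them in 32 bits and shifts
 * right by 6 before the weight/round/offset/clip step.  The same -128 input
 * bias as in the 1-D paths is folded into offset_vec via
 * (128 * weight) >> (rnd_val - 6).  The 4-wide variant uses the second half
 * of ff_hevc_mask_arr, so each shuffle gathers two source rows into one
 * vector.
 */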
  1541. static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
  1542. int32_t src_stride,
  1543. uint8_t *dst,
  1544. int32_t dst_stride,
  1545. const int8_t *filter_x,
  1546. const int8_t *filter_y,
  1547. int32_t height,
  1548. int32_t weight,
  1549. int32_t offset,
  1550. int32_t rnd_val)
  1551. {
  1552. uint32_t loop_cnt;
  1553. v16u8 out;
  1554. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  1555. v8i16 filt0, filt1, filt2, filt3;
  1556. v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
  1557. v16i8 mask1, mask2, mask3;
  1558. v8i16 filter_vec;
  1559. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  1560. v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
  1561. v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
  1562. v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
  1563. v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
  1564. v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
  1565. v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
  1566. v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
  1567. src -= ((3 * src_stride) + 3);
  1568. filter_vec = LD_SH(filter_x);
  1569. SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  1570. filter_vec = LD_SH(filter_y);
  1571. UNPCK_R_SB_SH(filter_vec, filter_vec);
  1572. SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
  1573. mask1 = mask0 + 2;
  1574. mask2 = mask0 + 4;
  1575. mask3 = mask0 + 6;
  1576. weight_vec = __msa_fill_w(weight);
  1577. offset_vec = __msa_fill_w(offset);
  1578. rnd_vec = __msa_fill_w(rnd_val);
  1579. denom_vec = rnd_vec - 6;
  1580. const_128 = __msa_ldi_w(128);
  1581. const_128 *= weight_vec;
  1582. offset_vec += __msa_srar_w(const_128, denom_vec);
  1583. LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  1584. src += (7 * src_stride);
  1585. XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  1586. /* row 0 row 1 row 2 row 3 */
  1587. VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
  1588. VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
  1589. VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
  1590. vec8, vec9, vec10, vec11);
  1591. VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
  1592. vec12, vec13, vec14, vec15);
  1593. dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
  1594. filt3);
  1595. dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
  1596. filt3);
  1597. dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
  1598. filt3);
  1599. dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
  1600. filt3);
  1601. ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
  1602. ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
  1603. ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
  1604. dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
  1605. for (loop_cnt = height >> 2; loop_cnt--;) {
  1606. LD_SB4(src, src_stride, src7, src8, src9, src10);
  1607. src += (4 * src_stride);
  1608. XORI_B4_128_SB(src7, src8, src9, src10);
  1609. VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
  1610. vec0, vec1, vec2, vec3);
  1611. VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
  1612. vec4, vec5, vec6, vec7);
  1613. dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
  1614. filt3);
  1615. dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
  1616. filt3);
  1617. dst76_r = __msa_ilvr_h(dst97, dst66);
  1618. ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
  1619. dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
  1620. dst98_r = __msa_ilvr_h(dst66, dst108);
  1621. dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
  1622. filt_h1, filt_h2, filt_h3);
  1623. dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
  1624. filt_h1, filt_h2, filt_h3);
  1625. dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
  1626. filt_h1, filt_h2, filt_h3);
  1627. dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
  1628. filt_h1, filt_h2, filt_h3);
  1629. SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
  1630. MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
  1631. MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
  1632. SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
  1633. ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
  1634. ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
  1635. CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst2_r, dst3_r);
  1636. PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
  1637. out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
  1638. ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
  1639. dst += (4 * dst_stride);
  1640. dst10_r = dst54_r;
  1641. dst32_r = dst76_r;
  1642. dst54_r = dst98_r;
  1643. dst21_r = dst65_r;
  1644. dst43_r = dst87_r;
  1645. dst65_r = dst109_r;
  1646. dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
  1647. }
  1648. }
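/* Generic HV 8-tap uni-weight routine for widths that are multiples of 8:
 * each 8-column stripe keeps the last seven rows of 16-bit horizontal
 * results live in registers and produces two output rows per iteration of
 * the inner loop.
 */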
  1649. static void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src,
  1650. int32_t src_stride,
  1651. uint8_t *dst,
  1652. int32_t dst_stride,
  1653. const int8_t *filter_x,
  1654. const int8_t *filter_y,
  1655. int32_t height,
  1656. int32_t weight,
  1657. int32_t offset,
  1658. int32_t rnd_val,
  1659. int32_t width)
  1660. {
  1661. uint32_t loop_cnt, cnt;
  1662. uint8_t *src_tmp;
  1663. uint8_t *dst_tmp;
  1664. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  1665. v8i16 filt0, filt1, filt2, filt3;
  1666. v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
  1667. v16i8 mask1, mask2, mask3;
  1668. v8i16 filter_vec;
  1669. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  1670. v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
  1671. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
  1672. v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
  1673. v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
  1674. v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
  1675. v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
  1676. v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
  1677. v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
  1678. v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
  1679. src -= ((3 * src_stride) + 3);
  1680. weight_vec = __msa_fill_w(weight);
  1681. offset_vec = __msa_fill_w(offset);
  1682. rnd_vec = __msa_fill_w(rnd_val);
  1683. denom_vec = rnd_vec - 6;
  1684. const_128 = __msa_ldi_w(128);
  1685. const_128 *= weight_vec;
  1686. offset_vec += __msa_srar_w(const_128, denom_vec);
  1687. filter_vec = LD_SH(filter_x);
  1688. SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  1689. filter_vec = LD_SH(filter_y);
  1690. UNPCK_R_SB_SH(filter_vec, filter_vec);
  1691. SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
  1692. mask1 = mask0 + 2;
  1693. mask2 = mask0 + 4;
  1694. mask3 = mask0 + 6;
  1695. for (cnt = width >> 3; cnt--;) {
  1696. src_tmp = src;
  1697. dst_tmp = dst;
  1698. LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
  1699. src_tmp += (7 * src_stride);
  1700. XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  1701. VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
  1702. vec0, vec1, vec2, vec3);
  1703. VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
  1704. vec4, vec5, vec6, vec7);
  1705. VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
  1706. vec8, vec9, vec10, vec11);
  1707. VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
  1708. vec12, vec13, vec14, vec15);
  1709. dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
  1710. filt3);
  1711. dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
  1712. filt3);
  1713. dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
  1714. filt3);
  1715. dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
  1716. filt2, filt3);
  1717. VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
  1718. vec0, vec1, vec2, vec3);
  1719. VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
  1720. vec4, vec5, vec6, vec7);
  1721. VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
  1722. vec8, vec9, vec10, vec11);
  1723. dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
  1724. filt3);
  1725. dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
  1726. filt3);
  1727. dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
  1728. filt3);
  1729. ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
  1730. dst10_r, dst32_r, dst54_r, dst21_r);
  1731. ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
  1732. ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
  1733. dst10_l, dst32_l, dst54_l, dst21_l);
  1734. ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
  1735. for (loop_cnt = height >> 1; loop_cnt--;) {
  1736. LD_SB2(src_tmp, src_stride, src7, src8);
  1737. src_tmp += 2 * src_stride;
  1738. XORI_B2_128_SB(src7, src8);
  1739. VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
  1740. vec0, vec1, vec2, vec3);
  1741. dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
  1742. filt2, filt3);
  1743. ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
  1744. dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
  1745. filt_h0, filt_h1, filt_h2, filt_h3);
  1746. dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
  1747. filt_h0, filt_h1, filt_h2, filt_h3);
  1748. dst0_r >>= 6;
  1749. dst0_l >>= 6;
  1750. /* row 8 */
  1751. VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
  1752. vec0, vec1, vec2, vec3);
  1753. dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
  1754. filt2, filt3);
  1755. ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
  1756. dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
  1757. filt_h0, filt_h1, filt_h2, filt_h3);
  1758. dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
  1759. filt_h0, filt_h1, filt_h2, filt_h3);
  1760. dst1_r >>= 6;
  1761. dst1_l >>= 6;
  1762. MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
  1763. MUL2(dst1_r, weight_vec, dst1_l, weight_vec, dst1_r, dst1_l);
  1764. SRAR_W4_SW(dst0_r, dst1_r, dst0_l, dst1_l, rnd_vec);
  1765. ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
  1766. ADD2(dst1_r, offset_vec, dst1_l, offset_vec, dst1_r, dst1_l);
  1767. CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst0_l, dst1_l);
  1768. PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
  1769. dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
  1770. ST8x2_UB(dst0_r, dst_tmp, dst_stride);
  1771. dst_tmp += (2 * dst_stride);
  1772. dst10_r = dst32_r;
  1773. dst32_r = dst54_r;
  1774. dst54_r = dst76_r;
  1775. dst10_l = dst32_l;
  1776. dst32_l = dst54_l;
  1777. dst54_l = dst76_l;
  1778. dst21_r = dst43_r;
  1779. dst43_r = dst65_r;
  1780. dst65_r = dst87_r;
  1781. dst21_l = dst43_l;
  1782. dst43_l = dst65_l;
  1783. dst65_l = dst87_l;
  1784. dst6 = dst8;
  1785. }
  1786. src += 8;
  1787. dst += 8;
  1788. }
  1789. }
  1790. static void hevc_hv_uniwgt_8t_8w_msa(uint8_t *src,
  1791. int32_t src_stride,
  1792. uint8_t *dst,
  1793. int32_t dst_stride,
  1794. const int8_t *filter_x,
  1795. const int8_t *filter_y,
  1796. int32_t height,
  1797. int32_t weight,
  1798. int32_t offset,
  1799. int32_t rnd_val)
  1800. {
  1801. hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
  1802. filter_x, filter_y, height, weight,
  1803. offset, rnd_val, 8);
  1804. }
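/* 12-wide HV case: the left eight columns are run like an 8-wide stripe for
 * all 16 rows first, then the remaining four columns are filtered with the
 * paired-row masks from ff_hevc_mask_arr + 16, four output rows per
 * iteration.
 */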
  1805. static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src,
  1806. int32_t src_stride,
  1807. uint8_t *dst,
  1808. int32_t dst_stride,
  1809. const int8_t *filter_x,
  1810. const int8_t *filter_y,
  1811. int32_t height,
  1812. int32_t weight,
  1813. int32_t offset,
  1814. int32_t rnd_val)
  1815. {
  1816. uint32_t loop_cnt;
  1817. uint8_t *src_tmp, *dst_tmp;
  1818. v16u8 out;
  1819. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  1820. v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
  1821. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  1822. v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
  1823. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  1824. v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
  1825. v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
  1826. v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst10_l, dst32_l, dst54_l;
  1827. v8i16 dst98_r, dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
  1828. v8i16 dst76_l, filter_vec;
  1829. v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
  1830. v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
  1831. src -= ((3 * src_stride) + 3);
  1832. filter_vec = LD_SH(filter_x);
  1833. SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  1834. filter_vec = LD_SH(filter_y);
  1835. UNPCK_R_SB_SH(filter_vec, filter_vec);
  1836. SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
  1837. weight_vec = __msa_fill_w(weight);
  1838. offset_vec = __msa_fill_w(offset);
  1839. rnd_vec = __msa_fill_w(rnd_val);
  1840. denom_vec = rnd_vec - 6;
  1841. const_128 = __msa_ldi_w(128);
  1842. const_128 *= weight_vec;
  1843. offset_vec += __msa_srar_w(const_128, denom_vec);
  1844. mask0 = LD_SB(ff_hevc_mask_arr);
  1845. mask1 = mask0 + 2;
  1846. mask2 = mask0 + 4;
  1847. mask3 = mask0 + 6;
  1848. src_tmp = src;
  1849. dst_tmp = dst;
  1850. LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
  1851. src_tmp += (7 * src_stride);
  1852. XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  1853. /* row 0 row 1 row 2 row 3 */
  1854. VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
  1855. VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
  1856. VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
  1857. vec11);
  1858. VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
  1859. vec15);
  1860. dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
  1861. filt3);
  1862. dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
  1863. filt3);
  1864. dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
  1865. filt3);
  1866. dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
  1867. filt2, filt3);
  1868. VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
  1869. VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
  1870. VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
  1871. vec11);
  1872. dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
  1873. filt3);
  1874. dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
  1875. filt3);
  1876. dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
  1877. filt3);
  1878. for (loop_cnt = 16; loop_cnt--;) {
  1879. src7 = LD_SB(src_tmp);
  1880. src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
  1881. src_tmp += src_stride;
  1882. VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
  1883. vec3);
  1884. dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
  1885. filt3);
  1886. ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
  1887. ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
  1888. ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
  1889. ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
  1890. dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
  1891. filt_h0, filt_h1, filt_h2, filt_h3);
  1892. dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
  1893. filt_h0, filt_h1, filt_h2, filt_h3);
  1894. dst0_r >>= 6;
  1895. dst0_l >>= 6;
  1896. MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
  1897. SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
  1898. ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
  1899. CLIP_SW2_0_255_MAX_SATU(dst0_r, dst0_l);
  1900. dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
  1901. out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
  1902. ST8x1_UB(out, dst_tmp);
  1903. dst_tmp += dst_stride;
  1904. dst0 = dst1;
  1905. dst1 = dst2;
  1906. dst2 = dst3;
  1907. dst3 = dst4;
  1908. dst4 = dst5;
  1909. dst5 = dst6;
  1910. dst6 = dst7;
  1911. }
  1912. src += 8;
  1913. dst += 8;
  1914. mask4 = LD_SB(ff_hevc_mask_arr + 16);
  1915. mask5 = mask4 + 2;
  1916. mask6 = mask4 + 4;
  1917. mask7 = mask4 + 6;
  1918. LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  1919. src += (7 * src_stride);
  1920. XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  1921. VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
  1922. VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
  1923. VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
  1924. vec11);
  1925. VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
  1926. vec15);
  1927. dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
  1928. filt3);
  1929. dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
  1930. filt3);
  1931. dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
  1932. filt3);
  1933. dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
  1934. filt3);
  1935. ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
  1936. ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
  1937. ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
  1938. dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
  1939. for (loop_cnt = 4; loop_cnt--;) {
  1940. LD_SB4(src, src_stride, src7, src8, src9, src10);
  1941. src += (4 * src_stride);
  1942. XORI_B4_128_SB(src7, src8, src9, src10);
  1943. VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
  1944. vec3);
  1945. VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
  1946. vec7);
  1947. dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
  1948. filt3);
  1949. dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
  1950. filt3);
  1951. dst76_r = __msa_ilvr_h(dst97, dst66);
  1952. ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
  1953. dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
  1954. dst98_r = __msa_ilvr_h(dst66, dst108);
  1955. dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
  1956. filt_h1, filt_h2, filt_h3);
  1957. dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
  1958. filt_h1, filt_h2, filt_h3);
  1959. dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
  1960. filt_h1, filt_h2, filt_h3);
  1961. dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
  1962. filt_h1, filt_h2, filt_h3);
  1963. SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
  1964. MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
  1965. MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
  1966. SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
  1967. ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
  1968. ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
  1969. CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst2_r, dst3_r);
  1970. PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
  1971. out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
  1972. ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
  1973. dst += (4 * dst_stride);
  1974. dst10_r = dst54_r;
  1975. dst32_r = dst76_r;
  1976. dst54_r = dst98_r;
  1977. dst21_r = dst65_r;
  1978. dst43_r = dst87_r;
  1979. dst65_r = dst109_r;
  1980. dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
  1981. }
  1982. }
  1983. static void hevc_hv_uniwgt_8t_16w_msa(uint8_t *src,
  1984. int32_t src_stride,
  1985. uint8_t *dst,
  1986. int32_t dst_stride,
  1987. const int8_t *filter_x,
  1988. const int8_t *filter_y,
  1989. int32_t height,
  1990. int32_t weight,
  1991. int32_t offset,
  1992. int32_t rnd_val)
  1993. {
  1994. hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
  1995. filter_x, filter_y, height, weight,
  1996. offset, rnd_val, 16);
  1997. }
  1998. static void hevc_hv_uniwgt_8t_24w_msa(uint8_t *src,
  1999. int32_t src_stride,
  2000. uint8_t *dst,
  2001. int32_t dst_stride,
  2002. const int8_t *filter_x,
  2003. const int8_t *filter_y,
  2004. int32_t height,
  2005. int32_t weight,
  2006. int32_t offset,
  2007. int32_t rnd_val)
  2008. {
  2009. hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
  2010. filter_x, filter_y, height, weight,
  2011. offset, rnd_val, 24);
  2012. }
  2013. static void hevc_hv_uniwgt_8t_32w_msa(uint8_t *src,
  2014. int32_t src_stride,
  2015. uint8_t *dst,
  2016. int32_t dst_stride,
  2017. const int8_t *filter_x,
  2018. const int8_t *filter_y,
  2019. int32_t height,
  2020. int32_t weight,
  2021. int32_t offset,
  2022. int32_t rnd_val)
  2023. {
  2024. hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
  2025. filter_x, filter_y, height, weight,
  2026. offset, rnd_val, 32);
  2027. }
  2028. static void hevc_hv_uniwgt_8t_48w_msa(uint8_t *src,
  2029. int32_t src_stride,
  2030. uint8_t *dst,
  2031. int32_t dst_stride,
  2032. const int8_t *filter_x,
  2033. const int8_t *filter_y,
  2034. int32_t height,
  2035. int32_t weight,
  2036. int32_t offset,
  2037. int32_t rnd_val)
  2038. {
  2039. hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
  2040. filter_x, filter_y, height, weight,
  2041. offset, rnd_val, 48);
  2042. }
  2043. static void hevc_hv_uniwgt_8t_64w_msa(uint8_t *src,
  2044. int32_t src_stride,
  2045. uint8_t *dst,
  2046. int32_t dst_stride,
  2047. const int8_t *filter_x,
  2048. const int8_t *filter_y,
  2049. int32_t height,
  2050. int32_t weight,
  2051. int32_t offset,
  2052. int32_t rnd_val)
  2053. {
  2054. hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
  2055. filter_x, filter_y, height, weight,
  2056. offset, rnd_val, 64);
  2057. }
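/* 4-tap (chroma) horizontal filters with uni-directional weighting start
 * here.  The weighting follows the same pattern as the 8-tap paths: weight
 * is masked to 16 bits so each 32-bit lane of weight_vec holds weight in one
 * halfword and zero in the other; since ILVRL_H2_SW pairs every sample with
 * itself, the DOTP in HEVC_UNIW_RND_CLIP* reduces to a plain multiply by
 * weight.  The -128 bias from XORI_*_128 is again folded into offset_vec as
 * (128 * weight) >> (rnd_val - 6).
 */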
  2058. static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src,
  2059. int32_t src_stride,
  2060. uint8_t *dst,
  2061. int32_t dst_stride,
  2062. const int8_t *filter,
  2063. int32_t weight,
  2064. int32_t offset,
  2065. int32_t rnd_val)
  2066. {
  2067. v16u8 out;
  2068. v8i16 filt0, filt1;
  2069. v16i8 src0, src1, vec0, vec1;
  2070. v16i8 mask1;
  2071. v8i16 dst0;
  2072. v4i32 dst0_r, dst0_l;
  2073. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  2074. v4i32 weight_vec, rnd_vec;
  2075. v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
  2076. src -= 1;
  2077. filter_vec = LD_SH(filter);
  2078. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2079. mask1 = mask0 + 2;
  2080. weight = weight & 0x0000FFFF;
  2081. weight_vec = __msa_fill_w(weight);
  2082. rnd_vec = __msa_fill_w(rnd_val);
  2083. weight *= 128;
  2084. rnd_val -= 6;
  2085. weight_vec_h = __msa_fill_h(weight);
  2086. offset_vec = __msa_fill_h(offset);
  2087. denom_vec = __msa_fill_h(rnd_val);
  2088. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  2089. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  2090. LD_SB2(src, src_stride, src0, src1);
  2091. XORI_B2_128_SB(src0, src1);
  2092. VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
  2093. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2094. ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
  2095. DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
  2096. SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
  2097. dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
  2098. dst0 = __msa_adds_s_h(dst0, offset_vec);
  2099. dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
  2100. out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
  2101. ST4x2_UB(out, dst, dst_stride);
  2102. dst += (4 * dst_stride);
  2103. }
  2104. static void hevc_hz_uniwgt_4t_4x4_msa(uint8_t *src,
  2105. int32_t src_stride,
  2106. uint8_t *dst,
  2107. int32_t dst_stride,
  2108. const int8_t *filter,
  2109. int32_t weight,
  2110. int32_t offset,
  2111. int32_t rnd_val)
  2112. {
  2113. v16u8 out;
  2114. v8i16 filt0, filt1;
  2115. v16i8 src0, src1, src2, src3;
  2116. v16i8 mask1, vec0, vec1, vec2, vec3;
  2117. v8i16 dst0, dst1;
  2118. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  2119. v4i32 weight_vec, rnd_vec;
  2120. v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
  2121. src -= 1;
  2122. /* rearranging filter */
  2123. filter_vec = LD_SH(filter);
  2124. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2125. mask1 = mask0 + 2;
  2126. weight = weight & 0x0000FFFF;
  2127. weight_vec = __msa_fill_w(weight);
  2128. rnd_vec = __msa_fill_w(rnd_val);
  2129. weight *= 128;
  2130. rnd_val -= 6;
  2131. weight_vec_h = __msa_fill_h(weight);
  2132. offset_vec = __msa_fill_h(offset);
  2133. denom_vec = __msa_fill_h(rnd_val);
  2134. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  2135. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  2136. LD_SB4(src, src_stride, src0, src1, src2, src3);
  2137. XORI_B4_128_SB(src0, src1, src2, src3);
  2138. VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
  2139. VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
  2140. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2141. dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  2142. HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
  2143. dst0, dst1);
  2144. out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
  2145. ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
  2146. dst += (4 * dst_stride);
  2147. }
  2148. static void hevc_hz_uniwgt_4t_4x8multiple_msa(uint8_t *src,
  2149. int32_t src_stride,
  2150. uint8_t *dst,
  2151. int32_t dst_stride,
  2152. const int8_t *filter,
  2153. int32_t height,
  2154. int32_t weight,
  2155. int32_t offset,
  2156. int32_t rnd_val)
  2157. {
  2158. uint32_t loop_cnt;
  2159. v16u8 out0, out1;
  2160. v8i16 filt0, filt1;
  2161. v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  2162. v16i8 mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  2163. v8i16 dst0, dst1, dst2, dst3;
  2164. v8i16 filter_vec;
  2165. v8i16 weight_vec_h, offset_vec, denom_vec;
  2166. v4i32 weight_vec, rnd_vec;
  2167. v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
  2168. src -= 1;
  2169. filter_vec = LD_SH(filter);
  2170. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2171. weight = weight & 0x0000FFFF;
  2172. weight_vec = __msa_fill_w(weight);
  2173. rnd_vec = __msa_fill_w(rnd_val);
  2174. weight *= 128;
  2175. rnd_val -= 6;
  2176. weight_vec_h = __msa_fill_h(weight);
  2177. offset_vec = __msa_fill_h(offset);
  2178. denom_vec = __msa_fill_h(rnd_val);
  2179. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  2180. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  2181. mask1 = mask0 + 2;
  2182. for (loop_cnt = (height >> 3); loop_cnt--;) {
  2183. LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  2184. src += (8 * src_stride);
  2185. XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
  2186. VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
  2187. VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
  2188. VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec4, vec5);
  2189. VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec6, vec7);
  2190. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2191. dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  2192. dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  2193. dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  2194. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
  2195. weight_vec, offset_vec, rnd_vec,
  2196. dst0, dst1, dst2, dst3);
  2197. PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
  2198. ST4x8_UB(out0, out1, dst, dst_stride);
  2199. dst += (8 * dst_stride);
  2200. }
  2201. }
  2202. static void hevc_hz_uniwgt_4t_4w_msa(uint8_t *src,
  2203. int32_t src_stride,
  2204. uint8_t *dst,
  2205. int32_t dst_stride,
  2206. const int8_t *filter,
  2207. int32_t height,
  2208. int32_t weight,
  2209. int32_t offset,
  2210. int32_t rnd_val)
  2211. {
  2212. if (2 == height) {
  2213. hevc_hz_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
  2214. filter, weight, offset, rnd_val);
  2215. } else if (4 == height) {
  2216. hevc_hz_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
  2217. filter, weight, offset, rnd_val);
  2218. } else if (8 == height || 16 == height) {
  2219. hevc_hz_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
  2220. filter, height, weight,
  2221. offset, rnd_val);
  2222. }
  2223. }
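/* 6-wide horizontal case: eight columns are filtered per row and ST6x4_UB
 * stores only the leftmost six bytes of each row.
 */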
  2224. static void hevc_hz_uniwgt_4t_6w_msa(uint8_t *src,
  2225. int32_t src_stride,
  2226. uint8_t *dst,
  2227. int32_t dst_stride,
  2228. const int8_t *filter,
  2229. int32_t height,
  2230. int32_t weight,
  2231. int32_t offset,
  2232. int32_t rnd_val)
  2233. {
  2234. v16u8 out0, out1, out2, out3;
  2235. v8i16 filt0, filt1;
  2236. v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  2237. v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
  2238. v16i8 mask1;
  2239. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  2240. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  2241. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  2242. v4i32 weight_vec, rnd_vec;
  2243. src -= 1;
  2244. filter_vec = LD_SH(filter);
  2245. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2246. weight = weight & 0x0000FFFF;
  2247. weight_vec = __msa_fill_w(weight);
  2248. rnd_vec = __msa_fill_w(rnd_val);
  2249. weight *= 128;
  2250. rnd_val -= 6;
  2251. weight_vec_h = __msa_fill_h(weight);
  2252. offset_vec = __msa_fill_h(offset);
  2253. denom_vec = __msa_fill_h(rnd_val);
  2254. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  2255. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  2256. mask1 = mask0 + 2;
  2257. LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  2258. XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
  2259. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  2260. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
  2261. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
  2262. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
  2263. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2264. dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  2265. dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  2266. dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  2267. VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
  2268. VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
  2269. VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
  2270. VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
  2271. dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2272. dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  2273. dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  2274. dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  2275. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
  2276. weight_vec, offset_vec, rnd_vec,
  2277. dst0, dst1, dst2, dst3);
  2278. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
  2279. weight_vec, offset_vec, rnd_vec,
  2280. dst4, dst5, dst6, dst7);
  2281. PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
  2282. PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
  2283. ST6x4_UB(out0, out1, dst, dst_stride);
  2284. dst += (4 * dst_stride);
  2285. ST6x4_UB(out2, out3, dst, dst_stride);
  2286. }
  2287. static void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src,
  2288. int32_t src_stride,
  2289. uint8_t *dst,
  2290. int32_t dst_stride,
  2291. const int8_t *filter,
  2292. int32_t weight,
  2293. int32_t offset,
  2294. int32_t rnd_val)
  2295. {
  2296. v16u8 out;
  2297. v8i16 filt0, filt1, dst0, dst1;
  2298. v16i8 src0, src1;
  2299. v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
  2300. v16i8 mask1;
  2301. v16i8 vec0, vec1, vec2, vec3;
  2302. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  2303. v4i32 weight_vec, rnd_vec;
  2304. src -= 1;
  2305. filter_vec = LD_SH(filter);
  2306. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2307. weight = weight & 0x0000FFFF;
  2308. weight_vec = __msa_fill_w(weight);
  2309. rnd_vec = __msa_fill_w(rnd_val);
  2310. weight *= 128;
  2311. rnd_val -= 6;
  2312. weight_vec_h = __msa_fill_h(weight);
  2313. offset_vec = __msa_fill_h(offset);
  2314. denom_vec = __msa_fill_h(rnd_val);
  2315. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  2316. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  2317. mask1 = mask0 + 2;
  2318. LD_SB2(src, src_stride, src0, src1);
  2319. XORI_B2_128_SB(src0, src1);
  2320. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  2321. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
  2322. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2323. dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  2324. HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
  2325. dst0, dst1);
  2326. out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
  2327. ST8x2_UB(out, dst, dst_stride);
  2328. }
  2329. static void hevc_hz_uniwgt_4t_8x4_msa(uint8_t *src,
  2330. int32_t src_stride,
  2331. uint8_t *dst,
  2332. int32_t dst_stride,
  2333. const int8_t *filter,
  2334. int32_t weight,
  2335. int32_t offset,
  2336. int32_t rnd_val)
  2337. {
  2338. v16u8 out0, out1;
  2339. v16i8 src0, src1, src2, src3;
  2340. v16i8 mask0, mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  2341. v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
  2342. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  2343. v4i32 weight_vec, rnd_vec;
  2344. src -= 1;
  2345. filter_vec = LD_SH(filter);
  2346. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2347. weight = weight & 0x0000FFFF;
  2348. weight_vec = __msa_fill_w(weight);
  2349. rnd_vec = __msa_fill_w(rnd_val);
  2350. weight *= 128;
  2351. rnd_val -= 6;
  2352. weight_vec_h = __msa_fill_h(weight);
  2353. offset_vec = __msa_fill_h(offset);
  2354. denom_vec = __msa_fill_h(rnd_val);
  2355. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  2356. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  2357. mask0 = LD_SB(&ff_hevc_mask_arr[0]);
  2358. mask1 = mask0 + 2;
  2359. LD_SB4(src, src_stride, src0, src1, src2, src3);
  2360. XORI_B4_128_SB(src0, src1, src2, src3);
  2361. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  2362. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
  2363. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
  2364. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
  2365. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2366. dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  2367. dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  2368. dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  2369. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
  2370. weight_vec, offset_vec, rnd_vec,
  2371. dst0, dst1, dst2, dst3);
  2372. PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
  2373. ST8x4_UB(out0, out1, dst, dst_stride);
  2374. }
  2375. static void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src,
  2376. int32_t src_stride,
  2377. uint8_t *dst,
  2378. int32_t dst_stride,
  2379. const int8_t *filter,
  2380. int32_t weight,
  2381. int32_t offset,
  2382. int32_t rnd_val)
  2383. {
  2384. v16u8 out0, out1, out2;
  2385. v8i16 filt0, filt1;
  2386. v16i8 src0, src1, src2, src3, src4, src5;
  2387. v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
  2388. v16i8 mask1;
  2389. v16i8 vec11;
  2390. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
  2391. v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
  2392. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  2393. v4i32 weight_vec, rnd_vec;
  2394. src -= 1;
  2395. filter_vec = LD_SH(filter);
  2396. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2397. weight = weight & 0x0000FFFF;
  2398. weight_vec = __msa_fill_w(weight);
  2399. rnd_vec = __msa_fill_w(rnd_val);
  2400. weight *= 128;
  2401. rnd_val -= 6;
  2402. weight_vec_h = __msa_fill_h(weight);
  2403. offset_vec = __msa_fill_h(offset);
  2404. denom_vec = __msa_fill_h(rnd_val);
  2405. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  2406. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  2407. mask1 = mask0 + 2;
  2408. LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
  2409. XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
  2410. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  2411. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
  2412. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
  2413. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
  2414. VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
  2415. VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
  2416. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2417. dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  2418. dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  2419. dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  2420. dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
  2421. dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
  2422. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
  2423. weight_vec, offset_vec, rnd_vec,
  2424. dst0, dst1, dst2, dst3);
  2425. HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec,
  2426. dst4, dst5);
  2427. PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
  2428. ST8x4_UB(out0, out1, dst, dst_stride);
  2429. dst += (4 * dst_stride);
  2430. ST8x2_UB(out2, dst, dst_stride);
  2431. }
  2432. static void hevc_hz_uniwgt_4t_8x8multiple_msa(uint8_t *src,
  2433. int32_t src_stride,
  2434. uint8_t *dst,
  2435. int32_t dst_stride,
  2436. const int8_t *filter,
  2437. int32_t height,
  2438. int32_t weight,
  2439. int32_t offset,
  2440. int32_t rnd_val)
  2441. {
  2442. uint32_t loop_cnt;
  2443. v8i16 filt0, filt1;
  2444. v16u8 out0, out1, out2, out3;
  2445. v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  2446. v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
  2447. v16i8 mask1;
  2448. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  2449. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  2450. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  2451. v4i32 weight_vec, rnd_vec;
  2452. src -= 1;
  2453. filter_vec = LD_SH(filter);
  2454. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2455. weight = weight & 0x0000FFFF;
  2456. weight_vec = __msa_fill_w(weight);
  2457. rnd_vec = __msa_fill_w(rnd_val);
  2458. weight *= 128;
  2459. rnd_val -= 6;
  2460. weight_vec_h = __msa_fill_h(weight);
  2461. offset_vec = __msa_fill_h(offset);
  2462. denom_vec = __msa_fill_h(rnd_val);
  2463. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  2464. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  2465. mask1 = mask0 + 2;
  2466. for (loop_cnt = (height >> 3); loop_cnt--;) {
  2467. LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  2468. src += (8 * src_stride);
  2469. XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
  2470. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  2471. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
  2472. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
  2473. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
  2474. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2475. dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  2476. dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  2477. dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  2478. VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
  2479. VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
  2480. VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
  2481. VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
  2482. dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2483. dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  2484. dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  2485. dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  2486. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
  2487. weight_vec, offset_vec, rnd_vec,
  2488. dst0, dst1, dst2, dst3);
  2489. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
  2490. weight_vec, offset_vec, rnd_vec,
  2491. dst4, dst5, dst6, dst7);
  2492. PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
  2493. PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
  2494. ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
  2495. dst += (8 * dst_stride);
  2496. }
  2497. }
  2498. static void hevc_hz_uniwgt_4t_8w_msa(uint8_t *src,
  2499. int32_t src_stride,
  2500. uint8_t *dst,
  2501. int32_t dst_stride,
  2502. const int8_t *filter,
  2503. int32_t height,
  2504. int32_t weight,
  2505. int32_t offset,
  2506. int32_t rnd_val)
  2507. {
  2508. if (2 == height) {
  2509. hevc_hz_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
  2510. filter, weight, offset, rnd_val);
  2511. } else if (4 == height) {
  2512. hevc_hz_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride,
  2513. filter, weight, offset, rnd_val);
  2514. } else if (6 == height) {
  2515. hevc_hz_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
  2516. filter, weight, offset, rnd_val);
  2517. } else {
  2518. hevc_hz_uniwgt_4t_8x8multiple_msa(src, src_stride, dst, dst_stride,
  2519. filter, height, weight, offset,
  2520. rnd_val);
  2521. }
  2522. }
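/* 12-wide horizontal case: mask0/mask1 cover the first eight columns of each
 * row, while mask2/mask3 gather columns 8..11 of two consecutive rows into a
 * single vector, so dst4/dst5 carry the extra four columns of all four rows
 * for the ST12x4_UB store.
 */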
  2523. static void hevc_hz_uniwgt_4t_12w_msa(uint8_t *src,
  2524. int32_t src_stride,
  2525. uint8_t *dst,
  2526. int32_t dst_stride,
  2527. const int8_t *filter,
  2528. int32_t height,
  2529. int32_t weight,
  2530. int32_t offset,
  2531. int32_t rnd_val)
  2532. {
  2533. uint32_t loop_cnt;
  2534. v16u8 out0, out1, out2;
  2535. v8i16 filt0, filt1;
  2536. v16i8 src0, src1, src2, src3;
  2537. v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2538. v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12,
2539. 24, 25, 25, 26, 26, 27, 27, 28 };
  2540. v16i8 mask1;
  2541. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
  2542. v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
  2543. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  2544. v16i8 mask3, vec11;
  2545. v4i32 weight_vec, rnd_vec;
  2546. src -= 1;
  2547. filter_vec = LD_SH(filter);
  2548. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2549. weight = weight & 0x0000FFFF;
  2550. weight_vec = __msa_fill_w(weight);
  2551. rnd_vec = __msa_fill_w(rnd_val);
  2552. weight *= 128;
  2553. rnd_val -= 6;
  2554. weight_vec_h = __msa_fill_h(weight);
  2555. offset_vec = __msa_fill_h(offset);
  2556. denom_vec = __msa_fill_h(rnd_val);
  2557. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  2558. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  2559. mask1 = mask0 + 2;
  2560. mask3 = mask2 + 2;
  2561. for (loop_cnt = 4; loop_cnt--;) {
  2562. LD_SB4(src, src_stride, src0, src1, src2, src3);
  2563. src += (4 * src_stride);
  2564. XORI_B4_128_SB(src0, src1, src2, src3);
  2565. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  2566. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
  2567. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
  2568. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
  2569. VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec8, vec9);
  2570. VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec10, vec11);
  2571. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2572. dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  2573. dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  2574. dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  2575. dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
  2576. dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
  2577. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
  2578. weight_vec, offset_vec, rnd_vec,
  2579. dst0, dst1, dst2, dst3);
  2580. HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
  2581. rnd_vec, dst4, dst5);
  2582. PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
  2583. ST12x4_UB(out0, out1, out2, dst, dst_stride);
  2584. dst += (4 * dst_stride);
  2585. }
  2586. }
  2587. static void hevc_hz_uniwgt_4t_16w_msa(uint8_t *src,
  2588. int32_t src_stride,
  2589. uint8_t *dst,
  2590. int32_t dst_stride,
  2591. const int8_t *filter,
  2592. int32_t height,
  2593. int32_t weight,
  2594. int32_t offset,
  2595. int32_t rnd_val)
  2596. {
  2597. uint32_t loop_cnt;
  2598. v16u8 out0, out1, out2, out3;
  2599. v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  2600. v8i16 filt0, filt1;
  2601. v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
  2602. v16i8 mask1;
  2603. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  2604. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  2605. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  2606. v4i32 weight_vec, rnd_vec;
  2607. src -= 1;
  2608. filter_vec = LD_SH(filter);
  2609. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2610. weight = weight & 0x0000FFFF;
  2611. weight_vec = __msa_fill_w(weight);
  2612. rnd_vec = __msa_fill_w(rnd_val);
  2613. weight *= 128;
  2614. rnd_val -= 6;
  2615. weight_vec_h = __msa_fill_h(weight);
  2616. offset_vec = __msa_fill_h(offset);
  2617. denom_vec = __msa_fill_h(rnd_val);
  2618. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  2619. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  2620. mask1 = mask0 + 2;
  2621. for (loop_cnt = (height >> 2); loop_cnt--;) {
  2622. LD_SB4(src, src_stride, src0, src2, src4, src6);
  2623. LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  2624. src += (4 * src_stride);
  2625. XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
  2626. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  2627. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
  2628. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
  2629. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
  2630. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2631. dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  2632. dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  2633. dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  2634. VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
  2635. VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
  2636. VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
  2637. VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
  2638. dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2639. dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  2640. dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  2641. dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  2642. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
  2643. weight_vec, offset_vec, rnd_vec,
  2644. dst0, dst1, dst2, dst3);
  2645. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
  2646. weight_vec, offset_vec, rnd_vec,
  2647. dst4, dst5, dst6, dst7);
  2648. PCKEV_B4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
  2649. out0, out1, out2, out3);
  2650. ST_UB4(out0, out1, out2, out3, dst, dst_stride);
  2651. dst += (4 * dst_stride);
  2652. }
  2653. }
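/* 24-wide horizontal case: columns 0..15 come from the first 16-byte load,
 * with mask2/mask3 (mask0 + 8 / mask0 + 10) reaching across into the second
 * load for the taps that straddle the 16-byte boundary; columns 16..23 are
 * filtered from the second load alone.
 */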
  2654. static void hevc_hz_uniwgt_4t_24w_msa(uint8_t *src,
  2655. int32_t src_stride,
  2656. uint8_t *dst,
  2657. int32_t dst_stride,
  2658. const int8_t *filter,
  2659. int32_t height,
  2660. int32_t weight,
  2661. int32_t offset,
  2662. int32_t rnd_val)
  2663. {
  2664. uint32_t loop_cnt;
  2665. v16u8 out0, out1, out2;
  2666. v16i8 src0, src1, src2, src3;
  2667. v8i16 filt0, filt1;
  2668. v16i8 mask0, mask1, mask2, mask3;
  2669. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  2670. v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
  2671. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  2672. v4i32 weight_vec, rnd_vec;
  2673. src -= 1;
  2674. filter_vec = LD_SH(filter);
  2675. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2676. weight = weight & 0x0000FFFF;
  2677. weight_vec = __msa_fill_w(weight);
  2678. rnd_vec = __msa_fill_w(rnd_val);
  2679. weight *= 128;
  2680. rnd_val -= 6;
  2681. weight_vec_h = __msa_fill_h(weight);
  2682. offset_vec = __msa_fill_h(offset);
  2683. denom_vec = __msa_fill_h(rnd_val);
  2684. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  2685. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  2686. mask0 = LD_SB(&ff_hevc_mask_arr[0]);
  2687. mask1 = mask0 + 2;
  2688. mask2 = mask0 + 8;
  2689. mask3 = mask0 + 10;
  2690. for (loop_cnt = 16; loop_cnt--;) {
  2691. LD_SB2(src, src_stride, src0, src2);
  2692. LD_SB2(src + 16, src_stride, src1, src3);
  2693. src += (2 * src_stride);
  2694. XORI_B4_128_SB(src0, src1, src2, src3);
  2695. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  2696. VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
  2697. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
  2698. VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec6, vec7);
  2699. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2700. dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  2701. dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  2702. dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  2703. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
  2704. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec2, vec3);
  2705. dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2706. dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  2707. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
  2708. weight_vec, offset_vec, rnd_vec,
  2709. dst0, dst1, dst2, dst3);
  2710. HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
  2711. rnd_vec, dst4, dst5);
  2712. PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
  2713. ST_UB2(out0, out1, dst, dst_stride);
  2714. ST8x2_UB(out2, dst + 16, dst_stride);
  2715. dst += (2 * dst_stride);
  2716. }
  2717. }
  2718. static void hevc_hz_uniwgt_4t_32w_msa(uint8_t *src,
  2719. int32_t src_stride,
  2720. uint8_t *dst,
  2721. int32_t dst_stride,
  2722. const int8_t *filter,
  2723. int32_t height,
  2724. int32_t weight,
  2725. int32_t offset,
  2726. int32_t rnd_val)
  2727. {
  2728. uint32_t loop_cnt;
  2729. v16u8 out0, out1, out2, out3;
  2730. v16i8 src0, src1, src2, src3, src4, src5;
  2731. v8i16 filt0, filt1;
  2732. v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
  2733. v16i8 mask1, mask2, mask3;
  2734. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  2735. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  2736. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  2737. v4i32 weight_vec, rnd_vec;
  2738. src -= 1;
  2739. filter_vec = LD_SH(filter);
  2740. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2741. weight = weight & 0x0000FFFF;
  2742. weight_vec = __msa_fill_w(weight);
  2743. rnd_vec = __msa_fill_w(rnd_val);
  2744. weight *= 128;
  2745. rnd_val -= 6;
  2746. weight_vec_h = __msa_fill_h(weight);
  2747. offset_vec = __msa_fill_h(offset);
  2748. denom_vec = __msa_fill_h(rnd_val);
  2749. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  2750. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  2751. mask1 = mask0 + 2;
  2752. mask2 = mask0 + 8;
  2753. mask3 = mask0 + 10;
  2754. for (loop_cnt = (height >> 1); loop_cnt--;) {
  2755. LD_SB2(src, 16, src0, src1);
  2756. src2 = LD_SB(src + 24);
  2757. src += src_stride;
  2758. LD_SB2(src, 16, src3, src4);
  2759. src5 = LD_SB(src + 24);
  2760. src += src_stride;
  2761. XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
  2762. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  2763. VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
  2764. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec4, vec5);
  2765. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec6, vec7);
  2766. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2767. dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  2768. dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  2769. dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  2770. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
  2771. VSHF_B2_SB(src3, src4, src3, src4, mask2, mask3, vec2, vec3);
  2772. VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec4, vec5);
  2773. VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec6, vec7);
  2774. dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2775. dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  2776. dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  2777. dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  2778. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
  2779. weight_vec, offset_vec, rnd_vec,
  2780. dst0, dst1, dst2, dst3);
  2781. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
  2782. weight_vec, offset_vec, rnd_vec,
  2783. dst4, dst5, dst6, dst7);
  2784. PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
  2785. PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
  2786. ST_UB2(out0, out1, dst, 16);
  2787. dst += dst_stride;
  2788. ST_UB2(out2, out3, dst, 16);
  2789. dst += dst_stride;
  2790. }
  2791. }
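/* Vertical 4-tap filters with uni-directional weighted prediction follow.
 * They all share the same weighting stage; in rough scalar form (an
 * illustrative sketch only, not used by the build) it is:
 *
 *     s   = 4-tap filter output on XORed input;   biased by -128 * 64
 *     v   = (s * weight) >> rnd_val;
 *     add = offset + ((128 * weight) >> (rnd_val - 6));
 *     dst = av_clip_uint8(v + add);
 *
 * The (128 * weight) >> (rnd_val - 6) term folded into offset_vec
 * compensates the bias introduced by XORing the source bytes with 128
 * before the signed MSA dot products (the 4-tap coefficients sum to 64). */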
  2792. static void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src,
  2793. int32_t src_stride,
  2794. uint8_t *dst,
  2795. int32_t dst_stride,
  2796. const int8_t *filter,
  2797. int32_t weight,
  2798. int32_t offset,
  2799. int32_t rnd_val)
  2800. {
  2801. v16u8 out;
  2802. v16i8 src0, src1, src2, src3, src4;
  2803. v16i8 src10_r, src32_r, src21_r, src43_r;
  2804. v16i8 src2110, src4332;
  2805. v8i16 dst0;
  2806. v4i32 dst0_r, dst0_l;
  2807. v8i16 filt0, filt1;
  2808. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  2809. v4i32 weight_vec, rnd_vec;
  2810. src -= src_stride;
  2811. weight = weight & 0x0000FFFF;
  2812. weight_vec = __msa_fill_w(weight);
  2813. rnd_vec = __msa_fill_w(rnd_val);
  2814. weight *= 128;
  2815. rnd_val -= 6;
  2816. weight_vec_h = __msa_fill_h(weight);
  2817. offset_vec = __msa_fill_h(offset);
  2818. denom_vec = __msa_fill_h(rnd_val);
  2819. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  2820. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  2821. filter_vec = LD_SH(filter);
  2822. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2823. LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  2824. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  2825. ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
  2826. ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
  2827. XORI_B2_128_SB(src2110, src4332);
  2828. dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
  2829. ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
  2830. DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
  2831. SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
  2832. dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
  2833. dst0 = __msa_adds_s_h(dst0, offset_vec);
  2834. dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
  2835. out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
  2836. ST4x2_UB(out, dst, dst_stride);
  2837. }
  2838. static void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src,
  2839. int32_t src_stride,
  2840. uint8_t *dst,
  2841. int32_t dst_stride,
  2842. const int8_t *filter,
  2843. int32_t weight,
  2844. int32_t offset,
  2845. int32_t rnd_val)
  2846. {
  2847. v16u8 out;
  2848. v16i8 src0, src1, src2, src3, src4, src5, src6;
  2849. v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
  2850. v16i8 src2110, src4332, src6554;
  2851. v8i16 dst0, dst1;
  2852. v8i16 filt0, filt1;
  2853. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  2854. v4i32 weight_vec, rnd_vec;
  2855. src -= src_stride;
  2856. weight = weight & 0x0000FFFF;
  2857. weight_vec = __msa_fill_w(weight);
  2858. rnd_vec = __msa_fill_w(rnd_val);
  2859. weight *= 128;
  2860. rnd_val -= 6;
  2861. weight_vec_h = __msa_fill_h(weight);
  2862. offset_vec = __msa_fill_h(offset);
  2863. denom_vec = __msa_fill_h(rnd_val);
  2864. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  2865. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  2866. filter_vec = LD_SH(filter);
  2867. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2868. LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  2869. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  2870. ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
  2871. src32_r, src43_r, src54_r, src65_r);
  2872. ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
  2873. src2110, src4332, src6554);
  2874. XORI_B3_128_SB(src2110, src4332, src6554);
  2875. dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
  2876. dst1 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
  2877. HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
  2878. dst0, dst1);
  2879. out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
  2880. ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
  2881. }
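/* Vertical 4-tap uni-weighted filter for 4-wide blocks whose height is a
 * multiple of 8.  Row pairs are packed into one vector with ILVR_D, so
 * each HEVC_FILT_4TAP_SH call produces two output rows at once. */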
  2882. static void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src,
  2883. int32_t src_stride,
  2884. uint8_t *dst,
  2885. int32_t dst_stride,
  2886. const int8_t *filter,
  2887. int32_t height,
  2888. int32_t weight,
  2889. int32_t offset,
  2890. int32_t rnd_val)
  2891. {
  2892. int32_t loop_cnt;
  2893. v16u8 out0, out1;
  2894. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  2895. v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
  2896. v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
  2897. v16i8 src2110, src4332, src6554, src8776;
  2898. v16i8 src10998;
  2899. v8i16 dst0, dst1, dst2, dst3, filt0, filt1;
  2900. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  2901. v4i32 weight_vec, rnd_vec;
  2902. src -= src_stride;
  2903. weight = weight & 0x0000FFFF;
  2904. weight_vec = __msa_fill_w(weight);
  2905. rnd_vec = __msa_fill_w(rnd_val);
  2906. weight *= 128;
  2907. rnd_val -= 6;
  2908. weight_vec_h = __msa_fill_h(weight);
  2909. offset_vec = __msa_fill_h(offset);
  2910. denom_vec = __msa_fill_h(rnd_val);
  2911. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  2912. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  2913. filter_vec = LD_SH(filter);
  2914. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2915. LD_SB3(src, src_stride, src0, src1, src2);
  2916. src += (3 * src_stride);
  2917. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  2918. src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
  2919. src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
  2920. for (loop_cnt = (height >> 3); loop_cnt--;) {
  2921. LD_SB8(src, src_stride,
  2922. src3, src4, src5, src6, src7, src8, src9, src10);
  2923. src += (8 * src_stride);
  2924. ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
  2925. src32_r, src43_r, src54_r, src65_r);
  2926. ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
  2927. ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
  2928. ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
  2929. src109_r, src98_r, src4332, src6554, src8776, src10998);
  2930. XORI_B4_128_SB(src4332, src6554, src8776, src10998);
  2931. dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
  2932. dst1 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
  2933. dst2 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
  2934. dst3 = HEVC_FILT_4TAP_SH(src8776, src10998, filt0, filt1);
  2935. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
  2936. weight_vec, offset_vec, rnd_vec,
  2937. dst0, dst1, dst2, dst3);
  2938. PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
  2939. ST4x8_UB(out0, out1, dst, dst_stride);
  2940. dst += (8 * dst_stride);
  2941. src2 = src10;
  2942. src2110 = src10998;
  2943. }
  2944. }
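/* Dispatcher for 4-wide vertical uni-weighted filtering: heights 2 and 4
 * take dedicated paths, anything else is expected to be a multiple of 8. */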
  2945. static void hevc_vt_uniwgt_4t_4w_msa(uint8_t *src,
  2946. int32_t src_stride,
  2947. uint8_t *dst,
  2948. int32_t dst_stride,
  2949. const int8_t *filter,
  2950. int32_t height,
  2951. int32_t weight,
  2952. int32_t offset,
  2953. int32_t rnd_val)
  2954. {
  2955. if (2 == height) {
  2956. hevc_vt_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
  2957. filter, weight, offset, rnd_val);
  2958. } else if (4 == height) {
  2959. hevc_vt_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
  2960. filter, weight, offset, rnd_val);
  2961. } else if (0 == (height % 8)) {
  2962. hevc_vt_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
  2963. filter, height, weight, offset,
  2964. rnd_val);
  2965. }
  2966. }
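/* Vertical 4-tap uni-weighted filter for 6-wide blocks.  Exactly eight
 * rows are produced and the height argument is never consulted, so the
 * caller presumably always passes height == 8 here. */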
  2967. static void hevc_vt_uniwgt_4t_6w_msa(uint8_t *src,
  2968. int32_t src_stride,
  2969. uint8_t *dst,
  2970. int32_t dst_stride,
  2971. const int8_t *filter,
  2972. int32_t height,
  2973. int32_t weight,
  2974. int32_t offset,
  2975. int32_t rnd_val)
  2976. {
  2977. v16u8 out0, out1, out2, out3;
  2978. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  2979. v16i8 src10_r, src32_r, src21_r, src43_r;
  2980. v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
  2981. v8i16 filt0, filt1;
  2982. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  2983. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  2984. v4i32 weight_vec, rnd_vec;
  2985. src -= src_stride;
  2986. weight = weight & 0x0000FFFF;
  2987. weight_vec = __msa_fill_w(weight);
  2988. rnd_vec = __msa_fill_w(rnd_val);
  2989. weight *= 128;
  2990. rnd_val -= 6;
  2991. weight_vec_h = __msa_fill_h(weight);
  2992. offset_vec = __msa_fill_h(offset);
  2993. denom_vec = __msa_fill_h(rnd_val);
  2994. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  2995. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  2996. filter_vec = LD_SH(filter);
  2997. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2998. LD_SB3(src, src_stride, src0, src1, src2);
  2999. src += (3 * src_stride);
  3000. LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
  3001. XORI_B3_128_SB(src0, src1, src2);
  3002. XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
  3003. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  3004. ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
  3005. ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
  3006. ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
  3007. ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
  3008. dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
  3009. dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
  3010. dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
  3011. dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
  3012. dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
  3013. dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
  3014. dst6 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
  3015. dst7 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
  3016. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
  3017. weight_vec, offset_vec, rnd_vec,
  3018. dst0, dst1, dst2, dst3);
  3019. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
  3020. weight_vec, offset_vec, rnd_vec,
  3021. dst4, dst5, dst6, dst7);
  3022. PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
  3023. PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
  3024. ST6x4_UB(out0, out1, dst, dst_stride);
  3025. dst += (4 * dst_stride);
  3026. ST6x4_UB(out2, out3, dst, dst_stride);
  3027. }
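/* 8-wide vertical uni-weighted filters for the fixed heights 2, 4 and 6.
 * They differ only in how many source rows are loaded and how many
 * weighted results are packed and stored. */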
  3028. static void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src,
  3029. int32_t src_stride,
  3030. uint8_t *dst,
  3031. int32_t dst_stride,
  3032. const int8_t *filter,
  3033. int32_t weight,
  3034. int32_t offset,
  3035. int32_t rnd_val)
  3036. {
  3037. v16u8 out;
  3038. v16i8 src0, src1, src2, src3, src4;
  3039. v16i8 src10_r, src32_r, src21_r, src43_r;
  3040. v8i16 dst0, dst1;
  3041. v8i16 filt0, filt1;
  3042. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  3043. v4i32 weight_vec, rnd_vec;
  3044. src -= src_stride;
  3045. weight = weight & 0x0000FFFF;
  3046. weight_vec = __msa_fill_w(weight);
  3047. rnd_vec = __msa_fill_w(rnd_val);
  3048. weight *= 128;
  3049. rnd_val -= 6;
  3050. weight_vec_h = __msa_fill_h(weight);
  3051. offset_vec = __msa_fill_h(offset);
  3052. denom_vec = __msa_fill_h(rnd_val);
  3053. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  3054. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  3055. filter_vec = LD_SH(filter);
  3056. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3057. LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  3058. XORI_B5_128_SB(src0, src1, src2, src3, src4);
  3059. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  3060. ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
  3061. dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
  3062. dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
  3063. HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
  3064. dst0, dst1);
  3065. out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
  3066. ST8x2_UB(out, dst, dst_stride);
  3067. }
  3068. static void hevc_vt_uniwgt_4t_8x4_msa(uint8_t *src,
  3069. int32_t src_stride,
  3070. uint8_t *dst,
  3071. int32_t dst_stride,
  3072. const int8_t *filter,
  3073. int32_t weight,
  3074. int32_t offset,
  3075. int32_t rnd_val)
  3076. {
  3077. v16u8 out0, out1;
  3078. v16i8 src0, src1, src2, src3, src4;
  3079. v16i8 src10_r, src32_r, src21_r, src43_r;
  3080. v16i8 src5, src6, src54_r, src65_r;
  3081. v8i16 filt0, filt1;
  3082. v8i16 dst0, dst1, dst2, dst3;
  3083. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  3084. v4i32 weight_vec, rnd_vec;
  3085. src -= src_stride;
  3086. weight = weight & 0x0000FFFF;
  3087. weight_vec = __msa_fill_w(weight);
  3088. rnd_vec = __msa_fill_w(rnd_val);
  3089. weight *= 128;
  3090. rnd_val -= 6;
  3091. weight_vec_h = __msa_fill_h(weight);
  3092. offset_vec = __msa_fill_h(offset);
  3093. denom_vec = __msa_fill_h(rnd_val);
  3094. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  3095. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  3096. filter_vec = LD_SH(filter);
  3097. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3098. LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  3099. src += (3 * src_stride);
  3100. XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  3101. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  3102. ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
  3103. ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
  3104. dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
  3105. dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
  3106. dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
  3107. dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
  3108. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
  3109. offset_vec, rnd_vec, dst0, dst1, dst2,
  3110. dst3);
  3111. PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
  3112. ST8x4_UB(out0, out1, dst, dst_stride);
  3113. }
  3114. static void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src,
  3115. int32_t src_stride,
  3116. uint8_t *dst,
  3117. int32_t dst_stride,
  3118. const int8_t *filter,
  3119. int32_t weight,
  3120. int32_t offset,
  3121. int32_t rnd_val)
  3122. {
  3123. v16u8 out0, out1, out2;
  3124. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  3125. v16i8 src10_r, src32_r, src54_r, src76_r;
  3126. v16i8 src21_r, src43_r, src65_r, src87_r;
  3127. v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
  3128. v8i16 filt0, filt1;
  3129. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  3130. v4i32 weight_vec, rnd_vec;
  3131. src -= src_stride;
  3132. weight = weight & 0x0000FFFF;
  3133. weight_vec = __msa_fill_w(weight);
  3134. rnd_vec = __msa_fill_w(rnd_val);
  3135. weight *= 128;
  3136. rnd_val -= 6;
  3137. weight_vec_h = __msa_fill_h(weight);
  3138. offset_vec = __msa_fill_h(offset);
  3139. denom_vec = __msa_fill_h(rnd_val);
  3140. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  3141. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  3142. filter_vec = LD_SH(filter);
  3143. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3144. LD_SB3(src, src_stride, src0, src1, src2);
  3145. src += (3 * src_stride);
  3146. LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
  3147. XORI_B3_128_SB(src0, src1, src2);
  3148. XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
  3149. ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
  3150. src32_r, src43_r);
  3151. ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
  3152. src76_r, src87_r);
  3153. dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
  3154. dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
  3155. dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
  3156. dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
  3157. dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
  3158. dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
  3159. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
  3160. offset_vec, rnd_vec, dst0, dst1, dst2, dst3);
  3161. HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec,
  3162. dst4, dst5);
  3163. PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
  3164. ST8x4_UB(out0, out1, dst, dst_stride);
  3165. dst += (4 * dst_stride);
  3166. ST8x2_UB(out2, dst, dst_stride);
  3167. }
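/* 8-wide vertical uni-weighted filter for heights that are a multiple of
 * 8.  The last source row and the two most recent interleaved row pairs
 * are carried across iterations, so only eight new rows are loaded per
 * pass. */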
  3168. static void hevc_vt_uniwgt_4t_8x8mult_msa(uint8_t *src,
  3169. int32_t src_stride,
  3170. uint8_t *dst,
  3171. int32_t dst_stride,
  3172. const int8_t *filter,
  3173. int32_t height,
  3174. int32_t weight,
  3175. int32_t offset,
  3176. int32_t rnd_val)
  3177. {
  3178. int32_t loop_cnt;
  3179. v16u8 out0, out1, out2, out3;
  3180. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  3181. v16i8 src10_r, src32_r, src21_r, src43_r;
  3182. v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
  3183. v8i16 filt0, filt1;
  3184. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  3185. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  3186. v4i32 weight_vec, rnd_vec;
  3187. src -= src_stride;
  3188. weight = weight & 0x0000FFFF;
  3189. weight_vec = __msa_fill_w(weight);
  3190. rnd_vec = __msa_fill_w(rnd_val);
  3191. weight *= 128;
  3192. rnd_val -= 6;
  3193. weight_vec_h = __msa_fill_h(weight);
  3194. offset_vec = __msa_fill_h(offset);
  3195. denom_vec = __msa_fill_h(rnd_val);
  3196. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  3197. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  3198. filter_vec = LD_SH(filter);
  3199. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3200. LD_SB3(src, src_stride, src0, src1, src2);
  3201. src += (3 * src_stride);
  3202. XORI_B3_128_SB(src0, src1, src2);
  3203. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  3204. for (loop_cnt = (height >> 3); loop_cnt--;) {
  3205. LD_SB8(src, src_stride,
  3206. src3, src4, src5, src6, src7, src8, src9, src10);
  3207. src += (8 * src_stride);
  3208. XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
  3209. ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
  3210. ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
  3211. ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
  3212. ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
  3213. dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
  3214. dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
  3215. dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
  3216. dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
  3217. dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
  3218. dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
  3219. dst6 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
  3220. dst7 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
  3221. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
  3222. offset_vec, rnd_vec, dst0, dst1, dst2,
  3223. dst3);
  3224. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
  3225. offset_vec, rnd_vec, dst4, dst5, dst6,
  3226. dst7);
  3227. PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
  3228. PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
  3229. ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
  3230. dst += (8 * dst_stride);
  3231. src2 = src10;
  3232. src10_r = src98_r;
  3233. src21_r = src109_r;
  3234. }
  3235. }
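/* Dispatcher for 8-wide vertical uni-weighted filtering: heights 2, 4 and
 * 6 use the fixed-size paths above, every other height goes through the
 * multiple-of-8 loop. */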
  3236. static void hevc_vt_uniwgt_4t_8w_msa(uint8_t *src,
  3237. int32_t src_stride,
  3238. uint8_t *dst,
  3239. int32_t dst_stride,
  3240. const int8_t *filter,
  3241. int32_t height,
  3242. int32_t weight,
  3243. int32_t offset,
  3244. int32_t rnd_val)
  3245. {
  3246. if (2 == height) {
  3247. hevc_vt_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
  3248. filter, weight, offset, rnd_val);
  3249. } else if (4 == height) {
  3250. hevc_vt_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride,
  3251. filter, weight, offset, rnd_val);
  3252. } else if (6 == height) {
  3253. hevc_vt_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
  3254. filter, weight, offset, rnd_val);
  3255. } else {
  3256. hevc_vt_uniwgt_4t_8x8mult_msa(src, src_stride, dst, dst_stride,
  3257. filter, height, weight, offset,
  3258. rnd_val);
  3259. }
  3260. }
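/* Vertical 4-tap uni-weighted filter for 12-wide blocks.  Columns 0-7 use
 * the right-interleaved (ILVR) pairs, columns 8-11 use the left halves
 * packed two rows per vector.  The loop runs twice over eight rows, so the
 * height is effectively fixed at 16. */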
  3261. static void hevc_vt_uniwgt_4t_12w_msa(uint8_t *src,
  3262. int32_t src_stride,
  3263. uint8_t *dst,
  3264. int32_t dst_stride,
  3265. const int8_t *filter,
  3266. int32_t height,
  3267. int32_t weight,
  3268. int32_t offset,
  3269. int32_t rnd_val)
  3270. {
  3271. int32_t loop_cnt;
  3272. v16u8 out0, out1, out2, out3, out4, out5;
  3273. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  3274. v16i8 src10_r, src32_r, src21_r, src43_r;
  3275. v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
  3276. v16i8 src2110, src4332;
  3277. v16i8 src54_r, src76_r, src98_r, src65_r, src87_r, src109_r;
  3278. v16i8 src76_l, src98_l, src87_l, src109_l, src6554, src8776, src10998;
  3279. v8i16 filt0, filt1;
  3280. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
  3281. v8i16 dst9, dst10, dst11, filter_vec, weight_vec_h, offset_vec, denom_vec;
  3282. v4i32 weight_vec, rnd_vec;
  3283. src -= (1 * src_stride);
  3284. weight = weight & 0x0000FFFF;
  3285. weight_vec = __msa_fill_w(weight);
  3286. rnd_vec = __msa_fill_w(rnd_val);
  3287. weight *= 128;
  3288. rnd_val -= 6;
  3289. weight_vec_h = __msa_fill_h(weight);
  3290. offset_vec = __msa_fill_h(offset);
  3291. denom_vec = __msa_fill_h(rnd_val);
  3292. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  3293. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  3294. filter_vec = LD_SH(filter);
  3295. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3296. LD_SB3(src, src_stride, src0, src1, src2);
  3297. src += (3 * src_stride);
  3298. XORI_B3_128_SB(src0, src1, src2);
  3299. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  3300. ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
  3301. src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
  3302. for (loop_cnt = 2; loop_cnt--;) {
  3303. LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
  3304. src += (8 * src_stride);
  3305. XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
  3306. ILVRL_B2_SB(src3, src2, src32_r, src32_l);
  3307. ILVRL_B2_SB(src4, src3, src43_r, src43_l);
  3308. ILVRL_B2_SB(src5, src4, src54_r, src54_l);
  3309. ILVRL_B2_SB(src6, src5, src65_r, src65_l);
  3310. src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
  3311. src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
  3312. dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
  3313. dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
  3314. dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
  3315. dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
  3316. dst4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
  3317. dst5 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
  3318. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
  3319. offset_vec, rnd_vec, dst0, dst1, dst2,
  3320. dst3);
  3321. HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
  3322. rnd_vec, dst4, dst5);
  3323. PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
  3324. ST12x4_UB(out0, out1, out2, dst, dst_stride);
  3325. dst += (4 * dst_stride);
  3326. ILVRL_B2_SB(src7, src6, src76_r, src76_l);
  3327. ILVRL_B2_SB(src8, src7, src87_r, src87_l);
  3328. ILVRL_B2_SB(src9, src8, src98_r, src98_l);
  3329. ILVRL_B2_SB(src10, src9, src109_r, src109_l);
  3330. src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
  3331. src10998 = (v16i8) __msa_ilvr_d((v2i64) src109_l, (v2i64) src98_l);
  3332. dst6 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
  3333. dst7 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
  3334. dst8 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
  3335. dst9 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
  3336. dst10 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
  3337. dst11 = HEVC_FILT_4TAP_SH(src8776, src10998, filt0, filt1);
  3338. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst6, dst7, dst8, dst9, weight_vec,
  3339. offset_vec, rnd_vec, dst6, dst7, dst8,
  3340. dst9);
  3341. HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst10, dst11, weight_vec, offset_vec,
  3342. rnd_vec, dst10, dst11);
  3343. PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
  3344. ST12x4_UB(out3, out4, out5, dst, dst_stride);
  3345. dst += (4 * dst_stride);
  3346. src2 = src10;
  3347. src10_r = src98_r;
  3348. src21_r = src109_r;
  3349. src2110 = src10998;
  3350. }
  3351. }
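/* Vertical 4-tap uni-weighted filter for 16-wide blocks, four rows per
 * iteration, reusing the last source row and its interleaved right/left
 * pairs as vertical context for the next iteration. */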
  3352. static void hevc_vt_uniwgt_4t_16w_msa(uint8_t *src,
  3353. int32_t src_stride,
  3354. uint8_t *dst,
  3355. int32_t dst_stride,
  3356. const int8_t *filter,
  3357. int32_t height,
  3358. int32_t weight,
  3359. int32_t offset,
  3360. int32_t rnd_val)
  3361. {
  3362. int32_t loop_cnt;
  3363. v16u8 out0, out1, out2, out3;
  3364. v16i8 src0, src1, src2, src3, src4, src5;
  3365. v16i8 src10_r, src32_r, src21_r, src43_r;
  3366. v16i8 src10_l, src32_l, src21_l, src43_l;
  3367. v16i8 src54_r, src54_l, src65_r, src65_l, src6;
  3368. v8i16 filt0, filt1;
  3369. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  3370. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  3371. v4i32 weight_vec, rnd_vec;
  3372. src -= src_stride;
  3373. weight = weight & 0x0000FFFF;
  3374. weight_vec = __msa_fill_w(weight);
  3375. rnd_vec = __msa_fill_w(rnd_val);
  3376. weight *= 128;
  3377. rnd_val -= 6;
  3378. weight_vec_h = __msa_fill_h(weight);
  3379. offset_vec = __msa_fill_h(offset);
  3380. denom_vec = __msa_fill_h(rnd_val);
  3381. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  3382. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  3383. filter_vec = LD_SH(filter);
  3384. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3385. LD_SB3(src, src_stride, src0, src1, src2);
  3386. src += (3 * src_stride);
  3387. XORI_B3_128_SB(src0, src1, src2);
  3388. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  3389. ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
  3390. for (loop_cnt = (height >> 2); loop_cnt--;) {
  3391. LD_SB4(src, src_stride, src3, src4, src5, src6);
  3392. src += (4 * src_stride);
  3393. XORI_B4_128_SB(src3, src4, src5, src6);
  3394. ILVRL_B2_SB(src3, src2, src32_r, src32_l);
  3395. ILVRL_B2_SB(src4, src3, src43_r, src43_l);
  3396. ILVRL_B2_SB(src5, src4, src54_r, src54_l);
  3397. ILVRL_B2_SB(src6, src5, src65_r, src65_l);
  3398. dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
  3399. dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
  3400. dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
  3401. dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
  3402. dst4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
  3403. dst5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
  3404. dst6 = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
  3405. dst7 = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
  3406. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
  3407. offset_vec, rnd_vec, dst0, dst1, dst2,
  3408. dst3);
  3409. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
  3410. offset_vec, rnd_vec, dst4, dst5, dst6,
  3411. dst7);
  3412. PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
  3413. out2, out3);
  3414. ST_UB4(out0, out1, out2, out3, dst, dst_stride);
  3415. dst += (4 * dst_stride);
  3416. src2 = src6;
  3417. src10_r = src54_r;
  3418. src21_r = src65_r;
  3419. src10_l = src54_l;
  3420. src21_l = src65_l;
  3421. }
  3422. }
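/* Vertical 4-tap uni-weighted filter for 24-wide blocks: a full 16-wide
 * column plus an 8-wide column at byte offset 16.  Four rows per iteration
 * with a hard-coded 8 iterations, so height is presumably always 32. */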
  3423. static void hevc_vt_uniwgt_4t_24w_msa(uint8_t *src,
  3424. int32_t src_stride,
  3425. uint8_t *dst,
  3426. int32_t dst_stride,
  3427. const int8_t *filter,
  3428. int32_t height,
  3429. int32_t weight,
  3430. int32_t offset,
  3431. int32_t rnd_val)
  3432. {
  3433. uint32_t loop_cnt;
  3434. v16u8 out0, out1, out2, out3, out4, out5;
  3435. v16i8 src0, src1, src2, src3, src4, src5;
  3436. v16i8 src6, src7, src8, src9, src10, src11, src12, src13;
  3437. v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
  3438. v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
  3439. v16i8 src87_r, src98_r, src109_r, src1110_r, src1211_r, src1312_r;
  3440. v8i16 filt0, filt1;
  3441. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
  3442. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec, dst11;
  3443. v4i32 weight_vec, rnd_vec;
  3444. src -= src_stride;
  3445. weight = weight & 0x0000FFFF;
  3446. weight_vec = __msa_fill_w(weight);
  3447. rnd_vec = __msa_fill_w(rnd_val);
  3448. weight *= 128;
  3449. rnd_val -= 6;
  3450. weight_vec_h = __msa_fill_h(weight);
  3451. offset_vec = __msa_fill_h(offset);
  3452. denom_vec = __msa_fill_h(rnd_val);
  3453. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  3454. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  3455. filter_vec = LD_SH(filter);
  3456. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3457. LD_SB3(src, src_stride, src0, src1, src2);
  3458. LD_SB3(src + 16, src_stride, src7, src8, src9);
  3459. src += (3 * src_stride);
  3460. XORI_B3_128_SB(src0, src1, src2);
  3461. XORI_B3_128_SB(src7, src8, src9);
  3462. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  3463. ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
  3464. ILVR_B2_SB(src8, src7, src9, src8, src87_r, src98_r);
  3465. for (loop_cnt = 8; loop_cnt--;) {
  3466. LD_SB4(src, src_stride, src3, src4, src5, src6);
  3467. LD_SB4(src + 16, src_stride, src10, src11, src12, src13);
  3468. src += (4 * src_stride);
  3469. XORI_B4_128_SB(src3, src4, src5, src6);
  3470. XORI_B4_128_SB(src10, src11, src12, src13);
  3471. ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
  3472. ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
  3473. ILVRL_B2_SB(src5, src4, src54_r, src54_l);
  3474. ILVRL_B2_SB(src6, src5, src65_r, src65_l);
  3475. ILVR_B2_SB(src10, src9, src11, src10, src109_r, src1110_r);
  3476. ILVR_B2_SB(src12, src11, src13, src12, src1211_r, src1312_r);
  3477. dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
  3478. dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
  3479. dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
  3480. dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
  3481. dst4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
  3482. dst5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
  3483. dst6 = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
  3484. dst7 = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
  3485. dst8 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
  3486. dst9 = HEVC_FILT_4TAP_SH(src98_r, src1110_r, filt0, filt1);
  3487. dst10 = HEVC_FILT_4TAP_SH(src109_r, src1211_r, filt0, filt1);
  3488. dst11 = HEVC_FILT_4TAP_SH(src1110_r, src1312_r, filt0, filt1);
  3489. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
  3490. offset_vec, rnd_vec, dst0, dst1, dst2,
  3491. dst3);
  3492. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
  3493. offset_vec, rnd_vec, dst4, dst5, dst6,
  3494. dst7);
  3495. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
  3496. offset_vec, rnd_vec, dst8, dst9, dst10,
  3497. dst11);
  3498. PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
  3499. out2, out3);
  3500. PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
  3501. ST_UB4(out0, out1, out2, out3, dst, dst_stride);
  3502. ST8x4_UB(out4, out5, dst + 16, dst_stride);
  3503. dst += (4 * dst_stride);
  3504. src2 = src6;
  3505. src9 = src13;
  3506. src10_r = src54_r;
  3507. src21_r = src65_r;
  3508. src10_l = src54_l;
  3509. src21_l = src65_l;
  3510. src87_r = src1211_r;
  3511. src98_r = src1312_r;
  3512. }
  3513. }
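/* Vertical 4-tap uni-weighted filter for 32-wide blocks, processed as two
 * independent 16-wide columns, two rows per iteration. */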
  3514. static void hevc_vt_uniwgt_4t_32w_msa(uint8_t *src,
  3515. int32_t src_stride,
  3516. uint8_t *dst,
  3517. int32_t dst_stride,
  3518. const int8_t *filter,
  3519. int32_t height,
  3520. int32_t weight,
  3521. int32_t offset,
  3522. int32_t rnd_val)
  3523. {
  3524. uint32_t loop_cnt;
  3525. v16u8 out0, out1, out2, out3;
  3526. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
  3527. v16i8 src10_r, src32_r, src76_r, src98_r;
  3528. v16i8 src21_r, src43_r, src65_r, src87_r;
  3529. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  3530. v16i8 src10_l, src32_l, src76_l, src98_l;
  3531. v16i8 src21_l, src43_l, src65_l, src87_l;
  3532. v8i16 filt0, filt1;
  3533. v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
  3534. v4i32 weight_vec, rnd_vec;
  3535. src -= src_stride;
  3536. weight = weight & 0x0000FFFF;
  3537. weight_vec = __msa_fill_w(weight);
  3538. rnd_vec = __msa_fill_w(rnd_val);
  3539. weight *= 128;
  3540. rnd_val -= 6;
  3541. weight_vec_h = __msa_fill_h(weight);
  3542. offset_vec = __msa_fill_h(offset);
  3543. denom_vec = __msa_fill_h(rnd_val);
  3544. weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
  3545. offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
  3546. filter_vec = LD_SH(filter);
  3547. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3548. LD_SB3(src, src_stride, src0, src1, src2);
  3549. LD_SB3(src + 16, src_stride, src5, src6, src7);
  3550. src += (3 * src_stride);
  3551. XORI_B6_128_SB(src0, src1, src2, src5, src6, src7);
  3552. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  3553. ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
  3554. ILVR_B2_SB(src6, src5, src7, src6, src65_r, src76_r);
  3555. ILVL_B2_SB(src6, src5, src7, src6, src65_l, src76_l);
  3556. for (loop_cnt = (height >> 1); loop_cnt--;) {
  3557. LD_SB2(src, src_stride, src3, src4);
  3558. LD_SB2(src + 16, src_stride, src8, src9);
  3559. src += (2 * src_stride);
  3560. XORI_B4_128_SB(src3, src4, src8, src9);
  3561. ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
  3562. ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
  3563. ILVRL_B2_SB(src8, src7, src87_r, src87_l);
  3564. ILVRL_B2_SB(src9, src8, src98_r, src98_l);
  3565. dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
  3566. dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
  3567. dst2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
  3568. dst3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
  3569. dst4 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
  3570. dst5 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
  3571. dst6 = HEVC_FILT_4TAP_SH(src65_l, src87_l, filt0, filt1);
  3572. dst7 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
  3573. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
  3574. offset_vec, rnd_vec, dst0, dst1, dst2,
  3575. dst3);
  3576. HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
  3577. offset_vec, rnd_vec, dst4, dst5, dst6,
  3578. dst7);
  3579. PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst6, dst4, dst7, dst5, out0, out1,
  3580. out2, out3);
  3581. ST_UB2(out0, out2, dst, 16);
  3582. dst += dst_stride;
  3583. ST_UB2(out1, out3, dst, 16);
  3584. dst += dst_stride;
  3585. src2 = src4;
  3586. src7 = src9;
  3587. src10_r = src32_r;
  3588. src21_r = src43_r;
  3589. src10_l = src32_l;
  3590. src21_l = src43_l;
  3591. src65_r = src87_r;
  3592. src76_r = src98_r;
  3593. src65_l = src87_l;
  3594. src76_l = src98_l;
  3595. }
  3596. }
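/* The 2-D (hv) uni-weighted filters below run the horizontal 4-tap filter
 * first, keep the result at 16-bit precision, apply the vertical 4-tap
 * filter to the interleaved intermediates, shift the 32-bit sums down by
 * 6, and only then apply the same weight/round/offset/clip stage as the
 * 1-D paths.  The 4-wide variants load the second half of
 * ff_hevc_mask_arr, whose indices pull from both shuffle operands so one
 * horizontal pass covers two source rows at once. */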
  3597. static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src,
  3598. int32_t src_stride,
  3599. uint8_t *dst,
  3600. int32_t dst_stride,
  3601. const int8_t *filter_x,
  3602. const int8_t *filter_y,
  3603. int32_t weight,
  3604. int32_t offset,
  3605. int32_t rnd_val)
  3606. {
  3607. v16u8 out;
  3608. v16i8 src0, src1, src2, src3, src4;
  3609. v8i16 filt0, filt1;
  3610. v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
  3611. v16i8 mask1;
  3612. v8i16 filt_h0, filt_h1, filter_vec, tmp;
  3613. v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
  3614. v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
  3615. v8i16 offset_vec, const_128, denom_vec;
  3616. v4i32 dst0, dst1, weight_vec, rnd_vec;
  3617. src -= (src_stride + 1);
  3618. filter_vec = LD_SH(filter_x);
  3619. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3620. filter_vec = LD_SH(filter_y);
  3621. UNPCK_R_SB_SH(filter_vec, filter_vec);
  3622. SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
  3623. mask1 = mask0 + 2;
  3624. weight_vec = __msa_fill_w(weight);
  3625. rnd_vec = __msa_fill_w(rnd_val);
  3626. offset_vec = __msa_fill_h(offset);
  3627. denom_vec = __msa_fill_h(rnd_val - 6);
  3628. const_128 = __msa_fill_h((128 * weight));
  3629. offset_vec += __msa_srar_h(const_128, denom_vec);
  3630. LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  3631. XORI_B5_128_SB(src0, src1, src2, src3, src4);
  3632. VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
  3633. VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
  3634. VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
  3635. dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3636. dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  3637. dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  3638. ILVRL_H2_SH(dst31, dst20, dst10, dst32);
  3639. ILVRL_H2_SH(dst42, dst31, dst21, dst43);
  3640. dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
  3641. dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
  3642. dst0 >>= 6;
  3643. dst1 >>= 6;
  3644. MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
  3645. SRAR_W2_SW(dst0, dst1, rnd_vec);
  3646. tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
  3647. tmp += offset_vec;
  3648. tmp = CLIP_SH_0_255_MAX_SATU(tmp);
  3649. out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
  3650. ST4x2_UB(out, dst, dst_stride);
  3651. }
  3652. static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src,
  3653. int32_t src_stride,
  3654. uint8_t *dst,
  3655. int32_t dst_stride,
  3656. const int8_t *filter_x,
  3657. const int8_t *filter_y,
  3658. int32_t weight,
  3659. int32_t offset,
  3660. int32_t rnd_val)
  3661. {
  3662. v16u8 out;
  3663. v16i8 src0, src1, src2, src3, src4, src5, src6;
  3664. v8i16 filt0, filt1;
  3665. v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1;
  3666. v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
  3667. v16i8 mask1;
  3668. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  3669. v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65;
  3670. v8i16 offset_vec, const_128, denom_vec;
  3671. v4i32 dst0, dst1, dst2, dst3, weight_vec, rnd_vec;
  3672. src -= (src_stride + 1);
  3673. filter_vec = LD_SH(filter_x);
  3674. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3675. filter_vec = LD_SH(filter_y);
  3676. UNPCK_R_SB_SH(filter_vec, filter_vec);
  3677. SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
  3678. mask1 = mask0 + 2;
  3679. weight_vec = __msa_fill_w(weight);
  3680. rnd_vec = __msa_fill_w(rnd_val);
  3681. offset_vec = __msa_fill_h(offset);
  3682. denom_vec = __msa_fill_h(rnd_val - 6);
  3683. const_128 = __msa_fill_h((128 * weight));
  3684. offset_vec += __msa_srar_h(const_128, denom_vec);
  3685. LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  3686. XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  3687. VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
  3688. VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
  3689. VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
  3690. VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
  3691. dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3692. dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  3693. dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  3694. dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  3695. ILVRL_H2_SH(dst41, dst30, dst10, dst43);
  3696. ILVRL_H2_SH(dst52, dst41, dst21, dst54);
  3697. ILVRL_H2_SH(dst63, dst52, dst32, dst65);
  3698. dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
  3699. dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
  3700. dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
  3701. dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
  3702. SRA_4V(dst0, dst1, dst2, dst3, 6);
  3703. MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
  3704. MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
  3705. SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
  3706. PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
  3707. ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
  3708. CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
  3709. out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
  3710. ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
  3711. }
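/* 2-D 4-tap uni-weighted filter for 4-wide blocks with height a multiple
 * of 8.  Two source rows share each horizontal shuffle/filter call, and
 * the last two vertical context rows are carried between iterations. */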
  3712. static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src,
  3713. int32_t src_stride,
  3714. uint8_t *dst,
  3715. int32_t dst_stride,
  3716. const int8_t *filter_x,
  3717. const int8_t *filter_y,
  3718. int32_t height,
  3719. int32_t weight,
  3720. int32_t offset,
  3721. int32_t rnd_val)
  3722. {
  3723. uint32_t loop_cnt;
  3724. v16u8 out0, out1;
  3725. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  3726. v8i16 filt0, filt1;
  3727. v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
  3728. v16i8 mask1;
  3729. v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
  3730. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  3731. v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
  3732. v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
  3733. v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
  3734. v8i16 dst98_r, dst109_r, offset_vec, const_128, denom_vec;
  3735. v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;
  3736. src -= (src_stride + 1);
  3737. filter_vec = LD_SH(filter_x);
  3738. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3739. filter_vec = LD_SH(filter_y);
  3740. UNPCK_R_SB_SH(filter_vec, filter_vec);
  3741. SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
  3742. mask1 = mask0 + 2;
  3743. weight_vec = __msa_fill_w(weight);
  3744. rnd_vec = __msa_fill_w(rnd_val);
  3745. offset_vec = __msa_fill_h(offset);
  3746. denom_vec = __msa_fill_h(rnd_val - 6);
  3747. const_128 = __msa_fill_h((128 * weight));
  3748. offset_vec += __msa_srar_h(const_128, denom_vec);
  3749. LD_SB3(src, src_stride, src0, src1, src2);
  3750. src += (3 * src_stride);
  3751. XORI_B3_128_SB(src0, src1, src2);
  3752. VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
  3753. VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
  3754. dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3755. dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  3756. ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
  3757. dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
  3758. for (loop_cnt = height >> 3; loop_cnt--;) {
  3759. LD_SB8(src, src_stride,
  3760. src3, src4, src5, src6, src7, src8, src9, src10);
  3761. src += (8 * src_stride);
  3762. XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
  3763. VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
  3764. VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
  3765. VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
  3766. VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
  3767. dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3768. dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  3769. dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  3770. dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  3771. dst32_r = __msa_ilvr_h(dst73, dst22);
  3772. ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
  3773. ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
  3774. ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
  3775. dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
  3776. dst76_r = __msa_ilvr_h(dst22, dst106);
  3777. dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
  3778. dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
  3779. dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
  3780. dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
  3781. dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
  3782. dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
  3783. dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
  3784. dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
  3785. SRA_4V(dst0, dst1, dst2, dst3, 6);
  3786. SRA_4V(dst4, dst5, dst6, dst7, 6);
  3787. MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
  3788. MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
  3789. MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
  3790. MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
  3791. SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
  3792. SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
  3793. PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
  3794. tmp2, tmp3);
  3795. ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
  3796. ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
  3797. CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
  3798. PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  3799. ST4x8_UB(out0, out1, dst, dst_stride);
  3800. dst += (8 * dst_stride);
  3801. dst10_r = dst98_r;
  3802. dst21_r = dst109_r;
  3803. dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
  3804. }
  3805. }
  3806. static void hevc_hv_uniwgt_4t_4w_msa(uint8_t *src,
  3807. int32_t src_stride,
  3808. uint8_t *dst,
  3809. int32_t dst_stride,
  3810. const int8_t *filter_x,
  3811. const int8_t *filter_y,
  3812. int32_t height,
  3813. int32_t weight,
  3814. int32_t offset,
  3815. int32_t rnd_val)
  3816. {
  3817. if (2 == height) {
  3818. hevc_hv_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
  3819. filter_x, filter_y, weight,
  3820. offset, rnd_val);
  3821. } else if (4 == height) {
  3822. hevc_hv_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
3823. filter_x, filter_y, weight,
  3824. offset, rnd_val);
  3825. } else if (0 == (height % 8)) {
  3826. hevc_hv_uniwgt_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
  3827. filter_x, filter_y, height, weight,
  3828. offset, rnd_val);
  3829. }
  3830. }
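/* 2-D 4-tap uni-weighted filter for 6-wide blocks: eight rows are produced
 * in a single pass (height is never consulted), with the first four
 * columns and the remaining two columns weighted separately before the
 * 4-byte and 2-byte stores. */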
  3831. static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src,
  3832. int32_t src_stride,
  3833. uint8_t *dst,
  3834. int32_t dst_stride,
  3835. const int8_t *filter_x,
  3836. const int8_t *filter_y,
  3837. int32_t height,
  3838. int32_t weight,
  3839. int32_t offset,
  3840. int32_t rnd_val)
  3841. {
  3842. v16u8 out0, out1, out2;
  3843. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  3844. v8i16 filt0, filt1;
  3845. v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
  3846. v16i8 mask1;
  3847. v8i16 filt_h0, filt_h1, filter_vec;
  3848. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  3849. v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
  3850. v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  3851. v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
  3852. v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
  3853. v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
  3854. v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
  3855. v8i16 offset_vec, const_128, denom_vec;
  3856. v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
  3857. v4i32 dst0_l, dst1_l, dst2_l, dst3_l, weight_vec, rnd_vec;
  3858. src -= (src_stride + 1);
  3859. filter_vec = LD_SH(filter_x);
  3860. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3861. filter_vec = LD_SH(filter_y);
  3862. UNPCK_R_SB_SH(filter_vec, filter_vec);
  3863. SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
  3864. mask1 = mask0 + 2;
  3865. weight_vec = __msa_fill_w(weight);
  3866. rnd_vec = __msa_fill_w(rnd_val);
  3867. offset_vec = __msa_fill_h(offset);
  3868. denom_vec = __msa_fill_h(rnd_val - 6);
  3869. const_128 = __msa_fill_h((128 * weight));
  3870. offset_vec += __msa_srar_h(const_128, denom_vec);
  3871. LD_SB3(src, src_stride, src0, src1, src2);
  3872. src += (3 * src_stride);
  3873. XORI_B3_128_SB(src0, src1, src2);
  3874. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  3875. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
  3876. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
  3877. dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3878. dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  3879. dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  3880. ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
  3881. ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
  3882. LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
  3883. XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
  3884. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
  3885. VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
  3886. VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
  3887. VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
  3888. dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3889. dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  3890. dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  3891. dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  3892. VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
  3893. VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
  3894. VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
  3895. VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
  3896. dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3897. dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  3898. dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  3899. dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  3900. ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
  3901. ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
  3902. ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
  3903. ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
  3904. ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
  3905. ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
  3906. ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
  3907. ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
  3908. PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
  3909. PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
  3910. dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
  3911. dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
  3912. dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
  3913. dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
  3914. dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
  3915. dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
  3916. dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
  3917. dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
  3918. dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
  3919. dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
  3920. dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
  3921. dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
  3922. dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
  3923. SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
  3924. SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
  3925. SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
  3926. MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
  3927. MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
  3928. MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
  3929. MUL2(dst6_r, weight_vec, dst7_r, weight_vec, dst6_r, dst7_r);
  3930. MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
  3931. MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
  3932. SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
  3933. SRAR_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, rnd_vec);
  3934. SRAR_W4_SW(dst0_l, dst1_l, dst2_l, dst3_l, rnd_vec);
  3935. PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
  3936. PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
  3937. PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
  3938. ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
  3939. ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
  3940. ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
  3941. CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
  3942. CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
  3943. PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
  3944. ST4x8_UB(out0, out1, dst, dst_stride);
  3945. ST2x4_UB(out2, 0, dst + 4, dst_stride);
  3946. dst += 4 * dst_stride;
  3947. ST2x4_UB(out2, 4, dst + 4, dst_stride);
  3948. }
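/* 2-D 4-tap uni-weighted filter for 8x2 blocks: five source rows are
 * filtered horizontally, then the right and left interleaved halves of the
 * intermediates feed the vertical filter for the two output rows. */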
  3949. static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src,
  3950. int32_t src_stride,
  3951. uint8_t *dst,
  3952. int32_t dst_stride,
  3953. const int8_t *filter_x,
  3954. const int8_t *filter_y,
  3955. int32_t weight,
  3956. int32_t offset,
  3957. int32_t rnd_val)
  3958. {
  3959. v16u8 out;
  3960. v16i8 src0, src1, src2, src3, src4;
  3961. v8i16 filt0, filt1;
  3962. v8i16 filt_h0, filt_h1, filter_vec;
  3963. v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
  3964. v16i8 mask1;
  3965. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  3966. v8i16 dst0, dst1, dst2, dst3, dst4;
  3967. v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
  3968. v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
  3969. v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
  3970. v8i16 tmp0, tmp1;
  3971. v8i16 offset_vec, const_128, denom_vec;
  3972. v4i32 weight_vec, rnd_vec;
  3973. src -= (src_stride + 1);
  3974. filter_vec = LD_SH(filter_x);
  3975. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3976. filter_vec = LD_SH(filter_y);
  3977. UNPCK_R_SB_SH(filter_vec, filter_vec);
  3978. SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
  3979. mask1 = mask0 + 2;
  3980. weight_vec = __msa_fill_w(weight);
  3981. rnd_vec = __msa_fill_w(rnd_val);
  3982. offset_vec = __msa_fill_h(offset);
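    /* XORI_B*_128 below biases each source byte by -128; after the two 4-tap
     * passes (coefficient sums of 64, >> 6 in between) and the weight
     * multiply this leaves a constant error of (128 * weight) >> (rnd_val - 6),
     * which is folded back into the offset here. */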
  3983. denom_vec = __msa_fill_h(rnd_val - 6);
  3984. const_128 = __msa_fill_h((128 * weight));
  3985. offset_vec += __msa_srar_h(const_128, denom_vec);
  3986. LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  3987. XORI_B5_128_SB(src0, src1, src2, src3, src4);
  3988. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  3989. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
  3990. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
  3991. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
  3992. VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
  3993. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3994. dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  3995. dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  3996. dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  3997. dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
  3998. ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
  3999. ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
  4000. ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
  4001. ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
  4002. dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
  4003. dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
  4004. dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
  4005. dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
  4006. SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
  4007. MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
  4008. MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
  4009. SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
  4010. PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
  4011. ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
  4012. CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
  4013. out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
  4014. ST8x2_UB(out, dst, dst_stride);
  4015. }
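/* hv 4-tap uni-weighted filter for blocks that are a multiple of 8 wide and
 * exactly 4 rows high: each 8-column stripe loads 7 source rows, keeps the
 * horizontal results in registers and emits 4 weighted output rows. */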
  4016. static void hevc_hv_uniwgt_4t_8multx4_msa(uint8_t *src,
  4017. int32_t src_stride,
  4018. uint8_t *dst,
  4019. int32_t dst_stride,
  4020. const int8_t *filter_x,
  4021. const int8_t *filter_y,
  4022. int32_t width8mult,
  4023. int32_t weight,
  4024. int32_t offset,
  4025. int32_t rnd_val)
  4026. {
  4027. uint32_t cnt;
  4028. v16u8 out0, out1;
  4029. v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
  4030. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  4031. v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
  4032. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
  4033. v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
  4034. v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
  4035. v8i16 offset_vec, const_128, denom_vec;
  4036. v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
  4037. v4i32 weight_vec, rnd_vec;
  4038. src -= (src_stride + 1);
  4039. filter_vec = LD_SH(filter_x);
  4040. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  4041. filter_vec = LD_SH(filter_y);
  4042. UNPCK_R_SB_SH(filter_vec, filter_vec);
  4043. SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
  4044. mask0 = LD_SB(ff_hevc_mask_arr);
  4045. mask1 = mask0 + 2;
  4046. weight_vec = __msa_fill_w(weight);
  4047. rnd_vec = __msa_fill_w(rnd_val);
  4048. offset_vec = __msa_fill_h(offset);
  4049. denom_vec = __msa_fill_h(rnd_val - 6);
  4050. const_128 = __msa_fill_h((128 * weight));
  4051. offset_vec += __msa_srar_h(const_128, denom_vec);
  4052. for (cnt = width8mult; cnt--;) {
  4053. LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  4054. src += 8;
  4055. XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  4056. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  4057. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
  4058. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
  4059. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  4060. dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  4061. dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  4062. ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
  4063. ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
  4064. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
  4065. VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
  4066. VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
  4067. VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
  4068. dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  4069. dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  4070. dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  4071. dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  4072. ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
  4073. ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
  4074. ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
  4075. ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
  4076. dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
  4077. dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
  4078. dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
  4079. dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
  4080. dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
  4081. dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
  4082. dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
  4083. dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
  4084. SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
  4085. SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
  4086. MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
  4087. MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
  4088. MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
  4089. MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
  4090. SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
  4091. SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
  4092. PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
  4093. dst3_r, tmp0, tmp1, tmp2, tmp3);
  4094. ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
  4095. ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
  4096. CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
  4097. PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  4098. ST8x4_UB(out0, out1, dst, dst_stride);
  4099. dst += 8;
  4100. }
  4101. }
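/* hv 4-tap uni-weighted filter for a single 8x6 block: nine source rows are
 * filtered horizontally up front, then six vertically filtered rows are
 * weighted, rounded, offset, clipped and stored as 8x4 followed by 8x2. */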
  4102. static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src,
  4103. int32_t src_stride,
  4104. uint8_t *dst,
  4105. int32_t dst_stride,
  4106. const int8_t *filter_x,
  4107. const int8_t *filter_y,
  4108. int32_t weight,
  4109. int32_t offset,
  4110. int32_t rnd_val)
  4111. {
  4112. v16u8 out0, out1, out2;
  4113. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  4114. v8i16 filt0, filt1;
  4115. v8i16 filt_h0, filt_h1, filter_vec;
  4116. v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
  4117. v16i8 mask1;
  4118. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  4119. v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
  4120. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
  4121. v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
  4122. v4i32 dst4_r, dst4_l, dst5_r, dst5_l, weight_vec, rnd_vec;
  4123. v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
  4124. v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
  4125. v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
  4126. v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
  4127. v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  4128. v8i16 offset_vec, const_128, denom_vec;
  4129. src -= (src_stride + 1);
  4130. filter_vec = LD_SH(filter_x);
  4131. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  4132. filter_vec = LD_SH(filter_y);
  4133. UNPCK_R_SB_SH(filter_vec, filter_vec);
  4134. SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
  4135. mask1 = mask0 + 2;
  4136. weight_vec = __msa_fill_w(weight);
  4137. rnd_vec = __msa_fill_w(rnd_val);
  4138. offset_vec = __msa_fill_h(offset);
  4139. denom_vec = __msa_fill_h(rnd_val - 6);
  4140. const_128 = __msa_fill_h((128 * weight));
  4141. offset_vec += __msa_srar_h(const_128, denom_vec);
  4142. LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  4143. src += (5 * src_stride);
  4144. LD_SB4(src, src_stride, src5, src6, src7, src8);
  4145. XORI_B5_128_SB(src0, src1, src2, src3, src4);
  4146. XORI_B4_128_SB(src5, src6, src7, src8);
  4147. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  4148. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
  4149. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
  4150. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
  4151. VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
  4152. VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
  4153. VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
  4154. VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
  4155. VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
  4156. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  4157. dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  4158. dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  4159. dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  4160. dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
  4161. dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
  4162. dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
  4163. dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
  4164. dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
  4165. ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
  4166. ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
  4167. ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
  4168. ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
  4169. ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
  4170. ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
  4171. ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
  4172. ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
  4173. dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
  4174. dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
  4175. dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
  4176. dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
  4177. dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
  4178. dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
  4179. dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
  4180. dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
  4181. dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
  4182. dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
  4183. dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
  4184. dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
  4185. SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
  4186. SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
  4187. SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
  4188. MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
  4189. MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
  4190. MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
  4191. MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
  4192. MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
  4193. MUL2(dst4_l, weight_vec, dst5_l, weight_vec, dst4_l, dst5_l);
  4194. SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
  4195. SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
  4196. SRAR_W4_SW(dst4_r, dst4_l, dst5_r, dst5_l, rnd_vec);
  4197. PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
  4198. tmp0, tmp1, tmp2, tmp3);
  4199. PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
  4200. ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
  4201. ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
  4202. ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
  4203. CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
  4204. CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
  4205. PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
  4206. ST8x4_UB(out0, out1, dst, dst_stride);
  4207. dst += (4 * dst_stride);
  4208. ST8x2_UB(out2, dst, dst_stride);
  4209. }
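/* general hv 4-tap uni-weighted filter for widths that are a multiple of 8
 * and heights that are a multiple of 4: for each 8-column stripe the first
 * three rows are filtered horizontally once, then the inner loop produces
 * 4 output rows per iteration and carries the trailing row context over. */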
  4210. static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src,
  4211. int32_t src_stride,
  4212. uint8_t *dst,
  4213. int32_t dst_stride,
  4214. const int8_t *filter_x,
  4215. const int8_t *filter_y,
  4216. int32_t height,
  4217. int32_t weight,
  4218. int32_t offset,
  4219. int32_t rnd_val,
  4220. int32_t width8mult)
  4221. {
  4222. uint32_t loop_cnt, cnt;
  4223. uint8_t *src_tmp;
  4224. uint8_t *dst_tmp;
  4225. v16u8 out0, out1;
  4226. v16i8 src0, src1, src2, src3, src4, src5, src6;
  4227. v8i16 filt0, filt1;
  4228. v8i16 filt_h0, filt_h1, filter_vec;
  4229. v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
  4230. v16i8 mask1;
  4231. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  4232. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
  4233. v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
  4234. v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
  4235. v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
  4236. v8i16 offset_vec, const_128, denom_vec;
  4237. v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
  4238. v4i32 weight_vec, rnd_vec;
  4239. src -= (src_stride + 1);
  4240. filter_vec = LD_SH(filter_x);
  4241. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  4242. filter_vec = LD_SH(filter_y);
  4243. UNPCK_R_SB_SH(filter_vec, filter_vec);
  4244. SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
  4245. mask1 = mask0 + 2;
  4246. weight_vec = __msa_fill_w(weight);
  4247. rnd_vec = __msa_fill_w(rnd_val);
  4248. offset_vec = __msa_fill_h(offset);
  4249. denom_vec = __msa_fill_h(rnd_val - 6);
  4250. const_128 = __msa_fill_h((128 * weight));
  4251. offset_vec += __msa_srar_h(const_128, denom_vec);
  4252. for (cnt = width8mult; cnt--;) {
  4253. src_tmp = src;
  4254. dst_tmp = dst;
  4255. LD_SB3(src_tmp, src_stride, src0, src1, src2);
  4256. src_tmp += (3 * src_stride);
  4257. XORI_B3_128_SB(src0, src1, src2);
  4258. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  4259. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
  4260. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
  4261. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  4262. dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  4263. dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  4264. ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
  4265. ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
  4266. for (loop_cnt = height >> 2; loop_cnt--;) {
  4267. LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
  4268. src_tmp += (4 * src_stride);
  4269. XORI_B4_128_SB(src3, src4, src5, src6);
  4270. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
  4271. VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
  4272. VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
  4273. VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
  4274. dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  4275. dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  4276. dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  4277. dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  4278. ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
  4279. ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
  4280. ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
  4281. ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
  4282. dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
  4283. dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
  4284. dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
  4285. dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
  4286. dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
  4287. dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
  4288. dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
  4289. dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
  4290. SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
  4291. SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
  4292. MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
  4293. MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
  4294. MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
  4295. MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
  4296. SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
  4297. SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
  4298. PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
  4299. dst3_r, tmp0, tmp1, tmp2, tmp3);
  4300. ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
  4301. ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
  4302. CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
  4303. PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  4304. ST8x4_UB(out0, out1, dst_tmp, dst_stride);
  4305. dst_tmp += (4 * dst_stride);
  4306. dst10_r = dst54_r;
  4307. dst10_l = dst54_l;
  4308. dst21_r = dst65_r;
  4309. dst21_l = dst65_l;
  4310. dst2 = dst6;
  4311. }
  4312. src += 8;
  4313. dst += 8;
  4314. }
  4315. }
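/* 8-wide hv uni-weighted dispatcher: selects the 8x2, 8x4, 8x6 or generic
 * multiple-of-4-height kernel based on height. */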
  4316. static void hevc_hv_uniwgt_4t_8w_msa(uint8_t *src,
  4317. int32_t src_stride,
  4318. uint8_t *dst,
  4319. int32_t dst_stride,
  4320. const int8_t *filter_x,
  4321. const int8_t *filter_y,
  4322. int32_t height,
  4323. int32_t weight,
  4324. int32_t offset,
  4325. int32_t rnd_val)
  4326. {
  4327. if (2 == height) {
  4328. hevc_hv_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
  4329. filter_x, filter_y, weight,
  4330. offset, rnd_val);
  4331. } else if (4 == height) {
  4332. hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
  4333. filter_x, filter_y, 1, weight,
  4334. offset, rnd_val);
  4335. } else if (6 == height) {
  4336. hevc_hv_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
  4337. filter_x, filter_y, weight,
  4338. offset, rnd_val);
  4339. } else if (0 == (height % 4)) {
  4340. hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
  4341. filter_x, filter_y, height, weight,
  4342. offset, rnd_val, 1);
  4343. }
  4344. }
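/* 12-wide hv 4-tap uni-weighted filter: the left 8 columns go through the
 * generic 8-wide flow, the remaining 4 columns use the 4-width shuffle masks;
 * the loop counts correspond to a height of 16. */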
  4345. static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src,
  4346. int32_t src_stride,
  4347. uint8_t *dst,
  4348. int32_t dst_stride,
  4349. const int8_t *filter_x,
  4350. const int8_t *filter_y,
  4351. int32_t height,
  4352. int32_t weight,
  4353. int32_t offset,
  4354. int32_t rnd_val)
  4355. {
  4356. uint32_t loop_cnt;
  4357. uint8_t *src_tmp, *dst_tmp;
  4358. v16u8 out0, out1;
  4359. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  4360. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  4361. v16i8 mask0, mask1, mask2, mask3;
  4362. v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
  4363. v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
  4364. v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
  4365. v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
  4366. v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
  4367. v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
  4368. v8i16 offset_vec, const_128, denom_vec;
  4369. v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
  4370. v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;
  4371. src -= (src_stride + 1);
  4372. filter_vec = LD_SH(filter_x);
  4373. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  4374. filter_vec = LD_SH(filter_y);
  4375. UNPCK_R_SB_SH(filter_vec, filter_vec);
  4376. SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
  4377. mask0 = LD_SB(ff_hevc_mask_arr);
  4378. mask1 = mask0 + 2;
  4379. weight_vec = __msa_fill_w(weight);
  4380. rnd_vec = __msa_fill_w(rnd_val);
  4381. offset_vec = __msa_fill_h(offset);
  4382. denom_vec = __msa_fill_h(rnd_val - 6);
  4383. const_128 = __msa_fill_h((128 * weight));
  4384. offset_vec += __msa_srar_h(const_128, denom_vec);
  4385. src_tmp = src;
  4386. dst_tmp = dst;
  4387. LD_SB3(src_tmp, src_stride, src0, src1, src2);
  4388. src_tmp += (3 * src_stride);
  4389. XORI_B3_128_SB(src0, src1, src2);
  4390. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  4391. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
  4392. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
  4393. dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  4394. dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  4395. dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  4396. ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
  4397. ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
  4398. for (loop_cnt = 4; loop_cnt--;) {
  4399. LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
  4400. src_tmp += (4 * src_stride);
  4401. XORI_B4_128_SB(src3, src4, src5, src6);
  4402. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
  4403. VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
  4404. VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
  4405. VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
  4406. dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  4407. dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  4408. dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  4409. dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  4410. ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
  4411. ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
  4412. ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
  4413. ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
  4414. dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
  4415. dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
  4416. dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
  4417. dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
  4418. dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
  4419. dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
  4420. dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
  4421. dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
  4422. SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
  4423. SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
  4424. MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
  4425. MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
  4426. MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
  4427. MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
  4428. SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
  4429. SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
  4430. PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
  4431. dst3_r, tmp0, tmp1, tmp2, tmp3);
  4432. ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
  4433. ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
  4434. CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
  4435. PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  4436. ST8x4_UB(out0, out1, dst_tmp, dst_stride);
  4437. dst_tmp += (4 * dst_stride);
  4438. dst10_r = dst54_r;
  4439. dst10_l = dst54_l;
  4440. dst21_r = dst65_r;
  4441. dst21_l = dst65_l;
  4442. dsth2 = dsth6;
  4443. }
  4444. src += 8;
  4445. dst += 8;
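    /* remaining 4 columns: switch to the 4-width shuffle masks and filter
     * two rows per vector, producing 8 output rows per loop iteration */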
  4446. mask2 = LD_SB(ff_hevc_mask_arr + 16);
  4447. mask3 = mask2 + 2;
  4448. LD_SB3(src, src_stride, src0, src1, src2);
  4449. src += (3 * src_stride);
  4450. XORI_B3_128_SB(src0, src1, src2);
  4451. VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
  4452. VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
  4453. dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  4454. dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  4455. ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
  4456. dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
  4457. for (loop_cnt = 2; loop_cnt--;) {
  4458. LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
  4459. src10);
  4460. src += (8 * src_stride);
  4461. XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
  4462. VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
  4463. VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
  4464. VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
  4465. VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
  4466. dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  4467. dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
  4468. dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
  4469. dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
  4470. dst32_r = __msa_ilvr_h(dst73, dst22);
  4471. ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
  4472. ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
  4473. ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
  4474. dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
  4475. dst76_r = __msa_ilvr_h(dst22, dst106);
  4476. dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
  4477. dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
  4478. dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
  4479. dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
  4480. dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
  4481. dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
  4482. dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
  4483. dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
  4484. SRA_4V(dst0, dst1, dst2, dst3, 6);
  4485. SRA_4V(dst4, dst5, dst6, dst7, 6);
  4486. MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
  4487. MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
  4488. MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
  4489. MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
  4490. SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
  4491. SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
  4492. PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
  4493. tmp2, tmp3);
  4494. ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
  4495. ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
  4496. CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
  4497. PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  4498. ST4x8_UB(out0, out1, dst, dst_stride);
  4499. dst += (8 * dst_stride);
  4500. dst10_r = dst98_r;
  4501. dst21_r = dst109_r;
  4502. dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
  4503. }
  4504. }
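/* 16-wide dispatcher: two 8-column stripes, with a special case for
 * height 4. */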
  4505. static void hevc_hv_uniwgt_4t_16w_msa(uint8_t *src,
  4506. int32_t src_stride,
  4507. uint8_t *dst,
  4508. int32_t dst_stride,
  4509. const int8_t *filter_x,
  4510. const int8_t *filter_y,
  4511. int32_t height,
  4512. int32_t weight,
  4513. int32_t offset,
  4514. int32_t rnd_val)
  4515. {
  4516. if (4 == height) {
  4517. hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
  4518. filter_x, filter_y, 2, weight, offset,
  4519. rnd_val);
  4520. } else {
  4521. hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
  4522. filter_x, filter_y, height, weight,
  4523. offset, rnd_val, 2);
  4524. }
  4525. }
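/* 24-wide: three 8-column stripes through the generic multiple-of-4-height
 * kernel. */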
  4526. static void hevc_hv_uniwgt_4t_24w_msa(uint8_t *src,
  4527. int32_t src_stride,
  4528. uint8_t *dst,
  4529. int32_t dst_stride,
  4530. const int8_t *filter_x,
  4531. const int8_t *filter_y,
  4532. int32_t height,
  4533. int32_t weight,
  4534. int32_t offset,
  4535. int32_t rnd_val)
  4536. {
  4537. hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
  4538. filter_x, filter_y, height, weight,
  4539. offset, rnd_val, 3);
  4540. }
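/* 32-wide: four 8-column stripes through the generic multiple-of-4-height
 * kernel. */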
  4541. static void hevc_hv_uniwgt_4t_32w_msa(uint8_t *src,
  4542. int32_t src_stride,
  4543. uint8_t *dst,
  4544. int32_t dst_stride,
  4545. const int8_t *filter_x,
  4546. const int8_t *filter_y,
  4547. int32_t height,
  4548. int32_t weight,
  4549. int32_t offset,
  4550. int32_t rnd_val)
  4551. {
  4552. hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
  4553. filter_x, filter_y, height, weight,
  4554. offset, rnd_val, 4);
  4555. }
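/* Public entry points for uni-directional weighted prediction. The shift
 * passed to the MSA kernels is denom + 14 - 8, i.e. the HEVC weighted
 * prediction rounding for 8-bit content. UNIWGT_MC_COPY instantiates the
 * pel_pixels (copy) cases. */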
  4556. #define UNIWGT_MC_COPY(WIDTH) \
  4557. void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
  4558. ptrdiff_t dst_stride, \
  4559. uint8_t *src, \
  4560. ptrdiff_t src_stride, \
  4561. int height, \
  4562. int denom, \
  4563. int weight, \
  4564. int offset, \
  4565. intptr_t mx, \
  4566. intptr_t my, \
  4567. int width) \
  4568. { \
  4569. int shift = denom + 14 - 8; \
  4570. hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
  4571. height, weight, offset, shift); \
  4572. }
  4573. UNIWGT_MC_COPY(4);
  4574. UNIWGT_MC_COPY(6);
  4575. UNIWGT_MC_COPY(8);
  4576. UNIWGT_MC_COPY(12);
  4577. UNIWGT_MC_COPY(16);
  4578. UNIWGT_MC_COPY(24);
  4579. UNIWGT_MC_COPY(32);
  4580. UNIWGT_MC_COPY(48);
  4581. UNIWGT_MC_COPY(64);
  4582. #undef UNIWGT_MC_COPY
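/* UNI_W_MC instantiates the horizontal-only and vertical-only weighted qpel
 * (8-tap) and epel (4-tap) entry points; FILT_DIR selects whether the mx or
 * my fractional position indexes the filter table. */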
  4583. #define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
  4584. void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
  4585. ptrdiff_t \
  4586. dst_stride, \
  4587. uint8_t *src, \
  4588. ptrdiff_t \
  4589. src_stride, \
  4590. int height, \
  4591. int denom, \
  4592. int weight, \
  4593. int offset, \
  4594. intptr_t mx, \
  4595. intptr_t my, \
  4596. int width) \
  4597. { \
  4598. const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
  4599. int shift = denom + 14 - 8; \
  4600. \
  4601. hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
  4602. dst_stride, filter, height, \
  4603. weight, offset, shift); \
  4604. }
  4605. UNI_W_MC(qpel, h, 4, 8, hz, mx);
  4606. UNI_W_MC(qpel, h, 8, 8, hz, mx);
  4607. UNI_W_MC(qpel, h, 12, 8, hz, mx);
  4608. UNI_W_MC(qpel, h, 16, 8, hz, mx);
  4609. UNI_W_MC(qpel, h, 24, 8, hz, mx);
  4610. UNI_W_MC(qpel, h, 32, 8, hz, mx);
  4611. UNI_W_MC(qpel, h, 48, 8, hz, mx);
  4612. UNI_W_MC(qpel, h, 64, 8, hz, mx);
  4613. UNI_W_MC(qpel, v, 4, 8, vt, my);
  4614. UNI_W_MC(qpel, v, 8, 8, vt, my);
  4615. UNI_W_MC(qpel, v, 12, 8, vt, my);
  4616. UNI_W_MC(qpel, v, 16, 8, vt, my);
  4617. UNI_W_MC(qpel, v, 24, 8, vt, my);
  4618. UNI_W_MC(qpel, v, 32, 8, vt, my);
  4619. UNI_W_MC(qpel, v, 48, 8, vt, my);
  4620. UNI_W_MC(qpel, v, 64, 8, vt, my);
  4621. UNI_W_MC(epel, h, 4, 4, hz, mx);
  4622. UNI_W_MC(epel, h, 6, 4, hz, mx);
  4623. UNI_W_MC(epel, h, 8, 4, hz, mx);
  4624. UNI_W_MC(epel, h, 12, 4, hz, mx);
  4625. UNI_W_MC(epel, h, 16, 4, hz, mx);
  4626. UNI_W_MC(epel, h, 24, 4, hz, mx);
  4627. UNI_W_MC(epel, h, 32, 4, hz, mx);
  4628. UNI_W_MC(epel, v, 4, 4, vt, my);
  4629. UNI_W_MC(epel, v, 6, 4, vt, my);
  4630. UNI_W_MC(epel, v, 8, 4, vt, my);
  4631. UNI_W_MC(epel, v, 12, 4, vt, my);
  4632. UNI_W_MC(epel, v, 16, 4, vt, my);
  4633. UNI_W_MC(epel, v, 24, 4, vt, my);
  4634. UNI_W_MC(epel, v, 32, 4, vt, my);
  4635. #undef UNI_W_MC
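/* UNI_W_MC_HV instantiates the combined horizontal+vertical weighted entry
 * points; mx and my each index their respective filter tables. */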
  4636. #define UNI_W_MC_HV(PEL, WIDTH, TAP) \
  4637. void ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
  4638. ptrdiff_t dst_stride, \
  4639. uint8_t *src, \
  4640. ptrdiff_t src_stride, \
  4641. int height, \
  4642. int denom, \
  4643. int weight, \
  4644. int offset, \
  4645. intptr_t mx, \
  4646. intptr_t my, \
  4647. int width) \
  4648. { \
  4649. const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
  4650. const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
  4651. int shift = denom + 14 - 8; \
  4652. \
  4653. hevc_hv_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
  4654. filter_x, filter_y, height, \
  4655. weight, offset, shift); \
  4656. }
  4657. UNI_W_MC_HV(qpel, 4, 8);
  4658. UNI_W_MC_HV(qpel, 8, 8);
  4659. UNI_W_MC_HV(qpel, 12, 8);
  4660. UNI_W_MC_HV(qpel, 16, 8);
  4661. UNI_W_MC_HV(qpel, 24, 8);
  4662. UNI_W_MC_HV(qpel, 32, 8);
  4663. UNI_W_MC_HV(qpel, 48, 8);
  4664. UNI_W_MC_HV(qpel, 64, 8);
  4665. UNI_W_MC_HV(epel, 4, 4);
  4666. UNI_W_MC_HV(epel, 6, 4);
  4667. UNI_W_MC_HV(epel, 8, 4);
  4668. UNI_W_MC_HV(epel, 12, 4);
  4669. UNI_W_MC_HV(epel, 16, 4);
  4670. UNI_W_MC_HV(epel, 24, 4);
  4671. UNI_W_MC_HV(epel, 32, 4);
  4672. #undef UNI_W_MC_HV