/*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hpeldsp_mips.h"
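
/*
 * MSA (MIPS SIMD Architecture) half-pel DSP functions: put/avg pixel copies
 * and horizontal, vertical and diagonal (xy) bilinear interpolation, in
 * rounded and no-rounding variants, for 4, 8 and 16 pixel wide blocks.
 */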

/* Pack the even-indexed bytes of in0/in1, average the packed row with dst
 * and store one 16-byte row at pdst. */
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                      \
{                                                                 \
    v16u8 tmp_m;                                                  \
                                                                  \
    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);      \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);                   \
    ST_UB(tmp_m, (pdst));                                         \
}

/* Pack the even-indexed bytes of four vector pairs and store the
 * resulting four 16-byte rows. */
#define PCKEV_ST_SB4(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
    uint8_t *pdst_m = (uint8_t *) (pdst);                                   \
                                                                            \
    PCKEV_B4_SB(in0, in1, in2, in3, in4, in5, in6, in7,                     \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                            \
    ST_SB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst_m, stride);                 \
}

/* Pack four results, average them with the corresponding dst rows and
 * store four 8-byte rows. */
#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
                           pdst, stride)                                \
{                                                                       \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    uint8_t *pdst_m = (uint8_t *) (pdst);                               \
                                                                        \
    PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                    \
    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                \
    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);        \
    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                           \
}

/* Horizontal half-pel interpolation with rounding: each output pixel is the
 * rounded average of a source pixel and its right-hand neighbour. */
static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1;
    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        SLDI_B2_0_UB(src0, src1, src0_sld1, src1_sld1, 1);
        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;
    }
}

static void common_hz_bil_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        SLDI_B4_0_SB(src0, src1, src2, src3,
                     src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
        AVER_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                      src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_bil_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src8, src9, src10, src11, src12, src13, src14, src15);
        src += (8 * src_stride);

        AVER_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                       dst, dst_stride);
        dst += (4 * dst_stride);

        AVER_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                       dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 src4_sld1, src5_sld1, src6_sld1, src7_sld1;

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    SLDI_B4_0_SB(src0, src1, src2, src3,
                 src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
    SLDI_B4_0_SB(src4, src5, src6, src7,
                 src4_sld1, src5_sld1, src6_sld1, src7_sld1, 1);

    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST8x4_UB(src4, src4_sld1, src5, src5_sld1,
                 src6, src6_sld1, src7, src7_sld1, dst, dst_stride);
}

static void common_hz_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    SLDI_B4_0_SB(src0, src1, src2, src3,
                 src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
}

static void common_hz_bil_no_rnd_16x16_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);
    src += (8 * src_stride);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                  dst, dst_stride);
    dst += (4 * dst_stride);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    LD_UB4((src + 1), src_stride, src8, src9, src10, src11);
    src += (4 * src_stride);

    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                  dst, dst_stride);
    dst += (4 * dst_stride);

    LD_UB4(src, src_stride, src4, src5, src6, src7);
    LD_UB4((src + 1), src_stride, src12, src13, src14, src15);
    src += (4 * src_stride);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                  dst, dst_stride);
}

static void common_hz_bil_no_rnd_8x16_msa(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                  dst, dst_stride);
}

static void common_hz_bil_and_aver_dst_4w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t dst0, dst1, out0, out1;
    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
    v16u8 tmp0 = { 0 };
    v16u8 tmp1 = { 0 };

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        SLDI_B2_0_UB(src0, src1, src0_sld1, src1_sld1, 1);

        dst0 = LW(dst);
        dst1 = LW(dst + dst_stride);
        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);

        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);
        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;
    }
}

static void common_hz_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        SLDI_B4_0_SB(src0, src1, src2, src3,
                     src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
        AVER_DST_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, src2, src2_sld1,
                          src3, src3_sld1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src8, src9, src10, src11, src12, src13, src14, src15);
        src += (8 * src_stride);

        AVER_DST_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                           dst, dst_stride);
        dst += (4 * dst_stride);

        AVER_DST_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                           dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

/* Vertical half-pel interpolation with rounding: each output row is the
 * rounded average of a source row and the row below it. */
static void common_vt_bil_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1;
    v16u8 src0, src1, src2, res0, res1;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        AVER_UB2_UB(src0, src1, src1, src2, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;

        src0 = src2;
    }
}

static void common_vt_bil_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        AVER_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                      dst, dst_stride);
        dst += (4 * dst_stride);

        src0 = src4;
    }
}

static void common_vt_bil_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
        src += (8 * src_stride);

        AVER_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                       dst, dst_stride);
        dst += (4 * dst_stride);
        AVER_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                       dst, dst_stride);
        dst += (4 * dst_stride);

        src0 = src8;
    }
}

static void common_vt_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_UB(src);

    AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                 dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST8x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                 dst, dst_stride);
}

static void common_vt_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4;

    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
    AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                 dst, dst_stride);
}

static void common_vt_bil_no_rnd_16x16_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15, src16;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    LD_UB8(src, src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);
    src += (8 * src_stride);
    src16 = LD_UB(src);

    AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src8, src9, src9, src10, src10, src11, src11, src12,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src12, src13, src13, src14,
                  src14, src15, src15, src16, dst, dst_stride);
}

static void common_vt_bil_no_rnd_8x16_msa(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_UB(src);

    AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                  dst, dst_stride);
}

static void common_vt_bil_and_aver_dst_4w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1, dst0, dst1;
    v16u8 src0, src1, src2;
    v16u8 tmp0 = { 0 };
    v16u8 tmp1 = { 0 };
    v16u8 res0, res1;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        dst0 = LW(dst);
        dst1 = LW(dst + dst_stride);
        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);

        AVER_UB2_UB(src0, src1, src1, src2, res0, res1);
        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;

        src0 = src2;
    }
}

static void common_vt_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        AVER_DST_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                          dst, dst_stride);
        dst += (4 * dst_stride);

        src0 = src4;
    }
}

static void common_vt_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 res0, res1, res2, res3, res4, res5, res6, res7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
        src += (8 * src_stride);

        AVER_UB4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                    res0, res1, res2, res3);
        AVER_UB4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                    res4, res5, res6, res7);

        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        AVER_UB4_UB(dst0, res0, dst1, res1, dst2, res2, dst3, res3,
                    res0, res1, res2, res3);
        AVER_UB4_UB(dst4, res4, dst5, res5, dst6, res6, dst7, res7,
                    res4, res5, res6, res7);
        ST_UB8(res0, res1, res2, res3, res4, res5, res6, res7, dst, dst_stride);
        dst += (8 * dst_stride);

        src0 = src8;
    }
}

/* Diagonal (xy) half-pel interpolation with rounding:
 * out = (a + b + c + d + 2) >> 2 over each 2x2 source neighbourhood. */
static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t res0, res1;
    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
    v16u8 src0_r, src1_r, src2_r, res;
    v8u16 add0, add1, add2, sum0, sum1;

    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2,
                   src0_r, src1_r, src2_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        ADD2(add0, add1, add1, add2, sum0, sum1);
        SRARI_H2_UH(sum0, sum1, 2);

        res = (v16u8) __msa_pckev_b((v16i8) sum1, (v16i8) sum0);
        res0 = __msa_copy_u_w((v4i32) res, 0);
        res1 = __msa_copy_u_w((v4i32) res, 2);
        SW(res0, dst);
        dst += dst_stride;
        SW(res1, dst);
        dst += dst_stride;

        src0 = src2;
    }
}

static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;

    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
        SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
                   src1_r, src2_r);
        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB2_UH(src3_r, src4_r, add3, add4);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
             sum0, sum1, sum2, sum3);
        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);

        PCKEV_B2_SB(sum1, sum0, sum3, sum2, src0, src1);
        ST8x4_UB(src0, src1, dst, dst_stride);
        dst += (4 * dst_stride);

        src0 = src4;
    }
}

static void common_hv_bil_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src9, src10, src11, src12, src13, src14, src15, src16);
        src += (8 * src_stride);
        src8 = LD_UB(src);
        src17 = LD_UB(src + 1);

        ILVRL_B2_UH(src9, src0, src0_r, src0_l);
        ILVRL_B2_UH(src10, src1, src1_r, src1_l);
        ILVRL_B2_UH(src11, src2, src2_r, src2_l);
        ILVRL_B2_UH(src12, src3, src3_r, src3_l);
        ILVRL_B2_UH(src13, src4, src4_r, src4_l);
        ILVRL_B2_UH(src14, src5, src5_r, src5_l);
        ILVRL_B2_UH(src15, src6, src6_r, src6_l);
        ILVRL_B2_UH(src16, src7, src7_r, src7_l);
        ILVRL_B2_UH(src17, src8, src8_r, src8_l);
        HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
        HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
        HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
        HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
        HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
        HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
        ADD4(src0_r, src1_r, src1_r, src2_r, src2_r, src3_r, src3_r, src4_r,
             sum0_r, sum1_r, sum2_r, sum3_r);
        ADD4(src4_r, src5_r, src5_r, src6_r, src6_r, src7_r, src7_r, src8_r,
             sum4_r, sum5_r, sum6_r, sum7_r);
        ADD4(src0_l, src1_l, src1_l, src2_l, src2_l, src3_l, src3_l, src4_l,
             sum0_l, sum1_l, sum2_l, sum3_l);
        ADD4(src4_l, src5_l, src5_l, src6_l, src6_l, src7_l, src7_l, src8_l,
             sum4_l, sum5_l, sum6_l, sum7_l);
        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
        PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, sum2_l, sum2_r,
                     sum3_l, sum3_r, dst, dst_stride);
        dst += (4 * dst_stride);
        PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, sum6_l, sum6_r,
                     sum7_l, sum7_r, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16u8 src4_sld1, src5_sld1, src6_sld1, src7_sld1, src8_sld1;
    v8u16 src0_r, src1_r, src2_r, src3_r;
    v8u16 src4_r, src5_r, src6_r, src7_r, src8_r;
    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
    v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
    v16i8 out0, out1;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_UB(src);

    SLDI_B4_0_UB(src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1,
                 src3_sld1, 1);
    SLDI_B3_0_UB(src4, src5, src6, src4_sld1, src5_sld1, src6_sld1, 1);
    SLDI_B2_0_UB(src7, src8, src7_sld1, src8_sld1, 1);
    ILVR_B4_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src3_sld1,
               src3, src0_r, src1_r, src2_r, src3_r);
    ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r,
               src5_r, src6_r);
    ILVR_B2_UH(src7_sld1, src7, src8_sld1, src8, src7_r, src8_r);
    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
    HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
    HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);

    sum0 = add0 + add1 + 1;
    sum1 = add1 + add2 + 1;
    sum2 = add2 + add3 + 1;
    sum3 = add3 + add4 + 1;
    sum4 = add4 + add5 + 1;
    sum5 = add5 + add6 + 1;
    sum6 = add6 + add7 + 1;
    sum7 = add7 + add8 + 1;

    SRA_4V(sum0, sum1, sum2, sum3, 2);
    SRA_4V(sum4, sum5, sum6, sum7, 2);
    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    PCKEV_B2_SB(sum5, sum4, sum7, sum6, out0, out1);
    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
}

static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;
    v16i8 out0, out1;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    src4 = LD_SB(src);

    SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
    SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1);
    ILVR_B3_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
               src1_r, src2_r);
    ILVR_B2_UH(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
    HADD_UB2_UH(src3_r, src4_r, add3, add4);

    sum0 = add0 + add1 + 1;
    sum1 = add1 + add2 + 1;
    sum2 = add2 + add3 + 1;
    sum3 = add3 + add4 + 1;

    SRA_4V(sum0, sum1, sum2, sum3, 2);
    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
}

static void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);
    src8 = LD_UB(src);
    src17 = LD_UB(src + 1);

    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
    ILVRL_B2_UH(src17, src8, src8_r, src8_l);
    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);

    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;

    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);
    src8 = LD_UB(src);
    src17 = LD_UB(src + 1);

    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
    dst += (4 * dst_stride);

    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
    ILVRL_B2_UH(src17, src8, src8_r, src8_l);
    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);

    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;

    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);
    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
}

static void common_hv_bil_no_rnd_8x16_msa(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);
    src8 = LD_UB(src);
    src17 = LD_UB(src + 1);

    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
    ILVRL_B2_UH(src17, src8, src8_r, src8_l);
    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);

    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;

    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);
    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
}

/* Diagonal half-pel interpolation, with the result then averaged against the
 * existing destination (used by the avg_pixels entry points). */
static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1;
    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
    v16u8 src0_r, src1_r, src2_r;
    v8u16 add0, add1, add2, sum0, sum1;
    v16u8 dst0, dst1, res0, res1;

    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        LD_UB2(dst, dst_stride, dst0, dst1);
        SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
                   src1_r, src2_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        ADD2(add0, add1, add1, add2, sum0, sum1);
        SRARI_H2_UH(sum0, sum1, 2);
        PCKEV_B2_UB(sum0, sum0, sum1, sum1, res0, res1);
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;

        src0 = src2;
    }
}

static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;

    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
        SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
                   src1_r, src2_r);
        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB2_UH(src3_r, src4_r, add3, add4);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
             sum0, sum1, sum2, sum3);
        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
        PCKEV_AVG_ST8x4_UB(sum0, dst0, sum1, dst1,
                           sum2, dst2, sum3, dst3, dst, dst_stride);
        dst += (4 * dst_stride);

        src0 = src4;
    }
}

static void common_hv_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11, src12, src13, src14, src15, src16, src17;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v16u8 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v16u8 src7_l, src8_l;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src9, src10, src11, src12, src13, src14, src15, src16);
        src += (8 * src_stride);
        src8 = LD_UB(src);
        src17 = LD_UB(src + 1);

        ILVRL_B2_UB(src9, src0, src0_r, src0_l);
        ILVRL_B2_UB(src10, src1, src1_r, src1_l);
        ILVRL_B2_UB(src11, src2, src2_r, src2_l);
        ILVRL_B2_UB(src12, src3, src3_r, src3_l);
        ILVRL_B2_UB(src13, src4, src4_r, src4_l);
        ILVRL_B2_UB(src14, src5, src5_r, src5_l);
        ILVRL_B2_UB(src15, src6, src6_r, src6_l);
        ILVRL_B2_UB(src16, src7, src7_r, src7_l);
        ILVRL_B2_UB(src17, src8, src8_r, src8_l);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
        HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_r, sum1_r,
             sum2_r, sum3_r);
        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_r, sum5_r,
             sum6_r, sum7_r);
        HADD_UB3_UH(src0_l, src1_l, src2_l, add0, add1, add2);
        HADD_UB3_UH(src3_l, src4_l, src5_l, add3, add4, add5);
        HADD_UB3_UH(src6_l, src7_l, src8_l, add6, add7, add8);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_l, sum1_l,
             sum2_l, sum3_l);
        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_l, sum5_l,
             sum6_l, sum7_l);
        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        PCKEV_AVG_ST_UB(sum0_l, sum0_r, dst0, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum1_l, sum1_r, dst1, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum2_l, sum2_r, dst2, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum3_l, sum3_r, dst3, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum4_l, sum4_r, dst4, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum5_l, sum5_r, dst5, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum6_l, sum6_r, dst6, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum7_l, sum7_r, dst7, dst);
        dst += dst_stride;
    }
}

/* Plain block copies used by the put_pixels entry points. */
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        for (cnt = (height / 12); cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);

            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 8) {
        for (cnt = height >> 3; cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 4) {
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 2) {
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);

            SD(out0, dst);
            dst += dst_stride;
            SD(out1, dst);
            dst += dst_stride;
        }
    }
}

static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int32_t height, int32_t width)
{
    int32_t cnt, loop_cnt;
    const uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_UB8(src_tmp, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src_tmp += (8 * src_stride);

            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
                   dst_tmp, dst_stride);
            dst_tmp += (8 * dst_stride);
        }

        src += 16;
        dst += 16;
    }
}

static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        for (cnt = (height / 12); cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
                   dst, dst_stride);
            dst += (8 * dst_stride);

            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 8) {
        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
    } else if (0 == height % 4) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

/* Rounded average of the source block with the existing destination. */
static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt;
    uint32_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    if (0 == (height % 4)) {
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                        dst0, dst1, dst2, dst3);

            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
            out2 = __msa_copy_u_w((v4i32) dst2, 0);
            out3 = __msa_copy_u_w((v4i32) dst3, 0);
            SW4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 2)) {
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);

            LD_UB2(dst, dst_stride, dst0, dst1);

            AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);

            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
            SW(out0, dst);
            dst += dst_stride;
            SW(out1, dst);
            dst += dst_stride;
        }
    }
}

static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    for (cnt = (height / 4); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);

        out0 = __msa_copy_u_d((v2i64) dst0, 0);
        out1 = __msa_copy_u_d((v2i64) dst1, 0);
        out2 = __msa_copy_u_d((v2i64) dst2, 0);
        out3 = __msa_copy_u_d((v2i64) dst3, 0);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (cnt = (height / 8); cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);
        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
                    dst4, dst5, dst6, dst7);
        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

/* Exported hpeldsp entry points: thin wrappers that map the
 * (block, pixels, line_size, h) signature onto the helpers above. */
void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h)
{
    copy_width16_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hz_bil_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_vt_bil_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    common_hv_bil_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    copy_width8_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_hz_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_hz_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_vt_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_vt_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block,
                                    const uint8_t *pixels,
                                    ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_hv_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_hv_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                                  ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_hz_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_hz_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                                  ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_vt_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_vt_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_hv_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_hv_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}

void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h)
{
    avg_width16_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    avg_width8_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    avg_width4_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}