  1. /*
  2. * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
  21. #define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
  22. #include <stdint.h>
  23. #include <msa.h>
  24. #define LOAD_UB(psrc) \
  25. ( { \
  26. v16u8 out_m; \
  27. out_m = *((v16u8 *) (psrc)); \
  28. out_m; \
  29. } )
  30. #define LOAD_SB(psrc) \
  31. ( { \
  32. v16i8 out_m; \
  33. out_m = *((v16i8 *) (psrc)); \
  34. out_m; \
  35. } )
  36. #define LOAD_UH(psrc) *((const v8u16 *)(psrc))
  37. #define LOAD_SH(psrc) \
  38. ( { \
  39. v8i16 out_m; \
  40. out_m = *((v8i16 *) (psrc)); \
  41. out_m; \
  42. } )
  43. #define LOAD_SW(psrc) *((const v4i32 *)(psrc))
  44. #define STORE_UB(vec, pdest) *((v16u8 *)(pdest)) = (vec)
  45. #define STORE_SB(vec, pdest) *((v16i8 *)(pdest)) = (vec)
  46. #define STORE_SH(vec, pdest) \
  47. { \
  48. *((v8i16 *) (pdest)) = (vec); \
  49. }
  50. #define STORE_SW(vec, pdest) \
  51. { \
  52. *((v4i32 *) (pdest)) = (vec); \
  53. }
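/* The LOAD_UB/SB/UH/SH/SW and STORE_UB/SB/SH/SW macros above move one 16-byte
 * MSA vector through a plain vector-pointer dereference.  A minimal usage
 * sketch, assuming a hypothetical int16_t buffer "coeffs" holding at least
 * eight elements:
 *
 *     v8i16 vec = LOAD_SH(coeffs);
 *     vec = __msa_adds_s_h(vec, vec);   // saturating add, doubles each value
 *     STORE_SH(vec, coeffs);
 */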
  54. #if (__mips_isa_rev >= 6)
  55. #define LOAD_WORD(psrc) \
  56. ( { \
  57. uint8_t *src_m = (uint8_t *) (psrc); \
  58. uint32_t val_m; \
  59. \
  60. __asm__ volatile ( \
  61. "lw %[val_m], %[src_m] \n\t" \
  62. \
  63. : [val_m] "=r" (val_m) \
  64. : [src_m] "m" (*src_m) \
  65. ); \
  66. \
  67. val_m; \
  68. } )
  69. #if (__mips == 64)
  70. #define LOAD_DWORD(psrc) \
  71. ( { \
  72. uint8_t *src_m = (uint8_t *) (psrc); \
  73. uint64_t val_m = 0; \
  74. \
  75. __asm__ volatile ( \
  76. "ld %[val_m], %[src_m] \n\t" \
  77. \
  78. : [val_m] "=r" (val_m) \
  79. : [src_m] "m" (*src_m) \
  80. ); \
  81. \
  82. val_m; \
  83. } )
  84. #else
  85. #define LOAD_DWORD(psrc) \
  86. ( { \
  87. uint8_t *src1_m = (uint8_t *) (psrc); \
  88. uint8_t *src2_m = ((uint8_t *) (psrc)) + 4; \
  89. uint32_t val0_m, val1_m; \
  90. uint64_t genval_m = 0; \
  91. \
  92. __asm__ volatile ( \
  93. "lw %[val0_m], %[src1_m] \n\t" \
  94. \
  95. : [val0_m] "=r" (val0_m) \
  96. : [src1_m] "m" (*src1_m) \
  97. ); \
  98. \
  99. __asm__ volatile ( \
  100. "lw %[val1_m], %[src2_m] \n\t" \
  101. \
  102. : [val1_m] "=r" (val1_m) \
  103. : [src2_m] "m" (*src2_m) \
  104. ); \
  105. \
  106. genval_m = (uint64_t) (val1_m); \
  107. genval_m = (uint64_t) ((genval_m << 32) & 0xFFFFFFFF00000000); \
  108. genval_m = (uint64_t) (genval_m | (uint64_t) val0_m); \
  109. \
  110. genval_m; \
  111. } )
  112. #endif
  113. #define STORE_WORD(pdst, val) \
  114. { \
  115. uint8_t *dst_ptr_m = (uint8_t *) (pdst); \
  116. uint32_t val_m = (val); \
  117. \
  118. __asm__ volatile ( \
  119. "sw %[val_m], %[dst_ptr_m] \n\t" \
  120. \
  121. : [dst_ptr_m] "=m" (*dst_ptr_m) \
  122. : [val_m] "r" (val_m) \
  123. ); \
  124. }
  125. #define STORE_DWORD(pdst, val) \
  126. { \
  127. uint8_t *dst_ptr_m = (uint8_t *) (pdst); \
  128. uint64_t val_m = (val); \
  129. \
  130. __asm__ volatile ( \
  131. "sd %[val_m], %[dst_ptr_m] \n\t" \
  132. \
  133. : [dst_ptr_m] "=m" (*dst_ptr_m) \
  134. : [val_m] "r" (val_m) \
  135. ); \
  136. }
  137. #define STORE_HWORD(pdst, val) \
  138. { \
  139. uint8_t *dst_ptr_m = (uint8_t *) (pdst); \
  140. uint16_t val_m = (val); \
  141. \
  142. __asm__ volatile ( \
  143. "sh %[val_m], %[dst_ptr_m] \n\t" \
  144. \
  145. : [dst_ptr_m] "=m" (*dst_ptr_m) \
  146. : [val_m] "r" (val_m) \
  147. ); \
  148. }
  149. #else
  150. #define LOAD_WORD(psrc) \
  151. ( { \
  152. uint8_t *src_m = (uint8_t *) (psrc); \
  153. uint32_t val_m; \
  154. \
  155. __asm__ volatile ( \
  156. "ulw %[val_m], %[src_m] \n\t" \
  157. \
  158. : [val_m] "=r" (val_m) \
  159. : [src_m] "m" (*src_m) \
  160. ); \
  161. \
  162. val_m; \
  163. } )
  164. #if (__mips == 64)
  165. #define LOAD_DWORD(psrc) \
  166. ( { \
  167. uint8_t *src_m = (uint8_t *) (psrc); \
  168. uint64_t val_m = 0; \
  169. \
  170. __asm__ volatile ( \
  171. "uld %[val_m], %[src_m] \n\t" \
  172. \
  173. : [val_m] "=r" (val_m) \
  174. : [src_m] "m" (*src_m) \
  175. ); \
  176. \
  177. val_m; \
  178. } )
  179. #else
  180. #define LOAD_DWORD(psrc) \
  181. ( { \
  182. uint8_t *src1_m = (uint8_t *) (psrc); \
  183. uint8_t *src2_m = ((uint8_t *) (psrc)) + 4; \
  184. uint32_t val0_m, val1_m; \
  185. uint64_t genval_m = 0; \
  186. \
  187. __asm__ volatile ( \
  188. "ulw %[val0_m], %[src1_m] \n\t" \
  189. \
  190. : [val0_m] "=r" (val0_m) \
  191. : [src1_m] "m" (*src1_m) \
  192. ); \
  193. \
  194. __asm__ volatile ( \
  195. "ulw %[val1_m], %[src2_m] \n\t" \
  196. \
  197. : [val1_m] "=r" (val1_m) \
  198. : [src2_m] "m" (*src2_m) \
  199. ); \
  200. \
  201. genval_m = (uint64_t) (val1_m); \
  202. genval_m = (uint64_t) ((genval_m << 32) & 0xFFFFFFFF00000000); \
  203. genval_m = (uint64_t) (genval_m | (uint64_t) val0_m); \
  204. \
  205. genval_m; \
  206. } )
  207. #endif
  208. #define STORE_WORD(pdst, val) \
  209. { \
  210. uint8_t *dst_ptr_m = (uint8_t *) (pdst); \
  211. uint32_t val_m = (val); \
  212. \
  213. __asm__ volatile ( \
  214. "usw %[val_m], %[dst_ptr_m] \n\t" \
  215. \
  216. : [dst_ptr_m] "=m" (*dst_ptr_m) \
  217. : [val_m] "r" (val_m) \
  218. ); \
  219. }
  220. #define STORE_DWORD(pdst, val) \
  221. { \
  222. uint8_t *dst1_m = (uint8_t *) (pdst); \
  223. uint8_t *dst2_m = ((uint8_t *) (pdst)) + 4; \
  224. uint32_t val0_m, val1_m; \
  225. \
  226. val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \
  227. val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
  228. \
  229. __asm__ volatile ( \
  230. "usw %[val0_m], %[dst1_m] \n\t" \
  231. "usw %[val1_m], %[dst2_m] \n\t" \
  232. \
  233. : [dst1_m] "=m" (*dst1_m), [dst2_m] "=m" (*dst2_m) \
  234. : [val0_m] "r" (val0_m), [val1_m] "r" (val1_m) \
  235. ); \
  236. }
  237. #define STORE_HWORD(pdst, val) \
  238. { \
  239. uint8_t *dst_ptr_m = (uint8_t *) (pdst); \
  240. uint16_t val_m = (val); \
  241. \
  242. __asm__ volatile ( \
  243. "ush %[val_m], %[dst_ptr_m] \n\t" \
  244. \
  245. : [dst_ptr_m] "=m" (*dst_ptr_m) \
  246. : [val_m] "r" (val_m) \
  247. ); \
  248. }
  249. #endif
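/* LOAD_WORD/LOAD_DWORD and STORE_WORD/STORE_DWORD/STORE_HWORD above cover
 * scalar accesses that may be unaligned: the MIPS R6 branch uses the ordinary
 * lw/ld/sw/sd instructions and relies on R6 misaligned-access support, while
 * the pre-R6 branch falls back to the assembler's unaligned macros (ulw, uld,
 * usw, ush).  On 32-bit builds LOAD_DWORD assembles the 64-bit value from two
 * 32-bit loads, with the word at (psrc) forming the low half, i.e. assuming a
 * little-endian layout.  A minimal sketch with hypothetical uint8_t pointers
 * src and dst that need not be word aligned:
 *
 *     uint32_t w = LOAD_WORD(src + 3);
 *     STORE_WORD(dst + 1, w);
 */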
  250. #define LOAD_4WORDS_WITH_STRIDE(psrc, src_stride, \
  251. src0, src1, src2, src3) \
  252. { \
  253. src0 = LOAD_WORD(psrc + 0 * src_stride); \
  254. src1 = LOAD_WORD(psrc + 1 * src_stride); \
  255. src2 = LOAD_WORD(psrc + 2 * src_stride); \
  256. src3 = LOAD_WORD(psrc + 3 * src_stride); \
  257. }
  258. #define LOAD_2VECS_UB(psrc, stride, \
  259. val0, val1) \
  260. { \
  261. val0 = LOAD_UB(psrc + 0 * stride); \
  262. val1 = LOAD_UB(psrc + 1 * stride); \
  263. }
  264. #define LOAD_2VECS_SB(psrc, stride, \
  265. val0, val1) \
  266. { \
  267. val0 = LOAD_SB(psrc + 0 * stride); \
  268. val1 = LOAD_SB(psrc + 1 * stride); \
  269. }
  270. #define LOAD_3VECS_UB(psrc, stride, \
  271. val0, val1, val2) \
  272. { \
  273. val0 = LOAD_UB(psrc + 0 * stride); \
  274. val1 = LOAD_UB(psrc + 1 * stride); \
  275. val2 = LOAD_UB(psrc + 2 * stride); \
  276. }
  277. #define LOAD_3VECS_SB(psrc, stride, \
  278. val0, val1, val2) \
  279. { \
  280. val0 = LOAD_SB(psrc + 0 * stride); \
  281. val1 = LOAD_SB(psrc + 1 * stride); \
  282. val2 = LOAD_SB(psrc + 2 * stride); \
  283. }
  284. #define LOAD_4VECS_UB(psrc, stride, \
  285. val0, val1, val2, val3) \
  286. { \
  287. val0 = LOAD_UB(psrc + 0 * stride); \
  288. val1 = LOAD_UB(psrc + 1 * stride); \
  289. val2 = LOAD_UB(psrc + 2 * stride); \
  290. val3 = LOAD_UB(psrc + 3 * stride); \
  291. }
  292. #define LOAD_4VECS_SB(psrc, stride, \
  293. val0, val1, val2, val3) \
  294. { \
  295. val0 = LOAD_SB(psrc + 0 * stride); \
  296. val1 = LOAD_SB(psrc + 1 * stride); \
  297. val2 = LOAD_SB(psrc + 2 * stride); \
  298. val3 = LOAD_SB(psrc + 3 * stride); \
  299. }
  300. #define LOAD_5VECS_UB(psrc, stride, \
  301. out0, out1, out2, out3, out4) \
  302. { \
  303. LOAD_4VECS_UB((psrc), (stride), \
  304. (out0), (out1), (out2), (out3)); \
  305. out4 = LOAD_UB(psrc + 4 * stride); \
  306. }
  307. #define LOAD_5VECS_SB(psrc, stride, \
  308. out0, out1, out2, out3, out4) \
  309. { \
  310. LOAD_4VECS_SB((psrc), (stride), \
  311. (out0), (out1), (out2), (out3)); \
  312. out4 = LOAD_SB(psrc + 4 * stride); \
  313. }
  314. #define LOAD_6VECS_SB(psrc, stride, \
  315. out0, out1, out2, out3, out4, out5) \
  316. { \
  317. LOAD_4VECS_SB((psrc), (stride), \
  318. (out0), (out1), (out2), (out3)); \
  319. LOAD_2VECS_SB((psrc + 4 * stride), (stride), \
  320. (out4), (out5)); \
  321. }
  322. #define LOAD_7VECS_UB(psrc, stride, \
  323. val0, val1, val2, val3, \
  324. val4, val5, val6) \
  325. { \
  326. val0 = LOAD_UB((psrc) + 0 * (stride)); \
  327. val1 = LOAD_UB((psrc) + 1 * (stride)); \
  328. val2 = LOAD_UB((psrc) + 2 * (stride)); \
  329. val3 = LOAD_UB((psrc) + 3 * (stride)); \
  330. val4 = LOAD_UB((psrc) + 4 * (stride)); \
  331. val5 = LOAD_UB((psrc) + 5 * (stride)); \
  332. val6 = LOAD_UB((psrc) + 6 * (stride)); \
  333. }
  334. #define LOAD_7VECS_SB(psrc, stride, \
  335. val0, val1, val2, val3, \
  336. val4, val5, val6) \
  337. { \
  338. val0 = LOAD_SB((psrc) + 0 * (stride)); \
  339. val1 = LOAD_SB((psrc) + 1 * (stride)); \
  340. val2 = LOAD_SB((psrc) + 2 * (stride)); \
  341. val3 = LOAD_SB((psrc) + 3 * (stride)); \
  342. val4 = LOAD_SB((psrc) + 4 * (stride)); \
  343. val5 = LOAD_SB((psrc) + 5 * (stride)); \
  344. val6 = LOAD_SB((psrc) + 6 * (stride)); \
  345. }
  346. #define LOAD_8VECS_UB(psrc, stride, \
  347. out0, out1, out2, out3, \
  348. out4, out5, out6, out7) \
  349. { \
  350. LOAD_4VECS_UB((psrc), (stride), \
  351. (out0), (out1), (out2), (out3)); \
  352. LOAD_4VECS_UB((psrc + 4 * stride), (stride), \
  353. (out4), (out5), (out6), (out7)); \
  354. }
  355. #define LOAD_8VECS_SB(psrc, stride, \
  356. out0, out1, out2, out3, \
  357. out4, out5, out6, out7) \
  358. { \
  359. LOAD_4VECS_SB((psrc), (stride), \
  360. (out0), (out1), (out2), (out3)); \
  361. LOAD_4VECS_SB((psrc + 4 * stride), (stride), \
  362. (out4), (out5), (out6), (out7)); \
  363. }
  364. #define LOAD_2VECS_UH(psrc, stride, \
  365. val0, val1) \
  366. { \
  367. val0 = LOAD_UH((psrc) + 0 * (stride)); \
  368. val1 = LOAD_UH((psrc) + 1 * (stride)); \
  369. }
  370. #define LOAD_2VECS_SH(psrc, stride, \
  371. val0, val1) \
  372. { \
  373. val0 = LOAD_SH((psrc) + 0 * (stride)); \
  374. val1 = LOAD_SH((psrc) + 1 * (stride)); \
  375. }
  376. #define LOAD_4VECS_UH(psrc, stride, \
  377. val0, val1, val2, val3) \
  378. { \
  379. LOAD_2VECS_UH((psrc), (stride), val0, val1); \
  380. LOAD_2VECS_UH((psrc + 2 * stride), (stride), val2, val3); \
  381. }
  382. #define LOAD_4VECS_SH(psrc, stride, \
  383. val0, val1, val2, val3) \
  384. { \
  385. LOAD_2VECS_SH((psrc), (stride), val0, val1); \
  386. LOAD_2VECS_SH((psrc + 2 * stride), (stride), val2, val3); \
  387. }
  388. #define LOAD_6VECS_SH(psrc, stride, \
  389. val0, val1, val2, val3, val4, val5) \
  390. { \
  391. LOAD_2VECS_SH((psrc), (stride), val0, val1); \
  392. LOAD_2VECS_SH((psrc + 2 * stride), (stride), val2, val3); \
  393. LOAD_2VECS_SH((psrc + 4 * stride), (stride), val4, val5); \
  394. }
  395. #define LOAD_8VECS_UH(psrc, stride, \
  396. val0, val1, val2, val3, \
  397. val4, val5, val6, val7) \
  398. { \
  399. LOAD_4VECS_UH((psrc), (stride), \
  400. val0, val1, val2, val3); \
  401. LOAD_4VECS_UH((psrc + 4 * stride), (stride), \
  402. val4, val5, val6, val7); \
  403. }
  404. #define LOAD_8VECS_SH(psrc, stride, \
  405. val0, val1, val2, val3, \
  406. val4, val5, val6, val7) \
  407. { \
  408. LOAD_4VECS_SH((psrc), (stride), \
  409. val0, val1, val2, val3); \
  410. LOAD_4VECS_SH((psrc + 4 * stride), (stride), \
  411. val4, val5, val6, val7); \
  412. }
  413. #define LOAD_16VECS_SH(psrc, stride, \
  414. val0, val1, val2, val3, \
  415. val4, val5, val6, val7, \
  416. val8, val9, val10, val11, \
  417. val12, val13, val14, val15) \
  418. { \
  419. LOAD_8VECS_SH((psrc), (stride), \
  420. val0, val1, val2, val3, \
  421. val4, val5, val6, val7); \
  422. LOAD_8VECS_SH((psrc + 8 * (stride)), (stride), \
  423. val8, val9, val10, val11, \
  424. val12, val13, val14, val15); \
  425. }
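/* The LOAD_*VECS_* macros above, and the matching STORE_*VECS_* macros further
 * below, move several vectors that lie "stride" (or "pitch") bytes apart,
 * typically consecutive rows of a block.  Usage sketch with a hypothetical
 * uint8_t *src and ptrdiff_t stride:
 *
 *     v16u8 row0, row1, row2, row3;
 *     LOAD_4VECS_UB(src, stride, row0, row1, row2, row3);
 */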
  426. #define STORE_4VECS_UB(dst_out, pitch, \
  427. in0, in1, in2, in3) \
  428. { \
  429. STORE_UB((in0), (dst_out)); \
  430. STORE_UB((in1), ((dst_out) + (pitch))); \
  431. STORE_UB((in2), ((dst_out) + 2 * (pitch))); \
  432. STORE_UB((in3), ((dst_out) + 3 * (pitch))); \
  433. }
  434. #define STORE_4VECS_SB(dst_out, pitch, \
  435. in0, in1, in2, in3) \
  436. { \
  437. STORE_SB((in0), (dst_out)); \
  438. STORE_SB((in1), ((dst_out) + (pitch))); \
  439. STORE_SB((in2), ((dst_out) + 2 * (pitch))); \
  440. STORE_SB((in3), ((dst_out) + 3 * (pitch))); \
  441. }
  442. #define STORE_8VECS_UB(dst_out, pitch_in, \
  443. in0, in1, in2, in3, \
  444. in4, in5, in6, in7) \
  445. { \
  446. STORE_4VECS_UB(dst_out, pitch_in, \
  447. in0, in1, in2, in3); \
  448. STORE_4VECS_UB((dst_out + 4 * (pitch_in)), pitch_in, \
  449. in4, in5, in6, in7); \
  450. }
  451. #define STORE_2VECS_SH(ptr, stride, \
  452. in0, in1) \
  453. { \
  454. STORE_SH(in0, ((ptr) + 0 * stride)); \
  455. STORE_SH(in1, ((ptr) + 1 * stride)); \
  456. }
  457. #define STORE_4VECS_SH(ptr, stride, \
  458. in0, in1, in2, in3) \
  459. { \
  460. STORE_SH(in0, ((ptr) + 0 * stride)); \
  461. STORE_SH(in1, ((ptr) + 1 * stride)); \
  462. STORE_SH(in2, ((ptr) + 2 * stride)); \
  463. STORE_SH(in3, ((ptr) + 3 * stride)); \
  464. }
  465. #define STORE_6VECS_SH(ptr, stride, \
  466. in0, in1, in2, in3, \
  467. in4, in5) \
  468. { \
  469. STORE_SH(in0, ((ptr) + 0 * stride)); \
  470. STORE_SH(in1, ((ptr) + 1 * stride)); \
  471. STORE_SH(in2, ((ptr) + 2 * stride)); \
  472. STORE_SH(in3, ((ptr) + 3 * stride)); \
  473. STORE_SH(in4, ((ptr) + 4 * stride)); \
  474. STORE_SH(in5, ((ptr) + 5 * stride)); \
  475. }
  476. #define STORE_8VECS_SH(ptr, stride, \
  477. in0, in1, in2, in3, \
  478. in4, in5, in6, in7) \
  479. { \
  480. STORE_SH(in0, ((ptr) + 0 * stride)); \
  481. STORE_SH(in1, ((ptr) + 1 * stride)); \
  482. STORE_SH(in2, ((ptr) + 2 * stride)); \
  483. STORE_SH(in3, ((ptr) + 3 * stride)); \
  484. STORE_SH(in4, ((ptr) + 4 * stride)); \
  485. STORE_SH(in5, ((ptr) + 5 * stride)); \
  486. STORE_SH(in6, ((ptr) + 6 * stride)); \
  487. STORE_SH(in7, ((ptr) + 7 * stride)); \
  488. }
  489. #define CLIP_MIN_TO_MAX_H(in, min, max) \
  490. ( { \
  491. v8i16 out_m; \
  492. \
  493. out_m = __msa_max_s_h((v8i16) (min), (v8i16) (in)); \
  494. out_m = __msa_min_s_h((v8i16) (max), (v8i16) out_m); \
  495. out_m; \
  496. } )
  497. #define CLIP_UNSIGNED_CHAR_H(in) \
  498. ( { \
  499. v8i16 max_m = __msa_ldi_h(255); \
  500. v8i16 out_m; \
  501. \
  502. out_m = __msa_maxi_s_h((v8i16) (in), 0); \
  503. out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m); \
  504. out_m; \
  505. } )
  506. #define CLIP_UNSIGNED_CHAR_W(in) \
  507. ( { \
  508. v4i32 max_m = __msa_ldi_w(255); \
  509. v4i32 out_m; \
  510. \
  511. out_m = __msa_maxi_s_w((v4i32) (in), 0); \
  512. out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m); \
  513. out_m; \
  514. } )
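/* CLIP_MIN_TO_MAX_H clamps each signed halfword to [min, max];
 * CLIP_UNSIGNED_CHAR_H and CLIP_UNSIGNED_CHAR_W clamp halfword/word elements
 * to the pixel range [0, 255].  Typical use, assuming a hypothetical v8i16
 * filter result "res" about to be packed back to bytes:
 *
 *     res = CLIP_UNSIGNED_CHAR_H(res);
 */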
  515. #define TRANSPOSE4x4_B_UB(in0, in1, in2, in3, \
  516. out0, out1, out2, out3) \
  517. { \
  518. v16i8 zero_m = { 0 }; \
  519. v16i8 s0_m, s1_m, s2_m, s3_m; \
  520. \
  521. s0_m = (v16i8) __msa_ilvr_d((v2i64) (in1), (v2i64) (in0)); \
  522. s1_m = (v16i8) __msa_ilvr_d((v2i64) (in3), (v2i64) (in2)); \
  523. s2_m = __msa_ilvr_b(s1_m, s0_m); \
  524. s3_m = __msa_ilvl_b(s1_m, s0_m); \
  525. \
  526. out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m); \
  527. out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4); \
  528. out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4); \
  529. out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4); \
  530. }
  531. #define TRANSPOSE8x4_B_UB(in0, in1, in2, in3, \
  532. in4, in5, in6, in7, \
  533. out0, out1, out2, out3) \
  534. { \
  535. v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
  536. \
  537. tmp0_m = (v16i8) __msa_ilvev_w((v4i32) (in4), (v4i32) (in0)); \
  538. tmp1_m = (v16i8) __msa_ilvev_w((v4i32) (in5), (v4i32) (in1)); \
  539. tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
  540. tmp0_m = (v16i8) __msa_ilvev_w((v4i32) (in6), (v4i32) (in2)); \
  541. tmp1_m = (v16i8) __msa_ilvev_w((v4i32) (in7), (v4i32) (in3)); \
  542. \
  543. tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
  544. tmp0_m = (v16i8) __msa_ilvr_h((v8i16) tmp3_m, (v8i16) tmp2_m); \
  545. tmp1_m = (v16i8) __msa_ilvl_h((v8i16) tmp3_m, (v8i16) tmp2_m); \
  546. \
  547. out0 = (v16u8) __msa_ilvr_w((v4i32) tmp1_m, (v4i32) tmp0_m); \
  548. out2 = (v16u8) __msa_ilvl_w((v4i32) tmp1_m, (v4i32) tmp0_m); \
  549. out1 = (v16u8) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \
  550. out3 = (v16u8) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
  551. }
  552. #define TRANSPOSE8x4_B_UH(in0, in1, in2, in3, \
  553. in4, in5, in6, in7, \
  554. out0, out1, out2, out3) \
  555. { \
  556. v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
  557. \
  558. tmp0_m = (v16i8) __msa_ilvev_w((v4i32) (in4), (v4i32) (in0)); \
  559. tmp1_m = (v16i8) __msa_ilvev_w((v4i32) (in5), (v4i32) (in1)); \
  560. tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
  561. tmp0_m = (v16i8) __msa_ilvev_w((v4i32) (in6), (v4i32) (in2)); \
  562. tmp1_m = (v16i8) __msa_ilvev_w((v4i32) (in7), (v4i32) (in3)); \
  563. \
  564. tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
  565. tmp0_m = (v16i8) __msa_ilvr_h((v8i16) tmp3_m, (v8i16) tmp2_m); \
  566. tmp1_m = (v16i8) __msa_ilvl_h((v8i16) tmp3_m, (v8i16) tmp2_m); \
  567. \
  568. out0 = (v8u16) __msa_ilvr_w((v4i32) tmp1_m, (v4i32) tmp0_m); \
  569. out2 = (v8u16) __msa_ilvl_w((v4i32) tmp1_m, (v4i32) tmp0_m); \
  570. out1 = (v8u16) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \
  571. out3 = (v8u16) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
  572. }
  573. #define TRANSPOSE8x8_B_UB(in0, in1, in2, in3, \
  574. in4, in5, in6, in7, \
  575. out0, out1, out2, out3, \
  576. out4, out5, out6, out7) \
  577. { \
  578. v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
  579. v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
  580. v16i8 zero_m = { 0 }; \
  581. \
  582. tmp0_m = __msa_ilvr_b((v16i8) (in2), (v16i8) (in0)); \
  583. tmp1_m = __msa_ilvr_b((v16i8) (in3), (v16i8) (in1)); \
  584. tmp2_m = __msa_ilvr_b((v16i8) (in6), (v16i8) (in4)); \
  585. tmp3_m = __msa_ilvr_b((v16i8) (in7), (v16i8) (in5)); \
  586. \
  587. tmp4_m = __msa_ilvr_b((v16i8) tmp1_m, (v16i8) tmp0_m); \
  588. tmp5_m = __msa_ilvl_b((v16i8) tmp1_m, (v16i8) tmp0_m); \
  589. tmp6_m = __msa_ilvr_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
  590. tmp7_m = __msa_ilvl_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
  591. \
  592. out0 = (v16u8) __msa_ilvr_w((v4i32) tmp6_m, (v4i32) tmp4_m); \
  593. out2 = (v16u8) __msa_ilvl_w((v4i32) tmp6_m, (v4i32) tmp4_m); \
  594. out4 = (v16u8) __msa_ilvr_w((v4i32) tmp7_m, (v4i32) tmp5_m); \
  595. out6 = (v16u8) __msa_ilvl_w((v4i32) tmp7_m, (v4i32) tmp5_m); \
  596. \
  597. out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 8); \
  598. out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 8); \
  599. out5 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out4, 8); \
  600. out7 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out6, 8); \
  601. }
  602. #define TRANSPOSE8x8_B_UH(in0, in1, in2, in3, \
  603. in4, in5, in6, in7, \
  604. out0, out1, out2, out3, \
  605. out4, out5, out6, out7) \
  606. { \
  607. v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
  608. v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
  609. v16i8 zero_m = { 0 }; \
  610. \
  611. tmp0_m = __msa_ilvr_b((v16i8) (in2), (v16i8) (in0)); \
  612. tmp1_m = __msa_ilvr_b((v16i8) (in3), (v16i8) (in1)); \
  613. tmp2_m = __msa_ilvr_b((v16i8) (in6), (v16i8) (in4)); \
  614. tmp3_m = __msa_ilvr_b((v16i8) (in7), (v16i8) (in5)); \
  615. \
  616. tmp4_m = __msa_ilvr_b((v16i8) tmp1_m, (v16i8) tmp0_m); \
  617. tmp5_m = __msa_ilvl_b((v16i8) tmp1_m, (v16i8) tmp0_m); \
  618. tmp6_m = __msa_ilvr_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
  619. tmp7_m = __msa_ilvl_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
  620. \
  621. out0 = (v8u16) __msa_ilvr_w((v4i32) tmp6_m, (v4i32) tmp4_m); \
  622. out2 = (v8u16) __msa_ilvl_w((v4i32) tmp6_m, (v4i32) tmp4_m); \
  623. out4 = (v8u16) __msa_ilvr_w((v4i32) tmp7_m, (v4i32) tmp5_m); \
  624. out6 = (v8u16) __msa_ilvl_w((v4i32) tmp7_m, (v4i32) tmp5_m); \
  625. out1 = (v8u16) __msa_sldi_b(zero_m, (v16i8) out0, 8); \
  626. out3 = (v8u16) __msa_sldi_b(zero_m, (v16i8) out2, 8); \
  627. out5 = (v8u16) __msa_sldi_b(zero_m, (v16i8) out4, 8); \
  628. out7 = (v8u16) __msa_sldi_b(zero_m, (v16i8) out6, 8); \
  629. }
  630. #define TRANSPOSE16x8_B_UB(in0, in1, in2, in3, \
  631. in4, in5, in6, in7, \
  632. in8, in9, in10, in11, \
  633. in12, in13, in14, in15, \
  634. out0, out1, out2, out3, \
  635. out4, out5, out6, out7) \
  636. { \
  637. v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
  638. v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
  639. \
  640. (out7) = (v16u8) __msa_ilvev_d((v2i64) (in8), (v2i64) (in0)); \
  641. (out6) = (v16u8) __msa_ilvev_d((v2i64) (in9), (v2i64) (in1)); \
  642. (out5) = (v16u8) __msa_ilvev_d((v2i64) (in10), (v2i64) (in2)); \
  643. (out4) = (v16u8) __msa_ilvev_d((v2i64) (in11), (v2i64) (in3)); \
  644. (out3) = (v16u8) __msa_ilvev_d((v2i64) (in12), (v2i64) (in4)); \
  645. (out2) = (v16u8) __msa_ilvev_d((v2i64) (in13), (v2i64) (in5)); \
  646. (out1) = (v16u8) __msa_ilvev_d((v2i64) (in14), (v2i64) (in6)); \
  647. (out0) = (v16u8) __msa_ilvev_d((v2i64) (in15), (v2i64) (in7)); \
  648. \
  649. tmp0_m = (v16u8) __msa_ilvev_b((v16i8) (out6), (v16i8) (out7)); \
  650. tmp4_m = (v16u8) __msa_ilvod_b((v16i8) (out6), (v16i8) (out7)); \
  651. tmp1_m = (v16u8) __msa_ilvev_b((v16i8) (out4), (v16i8) (out5)); \
  652. tmp5_m = (v16u8) __msa_ilvod_b((v16i8) (out4), (v16i8) (out5)); \
  653. (out5) = (v16u8) __msa_ilvev_b((v16i8) (out2), (v16i8) (out3)); \
  654. tmp6_m = (v16u8) __msa_ilvod_b((v16i8) (out2), (v16i8) (out3)); \
  655. (out7) = (v16u8) __msa_ilvev_b((v16i8) (out0), (v16i8) (out1)); \
  656. tmp7_m = (v16u8) __msa_ilvod_b((v16i8) (out0), (v16i8) (out1)); \
  657. \
  658. tmp2_m = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
  659. tmp3_m = (v16u8) __msa_ilvev_h((v8i16) (out7), (v8i16) (out5)); \
  660. (out0) = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
  661. (out4) = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
  662. \
  663. tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
  664. tmp3_m = (v16u8) __msa_ilvod_h((v8i16) (out7), (v8i16) (out5)); \
  665. (out2) = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
  666. (out6) = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
  667. \
  668. tmp2_m = (v16u8) __msa_ilvev_h((v8i16) tmp5_m, (v8i16) tmp4_m); \
  669. tmp3_m = (v16u8) __msa_ilvev_h((v8i16) tmp7_m, (v8i16) tmp6_m); \
  670. (out1) = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
  671. (out5) = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
  672. \
673. tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m); \
675. tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m); \
  677. (out3) = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
  678. (out7) = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
  679. }
  680. #define TRANSPOSE8x8_H_SH(in0, in1, in2, in3, \
  681. in4, in5, in6, in7, \
  682. out0, out1, out2, out3, \
  683. out4, out5, out6, out7) \
  684. { \
  685. v8i16 s0_m, s1_m; \
  686. v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
  687. v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
  688. \
  689. s0_m = __msa_ilvr_h((v8i16) (in6), (v8i16) (in4)); \
  690. s1_m = __msa_ilvr_h((v8i16) (in7), (v8i16) (in5)); \
  691. tmp0_m = __msa_ilvr_h((v8i16) s1_m, (v8i16) s0_m); \
  692. tmp1_m = __msa_ilvl_h((v8i16) s1_m, (v8i16) s0_m); \
  693. \
  694. s0_m = __msa_ilvl_h((v8i16) (in6), (v8i16) (in4)); \
  695. s1_m = __msa_ilvl_h((v8i16) (in7), (v8i16) (in5)); \
  696. tmp2_m = __msa_ilvr_h((v8i16) s1_m, (v8i16) s0_m); \
  697. tmp3_m = __msa_ilvl_h((v8i16) s1_m, (v8i16) s0_m); \
  698. \
  699. s0_m = __msa_ilvr_h((v8i16) (in2), (v8i16) (in0)); \
  700. s1_m = __msa_ilvr_h((v8i16) (in3), (v8i16) (in1)); \
  701. tmp4_m = __msa_ilvr_h((v8i16) s1_m, (v8i16) s0_m); \
  702. tmp5_m = __msa_ilvl_h((v8i16) s1_m, (v8i16) s0_m); \
  703. \
  704. s0_m = __msa_ilvl_h((v8i16) (in2), (v8i16) (in0)); \
  705. s1_m = __msa_ilvl_h((v8i16) (in3), (v8i16) (in1)); \
  706. tmp6_m = __msa_ilvr_h((v8i16) s1_m, (v8i16) s0_m); \
  707. tmp7_m = __msa_ilvl_h((v8i16) s1_m, (v8i16) s0_m); \
  708. \
  709. out0 = (v8i16) __msa_pckev_d((v2i64) tmp0_m, (v2i64) tmp4_m); \
  710. out1 = (v8i16) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m); \
  711. out2 = (v8i16) __msa_pckev_d((v2i64) tmp1_m, (v2i64) tmp5_m); \
  712. out3 = (v8i16) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m); \
  713. out4 = (v8i16) __msa_pckev_d((v2i64) tmp2_m, (v2i64) tmp6_m); \
  714. out5 = (v8i16) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m); \
  715. out6 = (v8i16) __msa_pckev_d((v2i64) tmp3_m, (v2i64) tmp7_m); \
  716. out7 = (v8i16) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m); \
  717. }
  718. #define TRANSPOSE4x4_W(in0, in1, in2, in3, \
  719. out0, out1, out2, out3) \
  720. { \
  721. v4i32 s0_m, s1_m, s2_m, s3_m; \
  722. \
  723. s0_m = __msa_ilvr_w((v4i32) (in1), (v4i32) (in0)); \
  724. s1_m = __msa_ilvl_w((v4i32) (in1), (v4i32) (in0)); \
  725. s2_m = __msa_ilvr_w((v4i32) (in3), (v4i32) (in2)); \
  726. s3_m = __msa_ilvl_w((v4i32) (in3), (v4i32) (in2)); \
  727. \
  728. out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \
  729. out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \
  730. out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \
  731. out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \
  732. }
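/* The TRANSPOSE* macros above transpose a block of elements held in vector
 * registers (bytes, halfwords or words, as the _B/_H/_W infix indicates),
 * which is the usual way to switch between row-wise and column-wise
 * processing without a round trip through memory. */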
  733. #define ILV_B_LRLR_SB(in0, in1, in2, in3, \
  734. out0, out1, out2, out3) \
  735. { \
  736. out0 = __msa_ilvl_b((v16i8) (in1), (v16i8) (in0)); \
  737. out1 = __msa_ilvr_b((v16i8) (in1), (v16i8) (in0)); \
  738. out2 = __msa_ilvl_b((v16i8) (in3), (v16i8) (in2)); \
  739. out3 = __msa_ilvr_b((v16i8) (in3), (v16i8) (in2)); \
  740. }
  741. #define ILV_B_LRLR_UH(in0, in1, in2, in3, \
  742. out0, out1, out2, out3) \
  743. { \
  744. out0 = (v8u16) __msa_ilvl_b((v16i8) (in1), (v16i8) (in0)); \
  745. out1 = (v8u16) __msa_ilvr_b((v16i8) (in1), (v16i8) (in0)); \
  746. out2 = (v8u16) __msa_ilvl_b((v16i8) (in3), (v16i8) (in2)); \
  747. out3 = (v8u16) __msa_ilvr_b((v16i8) (in3), (v16i8) (in2)); \
  748. }
  749. #define ILV_B_LRLR_SH(in0, in1, in2, in3, \
  750. out0, out1, out2, out3) \
  751. { \
  752. out0 = (v8i16) __msa_ilvl_b((v16i8) (in1), (v16i8) (in0)); \
  753. out1 = (v8i16) __msa_ilvr_b((v16i8) (in1), (v16i8) (in0)); \
  754. out2 = (v8i16) __msa_ilvl_b((v16i8) (in3), (v16i8) (in2)); \
  755. out3 = (v8i16) __msa_ilvr_b((v16i8) (in3), (v16i8) (in2)); \
  756. }
  757. #define ILV_H_LRLR_SW(in0, in1, in2, in3, \
  758. out0, out1, out2, out3) \
  759. { \
  760. out0 = (v4i32) __msa_ilvl_h((v8i16) (in1), (v8i16) (in0)); \
  761. out1 = (v4i32) __msa_ilvr_h((v8i16) (in1), (v8i16) (in0)); \
  762. out2 = (v4i32) __msa_ilvl_h((v8i16) (in3), (v8i16) (in2)); \
  763. out3 = (v4i32) __msa_ilvr_h((v8i16) (in3), (v8i16) (in2)); \
  764. }
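/* Interleave helpers: ILVR_* interleave the right (least significant) halves
 * of the two source vectors and ILVL_* the left (most significant) halves;
 * _B/_H/_D selects the element size and the trailing UB/SB/UH/SH/SW names the
 * result type.  The *_LRLR_* variants above emit both the left and the right
 * interleave of each input pair. */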
  765. #define ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \
  766. out0, out1) \
  767. { \
  768. out0 = (v16u8) __msa_ilvr_b((v16i8) (in0_l), (v16i8) (in0_r)); \
  769. out1 = (v16u8) __msa_ilvr_b((v16i8) (in1_l), (v16i8) (in1_r)); \
  770. }
  771. #define ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
  772. out0, out1) \
  773. { \
  774. out0 = __msa_ilvr_b((v16i8) (in0_l), (v16i8) (in0_r)); \
  775. out1 = __msa_ilvr_b((v16i8) (in1_l), (v16i8) (in1_r)); \
  776. }
  777. #define ILVR_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \
  778. in0_l, in1_l, in2_l, in3_l, \
  779. out0, out1, out2, out3) \
  780. { \
  781. ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
  782. out0, out1); \
  783. ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
  784. out2, out3); \
  785. }
  786. #define ILVR_B_6VECS_SB(in0_r, in1_r, in2_r, \
  787. in3_r, in4_r, in5_r, \
  788. in0_l, in1_l, in2_l, \
  789. in3_l, in4_l, in5_l, \
  790. out0, out1, out2, \
  791. out3, out4, out5) \
  792. { \
  793. ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
  794. out0, out1); \
  795. ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
  796. out2, out3); \
  797. ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
  798. out4, out5); \
  799. }
  800. #define ILVR_B_8VECS_SB(in0_r, in1_r, in2_r, in3_r, \
  801. in4_r, in5_r, in6_r, in7_r, \
  802. in0_l, in1_l, in2_l, in3_l, \
  803. in4_l, in5_l, in6_l, in7_l, \
  804. out0, out1, out2, out3, \
  805. out4, out5, out6, out7) \
  806. { \
  807. ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
  808. out0, out1); \
  809. ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
  810. out2, out3); \
  811. ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
  812. out4, out5); \
  813. ILVR_B_2VECS_SB(in6_r, in7_r, in6_l, in7_l, \
  814. out6, out7); \
  815. }
  816. #define ILVR_B_2VECS_UH(in0_r, in1_r, in0_l, in1_l, \
  817. out0, out1) \
  818. { \
  819. out0 = (v8u16) __msa_ilvr_b((v16i8) (in0_l), (v16i8) (in0_r)); \
  820. out1 = (v8u16) __msa_ilvr_b((v16i8) (in1_l), (v16i8) (in1_r)); \
  821. }
  822. #define ILVR_B_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
  823. out0, out1) \
  824. { \
  825. out0 = (v8i16) __msa_ilvr_b((v16i8) (in0_l), (v16i8) (in0_r)); \
  826. out1 = (v8i16) __msa_ilvr_b((v16i8) (in1_l), (v16i8) (in1_r)); \
  827. }
  828. #define ILVR_B_3VECS_SH(in0_r, in1_r, in2_r, in0_l, in1_l, in2_l, \
  829. out0, out1, out2) \
  830. { \
  831. ILVR_B_2VECS_SH(in0_r, in1_r, in0_l, in1_l, out0, out1); \
  832. out2 = (v8i16) __msa_ilvr_b((v16i8) (in2_l), (v16i8) (in2_r)); \
  833. }
  834. #define ILVR_B_4VECS_UH(in0_r, in1_r, in2_r, in3_r, \
  835. in0_l, in1_l, in2_l, in3_l, \
  836. out0, out1, out2, out3) \
  837. { \
  838. ILVR_B_2VECS_UH(in0_r, in1_r, in0_l, in1_l, \
  839. out0, out1); \
  840. ILVR_B_2VECS_UH(in2_r, in3_r, in2_l, in3_l, \
  841. out2, out3); \
  842. }
  843. #define ILVR_B_4VECS_SH(in0_r, in1_r, in2_r, in3_r, \
  844. in0_l, in1_l, in2_l, in3_l, \
  845. out0, out1, out2, out3) \
  846. { \
  847. ILVR_B_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
  848. out0, out1); \
  849. ILVR_B_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \
  850. out2, out3); \
  851. }
  852. #define ILVR_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
  853. out0, out1) \
  854. { \
  855. out0 = __msa_ilvr_h((v8i16) (in0_l), (v8i16) (in0_r)); \
  856. out1 = __msa_ilvr_h((v8i16) (in1_l), (v8i16) (in1_r)); \
  857. }
  858. #define ILVR_H_4VECS_SH(in0_r, in1_r, in2_r, in3_r, \
  859. in0_l, in1_l, in2_l, in3_l, \
  860. out0, out1, out2, out3) \
  861. { \
  862. ILVR_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
  863. out0, out1); \
  864. ILVR_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \
  865. out2, out3); \
  866. }
  867. #define ILVR_H_6VECS_SH(in0_r, in1_r, in2_r, \
  868. in3_r, in4_r, in5_r, \
  869. in0_l, in1_l, in2_l, \
  870. in3_l, in4_l, in5_l, \
  871. out0, out1, out2, \
  872. out3, out4, out5) \
  873. { \
  874. ILVR_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
  875. out0, out1); \
  876. ILVR_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \
  877. out2, out3); \
  878. ILVR_H_2VECS_SH(in4_r, in5_r, in4_l, in5_l, \
  879. out4, out5); \
  880. }
  881. #define ILVR_H_8VECS_SH(in0_r, in1_r, in2_r, in3_r, \
  882. in4_r, in5_r, in6_r, in7_r, \
  883. in0_l, in1_l, in2_l, in3_l, \
  884. in4_l, in5_l, in6_l, in7_l, \
  885. out0, out1, out2, out3, \
  886. out4, out5, out6, out7) \
  887. { \
  888. ILVR_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
  889. out0, out1); \
  890. ILVR_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \
  891. out2, out3); \
  892. ILVR_H_2VECS_SH(in4_r, in5_r, in4_l, in5_l, \
  893. out4, out5); \
  894. ILVR_H_2VECS_SH(in6_r, in7_r, in6_l, in7_l, \
  895. out6, out7); \
  896. }
  897. #define ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
  898. out0, out1) \
  899. { \
  900. out0 = __msa_ilvl_b((v16i8) (in0_l), (v16i8) (in0_r)); \
  901. out1 = __msa_ilvl_b((v16i8) (in1_l), (v16i8) (in1_r)); \
  902. }
  903. #define ILVL_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \
  904. in0_l, in1_l, in2_l, in3_l, \
  905. out0, out1, out2, out3) \
  906. { \
  907. ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
  908. out0, out1); \
  909. ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
  910. out2, out3); \
  911. }
  912. #define ILVL_B_6VECS_SB(in0_r, in1_r, in2_r, \
  913. in3_r, in4_r, in5_r, \
  914. in0_l, in1_l, in2_l, \
  915. in3_l, in4_l, in5_l, \
  916. out0, out1, out2, \
  917. out3, out4, out5) \
  918. { \
  919. ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
  920. out0, out1); \
  921. ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
  922. out2, out3); \
  923. ILVL_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
  924. out4, out5); \
  925. }
  926. #define ILVL_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
  927. out0, out1) \
  928. { \
  929. out0 = __msa_ilvl_h((v8i16) (in0_l), (v8i16) (in0_r)); \
  930. out1 = __msa_ilvl_h((v8i16) (in1_l), (v8i16) (in1_r)); \
  931. }
  932. #define ILVL_H_4VECS_SH(in0_r, in1_r, in2_r, in3_r, \
  933. in0_l, in1_l, in2_l, in3_l, \
  934. out0, out1, out2, out3) \
  935. { \
  936. ILVL_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
  937. out0, out1); \
  938. ILVL_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \
  939. out2, out3); \
  940. }
  941. #define ILVL_H_6VECS_SH(in0_r, in1_r, in2_r, \
  942. in3_r, in4_r, in5_r, \
  943. in0_l, in1_l, in2_l, \
  944. in3_l, in4_l, in5_l, \
  945. out0, out1, out2, \
  946. out3, out4, out5) \
  947. { \
  948. ILVL_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
  949. out0, out1); \
  950. ILVL_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \
  951. out2, out3); \
  952. ILVL_H_2VECS_SH(in4_r, in5_r, in4_l, in5_l, \
  953. out4, out5); \
  954. }
  955. #define ILVL_H_8VECS_SH(in0_r, in1_r, in2_r, in3_r, \
  956. in4_r, in5_r, in6_r, in7_r, \
  957. in0_l, in1_l, in2_l, in3_l, \
  958. in4_l, in5_l, in6_l, in7_l, \
  959. out0, out1, out2, out3, \
  960. out4, out5, out6, out7) \
  961. { \
  962. ILVL_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
  963. out0, out1); \
  964. ILVL_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \
  965. out2, out3); \
  966. ILVL_H_2VECS_SH(in4_r, in5_r, in4_l, in5_l, \
  967. out4, out5); \
  968. ILVL_H_2VECS_SH(in6_r, in7_r, in6_l, in7_l, \
  969. out6, out7); \
  970. }
  971. #define ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
  972. out1, in1_l, in1_r) \
  973. { \
  974. out0 = (v16i8) __msa_ilvr_d((v2i64) (in0_l), (v2i64) (in0_r)); \
  975. out1 = (v16i8) __msa_ilvr_d((v2i64) (in1_l), (v2i64) (in1_r)); \
  976. }
  977. #define ILVR_D_3VECS_SB(out0, in0_l, in0_r, \
  978. out1, in1_l, in1_r, \
  979. out2, in2_l, in2_r) \
  980. { \
  981. ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
  982. out1, in1_l, in1_r); \
  983. out2 = (v16i8) __msa_ilvr_d((v2i64) (in2_l), (v2i64) (in2_r)); \
  984. }
  985. #define ILVR_D_4VECS_SB(out0, in0_l, in0_r, \
  986. out1, in1_l, in1_r, \
  987. out2, in2_l, in2_r, \
  988. out3, in3_l, in3_r) \
  989. { \
  990. ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
  991. out1, in1_l, in1_r); \
  992. ILVR_D_2VECS_SB(out2, in2_l, in2_r, \
  993. out3, in3_l, in3_r); \
  994. }
  995. #define MAXI_S_H_4VECS_UH(vec0, vec1, vec2, vec3, max_value) \
  996. { \
  997. vec0 = (v8u16) __msa_maxi_s_h((v8i16) (vec0), (max_value)); \
  998. vec1 = (v8u16) __msa_maxi_s_h((v8i16) (vec1), (max_value)); \
  999. vec2 = (v8u16) __msa_maxi_s_h((v8i16) (vec2), (max_value)); \
  1000. vec3 = (v8u16) __msa_maxi_s_h((v8i16) (vec3), (max_value)); \
  1001. }
  1002. #define SAT_U_H_4VECS_UH(vec0, vec1, vec2, vec3, sat_value) \
  1003. { \
  1004. vec0 = __msa_sat_u_h((v8u16) (vec0), (sat_value)); \
  1005. vec1 = __msa_sat_u_h((v8u16) (vec1), (sat_value)); \
  1006. vec2 = __msa_sat_u_h((v8u16) (vec2), (sat_value)); \
  1007. vec3 = __msa_sat_u_h((v8u16) (vec3), (sat_value)); \
  1008. }
  1009. #define PCKEV_B_4VECS_UB(in0_l, in1_l, in2_l, in3_l, \
  1010. in0_r, in1_r, in2_r, in3_r, \
  1011. out0, out1, out2, out3) \
  1012. { \
  1013. out0 = (v16u8) __msa_pckev_b((v16i8) (in0_l), (v16i8) (in0_r)); \
  1014. out1 = (v16u8) __msa_pckev_b((v16i8) (in1_l), (v16i8) (in1_r)); \
  1015. out2 = (v16u8) __msa_pckev_b((v16i8) (in2_l), (v16i8) (in2_r)); \
  1016. out3 = (v16u8) __msa_pckev_b((v16i8) (in3_l), (v16i8) (in3_r)); \
  1017. }
  1018. #define PCKEV_B_4VECS_SB(in0_l, in1_l, in2_l, in3_l, \
  1019. in0_r, in1_r, in2_r, in3_r, \
  1020. out0, out1, out2, out3) \
  1021. { \
  1022. out0 = __msa_pckev_b((v16i8) (in0_l), (v16i8) (in0_r)); \
  1023. out1 = __msa_pckev_b((v16i8) (in1_l), (v16i8) (in1_r)); \
  1024. out2 = __msa_pckev_b((v16i8) (in2_l), (v16i8) (in2_r)); \
  1025. out3 = __msa_pckev_b((v16i8) (in3_l), (v16i8) (in3_r)); \
  1026. }
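/* __msa_pckev_b keeps the even-indexed bytes of both operands, so the
 * PCKEV_B_* macros above narrow halfword vectors back to byte vectors once
 * the values fit in eight bits.  The XORI_B_* macros that follow flip bit 7
 * of every byte (xori with 128), converting between unsigned pixels and the
 * signed-byte form that many MSA filter kernels operate on. */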
  1027. #define XORI_B_2VECS_UB(val0, val1, out0, out1, xor_val) \
  1028. { \
  1029. out0 = __msa_xori_b((v16u8) (val0), (xor_val)); \
  1030. out1 = __msa_xori_b((v16u8) (val1), (xor_val)); \
  1031. }
  1032. #define XORI_B_2VECS_SB(val0, val1, \
  1033. out0, out1, xor_val) \
  1034. { \
  1035. out0 = (v16i8) __msa_xori_b((v16u8) (val0), (xor_val)); \
  1036. out1 = (v16i8) __msa_xori_b((v16u8) (val1), (xor_val)); \
  1037. }
  1038. #define XORI_B_3VECS_SB(val0, val1, val2, \
  1039. out0, out1, out2, \
  1040. xor_val) \
  1041. { \
  1042. XORI_B_2VECS_SB(val0, val1, \
  1043. out0, out1, xor_val); \
  1044. out2 = (v16i8) __msa_xori_b((v16u8) (val2), (xor_val)); \
  1045. }
  1046. #define XORI_B_4VECS_UB(val0, val1, val2, val3, \
  1047. out0, out1, out2, out3, xor_val) \
  1048. { \
  1049. XORI_B_2VECS_UB(val0, val1, out0, out1, xor_val); \
  1050. XORI_B_2VECS_UB(val2, val3, out2, out3, xor_val); \
  1051. }
  1052. #define XORI_B_4VECS_SB(val0, val1, val2, val3, \
  1053. out0, out1, out2, out3, \
  1054. xor_val) \
  1055. { \
  1056. XORI_B_2VECS_SB(val0, val1, \
  1057. out0, out1, xor_val); \
  1058. XORI_B_2VECS_SB(val2, val3, \
  1059. out2, out3, xor_val); \
  1060. }
  1061. #define XORI_B_5VECS_SB(val0, val1, val2, val3, val4, \
  1062. out0, out1, out2, out3, out4, \
  1063. xor_val) \
  1064. { \
  1065. XORI_B_3VECS_SB(val0, val1, val2, \
  1066. out0, out1, out2, xor_val); \
  1067. XORI_B_2VECS_SB(val3, val4, \
  1068. out3, out4, xor_val); \
  1069. }
  1070. #define XORI_B_6VECS_SB(val0, val1, val2, val3, val4, val5, \
  1071. out0, out1, out2, out3, out4, out5, \
  1072. xor_val) \
  1073. { \
  1074. XORI_B_4VECS_SB(val0, val1, val2, val3, \
  1075. out0, out1, out2, out3, xor_val); \
1076. XORI_B_2VECS_SB(val4, val5, out4, out5, xor_val); \
  1077. }
  1078. #define XORI_B_7VECS_SB(val0, val1, val2, val3, \
  1079. val4, val5, val6, \
  1080. out0, out1, out2, out3, \
  1081. out4, out5, out6, \
  1082. xor_val) \
  1083. { \
  1084. XORI_B_4VECS_SB(val0, val1, val2, val3, \
  1085. out0, out1, out2, out3, xor_val); \
  1086. XORI_B_3VECS_SB(val4, val5, val6, \
  1087. out4, out5, out6, xor_val); \
  1088. }
  1089. #define XORI_B_8VECS_SB(val0, val1, val2, val3, \
  1090. val4, val5, val6, val7, \
  1091. out0, out1, out2, out3, \
  1092. out4, out5, out6, out7, xor_val) \
  1093. { \
  1094. XORI_B_4VECS_SB(val0, val1, val2, val3, \
  1095. out0, out1, out2, out3, xor_val); \
  1096. XORI_B_4VECS_SB(val4, val5, val6, val7, \
  1097. out4, out5, out6, out7, xor_val); \
  1098. }
  1099. #define ADDS_S_H_4VECS_UH(in0, in1, in2, in3, in4, in5, in6, in7, \
  1100. out0, out1, out2, out3) \
  1101. { \
  1102. out0 = (v8u16) __msa_adds_s_h((v8i16) (in0), (v8i16) (in1)); \
  1103. out1 = (v8u16) __msa_adds_s_h((v8i16) (in2), (v8i16) (in3)); \
  1104. out2 = (v8u16) __msa_adds_s_h((v8i16) (in4), (v8i16) (in5)); \
  1105. out3 = (v8u16) __msa_adds_s_h((v8i16) (in6), (v8i16) (in7)); \
  1106. }
  1107. #define SRA_4VECS(in0, in1, in2, in3, \
  1108. out0, out1, out2, out3, \
  1109. shift_right_vec) \
  1110. { \
  1111. out0 = (in0) >> (shift_right_vec); \
  1112. out1 = (in1) >> (shift_right_vec); \
  1113. out2 = (in2) >> (shift_right_vec); \
  1114. out3 = (in3) >> (shift_right_vec); \
  1115. }
  1116. #define SRL_H_4VECS_UH(in0, in1, in2, in3, \
  1117. out0, out1, out2, out3, \
  1118. shift_right_vec) \
  1119. { \
  1120. out0 = (v8u16) __msa_srl_h((v8i16) (in0), (v8i16) (shift_right_vec)); \
  1121. out1 = (v8u16) __msa_srl_h((v8i16) (in1), (v8i16) (shift_right_vec)); \
  1122. out2 = (v8u16) __msa_srl_h((v8i16) (in2), (v8i16) (shift_right_vec)); \
  1123. out3 = (v8u16) __msa_srl_h((v8i16) (in3), (v8i16) (shift_right_vec)); \
  1124. }
  1125. #define SRAR_SATURATE_SIGNED_H(input, right_shift_vec, sat_val) \
  1126. ( { \
  1127. v8i16 out_m; \
  1128. \
  1129. out_m = __msa_srar_h((v8i16) (input), (v8i16) (right_shift_vec)); \
  1130. out_m = __msa_sat_s_h(out_m, (sat_val)); \
  1131. out_m; \
  1132. } )
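/* SRAR_SATURATE_SIGNED_H applies a rounding arithmetic right shift followed
 * by signed saturation, the usual final scaling step of a filter.  Sketch,
 * assuming a hypothetical v8i16 accumulator "acc" to be reduced to 8-bit
 * range (sat_val 7 keeps 8 significant bits):
 *
 *     v8i16 shift = __msa_fill_h(6);
 *     v8i16 out   = SRAR_SATURATE_SIGNED_H(acc, shift, 7);
 *
 * The PCKEV_*_STORE_* macros below combine the byte pack, optionally the 128
 * bias flip (XORI128 variants) and the per-row stores needed to write 4, 6,
 * 8 or 16 result bytes per line. */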
  1133. #define PCKEV_2B_XORI128_STORE_4_BYTES_4(in1, in2, \
  1134. pdst, stride) \
  1135. { \
  1136. uint32_t out0_m, out1_m, out2_m, out3_m; \
  1137. v16i8 tmp0_m; \
  1138. uint8_t *dst_m = (uint8_t *) (pdst); \
  1139. \
  1140. tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \
  1141. tmp0_m = (v16i8) __msa_xori_b((v16u8) tmp0_m, 128); \
  1142. \
  1143. out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \
  1144. out1_m = __msa_copy_u_w((v4i32) tmp0_m, 1); \
  1145. out2_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \
  1146. out3_m = __msa_copy_u_w((v4i32) tmp0_m, 3); \
  1147. \
  1148. STORE_WORD(dst_m, out0_m); \
  1149. dst_m += stride; \
  1150. STORE_WORD(dst_m, out1_m); \
  1151. dst_m += stride; \
  1152. STORE_WORD(dst_m, out2_m); \
  1153. dst_m += stride; \
  1154. STORE_WORD(dst_m, out3_m); \
  1155. }
  1156. #define PCKEV_B_XORI128_STORE_8_BYTES(in1, in2, pdest) \
  1157. { \
  1158. uint64_t out_m; \
  1159. v16i8 tmp_m; \
  1160. \
  1161. tmp_m = __msa_pckev_b((v16i8) (in1), (v16i8) (in2)); \
  1162. tmp_m = (v16i8) __msa_xori_b((v16u8) tmp_m, 128); \
  1163. out_m = __msa_copy_u_d((v2i64) tmp_m, 0); \
  1164. STORE_DWORD((pdest), out_m); \
  1165. }
  1166. #define PCKEV_B_XORI128_STORE_8_BYTES_2(in1, in2, \
  1167. pdst, stride) \
  1168. { \
  1169. uint64_t out0_m, out1_m; \
  1170. v16i8 tmp0_m; \
  1171. uint8_t *dst_m = (uint8_t *) (pdst); \
  1172. \
  1173. tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \
  1174. tmp0_m = (v16i8) __msa_xori_b((v16u8) tmp0_m, 128); \
  1175. \
  1176. out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \
  1177. out1_m = __msa_copy_u_d((v2i64) tmp0_m, 1); \
  1178. \
  1179. STORE_DWORD(dst_m, out0_m); \
  1180. dst_m += stride; \
  1181. STORE_DWORD(dst_m, out1_m); \
  1182. }
  1183. #define PCKEV_B_XORI128_STORE_6_BYTES_4(in1, in2, in3, in4, \
  1184. pdst, stride) \
  1185. { \
  1186. uint32_t out0_m, out1_m, out2_m, out3_m; \
  1187. uint16_t out4_m, out5_m, out6_m, out7_m; \
  1188. v16i8 tmp0_m, tmp1_m; \
  1189. uint8_t *dst_m = (uint8_t *) (pdst); \
  1190. \
  1191. tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \
  1192. tmp1_m = __msa_pckev_b((v16i8) (in4), (v16i8) (in3)); \
  1193. \
  1194. tmp0_m = (v16i8) __msa_xori_b((v16u8) tmp0_m, 128); \
  1195. tmp1_m = (v16i8) __msa_xori_b((v16u8) tmp1_m, 128); \
  1196. \
  1197. out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \
  1198. out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \
  1199. out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0); \
  1200. out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2); \
  1201. \
  1202. out4_m = __msa_copy_u_h((v8i16) tmp0_m, 2); \
  1203. out5_m = __msa_copy_u_h((v8i16) tmp0_m, 6); \
  1204. out6_m = __msa_copy_u_h((v8i16) tmp1_m, 2); \
  1205. out7_m = __msa_copy_u_h((v8i16) tmp1_m, 6); \
  1206. \
  1207. STORE_WORD(dst_m, out0_m); \
  1208. STORE_HWORD((dst_m + 4), out4_m); \
  1209. dst_m += stride; \
  1210. STORE_WORD(dst_m, out1_m); \
  1211. STORE_HWORD((dst_m + 4), out5_m); \
  1212. dst_m += stride; \
  1213. STORE_WORD(dst_m, out2_m); \
  1214. STORE_HWORD((dst_m + 4), out6_m); \
  1215. dst_m += stride; \
  1216. STORE_WORD(dst_m, out3_m); \
  1217. STORE_HWORD((dst_m + 4), out7_m); \
  1218. }
  1219. #define PCKEV_B_4_XORI128_STORE_8_BYTES_4(in1, in2, in3, in4, \
  1220. pdst, stride) \
  1221. { \
  1222. uint64_t out0_m, out1_m, out2_m, out3_m; \
  1223. v16i8 tmp0_m, tmp1_m; \
  1224. uint8_t *dst_m = (uint8_t *) (pdst); \
  1225. \
  1226. tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \
  1227. tmp1_m = __msa_pckev_b((v16i8) (in4), (v16i8) (in3)); \
  1228. \
  1229. tmp0_m = (v16i8) __msa_xori_b((v16u8) tmp0_m, 128); \
  1230. tmp1_m = (v16i8) __msa_xori_b((v16u8) tmp1_m, 128); \
  1231. \
  1232. out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \
  1233. out1_m = __msa_copy_u_d((v2i64) tmp0_m, 1); \
  1234. out2_m = __msa_copy_u_d((v2i64) tmp1_m, 0); \
  1235. out3_m = __msa_copy_u_d((v2i64) tmp1_m, 1); \
  1236. \
  1237. STORE_DWORD(dst_m, out0_m); \
  1238. dst_m += stride; \
  1239. STORE_DWORD(dst_m, out1_m); \
  1240. dst_m += stride; \
  1241. STORE_DWORD(dst_m, out2_m); \
  1242. dst_m += stride; \
  1243. STORE_DWORD(dst_m, out3_m); \
  1244. }
  1245. #define PCKEV_B_XORI128_STORE_VEC(in1, in2, pdest) \
  1246. { \
  1247. v16i8 tmp_m; \
  1248. \
  1249. tmp_m = __msa_pckev_b((v16i8) (in1), (v16i8) (in2)); \
  1250. tmp_m = (v16i8) __msa_xori_b((v16u8) tmp_m, 128); \
  1251. STORE_SB(tmp_m, (pdest)); \
  1252. }
  1253. #define PCKEV_B_STORE_4_BYTES_4(in1, in2, in3, in4, \
  1254. pdst, stride) \
  1255. { \
  1256. uint32_t out0_m, out1_m, out2_m, out3_m; \
  1257. v16i8 tmp0_m, tmp1_m; \
  1258. uint8_t *dst_m = (uint8_t *) (pdst); \
  1259. \
  1260. tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \
  1261. tmp1_m = __msa_pckev_b((v16i8) (in4), (v16i8) (in3)); \
  1262. \
  1263. out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \
  1264. out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \
  1265. out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0); \
  1266. out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2); \
  1267. \
  1268. STORE_WORD(dst_m, out0_m); \
  1269. dst_m += stride; \
  1270. STORE_WORD(dst_m, out1_m); \
  1271. dst_m += stride; \
  1272. STORE_WORD(dst_m, out2_m); \
  1273. dst_m += stride; \
  1274. STORE_WORD(dst_m, out3_m); \
  1275. }
  1276. #define PCKEV_B_STORE_8_BYTES_4(in1, in2, in3, in4, \
  1277. pdst, stride) \
  1278. { \
  1279. uint64_t out0_m, out1_m, out2_m, out3_m; \
  1280. v16i8 tmp0_m, tmp1_m; \
  1281. uint8_t *dst_m = (uint8_t *) (pdst); \
  1282. \
  1283. tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \
  1284. tmp1_m = __msa_pckev_b((v16i8) (in4), (v16i8) (in3)); \
  1285. \
  1286. out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \
  1287. out1_m = __msa_copy_u_d((v2i64) tmp0_m, 1); \
  1288. out2_m = __msa_copy_u_d((v2i64) tmp1_m, 0); \
  1289. out3_m = __msa_copy_u_d((v2i64) tmp1_m, 1); \
  1290. \
  1291. STORE_DWORD(dst_m, out0_m); \
  1292. dst_m += stride; \
  1293. STORE_DWORD(dst_m, out1_m); \
  1294. dst_m += stride; \
  1295. STORE_DWORD(dst_m, out2_m); \
  1296. dst_m += stride; \
  1297. STORE_DWORD(dst_m, out3_m); \
  1298. }
  1299. #define UNPCK_SIGNED_B_TO_H(in, out1, out2) \
  1300. { \
  1301. v16i8 tmp_m; \
  1302. \
  1303. tmp_m = __msa_clti_s_b((v16i8) (in), 0); \
  1304. out1 = (v8i16) __msa_ilvr_b(tmp_m, (v16i8) (in)); \
  1305. out2 = (v8i16) __msa_ilvl_b(tmp_m, (v16i8) (in)); \
  1306. }
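/* UNPCK_SIGNED_B_TO_H sign-extends the sixteen bytes of "in" into two
 * halfword vectors: clti_s_b produces an all-ones byte wherever the input is
 * negative, and interleaving the input with that mask yields the low and high
 * eight sign-extended halfwords. */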
  1307. #define SWAP_VECS(Vec0, Vec1) \
  1308. { \
  1309. Vec0 = Vec0 ^ Vec1; \
  1310. Vec1 = Vec0 ^ Vec1; \
  1311. Vec0 = Vec0 ^ Vec1; \
  1312. }
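/* SWAP_VECS exchanges two vector variables with the classic three-XOR trick,
 * avoiding a temporary; it must only be applied to two distinct variables. */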
  1313. #endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */