You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1408 lines
61KB

  1. /*
  2. * This is optimized for sh, which have post increment addressing (*p++).
  3. * Some CPU may be index (p[n]) faster than post increment (*p++).
  4. *
  5. * copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
  6. *
  7. * This file is part of FFmpeg.
  8. *
  9. * FFmpeg is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation; either
  12. * version 2.1 of the License, or (at your option) any later version.
  13. *
  14. * FFmpeg is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with FFmpeg; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. */
  23. #define PIXOP2(OPNAME, OP) \
  24. \
  25. static inline void OPNAME ## _pixels4_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  26. {\
  27. do {\
  28. OP(LP(dst ),rnd_avg32(LPC(src1 ),LPC(src2 )) ); \
  29. src1+=src_stride1; \
  30. src2+=src_stride2; \
  31. dst+=dst_stride; \
  32. } while(--h); \
  33. }\
  34. \
  35. static inline void OPNAME ## _pixels4_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  36. {\
  37. do {\
  38. OP(LP(dst ),rnd_avg32(AV_RN32(src1 ),LPC(src2 )) ); \
  39. src1+=src_stride1; \
  40. src2+=src_stride2; \
  41. dst+=dst_stride; \
  42. } while(--h); \
  43. }\
  44. \
  45. static inline void OPNAME ## _no_rnd_pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  46. {\
  47. do {\
  48. OP(LP(dst ),no_rnd_avg32(AV_RN32(src1 ),LPC(src2 )) ); \
  49. OP(LP(dst+4),no_rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
  50. OP(LP(dst+8),no_rnd_avg32(AV_RN32(src1+8),LPC(src2+8)) ); \
  51. OP(LP(dst+12),no_rnd_avg32(AV_RN32(src1+12),LPC(src2+12)) ); \
  52. src1+=src_stride1; \
  53. src2+=src_stride2; \
  54. dst+=dst_stride; \
  55. } while(--h); \
  56. }\
  57. \
  58. static inline void OPNAME ## _pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  59. {\
  60. do {\
  61. OP(LP(dst ),rnd_avg32(AV_RN32(src1 ),LPC(src2 )) ); \
  62. OP(LP(dst+4),rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
  63. OP(LP(dst+8),rnd_avg32(AV_RN32(src1+8),LPC(src2+8)) ); \
  64. OP(LP(dst+12),rnd_avg32(AV_RN32(src1+12),LPC(src2+12)) ); \
  65. src1+=src_stride1; \
  66. src2+=src_stride2; \
  67. dst+=dst_stride; \
  68. } while(--h); \
  69. }\
  70. \
  71. static inline void OPNAME ## _no_rnd_pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  72. {\
  73. do { /* onlye src2 aligned */\
  74. OP(LP(dst ),no_rnd_avg32(AV_RN32(src1 ),LPC(src2 )) ); \
  75. OP(LP(dst+4),no_rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
  76. src1+=src_stride1; \
  77. src2+=src_stride2; \
  78. dst+=dst_stride; \
  79. } while(--h); \
  80. }\
  81. \
  82. static inline void OPNAME ## _pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  83. {\
  84. do {\
  85. OP(LP(dst ),rnd_avg32(AV_RN32(src1 ),LPC(src2 )) ); \
  86. OP(LP(dst+4),rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
  87. src1+=src_stride1; \
  88. src2+=src_stride2; \
  89. dst+=dst_stride; \
  90. } while(--h); \
  91. }\
  92. \
  93. static inline void OPNAME ## _no_rnd_pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  94. {\
  95. do {\
  96. OP(LP(dst ),no_rnd_avg32(LPC(src1 ),LPC(src2 )) ); \
  97. OP(LP(dst+4),no_rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
  98. src1+=src_stride1; \
  99. src2+=src_stride2; \
  100. dst+=dst_stride; \
  101. } while(--h); \
  102. }\
  103. \
  104. static inline void OPNAME ## _pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  105. {\
  106. do {\
  107. OP(LP(dst ),rnd_avg32(LPC(src1 ),LPC(src2 )) ); \
  108. OP(LP(dst+4),rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
  109. src1+=src_stride1; \
  110. src2+=src_stride2; \
  111. dst+=dst_stride; \
  112. } while(--h); \
  113. }\
  114. \
  115. static inline void OPNAME ## _no_rnd_pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  116. {\
  117. do {\
  118. OP(LP(dst ),no_rnd_avg32(LPC(src1 ),LPC(src2 )) ); \
  119. OP(LP(dst+4),no_rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
  120. OP(LP(dst+8),no_rnd_avg32(LPC(src1+8),LPC(src2+8)) ); \
  121. OP(LP(dst+12),no_rnd_avg32(LPC(src1+12),LPC(src2+12)) ); \
  122. src1+=src_stride1; \
  123. src2+=src_stride2; \
  124. dst+=dst_stride; \
  125. } while(--h); \
  126. }\
  127. \
  128. static inline void OPNAME ## _pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  129. {\
  130. do {\
  131. OP(LP(dst ),rnd_avg32(LPC(src1 ),LPC(src2 )) ); \
  132. OP(LP(dst+4),rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
  133. OP(LP(dst+8),rnd_avg32(LPC(src1+8),LPC(src2+8)) ); \
  134. OP(LP(dst+12),rnd_avg32(LPC(src1+12),LPC(src2+12)) ); \
  135. src1+=src_stride1; \
  136. src2+=src_stride2; \
  137. dst+=dst_stride; \
  138. } while(--h); \
  139. }\
  140. \
  141. static inline void OPNAME ## _no_rnd_pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  142. { OPNAME ## _no_rnd_pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
  143. \
  144. static inline void OPNAME ## _pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  145. { OPNAME ## _pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
  146. \
  147. static inline void OPNAME ## _no_rnd_pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  148. { OPNAME ## _no_rnd_pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
  149. \
  150. static inline void OPNAME ## _pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  151. { OPNAME ## _pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
  152. \
  153. static inline void OPNAME ## _pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  154. do { \
  155. uint32_t a0,a1,a2,a3; \
  156. UNPACK(a0,a1,LPC(src1),LPC(src2)); \
  157. UNPACK(a2,a3,LPC(src3),LPC(src4)); \
  158. OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
  159. UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
  160. UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
  161. OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
  162. src1+=src_stride1;\
  163. src2+=src_stride2;\
  164. src3+=src_stride3;\
  165. src4+=src_stride4;\
  166. dst+=dst_stride;\
  167. } while(--h); \
  168. } \
  169. \
  170. static inline void OPNAME ## _no_rnd_pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  171. do { \
  172. uint32_t a0,a1,a2,a3; \
  173. UNPACK(a0,a1,LPC(src1),LPC(src2)); \
  174. UNPACK(a2,a3,LPC(src3),LPC(src4)); \
  175. OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
  176. UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
  177. UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
  178. OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
  179. src1+=src_stride1;\
  180. src2+=src_stride2;\
  181. src3+=src_stride3;\
  182. src4+=src_stride4;\
  183. dst+=dst_stride;\
  184. } while(--h); \
  185. } \
  186. \
  187. static inline void OPNAME ## _pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  188. do { \
  189. uint32_t a0,a1,a2,a3; /* src1 only not aligned */\
  190. UNPACK(a0,a1,AV_RN32(src1),LPC(src2)); \
  191. UNPACK(a2,a3,LPC(src3),LPC(src4)); \
  192. OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
  193. UNPACK(a0,a1,AV_RN32(src1+4),LPC(src2+4)); \
  194. UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
  195. OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
  196. src1+=src_stride1;\
  197. src2+=src_stride2;\
  198. src3+=src_stride3;\
  199. src4+=src_stride4;\
  200. dst+=dst_stride;\
  201. } while(--h); \
  202. } \
  203. \
  204. static inline void OPNAME ## _no_rnd_pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  205. do { \
  206. uint32_t a0,a1,a2,a3; \
  207. UNPACK(a0,a1,AV_RN32(src1),LPC(src2)); \
  208. UNPACK(a2,a3,LPC(src3),LPC(src4)); \
  209. OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
  210. UNPACK(a0,a1,AV_RN32(src1+4),LPC(src2+4)); \
  211. UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
  212. OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
  213. src1+=src_stride1;\
  214. src2+=src_stride2;\
  215. src3+=src_stride3;\
  216. src4+=src_stride4;\
  217. dst+=dst_stride;\
  218. } while(--h); \
  219. } \
  220. \
  221. static inline void OPNAME ## _pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  222. do { \
  223. uint32_t a0,a1,a2,a3; \
  224. UNPACK(a0,a1,LPC(src1),LPC(src2)); \
  225. UNPACK(a2,a3,LPC(src3),LPC(src4)); \
  226. OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
  227. UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
  228. UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
  229. OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
  230. UNPACK(a0,a1,LPC(src1+8),LPC(src2+8)); \
  231. UNPACK(a2,a3,LPC(src3+8),LPC(src4+8)); \
  232. OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
  233. UNPACK(a0,a1,LPC(src1+12),LPC(src2+12)); \
  234. UNPACK(a2,a3,LPC(src3+12),LPC(src4+12)); \
  235. OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \
  236. src1+=src_stride1;\
  237. src2+=src_stride2;\
  238. src3+=src_stride3;\
  239. src4+=src_stride4;\
  240. dst+=dst_stride;\
  241. } while(--h); \
  242. } \
  243. \
  244. static inline void OPNAME ## _no_rnd_pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  245. do { \
  246. uint32_t a0,a1,a2,a3; \
  247. UNPACK(a0,a1,LPC(src1),LPC(src2)); \
  248. UNPACK(a2,a3,LPC(src3),LPC(src4)); \
  249. OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
  250. UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
  251. UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
  252. OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
  253. UNPACK(a0,a1,LPC(src1+8),LPC(src2+8)); \
  254. UNPACK(a2,a3,LPC(src3+8),LPC(src4+8)); \
  255. OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \
  256. UNPACK(a0,a1,LPC(src1+12),LPC(src2+12)); \
  257. UNPACK(a2,a3,LPC(src3+12),LPC(src4+12)); \
  258. OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \
  259. src1+=src_stride1;\
  260. src2+=src_stride2;\
  261. src3+=src_stride3;\
  262. src4+=src_stride4;\
  263. dst+=dst_stride;\
  264. } while(--h); \
  265. } \
  266. \
  267. static inline void OPNAME ## _pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  268. do { /* src1 is unaligned */\
  269. uint32_t a0,a1,a2,a3; \
  270. UNPACK(a0,a1,AV_RN32(src1),LPC(src2)); \
  271. UNPACK(a2,a3,LPC(src3),LPC(src4)); \
  272. OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
  273. UNPACK(a0,a1,AV_RN32(src1+4),LPC(src2+4)); \
  274. UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
  275. OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
  276. UNPACK(a0,a1,AV_RN32(src1+8),LPC(src2+8)); \
  277. UNPACK(a2,a3,LPC(src3+8),LPC(src4+8)); \
  278. OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
  279. UNPACK(a0,a1,AV_RN32(src1+12),LPC(src2+12)); \
  280. UNPACK(a2,a3,LPC(src3+12),LPC(src4+12)); \
  281. OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \
  282. src1+=src_stride1;\
  283. src2+=src_stride2;\
  284. src3+=src_stride3;\
  285. src4+=src_stride4;\
  286. dst+=dst_stride;\
  287. } while(--h); \
  288. } \
  289. \
  290. static inline void OPNAME ## _no_rnd_pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  291. do { \
  292. uint32_t a0,a1,a2,a3; \
  293. UNPACK(a0,a1,AV_RN32(src1),LPC(src2)); \
  294. UNPACK(a2,a3,LPC(src3),LPC(src4)); \
  295. OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
  296. UNPACK(a0,a1,AV_RN32(src1+4),LPC(src2+4)); \
  297. UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
  298. OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
  299. UNPACK(a0,a1,AV_RN32(src1+8),LPC(src2+8)); \
  300. UNPACK(a2,a3,LPC(src3+8),LPC(src4+8)); \
  301. OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \
  302. UNPACK(a0,a1,AV_RN32(src1+12),LPC(src2+12)); \
  303. UNPACK(a2,a3,LPC(src3+12),LPC(src4+12)); \
  304. OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \
  305. src1+=src_stride1;\
  306. src2+=src_stride2;\
  307. src3+=src_stride3;\
  308. src4+=src_stride4;\
  309. dst+=dst_stride;\
  310. } while(--h); \
  311. } \
  312. \
/* Instantiate the PIXOP2 family twice:
 *   op_avg: rounded 32-bit average into the destination (read-modify-write)
 *   op_put: plain store
 * producing the avg_* and put_* pixel helpers used below. */
#define op_avg(a, b) a = rnd_avg32(a,b)
#define op_put(a, b) a = b
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
/* Scalar rounded 2- and 4-sample averages.
 * NOTE(review): not referenced in this chunk — presumably used further
 * down the file; confirm before removing. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
  321. static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
  322. {
  323. const int A=(16-x16)*(16-y16);
  324. const int B=( x16)*(16-y16);
  325. const int C=(16-x16)*( y16);
  326. const int D=( x16)*( y16);
  327. do {
  328. int t0,t1,t2,t3;
  329. uint8_t *s0 = src;
  330. uint8_t *s1 = src+stride;
  331. t0 = *s0++; t2 = *s1++;
  332. t1 = *s0++; t3 = *s1++;
  333. dst[0]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
  334. t0 = *s0++; t2 = *s1++;
  335. dst[1]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
  336. t1 = *s0++; t3 = *s1++;
  337. dst[2]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
  338. t0 = *s0++; t2 = *s1++;
  339. dst[3]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
  340. t1 = *s0++; t3 = *s1++;
  341. dst[4]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
  342. t0 = *s0++; t2 = *s1++;
  343. dst[5]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
  344. t1 = *s0++; t3 = *s1++;
  345. dst[6]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
  346. t0 = *s0++; t2 = *s1++;
  347. dst[7]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
  348. dst+= stride;
  349. src+= stride;
  350. }while(--h);
  351. }
/*
 * Full affine global motion compensation over an 8-pixel-wide, h-row block.
 * (ox, oy) is the source position of the first pixel in fixed point with
 * 16+shift fractional bits (vx>>16 yields 1/s-pel units, the low `shift`
 * bits of that are the bilinear fraction). (dxx, dyx) are the per-column
 * increments, (dxy, dyy) the per-row increments of the motion vector.
 * `r` is the rounding constant added before the >>(shift*2) normalization;
 * width/height bound the valid source area, out-of-range coordinates are
 * clamped to the edge via av_clip.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
int y, vx, vy;
const int s= 1<<shift;
/* turn sizes into the largest valid coordinate */
width--;
height--;
for(y=0; y<h; y++){
int x;
vx= ox;
vy= oy;
for(x=0; x<8; x++){ //XXX FIXME optimize
int src_x, src_y, frac_x, frac_y, index;
/* integer source position and 1/s-pel fractions */
src_x= vx>>16;
src_y= vy>>16;
frac_x= src_x&(s-1);
frac_y= src_y&(s-1);
src_x>>=shift;
src_y>>=shift;
/* the (unsigned) casts make negative coordinates compare as huge,
 * so one compare handles both < 0 and > max */
if((unsigned)src_x < width){
if((unsigned)src_y < height){
/* fully inside: 2-D bilinear interpolation of the 2x2 block */
index= src_x + src_y*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
+ src[index +1]* frac_x )*(s-frac_y)
+ ( src[index+stride ]*(s-frac_x)
+ src[index+stride+1]* frac_x )* frac_y
+ r)>>(shift*2);
}else{
/* y clamped to the edge: interpolate horizontally only */
index= src_x + av_clip(src_y, 0, height)*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
+ src[index +1]* frac_x )*s
+ r)>>(shift*2);
}
}else{
if((unsigned)src_y < height){
/* x clamped to the edge: interpolate vertically only */
index= av_clip(src_x, 0, width) + src_y*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
+ src[index+stride ]* frac_y )*s
+ r)>>(shift*2);
}else{
/* both clamped: nearest edge pixel, no interpolation */
index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
dst[y*stride + x]= src[index ];
}
}
/* advance the motion vector along the row */
vx+= dxx;
vy+= dyx;
}
/* advance the row start of the motion vector */
ox += dxy;
oy += dyy;
}
}
  403. #define H264_CHROMA_MC(OPNAME, OP)\
  404. static void OPNAME ## h264_chroma_mc2_sh4(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
  405. const int A=(8-x)*(8-y);\
  406. const int B=( x)*(8-y);\
  407. const int C=(8-x)*( y);\
  408. const int D=( x)*( y);\
  409. \
  410. assert(x<8 && y<8 && x>=0 && y>=0);\
  411. \
  412. do {\
  413. int t0,t1,t2,t3; \
  414. uint8_t *s0 = src; \
  415. uint8_t *s1 = src+stride; \
  416. t0 = *s0++; t2 = *s1++; \
  417. t1 = *s0++; t3 = *s1++; \
  418. OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
  419. t0 = *s0++; t2 = *s1++; \
  420. OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
  421. dst+= stride;\
  422. src+= stride;\
  423. }while(--h);\
  424. }\
  425. \
  426. static void OPNAME ## h264_chroma_mc4_sh4(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
  427. const int A=(8-x)*(8-y);\
  428. const int B=( x)*(8-y);\
  429. const int C=(8-x)*( y);\
  430. const int D=( x)*( y);\
  431. \
  432. assert(x<8 && y<8 && x>=0 && y>=0);\
  433. \
  434. do {\
  435. int t0,t1,t2,t3; \
  436. uint8_t *s0 = src; \
  437. uint8_t *s1 = src+stride; \
  438. t0 = *s0++; t2 = *s1++; \
  439. t1 = *s0++; t3 = *s1++; \
  440. OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
  441. t0 = *s0++; t2 = *s1++; \
  442. OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
  443. t1 = *s0++; t3 = *s1++; \
  444. OP(dst[2], (A*t0 + B*t1 + C*t2 + D*t3));\
  445. t0 = *s0++; t2 = *s1++; \
  446. OP(dst[3], (A*t1 + B*t0 + C*t3 + D*t2));\
  447. dst+= stride;\
  448. src+= stride;\
  449. }while(--h);\
  450. }\
  451. \
  452. static void OPNAME ## h264_chroma_mc8_sh4(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
  453. const int A=(8-x)*(8-y);\
  454. const int B=( x)*(8-y);\
  455. const int C=(8-x)*( y);\
  456. const int D=( x)*( y);\
  457. \
  458. assert(x<8 && y<8 && x>=0 && y>=0);\
  459. \
  460. do {\
  461. int t0,t1,t2,t3; \
  462. uint8_t *s0 = src; \
  463. uint8_t *s1 = src+stride; \
  464. t0 = *s0++; t2 = *s1++; \
  465. t1 = *s0++; t3 = *s1++; \
  466. OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
  467. t0 = *s0++; t2 = *s1++; \
  468. OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
  469. t1 = *s0++; t3 = *s1++; \
  470. OP(dst[2], (A*t0 + B*t1 + C*t2 + D*t3));\
  471. t0 = *s0++; t2 = *s1++; \
  472. OP(dst[3], (A*t1 + B*t0 + C*t3 + D*t2));\
  473. t1 = *s0++; t3 = *s1++; \
  474. OP(dst[4], (A*t0 + B*t1 + C*t2 + D*t3));\
  475. t0 = *s0++; t2 = *s1++; \
  476. OP(dst[5], (A*t1 + B*t0 + C*t3 + D*t2));\
  477. t1 = *s0++; t3 = *s1++; \
  478. OP(dst[6], (A*t0 + B*t1 + C*t2 + D*t3));\
  479. t0 = *s0++; t2 = *s1++; \
  480. OP(dst[7], (A*t1 + B*t0 + C*t3 + D*t2));\
  481. dst+= stride;\
  482. src+= stride;\
  483. }while(--h);\
  484. }
/* Store operators for the chroma kernels: the weights A..D sum to 64,
 * so "(b + 32) >> 6" is the rounded normalization of the accumulator.
 *   op_put: normalize and store
 *   op_avg: normalize, then rounded average with the existing dst pixel */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)
H264_CHROMA_MC(put_ , op_put)
H264_CHROMA_MC(avg_ , op_avg)
#undef op_avg
#undef op_put
  491. #define QPEL_MC(r, OPNAME, RND, OP) \
  492. static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  493. uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
  494. do {\
  495. uint8_t *s = src; \
  496. int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
  497. src0= *s++;\
  498. src1= *s++;\
  499. src2= *s++;\
  500. src3= *s++;\
  501. src4= *s++;\
  502. OP(dst[0], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
  503. src5= *s++;\
  504. OP(dst[1], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
  505. src6= *s++;\
  506. OP(dst[2], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
  507. src7= *s++;\
  508. OP(dst[3], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
  509. src8= *s++;\
  510. OP(dst[4], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
  511. OP(dst[5], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
  512. OP(dst[6], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
  513. OP(dst[7], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
  514. dst+=dstStride;\
  515. src+=srcStride;\
  516. }while(--h);\
  517. }\
  518. \
  519. static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  520. uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
  521. int w=8;\
  522. do{\
  523. uint8_t *s = src, *d=dst;\
  524. int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
  525. src0 = *s; s+=srcStride; \
  526. src1 = *s; s+=srcStride; \
  527. src2 = *s; s+=srcStride; \
  528. src3 = *s; s+=srcStride; \
  529. src4 = *s; s+=srcStride; \
  530. OP(*d, (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));d+=dstStride;\
  531. src5 = *s; s+=srcStride; \
  532. OP(*d, (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));d+=dstStride;\
  533. src6 = *s; s+=srcStride; \
  534. OP(*d, (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));d+=dstStride;\
  535. src7 = *s; s+=srcStride; \
  536. OP(*d, (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));d+=dstStride;\
  537. src8 = *s; \
  538. OP(*d, (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));d+=dstStride;\
  539. OP(*d, (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));d+=dstStride;\
  540. OP(*d, (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));d+=dstStride;\
  541. OP(*d, (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
  542. dst++;\
  543. src++;\
  544. }while(--w);\
  545. }\
  546. \
  547. static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  548. uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
  549. do {\
  550. uint8_t *s = src;\
  551. int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
  552. int src9,src10,src11,src12,src13,src14,src15,src16;\
  553. src0= *s++;\
  554. src1= *s++;\
  555. src2= *s++;\
  556. src3= *s++;\
  557. src4= *s++;\
  558. OP(dst[ 0], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
  559. src5= *s++;\
  560. OP(dst[ 1], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
  561. src6= *s++;\
  562. OP(dst[ 2], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
  563. src7= *s++;\
  564. OP(dst[ 3], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
  565. src8= *s++;\
  566. OP(dst[ 4], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
  567. src9= *s++;\
  568. OP(dst[ 5], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
  569. src10= *s++;\
  570. OP(dst[ 6], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
  571. src11= *s++;\
  572. OP(dst[ 7], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
  573. src12= *s++;\
  574. OP(dst[ 8], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
  575. src13= *s++;\
  576. OP(dst[ 9], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
  577. src14= *s++;\
  578. OP(dst[10], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
  579. src15= *s++;\
  580. OP(dst[11], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
  581. src16= *s++;\
  582. OP(dst[12], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
  583. OP(dst[13], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
  584. OP(dst[14], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
  585. OP(dst[15], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
  586. dst+=dstStride;\
  587. src+=srcStride;\
  588. }while(--h);\
  589. }\
  590. \
  591. static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  592. uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
  593. int w=16;\
  594. do {\
  595. uint8_t *s = src, *d=dst;\
  596. int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
  597. int src9,src10,src11,src12,src13,src14,src15,src16;\
  598. src0 = *s; s+=srcStride; \
  599. src1 = *s; s+=srcStride; \
  600. src2 = *s; s+=srcStride; \
  601. src3 = *s; s+=srcStride; \
  602. src4 = *s; s+=srcStride; \
  603. OP(*d, (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));d+=dstStride;\
  604. src5 = *s; s+=srcStride; \
  605. OP(*d, (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));d+=dstStride;\
  606. src6 = *s; s+=srcStride; \
  607. OP(*d, (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));d+=dstStride;\
  608. src7 = *s; s+=srcStride; \
  609. OP(*d, (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));d+=dstStride;\
  610. src8 = *s; s+=srcStride; \
  611. OP(*d, (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));d+=dstStride;\
  612. src9 = *s; s+=srcStride; \
  613. OP(*d, (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));d+=dstStride;\
  614. src10 = *s; s+=srcStride; \
  615. OP(*d, (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));d+=dstStride;\
  616. src11 = *s; s+=srcStride; \
  617. OP(*d, (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));d+=dstStride;\
  618. src12 = *s; s+=srcStride; \
  619. OP(*d, (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));d+=dstStride;\
  620. src13 = *s; s+=srcStride; \
  621. OP(*d, (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));d+=dstStride;\
  622. src14 = *s; s+=srcStride; \
  623. OP(*d, (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));d+=dstStride;\
  624. src15 = *s; s+=srcStride; \
  625. OP(*d, (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));d+=dstStride;\
  626. src16 = *s; \
  627. OP(*d, (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));d+=dstStride;\
  628. OP(*d, (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));d+=dstStride;\
  629. OP(*d, (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));d+=dstStride;\
  630. OP(*d, (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
  631. dst++;\
  632. src++;\
  633. }while(--w);\
  634. }\
  635. \
  636. static void OPNAME ## qpel8_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){\
  637. OPNAME ## pixels8_c(dst, src, stride, 8);\
  638. }\
  639. \
  640. static void OPNAME ## qpel8_mc10_sh4(uint8_t *dst, uint8_t *src, int stride){\
  641. uint8_t half[64];\
  642. put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
  643. OPNAME ## pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8);\
  644. }\
  645. \
  646. static void OPNAME ## qpel8_mc20_sh4(uint8_t *dst, uint8_t *src, int stride){\
  647. OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
  648. }\
  649. \
  650. static void OPNAME ## qpel8_mc30_sh4(uint8_t *dst, uint8_t *src, int stride){\
  651. uint8_t half[64];\
  652. put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
  653. OPNAME ## pixels8_l2_aligned2(dst, src+1, half, stride, stride, 8, 8);\
  654. }\
  655. \
  656. static void OPNAME ## qpel8_mc01_sh4(uint8_t *dst, uint8_t *src, int stride){\
  657. uint8_t full[16*9];\
  658. uint8_t half[64];\
  659. copy_block9(full, src, 16, stride, 9);\
  660. put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
  661. OPNAME ## pixels8_l2_aligned(dst, full, half, stride, 16, 8, 8);\
  662. }\
  663. \
  664. static void OPNAME ## qpel8_mc02_sh4(uint8_t *dst, uint8_t *src, int stride){\
  665. uint8_t full[16*9];\
  666. copy_block9(full, src, 16, stride, 9);\
  667. OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
  668. }\
  669. \
  670. static void OPNAME ## qpel8_mc03_sh4(uint8_t *dst, uint8_t *src, int stride){\
  671. uint8_t full[16*9];\
  672. uint8_t half[64];\
  673. copy_block9(full, src, 16, stride, 9);\
  674. put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
  675. OPNAME ## pixels8_l2_aligned(dst, full+16, half, stride, 16, 8, 8);\
  676. }\
  677. static void OPNAME ## qpel8_mc11_sh4(uint8_t *dst, uint8_t *src, int stride){\
  678. uint8_t full[16*9];\
  679. uint8_t halfH[72];\
  680. uint8_t halfHV[64];\
  681. copy_block9(full, src, 16, stride, 9);\
  682. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  683. put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
  684. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
  685. OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
  686. }\
  687. static void OPNAME ## qpel8_mc31_sh4(uint8_t *dst, uint8_t *src, int stride){\
  688. uint8_t full[16*9];\
  689. uint8_t halfH[72];\
  690. uint8_t halfHV[64];\
  691. copy_block9(full, src, 16, stride, 9);\
  692. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  693. put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
  694. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
  695. OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
  696. }\
  697. static void OPNAME ## qpel8_mc13_sh4(uint8_t *dst, uint8_t *src, int stride){\
  698. uint8_t full[16*9];\
  699. uint8_t halfH[72];\
  700. uint8_t halfHV[64];\
  701. copy_block9(full, src, 16, stride, 9);\
  702. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  703. put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
  704. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
  705. OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
  706. }\
  707. static void OPNAME ## qpel8_mc33_sh4(uint8_t *dst, uint8_t *src, int stride){\
  708. uint8_t full[16*9];\
  709. uint8_t halfH[72];\
  710. uint8_t halfHV[64];\
  711. copy_block9(full, src, 16, stride, 9);\
  712. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  713. put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
  714. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
  715. OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
  716. }\
  717. static void OPNAME ## qpel8_mc21_sh4(uint8_t *dst, uint8_t *src, int stride){\
  718. uint8_t halfH[72];\
  719. uint8_t halfHV[64];\
  720. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
  721. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
  722. OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
  723. }\
  724. static void OPNAME ## qpel8_mc23_sh4(uint8_t *dst, uint8_t *src, int stride){\
  725. uint8_t halfH[72];\
  726. uint8_t halfHV[64];\
  727. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
  728. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
  729. OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
  730. }\
  731. static void OPNAME ## qpel8_mc12_sh4(uint8_t *dst, uint8_t *src, int stride){\
  732. uint8_t full[16*9];\
  733. uint8_t halfH[72];\
  734. copy_block9(full, src, 16, stride, 9);\
  735. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  736. put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
  737. OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
  738. }\
  739. static void OPNAME ## qpel8_mc32_sh4(uint8_t *dst, uint8_t *src, int stride){\
  740. uint8_t full[16*9];\
  741. uint8_t halfH[72];\
  742. copy_block9(full, src, 16, stride, 9);\
  743. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  744. put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
  745. OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
  746. }\
  747. static void OPNAME ## qpel8_mc22_sh4(uint8_t *dst, uint8_t *src, int stride){\
  748. uint8_t halfH[72];\
  749. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
  750. OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
  751. }\
  752. static void OPNAME ## qpel16_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){\
  753. OPNAME ## pixels16_c(dst, src, stride, 16);\
  754. }\
  755. \
  756. static void OPNAME ## qpel16_mc10_sh4(uint8_t *dst, uint8_t *src, int stride){\
  757. uint8_t half[256];\
  758. put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
  759. OPNAME ## pixels16_l2_aligned2(dst, src, half, stride, stride, 16, 16);\
  760. }\
  761. \
  762. static void OPNAME ## qpel16_mc20_sh4(uint8_t *dst, uint8_t *src, int stride){\
  763. OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
  764. }\
  765. \
  766. static void OPNAME ## qpel16_mc30_sh4(uint8_t *dst, uint8_t *src, int stride){\
  767. uint8_t half[256];\
  768. put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
  769. OPNAME ## pixels16_l2_aligned2(dst, src+1, half, stride, stride, 16, 16);\
  770. }\
  771. \
  772. static void OPNAME ## qpel16_mc01_sh4(uint8_t *dst, uint8_t *src, int stride){\
  773. uint8_t full[24*17];\
  774. uint8_t half[256];\
  775. copy_block17(full, src, 24, stride, 17);\
  776. put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
  777. OPNAME ## pixels16_l2_aligned(dst, full, half, stride, 24, 16, 16);\
  778. }\
  779. \
  780. static void OPNAME ## qpel16_mc02_sh4(uint8_t *dst, uint8_t *src, int stride){\
  781. uint8_t full[24*17];\
  782. copy_block17(full, src, 24, stride, 17);\
  783. OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
  784. }\
  785. \
  786. static void OPNAME ## qpel16_mc03_sh4(uint8_t *dst, uint8_t *src, int stride){\
  787. uint8_t full[24*17];\
  788. uint8_t half[256];\
  789. copy_block17(full, src, 24, stride, 17);\
  790. put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
  791. OPNAME ## pixels16_l2_aligned(dst, full+24, half, stride, 24, 16, 16);\
  792. }\
  793. static void OPNAME ## qpel16_mc11_sh4(uint8_t *dst, uint8_t *src, int stride){\
  794. uint8_t full[24*17];\
  795. uint8_t halfH[272];\
  796. uint8_t halfHV[256];\
  797. copy_block17(full, src, 24, stride, 17);\
  798. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  799. put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
  800. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
  801. OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
  802. }\
  803. static void OPNAME ## qpel16_mc31_sh4(uint8_t *dst, uint8_t *src, int stride){\
  804. uint8_t full[24*17];\
  805. uint8_t halfH[272];\
  806. uint8_t halfHV[256];\
  807. copy_block17(full, src, 24, stride, 17);\
  808. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  809. put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
  810. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
  811. OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
  812. }\
  813. static void OPNAME ## qpel16_mc13_sh4(uint8_t *dst, uint8_t *src, int stride){\
  814. uint8_t full[24*17];\
  815. uint8_t halfH[272];\
  816. uint8_t halfHV[256];\
  817. copy_block17(full, src, 24, stride, 17);\
  818. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  819. put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
  820. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
  821. OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
  822. }\
  823. static void OPNAME ## qpel16_mc33_sh4(uint8_t *dst, uint8_t *src, int stride){\
  824. uint8_t full[24*17];\
  825. uint8_t halfH[272];\
  826. uint8_t halfHV[256];\
  827. copy_block17(full, src, 24, stride, 17);\
  828. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  829. put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
  830. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
  831. OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
  832. }\
  833. static void OPNAME ## qpel16_mc21_sh4(uint8_t *dst, uint8_t *src, int stride){\
  834. uint8_t halfH[272];\
  835. uint8_t halfHV[256];\
  836. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
  837. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
  838. OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
  839. }\
  840. static void OPNAME ## qpel16_mc23_sh4(uint8_t *dst, uint8_t *src, int stride){\
  841. uint8_t halfH[272];\
  842. uint8_t halfHV[256];\
  843. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
  844. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
  845. OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
  846. }\
  847. static void OPNAME ## qpel16_mc12_sh4(uint8_t *dst, uint8_t *src, int stride){\
  848. uint8_t full[24*17];\
  849. uint8_t halfH[272];\
  850. copy_block17(full, src, 24, stride, 17);\
  851. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  852. put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
  853. OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
  854. }\
  855. static void OPNAME ## qpel16_mc32_sh4(uint8_t *dst, uint8_t *src, int stride){\
  856. uint8_t full[24*17];\
  857. uint8_t halfH[272];\
  858. copy_block17(full, src, 24, stride, 17);\
  859. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  860. put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
  861. OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
  862. }\
  863. static void OPNAME ## qpel16_mc22_sh4(uint8_t *dst, uint8_t *src, int stride){\
  864. uint8_t halfH[272];\
  865. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
  866. OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
  867. }
/*
 * Store operators plugged into QPEL_MC.  b is the raw 20/-5/1 filter sum
 * (scaled by 32); cm[] clips the result to 0..255.  The "+ 16" forms round
 * to nearest, the "+ 15" ("no_rnd") forms truncate.
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1) /* rounded average with existing dst pixel */
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1) /* truncating average */
#define op_put(a, b) a = cm[((b) + 16)>>5] /* plain rounded store */
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5] /* plain truncating store */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
/*
 * H264_LOWPASS expands to the three H.264 6-tap (1,-5,20,20,-5,1)
 * half-sample interpolation kernels — horizontal, vertical, and the
 * two-pass horizontal+vertical — plus 4/8/16-wide wrappers.
 * OP stores a single-pass result (sum scaled by 32); OP2 stores the
 * two-pass result (sum scaled by 1024).  Loads are deliberately
 * interleaved with the filter stores for the SH4 pipeline (see the
 * file header), so statement order is significant — do not reorder.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static inline void OPNAME ## h264_qpel_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,int w,int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clip-to-0..255 lookup table */ \
    do {\
        int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
        uint8_t *s = src-2; /* filter taps reach 2 pixels left of the output */ \
        srcB = *s++;\
        srcA = *s++;\
        src0 = *s++;\
        src1 = *s++;\
        src2 = *s++;\
        src3 = *s++;\
        OP(dst[0], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        src4 = *s++;\
        OP(dst[1], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        src5 = *s++;\
        OP(dst[2], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        src6 = *s++;\
        OP(dst[3], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        if (w>4) { /* continue the unrolled tap chain for 8/16-wide blocks */ \
            int src7,src8,src9,src10; \
            src7 = *s++;\
            OP(dst[4], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
            src8 = *s++;\
            OP(dst[5], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
            src9 = *s++;\
            OP(dst[6], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
            src10 = *s++;\
            OP(dst[7], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
            if (w>8) { \
                int src11,src12,src13,src14,src15,src16,src17,src18; \
                src11 = *s++;\
                OP(dst[8] , (src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));\
                src12 = *s++;\
                OP(dst[9] , (src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));\
                src13 = *s++;\
                OP(dst[10], (src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));\
                src14 = *s++;\
                OP(dst[11], (src11+src12)*20 - (src10+src13)*5 + (src9 +src14));\
                src15 = *s++;\
                OP(dst[12], (src12+src13)*20 - (src11+src14)*5 + (src10+src15));\
                src16 = *s++;\
                OP(dst[13], (src13+src14)*20 - (src12+src15)*5 + (src11+src16));\
                src17 = *s++;\
                OP(dst[14], (src14+src15)*20 - (src13+src16)*5 + (src12+src17));\
                src18 = *s++;\
                OP(dst[15], (src15+src16)*20 - (src14+src17)*5 + (src13+src18));\
            } \
        } \
        dst+=dstStride;\
        src+=srcStride;\
    }while(--h);\
}\
\
static inline void OPNAME ## h264_qpel_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,int w,int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clip-to-0..255 lookup table */ \
    do{\
        int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
        uint8_t *s = src-2*srcStride,*d=dst; /* taps reach 2 rows above; one column per outer pass */ \
        srcB = *s; s+=srcStride;\
        srcA = *s; s+=srcStride;\
        src0 = *s; s+=srcStride;\
        src1 = *s; s+=srcStride;\
        src2 = *s; s+=srcStride;\
        src3 = *s; s+=srcStride;\
        OP(*d, (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));d+=dstStride;\
        src4 = *s; s+=srcStride;\
        OP(*d, (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));d+=dstStride;\
        src5 = *s; s+=srcStride;\
        OP(*d, (src2+src3)*20 - (src1+src4)*5 + (src0+src5));d+=dstStride;\
        src6 = *s; s+=srcStride;\
        OP(*d, (src3+src4)*20 - (src2+src5)*5 + (src1+src6));d+=dstStride;\
        if (h>4) { /* continue the unrolled tap chain for 8/16-tall blocks */ \
            int src7,src8,src9,src10; \
            src7 = *s; s+=srcStride;\
            OP(*d, (src4+src5)*20 - (src3+src6)*5 + (src2+src7));d+=dstStride;\
            src8 = *s; s+=srcStride;\
            OP(*d, (src5+src6)*20 - (src4+src7)*5 + (src3+src8));d+=dstStride;\
            src9 = *s; s+=srcStride;\
            OP(*d, (src6+src7)*20 - (src5+src8)*5 + (src4+src9));d+=dstStride;\
            src10 = *s; s+=srcStride;\
            OP(*d, (src7+src8)*20 - (src6+src9)*5 + (src5+src10));d+=dstStride;\
            if (h>8) { \
                int src11,src12,src13,src14,src15,src16,src17,src18; \
                src11 = *s; s+=srcStride;\
                OP(*d , (src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));d+=dstStride;\
                src12 = *s; s+=srcStride;\
                OP(*d , (src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));d+=dstStride;\
                src13 = *s; s+=srcStride;\
                OP(*d, (src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));d+=dstStride;\
                src14 = *s; s+=srcStride;\
                OP(*d, (src11+src12)*20 - (src10+src13)*5 + (src9 +src14));d+=dstStride;\
                src15 = *s; s+=srcStride;\
                OP(*d, (src12+src13)*20 - (src11+src14)*5 + (src10+src15));d+=dstStride;\
                src16 = *s; s+=srcStride;\
                OP(*d, (src13+src14)*20 - (src12+src15)*5 + (src11+src16));d+=dstStride;\
                src17 = *s; s+=srcStride;\
                OP(*d, (src14+src15)*20 - (src13+src16)*5 + (src12+src17));d+=dstStride;\
                src18 = *s; s+=srcStride;\
                OP(*d, (src15+src16)*20 - (src14+src17)*5 + (src13+src18));d+=dstStride;\
            } \
        } \
        dst++;\
        src++;\
    }while(--w);\
}\
\
static inline void OPNAME ## h264_qpel_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride,int w,int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clip-to-0..255 lookup table */ \
    int i;\
    src -= 2*srcStride; /* first pass also covers the 2+3 margin rows needed by the vertical pass */ \
    i= h+5; \
    do {\
        int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
        uint8_t *s = src-2;\
        srcB = *s++;\
        srcA = *s++;\
        src0 = *s++;\
        src1 = *s++;\
        src2 = *s++;\
        src3 = *s++;\
        tmp[0] = ((src0+src1)*20 - (srcA+src2)*5 + (srcB+src3)); /* intermediate kept at full 16-bit precision */ \
        src4 = *s++;\
        tmp[1] = ((src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        src5 = *s++;\
        tmp[2] = ((src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        src6 = *s++;\
        tmp[3] = ((src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        if (w>4) { /* continue the unrolled tap chain for 8/16-wide blocks */ \
            int src7,src8,src9,src10; \
            src7 = *s++;\
            tmp[4] = ((src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
            src8 = *s++;\
            tmp[5] = ((src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
            src9 = *s++;\
            tmp[6] = ((src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
            src10 = *s++;\
            tmp[7] = ((src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
            if (w>8) { \
                int src11,src12,src13,src14,src15,src16,src17,src18; \
                src11 = *s++;\
                tmp[8] = ((src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));\
                src12 = *s++;\
                tmp[9] = ((src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));\
                src13 = *s++;\
                tmp[10] = ((src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));\
                src14 = *s++;\
                tmp[11] = ((src11+src12)*20 - (src10+src13)*5 + (src9 +src14));\
                src15 = *s++;\
                tmp[12] = ((src12+src13)*20 - (src11+src14)*5 + (src10+src15));\
                src16 = *s++;\
                tmp[13] = ((src13+src14)*20 - (src12+src15)*5 + (src11+src16));\
                src17 = *s++;\
                tmp[14] = ((src14+src15)*20 - (src13+src16)*5 + (src12+src17));\
                src18 = *s++;\
                tmp[15] = ((src15+src16)*20 - (src14+src17)*5 + (src13+src18));\
            } \
        } \
        tmp+=tmpStride;\
        src+=srcStride;\
    }while(--i);\
    tmp -= tmpStride*(h+5-2); /* rewind so tmp-2*tmpStride below is the first intermediate row */ \
    i = w; \
    do {\
        int tmpB,tmpA,tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6;\
        int16_t *s = tmp-2*tmpStride; \
        uint8_t *d=dst;\
        tmpB = *s; s+=tmpStride;\
        tmpA = *s; s+=tmpStride;\
        tmp0 = *s; s+=tmpStride;\
        tmp1 = *s; s+=tmpStride;\
        tmp2 = *s; s+=tmpStride;\
        tmp3 = *s; s+=tmpStride;\
        OP2(*d, (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));d+=dstStride;\
        tmp4 = *s; s+=tmpStride;\
        OP2(*d, (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));d+=dstStride;\
        tmp5 = *s; s+=tmpStride;\
        OP2(*d, (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));d+=dstStride;\
        tmp6 = *s; s+=tmpStride;\
        OP2(*d, (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));d+=dstStride;\
        if (h>4) { /* continue the unrolled tap chain for 8/16-tall blocks */ \
            int tmp7,tmp8,tmp9,tmp10; \
            tmp7 = *s; s+=tmpStride;\
            OP2(*d, (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));d+=dstStride;\
            tmp8 = *s; s+=tmpStride;\
            OP2(*d, (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));d+=dstStride;\
            tmp9 = *s; s+=tmpStride;\
            OP2(*d, (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));d+=dstStride;\
            tmp10 = *s; s+=tmpStride;\
            OP2(*d, (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));d+=dstStride;\
            if (h>8) { \
                int tmp11,tmp12,tmp13,tmp14,tmp15,tmp16,tmp17,tmp18; \
                tmp11 = *s; s+=tmpStride;\
                OP2(*d , (tmp8 +tmp9 )*20 - (tmp7 +tmp10)*5 + (tmp6 +tmp11));d+=dstStride;\
                tmp12 = *s; s+=tmpStride;\
                OP2(*d , (tmp9 +tmp10)*20 - (tmp8 +tmp11)*5 + (tmp7 +tmp12));d+=dstStride;\
                tmp13 = *s; s+=tmpStride;\
                OP2(*d, (tmp10+tmp11)*20 - (tmp9 +tmp12)*5 + (tmp8 +tmp13));d+=dstStride;\
                tmp14 = *s; s+=tmpStride;\
                OP2(*d, (tmp11+tmp12)*20 - (tmp10+tmp13)*5 + (tmp9 +tmp14));d+=dstStride;\
                tmp15 = *s; s+=tmpStride;\
                OP2(*d, (tmp12+tmp13)*20 - (tmp11+tmp14)*5 + (tmp10+tmp15));d+=dstStride;\
                tmp16 = *s; s+=tmpStride;\
                OP2(*d, (tmp13+tmp14)*20 - (tmp12+tmp15)*5 + (tmp11+tmp16));d+=dstStride;\
                tmp17 = *s; s+=tmpStride;\
                OP2(*d, (tmp14+tmp15)*20 - (tmp13+tmp16)*5 + (tmp12+tmp17));d+=dstStride;\
                tmp18 = *s; s+=tmpStride;\
                OP2(*d, (tmp15+tmp16)*20 - (tmp14+tmp17)*5 + (tmp13+tmp18));d+=dstStride;\
            } \
        } \
        dst++;\
        tmp++;\
    }while(--i);\
}\
\
/* 4/8/16-wide wrappers over the generic kernels above */ \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,4,4); \
}\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,8,8); \
}\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,16,16); \
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,4,4); \
}\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,8,8); \
}\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,16,16); \
}\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,4,4); \
}\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,8,8); \
}\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,16,16); \
}\
/*
 * H264_MC expands to the 16 quarter-pel motion-compensation entry points
 * mcXY (X,Y in 0..3 = quarter-pel offset) for one block SIZE.
 * Fractional positions are built from the 6-tap half-pel kernels above;
 * the pixels##SIZE##_l2_* helpers average two planes to form the
 * quarter-pel result.  full[] holds a copy of the source with the 2+3
 * row margin the vertical filter needs; full_mid points at the block's
 * first real row inside it.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE); /* full-pel: plain copy/avg */ \
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned2(dst, src, half, stride, stride, SIZE, SIZE); /* avg(src, H half-pel) */ \
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_sh4(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride); /* pure H half-pel */ \
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned2(dst, src+1, half, stride, stride, SIZE, SIZE); /* avg(src+1, H half-pel) */ \
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, full_mid, half, stride, SIZE, SIZE, SIZE); /* avg(src, V half-pel) */ \
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE); /* pure V half-pel */ \
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE); /* avg(src one row down, V half-pel) */ \
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE); /* avg(H half-pel, V half-pel) */ \
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5); /* V plane taken one pixel right */ \
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride); /* H plane taken one row down */ \
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride); /* one row down */ \
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5); /* one pixel right */ \
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_sh4(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)]; /* 16-bit intermediate between the two filter passes */ \
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride); /* pure HV centre */ \
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_sh4(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE); /* avg(H, HV) */ \
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_sh4(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride); /* H plane one row down */ \
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE); /* avg(V, HV) */ \
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5); /* V plane one pixel right */ \
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
/*
 * Store operators for the H.264 kernels.  OP handles single-pass output
 * (filter sum scaled by 32, "+16" rounds to nearest); OP2 handles the
 * two-pass HV output (sum scaled by 1024, "+512" rounds).  cm[] clips
 * to 0..255.
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]
H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
/*
 * WMV2/MSMPEG4 horizontal half-pel filter: 4-tap (-1, 9, 9, -1) with
 * round-to-nearest ((... + 8) >> 4), clipped through cm[].
 * Reads s[-1]..s[9] per row, writes 8 pixels per row for h rows.
 * The loads are interleaved with the stores on purpose (SH4 scheduling,
 * see file header) — keep the statement order as-is.
 */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clip-to-0..255 lookup table */
    do{
        int src_1,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9;
        uint8_t *s = src;
        src_1 = s[-1];
        src0 = *s++;
        src1 = *s++;
        src2 = *s++;
        dst[0]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        src3 = *s++;
        dst[1]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
        src4 = *s++;
        dst[2]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
        src5 = *s++;
        dst[3]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
        src6 = *s++;
        dst[4]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
        src7 = *s++;
        dst[5]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
        src8 = *s++;
        dst[6]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
        src9 = *s++;
        dst[7]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }while(--h);
}
/*
 * WMV2/MSMPEG4 vertical half-pel filter: same 4-tap (-1, 9, 9, -1)/16
 * kernel as the horizontal variant, applied down each column.
 * Reads rows -1..9 of a column, writes 8 output rows, for w columns.
 * Statement order (load interleaved with store) is deliberate — see
 * the file header on SH4 post-increment addressing.
 */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clip-to-0..255 lookup table */
    do{
        int src_1,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9;
        uint8_t *s = src,*d = dst;
        src_1 = *(s-srcStride); /* one row above the block */
        src0 = *s; s+=srcStride;
        src1 = *s; s+=srcStride;
        src2 = *s; s+=srcStride;
        *d= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4]; d+=dstStride;
        src3 = *s; s+=srcStride;
        *d= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4]; d+=dstStride;
        src4 = *s; s+=srcStride;
        *d= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4]; d+=dstStride;
        src5 = *s; s+=srcStride;
        *d= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4]; d+=dstStride;
        src6 = *s; s+=srcStride;
        *d= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4]; d+=dstStride;
        src7 = *s; s+=srcStride;
        *d= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4]; d+=dstStride;
        src8 = *s; s+=srcStride;
        *d= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4]; d+=dstStride;
        src9 = *s; /* last needed row; no further advance */
        *d= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4]; d+=dstStride;
        src++;
        dst++;
    }while(--w);
}
/* mspel (0,0): full-pel position, plain 8x8 copy. */
static void put_mspel8_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
  1335. static void put_mspel8_mc10_sh4(uint8_t *dst, uint8_t *src, int stride){
  1336. uint8_t half[64];
  1337. wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
  1338. put_pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8);
  1339. }
/* mspel (2,0): horizontal half-pel filter written straight to dst. */
static void put_mspel8_mc20_sh4(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
  1343. static void put_mspel8_mc30_sh4(uint8_t *dst, uint8_t *src, int stride){
  1344. uint8_t half[64];
  1345. wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
  1346. put_pixels8_l2_aligned2(dst, src+1, half, stride, stride, 8, 8);
  1347. }
/* mspel (0,2): vertical half-pel filter written straight to dst. */
static void put_mspel8_mc02_sh4(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
  1351. static void put_mspel8_mc12_sh4(uint8_t *dst, uint8_t *src, int stride){
  1352. uint8_t halfH[88];
  1353. uint8_t halfV[64];
  1354. uint8_t halfHV[64];
  1355. wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
  1356. wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
  1357. wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
  1358. put_pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);
  1359. }
  1360. static void put_mspel8_mc32_sh4(uint8_t *dst, uint8_t *src, int stride){
  1361. uint8_t halfH[88];
  1362. uint8_t halfV[64];
  1363. uint8_t halfHV[64];
  1364. wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
  1365. wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
  1366. wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
  1367. put_pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);
  1368. }
  1369. static void put_mspel8_mc22_sh4(uint8_t *dst, uint8_t *src, int stride){
  1370. uint8_t halfH[88];
  1371. wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
  1372. wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
  1373. }