You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

862 lines
38KB

  1. /*
 * This is optimized for SH, which has post-increment addressing (*p++).
 * On some CPUs, indexed addressing (p[n]) may be faster than post-increment (*p++).
  4. *
  5. * copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
  6. *
  7. * This file is part of Libav.
  8. *
  9. * Libav is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation; either
  12. * version 2.1 of the License, or (at your option) any later version.
  13. *
  14. * Libav is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with Libav; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. */
  23. #include "libavutil/common.h"
  24. #include "libavcodec/copy_block.h"
  25. #define PIXOP2(OPNAME, OP) \
  26. \
  27. static inline void OPNAME ## _pixels4_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  28. {\
  29. do {\
  30. OP(LP(dst ),rnd_avg32(LPC(src1 ),LPC(src2 )) ); \
  31. src1+=src_stride1; \
  32. src2+=src_stride2; \
  33. dst+=dst_stride; \
  34. } while(--h); \
  35. }\
  36. \
  37. static inline void OPNAME ## _pixels4_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  38. {\
  39. do {\
  40. OP(LP(dst ),rnd_avg32(AV_RN32(src1 ),LPC(src2 )) ); \
  41. src1+=src_stride1; \
  42. src2+=src_stride2; \
  43. dst+=dst_stride; \
  44. } while(--h); \
  45. }\
  46. \
  47. static inline void OPNAME ## _no_rnd_pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  48. {\
  49. do {\
  50. OP(LP(dst ),no_rnd_avg32(AV_RN32(src1 ),LPC(src2 )) ); \
  51. OP(LP(dst+4),no_rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
  52. OP(LP(dst+8),no_rnd_avg32(AV_RN32(src1+8),LPC(src2+8)) ); \
  53. OP(LP(dst+12),no_rnd_avg32(AV_RN32(src1+12),LPC(src2+12)) ); \
  54. src1+=src_stride1; \
  55. src2+=src_stride2; \
  56. dst+=dst_stride; \
  57. } while(--h); \
  58. }\
  59. \
  60. static inline void OPNAME ## _pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  61. {\
  62. do {\
  63. OP(LP(dst ),rnd_avg32(AV_RN32(src1 ),LPC(src2 )) ); \
  64. OP(LP(dst+4),rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
  65. OP(LP(dst+8),rnd_avg32(AV_RN32(src1+8),LPC(src2+8)) ); \
  66. OP(LP(dst+12),rnd_avg32(AV_RN32(src1+12),LPC(src2+12)) ); \
  67. src1+=src_stride1; \
  68. src2+=src_stride2; \
  69. dst+=dst_stride; \
  70. } while(--h); \
  71. }\
  72. \
  73. static inline void OPNAME ## _no_rnd_pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  74. {\
  75. do { /* onlye src2 aligned */\
  76. OP(LP(dst ),no_rnd_avg32(AV_RN32(src1 ),LPC(src2 )) ); \
  77. OP(LP(dst+4),no_rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
  78. src1+=src_stride1; \
  79. src2+=src_stride2; \
  80. dst+=dst_stride; \
  81. } while(--h); \
  82. }\
  83. \
  84. static inline void OPNAME ## _pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  85. {\
  86. do {\
  87. OP(LP(dst ),rnd_avg32(AV_RN32(src1 ),LPC(src2 )) ); \
  88. OP(LP(dst+4),rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
  89. src1+=src_stride1; \
  90. src2+=src_stride2; \
  91. dst+=dst_stride; \
  92. } while(--h); \
  93. }\
  94. \
  95. static inline void OPNAME ## _no_rnd_pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  96. {\
  97. do {\
  98. OP(LP(dst ),no_rnd_avg32(LPC(src1 ),LPC(src2 )) ); \
  99. OP(LP(dst+4),no_rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
  100. src1+=src_stride1; \
  101. src2+=src_stride2; \
  102. dst+=dst_stride; \
  103. } while(--h); \
  104. }\
  105. \
  106. static inline void OPNAME ## _pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  107. {\
  108. do {\
  109. OP(LP(dst ),rnd_avg32(LPC(src1 ),LPC(src2 )) ); \
  110. OP(LP(dst+4),rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
  111. src1+=src_stride1; \
  112. src2+=src_stride2; \
  113. dst+=dst_stride; \
  114. } while(--h); \
  115. }\
  116. \
  117. static inline void OPNAME ## _no_rnd_pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  118. {\
  119. do {\
  120. OP(LP(dst ),no_rnd_avg32(LPC(src1 ),LPC(src2 )) ); \
  121. OP(LP(dst+4),no_rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
  122. OP(LP(dst+8),no_rnd_avg32(LPC(src1+8),LPC(src2+8)) ); \
  123. OP(LP(dst+12),no_rnd_avg32(LPC(src1+12),LPC(src2+12)) ); \
  124. src1+=src_stride1; \
  125. src2+=src_stride2; \
  126. dst+=dst_stride; \
  127. } while(--h); \
  128. }\
  129. \
  130. static inline void OPNAME ## _pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  131. {\
  132. do {\
  133. OP(LP(dst ),rnd_avg32(LPC(src1 ),LPC(src2 )) ); \
  134. OP(LP(dst+4),rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
  135. OP(LP(dst+8),rnd_avg32(LPC(src1+8),LPC(src2+8)) ); \
  136. OP(LP(dst+12),rnd_avg32(LPC(src1+12),LPC(src2+12)) ); \
  137. src1+=src_stride1; \
  138. src2+=src_stride2; \
  139. dst+=dst_stride; \
  140. } while(--h); \
  141. }\
  142. \
  143. static inline void OPNAME ## _no_rnd_pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  144. { OPNAME ## _no_rnd_pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
  145. \
  146. static inline void OPNAME ## _pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  147. { OPNAME ## _pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
  148. \
  149. static inline void OPNAME ## _no_rnd_pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  150. { OPNAME ## _no_rnd_pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
  151. \
  152. static inline void OPNAME ## _pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  153. { OPNAME ## _pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
  154. \
  155. static inline void OPNAME ## _pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  156. do { \
  157. uint32_t a0,a1,a2,a3; \
  158. UNPACK(a0,a1,LPC(src1),LPC(src2)); \
  159. UNPACK(a2,a3,LPC(src3),LPC(src4)); \
  160. OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
  161. UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
  162. UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
  163. OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
  164. src1+=src_stride1;\
  165. src2+=src_stride2;\
  166. src3+=src_stride3;\
  167. src4+=src_stride4;\
  168. dst+=dst_stride;\
  169. } while(--h); \
  170. } \
  171. \
  172. static inline void OPNAME ## _no_rnd_pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  173. do { \
  174. uint32_t a0,a1,a2,a3; \
  175. UNPACK(a0,a1,LPC(src1),LPC(src2)); \
  176. UNPACK(a2,a3,LPC(src3),LPC(src4)); \
  177. OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
  178. UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
  179. UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
  180. OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
  181. src1+=src_stride1;\
  182. src2+=src_stride2;\
  183. src3+=src_stride3;\
  184. src4+=src_stride4;\
  185. dst+=dst_stride;\
  186. } while(--h); \
  187. } \
  188. \
  189. static inline void OPNAME ## _pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  190. do { \
  191. uint32_t a0,a1,a2,a3; /* src1 only not aligned */\
  192. UNPACK(a0,a1,AV_RN32(src1),LPC(src2)); \
  193. UNPACK(a2,a3,LPC(src3),LPC(src4)); \
  194. OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
  195. UNPACK(a0,a1,AV_RN32(src1+4),LPC(src2+4)); \
  196. UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
  197. OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
  198. src1+=src_stride1;\
  199. src2+=src_stride2;\
  200. src3+=src_stride3;\
  201. src4+=src_stride4;\
  202. dst+=dst_stride;\
  203. } while(--h); \
  204. } \
  205. \
  206. static inline void OPNAME ## _no_rnd_pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  207. do { \
  208. uint32_t a0,a1,a2,a3; \
  209. UNPACK(a0,a1,AV_RN32(src1),LPC(src2)); \
  210. UNPACK(a2,a3,LPC(src3),LPC(src4)); \
  211. OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
  212. UNPACK(a0,a1,AV_RN32(src1+4),LPC(src2+4)); \
  213. UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
  214. OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
  215. src1+=src_stride1;\
  216. src2+=src_stride2;\
  217. src3+=src_stride3;\
  218. src4+=src_stride4;\
  219. dst+=dst_stride;\
  220. } while(--h); \
  221. } \
  222. \
  223. static inline void OPNAME ## _pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  224. do { \
  225. uint32_t a0,a1,a2,a3; \
  226. UNPACK(a0,a1,LPC(src1),LPC(src2)); \
  227. UNPACK(a2,a3,LPC(src3),LPC(src4)); \
  228. OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
  229. UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
  230. UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
  231. OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
  232. UNPACK(a0,a1,LPC(src1+8),LPC(src2+8)); \
  233. UNPACK(a2,a3,LPC(src3+8),LPC(src4+8)); \
  234. OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
  235. UNPACK(a0,a1,LPC(src1+12),LPC(src2+12)); \
  236. UNPACK(a2,a3,LPC(src3+12),LPC(src4+12)); \
  237. OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \
  238. src1+=src_stride1;\
  239. src2+=src_stride2;\
  240. src3+=src_stride3;\
  241. src4+=src_stride4;\
  242. dst+=dst_stride;\
  243. } while(--h); \
  244. } \
  245. \
  246. static inline void OPNAME ## _no_rnd_pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  247. do { \
  248. uint32_t a0,a1,a2,a3; \
  249. UNPACK(a0,a1,LPC(src1),LPC(src2)); \
  250. UNPACK(a2,a3,LPC(src3),LPC(src4)); \
  251. OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
  252. UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
  253. UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
  254. OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
  255. UNPACK(a0,a1,LPC(src1+8),LPC(src2+8)); \
  256. UNPACK(a2,a3,LPC(src3+8),LPC(src4+8)); \
  257. OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \
  258. UNPACK(a0,a1,LPC(src1+12),LPC(src2+12)); \
  259. UNPACK(a2,a3,LPC(src3+12),LPC(src4+12)); \
  260. OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \
  261. src1+=src_stride1;\
  262. src2+=src_stride2;\
  263. src3+=src_stride3;\
  264. src4+=src_stride4;\
  265. dst+=dst_stride;\
  266. } while(--h); \
  267. } \
  268. \
  269. static inline void OPNAME ## _pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  270. do { /* src1 is unaligned */\
  271. uint32_t a0,a1,a2,a3; \
  272. UNPACK(a0,a1,AV_RN32(src1),LPC(src2)); \
  273. UNPACK(a2,a3,LPC(src3),LPC(src4)); \
  274. OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
  275. UNPACK(a0,a1,AV_RN32(src1+4),LPC(src2+4)); \
  276. UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
  277. OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
  278. UNPACK(a0,a1,AV_RN32(src1+8),LPC(src2+8)); \
  279. UNPACK(a2,a3,LPC(src3+8),LPC(src4+8)); \
  280. OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
  281. UNPACK(a0,a1,AV_RN32(src1+12),LPC(src2+12)); \
  282. UNPACK(a2,a3,LPC(src3+12),LPC(src4+12)); \
  283. OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \
  284. src1+=src_stride1;\
  285. src2+=src_stride2;\
  286. src3+=src_stride3;\
  287. src4+=src_stride4;\
  288. dst+=dst_stride;\
  289. } while(--h); \
  290. } \
  291. \
  292. static inline void OPNAME ## _no_rnd_pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  293. do { \
  294. uint32_t a0,a1,a2,a3; \
  295. UNPACK(a0,a1,AV_RN32(src1),LPC(src2)); \
  296. UNPACK(a2,a3,LPC(src3),LPC(src4)); \
  297. OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
  298. UNPACK(a0,a1,AV_RN32(src1+4),LPC(src2+4)); \
  299. UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
  300. OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
  301. UNPACK(a0,a1,AV_RN32(src1+8),LPC(src2+8)); \
  302. UNPACK(a2,a3,LPC(src3+8),LPC(src4+8)); \
  303. OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \
  304. UNPACK(a0,a1,AV_RN32(src1+12),LPC(src2+12)); \
  305. UNPACK(a2,a3,LPC(src3+12),LPC(src4+12)); \
  306. OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \
  307. src1+=src_stride1;\
  308. src2+=src_stride2;\
  309. src3+=src_stride3;\
  310. src4+=src_stride4;\
  311. dst+=dst_stride;\
  312. } while(--h); \
  313. } \
  314. \
/* Store operations plugged into PIXOP2: op_avg averages the new value
 * with what is already at the destination (via rnd_avg32), op_put
 * simply stores it. */
#define op_avg(a, b) a = rnd_avg32(a,b)
#define op_put(a, b) a = b
/* Instantiate the avg_* and put_* helper families. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
/* Scalar rounding averages of 2 and 4 bytes; presumably used by code
 * further down the file (no user visible in this chunk). */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
  323. static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
  324. {
  325. const int A=(16-x16)*(16-y16);
  326. const int B=( x16)*(16-y16);
  327. const int C=(16-x16)*( y16);
  328. const int D=( x16)*( y16);
  329. do {
  330. int t0,t1,t2,t3;
  331. uint8_t *s0 = src;
  332. uint8_t *s1 = src+stride;
  333. t0 = *s0++; t2 = *s1++;
  334. t1 = *s0++; t3 = *s1++;
  335. dst[0]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
  336. t0 = *s0++; t2 = *s1++;
  337. dst[1]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
  338. t1 = *s0++; t3 = *s1++;
  339. dst[2]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
  340. t0 = *s0++; t2 = *s1++;
  341. dst[3]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
  342. t1 = *s0++; t3 = *s1++;
  343. dst[4]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
  344. t0 = *s0++; t2 = *s1++;
  345. dst[5]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
  346. t1 = *s0++; t3 = *s1++;
  347. dst[6]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
  348. t0 = *s0++; t2 = *s1++;
  349. dst[7]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
  350. dst+= stride;
  351. src+= stride;
  352. }while(--h);
  353. }
  354. #define QPEL_MC(r, OPNAME, RND, OP) \
  355. static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  356. uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
  357. do {\
  358. uint8_t *s = src; \
  359. int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
  360. src0= *s++;\
  361. src1= *s++;\
  362. src2= *s++;\
  363. src3= *s++;\
  364. src4= *s++;\
  365. OP(dst[0], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
  366. src5= *s++;\
  367. OP(dst[1], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
  368. src6= *s++;\
  369. OP(dst[2], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
  370. src7= *s++;\
  371. OP(dst[3], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
  372. src8= *s++;\
  373. OP(dst[4], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
  374. OP(dst[5], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
  375. OP(dst[6], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
  376. OP(dst[7], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
  377. dst+=dstStride;\
  378. src+=srcStride;\
  379. }while(--h);\
  380. }\
  381. \
  382. static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  383. uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
  384. int w=8;\
  385. do{\
  386. uint8_t *s = src, *d=dst;\
  387. int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
  388. src0 = *s; s+=srcStride; \
  389. src1 = *s; s+=srcStride; \
  390. src2 = *s; s+=srcStride; \
  391. src3 = *s; s+=srcStride; \
  392. src4 = *s; s+=srcStride; \
  393. OP(*d, (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));d+=dstStride;\
  394. src5 = *s; s+=srcStride; \
  395. OP(*d, (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));d+=dstStride;\
  396. src6 = *s; s+=srcStride; \
  397. OP(*d, (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));d+=dstStride;\
  398. src7 = *s; s+=srcStride; \
  399. OP(*d, (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));d+=dstStride;\
  400. src8 = *s; \
  401. OP(*d, (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));d+=dstStride;\
  402. OP(*d, (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));d+=dstStride;\
  403. OP(*d, (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));d+=dstStride;\
  404. OP(*d, (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
  405. dst++;\
  406. src++;\
  407. }while(--w);\
  408. }\
  409. \
  410. static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  411. uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
  412. do {\
  413. uint8_t *s = src;\
  414. int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
  415. int src9,src10,src11,src12,src13,src14,src15,src16;\
  416. src0= *s++;\
  417. src1= *s++;\
  418. src2= *s++;\
  419. src3= *s++;\
  420. src4= *s++;\
  421. OP(dst[ 0], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
  422. src5= *s++;\
  423. OP(dst[ 1], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
  424. src6= *s++;\
  425. OP(dst[ 2], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
  426. src7= *s++;\
  427. OP(dst[ 3], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
  428. src8= *s++;\
  429. OP(dst[ 4], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
  430. src9= *s++;\
  431. OP(dst[ 5], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
  432. src10= *s++;\
  433. OP(dst[ 6], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
  434. src11= *s++;\
  435. OP(dst[ 7], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
  436. src12= *s++;\
  437. OP(dst[ 8], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
  438. src13= *s++;\
  439. OP(dst[ 9], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
  440. src14= *s++;\
  441. OP(dst[10], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
  442. src15= *s++;\
  443. OP(dst[11], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
  444. src16= *s++;\
  445. OP(dst[12], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
  446. OP(dst[13], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
  447. OP(dst[14], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
  448. OP(dst[15], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
  449. dst+=dstStride;\
  450. src+=srcStride;\
  451. }while(--h);\
  452. }\
  453. \
  454. static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  455. uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
  456. int w=16;\
  457. do {\
  458. uint8_t *s = src, *d=dst;\
  459. int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
  460. int src9,src10,src11,src12,src13,src14,src15,src16;\
  461. src0 = *s; s+=srcStride; \
  462. src1 = *s; s+=srcStride; \
  463. src2 = *s; s+=srcStride; \
  464. src3 = *s; s+=srcStride; \
  465. src4 = *s; s+=srcStride; \
  466. OP(*d, (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));d+=dstStride;\
  467. src5 = *s; s+=srcStride; \
  468. OP(*d, (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));d+=dstStride;\
  469. src6 = *s; s+=srcStride; \
  470. OP(*d, (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));d+=dstStride;\
  471. src7 = *s; s+=srcStride; \
  472. OP(*d, (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));d+=dstStride;\
  473. src8 = *s; s+=srcStride; \
  474. OP(*d, (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));d+=dstStride;\
  475. src9 = *s; s+=srcStride; \
  476. OP(*d, (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));d+=dstStride;\
  477. src10 = *s; s+=srcStride; \
  478. OP(*d, (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));d+=dstStride;\
  479. src11 = *s; s+=srcStride; \
  480. OP(*d, (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));d+=dstStride;\
  481. src12 = *s; s+=srcStride; \
  482. OP(*d, (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));d+=dstStride;\
  483. src13 = *s; s+=srcStride; \
  484. OP(*d, (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));d+=dstStride;\
  485. src14 = *s; s+=srcStride; \
  486. OP(*d, (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));d+=dstStride;\
  487. src15 = *s; s+=srcStride; \
  488. OP(*d, (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));d+=dstStride;\
  489. src16 = *s; \
  490. OP(*d, (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));d+=dstStride;\
  491. OP(*d, (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));d+=dstStride;\
  492. OP(*d, (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));d+=dstStride;\
  493. OP(*d, (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
  494. dst++;\
  495. src++;\
  496. }while(--w);\
  497. }\
  498. \
  499. static void OPNAME ## qpel8_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){\
  500. OPNAME ## pixels8_c(dst, src, stride, 8);\
  501. }\
  502. \
  503. static void OPNAME ## qpel8_mc10_sh4(uint8_t *dst, uint8_t *src, int stride){\
  504. uint8_t half[64];\
  505. put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
  506. OPNAME ## pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8);\
  507. }\
  508. \
  509. static void OPNAME ## qpel8_mc20_sh4(uint8_t *dst, uint8_t *src, int stride){\
  510. OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
  511. }\
  512. \
  513. static void OPNAME ## qpel8_mc30_sh4(uint8_t *dst, uint8_t *src, int stride){\
  514. uint8_t half[64];\
  515. put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
  516. OPNAME ## pixels8_l2_aligned2(dst, src+1, half, stride, stride, 8, 8);\
  517. }\
  518. \
  519. static void OPNAME ## qpel8_mc01_sh4(uint8_t *dst, uint8_t *src, int stride){\
  520. uint8_t full[16*9];\
  521. uint8_t half[64];\
  522. copy_block9(full, src, 16, stride, 9);\
  523. put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
  524. OPNAME ## pixels8_l2_aligned(dst, full, half, stride, 16, 8, 8);\
  525. }\
  526. \
  527. static void OPNAME ## qpel8_mc02_sh4(uint8_t *dst, uint8_t *src, int stride){\
  528. uint8_t full[16*9];\
  529. copy_block9(full, src, 16, stride, 9);\
  530. OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
  531. }\
  532. \
  533. static void OPNAME ## qpel8_mc03_sh4(uint8_t *dst, uint8_t *src, int stride){\
  534. uint8_t full[16*9];\
  535. uint8_t half[64];\
  536. copy_block9(full, src, 16, stride, 9);\
  537. put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
  538. OPNAME ## pixels8_l2_aligned(dst, full+16, half, stride, 16, 8, 8);\
  539. }\
  540. static void OPNAME ## qpel8_mc11_sh4(uint8_t *dst, uint8_t *src, int stride){\
  541. uint8_t full[16*9];\
  542. uint8_t halfH[72];\
  543. uint8_t halfHV[64];\
  544. copy_block9(full, src, 16, stride, 9);\
  545. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  546. put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
  547. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
  548. OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
  549. }\
  550. static void OPNAME ## qpel8_mc31_sh4(uint8_t *dst, uint8_t *src, int stride){\
  551. uint8_t full[16*9];\
  552. uint8_t halfH[72];\
  553. uint8_t halfHV[64];\
  554. copy_block9(full, src, 16, stride, 9);\
  555. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  556. put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
  557. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
  558. OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
  559. }\
  560. static void OPNAME ## qpel8_mc13_sh4(uint8_t *dst, uint8_t *src, int stride){\
  561. uint8_t full[16*9];\
  562. uint8_t halfH[72];\
  563. uint8_t halfHV[64];\
  564. copy_block9(full, src, 16, stride, 9);\
  565. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  566. put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
  567. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
  568. OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
  569. }\
  570. static void OPNAME ## qpel8_mc33_sh4(uint8_t *dst, uint8_t *src, int stride){\
  571. uint8_t full[16*9];\
  572. uint8_t halfH[72];\
  573. uint8_t halfHV[64];\
  574. copy_block9(full, src, 16, stride, 9);\
  575. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  576. put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
  577. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
  578. OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
  579. }\
  580. static void OPNAME ## qpel8_mc21_sh4(uint8_t *dst, uint8_t *src, int stride){\
  581. uint8_t halfH[72];\
  582. uint8_t halfHV[64];\
  583. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
  584. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
  585. OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
  586. }\
  587. static void OPNAME ## qpel8_mc23_sh4(uint8_t *dst, uint8_t *src, int stride){\
  588. uint8_t halfH[72];\
  589. uint8_t halfHV[64];\
  590. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
  591. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
  592. OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
  593. }\
  594. static void OPNAME ## qpel8_mc12_sh4(uint8_t *dst, uint8_t *src, int stride){\
  595. uint8_t full[16*9];\
  596. uint8_t halfH[72];\
  597. copy_block9(full, src, 16, stride, 9);\
  598. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  599. put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
  600. OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
  601. }\
  602. static void OPNAME ## qpel8_mc32_sh4(uint8_t *dst, uint8_t *src, int stride){\
  603. uint8_t full[16*9];\
  604. uint8_t halfH[72];\
  605. copy_block9(full, src, 16, stride, 9);\
  606. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  607. put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
  608. OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
  609. }\
  610. static void OPNAME ## qpel8_mc22_sh4(uint8_t *dst, uint8_t *src, int stride){\
  611. uint8_t halfH[72];\
  612. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
  613. OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
  614. }\
  615. static void OPNAME ## qpel16_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){\
  616. OPNAME ## pixels16_c(dst, src, stride, 16);\
  617. }\
  618. \
  619. static void OPNAME ## qpel16_mc10_sh4(uint8_t *dst, uint8_t *src, int stride){\
  620. uint8_t half[256];\
  621. put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
  622. OPNAME ## pixels16_l2_aligned2(dst, src, half, stride, stride, 16, 16);\
  623. }\
  624. \
  625. static void OPNAME ## qpel16_mc20_sh4(uint8_t *dst, uint8_t *src, int stride){\
  626. OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
  627. }\
  628. \
  629. static void OPNAME ## qpel16_mc30_sh4(uint8_t *dst, uint8_t *src, int stride){\
  630. uint8_t half[256];\
  631. put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
  632. OPNAME ## pixels16_l2_aligned2(dst, src+1, half, stride, stride, 16, 16);\
  633. }\
  634. \
  635. static void OPNAME ## qpel16_mc01_sh4(uint8_t *dst, uint8_t *src, int stride){\
  636. uint8_t full[24*17];\
  637. uint8_t half[256];\
  638. copy_block17(full, src, 24, stride, 17);\
  639. put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
  640. OPNAME ## pixels16_l2_aligned(dst, full, half, stride, 24, 16, 16);\
  641. }\
  642. \
  643. static void OPNAME ## qpel16_mc02_sh4(uint8_t *dst, uint8_t *src, int stride){\
  644. uint8_t full[24*17];\
  645. copy_block17(full, src, 24, stride, 17);\
  646. OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
  647. }\
  648. \
  649. static void OPNAME ## qpel16_mc03_sh4(uint8_t *dst, uint8_t *src, int stride){\
  650. uint8_t full[24*17];\
  651. uint8_t half[256];\
  652. copy_block17(full, src, 24, stride, 17);\
  653. put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
  654. OPNAME ## pixels16_l2_aligned(dst, full+24, half, stride, 24, 16, 16);\
  655. }\
  656. static void OPNAME ## qpel16_mc11_sh4(uint8_t *dst, uint8_t *src, int stride){\
  657. uint8_t full[24*17];\
  658. uint8_t halfH[272];\
  659. uint8_t halfHV[256];\
  660. copy_block17(full, src, 24, stride, 17);\
  661. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  662. put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
  663. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
  664. OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
  665. }\
  666. static void OPNAME ## qpel16_mc31_sh4(uint8_t *dst, uint8_t *src, int stride){\
  667. uint8_t full[24*17];\
  668. uint8_t halfH[272];\
  669. uint8_t halfHV[256];\
  670. copy_block17(full, src, 24, stride, 17);\
  671. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  672. put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
  673. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
  674. OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
  675. }\
  676. static void OPNAME ## qpel16_mc13_sh4(uint8_t *dst, uint8_t *src, int stride){\
  677. uint8_t full[24*17];\
  678. uint8_t halfH[272];\
  679. uint8_t halfHV[256];\
  680. copy_block17(full, src, 24, stride, 17);\
  681. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  682. put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
  683. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
  684. OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
  685. }\
  686. static void OPNAME ## qpel16_mc33_sh4(uint8_t *dst, uint8_t *src, int stride){\
  687. uint8_t full[24*17];\
  688. uint8_t halfH[272];\
  689. uint8_t halfHV[256];\
  690. copy_block17(full, src, 24, stride, 17);\
  691. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  692. put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
  693. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
  694. OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
  695. }\
  696. static void OPNAME ## qpel16_mc21_sh4(uint8_t *dst, uint8_t *src, int stride){\
  697. uint8_t halfH[272];\
  698. uint8_t halfHV[256];\
  699. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
  700. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
  701. OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
  702. }\
  703. static void OPNAME ## qpel16_mc23_sh4(uint8_t *dst, uint8_t *src, int stride){\
  704. uint8_t halfH[272];\
  705. uint8_t halfHV[256];\
  706. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
  707. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
  708. OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
  709. }\
  710. static void OPNAME ## qpel16_mc12_sh4(uint8_t *dst, uint8_t *src, int stride){\
  711. uint8_t full[24*17];\
  712. uint8_t halfH[272];\
  713. copy_block17(full, src, 24, stride, 17);\
  714. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  715. put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
  716. OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
  717. }\
  718. static void OPNAME ## qpel16_mc32_sh4(uint8_t *dst, uint8_t *src, int stride){\
  719. uint8_t full[24*17];\
  720. uint8_t halfH[272];\
  721. copy_block17(full, src, 24, stride, 17);\
  722. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  723. put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
  724. OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
  725. }\
  726. static void OPNAME ## qpel16_mc22_sh4(uint8_t *dst, uint8_t *src, int stride){\
  727. uint8_t halfH[272];\
  728. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
  729. OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
  730. }
/* Pixel store/average primitives plugged into QPEL_MC.  `b` is the raw
 * 16-tap filter sum; it is rounded (+16 for normal rounding, +15 for
 * no-rounding mode), shifted down by 5 and clipped through the cm[]
 * crop table before being written (put) or averaged into dst (avg). */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the full set of quarter-pel MC functions for each
 * rounding/output combination.  avg_no_rnd is unused and left disabled. */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
  743. static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
  744. uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
  745. do{
  746. int src_1,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9;
  747. uint8_t *s = src;
  748. src_1 = s[-1];
  749. src0 = *s++;
  750. src1 = *s++;
  751. src2 = *s++;
  752. dst[0]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
  753. src3 = *s++;
  754. dst[1]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
  755. src4 = *s++;
  756. dst[2]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
  757. src5 = *s++;
  758. dst[3]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
  759. src6 = *s++;
  760. dst[4]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
  761. src7 = *s++;
  762. dst[5]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
  763. src8 = *s++;
  764. dst[6]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
  765. src9 = *s++;
  766. dst[7]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
  767. dst+=dstStride;
  768. src+=srcStride;
  769. }while(--h);
  770. }
  771. static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
  772. uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
  773. do{
  774. int src_1,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9;
  775. uint8_t *s = src,*d = dst;
  776. src_1 = *(s-srcStride);
  777. src0 = *s; s+=srcStride;
  778. src1 = *s; s+=srcStride;
  779. src2 = *s; s+=srcStride;
  780. *d= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4]; d+=dstStride;
  781. src3 = *s; s+=srcStride;
  782. *d= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4]; d+=dstStride;
  783. src4 = *s; s+=srcStride;
  784. *d= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4]; d+=dstStride;
  785. src5 = *s; s+=srcStride;
  786. *d= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4]; d+=dstStride;
  787. src6 = *s; s+=srcStride;
  788. *d= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4]; d+=dstStride;
  789. src7 = *s; s+=srcStride;
  790. *d= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4]; d+=dstStride;
  791. src8 = *s; s+=srcStride;
  792. *d= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4]; d+=dstStride;
  793. src9 = *s;
  794. *d= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4]; d+=dstStride;
  795. src++;
  796. dst++;
  797. }while(--w);
  798. }
/* mspel (0,0): integer-pel position — plain 8x8 block copy. */
static void put_mspel8_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){
put_pixels8_c(dst, src, stride, 8);
}
  802. static void put_mspel8_mc10_sh4(uint8_t *dst, uint8_t *src, int stride){
  803. uint8_t half[64];
  804. wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
  805. put_pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8);
  806. }
/* mspel (2,0): horizontal half-pel — filter straight into dst. */
static void put_mspel8_mc20_sh4(uint8_t *dst, uint8_t *src, int stride){
wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
  810. static void put_mspel8_mc30_sh4(uint8_t *dst, uint8_t *src, int stride){
  811. uint8_t half[64];
  812. wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
  813. put_pixels8_l2_aligned2(dst, src+1, half, stride, stride, 8, 8);
  814. }
/* mspel (0,2): vertical half-pel — filter straight into dst. */
static void put_mspel8_mc02_sh4(uint8_t *dst, uint8_t *src, int stride){
wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
  818. static void put_mspel8_mc12_sh4(uint8_t *dst, uint8_t *src, int stride){
  819. uint8_t halfH[88];
  820. uint8_t halfV[64];
  821. uint8_t halfHV[64];
  822. wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
  823. wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
  824. wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
  825. put_pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);
  826. }
  827. static void put_mspel8_mc32_sh4(uint8_t *dst, uint8_t *src, int stride){
  828. uint8_t halfH[88];
  829. uint8_t halfV[64];
  830. uint8_t halfHV[64];
  831. wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
  832. wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
  833. wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
  834. put_pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);
  835. }
  836. static void put_mspel8_mc22_sh4(uint8_t *dst, uint8_t *src, int stride){
  837. uint8_t halfH[88];
  838. wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
  839. wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
  840. }