/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 */
#include "avcodec.h"
#include "dsputil.h"
  23. void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
  24. void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
  25. void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
  26. void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
  27. void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
  28. void (*clear_blocks)(DCTELEM *blocks);
  29. int (*pix_sum)(UINT8 * pix, int line_size);
  30. int (*pix_norm1)(UINT8 * pix, int line_size);
  31. op_pixels_abs_func pix_abs16x16;
  32. op_pixels_abs_func pix_abs16x16_x2;
  33. op_pixels_abs_func pix_abs16x16_y2;
  34. op_pixels_abs_func pix_abs16x16_xy2;
  35. op_pixels_abs_func pix_abs8x8;
  36. op_pixels_abs_func pix_abs8x8_x2;
  37. op_pixels_abs_func pix_abs8x8_y2;
  38. op_pixels_abs_func pix_abs8x8_xy2;
  39. int ff_bit_exact=0;
  40. UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
  41. UINT32 squareTbl[512];
  42. const UINT8 ff_zigzag_direct[64] = {
  43. 0, 1, 8, 16, 9, 2, 3, 10,
  44. 17, 24, 32, 25, 18, 11, 4, 5,
  45. 12, 19, 26, 33, 40, 48, 41, 34,
  46. 27, 20, 13, 6, 7, 14, 21, 28,
  47. 35, 42, 49, 56, 57, 50, 43, 36,
  48. 29, 22, 15, 23, 30, 37, 44, 51,
  49. 58, 59, 52, 45, 38, 31, 39, 46,
  50. 53, 60, 61, 54, 47, 55, 62, 63
  51. };
/* non-permuted inverse of zigzag_direct, plus 1, for the MMX quantizer */
  53. UINT16 __align8 inv_zigzag_direct16[64];
  54. const UINT8 ff_alternate_horizontal_scan[64] = {
  55. 0, 1, 2, 3, 8, 9, 16, 17,
  56. 10, 11, 4, 5, 6, 7, 15, 14,
  57. 13, 12, 19, 18, 24, 25, 32, 33,
  58. 26, 27, 20, 21, 22, 23, 28, 29,
  59. 30, 31, 34, 35, 40, 41, 48, 49,
  60. 42, 43, 36, 37, 38, 39, 44, 45,
  61. 46, 47, 50, 51, 56, 57, 58, 59,
  62. 52, 53, 54, 55, 60, 61, 62, 63,
  63. };
  64. const UINT8 ff_alternate_vertical_scan[64] = {
  65. 0, 8, 16, 24, 1, 9, 2, 10,
  66. 17, 25, 32, 40, 48, 56, 57, 49,
  67. 41, 33, 26, 18, 3, 11, 4, 12,
  68. 19, 27, 34, 42, 50, 58, 35, 43,
  69. 51, 59, 20, 28, 5, 13, 6, 14,
  70. 21, 29, 36, 44, 52, 60, 37, 45,
  71. 53, 61, 22, 30, 7, 15, 23, 31,
  72. 38, 46, 54, 62, 39, 47, 55, 63,
  73. };
  74. /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
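/* worked example: inverse[3] == 1431655766, and (10*1431655766ULL)>>32 == 3 == 10/3,
   so a division by a small constant can be replaced by a multiply and a shift */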
  75. UINT32 inverse[256]={
  76. 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
  77. 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
  78. 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
  79. 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
  80. 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
  81. 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
  82. 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
  83. 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
  84. 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
  85. 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
  86. 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
  87. 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
  88. 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
  89. 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
  90. 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
  91. 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
  92. 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
  93. 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
  94. 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
  95. 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
  96. 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
  97. 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
  98. 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
  99. 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
  100. 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
  101. 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
  102. 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
  103. 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
  104. 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
  105. 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
  106. 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
  107. 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
  108. };
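/* sum of all pixel values of a 16x16 block (C reference version) */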
  109. int pix_sum_c(UINT8 * pix, int line_size)
  110. {
  111. int s, i, j;
  112. s = 0;
  113. for (i = 0; i < 16; i++) {
  114. for (j = 0; j < 16; j += 8) {
  115. s += pix[0];
  116. s += pix[1];
  117. s += pix[2];
  118. s += pix[3];
  119. s += pix[4];
  120. s += pix[5];
  121. s += pix[6];
  122. s += pix[7];
  123. pix += 8;
  124. }
  125. pix += line_size - 16;
  126. }
  127. return s;
  128. }
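/* sum of squared pixel values of a 16x16 block, using the squareTbl lookup */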
  129. int pix_norm1_c(UINT8 * pix, int line_size)
  130. {
  131. int s, i, j;
  132. UINT32 *sq = squareTbl + 256;
  133. s = 0;
  134. for (i = 0; i < 16; i++) {
  135. for (j = 0; j < 16; j += 8) {
  136. s += sq[pix[0]];
  137. s += sq[pix[1]];
  138. s += sq[pix[2]];
  139. s += sq[pix[3]];
  140. s += sq[pix[4]];
  141. s += sq[pix[5]];
  142. s += sq[pix[6]];
  143. s += sq[pix[7]];
  144. pix += 8;
  145. }
  146. pix += line_size - 16;
  147. }
  148. return s;
  149. }
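/* copy an 8x8 block of unsigned pixels into a signed DCTELEM block */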
  150. void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
  151. {
  152. int i;
  153. /* read the pixels */
  154. for(i=0;i<8;i++) {
  155. block[0] = pixels[0];
  156. block[1] = pixels[1];
  157. block[2] = pixels[2];
  158. block[3] = pixels[3];
  159. block[4] = pixels[4];
  160. block[5] = pixels[5];
  161. block[6] = pixels[6];
  162. block[7] = pixels[7];
  163. pixels += line_size;
  164. block += 8;
  165. }
  166. }
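/* store the 8x8 pixel difference s1 - s2 into a DCTELEM block */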
  167. void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
  168. int stride){
  169. int i;
  170. /* read the pixels */
  171. for(i=0;i<8;i++) {
  172. block[0] = s1[0] - s2[0];
  173. block[1] = s1[1] - s2[1];
  174. block[2] = s1[2] - s2[2];
  175. block[3] = s1[3] - s2[3];
  176. block[4] = s1[4] - s2[4];
  177. block[5] = s1[5] - s2[5];
  178. block[6] = s1[6] - s2[6];
  179. block[7] = s1[7] - s2[7];
  180. s1 += stride;
  181. s2 += stride;
  182. block += 8;
  183. }
  184. }
  185. void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
  186. int line_size)
  187. {
  188. int i;
  189. UINT8 *cm = cropTbl + MAX_NEG_CROP;
/* clamp the block values and store them as pixels */
  191. for(i=0;i<8;i++) {
  192. pixels[0] = cm[block[0]];
  193. pixels[1] = cm[block[1]];
  194. pixels[2] = cm[block[2]];
  195. pixels[3] = cm[block[3]];
  196. pixels[4] = cm[block[4]];
  197. pixels[5] = cm[block[5]];
  198. pixels[6] = cm[block[6]];
  199. pixels[7] = cm[block[7]];
  200. pixels += line_size;
  201. block += 8;
  202. }
  203. }
  204. void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
  205. int line_size)
  206. {
  207. int i;
  208. UINT8 *cm = cropTbl + MAX_NEG_CROP;
/* add the block to the existing pixels, with clamping */
  210. for(i=0;i<8;i++) {
  211. pixels[0] = cm[pixels[0] + block[0]];
  212. pixels[1] = cm[pixels[1] + block[1]];
  213. pixels[2] = cm[pixels[2] + block[2]];
  214. pixels[3] = cm[pixels[3] + block[3]];
  215. pixels[4] = cm[pixels[4] + block[4]];
  216. pixels[5] = cm[pixels[5] + block[5]];
  217. pixels[6] = cm[pixels[6] + block[6]];
  218. pixels[7] = cm[pixels[7] + block[7]];
  219. pixels += line_size;
  220. block += 8;
  221. }
  222. }
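/* Two variants of the PIXOP2 put/avg primitives follow: the disabled "#if 0"
   branch works on 64-bit words (LD64), while the compiled "#else" branch works
   on 32-bit words (LD32). Both use the same carry-less byte-wise averaging. */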
  223. #if 0
  224. #define PIXOP2(OPNAME, OP) \
  225. static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  226. {\
  227. int i;\
  228. for(i=0; i<h; i++){\
  229. OP(*((uint64_t*)block), LD64(pixels));\
  230. pixels+=line_size;\
  231. block +=line_size;\
  232. }\
  233. }\
  234. \
  235. static void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  236. {\
  237. int i;\
  238. for(i=0; i<h; i++){\
  239. const uint64_t a= LD64(pixels );\
  240. const uint64_t b= LD64(pixels+1);\
  241. OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
  242. pixels+=line_size;\
  243. block +=line_size;\
  244. }\
  245. }\
  246. \
  247. static void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  248. {\
  249. int i;\
  250. for(i=0; i<h; i++){\
  251. const uint64_t a= LD64(pixels );\
  252. const uint64_t b= LD64(pixels+1);\
  253. OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
  254. pixels+=line_size;\
  255. block +=line_size;\
  256. }\
  257. }\
  258. \
  259. static void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  260. {\
  261. int i;\
  262. for(i=0; i<h; i++){\
  263. const uint64_t a= LD64(pixels );\
  264. const uint64_t b= LD64(pixels+line_size);\
  265. OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
  266. pixels+=line_size;\
  267. block +=line_size;\
  268. }\
  269. }\
  270. \
  271. static void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  272. {\
  273. int i;\
  274. for(i=0; i<h; i++){\
  275. const uint64_t a= LD64(pixels );\
  276. const uint64_t b= LD64(pixels+line_size);\
  277. OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
  278. pixels+=line_size;\
  279. block +=line_size;\
  280. }\
  281. }\
  282. \
  283. static void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  284. {\
  285. int i;\
  286. const uint64_t a= LD64(pixels );\
  287. const uint64_t b= LD64(pixels+1);\
  288. uint64_t l0= (a&0x0303030303030303ULL)\
  289. + (b&0x0303030303030303ULL)\
  290. + 0x0202020202020202ULL;\
  291. uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  292. + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  293. uint64_t l1,h1;\
  294. \
  295. pixels+=line_size;\
  296. for(i=0; i<h; i+=2){\
  297. uint64_t a= LD64(pixels );\
  298. uint64_t b= LD64(pixels+1);\
  299. l1= (a&0x0303030303030303ULL)\
  300. + (b&0x0303030303030303ULL);\
  301. h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  302. + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  303. OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
  304. pixels+=line_size;\
  305. block +=line_size;\
  306. a= LD64(pixels );\
  307. b= LD64(pixels+1);\
  308. l0= (a&0x0303030303030303ULL)\
  309. + (b&0x0303030303030303ULL)\
  310. + 0x0202020202020202ULL;\
  311. h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  312. + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  313. OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
  314. pixels+=line_size;\
  315. block +=line_size;\
  316. }\
  317. }\
  318. \
  319. static void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  320. {\
  321. int i;\
  322. const uint64_t a= LD64(pixels );\
  323. const uint64_t b= LD64(pixels+1);\
  324. uint64_t l0= (a&0x0303030303030303ULL)\
  325. + (b&0x0303030303030303ULL)\
  326. + 0x0101010101010101ULL;\
  327. uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  328. + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  329. uint64_t l1,h1;\
  330. \
  331. pixels+=line_size;\
  332. for(i=0; i<h; i+=2){\
  333. uint64_t a= LD64(pixels );\
  334. uint64_t b= LD64(pixels+1);\
  335. l1= (a&0x0303030303030303ULL)\
  336. + (b&0x0303030303030303ULL);\
  337. h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  338. + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  339. OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
  340. pixels+=line_size;\
  341. block +=line_size;\
  342. a= LD64(pixels );\
  343. b= LD64(pixels+1);\
  344. l0= (a&0x0303030303030303ULL)\
  345. + (b&0x0303030303030303ULL)\
  346. + 0x0101010101010101ULL;\
  347. h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  348. + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  349. OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
  350. pixels+=line_size;\
  351. block +=line_size;\
  352. }\
  353. }\
  354. \
  355. CALL_2X_PIXELS(OPNAME ## _pixels16 , OPNAME ## _pixels , 8)\
  356. CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels_x2 , 8)\
  357. CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels_y2 , 8)\
  358. CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels_xy2, 8)\
  359. CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels_x2 , 8)\
  360. CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels_y2 , 8)\
  361. CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels_xy2, 8)\
  362. \
  363. void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
  364. {\
  365. OPNAME ## _pixels,\
  366. OPNAME ## _pixels_x2,\
  367. OPNAME ## _pixels_y2,\
  368. OPNAME ## _pixels_xy2},\
  369. {\
  370. OPNAME ## _pixels16,\
  371. OPNAME ## _pixels16_x2,\
  372. OPNAME ## _pixels16_y2,\
  373. OPNAME ## _pixels16_xy2}\
  374. };\
  375. \
  376. void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
  377. {\
  378. OPNAME ## _pixels,\
  379. OPNAME ## _no_rnd_pixels_x2,\
  380. OPNAME ## _no_rnd_pixels_y2,\
  381. OPNAME ## _no_rnd_pixels_xy2},\
  382. {\
  383. OPNAME ## _pixels16,\
  384. OPNAME ## _no_rnd_pixels16_x2,\
  385. OPNAME ## _no_rnd_pixels16_y2,\
  386. OPNAME ## _no_rnd_pixels16_xy2}\
  387. };
  388. #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else /* 32 bit variant; the disabled branch above is the 64 bit one */
  390. #define PIXOP2(OPNAME, OP) \
  391. static void OPNAME ## _pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
  392. int i;\
  393. for(i=0; i<h; i++){\
  394. OP(*((uint32_t*)(block )), LD32(pixels ));\
  395. OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
  396. pixels+=line_size;\
  397. block +=line_size;\
  398. }\
  399. }\
  400. static inline void OPNAME ## _no_rnd_pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
  401. OPNAME ## _pixels8(block, pixels, line_size, h);\
  402. }\
  403. \
  404. static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
  405. int src_stride1, int src_stride2, int h){\
  406. int i;\
  407. for(i=0; i<h; i++){\
  408. uint32_t a,b;\
  409. a= LD32(&src1[i*src_stride1 ]);\
  410. b= LD32(&src2[i*src_stride2 ]);\
  411. OP(*((uint32_t*)&dst[i*dst_stride ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
  412. a= LD32(&src1[i*src_stride1+4]);\
  413. b= LD32(&src2[i*src_stride2+4]);\
  414. OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
  415. }\
  416. }\
  417. \
  418. static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
  419. int src_stride1, int src_stride2, int h){\
  420. int i;\
  421. for(i=0; i<h; i++){\
  422. uint32_t a,b;\
  423. a= LD32(&src1[i*src_stride1 ]);\
  424. b= LD32(&src2[i*src_stride2 ]);\
  425. OP(*((uint32_t*)&dst[i*dst_stride ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
  426. a= LD32(&src1[i*src_stride1+4]);\
  427. b= LD32(&src2[i*src_stride2+4]);\
  428. OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
  429. }\
  430. }\
  431. \
  432. static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
  433. int src_stride1, int src_stride2, int h){\
  434. OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
  435. OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
  436. }\
  437. \
  438. static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
  439. int src_stride1, int src_stride2, int h){\
  440. OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
  441. OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
  442. }\
  443. \
  444. static inline void OPNAME ## _no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
  445. OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
  446. }\
  447. \
  448. static inline void OPNAME ## _pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
  449. OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
  450. }\
  451. \
  452. static inline void OPNAME ## _no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
  453. OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
  454. }\
  455. \
  456. static inline void OPNAME ## _pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
  457. OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
  458. }\
  459. \
  460. static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
  461. int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  462. int i;\
  463. for(i=0; i<h; i++){\
  464. uint32_t a, b, c, d, l0, l1, h0, h1;\
  465. a= LD32(&src1[i*src_stride1]);\
  466. b= LD32(&src2[i*src_stride2]);\
  467. c= LD32(&src3[i*src_stride3]);\
  468. d= LD32(&src4[i*src_stride4]);\
  469. l0= (a&0x03030303UL)\
  470. + (b&0x03030303UL)\
  471. + 0x02020202UL;\
  472. h0= ((a&0xFCFCFCFCUL)>>2)\
  473. + ((b&0xFCFCFCFCUL)>>2);\
  474. l1= (c&0x03030303UL)\
  475. + (d&0x03030303UL);\
  476. h1= ((c&0xFCFCFCFCUL)>>2)\
  477. + ((d&0xFCFCFCFCUL)>>2);\
  478. OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  479. a= LD32(&src1[i*src_stride1+4]);\
  480. b= LD32(&src2[i*src_stride2+4]);\
  481. c= LD32(&src3[i*src_stride3+4]);\
  482. d= LD32(&src4[i*src_stride4+4]);\
  483. l0= (a&0x03030303UL)\
  484. + (b&0x03030303UL)\
  485. + 0x02020202UL;\
  486. h0= ((a&0xFCFCFCFCUL)>>2)\
  487. + ((b&0xFCFCFCFCUL)>>2);\
  488. l1= (c&0x03030303UL)\
  489. + (d&0x03030303UL);\
  490. h1= ((c&0xFCFCFCFCUL)>>2)\
  491. + ((d&0xFCFCFCFCUL)>>2);\
  492. OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  493. }\
  494. }\
  495. static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
  496. int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  497. int i;\
  498. for(i=0; i<h; i++){\
  499. uint32_t a, b, c, d, l0, l1, h0, h1;\
  500. a= LD32(&src1[i*src_stride1]);\
  501. b= LD32(&src2[i*src_stride2]);\
  502. c= LD32(&src3[i*src_stride3]);\
  503. d= LD32(&src4[i*src_stride4]);\
  504. l0= (a&0x03030303UL)\
  505. + (b&0x03030303UL)\
  506. + 0x01010101UL;\
  507. h0= ((a&0xFCFCFCFCUL)>>2)\
  508. + ((b&0xFCFCFCFCUL)>>2);\
  509. l1= (c&0x03030303UL)\
  510. + (d&0x03030303UL);\
  511. h1= ((c&0xFCFCFCFCUL)>>2)\
  512. + ((d&0xFCFCFCFCUL)>>2);\
  513. OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  514. a= LD32(&src1[i*src_stride1+4]);\
  515. b= LD32(&src2[i*src_stride2+4]);\
  516. c= LD32(&src3[i*src_stride3+4]);\
  517. d= LD32(&src4[i*src_stride4+4]);\
  518. l0= (a&0x03030303UL)\
  519. + (b&0x03030303UL)\
  520. + 0x01010101UL;\
  521. h0= ((a&0xFCFCFCFCUL)>>2)\
  522. + ((b&0xFCFCFCFCUL)>>2);\
  523. l1= (c&0x03030303UL)\
  524. + (d&0x03030303UL);\
  525. h1= ((c&0xFCFCFCFCUL)>>2)\
  526. + ((d&0xFCFCFCFCUL)>>2);\
  527. OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  528. }\
  529. }\
  530. static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
  531. int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  532. OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
  533. OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
  534. }\
  535. static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
  536. int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
  537. OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
  538. OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
  539. }\
  540. \
  541. static inline void OPNAME ## _pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  542. {\
  543. int j;\
  544. for(j=0; j<2; j++){\
  545. int i;\
  546. const uint32_t a= LD32(pixels );\
  547. const uint32_t b= LD32(pixels+1);\
  548. uint32_t l0= (a&0x03030303UL)\
  549. + (b&0x03030303UL)\
  550. + 0x02020202UL;\
  551. uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
  552. + ((b&0xFCFCFCFCUL)>>2);\
  553. uint32_t l1,h1;\
  554. \
  555. pixels+=line_size;\
  556. for(i=0; i<h; i+=2){\
  557. uint32_t a= LD32(pixels );\
  558. uint32_t b= LD32(pixels+1);\
  559. l1= (a&0x03030303UL)\
  560. + (b&0x03030303UL);\
  561. h1= ((a&0xFCFCFCFCUL)>>2)\
  562. + ((b&0xFCFCFCFCUL)>>2);\
  563. OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  564. pixels+=line_size;\
  565. block +=line_size;\
  566. a= LD32(pixels );\
  567. b= LD32(pixels+1);\
  568. l0= (a&0x03030303UL)\
  569. + (b&0x03030303UL)\
  570. + 0x02020202UL;\
  571. h0= ((a&0xFCFCFCFCUL)>>2)\
  572. + ((b&0xFCFCFCFCUL)>>2);\
  573. OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  574. pixels+=line_size;\
  575. block +=line_size;\
  576. }\
  577. pixels+=4-line_size*(h+1);\
  578. block +=4-line_size*h;\
  579. }\
  580. }\
  581. \
  582. static inline void OPNAME ## _no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  583. {\
  584. int j;\
  585. for(j=0; j<2; j++){\
  586. int i;\
  587. const uint32_t a= LD32(pixels );\
  588. const uint32_t b= LD32(pixels+1);\
  589. uint32_t l0= (a&0x03030303UL)\
  590. + (b&0x03030303UL)\
  591. + 0x01010101UL;\
  592. uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
  593. + ((b&0xFCFCFCFCUL)>>2);\
  594. uint32_t l1,h1;\
  595. \
  596. pixels+=line_size;\
  597. for(i=0; i<h; i+=2){\
  598. uint32_t a= LD32(pixels );\
  599. uint32_t b= LD32(pixels+1);\
  600. l1= (a&0x03030303UL)\
  601. + (b&0x03030303UL);\
  602. h1= ((a&0xFCFCFCFCUL)>>2)\
  603. + ((b&0xFCFCFCFCUL)>>2);\
  604. OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  605. pixels+=line_size;\
  606. block +=line_size;\
  607. a= LD32(pixels );\
  608. b= LD32(pixels+1);\
  609. l0= (a&0x03030303UL)\
  610. + (b&0x03030303UL)\
  611. + 0x01010101UL;\
  612. h0= ((a&0xFCFCFCFCUL)>>2)\
  613. + ((b&0xFCFCFCFCUL)>>2);\
  614. OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  615. pixels+=line_size;\
  616. block +=line_size;\
  617. }\
  618. pixels+=4-line_size*(h+1);\
  619. block +=4-line_size*h;\
  620. }\
  621. }\
  622. \
  623. CALL_2X_PIXELS(OPNAME ## _pixels16 , OPNAME ## _pixels8 , 8)\
  624. CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels8_x2 , 8)\
  625. CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels8_y2 , 8)\
  626. CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels8_xy2, 8)\
  627. CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16 , OPNAME ## _pixels8 , 8)\
  628. CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels8_x2 , 8)\
  629. CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels8_y2 , 8)\
  630. CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels8_xy2, 8)\
  631. \
  632. void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
  633. {\
  634. OPNAME ## _pixels16,\
  635. OPNAME ## _pixels16_x2,\
  636. OPNAME ## _pixels16_y2,\
  637. OPNAME ## _pixels16_xy2},\
  638. {\
  639. OPNAME ## _pixels8,\
  640. OPNAME ## _pixels8_x2,\
  641. OPNAME ## _pixels8_y2,\
  642. OPNAME ## _pixels8_xy2},\
  643. };\
  644. \
  645. void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
  646. {\
  647. OPNAME ## _pixels16,\
  648. OPNAME ## _no_rnd_pixels16_x2,\
  649. OPNAME ## _no_rnd_pixels16_y2,\
  650. OPNAME ## _no_rnd_pixels16_xy2},\
  651. {\
  652. OPNAME ## _pixels8,\
  653. OPNAME ## _no_rnd_pixels8_x2,\
  654. OPNAME ## _no_rnd_pixels8_y2,\
  655. OPNAME ## _no_rnd_pixels8_xy2},\
  656. };
  657. #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
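/* byte-wise average without inter-byte carries:
   (a|b) - (((a^b)&0xFEFEFEFE)>>1) == per-byte (a+b+1)>>1 (rounding up),
   (a&b) + (((a^b)&0xFEFEFEFE)>>1) == per-byte (a+b)>>1   (rounding down);
   masking with 0xFE keeps the shift from pulling bits across byte boundaries */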
  658. #endif
  659. #define op_put(a, b) a = b
  660. PIXOP2(avg, op_avg)
  661. PIXOP2(put, op_put)
  662. #undef op_avg
  663. #undef op_put
  664. #if 0
/* FIXME: this stuff could be removed, as it's not really used anymore */
  666. #define PIXOP(BTYPE, OPNAME, OP, INCR) \
  667. \
  668. static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
  669. { \
  670. BTYPE *p; \
  671. const UINT8 *pix; \
  672. \
  673. p = block; \
  674. pix = pixels; \
  675. do { \
  676. OP(p[0], pix[0]); \
  677. OP(p[1], pix[1]); \
  678. OP(p[2], pix[2]); \
  679. OP(p[3], pix[3]); \
  680. OP(p[4], pix[4]); \
  681. OP(p[5], pix[5]); \
  682. OP(p[6], pix[6]); \
  683. OP(p[7], pix[7]); \
  684. pix += line_size; \
  685. p += INCR; \
  686. } while (--h);; \
  687. } \
  688. \
  689. static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
  690. { \
  691. BTYPE *p; \
  692. const UINT8 *pix; \
  693. \
  694. p = block; \
  695. pix = pixels; \
  696. do { \
  697. OP(p[0], avg2(pix[0], pix[1])); \
  698. OP(p[1], avg2(pix[1], pix[2])); \
  699. OP(p[2], avg2(pix[2], pix[3])); \
  700. OP(p[3], avg2(pix[3], pix[4])); \
  701. OP(p[4], avg2(pix[4], pix[5])); \
  702. OP(p[5], avg2(pix[5], pix[6])); \
  703. OP(p[6], avg2(pix[6], pix[7])); \
  704. OP(p[7], avg2(pix[7], pix[8])); \
  705. pix += line_size; \
  706. p += INCR; \
  707. } while (--h); \
  708. } \
  709. \
  710. static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
  711. { \
  712. BTYPE *p; \
  713. const UINT8 *pix; \
  714. const UINT8 *pix1; \
  715. \
  716. p = block; \
  717. pix = pixels; \
  718. pix1 = pixels + line_size; \
  719. do { \
  720. OP(p[0], avg2(pix[0], pix1[0])); \
  721. OP(p[1], avg2(pix[1], pix1[1])); \
  722. OP(p[2], avg2(pix[2], pix1[2])); \
  723. OP(p[3], avg2(pix[3], pix1[3])); \
  724. OP(p[4], avg2(pix[4], pix1[4])); \
  725. OP(p[5], avg2(pix[5], pix1[5])); \
  726. OP(p[6], avg2(pix[6], pix1[6])); \
  727. OP(p[7], avg2(pix[7], pix1[7])); \
  728. pix += line_size; \
  729. pix1 += line_size; \
  730. p += INCR; \
  731. } while(--h); \
  732. } \
  733. \
  734. static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
  735. { \
  736. BTYPE *p; \
  737. const UINT8 *pix; \
  738. const UINT8 *pix1; \
  739. \
  740. p = block; \
  741. pix = pixels; \
  742. pix1 = pixels + line_size; \
  743. do { \
  744. OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
  745. OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
  746. OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
  747. OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
  748. OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
  749. OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
  750. OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
  751. OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
  752. pix += line_size; \
  753. pix1 += line_size; \
  754. p += INCR; \
  755. } while(--h); \
  756. } \
  757. \
  758. void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
  759. OPNAME ## _pixels, \
  760. OPNAME ## _pixels_x2, \
  761. OPNAME ## _pixels_y2, \
  762. OPNAME ## _pixels_xy2, \
  763. };
  764. /* rounding primitives */
  765. #define avg2(a,b) ((a+b+1)>>1)
  766. #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
  767. #define op_avg(a, b) a = avg2(a, b)
  768. #define op_sub(a, b) a -= b
  769. #define op_put(a, b) a = b
  770. PIXOP(DCTELEM, sub, op_sub, 8)
  771. PIXOP(uint8_t, avg, op_avg, line_size)
  772. PIXOP(uint8_t, put, op_put, line_size)
/* non-rounding primitives */
  774. #undef avg2
  775. #undef avg4
  776. #define avg2(a,b) ((a+b)>>1)
  777. #define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
  778. PIXOP(uint8_t, avg_no_rnd, op_avg, line_size)
  779. PIXOP(uint8_t, put_no_rnd, op_put, line_size)
  780. /* motion estimation */
  781. #undef avg2
  782. #undef avg4
  783. #endif
  784. #define avg2(a,b) ((a+b+1)>>1)
  785. #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
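/* bilinear interpolation with 1/16-pel fractional offsets (x16, y16), used for
   global motion compensation; the four weights A, B, C, D always sum to 256,
   so the result is renormalized with (... + rounder) >> 8 */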
  786. static void gmc1_c(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder)
  787. {
  788. const int A=(16-x16)*(16-y16);
  789. const int B=( x16)*(16-y16);
  790. const int C=(16-x16)*( y16);
  791. const int D=( x16)*( y16);
  792. int i;
  793. rounder= 128 - rounder;
  794. for(i=0; i<h; i++)
  795. {
  796. dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
  797. dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
  798. dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
  799. dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
  800. dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
  801. dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
  802. dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
  803. dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
  804. dst+= stride;
  805. src+= stride;
  806. }
  807. }
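/* copy h rows of 17 (resp. 9) pixels into a padded temporary buffer; the extra
   column is needed because the qpel filters read one pixel past the block */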
  808. static inline void copy_block17(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
  809. {
  810. int i;
  811. for(i=0; i<h; i++)
  812. {
  813. ST32(dst , LD32(src ));
  814. ST32(dst+4 , LD32(src+4 ));
  815. ST32(dst+8 , LD32(src+8 ));
  816. ST32(dst+12, LD32(src+12));
  817. dst[16]= src[16];
  818. dst+=dstStride;
  819. src+=srcStride;
  820. }
  821. }
  822. static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
  823. {
  824. int i;
  825. for(i=0; i<h; i++)
  826. {
  827. ST32(dst , LD32(src ));
  828. ST32(dst+4 , LD32(src+4 ));
  829. dst[8]= src[8];
  830. dst+=dstStride;
  831. src+=srcStride;
  832. }
  833. }
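/* QPEL_MC generates the quarter-pel motion compensation functions (put,
   put_no_rnd, avg) for 8x8 and 16x16 blocks. The h/v lowpass filters use the
   8-tap kernel (-1, 3, -6, 20, 20, -6, 3, -1), mirrored at the block edges;
   the taps sum to 32, so OP renormalizes with cm[(x+16)>>5] (or +15 for the
   no-rounding variants). */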
  834. #define QPEL_MC(r, OPNAME, RND, OP) \
  835. static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
  836. UINT8 *cm = cropTbl + MAX_NEG_CROP;\
  837. int i;\
  838. for(i=0; i<h; i++)\
  839. {\
  840. OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
  841. OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
  842. OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
  843. OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
  844. OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
  845. OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
  846. OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
  847. OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
  848. dst+=dstStride;\
  849. src+=srcStride;\
  850. }\
  851. }\
  852. \
  853. static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
  854. UINT8 *cm = cropTbl + MAX_NEG_CROP;\
  855. int i;\
  856. for(i=0; i<w; i++)\
  857. {\
  858. const int src0= src[0*srcStride];\
  859. const int src1= src[1*srcStride];\
  860. const int src2= src[2*srcStride];\
  861. const int src3= src[3*srcStride];\
  862. const int src4= src[4*srcStride];\
  863. const int src5= src[5*srcStride];\
  864. const int src6= src[6*srcStride];\
  865. const int src7= src[7*srcStride];\
  866. const int src8= src[8*srcStride];\
  867. OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
  868. OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
  869. OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
  870. OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
  871. OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
  872. OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
  873. OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
  874. OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
  875. dst++;\
  876. src++;\
  877. }\
  878. }\
  879. \
  880. static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
  881. UINT8 *cm = cropTbl + MAX_NEG_CROP;\
  882. int i;\
  883. for(i=0; i<h; i++)\
  884. {\
  885. OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
  886. OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
  887. OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
  888. OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
  889. OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
  890. OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
  891. OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
  892. OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
  893. OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
  894. OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
  895. OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
  896. OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
  897. OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
  898. OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
  899. OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
  900. OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
  901. dst+=dstStride;\
  902. src+=srcStride;\
  903. }\
  904. }\
  905. \
  906. static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
  907. UINT8 *cm = cropTbl + MAX_NEG_CROP;\
  908. int i;\
  909. for(i=0; i<w; i++)\
  910. {\
  911. const int src0= src[0*srcStride];\
  912. const int src1= src[1*srcStride];\
  913. const int src2= src[2*srcStride];\
  914. const int src3= src[3*srcStride];\
  915. const int src4= src[4*srcStride];\
  916. const int src5= src[5*srcStride];\
  917. const int src6= src[6*srcStride];\
  918. const int src7= src[7*srcStride];\
  919. const int src8= src[8*srcStride];\
  920. const int src9= src[9*srcStride];\
  921. const int src10= src[10*srcStride];\
  922. const int src11= src[11*srcStride];\
  923. const int src12= src[12*srcStride];\
  924. const int src13= src[13*srcStride];\
  925. const int src14= src[14*srcStride];\
  926. const int src15= src[15*srcStride];\
  927. const int src16= src[16*srcStride];\
  928. OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
  929. OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
  930. OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
  931. OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
  932. OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
  933. OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
  934. OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
  935. OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
  936. OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
  937. OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
  938. OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
  939. OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
  940. OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
  941. OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
  942. OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
  943. OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
  944. dst++;\
  945. src++;\
  946. }\
  947. }\
  948. \
  949. static void OPNAME ## qpel8_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
  950. OPNAME ## pixels8(dst, src, stride, 8);\
  951. }\
  952. \
  953. static void OPNAME ## qpel8_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
  954. UINT8 half[64];\
  955. put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
  956. OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
  957. }\
  958. \
  959. static void OPNAME ## qpel8_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
  960. OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
  961. }\
  962. \
  963. static void OPNAME ## qpel8_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
  964. UINT8 half[64];\
  965. put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
  966. OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
  967. }\
  968. \
  969. static void OPNAME ## qpel8_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
  970. UINT8 full[16*9];\
  971. UINT8 half[64];\
  972. copy_block9(full, src, 16, stride, 9);\
  973. put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
  974. OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
  975. }\
  976. \
  977. static void OPNAME ## qpel8_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
  978. UINT8 full[16*9];\
  979. copy_block9(full, src, 16, stride, 9);\
  980. OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16, 8);\
  981. }\
  982. \
  983. static void OPNAME ## qpel8_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
  984. UINT8 full[16*9];\
  985. UINT8 half[64];\
  986. copy_block9(full, src, 16, stride, 9);\
  987. put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
  988. OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
  989. }\
  990. static void OPNAME ## qpel8_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
  991. UINT8 full[16*9];\
  992. UINT8 halfH[72];\
  993. UINT8 halfV[64];\
  994. UINT8 halfHV[64];\
  995. copy_block9(full, src, 16, stride, 9);\
  996. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  997. put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
  998. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
  999. OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
  1000. }\
  1001. static void OPNAME ## qpel8_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
  1002. UINT8 full[16*9];\
  1003. UINT8 halfH[72];\
  1004. UINT8 halfV[64];\
  1005. UINT8 halfHV[64];\
  1006. copy_block9(full, src, 16, stride, 9);\
  1007. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  1008. put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
  1009. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
  1010. OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
  1011. }\
  1012. static void OPNAME ## qpel8_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
  1013. UINT8 full[16*9];\
  1014. UINT8 halfH[72];\
  1015. UINT8 halfV[64];\
  1016. UINT8 halfHV[64];\
  1017. copy_block9(full, src, 16, stride, 9);\
  1018. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  1019. put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
  1020. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
  1021. OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
  1022. }\
  1023. static void OPNAME ## qpel8_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
  1024. UINT8 full[16*9];\
  1025. UINT8 halfH[72];\
  1026. UINT8 halfV[64];\
  1027. UINT8 halfHV[64];\
  1028. copy_block9(full, src, 16, stride, 9);\
  1029. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
  1030. put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
  1031. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
  1032. OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
  1033. }\
  1034. static void OPNAME ## qpel8_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
  1035. UINT8 halfH[72];\
  1036. UINT8 halfHV[64];\
  1037. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
  1038. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
  1039. OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
  1040. }\
  1041. static void OPNAME ## qpel8_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
  1042. UINT8 halfH[72];\
  1043. UINT8 halfHV[64];\
  1044. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
  1045. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
  1046. OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
  1047. }\
  1048. static void OPNAME ## qpel8_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
  1049. UINT8 full[16*9];\
  1050. UINT8 halfH[72];\
  1051. UINT8 halfV[64];\
  1052. UINT8 halfHV[64];\
  1053. copy_block9(full, src, 16, stride, 9);\
  1054. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  1055. put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
  1056. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
  1057. OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
  1058. }\
  1059. static void OPNAME ## qpel8_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
  1060. UINT8 full[16*9];\
  1061. UINT8 halfH[72];\
  1062. UINT8 halfV[64];\
  1063. UINT8 halfHV[64];\
  1064. copy_block9(full, src, 16, stride, 9);\
  1065. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
  1066. put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
  1067. put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
  1068. OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
  1069. }\
  1070. static void OPNAME ## qpel8_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
  1071. UINT8 halfH[72];\
  1072. put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
  1073. OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8, 8);\
  1074. }\
  1075. static void OPNAME ## qpel16_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
  1076. OPNAME ## pixels16(dst, src, stride, 16);\
  1077. }\
  1078. \
  1079. static void OPNAME ## qpel16_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
  1080. UINT8 half[256];\
  1081. put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
  1082. OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
  1083. }\
  1084. \
  1085. static void OPNAME ## qpel16_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
  1086. OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
  1087. }\
  1088. \
  1089. static void OPNAME ## qpel16_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
  1090. UINT8 half[256];\
  1091. put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
  1092. OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
  1093. }\
  1094. \
  1095. static void OPNAME ## qpel16_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
  1096. UINT8 full[24*17];\
  1097. UINT8 half[256];\
  1098. copy_block17(full, src, 24, stride, 17);\
  1099. put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
  1100. OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
  1101. }\
  1102. \
  1103. static void OPNAME ## qpel16_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
  1104. UINT8 full[24*17];\
  1105. copy_block17(full, src, 24, stride, 17);\
  1106. OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24, 16);\
  1107. }\
  1108. \
  1109. static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
  1110. UINT8 full[24*17];\
  1111. UINT8 half[256];\
  1112. copy_block17(full, src, 24, stride, 17);\
  1113. put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
  1114. OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
  1115. }\
  1116. static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
  1117. UINT8 full[24*17];\
  1118. UINT8 halfH[272];\
  1119. UINT8 halfV[256];\
  1120. UINT8 halfHV[256];\
  1121. copy_block17(full, src, 24, stride, 17);\
  1122. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  1123. put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
  1124. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
  1125. OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
  1126. }\
  1127. static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
  1128. UINT8 full[24*17];\
  1129. UINT8 halfH[272];\
  1130. UINT8 halfV[256];\
  1131. UINT8 halfHV[256];\
  1132. copy_block17(full, src, 24, stride, 17);\
  1133. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  1134. put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
  1135. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
  1136. OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
  1137. }\
  1138. static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
  1139. UINT8 full[24*17];\
  1140. UINT8 halfH[272];\
  1141. UINT8 halfV[256];\
  1142. UINT8 halfHV[256];\
  1143. copy_block17(full, src, 24, stride, 17);\
  1144. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  1145. put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
  1146. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
  1147. OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
  1148. }\
  1149. static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
  1150. UINT8 full[24*17];\
  1151. UINT8 halfH[272];\
  1152. UINT8 halfV[256];\
  1153. UINT8 halfHV[256];\
  1154. copy_block17(full, src, 24, stride, 17);\
  1155. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
  1156. put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
  1157. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
  1158. OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
  1159. }\
  1160. static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
  1161. UINT8 halfH[272];\
  1162. UINT8 halfHV[256];\
  1163. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
  1164. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
  1165. OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
  1166. }\
  1167. static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
  1168. UINT8 halfH[272];\
  1169. UINT8 halfHV[256];\
  1170. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
  1171. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
  1172. OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
  1173. }\
  1174. static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
  1175. UINT8 full[24*17];\
  1176. UINT8 halfH[272];\
  1177. UINT8 halfV[256];\
  1178. UINT8 halfHV[256];\
  1179. copy_block17(full, src, 24, stride, 17);\
  1180. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  1181. put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
  1182. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
  1183. OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
  1184. }\
  1185. static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
  1186. UINT8 full[24*17];\
  1187. UINT8 halfH[272];\
  1188. UINT8 halfV[256];\
  1189. UINT8 halfHV[256];\
  1190. copy_block17(full, src, 24, stride, 17);\
  1191. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
  1192. put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
  1193. put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
  1194. OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
  1195. }\
  1196. static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
  1197. UINT8 halfH[272];\
  1198. put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
  1199. OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16, 16);\
  1200. }\
  1201. qpel_mc_func OPNAME ## qpel_pixels_tab[2][16]={ \
  1202. {\
  1203. OPNAME ## qpel16_mc00_c, \
  1204. OPNAME ## qpel16_mc10_c, \
  1205. OPNAME ## qpel16_mc20_c, \
  1206. OPNAME ## qpel16_mc30_c, \
  1207. OPNAME ## qpel16_mc01_c, \
  1208. OPNAME ## qpel16_mc11_c, \
  1209. OPNAME ## qpel16_mc21_c, \
  1210. OPNAME ## qpel16_mc31_c, \
  1211. OPNAME ## qpel16_mc02_c, \
  1212. OPNAME ## qpel16_mc12_c, \
  1213. OPNAME ## qpel16_mc22_c, \
  1214. OPNAME ## qpel16_mc32_c, \
  1215. OPNAME ## qpel16_mc03_c, \
  1216. OPNAME ## qpel16_mc13_c, \
  1217. OPNAME ## qpel16_mc23_c, \
  1218. OPNAME ## qpel16_mc33_c, \
  1219. },{\
  1220. OPNAME ## qpel8_mc00_c, \
  1221. OPNAME ## qpel8_mc10_c, \
  1222. OPNAME ## qpel8_mc20_c, \
  1223. OPNAME ## qpel8_mc30_c, \
  1224. OPNAME ## qpel8_mc01_c, \
  1225. OPNAME ## qpel8_mc11_c, \
  1226. OPNAME ## qpel8_mc21_c, \
  1227. OPNAME ## qpel8_mc31_c, \
  1228. OPNAME ## qpel8_mc02_c, \
  1229. OPNAME ## qpel8_mc12_c, \
  1230. OPNAME ## qpel8_mc22_c, \
  1231. OPNAME ## qpel8_mc32_c, \
  1232. OPNAME ## qpel8_mc03_c, \
  1233. OPNAME ## qpel8_mc13_c, \
  1234. OPNAME ## qpel8_mc23_c, \
  1235. OPNAME ## qpel8_mc33_c, \
  1236. }\
  1237. };
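/* the lowpass filter output is scaled by 32; these OP macros renormalize it
   (+16 for rounding, +15 for the no-rounding variants) and clamp via cropTbl */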
  1238. #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
  1239. #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
  1240. #define op_put(a, b) a = cm[((b) + 16)>>5]
  1241. #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
  1242. QPEL_MC(0, put_ , _ , op_put)
  1243. QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
  1244. QPEL_MC(0, avg_ , _ , op_avg)
  1245. //QPEL_MC(1, avg_no_rnd , _ , op_avg)
  1246. #undef op_avg
  1247. #undef op_avg_no_rnd
  1248. #undef op_put
  1249. #undef op_put_no_rnd
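/* sum of absolute differences (SAD) for 16x16 and 8x8 blocks; the _x2, _y2 and
   _xy2 variants compare pix1 against a half-pel interpolated version of pix2 */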
  1250. int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
  1251. {
  1252. int s, i;
  1253. s = 0;
  1254. for(i=0;i<16;i++) {
  1255. s += abs(pix1[0] - pix2[0]);
  1256. s += abs(pix1[1] - pix2[1]);
  1257. s += abs(pix1[2] - pix2[2]);
  1258. s += abs(pix1[3] - pix2[3]);
  1259. s += abs(pix1[4] - pix2[4]);
  1260. s += abs(pix1[5] - pix2[5]);
  1261. s += abs(pix1[6] - pix2[6]);
  1262. s += abs(pix1[7] - pix2[7]);
  1263. s += abs(pix1[8] - pix2[8]);
  1264. s += abs(pix1[9] - pix2[9]);
  1265. s += abs(pix1[10] - pix2[10]);
  1266. s += abs(pix1[11] - pix2[11]);
  1267. s += abs(pix1[12] - pix2[12]);
  1268. s += abs(pix1[13] - pix2[13]);
  1269. s += abs(pix1[14] - pix2[14]);
  1270. s += abs(pix1[15] - pix2[15]);
  1271. pix1 += line_size;
  1272. pix2 += line_size;
  1273. }
  1274. return s;
  1275. }
  1276. int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
  1277. {
  1278. int s, i;
  1279. s = 0;
  1280. for(i=0;i<16;i++) {
  1281. s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
  1282. s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
  1283. s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
  1284. s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
  1285. s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
  1286. s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
  1287. s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
  1288. s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
  1289. s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
  1290. s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
  1291. s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
  1292. s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
  1293. s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
  1294. s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
  1295. s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
  1296. s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
  1297. pix1 += line_size;
  1298. pix2 += line_size;
  1299. }
  1300. return s;
  1301. }
  1302. int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
  1303. {
  1304. int s, i;
  1305. UINT8 *pix3 = pix2 + line_size;
  1306. s = 0;
  1307. for(i=0;i<16;i++) {
  1308. s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
  1309. s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
  1310. s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
  1311. s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
  1312. s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
  1313. s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
  1314. s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
  1315. s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
  1316. s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
  1317. s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
  1318. s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
  1319. s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
  1320. s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
  1321. s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
  1322. s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
  1323. s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
  1324. pix1 += line_size;
  1325. pix2 += line_size;
  1326. pix3 += line_size;
  1327. }
  1328. return s;
  1329. }
  1330. int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
  1331. {
  1332. int s, i;
  1333. UINT8 *pix3 = pix2 + line_size;
  1334. s = 0;
  1335. for(i=0;i<16;i++) {
  1336. s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
  1337. s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
  1338. s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
  1339. s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
  1340. s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
  1341. s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
  1342. s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
  1343. s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
  1344. s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
  1345. s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
  1346. s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
  1347. s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
  1348. s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
  1349. s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
  1350. s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
  1351. s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
  1352. pix1 += line_size;
  1353. pix2 += line_size;
  1354. pix3 += line_size;
  1355. }
  1356. return s;
  1357. }
  1358. int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
  1359. {
  1360. int s, i;
  1361. s = 0;
  1362. for(i=0;i<8;i++) {
  1363. s += abs(pix1[0] - pix2[0]);
  1364. s += abs(pix1[1] - pix2[1]);
  1365. s += abs(pix1[2] - pix2[2]);
  1366. s += abs(pix1[3] - pix2[3]);
  1367. s += abs(pix1[4] - pix2[4]);
  1368. s += abs(pix1[5] - pix2[5]);
  1369. s += abs(pix1[6] - pix2[6]);
  1370. s += abs(pix1[7] - pix2[7]);
  1371. pix1 += line_size;
  1372. pix2 += line_size;
  1373. }
  1374. return s;
  1375. }
  1376. int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
  1377. {
  1378. int s, i;
  1379. s = 0;
  1380. for(i=0;i<8;i++) {
  1381. s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
  1382. s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
  1383. s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
  1384. s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
  1385. s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
  1386. s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
  1387. s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
  1388. s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
  1389. pix1 += line_size;
  1390. pix2 += line_size;
  1391. }
  1392. return s;
  1393. }
  1394. int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
  1395. {
  1396. int s, i;
  1397. UINT8 *pix3 = pix2 + line_size;
  1398. s = 0;
  1399. for(i=0;i<8;i++) {
  1400. s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
  1401. s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
  1402. s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
  1403. s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
  1404. s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
  1405. s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
  1406. s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
  1407. s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
  1408. pix1 += line_size;
  1409. pix2 += line_size;
  1410. pix3 += line_size;
  1411. }
  1412. return s;
  1413. }
  1414. int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
  1415. {
  1416. int s, i;
  1417. UINT8 *pix3 = pix2 + line_size;
  1418. s = 0;
  1419. for(i=0;i<8;i++) {
  1420. s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
  1421. s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
  1422. s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
  1423. s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
  1424. s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
  1425. s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
  1426. s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
  1427. s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
  1428. pix1 += line_size;
  1429. pix2 += line_size;
  1430. pix3 += line_size;
  1431. }
  1432. return s;
  1433. }
/* permute the block so that it corresponds to the MMX idct order */
  1436. void block_permute(INT16 *block, UINT8 *permutation)
  1437. {
  1438. int i;
  1439. INT16 temp[64];
  1440. for(i=0; i<64; i++) temp[ permutation[i] ] = block[i];
  1441. for(i=0; i<64; i++) block[i] = temp[i];
  1442. }
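/* zero the 6 blocks of 64 coefficients that make up one macroblock */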
  1443. void clear_blocks_c(DCTELEM *blocks)
  1444. {
  1445. memset(blocks, 0, sizeof(DCTELEM)*6*64);
  1446. }
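/* build the crop/square tables and install the C reference functions, then let
   the architecture-specific init routines override them where available */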
  1447. void dsputil_init(void)
  1448. {
  1449. int i, j;
  1450. for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
  1451. for(i=0;i<MAX_NEG_CROP;i++) {
  1452. cropTbl[i] = 0;
  1453. cropTbl[i + MAX_NEG_CROP + 256] = 255;
  1454. }
  1455. for(i=0;i<512;i++) {
  1456. squareTbl[i] = (i - 256) * (i - 256);
  1457. }
  1458. get_pixels = get_pixels_c;
  1459. diff_pixels = diff_pixels_c;
  1460. put_pixels_clamped = put_pixels_clamped_c;
  1461. add_pixels_clamped = add_pixels_clamped_c;
  1462. gmc1= gmc1_c;
  1463. clear_blocks= clear_blocks_c;
  1464. pix_sum= pix_sum_c;
  1465. pix_norm1= pix_norm1_c;
  1466. pix_abs16x16 = pix_abs16x16_c;
  1467. pix_abs16x16_x2 = pix_abs16x16_x2_c;
  1468. pix_abs16x16_y2 = pix_abs16x16_y2_c;
  1469. pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
  1470. pix_abs8x8 = pix_abs8x8_c;
  1471. pix_abs8x8_x2 = pix_abs8x8_x2_c;
  1472. pix_abs8x8_y2 = pix_abs8x8_y2_c;
  1473. pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
  1474. #ifdef HAVE_MMX
  1475. dsputil_init_mmx();
  1476. #endif
  1477. #ifdef ARCH_ARMV4L
  1478. dsputil_init_armv4l();
  1479. #endif
  1480. #ifdef HAVE_MLIB
  1481. dsputil_init_mlib();
  1482. #endif
  1483. #ifdef ARCH_ALPHA
  1484. dsputil_init_alpha();
  1485. #endif
  1486. #ifdef ARCH_POWERPC
  1487. dsputil_init_ppc();
  1488. #endif
  1489. #ifdef HAVE_MMI
  1490. dsputil_init_mmi();
  1491. #endif
  1492. for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
  1493. }
/* remove any non-bit-exact operations (for testing purposes) */
  1495. void avcodec_set_bit_exact(void)
  1496. {
  1497. ff_bit_exact=1;
  1498. #ifdef HAVE_MMX
  1499. dsputil_set_bit_exact_mmx();
  1500. #endif
  1501. }
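/* compute the luminance PSNR between the original and coded pictures; only
   plane 0 is used here, the chroma planes and linesizes are ignored */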
  1502. void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
  1503. int orig_linesize[3], int coded_linesize,
  1504. AVCodecContext *avctx)
  1505. {
  1506. int quad, diff, x, y;
  1507. UINT8 *orig, *coded;
  1508. UINT32 *sq = squareTbl + 256;
  1509. quad = 0;
  1510. diff = 0;
  1511. /* Luminance */
  1512. orig = orig_image[0];
  1513. coded = coded_image[0];
  1514. for (y=0;y<avctx->height;y++) {
  1515. for (x=0;x<avctx->width;x++) {
  1516. diff = *(orig + x) - *(coded + x);
  1517. quad += sq[diff];
  1518. }
  1519. orig += orig_linesize[0];
  1520. coded += coded_linesize;
  1521. }
  1522. avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
  1523. if (avctx->psnr_y) {
  1524. avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
  1525. avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
  1526. } else
  1527. avctx->psnr_y = 99.99;
  1528. }