You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1679 lines
70KB

  1. /*
  2. * DSP utils
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with this library; if not, write to the Free Software
  17. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. *
  19. * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  20. */
  21. #include "avcodec.h"
  22. #include "dsputil.h"
/* Function pointers for the DSP primitives.  Each points at the C reference
 * implementation by default and may be redirected to an optimized
 * (CPU-specific) version at init time. */
void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
/* Global motion compensation: single-vector (gmc1) and full affine (gmc). */
void (*ff_gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
void (*ff_gmc )(UINT8 *dst, UINT8 *src, int stride, int h, int ox, int oy,
                int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
void (*clear_blocks)(DCTELEM *blocks);
int (*pix_sum)(UINT8 * pix, int line_size);
int (*pix_norm1)(UINT8 * pix, int line_size);
/* Sum-of-absolute-differences for 16x16 and 8x8 blocks; the _x2/_y2/_xy2
 * variants compare against half-pel interpolated references. */
op_pixels_abs_func pix_abs16x16;
op_pixels_abs_func pix_abs16x16_x2;
op_pixels_abs_func pix_abs16x16_y2;
op_pixels_abs_func pix_abs16x16_xy2;
op_pixels_abs_func pix_abs8x8;
op_pixels_abs_func pix_abs8x8_x2;
op_pixels_abs_func pix_abs8x8_y2;
op_pixels_abs_func pix_abs8x8_xy2;
/* When non-zero, only bit-exact implementations should be selected. */
int ff_bit_exact=0;
/* Clipping lookup table; used below via cropTbl + MAX_NEG_CROP so that
 * negative and >255 indices map to clamped byte values (init not visible
 * in this chunk — presumably filled at dsputil init; verify there). */
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
/* Square lookup table; used below via squareTbl + 256, so entry 256+v is
 * presumably v*v — filled at init time (not visible here). */
UINT32 squareTbl[512];
/* Coefficient scan order for the default zigzag scan: entry i is the
 * raster-order index of the i-th scanned coefficient of an 8x8 block. */
const UINT8 ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
UINT16 __align8 inv_zigzag_direct16[64];
/* Alternate horizontal scan order (e.g. for interlaced content). */
const UINT8 ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order. */
const UINT8 ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* (entry b is round_up(2^32 / b); entry 1 is 2^32-1, entry 0 unused) */
UINT32 inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
  111. int pix_sum_c(UINT8 * pix, int line_size)
  112. {
  113. int s, i, j;
  114. s = 0;
  115. for (i = 0; i < 16; i++) {
  116. for (j = 0; j < 16; j += 8) {
  117. s += pix[0];
  118. s += pix[1];
  119. s += pix[2];
  120. s += pix[3];
  121. s += pix[4];
  122. s += pix[5];
  123. s += pix[6];
  124. s += pix[7];
  125. pix += 8;
  126. }
  127. pix += line_size - 16;
  128. }
  129. return s;
  130. }
  131. int pix_norm1_c(UINT8 * pix, int line_size)
  132. {
  133. int s, i, j;
  134. UINT32 *sq = squareTbl + 256;
  135. s = 0;
  136. for (i = 0; i < 16; i++) {
  137. for (j = 0; j < 16; j += 8) {
  138. s += sq[pix[0]];
  139. s += sq[pix[1]];
  140. s += sq[pix[2]];
  141. s += sq[pix[3]];
  142. s += sq[pix[4]];
  143. s += sq[pix[5]];
  144. s += sq[pix[6]];
  145. s += sq[pix[7]];
  146. pix += 8;
  147. }
  148. pix += line_size - 16;
  149. }
  150. return s;
  151. }
  152. void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
  153. {
  154. int i;
  155. /* read the pixels */
  156. for(i=0;i<8;i++) {
  157. block[0] = pixels[0];
  158. block[1] = pixels[1];
  159. block[2] = pixels[2];
  160. block[3] = pixels[3];
  161. block[4] = pixels[4];
  162. block[5] = pixels[5];
  163. block[6] = pixels[6];
  164. block[7] = pixels[7];
  165. pixels += line_size;
  166. block += 8;
  167. }
  168. }
  169. void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
  170. int stride){
  171. int i;
  172. /* read the pixels */
  173. for(i=0;i<8;i++) {
  174. block[0] = s1[0] - s2[0];
  175. block[1] = s1[1] - s2[1];
  176. block[2] = s1[2] - s2[2];
  177. block[3] = s1[3] - s2[3];
  178. block[4] = s1[4] - s2[4];
  179. block[5] = s1[5] - s2[5];
  180. block[6] = s1[6] - s2[6];
  181. block[7] = s1[7] - s2[7];
  182. s1 += stride;
  183. s2 += stride;
  184. block += 8;
  185. }
  186. }
  187. void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
  188. int line_size)
  189. {
  190. int i;
  191. UINT8 *cm = cropTbl + MAX_NEG_CROP;
  192. /* read the pixels */
  193. for(i=0;i<8;i++) {
  194. pixels[0] = cm[block[0]];
  195. pixels[1] = cm[block[1]];
  196. pixels[2] = cm[block[2]];
  197. pixels[3] = cm[block[3]];
  198. pixels[4] = cm[block[4]];
  199. pixels[5] = cm[block[5]];
  200. pixels[6] = cm[block[6]];
  201. pixels[7] = cm[block[7]];
  202. pixels += line_size;
  203. block += 8;
  204. }
  205. }
  206. void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
  207. int line_size)
  208. {
  209. int i;
  210. UINT8 *cm = cropTbl + MAX_NEG_CROP;
  211. /* read the pixels */
  212. for(i=0;i<8;i++) {
  213. pixels[0] = cm[pixels[0] + block[0]];
  214. pixels[1] = cm[pixels[1] + block[1]];
  215. pixels[2] = cm[pixels[2] + block[2]];
  216. pixels[3] = cm[pixels[3] + block[3]];
  217. pixels[4] = cm[pixels[4] + block[4]];
  218. pixels[5] = cm[pixels[5] + block[5]];
  219. pixels[6] = cm[pixels[6] + block[6]];
  220. pixels[7] = cm[pixels[7] + block[7]];
  221. pixels += line_size;
  222. block += 8;
  223. }
  224. }
#if 0
/* 64-bit variant of the put/avg pixel primitives: a whole 8-pixel row is
 * processed with one uint64_t load/store.  This branch is disabled; the
 * 32-bit variant in the #else branch below is the one actually compiled.
 *
 * OPNAME is the name prefix (put/avg), OP the store operator macro.
 * The byte-parallel average tricks:
 *   (a|b) - (((a^b)&0xFE..FE)>>1)  = per-byte (a+b+1)>>1 (round up)
 *   (a&b) + (((a^b)&0xFE..FE)>>1)  = per-byte (a+b)>>1   (round down, no_rnd)
 * The _xy2 variants average 4 neighbours using split low-2-bit (l0/l1) and
 * high-6-bit (h0/h1) accumulators so byte lanes cannot overflow. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0= (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1= (a&0x0303030303030303ULL)\
          + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0= (a&0x0303030303030303ULL)\
          + (b&0x0303030303030303ULL)\
          + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0= (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1= (a&0x0303030303030303ULL)\
          + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0= (a&0x0303030303030303ULL)\
          + (b&0x0303030303030303ULL)\
          + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16    , OPNAME ## _pixels    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels_xy2, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels_xy2, 8)\
\
void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    {\
        OPNAME ## _pixels,\
        OPNAME ## _pixels_x2,\
        OPNAME ## _pixels_y2,\
        OPNAME ## _pixels_xy2},\
    {\
        OPNAME ## _pixels16,\
        OPNAME ## _pixels16_x2,\
        OPNAME ## _pixels16_y2,\
        OPNAME ## _pixels16_xy2}\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    {\
        OPNAME ## _pixels,\
        OPNAME ## _no_rnd_pixels_x2,\
        OPNAME ## _no_rnd_pixels_y2,\
        OPNAME ## _no_rnd_pixels_xy2},\
    {\
        OPNAME ## _pixels16,\
        OPNAME ## _no_rnd_pixels16_x2,\
        OPNAME ## _no_rnd_pixels16_y2,\
        OPNAME ## _no_rnd_pixels16_xy2}\
};
/* byte-wise rounding average, 64-bit lanes */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
/* 32-bit variant of the put/avg pixel primitives: each 8-pixel row is
 * handled as two uint32_t load/store pairs.
 *
 * OPNAME is the name prefix (put/avg), OP the store operator macro
 * (see op_put/op_avg after the macro).  Variant naming:
 *   _x2 / _y2 / _xy2  -- average with the pixel to the right / below / both
 *                        (half-pel interpolation), rounding up;
 *   no_rnd_*          -- same but rounding down;
 *   _l2 / _l4         -- average of 2 / 4 independent sources;
 *   16-wide versions are built from the 8-wide ones (via _l2 splitting or
 *   CALL_2X_PIXELS).
 * Byte-parallel average tricks (per 8-bit lane, no unpacking):
 *   (a|b) - (((a^b)&0xFEFEFEFE)>>1)  = (a+b+1)>>1  (round up)
 *   (a&b) + (((a^b)&0xFEFEFEFE)>>1)  = (a+b)>>1    (round down)
 * The 4-way averages split each byte into its low 2 bits (l0/l1, carrying
 * the rounding constant 0x02.. or 0x01..) and high 6 bits (h0/h1, pre-shifted
 * by 2) so lane sums cannot overflow into the neighbouring byte. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0= (a&0x03030303UL)\
          + (b&0x03030303UL)\
          + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
          + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0= (a&0x03030303UL)\
          + (b&0x03030303UL)\
          + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
          + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0= (a&0x03030303UL)\
          + (b&0x03030303UL)\
          + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
          + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0= (a&0x03030303UL)\
          + (b&0x03030303UL)\
          + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
          + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0= (a&0x03030303UL)\
                   + (b&0x03030303UL)\
                   + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1= (a&0x03030303UL)\
              + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0= (a&0x03030303UL)\
              + (b&0x03030303UL)\
              + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0= (a&0x03030303UL)\
                   + (b&0x03030303UL)\
                   + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1= (a&0x03030303UL)\
              + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0= (a&0x03030303UL)\
              + (b&0x03030303UL)\
              + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16    , OPNAME ## _pixels8    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels8_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels8_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels8_xy2, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16    , OPNAME ## _pixels8    , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels8_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels8_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels8_xy2, 8)\
\
void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    {\
        OPNAME ## _pixels16,\
        OPNAME ## _pixels16_x2,\
        OPNAME ## _pixels16_y2,\
        OPNAME ## _pixels16_xy2},\
    {\
        OPNAME ## _pixels8,\
        OPNAME ## _pixels8_x2,\
        OPNAME ## _pixels8_y2,\
        OPNAME ## _pixels8_xy2},\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    {\
        OPNAME ## _pixels16,\
        OPNAME ## _no_rnd_pixels16_x2,\
        OPNAME ## _no_rnd_pixels16_y2,\
        OPNAME ## _no_rnd_pixels16_xy2},\
    {\
        OPNAME ## _pixels8,\
        OPNAME ## _no_rnd_pixels8_x2,\
        OPNAME ## _no_rnd_pixels8_y2,\
        OPNAME ## _no_rnd_pixels8_xy2},\
};
/* byte-wise rounding average, 32-bit lanes */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
#endif
#define op_put(a, b) a = b
/* Instantiate the whole family for "avg" (average into dst) and "put"
 * (plain store into dst). */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
#if 0
/* FIXME this stuff could be removed as it's not really used anymore */
/* Legacy scalar PIXOP family (disabled): per-byte loops instead of the
 * word-parallel PIXOP2 above.  BTYPE is the destination element type,
 * OP the store operator, INCR the destination row increment.  avg2/avg4
 * are (re)defined below to select rounding vs. non-rounding behaviour
 * before each set of instantiations. */
#define PIXOP(BTYPE, OPNAME, OP, INCR) \
\
static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    BTYPE *p; \
    const UINT8 *pix; \
\
    p = block; \
    pix = pixels; \
    do { \
        OP(p[0], pix[0]); \
        OP(p[1], pix[1]); \
        OP(p[2], pix[2]); \
        OP(p[3], pix[3]); \
        OP(p[4], pix[4]); \
        OP(p[5], pix[5]); \
        OP(p[6], pix[6]); \
        OP(p[7], pix[7]); \
        pix += line_size; \
        p += INCR; \
    } while (--h);; \
} \
\
static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    BTYPE *p; \
    const UINT8 *pix; \
\
    p = block; \
    pix = pixels; \
    do { \
        OP(p[0], avg2(pix[0], pix[1])); \
        OP(p[1], avg2(pix[1], pix[2])); \
        OP(p[2], avg2(pix[2], pix[3])); \
        OP(p[3], avg2(pix[3], pix[4])); \
        OP(p[4], avg2(pix[4], pix[5])); \
        OP(p[5], avg2(pix[5], pix[6])); \
        OP(p[6], avg2(pix[6], pix[7])); \
        OP(p[7], avg2(pix[7], pix[8])); \
        pix += line_size; \
        p += INCR; \
    } while (--h); \
} \
\
static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    BTYPE *p; \
    const UINT8 *pix; \
    const UINT8 *pix1; \
\
    p = block; \
    pix = pixels; \
    pix1 = pixels + line_size; \
    do { \
        OP(p[0], avg2(pix[0], pix1[0])); \
        OP(p[1], avg2(pix[1], pix1[1])); \
        OP(p[2], avg2(pix[2], pix1[2])); \
        OP(p[3], avg2(pix[3], pix1[3])); \
        OP(p[4], avg2(pix[4], pix1[4])); \
        OP(p[5], avg2(pix[5], pix1[5])); \
        OP(p[6], avg2(pix[6], pix1[6])); \
        OP(p[7], avg2(pix[7], pix1[7])); \
        pix += line_size; \
        pix1 += line_size; \
        p += INCR; \
    } while(--h); \
} \
\
static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    BTYPE *p; \
    const UINT8 *pix; \
    const UINT8 *pix1; \
\
    p = block; \
    pix = pixels; \
    pix1 = pixels + line_size; \
    do { \
        OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
        OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
        OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
        OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
        OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
        OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
        OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
        OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
        pix += line_size; \
        pix1 += line_size; \
        p += INCR; \
    } while(--h); \
} \
\
void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
    OPNAME ## _pixels, \
    OPNAME ## _pixels_x2, \
    OPNAME ## _pixels_y2, \
    OPNAME ## _pixels_xy2, \
};
/* rounding primitives */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
#define op_avg(a, b) a = avg2(a, b)
#define op_sub(a, b) a -= b
#define op_put(a, b) a = b
PIXOP(DCTELEM, sub, op_sub, 8)
PIXOP(uint8_t, avg, op_avg, line_size)
PIXOP(uint8_t, put, op_put, line_size)
/* not rounding primitives */
#undef avg2
#undef avg4
#define avg2(a,b) ((a+b)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
PIXOP(uint8_t, avg_no_rnd, op_avg, line_size)
PIXOP(uint8_t, put_no_rnd, op_put, line_size)
/* motion estimation */
#undef avg2
#undef avg4
#endif
/* Rounding averages of 2 and 4 values, used by the motion-compensation
 * helpers below. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
  788. static void gmc1_c(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder)
  789. {
  790. const int A=(16-x16)*(16-y16);
  791. const int B=( x16)*(16-y16);
  792. const int C=(16-x16)*( y16);
  793. const int D=( x16)*( y16);
  794. int i;
  795. for(i=0; i<h; i++)
  796. {
  797. dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
  798. dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
  799. dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
  800. dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
  801. dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
  802. dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
  803. dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
  804. dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
  805. dst+= stride;
  806. src+= stride;
  807. }
  808. }
/* Affine global motion compensation for one 8-pixel-wide column strip,
 * h rows high.  (ox,oy) is the source position of the first pixel in
 * 16.16 fixed point; (dxx,dyx) advance it per pixel and (dxy,dyy) per
 * line.  s = 1<<shift is the sub-pel denominator, r the bilinear
 * rounding constant; width/height bound the valid source area.
 * NOTE(review): frac_x/frac_y are the low 'shift' bits of vx>>16 /
 * vy>>16, i.e. this assumes the sub-pel fraction sits just below bit 16
 * — confirm shift <= 16 against the callers' fixed-point layout. */
static void gmc_c(UINT8 *dst, UINT8 *src, int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
int y, vx, vy;
const int s= 1<<shift;
/* turn width/height into the largest valid source coordinates */
width--;
height--;
for(y=0; y<h; y++){
int x;
vx= ox;
vy= oy;
for(x=0; x<8; x++){ //XXX FIXME optimize
int src_x, src_y, frac_x, frac_y, index;
src_x= vx>>16;
src_y= vy>>16;
frac_x= src_x&(s-1);
frac_y= src_y&(s-1);
src_x>>=shift;
src_y>>=shift;
/* unsigned compare also rejects negative coordinates in one test */
if((unsigned)src_x < width){
if((unsigned)src_y < height){
/* fully inside: full bilinear interpolation of 4 neighbours */
index= src_x + src_y*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
+ src[index +1]* frac_x )*(s-frac_y)
+ ( src[index+stride ]*(s-frac_x)
+ src[index+stride+1]* frac_x )* frac_y
+ r)>>(shift*2);
}else{
/* clipped vertically: interpolate horizontally only */
index= src_x + clip(src_y, 0, height)*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
+ src[index +1]* frac_x )*s
+ r)>>(shift*2);
}
}else{
if((unsigned)src_y < height){
/* clipped horizontally: interpolate vertically only */
index= clip(src_x, 0, width) + src_y*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
+ src[index+stride ]* frac_y )*s
+ r)>>(shift*2);
}else{
/* clipped both ways: replicate the nearest border pixel */
index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
dst[y*stride + x]= src[index ];
}
}
vx+= dxx;
vy+= dyx;
}
ox += dxy;
oy += dyy;
}
}
  860. static inline void copy_block17(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
  861. {
  862. int i;
  863. for(i=0; i<h; i++)
  864. {
  865. ST32(dst , LD32(src ));
  866. ST32(dst+4 , LD32(src+4 ));
  867. ST32(dst+8 , LD32(src+8 ));
  868. ST32(dst+12, LD32(src+12));
  869. dst[16]= src[16];
  870. dst+=dstStride;
  871. src+=srcStride;
  872. }
  873. }
  874. static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
  875. {
  876. int i;
  877. for(i=0; i<h; i++)
  878. {
  879. ST32(dst , LD32(src ));
  880. ST32(dst+4 , LD32(src+4 ));
  881. dst[8]= src[8];
  882. dst+=dstStride;
  883. src+=srcStride;
  884. }
  885. }
/* MPEG-4 quarter-pel motion compensation generator.
 * The half-pel interpolation filter used throughout is
 *     (a+b)*20 - (c+d)*6 + (e+f)*3 - (g+h)
 * i.e. taps (-1,3,-6,20,20,-6,3,-1); source indices that would fall
 * outside the block are mirrored at the edges, which is why the
 * first/last output rows repeat src[0]/src[8] (resp. src[16]) terms.
 * OP() scales the filter output by 1/32 with rounding (RND selects the
 * +16 rounding or +15 no-rounding bias) and clamps through cm[].
 * The generated OPNAME##qpelN_mcXY_c functions cover all 16 quarter-pel
 * positions: X,Y in {0..3} are the horizontal/vertical quarter-pel
 * offsets.  Intermediate half-pel planes are built in local buffers
 * (half/halfH/halfV/halfHV, plus an edge-extended 'full' copy made by
 * copy_block9/copy_block17) and blended with the pixelsN_l2/_l4
 * averaging helpers.  The table at the end indexes the functions by
 * dxy = X + 4*Y, [0] = 16x16, [1] = 8x8. */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
UINT8 *cm = cropTbl + MAX_NEG_CROP;\
int i;\
for(i=0; i<h; i++)\
{\
OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
dst+=dstStride;\
src+=srcStride;\
}\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
UINT8 *cm = cropTbl + MAX_NEG_CROP;\
int i;\
for(i=0; i<w; i++)\
{\
const int src0= src[0*srcStride];\
const int src1= src[1*srcStride];\
const int src2= src[2*srcStride];\
const int src3= src[3*srcStride];\
const int src4= src[4*srcStride];\
const int src5= src[5*srcStride];\
const int src6= src[6*srcStride];\
const int src7= src[7*srcStride];\
const int src8= src[8*srcStride];\
OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
dst++;\
src++;\
}\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
UINT8 *cm = cropTbl + MAX_NEG_CROP;\
int i;\
for(i=0; i<h; i++)\
{\
OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
dst+=dstStride;\
src+=srcStride;\
}\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
UINT8 *cm = cropTbl + MAX_NEG_CROP;\
int i;\
for(i=0; i<w; i++)\
{\
const int src0= src[0*srcStride];\
const int src1= src[1*srcStride];\
const int src2= src[2*srcStride];\
const int src3= src[3*srcStride];\
const int src4= src[4*srcStride];\
const int src5= src[5*srcStride];\
const int src6= src[6*srcStride];\
const int src7= src[7*srcStride];\
const int src8= src[8*srcStride];\
const int src9= src[9*srcStride];\
const int src10= src[10*srcStride];\
const int src11= src[11*srcStride];\
const int src12= src[12*srcStride];\
const int src13= src[13*srcStride];\
const int src14= src[14*srcStride];\
const int src15= src[15*srcStride];\
const int src16= src[16*srcStride];\
OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
dst++;\
src++;\
}\
}\
\
static void OPNAME ## qpel8_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
OPNAME ## pixels8(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 half[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 half[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[16*9];\
UINT8 half[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[16*9];\
copy_block9(full, src, 16, stride, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16, 8);\
}\
\
static void OPNAME ## qpel8_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[16*9];\
UINT8 half[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[16*9];\
UINT8 halfH[72];\
UINT8 halfV[64];\
UINT8 halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[16*9];\
UINT8 halfH[72];\
UINT8 halfV[64];\
UINT8 halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[16*9];\
UINT8 halfH[72];\
UINT8 halfV[64];\
UINT8 halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[16*9];\
UINT8 halfH[72];\
UINT8 halfV[64];\
UINT8 halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 halfH[72];\
UINT8 halfHV[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 halfH[72];\
UINT8 halfHV[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[16*9];\
UINT8 halfH[72];\
UINT8 halfV[64];\
UINT8 halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[16*9];\
UINT8 halfH[72];\
UINT8 halfV[64];\
UINT8 halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 halfH[72];\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8, 8);\
}\
static void OPNAME ## qpel16_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
OPNAME ## pixels16(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 half[256];\
put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 half[256];\
put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[24*17];\
UINT8 half[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[24*17];\
copy_block17(full, src, 24, stride, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24, 16);\
}\
\
static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[24*17];\
UINT8 half[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[24*17];\
UINT8 halfH[272];\
UINT8 halfV[256];\
UINT8 halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[24*17];\
UINT8 halfH[272];\
UINT8 halfV[256];\
UINT8 halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[24*17];\
UINT8 halfH[272];\
UINT8 halfV[256];\
UINT8 halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[24*17];\
UINT8 halfH[272];\
UINT8 halfV[256];\
UINT8 halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 halfH[272];\
UINT8 halfHV[256];\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 halfH[272];\
UINT8 halfHV[256];\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[24*17];\
UINT8 halfH[272];\
UINT8 halfV[256];\
UINT8 halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 full[24*17];\
UINT8 halfH[272];\
UINT8 halfV[256];\
UINT8 halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
UINT8 halfH[272];\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16, 16);\
}\
qpel_mc_func OPNAME ## qpel_pixels_tab[2][16]={ \
{\
OPNAME ## qpel16_mc00_c, \
OPNAME ## qpel16_mc10_c, \
OPNAME ## qpel16_mc20_c, \
OPNAME ## qpel16_mc30_c, \
OPNAME ## qpel16_mc01_c, \
OPNAME ## qpel16_mc11_c, \
OPNAME ## qpel16_mc21_c, \
OPNAME ## qpel16_mc31_c, \
OPNAME ## qpel16_mc02_c, \
OPNAME ## qpel16_mc12_c, \
OPNAME ## qpel16_mc22_c, \
OPNAME ## qpel16_mc32_c, \
OPNAME ## qpel16_mc03_c, \
OPNAME ## qpel16_mc13_c, \
OPNAME ## qpel16_mc23_c, \
OPNAME ## qpel16_mc33_c, \
},{\
OPNAME ## qpel8_mc00_c, \
OPNAME ## qpel8_mc10_c, \
OPNAME ## qpel8_mc20_c, \
OPNAME ## qpel8_mc30_c, \
OPNAME ## qpel8_mc01_c, \
OPNAME ## qpel8_mc11_c, \
OPNAME ## qpel8_mc21_c, \
OPNAME ## qpel8_mc31_c, \
OPNAME ## qpel8_mc02_c, \
OPNAME ## qpel8_mc12_c, \
OPNAME ## qpel8_mc22_c, \
OPNAME ## qpel8_mc32_c, \
OPNAME ## qpel8_mc03_c, \
OPNAME ## qpel8_mc13_c, \
OPNAME ## qpel8_mc23_c, \
OPNAME ## qpel8_mc33_c, \
}\
};
/* OP primitives for QPEL_MC: 'b' is the raw filter output, scaled back
 * by 1/32 with clamping through cm[]; the rounding variants add 16, the
 * no-rounding variants add 15 before the shift.  op_avg additionally
 * averages with the existing destination pixel. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* instantiate the put/put_no_rnd/avg quarter-pel function families */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
  1302. int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
  1303. {
  1304. int s, i;
  1305. s = 0;
  1306. for(i=0;i<16;i++) {
  1307. s += abs(pix1[0] - pix2[0]);
  1308. s += abs(pix1[1] - pix2[1]);
  1309. s += abs(pix1[2] - pix2[2]);
  1310. s += abs(pix1[3] - pix2[3]);
  1311. s += abs(pix1[4] - pix2[4]);
  1312. s += abs(pix1[5] - pix2[5]);
  1313. s += abs(pix1[6] - pix2[6]);
  1314. s += abs(pix1[7] - pix2[7]);
  1315. s += abs(pix1[8] - pix2[8]);
  1316. s += abs(pix1[9] - pix2[9]);
  1317. s += abs(pix1[10] - pix2[10]);
  1318. s += abs(pix1[11] - pix2[11]);
  1319. s += abs(pix1[12] - pix2[12]);
  1320. s += abs(pix1[13] - pix2[13]);
  1321. s += abs(pix1[14] - pix2[14]);
  1322. s += abs(pix1[15] - pix2[15]);
  1323. pix1 += line_size;
  1324. pix2 += line_size;
  1325. }
  1326. return s;
  1327. }
  1328. int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
  1329. {
  1330. int s, i;
  1331. s = 0;
  1332. for(i=0;i<16;i++) {
  1333. s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
  1334. s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
  1335. s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
  1336. s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
  1337. s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
  1338. s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
  1339. s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
  1340. s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
  1341. s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
  1342. s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
  1343. s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
  1344. s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
  1345. s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
  1346. s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
  1347. s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
  1348. s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
  1349. pix1 += line_size;
  1350. pix2 += line_size;
  1351. }
  1352. return s;
  1353. }
  1354. int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
  1355. {
  1356. int s, i;
  1357. UINT8 *pix3 = pix2 + line_size;
  1358. s = 0;
  1359. for(i=0;i<16;i++) {
  1360. s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
  1361. s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
  1362. s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
  1363. s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
  1364. s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
  1365. s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
  1366. s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
  1367. s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
  1368. s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
  1369. s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
  1370. s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
  1371. s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
  1372. s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
  1373. s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
  1374. s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
  1375. s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
  1376. pix1 += line_size;
  1377. pix2 += line_size;
  1378. pix3 += line_size;
  1379. }
  1380. return s;
  1381. }
  1382. int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
  1383. {
  1384. int s, i;
  1385. UINT8 *pix3 = pix2 + line_size;
  1386. s = 0;
  1387. for(i=0;i<16;i++) {
  1388. s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
  1389. s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
  1390. s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
  1391. s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
  1392. s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
  1393. s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
  1394. s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
  1395. s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
  1396. s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
  1397. s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
  1398. s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
  1399. s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
  1400. s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
  1401. s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
  1402. s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
  1403. s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
  1404. pix1 += line_size;
  1405. pix2 += line_size;
  1406. pix3 += line_size;
  1407. }
  1408. return s;
  1409. }
  1410. int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
  1411. {
  1412. int s, i;
  1413. s = 0;
  1414. for(i=0;i<8;i++) {
  1415. s += abs(pix1[0] - pix2[0]);
  1416. s += abs(pix1[1] - pix2[1]);
  1417. s += abs(pix1[2] - pix2[2]);
  1418. s += abs(pix1[3] - pix2[3]);
  1419. s += abs(pix1[4] - pix2[4]);
  1420. s += abs(pix1[5] - pix2[5]);
  1421. s += abs(pix1[6] - pix2[6]);
  1422. s += abs(pix1[7] - pix2[7]);
  1423. pix1 += line_size;
  1424. pix2 += line_size;
  1425. }
  1426. return s;
  1427. }
  1428. int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
  1429. {
  1430. int s, i;
  1431. s = 0;
  1432. for(i=0;i<8;i++) {
  1433. s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
  1434. s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
  1435. s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
  1436. s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
  1437. s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
  1438. s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
  1439. s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
  1440. s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
  1441. pix1 += line_size;
  1442. pix2 += line_size;
  1443. }
  1444. return s;
  1445. }
  1446. int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
  1447. {
  1448. int s, i;
  1449. UINT8 *pix3 = pix2 + line_size;
  1450. s = 0;
  1451. for(i=0;i<8;i++) {
  1452. s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
  1453. s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
  1454. s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
  1455. s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
  1456. s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
  1457. s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
  1458. s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
  1459. s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
  1460. pix1 += line_size;
  1461. pix2 += line_size;
  1462. pix3 += line_size;
  1463. }
  1464. return s;
  1465. }
  1466. int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
  1467. {
  1468. int s, i;
  1469. UINT8 *pix3 = pix2 + line_size;
  1470. s = 0;
  1471. for(i=0;i<8;i++) {
  1472. s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
  1473. s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
  1474. s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
  1475. s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
  1476. s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
  1477. s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
  1478. s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
  1479. s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
  1480. pix1 += line_size;
  1481. pix2 += line_size;
  1482. pix3 += line_size;
  1483. }
  1484. return s;
  1485. }
  1486. void ff_block_permute(INT16 *block, UINT8 *permutation, const UINT8 *scantable, int last)
  1487. {
  1488. int i;
  1489. INT16 temp[64];
  1490. if(last<=0) return;
  1491. if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
  1492. for(i=0; i<=last; i++){
  1493. const int j= scantable[i];
  1494. temp[j]= block[j];
  1495. block[j]=0;
  1496. }
  1497. for(i=0; i<=last; i++){
  1498. const int j= scantable[i];
  1499. const int perm_j= permutation[j];
  1500. block[perm_j]= temp[j];
  1501. }
  1502. }
/* Zero the six 64-coefficient DCT blocks of one macroblock (4 luma + 2 chroma). */
void clear_blocks_c(DCTELEM *blocks)
{
memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
/* Build the lookup tables and install the portable C implementations in
 * the DSP function pointers; platform-specific init functions run last
 * so they can override individual pointers with optimized versions. */
void dsputil_init(void)
{
int i;
/* cropTbl clamps (value + MAX_NEG_CROP) into the 0..255 pixel range */
for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
for(i=0;i<MAX_NEG_CROP;i++) {
cropTbl[i] = 0;
cropTbl[i + MAX_NEG_CROP + 256] = 255;
}
/* squareTbl[i+256] == i*i for i in [-256,255] (used for SSE/PSNR) */
for(i=0;i<512;i++) {
squareTbl[i] = (i - 256) * (i - 256);
}
/* default: portable C versions */
get_pixels = get_pixels_c;
diff_pixels = diff_pixels_c;
put_pixels_clamped = put_pixels_clamped_c;
add_pixels_clamped = add_pixels_clamped_c;
ff_gmc1= gmc1_c;
ff_gmc= gmc_c;
clear_blocks= clear_blocks_c;
pix_sum= pix_sum_c;
pix_norm1= pix_norm1_c;
pix_abs16x16 = pix_abs16x16_c;
pix_abs16x16_x2 = pix_abs16x16_x2_c;
pix_abs16x16_y2 = pix_abs16x16_y2_c;
pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
pix_abs8x8 = pix_abs8x8_c;
pix_abs8x8_x2 = pix_abs8x8_x2_c;
pix_abs8x8_y2 = pix_abs8x8_y2_c;
pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
/* per-architecture overrides (may replace any of the pointers above) */
#ifdef HAVE_MMX
dsputil_init_mmx();
#endif
#ifdef ARCH_ARMV4L
dsputil_init_armv4l();
#endif
#ifdef HAVE_MLIB
dsputil_init_mlib();
#endif
#ifdef ARCH_ALPHA
dsputil_init_alpha();
#endif
#ifdef ARCH_POWERPC
dsputil_init_ppc();
#endif
#ifdef HAVE_MMI
dsputil_init_mmi();
#endif
/* inverse zig-zag scan table, offset by 1 (0 means "not present") */
for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}
/* remove any non bit exact operation (testing purpose) */
/* Sets the global ff_bit_exact flag and tells the MMX layer to drop any
 * approximated (non-bit-exact) routines, so encoder regression tests
 * produce identical output on all platforms. */
void avcodec_set_bit_exact(void)
{
ff_bit_exact=1;
#ifdef HAVE_MMX
dsputil_set_bit_exact_mmx();
#endif
}
  1563. void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
  1564. int orig_linesize[3], int coded_linesize,
  1565. AVCodecContext *avctx)
  1566. {
  1567. int quad, diff, x, y;
  1568. UINT8 *orig, *coded;
  1569. UINT32 *sq = squareTbl + 256;
  1570. quad = 0;
  1571. diff = 0;
  1572. /* Luminance */
  1573. orig = orig_image[0];
  1574. coded = coded_image[0];
  1575. for (y=0;y<avctx->height;y++) {
  1576. for (x=0;x<avctx->width;x++) {
  1577. diff = *(orig + x) - *(coded + x);
  1578. quad += sq[diff];
  1579. }
  1580. orig += orig_linesize[0];
  1581. coded += coded_linesize;
  1582. }
  1583. avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
  1584. if (avctx->psnr_y) {
  1585. avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
  1586. avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
  1587. } else
  1588. avctx->psnr_y = 99.99;
  1589. }