You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

758 lines
25KB

  1. /*
  2. * VC-1 and WMV3 decoder - DSP functions
  3. * Copyright (c) 2006 Konstantin Shishkov
  4. *
  5. * This file is part of Libav.
  6. *
  7. * Libav is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * Libav is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with Libav; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. /**
  22. * @file
  23. * VC-1 and WMV3 decoder
  24. *
  25. */
  26. #include "vc1dsp.h"
  27. /** Apply overlap transform to horizontal edge
  28. */
  29. static void vc1_v_overlap_c(uint8_t* src, int stride)
  30. {
  31. int i;
  32. int a, b, c, d;
  33. int d1, d2;
  34. int rnd = 1;
  35. for(i = 0; i < 8; i++) {
  36. a = src[-2*stride];
  37. b = src[-stride];
  38. c = src[0];
  39. d = src[stride];
  40. d1 = (a - d + 3 + rnd) >> 3;
  41. d2 = (a - d + b - c + 4 - rnd) >> 3;
  42. src[-2*stride] = a - d1;
  43. src[-stride] = av_clip_uint8(b - d2);
  44. src[0] = av_clip_uint8(c + d2);
  45. src[stride] = d + d1;
  46. src++;
  47. rnd = !rnd;
  48. }
  49. }
  50. /** Apply overlap transform to vertical edge
  51. */
  52. static void vc1_h_overlap_c(uint8_t* src, int stride)
  53. {
  54. int i;
  55. int a, b, c, d;
  56. int d1, d2;
  57. int rnd = 1;
  58. for(i = 0; i < 8; i++) {
  59. a = src[-2];
  60. b = src[-1];
  61. c = src[0];
  62. d = src[1];
  63. d1 = (a - d + 3 + rnd) >> 3;
  64. d2 = (a - d + b - c + 4 - rnd) >> 3;
  65. src[-2] = a - d1;
  66. src[-1] = av_clip_uint8(b - d2);
  67. src[0] = av_clip_uint8(c + d2);
  68. src[1] = d + d1;
  69. src += stride;
  70. rnd = !rnd;
  71. }
  72. }
/**
 * VC-1 in-loop deblocking filter for one pair of pixels across an edge.
 * @param src pointer to the pixel just past the edge being filtered
 * @param stride distance between the two pixels of the pair
 * @param pq block quantizer
 * @return whether the other 3 pairs of the segment should be filtered or not
 * @see 8.6
 */
static av_always_inline int vc1_filter_line(uint8_t* src, int stride, int pq){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clip-to-uint8 via table lookup */
    /* a0: gradient measure across the edge itself */
    int a0 = (2*(src[-2*stride] - src[ 1*stride]) - 5*(src[-1*stride] - src[ 0*stride]) + 4) >> 3;
    int a0_sign = a0 >> 31;        /* Store sign: 0 for >=0, -1 for <0 */
    a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
    if(a0 < pq){
        /* a1/a2: the same gradient measure on the two neighbouring segments */
        int a1 = FFABS((2*(src[-4*stride] - src[-1*stride]) - 5*(src[-3*stride] - src[-2*stride]) + 4) >> 3);
        int a2 = FFABS((2*(src[ 0*stride] - src[ 3*stride]) - 5*(src[ 1*stride] - src[ 2*stride]) + 4) >> 3);
        if(a1 < a0 || a2 < a0){
            int clip = src[-1*stride] - src[ 0*stride];
            int clip_sign = clip >> 31;
            clip = ((clip ^ clip_sign) - clip_sign)>>1; /* clip = FFABS(clip) >> 1 */
            if(clip){
                int a3 = FFMIN(a1, a2);
                int d = 5 * (a3 - a0);
                int d_sign = (d >> 31);
                d = ((d ^ d_sign) - d_sign) >> 3; /* d = FFABS(d) >> 3 */
                d_sign ^= a0_sign;
                /* apply the correction only when d and clip agree in sign */
                if( d_sign ^ clip_sign )
                    d = 0;
                else{
                    d = FFMIN(d, clip);
                    d = (d ^ d_sign) - d_sign; /* Restore sign */
                    src[-1*stride] = cm[src[-1*stride] - d];
                    src[ 0*stride] = cm[src[ 0*stride] + d];
                }
                return 1;
            }
        }
    }
    return 0;
}
  113. /**
  114. * VC-1 in-loop deblocking filter
  115. * @param src source block type
  116. * @param step distance between horizontally adjacent elements
  117. * @param stride distance between vertically adjacent elements
  118. * @param len edge length to filter (4 or 8 pixels)
  119. * @param pq block quantizer
  120. * @see 8.6
  121. */
  122. static inline void vc1_loop_filter(uint8_t* src, int step, int stride, int len, int pq)
  123. {
  124. int i;
  125. int filt3;
  126. for(i = 0; i < len; i += 4){
  127. filt3 = vc1_filter_line(src + 2*step, stride, pq);
  128. if(filt3){
  129. vc1_filter_line(src + 0*step, stride, pq);
  130. vc1_filter_line(src + 1*step, stride, pq);
  131. vc1_filter_line(src + 3*step, stride, pq);
  132. }
  133. src += step * 4;
  134. }
  135. }
/* Loop filter entry points: the "v" variants filter across a horizontal
 * edge (step 1, vertical taps), the "h" variants across a vertical edge
 * (step stride, horizontal taps), for 4-, 8- and 16-pixel long edges. */
static void vc1_v_loop_filter4_c(uint8_t *src, int stride, int pq)
{
    vc1_loop_filter(src, 1, stride, 4, pq);
}

static void vc1_h_loop_filter4_c(uint8_t *src, int stride, int pq)
{
    vc1_loop_filter(src, stride, 1, 4, pq);
}

static void vc1_v_loop_filter8_c(uint8_t *src, int stride, int pq)
{
    vc1_loop_filter(src, 1, stride, 8, pq);
}

static void vc1_h_loop_filter8_c(uint8_t *src, int stride, int pq)
{
    vc1_loop_filter(src, stride, 1, 8, pq);
}

static void vc1_v_loop_filter16_c(uint8_t *src, int stride, int pq)
{
    vc1_loop_filter(src, 1, stride, 16, pq);
}

static void vc1_h_loop_filter16_c(uint8_t *src, int stride, int pq)
{
    vc1_loop_filter(src, stride, 1, 16, pq);
}
  160. /** Do inverse transform on 8x8 block
  161. */
  162. static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
  163. {
  164. int i;
  165. int dc = block[0];
  166. const uint8_t *cm;
  167. dc = (3 * dc + 1) >> 1;
  168. dc = (3 * dc + 16) >> 5;
  169. cm = ff_cropTbl + MAX_NEG_CROP + dc;
  170. for(i = 0; i < 8; i++){
  171. dest[0] = cm[dest[0]];
  172. dest[1] = cm[dest[1]];
  173. dest[2] = cm[dest[2]];
  174. dest[3] = cm[dest[3]];
  175. dest[4] = cm[dest[4]];
  176. dest[5] = cm[dest[5]];
  177. dest[6] = cm[dest[6]];
  178. dest[7] = cm[dest[7]];
  179. dest += linesize;
  180. }
  181. }
/**
 * 8x8 inverse transform core, operating in place on the coefficient block.
 * @param shl left-shift applied to every final output (1 for the
 *            range-reduction variants, 0 otherwise)
 * @param sub value subtracted from each output before the shift
 *            (64 recentres the output for the unsigned rangered "put")
 */
static av_always_inline void vc1_inv_trans_8x8_c(DCTELEM block[64], int shl, int sub)
{
    int i;
    register int t1,t2,t3,t4,t5,t6,t7,t8;
    DCTELEM *src, *dst, temp[64];

    /* First pass (rounding bias 4, downshift 3); the result is written
     * transposed into temp so the second pass can use the same layout. */
    src = block;
    dst = temp;
    for(i = 0; i < 8; i++){
        t1 = 12 * (src[ 0] + src[32]) + 4;
        t2 = 12 * (src[ 0] - src[32]) + 4;
        t3 = 16 * src[16] +  6 * src[48];
        t4 =  6 * src[16] - 16 * src[48];

        t5 = t1 + t3;
        t6 = t2 + t4;
        t7 = t2 - t4;
        t8 = t1 - t3;

        /* odd-coefficient butterflies */
        t1 = 16 * src[ 8] + 15 * src[24] +  9 * src[40] +  4 * src[56];
        t2 = 15 * src[ 8] -  4 * src[24] - 16 * src[40] -  9 * src[56];
        t3 =  9 * src[ 8] - 16 * src[24] +  4 * src[40] + 15 * src[56];
        t4 =  4 * src[ 8] -  9 * src[24] + 15 * src[40] - 16 * src[56];

        dst[0] = (t5 + t1) >> 3;
        dst[1] = (t6 + t2) >> 3;
        dst[2] = (t7 + t3) >> 3;
        dst[3] = (t8 + t4) >> 3;
        dst[4] = (t8 - t4) >> 3;
        dst[5] = (t7 - t3) >> 3;
        dst[6] = (t6 - t2) >> 3;
        dst[7] = (t5 - t1) >> 3;

        src += 1;
        dst += 8;
    }

    /* Second pass (rounding bias 64, downshift 7) back into block.
     * Note the asymmetric rounding: the last four outputs get an extra
     * +1, as required by the VC-1 transform definition. */
    src = temp;
    dst = block;
    for(i = 0; i < 8; i++){
        t1 = 12 * (src[ 0] + src[32]) + 64;
        t2 = 12 * (src[ 0] - src[32]) + 64;
        t3 = 16 * src[16] +  6 * src[48];
        t4 =  6 * src[16] - 16 * src[48];

        t5 = t1 + t3;
        t6 = t2 + t4;
        t7 = t2 - t4;
        t8 = t1 - t3;

        t1 = 16 * src[ 8] + 15 * src[24] +  9 * src[40] +  4 * src[56];
        t2 = 15 * src[ 8] -  4 * src[24] - 16 * src[40] -  9 * src[56];
        t3 =  9 * src[ 8] - 16 * src[24] +  4 * src[40] + 15 * src[56];
        t4 =  4 * src[ 8] -  9 * src[24] + 15 * src[40] - 16 * src[56];

        dst[ 0] = (((t5 + t1    ) >> 7) - sub) << shl;
        dst[ 8] = (((t6 + t2    ) >> 7) - sub) << shl;
        dst[16] = (((t7 + t3    ) >> 7) - sub) << shl;
        dst[24] = (((t8 + t4    ) >> 7) - sub) << shl;
        dst[32] = (((t8 - t4 + 1) >> 7) - sub) << shl;
        dst[40] = (((t7 - t3 + 1) >> 7) - sub) << shl;
        dst[48] = (((t6 - t2 + 1) >> 7) - sub) << shl;
        dst[56] = (((t5 - t1 + 1) >> 7) - sub) << shl;

        src++;
        dst++;
    }
}
/* 8x8 transform entry points. The "rangered" variants pass shl=1 (and,
 * for the unsigned put, sub=64) to vc1_inv_trans_8x8_c so the output is
 * rescaled for range-reduction mode before being stored. */
static void vc1_inv_trans_8x8_add_c(uint8_t *dest, int linesize, DCTELEM *block)
{
    vc1_inv_trans_8x8_c(block, 0, 0);
    ff_add_pixels_clamped_c(block, dest, linesize);
}

static void vc1_inv_trans_8x8_put_signed_c(uint8_t *dest, int linesize, DCTELEM *block)
{
    vc1_inv_trans_8x8_c(block, 0, 0);
    ff_put_signed_pixels_clamped_c(block, dest, linesize);
}

static void vc1_inv_trans_8x8_put_signed_rangered_c(uint8_t *dest, int linesize, DCTELEM *block)
{
    vc1_inv_trans_8x8_c(block, 1, 0);
    ff_put_signed_pixels_clamped_c(block, dest, linesize);
}

static void vc1_inv_trans_8x8_put_c(uint8_t *dest, int linesize, DCTELEM *block)
{
    vc1_inv_trans_8x8_c(block, 0, 0);
    ff_put_pixels_clamped_c(block, dest, linesize);
}

static void vc1_inv_trans_8x8_put_rangered_c(uint8_t *dest, int linesize, DCTELEM *block)
{
    vc1_inv_trans_8x8_c(block, 1, 64);
    ff_put_pixels_clamped_c(block, dest, linesize);
}
  265. /** Do inverse transform on 8x4 part of block
  266. */
  267. static void vc1_inv_trans_8x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
  268. {
  269. int i;
  270. int dc = block[0];
  271. const uint8_t *cm;
  272. dc = ( 3 * dc + 1) >> 1;
  273. dc = (17 * dc + 64) >> 7;
  274. cm = ff_cropTbl + MAX_NEG_CROP + dc;
  275. for(i = 0; i < 4; i++){
  276. dest[0] = cm[dest[0]];
  277. dest[1] = cm[dest[1]];
  278. dest[2] = cm[dest[2]];
  279. dest[3] = cm[dest[3]];
  280. dest[4] = cm[dest[4]];
  281. dest[5] = cm[dest[5]];
  282. dest[6] = cm[dest[6]];
  283. dest[7] = cm[dest[7]];
  284. dest += linesize;
  285. }
  286. }
/** Do inverse transform on an 8x4 part of a block: an 8-point transform
 *  horizontally (in place), then a 4-point transform vertically whose
 *  output is added to the destination pixels with clipping.
 */
static void vc1_inv_trans_8x4_c(uint8_t *dest, int linesize, DCTELEM *block)
{
    int i;
    register int t1,t2,t3,t4,t5,t6,t7,t8;
    DCTELEM *src, *dst;
    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clip-to-uint8 table */

    /* Horizontal 8-point pass over the 4 rows (bias 4, downshift 3). */
    src = block;
    dst = block;
    for(i = 0; i < 4; i++){
        t1 = 12 * (src[0] + src[4]) + 4;
        t2 = 12 * (src[0] - src[4]) + 4;
        t3 = 16 * src[2] +  6 * src[6];
        t4 =  6 * src[2] - 16 * src[6];

        t5 = t1 + t3;
        t6 = t2 + t4;
        t7 = t2 - t4;
        t8 = t1 - t3;

        t1 = 16 * src[1] + 15 * src[3] +  9 * src[5] +  4 * src[7];
        t2 = 15 * src[1] -  4 * src[3] - 16 * src[5] -  9 * src[7];
        t3 =  9 * src[1] - 16 * src[3] +  4 * src[5] + 15 * src[7];
        t4 =  4 * src[1] -  9 * src[3] + 15 * src[5] - 16 * src[7];

        dst[0] = (t5 + t1) >> 3;
        dst[1] = (t6 + t2) >> 3;
        dst[2] = (t7 + t3) >> 3;
        dst[3] = (t8 + t4) >> 3;
        dst[4] = (t8 - t4) >> 3;
        dst[5] = (t7 - t3) >> 3;
        dst[6] = (t6 - t2) >> 3;
        dst[7] = (t5 - t1) >> 3;

        src += 8;
        dst += 8;
    }

    /* Vertical 4-point pass over the 8 columns (bias 64, downshift 7),
     * added to dest and clipped through the crop table. */
    src = block;
    for(i = 0; i < 8; i++){
        t1 = 17 * (src[ 0] + src[16]) + 64;
        t2 = 17 * (src[ 0] - src[16]) + 64;
        t3 = 22 * src[ 8] + 10 * src[24];
        t4 = 22 * src[24] - 10 * src[ 8];

        dest[0*linesize] = cm[dest[0*linesize] + ((t1 + t3) >> 7)];
        dest[1*linesize] = cm[dest[1*linesize] + ((t2 - t4) >> 7)];
        dest[2*linesize] = cm[dest[2*linesize] + ((t2 + t4) >> 7)];
        dest[3*linesize] = cm[dest[3*linesize] + ((t1 - t3) >> 7)];

        src ++;
        dest++;
    }
}
  333. /** Do inverse transform on 4x8 parts of block
  334. */
  335. static void vc1_inv_trans_4x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
  336. {
  337. int i;
  338. int dc = block[0];
  339. const uint8_t *cm;
  340. dc = (17 * dc + 4) >> 3;
  341. dc = (12 * dc + 64) >> 7;
  342. cm = ff_cropTbl + MAX_NEG_CROP + dc;
  343. for(i = 0; i < 8; i++){
  344. dest[0] = cm[dest[0]];
  345. dest[1] = cm[dest[1]];
  346. dest[2] = cm[dest[2]];
  347. dest[3] = cm[dest[3]];
  348. dest += linesize;
  349. }
  350. }
/** Do inverse transform on a 4x8 part of a block: a 4-point transform
 *  horizontally (in place), then an 8-point transform vertically whose
 *  output is added to the destination pixels with clipping.
 */
static void vc1_inv_trans_4x8_c(uint8_t *dest, int linesize, DCTELEM *block)
{
    int i;
    register int t1,t2,t3,t4,t5,t6,t7,t8;
    DCTELEM *src, *dst;
    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clip-to-uint8 table */

    /* Horizontal 4-point pass over the 8 rows (bias 4, downshift 3). */
    src = block;
    dst = block;
    for(i = 0; i < 8; i++){
        t1 = 17 * (src[0] + src[2]) + 4;
        t2 = 17 * (src[0] - src[2]) + 4;
        t3 = 22 * src[1] + 10 * src[3];
        t4 = 22 * src[3] - 10 * src[1];

        dst[0] = (t1 + t3) >> 3;
        dst[1] = (t2 - t4) >> 3;
        dst[2] = (t2 + t4) >> 3;
        dst[3] = (t1 - t3) >> 3;

        src += 8;
        dst += 8;
    }

    /* Vertical 8-point pass over the 4 columns (bias 64, downshift 7),
     * added to dest and clipped. The last four rows get an extra +1,
     * mirroring the asymmetric rounding of the 8x8 transform. */
    src = block;
    for(i = 0; i < 4; i++){
        t1 = 12 * (src[ 0] + src[32]) + 64;
        t2 = 12 * (src[ 0] - src[32]) + 64;
        t3 = 16 * src[16] +  6 * src[48];
        t4 =  6 * src[16] - 16 * src[48];

        t5 = t1 + t3;
        t6 = t2 + t4;
        t7 = t2 - t4;
        t8 = t1 - t3;

        t1 = 16 * src[ 8] + 15 * src[24] +  9 * src[40] +  4 * src[56];
        t2 = 15 * src[ 8] -  4 * src[24] - 16 * src[40] -  9 * src[56];
        t3 =  9 * src[ 8] - 16 * src[24] +  4 * src[40] + 15 * src[56];
        t4 =  4 * src[ 8] -  9 * src[24] + 15 * src[40] - 16 * src[56];

        dest[0*linesize] = cm[dest[0*linesize] + ((t5 + t1    ) >> 7)];
        dest[1*linesize] = cm[dest[1*linesize] + ((t6 + t2    ) >> 7)];
        dest[2*linesize] = cm[dest[2*linesize] + ((t7 + t3    ) >> 7)];
        dest[3*linesize] = cm[dest[3*linesize] + ((t8 + t4    ) >> 7)];
        dest[4*linesize] = cm[dest[4*linesize] + ((t8 - t4 + 1) >> 7)];
        dest[5*linesize] = cm[dest[5*linesize] + ((t7 - t3 + 1) >> 7)];
        dest[6*linesize] = cm[dest[6*linesize] + ((t6 - t2 + 1) >> 7)];
        dest[7*linesize] = cm[dest[7*linesize] + ((t5 - t1 + 1) >> 7)];

        src ++;
        dest++;
    }
}
  397. /** Do inverse transform on 4x4 part of block
  398. */
  399. static void vc1_inv_trans_4x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
  400. {
  401. int i;
  402. int dc = block[0];
  403. const uint8_t *cm;
  404. dc = (17 * dc + 4) >> 3;
  405. dc = (17 * dc + 64) >> 7;
  406. cm = ff_cropTbl + MAX_NEG_CROP + dc;
  407. for(i = 0; i < 4; i++){
  408. dest[0] = cm[dest[0]];
  409. dest[1] = cm[dest[1]];
  410. dest[2] = cm[dest[2]];
  411. dest[3] = cm[dest[3]];
  412. dest += linesize;
  413. }
  414. }
/** Do inverse transform on a 4x4 part of a block: a 4-point transform
 *  horizontally (in place), then a 4-point transform vertically whose
 *  output is added to the destination pixels with clipping.
 */
static void vc1_inv_trans_4x4_c(uint8_t *dest, int linesize, DCTELEM *block)
{
    int i;
    register int t1,t2,t3,t4;
    DCTELEM *src, *dst;
    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clip-to-uint8 table */

    /* Horizontal 4-point pass over the 4 rows (bias 4, downshift 3). */
    src = block;
    dst = block;
    for(i = 0; i < 4; i++){
        t1 = 17 * (src[0] + src[2]) + 4;
        t2 = 17 * (src[0] - src[2]) + 4;
        t3 = 22 * src[1] + 10 * src[3];
        t4 = 22 * src[3] - 10 * src[1];

        dst[0] = (t1 + t3) >> 3;
        dst[1] = (t2 - t4) >> 3;
        dst[2] = (t2 + t4) >> 3;
        dst[3] = (t1 - t3) >> 3;

        src += 8;
        dst += 8;
    }

    /* Vertical 4-point pass over the 4 columns (bias 64, downshift 7),
     * added to dest and clipped through the crop table. */
    src = block;
    for(i = 0; i < 4; i++){
        t1 = 17 * (src[ 0] + src[16]) + 64;
        t2 = 17 * (src[ 0] - src[16]) + 64;
        t3 = 22 * src[ 8] + 10 * src[24];
        t4 = 22 * src[24] - 10 * src[ 8];

        dest[0*linesize] = cm[dest[0*linesize] + ((t1 + t3) >> 7)];
        dest[1*linesize] = cm[dest[1*linesize] + ((t2 - t4) >> 7)];
        dest[2*linesize] = cm[dest[2*linesize] + ((t2 + t4) >> 7)];
        dest[3*linesize] = cm[dest[3*linesize] + ((t1 - t3) >> 7)];

        src ++;
        dest++;
    }
}
/* motion compensation functions */

/** Generate a 16-bit-intermediate bicubic filter for one direction.
 *  Used when filtering in both directions: the first (vertical) pass
 *  keeps extra precision that the second (horizontal) pass consumes.
 */
#define VC1_MSPEL_FILTER_16B(DIR, TYPE) \
static av_always_inline int vc1_mspel_ ## DIR ## _filter_16bits(const TYPE *src, int stride, int mode) \
{ \
    switch(mode){ \
    case 0: /* no shift - should not occur */ \
        return 0; \
    case 1: /* 1/4 shift */ \
        return -4*src[-stride] + 53*src[0] + 18*src[stride] - 3*src[stride*2]; \
    case 2: /* 1/2 shift */ \
        return -src[-stride] + 9*src[0] + 9*src[stride] - src[stride*2]; \
    case 3: /* 3/4 shift */ \
        return -3*src[-stride] + 18*src[0] + 53*src[stride] - 4*src[stride*2]; \
    } \
    return 0; /* should not occur */ \
}

/* vertical pass reads 8-bit pixels, horizontal pass reads 16-bit temps */
VC1_MSPEL_FILTER_16B(ver, uint8_t);
VC1_MSPEL_FILTER_16B(hor, int16_t);
/** Filter used to interpolate fractional pel values when only one
 *  direction needs filtering; r is the rounding bias (derived from rnd).
 */
static av_always_inline int vc1_mspel_filter(const uint8_t *src, int stride, int mode, int r)
{
    switch(mode){
    case 0: //no shift
        return src[0];
    case 1: // 1/4 shift
        return (-4*src[-stride] + 53*src[0] + 18*src[stride] - 3*src[stride*2] + 32 - r) >> 6;
    case 2: // 1/2 shift
        return (-src[-stride] + 9*src[0] + 9*src[stride] - src[stride*2] + 8 - r) >> 4;
    case 3: // 3/4 shift
        return (-3*src[-stride] + 18*src[0] + 53*src[stride] - 4*src[stride*2] + 32 - r) >> 6;
    }
    return 0; //should not occur
}
/** Function used to do motion compensation with bicubic interpolation.
 *  hmode/vmode select the fractional-pel shift (0..3) in each direction;
 *  rnd selects between the two VC-1 rounding modes. OP decides whether
 *  results are stored (put) or averaged with the destination (avg).
 */
#define VC1_MSPEL_MC(OP, OPNAME)\
static void OPNAME ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride, int hmode, int vmode, int rnd)\
{\
    int i, j;\
\
    if (vmode) { /* Horizontal filter to apply */\
        int r;\
\
        if (hmode) { /* Vertical filter to apply, output to tmp */\
            static const int shift_value[] = { 0, 5, 1, 5 };\
            int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
            int16_t tmp[11*8], *tptr = tmp;  /* 11 columns: 8 + filter margin */\
\
            r = (1<<(shift-1)) + rnd-1;\
\
            src -= 1;\
            for(j = 0; j < 8; j++) {\
                for(i = 0; i < 11; i++)\
                    tptr[i] = (vc1_mspel_ver_filter_16bits(src + i, stride, vmode)+r)>>shift;\
                src += stride;\
                tptr += 11;\
            }\
\
            r = 64-rnd;\
            tptr = tmp+1;\
            for(j = 0; j < 8; j++) {\
                for(i = 0; i < 8; i++)\
                    OP(dst[i], (vc1_mspel_hor_filter_16bits(tptr + i, 1, hmode)+r)>>7);\
                dst += stride;\
                tptr += 11;\
            }\
\
            return;\
        }\
        else { /* No horizontal filter, output 8 lines to dst */\
            r = 1-rnd;\
\
            for(j = 0; j < 8; j++) {\
                for(i = 0; i < 8; i++)\
                    OP(dst[i], vc1_mspel_filter(src + i, stride, vmode, r));\
                src += stride;\
                dst += stride;\
            }\
            return;\
        }\
    }\
\
    /* Horizontal mode with no vertical mode */\
    for(j = 0; j < 8; j++) {\
        for(i = 0; i < 8; i++)\
            OP(dst[i], vc1_mspel_filter(src + i, 1, hmode, rnd));\
        dst += stride;\
        src += stride;\
    }\
}

/* store clipped value / average clipped value with destination */
#define op_put(a, b) a = av_clip_uint8(b)
#define op_avg(a, b) a = (a + av_clip_uint8(b) + 1) >> 1
VC1_MSPEL_MC(op_put, put_)
VC1_MSPEL_MC(op_avg, avg_)
/* pixel functions - really are entry points to vc1_mspel_mc */

/** Instantiate the put/avg entry points for one (hmode, vmode) pair. */
#define PUT_VC1_MSPEL(a, b)\
static void put_vc1_mspel_mc ## a ## b ##_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
     put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ##_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
     avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}

/* all fractional-pel positions except (0,0), which is plain pixel copy */
PUT_VC1_MSPEL(1, 0)
PUT_VC1_MSPEL(2, 0)
PUT_VC1_MSPEL(3, 0)

PUT_VC1_MSPEL(0, 1)
PUT_VC1_MSPEL(1, 1)
PUT_VC1_MSPEL(2, 1)
PUT_VC1_MSPEL(3, 1)

PUT_VC1_MSPEL(0, 2)
PUT_VC1_MSPEL(1, 2)
PUT_VC1_MSPEL(2, 2)
PUT_VC1_MSPEL(3, 2)

PUT_VC1_MSPEL(0, 3)
PUT_VC1_MSPEL(1, 3)
PUT_VC1_MSPEL(2, 3)
PUT_VC1_MSPEL(3, 3)
  568. static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
  569. const int A=(8-x)*(8-y);
  570. const int B=( x)*(8-y);
  571. const int C=(8-x)*( y);
  572. const int D=( x)*( y);
  573. int i;
  574. assert(x<8 && y<8 && x>=0 && y>=0);
  575. for(i=0; i<h; i++)
  576. {
  577. dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
  578. dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
  579. dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
  580. dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
  581. dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
  582. dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
  583. dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
  584. dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
  585. dst+= stride;
  586. src+= stride;
  587. }
  588. }
  589. #define avg2(a,b) ((a+b+1)>>1)
  590. static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
  591. const int A=(8-x)*(8-y);
  592. const int B=( x)*(8-y);
  593. const int C=(8-x)*( y);
  594. const int D=( x)*( y);
  595. int i;
  596. assert(x<8 && y<8 && x>=0 && y>=0);
  597. for(i=0; i<h; i++)
  598. {
  599. dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
  600. dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
  601. dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
  602. dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
  603. dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
  604. dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
  605. dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
  606. dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
  607. dst+= stride;
  608. src+= stride;
  609. }
  610. }
/** Fill a VC1DSPContext with the C implementations, then let the
 *  platform-specific initializers override entries where available. */
av_cold void ff_vc1dsp_init(VC1DSPContext* dsp) {
    /* inverse transforms; [0] = normal, [1] = range-reduced variant */
    dsp->vc1_inv_trans_8x8_add = vc1_inv_trans_8x8_add_c;
    dsp->vc1_inv_trans_8x8_put_signed[0] = vc1_inv_trans_8x8_put_signed_c;
    dsp->vc1_inv_trans_8x8_put_signed[1] = vc1_inv_trans_8x8_put_signed_rangered_c;
    dsp->vc1_inv_trans_8x8_put[0] = vc1_inv_trans_8x8_put_c;
    dsp->vc1_inv_trans_8x8_put[1] = vc1_inv_trans_8x8_put_rangered_c;
    dsp->vc1_inv_trans_4x8 = vc1_inv_trans_4x8_c;
    dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_c;
    dsp->vc1_inv_trans_4x4 = vc1_inv_trans_4x4_c;
    /* DC-only fast paths */
    dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_c;
    dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_c;
    dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_c;
    dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_c;
    /* overlap smoothing and in-loop deblocking */
    dsp->vc1_h_overlap = vc1_h_overlap_c;
    dsp->vc1_v_overlap = vc1_v_overlap_c;
    dsp->vc1_v_loop_filter4 = vc1_v_loop_filter4_c;
    dsp->vc1_h_loop_filter4 = vc1_h_loop_filter4_c;
    dsp->vc1_v_loop_filter8 = vc1_v_loop_filter8_c;
    dsp->vc1_h_loop_filter8 = vc1_h_loop_filter8_c;
    dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_c;
    dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_c;
    /* luma MC tables, indexed by vmode*4 + hmode; [0] is plain copy */
    dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_pixels8x8_c;
    dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_c;
    dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_c;
    dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_c;
    dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_c;
    dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_c;
    dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_c;
    dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_c;
    dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_c;
    dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_c;
    dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_c;
    dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_c;
    dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_c;
    dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_c;
    dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_c;
    dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_c;
    dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_pixels8x8_c;
    dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_c;
    dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_c;
    dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_c;
    dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_c;
    dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_c;
    dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_c;
    dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_c;
    dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_c;
    dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_c;
    dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_c;
    dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_c;
    dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_c;
    dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_c;
    dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_c;
    dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_c;
    /* chroma MC */
    dsp->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
    dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;

    if (HAVE_ALTIVEC)
        ff_vc1dsp_init_altivec(dsp);
    if (HAVE_MMX)
        ff_vc1dsp_init_mmx(dsp);
}