You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

343 lines
10KB

  /*
   * MMX optimized forward DCT
   * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
   *
   * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
   *
   * Intel Application Note AP-922 - fast, precise implementation of DCT
   * http://developer.intel.com/vtune/cbts/appnotes.htm
   */
  10. #include "../common.h"
  11. #include "mmx.h"
  /* Request `align`-byte alignment for the declared object (GCC attribute). */
  #define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))

  //////////////////////////////////////////////////////////////////////
  //
  // constants for the forward DCT
  // -----------------------------
  //
  // Be sure to check that your compiler is aligning all constants to QWORD
  // (8-byte) memory boundaries! Otherwise the unaligned memory access will
  // severely stall MMX execution.
  //
  //////////////////////////////////////////////////////////////////////

  #define BITS_FRW_ACC 3 //; 2 or 3 for accuracy

  /* Left shift applied to the inputs of the column pass. */
  #define SHIFT_FRW_COL BITS_FRW_ACC
  /* Right shift applied to the 32-bit sums of the row pass. */
  #define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)

  /* Rounding terms: half of one output unit for the matching shift. */
  #define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
  #define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))

  /*
   * Concatenated tangent table for the column pass; each constant is
   * replicated four times so one 64-bit load feeds all four word lanes.
   * The third value is stored minus 2^16 so that it fits in an int16_t.
   */
  static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = {
      13036, 13036, 13036, 13036,     // tan(1*pi/16) * 2^16, rounded
      27146, 27146, 27146, 27146,     // tan(2*pi/16) * 2^16, rounded
      -21746, -21746, -21746, -21746, // tan(3*pi/16) * 2^16 - 2^16, rounded
  };

  static const int16_t cos_4_16[4] ATTR_ALIGN(8) = {
      -19195, -19195, -19195, -19195, // cos(pi/4) * 2^16 - 2^16, rounded
  };

  static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
      23170, 23170, 23170, 23170,     // cos(pi/4) * 2^15
  };

  /* 0x0001 in every 16-bit lane; OR-ed into column-pass results. */
  static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;

  /*
   * Row-pass rounder, one copy per 32-bit lane of an MMX register.
   * The element type must be exactly 32 bits wide: the row routines fetch
   * both elements with a single 64-bit access, and with `long` (64 bits on
   * LP64 targets) that access would only see {RND_FRW_ROW, 0}.
   */
  static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = { RND_FRW_ROW, RND_FRW_ROW };
  44. static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table
  45. 16384, 16384, -8867, -21407,
  46. 16384, 16384, 21407, 8867,
  47. 16384, -16384, 21407, -8867,
  48. -16384, 16384, 8867, -21407,
  49. 22725, 19266, -22725, -12873,
  50. 12873, 4520, 19266, -4520,
  51. 12873, -22725, 19266, -22725,
  52. 4520, 19266, 4520, -12873,
  53. 22725, 22725, -12299, -29692,
  54. 22725, 22725, 29692, 12299,
  55. 22725, -22725, 29692, -12299,
  56. -22725, 22725, 12299, -29692,
  57. 31521, 26722, -31521, -17855,
  58. 17855, 6270, 26722, -6270,
  59. 17855, -31521, 26722, -31521,
  60. 6270, 26722, 6270, -17855,
  61. 21407, 21407, -11585, -27969,
  62. 21407, 21407, 27969, 11585,
  63. 21407, -21407, 27969, -11585,
  64. -21407, 21407, 11585, -27969,
  65. 29692, 25172, -29692, -16819,
  66. 16819, 5906, 25172, -5906,
  67. 16819, -29692, 25172, -29692,
  68. 5906, 25172, 5906, -16819,
  69. 19266, 19266, -10426, -25172,
  70. 19266, 19266, 25172, 10426,
  71. 19266, -19266, 25172, -10426,
  72. -19266, 19266, 10426, -25172,
  73. 26722, 22654, -26722, -15137,
  74. 15137, 5315, 22654, -5315,
  75. 15137, -26722, 22654, -26722,
  76. 5315, 22654, 5315, -15137,
  77. 16384, 16384, -8867, -21407,
  78. 16384, 16384, 21407, 8867,
  79. 16384, -16384, 21407, -8867,
  80. -16384, 16384, 8867, -21407,
  81. 22725, 19266, -22725, -12873,
  82. 12873, 4520, 19266, -4520,
  83. 12873, -22725, 19266, -22725,
  84. 4520, 19266, 4520, -12873,
  85. 19266, 19266, -10426, -25172,
  86. 19266, 19266, 25172, 10426,
  87. 19266, -19266, 25172, -10426,
  88. -19266, 19266, 10426, -25172,
  89. 26722, 22654, -26722, -15137,
  90. 15137, 5315, 22654, -5315,
  91. 15137, -26722, 22654, -26722,
  92. 5315, 22654, 5315, -15137,
  93. 21407, 21407, -11585, -27969,
  94. 21407, 21407, 27969, 11585,
  95. 21407, -21407, 27969, -11585,
  96. -21407, 21407, 11585, -27969,
  97. 29692, 25172, -29692, -16819,
  98. 16819, 5906, 25172, -5906,
  99. 16819, -29692, 25172, -29692,
  100. 5906, 25172, 5906, -16819,
  101. 22725, 22725, -12299, -29692,
  102. 22725, 22725, 29692, 12299,
  103. 22725, -22725, 29692, -12299,
  104. -22725, 22725, 12299, -29692,
  105. 31521, 26722, -31521, -17855,
  106. 17855, 6270, 26722, -6270,
  107. 17855, -31521, 26722, -31521,
  108. 6270, 26722, 6270, -17855,
  109. };
  110. static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset)
  111. {
  112. movq_m2r(*(in + offset + 1 * 8), mm0);
  113. movq_m2r(*(in + offset + 6 * 8), mm1);
  114. movq_r2r(mm0, mm2);
  115. movq_m2r(*(in + offset + 2 * 8), mm3);
  116. paddsw_r2r(mm1, mm0);
  117. movq_m2r(*(in + offset + 5 * 8), mm4);
  118. psllw_i2r(SHIFT_FRW_COL, mm0);
  119. movq_m2r(*(in + offset + 0 * 8), mm5);
  120. paddsw_r2r(mm3, mm4);
  121. paddsw_m2r(*(in + offset + 7 * 8), mm5);
  122. psllw_i2r(SHIFT_FRW_COL, mm4);
  123. movq_r2r(mm0, mm6);
  124. psubsw_r2r(mm1, mm2);
  125. movq_m2r(*(fdct_tg_all_16 + 4), mm1);
  126. psubsw_r2r(mm4, mm0);
  127. movq_m2r(*(in + offset + 3 * 8), mm7);
  128. pmulhw_r2r(mm0, mm1);
  129. paddsw_m2r(*(in + offset + 4 * 8), mm7);
  130. psllw_i2r(SHIFT_FRW_COL, mm5);
  131. paddsw_r2r(mm4, mm6);
  132. psllw_i2r(SHIFT_FRW_COL, mm7);
  133. movq_r2r(mm5, mm4);
  134. psubsw_r2r(mm7, mm5);
  135. paddsw_r2r(mm5, mm1);
  136. paddsw_r2r(mm7, mm4);
  137. por_m2r(fdct_one_corr, mm1);
  138. psllw_i2r(SHIFT_FRW_COL + 1, mm2);
  139. pmulhw_m2r(*(fdct_tg_all_16 + 4), mm5);
  140. movq_r2r(mm4, mm7);
  141. psubsw_m2r(*(in + offset + 5 * 8), mm3);
  142. psubsw_r2r(mm6, mm4);
  143. movq_r2m(mm1, *(out + offset + 2 * 8));
  144. paddsw_r2r(mm6, mm7);
  145. movq_m2r(*(in + offset + 3 * 8), mm1);
  146. psllw_i2r(SHIFT_FRW_COL + 1, mm3);
  147. psubsw_m2r(*(in + offset + 4 * 8), mm1);
  148. movq_r2r(mm2, mm6);
  149. movq_r2m(mm4, *(out + offset + 4 * 8));
  150. paddsw_r2r(mm3, mm2);
  151. pmulhw_m2r(*ocos_4_16, mm2);
  152. psubsw_r2r(mm3, mm6);
  153. pmulhw_m2r(*ocos_4_16, mm6);
  154. psubsw_r2r(mm0, mm5);
  155. por_m2r(fdct_one_corr, mm5);
  156. psllw_i2r(SHIFT_FRW_COL, mm1);
  157. por_m2r(fdct_one_corr, mm2);
  158. movq_r2r(mm1, mm4);
  159. movq_m2r(*(in + offset + 0 * 8), mm3);
  160. paddsw_r2r(mm6, mm1);
  161. psubsw_m2r(*(in + offset + 7 * 8), mm3);
  162. psubsw_r2r(mm6, mm4);
  163. movq_m2r(*(fdct_tg_all_16 + 0), mm0);
  164. psllw_i2r(SHIFT_FRW_COL, mm3);
  165. movq_m2r(*(fdct_tg_all_16 + 8), mm6);
  166. pmulhw_r2r(mm1, mm0);
  167. movq_r2m(mm7, *(out + offset + 0 * 8));
  168. pmulhw_r2r(mm4, mm6);
  169. movq_r2m(mm5, *(out + offset + 6 * 8));
  170. movq_r2r(mm3, mm7);
  171. movq_m2r(*(fdct_tg_all_16 + 8), mm5);
  172. psubsw_r2r(mm2, mm7);
  173. paddsw_r2r(mm2, mm3);
  174. pmulhw_r2r(mm7, mm5);
  175. paddsw_r2r(mm3, mm0);
  176. paddsw_r2r(mm4, mm6);
  177. pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3);
  178. por_m2r(fdct_one_corr, mm0);
  179. paddsw_r2r(mm7, mm5);
  180. psubsw_r2r(mm6, mm7);
  181. movq_r2m(mm0, *(out + offset + 1 * 8));
  182. paddsw_r2r(mm4, mm5);
  183. movq_r2m(mm7, *(out + offset + 3 * 8));
  184. psubsw_r2r(mm1, mm3);
  185. movq_r2m(mm5, *(out + offset + 5 * 8));
  186. movq_r2m(mm3, *(out + offset + 7 * 8));
  187. }
  188. static always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
  189. {
  190. pshufw_m2r(*(in + 4), mm5, 0x1B);
  191. movq_m2r(*(in + 0), mm0);
  192. movq_r2r(mm0, mm1);
  193. paddsw_r2r(mm5, mm0);
  194. psubsw_r2r(mm5, mm1);
  195. pshufw_r2r(mm0, mm2, 0x4E);
  196. pshufw_r2r(mm1, mm3, 0x4E);
  197. movq_m2r(*(table + 0), mm4);
  198. movq_m2r(*(table + 4), mm6);
  199. movq_m2r(*(table + 16), mm5);
  200. movq_m2r(*(table + 20), mm7);
  201. pmaddwd_r2r(mm0, mm4);
  202. pmaddwd_r2r(mm1, mm5);
  203. pmaddwd_r2r(mm2, mm6);
  204. pmaddwd_r2r(mm3, mm7);
  205. pmaddwd_m2r(*(table + 8), mm0);
  206. pmaddwd_m2r(*(table + 12), mm2);
  207. pmaddwd_m2r(*(table + 24), mm1);
  208. pmaddwd_m2r(*(table + 28), mm3);
  209. paddd_r2r(mm6, mm4);
  210. paddd_r2r(mm7, mm5);
  211. paddd_r2r(mm2, mm0);
  212. paddd_r2r(mm3, mm1);
  213. movq_m2r(*fdct_r_row, mm7);
  214. paddd_r2r(mm7, mm4);
  215. paddd_r2r(mm7, mm5);
  216. paddd_r2r(mm7, mm0);
  217. paddd_r2r(mm7, mm1);
  218. psrad_i2r(SHIFT_FRW_ROW, mm4);
  219. psrad_i2r(SHIFT_FRW_ROW, mm5);
  220. psrad_i2r(SHIFT_FRW_ROW, mm0);
  221. psrad_i2r(SHIFT_FRW_ROW, mm1);
  222. packssdw_r2r(mm0, mm4);
  223. packssdw_r2r(mm1, mm5);
  224. movq_r2r(mm4, mm2);
  225. punpcklwd_r2r(mm5, mm4);
  226. punpckhwd_r2r(mm5, mm2);
  227. movq_r2m(mm4, *(out + 0));
  228. movq_r2m(mm2, *(out + 4));
  229. }
  230. static always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
  231. {
  232. movd_m2r(*(in + 6), mm1);
  233. punpcklwd_m2r(*(in + 4), mm1);
  234. movq_r2r(mm1, mm2);
  235. psrlq_i2r(0x20, mm1);
  236. movq_m2r(*(in + 0), mm0);
  237. punpcklwd_r2r(mm2, mm1);
  238. movq_r2r(mm0, mm5);
  239. paddsw_r2r(mm1, mm0);
  240. psubsw_r2r(mm1, mm5);
  241. movq_r2r(mm0, mm1);
  242. movq_r2r(mm5, mm6);
  243. punpckldq_r2r(mm5, mm3);
  244. punpckhdq_r2r(mm3, mm6);
  245. movq_m2r(*(table + 0), mm3);
  246. movq_m2r(*(table + 4), mm4);
  247. punpckldq_r2r(mm0, mm2);
  248. pmaddwd_r2r(mm0, mm3);
  249. punpckhdq_r2r(mm2, mm1);
  250. movq_m2r(*(table + 16), mm2);
  251. pmaddwd_r2r(mm1, mm4);
  252. pmaddwd_m2r(*(table + 8), mm0);
  253. movq_m2r(*(table + 20), mm7);
  254. pmaddwd_r2r(mm5, mm2);
  255. paddd_m2r(*fdct_r_row, mm3);
  256. pmaddwd_r2r(mm6, mm7);
  257. pmaddwd_m2r(*(table + 12), mm1);
  258. paddd_r2r(mm4, mm3);
  259. pmaddwd_m2r(*(table + 24), mm5);
  260. pmaddwd_m2r(*(table + 28), mm6);
  261. paddd_r2r(mm7, mm2);
  262. paddd_m2r(*fdct_r_row, mm0);
  263. psrad_i2r(SHIFT_FRW_ROW, mm3);
  264. paddd_m2r(*fdct_r_row, mm2);
  265. paddd_r2r(mm1, mm0);
  266. paddd_m2r(*fdct_r_row, mm5);
  267. psrad_i2r(SHIFT_FRW_ROW, mm2);
  268. paddd_r2r(mm6, mm5);
  269. psrad_i2r(SHIFT_FRW_ROW, mm0);
  270. psrad_i2r(SHIFT_FRW_ROW, mm5);
  271. packssdw_r2r(mm0, mm3);
  272. packssdw_r2r(mm5, mm2);
  273. movq_r2r(mm3, mm6);
  274. punpcklwd_r2r(mm2, mm3);
  275. punpckhwd_r2r(mm2, mm6);
  276. movq_r2m(mm3, *(out + 0));
  277. movq_r2m(mm6, *(out + 4));
  278. }
  279. void ff_fdct_mmx(int16_t *block)
  280. {
  281. int64_t align_tmp[16] ATTR_ALIGN(8);
  282. int16_t * const block_tmp= (int16_t*)align_tmp;
  283. int16_t *block1, *out;
  284. const int16_t *table;
  285. int i;
  286. block1 = block_tmp;
  287. fdct_col(block, block1, 0);
  288. fdct_col(block, block1, 4);
  289. block1 = block_tmp;
  290. table = tab_frw_01234567;
  291. out = block;
  292. for(i=8;i>0;i--) {
  293. fdct_row_mmx(block1, out, table);
  294. block1 += 8;
  295. table += 32;
  296. out += 8;
  297. }
  298. }
  299. void ff_fdct_mmx2(int16_t *block)
  300. {
  301. int64_t align_tmp[16] ATTR_ALIGN(8);
  302. int16_t * const block_tmp= (int16_t*)align_tmp;
  303. int16_t *block1, *out;
  304. const int16_t *table;
  305. int i;
  306. block1 = block_tmp;
  307. fdct_col(block, block1, 0);
  308. fdct_col(block, block1, 4);
  309. block1 = block_tmp;
  310. table = tab_frw_01234567;
  311. out = block;
  312. for(i=8;i>0;i--) {
  313. fdct_row_mmx2(block1, out, table);
  314. block1 += 8;
  315. table += 32;
  316. out += 8;
  317. }
  318. }