You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

507 lines
13KB

  1. ; //////////////////////////////////////////////////////////////////////////////
  2. ; //
  3. ; // fdctam32.c - AP922 MMX(3D-Now) forward-DCT
  4. ; // ----------
  5. ; // Intel Application Note AP-922 - fast, precise implementation of DCT
  6. ; // http://developer.intel.com/vtune/cbts/appnotes.htm
  7. ; // ----------
  8. ; //
  9. ; // This routine can use a 3D-Now/MMX enhancement to increase the
  10. ; // accuracy of the fdct_col_4 macro. The dct_col function uses 3D-Now's
  11. ; // PMULHRW instead of MMX's PMULHW (and POR). The substitution improves
  12. ; // accuracy very slightly, with a performance penalty. If the target CPU
  13. ; // does not support 3D-Now, then this function cannot be executed.
  14. ; //
  15. ; // For a fast, precise MMX implementation of inverse-DCT
  16. ; // visit http://www.elecard.com/peter
  17. ; //
  18. ; // v1.0 07/22/2000 (initial release)
  19. ; //
  20. ; // liaor@iname.com http://members.tripod.com/~liaor
  21. ; //////////////////////////////////////////////////////////////////////////////
  22. ;;;
  23. ;;; A.Stevens Jul 2000: ported to nasm syntax and disentangled
  24. ;;; from Win**** compiler specific stuff.
  25. ;;; All the real work was done above though.
  26. ;;; See above for how to optimise quality on 3DNow! CPU's
  ;;
  ;; Macros for code-readability...
  ;;
  ;; Register roles.  TABLE and TABLEF both name ebx: the column pass points
  ;; it at fdct_tg_all_16, the row pass reloads it with tab_frw_01234567.
  ;;
  %define INP             eax             ; pointer to (short *blk)
  %define OUT             ecx             ; pointer to output (fdct_mmx sets OUT = INP, i.e. in place)
  %define TABLE           ebx             ; pointer to tab_frw_01234567[]
  %define TABLEF          ebx             ; pointer to tg_all_16
  %define round_frw_row   edx             ; pointer to the row-pass rounder (fdct_r_row)

  ;;
  ;; Row n of the input / output block.  Rows are 16 bytes apart
  ;; (8 coefficients of 16 bits each).  These expand to bare address
  ;; expressions and are meant to be used inside [ ], e.g. [x1] or [y2+8].
  ;;
  %define x0              INP + 0*16
  %define x1              INP + 1*16
  %define x2              INP + 2*16
  %define x3              INP + 3*16
  %define x4              INP + 4*16
  %define x5              INP + 5*16
  %define x6              INP + 6*16
  %define x7              INP + 7*16

  %define y0              OUT + 0*16
  %define y1              OUT + 1*16
  %define y2              OUT + 2*16
  %define y3              OUT + 3*16
  %define y4              OUT + 4*16
  %define y5              OUT + 5*16
  %define y6              OUT + 6*16
  %define y7              OUT + 7*16
  ;;
  ;; Constants for DCT
  ;;
  ;; SHIFT_FRW_COL is the left shift applied to the inputs of the column
  ;; pass; SHIFT_FRW_ROW is the arithmetic right shift applied to the 32-bit
  ;; sums of the row pass.  RND_FRW_ROW / RND_FRW_COL are the matching
  ;; "half an LSB" rounders, 1 << (shift-1).  In the code of fdct_mmx the
  ;; row rounder is read from the external fdct_r_row rather than from
  ;; RND_FRW_ROW.
  ;;
  %define BITS_FRW_ACC    3                           ; 2 or 3 for accuracy
  %define SHIFT_FRW_COL   BITS_FRW_ACC
  %define SHIFT_FRW_ROW   (BITS_FRW_ACC + 17)
  %define RND_FRW_ROW     (1 << (SHIFT_FRW_ROW-1))
  %define RND_FRW_COL     (1 << (SHIFT_FRW_COL-1))
  ;;
  ;; Data supplied by the C side of the project.
  ;;
  extern fdct_one_corr                    ; OR-ed into column-pass results as a rounding correction
  extern fdct_r_row                       ; row-pass rounder; defined in C for convenience

  ;;
  ;; Concatenated table of forward dct transformation coeffs.
  ;;
  extern fdct_tg_all_16                   ; Defined in C for convenience

  ;; Offsets into fdct_tg_all_16: one 8-byte (quadword) entry per constant.
  ;; TABLEF must hold the address of fdct_tg_all_16 when these are used.
  %define tg_1_16         (TABLEF + 0)
  %define tg_2_16         (TABLEF + 8)
  %define tg_3_16         (TABLEF + 16)
  %define cos_4_16        (TABLEF + 24)
  %define ocos_4_16       (TABLEF + 32)

  ;;
  ;; Concatenated table of forward dct coefficients
  ;;
  extern tab_frw_01234567                 ; Defined in C for convenience

  ;; No named offsets for tab_frw_01234567: the row pass of fdct_mmx
  ;; addresses it as [TABLE+0] .. [TABLE+56] and steps TABLE by 64 per row.

  SECTION .text

  global fdct_mmx
  ;-----------------------------------------------------------------------
  ; void fdct_mmx( short *blk )
  ;
  ; Forward DCT of one 8x8 block of 16-bit values, computed in place.
  ;
  ; Syntax: NASM.  Target: 32-bit x86 with MMX.
  ; In:     [ebp+8] after the prologue = blk (stack-passed argument)
  ; Reads:  fdct_tg_all_16, fdct_one_corr, tab_frw_01234567, fdct_r_row
  ; Writes: the 8 rows of 16 bytes starting at blk
  ; Regs:   ebx, ecx, edx, edi and ebp are saved and restored;
  ;         eax and the flags are overwritten; mm0-mm7 are used and
  ;         emms is executed before returning.
  ;
  ; High-level structure:
  ;
  ;   fdct_mmx()
  ;   {
  ;       forward_dct_col03();          // column transform on cols 0-3
  ;       forward_dct_col47();          // column transform on cols 4-7
  ;       for ( j = 0; j < 8; j = j + 1 )
  ;           forward_dct_row1(j);      // row transform on row #j
  ;   }
  ;
  ; The two column passes are the same code unrolled by hand.  Loads and
  ; stores in every pass touch the same buffer (OUT == INP) and the
  ; instruction stream is hand-interleaved, so each store is placed after
  ; the last load of the row it overwrites.  Do not reorder instructions.
  ;-----------------------------------------------------------------------
          align       32
  fdct_mmx:
          push        ebp                         ; save caller's frame pointer
          mov         ebp, esp                    ; link
          push        ebx
          push        ecx
          push        edx
          push        edi

          mov         INP, [ebp+8]                ; input data is row 0 of blk[]

          ; transform the left half of the matrix (4 columns)
          lea         TABLEF, [fdct_tg_all_16]
          mov         OUT, INP                    ; results overwrite the input

          ; for ( i = 0; i < 2; i = i + 1 )
          ; The loop body runs exactly twice, so it is unrolled below
          ; (col03, then col47) to avoid branch misprediction.

  .mmx32_fdct_col03:
          movq        mm0, [x1]                   ; mm0 = x1
          movq        mm1, [x6]                   ; mm1 = x6
          movq        mm2, mm0                    ; mm2 = x1
          movq        mm3, [x2]                   ; mm3 = x2
          paddsw      mm0, mm1                    ; t1 = x[1] + x[6]
          movq        mm4, [x5]                   ; mm4 = x5
          psllw       mm0, SHIFT_FRW_COL          ; t1
          movq        mm5, [x0]                   ; mm5 = x0
          paddsw      mm4, mm3                    ; t2 = x[2] + x[5]
          paddsw      mm5, [x7]                   ; t0 = x[0] + x[7]
          psllw       mm4, SHIFT_FRW_COL          ; t2
          movq        mm6, mm0                    ; mm6 = t1
          psubsw      mm2, mm1                    ; t6 = x[1] - x[6]
          movq        mm1, [tg_2_16]              ; mm1 = tg_2_16
          psubsw      mm0, mm4                    ; tm12 = t1 - t2
          movq        mm7, [x3]                   ; mm7 = x3
          pmulhw      mm1, mm0                    ; tm12*tg_2_16
          paddsw      mm7, [x4]                   ; t3 = x[3] + x[4]
          psllw       mm5, SHIFT_FRW_COL          ; t0
          paddsw      mm6, mm4                    ; tp12 = t1 + t2
          psllw       mm7, SHIFT_FRW_COL          ; t3
          movq        mm4, mm5                    ; mm4 = t0
          psubsw      mm5, mm7                    ; tm03 = t0 - t3
          paddsw      mm1, mm5                    ; y2 = tm03 + tm12*tg_2_16
          paddsw      mm4, mm7                    ; tp03 = t0 + t3
          por         mm1, [fdct_one_corr]        ; rounding correction for y2
          psllw       mm2, SHIFT_FRW_COL+1        ; t6 (one extra bit of shift)
          pmulhw      mm5, [tg_2_16]              ; tm03*tg_2_16
          movq        mm7, mm4                    ; mm7 = tp03
          psubsw      mm3, [x5]                   ; t5 = x[2] - x[5]
          psubsw      mm4, mm6                    ; y4 = tp03 - tp12
          movq        [y2], mm1                   ; save y2
          paddsw      mm7, mm6                    ; y0 = tp03 + tp12
          movq        mm1, [x3]                   ; mm1 = x3
          psllw       mm3, SHIFT_FRW_COL+1        ; t5 (one extra bit of shift)
          psubsw      mm1, [x4]                   ; t4 = x[3] - x[4]
          movq        mm6, mm2                    ; mm6 = t6
          movq        [y4], mm4                   ; save y4
          paddsw      mm2, mm3                    ; t6 + t5
          pmulhw      mm2, [ocos_4_16]            ; tp65 = (t6 + t5)*cos_4_16
          psubsw      mm6, mm3                    ; t6 - t5
          pmulhw      mm6, [ocos_4_16]            ; tm65 = (t6 - t5)*cos_4_16
          psubsw      mm5, mm0                    ; y6 = tm03*tg_2_16 - tm12
          por         mm5, [fdct_one_corr]        ; rounding correction for y6
          psllw       mm1, SHIFT_FRW_COL          ; t4
          por         mm2, [fdct_one_corr]        ; rounding correction for tp65
          movq        mm4, mm1                    ; mm4 = t4
          movq        mm3, [x0]                   ; mm3 = x0
          paddsw      mm1, mm6                    ; tp465 = t4 + tm65
          psubsw      mm3, [x7]                   ; t7 = x[0] - x[7]
          psubsw      mm4, mm6                    ; tm465 = t4 - tm65
          movq        mm0, [tg_1_16]              ; mm0 = tg_1_16
          psllw       mm3, SHIFT_FRW_COL          ; t7
          movq        mm6, [tg_3_16]              ; mm6 = tg_3_16
          pmulhw      mm0, mm1                    ; tp465*tg_1_16
          movq        [y0], mm7                   ; save y0
          pmulhw      mm6, mm4                    ; tm465*tg_3_16 (pmulhw part)
          movq        [y6], mm5                   ; save y6
          movq        mm7, mm3                    ; mm7 = t7
          movq        mm5, [tg_3_16]              ; mm5 = tg_3_16
          psubsw      mm7, mm2                    ; tm765 = t7 - tp65
          paddsw      mm3, mm2                    ; tp765 = t7 + tp65
          pmulhw      mm5, mm7                    ; tm765*tg_3_16 (pmulhw part)
          paddsw      mm0, mm3                    ; y1 = tp765 + tp465*tg_1_16
          paddsw      mm6, mm4                    ; tm465*tg_3_16 (product + tm465)
          pmulhw      mm3, [tg_1_16]              ; tp765*tg_1_16
          por         mm0, [fdct_one_corr]        ; rounding correction for y1
          paddsw      mm5, mm7                    ; tm765*tg_3_16 (product + tm765)
          psubsw      mm7, mm6                    ; y3 = tm765 - tm465*tg_3_16
          add         INP, 0x08                   ; INP -> columns 4-7 (OUT is unchanged)
          movq        [y1], mm0                   ; save y1
          paddsw      mm5, mm4                    ; y5 = tm765*tg_3_16 + tm465
          movq        [y3], mm7                   ; save y3
          psubsw      mm3, mm1                    ; y7 = tp765*tg_1_16 - tp465
          movq        [y5], mm5                   ; save y5

  .mmx32_fdct_col47:                              ; begin processing last four columns
          ; INP already points at columns 4-7; OUT still points at
          ; columns 0-3, so every store in this pass uses [yN+8].
          movq        mm0, [x1]                   ; mm0 = x1
          movq        [y7], mm3                   ; save y7 (columns 0-3, left over from col03)
          movq        mm1, [x6]                   ; mm1 = x6
          movq        mm2, mm0                    ; mm2 = x1
          movq        mm3, [x2]                   ; mm3 = x2
          paddsw      mm0, mm1                    ; t1 = x[1] + x[6]
          movq        mm4, [x5]                   ; mm4 = x5
          psllw       mm0, SHIFT_FRW_COL          ; t1
          movq        mm5, [x0]                   ; mm5 = x0
          paddsw      mm4, mm3                    ; t2 = x[2] + x[5]
          paddsw      mm5, [x7]                   ; t0 = x[0] + x[7]
          psllw       mm4, SHIFT_FRW_COL          ; t2
          movq        mm6, mm0                    ; mm6 = t1
          psubsw      mm2, mm1                    ; t6 = x[1] - x[6]
          movq        mm1, [tg_2_16]              ; mm1 = tg_2_16
          psubsw      mm0, mm4                    ; tm12 = t1 - t2
          movq        mm7, [x3]                   ; mm7 = x3
          pmulhw      mm1, mm0                    ; tm12*tg_2_16
          paddsw      mm7, [x4]                   ; t3 = x[3] + x[4]
          psllw       mm5, SHIFT_FRW_COL          ; t0
          paddsw      mm6, mm4                    ; tp12 = t1 + t2
          psllw       mm7, SHIFT_FRW_COL          ; t3
          movq        mm4, mm5                    ; mm4 = t0
          psubsw      mm5, mm7                    ; tm03 = t0 - t3
          paddsw      mm1, mm5                    ; y2 = tm03 + tm12*tg_2_16
          paddsw      mm4, mm7                    ; tp03 = t0 + t3
          por         mm1, [fdct_one_corr]        ; rounding correction for y2
          psllw       mm2, SHIFT_FRW_COL+1        ; t6 (one extra bit of shift)
          pmulhw      mm5, [tg_2_16]              ; tm03*tg_2_16
          movq        mm7, mm4                    ; mm7 = tp03
          psubsw      mm3, [x5]                   ; t5 = x[2] - x[5]
          psubsw      mm4, mm6                    ; y4 = tp03 - tp12
          movq        [y2+8], mm1                 ; save y2
          paddsw      mm7, mm6                    ; y0 = tp03 + tp12
          movq        mm1, [x3]                   ; mm1 = x3
          psllw       mm3, SHIFT_FRW_COL+1        ; t5 (one extra bit of shift)
          psubsw      mm1, [x4]                   ; t4 = x[3] - x[4]
          movq        mm6, mm2                    ; mm6 = t6
          movq        [y4+8], mm4                 ; save y4
          paddsw      mm2, mm3                    ; t6 + t5
          pmulhw      mm2, [ocos_4_16]            ; tp65 = (t6 + t5)*cos_4_16
          psubsw      mm6, mm3                    ; t6 - t5
          pmulhw      mm6, [ocos_4_16]            ; tm65 = (t6 - t5)*cos_4_16
          psubsw      mm5, mm0                    ; y6 = tm03*tg_2_16 - tm12
          por         mm5, [fdct_one_corr]        ; rounding correction for y6
          psllw       mm1, SHIFT_FRW_COL          ; t4
          por         mm2, [fdct_one_corr]        ; rounding correction for tp65
          movq        mm4, mm1                    ; mm4 = t4
          movq        mm3, [x0]                   ; mm3 = x0
          paddsw      mm1, mm6                    ; tp465 = t4 + tm65
          psubsw      mm3, [x7]                   ; t7 = x[0] - x[7]
          psubsw      mm4, mm6                    ; tm465 = t4 - tm65
          movq        mm0, [tg_1_16]              ; mm0 = tg_1_16
          psllw       mm3, SHIFT_FRW_COL          ; t7
          movq        mm6, [tg_3_16]              ; mm6 = tg_3_16
          pmulhw      mm0, mm1                    ; tp465*tg_1_16
          movq        [y0+8], mm7                 ; save y0
          pmulhw      mm6, mm4                    ; tm465*tg_3_16 (pmulhw part)
          movq        [y6+8], mm5                 ; save y6
          movq        mm7, mm3                    ; mm7 = t7
          movq        mm5, [tg_3_16]              ; mm5 = tg_3_16
          psubsw      mm7, mm2                    ; tm765 = t7 - tp65
          paddsw      mm3, mm2                    ; tp765 = t7 + tp65
          pmulhw      mm5, mm7                    ; tm765*tg_3_16 (pmulhw part)
          paddsw      mm0, mm3                    ; y1 = tp765 + tp465*tg_1_16
          paddsw      mm6, mm4                    ; tm465*tg_3_16 (product + tm465)
          pmulhw      mm3, [tg_1_16]              ; tp765*tg_1_16
          por         mm0, [fdct_one_corr]        ; rounding correction for y1
          paddsw      mm5, mm7                    ; tm765*tg_3_16 (product + tm765)
          psubsw      mm7, mm6                    ; y3 = tm765 - tm465*tg_3_16
          movq        [y1+8], mm0                 ; save y1
          paddsw      mm5, mm4                    ; y5 = tm765*tg_3_16 + tm465
          movq        [y3+8], mm7                 ; save y3
          psubsw      mm3, mm1                    ; y7 = tp765*tg_1_16 - tp465
          movq        [y5+8], mm5                 ; save y5
          movq        [y7+8], mm3                 ; save y7

          ; ---- end of the column transform (columns 0-7) ----
          ;
          ; Row transform: one 16-byte row per iteration, with its own
          ; shift and rounding constants (SHIFT_FRW_ROW, fdct_r_row).
          ; The output is stored back into blk[], which destroys the
          ; original input data.
          ;
          ; NOTE(review): the source this was ported from says here that
          ; the final output is transposed with respect to the input.  The
          ; code below writes each result row back over the row it read,
          ; so confirm against the callers before relying on that remark.

          mov         INP, [ebp+8]                ; row 0
          mov         edi, 0x08                   ; x = 8 rows to go
          lea         TABLE, [tab_frw_01234567]   ; coefficients for row 0
          mov         OUT, INP
          lea         round_frw_row, [fdct_r_row]

          ; for ( x = 8; x > 0; --x )            ; transform one row per iteration
  .lp_mmx_fdct_row1:
          ; Build the word-reversed upper half of the row and form the
          ; butterfly sums / differences.  Word lists read high word first.
          movd        mm5, [INP+12]               ; mm5 = _ _ 7 6
          punpcklwd   mm5, [INP+8]                ; mm5 = 5 7 4 6
          movq        mm2, mm5                    ; mm2 = 5 7 4 6
          psrlq       mm5, 32                     ; mm5 = _ _ 5 7
          movq        mm0, [INP]                  ; mm0 = 3 2 1 0
          punpcklwd   mm5, mm2                    ; mm5 = 4 5 6 7
          movq        mm1, mm0                    ; mm1 = 3 2 1 0
          paddsw      mm0, mm5                    ; mm0 = [3+4, 2+5, 1+6, 0+7] (xt3, xt2, xt1, xt0)
          psubsw      mm1, mm5                    ; mm1 = [3-4, 2-5, 1-6, 0-7] (xt7, xt6, xt5, xt4)
          movq        mm2, mm0                    ; mm2 = [ xt3 xt2 xt1 xt0 ]
          punpcklwd   mm0, mm1                    ; mm0 = [ xt5 xt1 xt4 xt0 ]
          punpckhwd   mm2, mm1                    ; mm2 = [ xt7 xt3 xt6 xt2 ]
          movq        mm1, mm2                    ; mm1 = [ xt7 xt3 xt6 xt2 ]

          ; From here on "x0..x7" are positional labels: mm0 is treated
          ; as "x3 x2 x1 x0" and mm1 as "x7 x6 x5 x4".  The bracketed
          ; lists give the butterfly terms those positions actually hold.
          movq        mm2, mm0                    ; mm2 = x3 x2 x1 x0
          movq        mm3, [TABLE]                ; mm3 = w06 w04 w02 w00
          punpcklwd   mm0, mm1                    ; x5 x1 x4 x0
          movq        mm5, mm0                    ; mm5 = x5 x1 x4 x0
          punpckldq   mm0, mm0                    ; x4 x0 x4 x0  [ xt2 xt0 xt2 xt0 ]
          movq        mm4, [TABLE+8]              ; mm4 = w07 w05 w03 w01
          punpckhwd   mm2, mm1                    ; x7 x3 x6 x2
          pmaddwd     mm3, mm0                    ; x4*w06+x0*w04  x4*w02+x0*w00
          movq        mm6, mm2                    ; mm6 = x7 x3 x6 x2
          movq        mm1, [TABLE+32]             ; mm1 = w22 w20 w18 w16
          punpckldq   mm2, mm2                    ; x6 x2 x6 x2  [ xt3 xt1 xt3 xt1 ]
          pmaddwd     mm4, mm2                    ; x6*w07+x2*w05  x6*w03+x2*w01
          punpckhdq   mm5, mm5                    ; x5 x1 x5 x1  [ xt6 xt4 xt6 xt4 ]
          pmaddwd     mm0, [TABLE+16]             ; x4*w14+x0*w12  x4*w10+x0*w08
          punpckhdq   mm6, mm6                    ; x7 x3 x7 x3  [ xt7 xt5 xt7 xt5 ]
          movq        mm7, [TABLE+40]             ; mm7 = w23 w21 w19 w17
          pmaddwd     mm1, mm5                    ; x5*w22+x1*w20  x5*w18+x1*w16

          ; Accumulators from here to the pack:
          ;   mm3 -> (y2, y0)    mm1 -> (y3, y1)
          ;   mm0 -> (y6, y4)    mm5 -> (y7, y5)
          paddd       mm3, [round_frw_row]        ; + rounder (y2, y0)
          pmaddwd     mm7, mm6                    ; x7*w23+x3*w21  x7*w19+x3*w17
          pmaddwd     mm2, [TABLE+24]             ; x6*w15+x2*w13  x6*w11+x2*w09
          paddd       mm3, mm4                    ; mm3 = (y2, y0) before the shift
          pmaddwd     mm5, [TABLE+48]             ; x5*w30+x1*w28  x5*w26+x1*w24
          pmaddwd     mm6, [TABLE+56]             ; x7*w31+x3*w29  x7*w27+x3*w25
          paddd       mm1, mm7                    ; mm1 = (y3, y1) before rounder and shift
          paddd       mm0, [round_frw_row]        ; + rounder (y6, y4)
          psrad       mm3, SHIFT_FRW_ROW          ; mm3 = (y2, y0)
          paddd       mm1, [round_frw_row]        ; + rounder (y3, y1)
          paddd       mm0, mm2                    ; mm0 = (y6, y4) before the shift
          paddd       mm5, [round_frw_row]        ; + rounder (y7, y5)
          psrad       mm1, SHIFT_FRW_ROW          ; mm1 = (y3, y1)
          paddd       mm5, mm6                    ; mm5 = (y7, y5) before the shift
          psrad       mm0, SHIFT_FRW_ROW          ; mm0 = (y6, y4)
          add         OUT, 16                     ; next output row (stores below use OUT-16 / OUT-8)
          psrad       mm5, SHIFT_FRW_ROW          ; mm5 = (y7, y5)
          add         INP, 16                     ; next input row
          packssdw    mm3, mm0                    ; mm3 = y6 y4 y2 y0
          packssdw    mm1, mm5                    ; mm1 = y7 y5 y3 y1
          movq        mm6, mm3                    ; mm6 = y6 y4 y2 y0
          punpcklwd   mm3, mm1                    ; mm3 = y3 y2 y1 y0
          sub         edi, 0x01                   ; one row fewer to go
          punpckhwd   mm6, mm1                    ; mm6 = y7 y6 y5 y4
          add         TABLE, 64                   ; coefficients for the next row
          movq        [OUT-16], mm3               ; save y3 y2 y1 y0
          movq        [OUT-8], mm6                ; save y7 y6 y5 y4
          cmp         edi, 0x00                   ; flags from "sub" were overwritten by "add TABLE"
          jg near     .lp_mmx_fdct_row1           ; begin fdct processing on next row

          ;;
          ;; Tidy up and return
          ;;
          pop         edi
          pop         edx
          pop         ecx
          pop         ebx
          pop         ebp                         ; restore caller's frame pointer
          emms                                    ; leave MMX state so x87 code can run again
          ret