/*
 * Simple IDCT
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
  23. #include "asm.S"
  24. #define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  25. #define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  26. #define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  27. #define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  28. #define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  29. #define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  30. #define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  31. #define ROW_SHIFT 11
  32. #define COL_SHIFT 20
  33. #define W13 (W1 | (W3 << 16))
  34. #define W26 (W2 | (W6 << 16))
  35. #define W42 (W4 | (W2 << 16))
  36. #define W42n (-W4&0xffff | (-W2 << 16))
  37. #define W46 (W4 | (W6 << 16))
  38. #define W57 (W5 | (W7 << 16))
  39. .text
  40. .align
  41. w13: .long W13
  42. w26: .long W26
  43. w42: .long W42
  44. w42n: .long W42n
  45. w46: .long W46
  46. w57: .long W57
  47. /*
  48. Compute partial IDCT of single row.
  49. shift = left-shift amount
  50. r0 = source address
  51. r2 = row[2,0] <= 2 cycles
  52. r3 = row[3,1]
  53. ip = w42 <= 2 cycles
  54. Output in registers r4--r11
  55. */
  56. .macro idct_row shift
  57. ldr lr, w46 /* lr = W4 | (W6 << 16) */
  58. mov r1, #(1<<(\shift-1))
  59. smlad r4, r2, ip, r1
  60. smlsd r7, r2, ip, r1
  61. ldr ip, w13 /* ip = W1 | (W3 << 16) */
  62. ldr r10,w57 /* r10 = W5 | (W7 << 16) */
  63. smlad r5, r2, lr, r1
  64. smlsd r6, r2, lr, r1
  65. smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */
  66. smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */
  67. ldr lr, [r0, #12] /* lr = row[7,5] */
  68. pkhtb r2, ip, r10,asr #16 /* r3 = W7 | (W3 << 16) */
  69. pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */
  70. smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */
  71. smlad r8, lr, r10,r8 /* B0 += W5*row[5] + W7*row[7] */
  72. smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */
  73. ldr r3, w42n /* r3 = -W4 | (-W2 << 16) */
  74. smlad r10,lr, r2, r10 /* B2 += W7*row[5] + W3*row[7] */
  75. ldr r2, [r0, #4] /* r2 = row[6,4] */
  76. smlsdx r11,lr, ip, r11 /* B3 += W3*row[5] - W1*row[7] */
  77. ldr ip, w46 /* ip = W4 | (W6 << 16) */
  78. smlad r9, lr, r1, r9 /* B1 -= W1*row[5] + W5*row[7] */
  79. smlad r5, r2, r3, r5 /* A1 += -W4*row[4] - W2*row[6] */
  80. smlsd r6, r2, r3, r6 /* A2 += -W4*row[4] + W2*row[6] */
  81. smlad r4, r2, ip, r4 /* A0 += W4*row[4] + W6*row[6] */
  82. smlsd r7, r2, ip, r7 /* A3 += W4*row[4] - W6*row[6] */
  83. .endm
  84. /*
  85. Compute partial IDCT of half row.
  86. shift = left-shift amount
  87. r2 = row[2,0]
  88. r3 = row[3,1]
  89. ip = w42
  90. Output in registers r4--r11
  91. */
  92. .macro idct_row4 shift
  93. ldr lr, w46 /* lr = W4 | (W6 << 16) */
  94. ldr r10,w57 /* r10 = W5 | (W7 << 16) */
  95. mov r1, #(1<<(\shift-1))
  96. smlad r4, r2, ip, r1
  97. smlsd r7, r2, ip, r1
  98. ldr ip, w13 /* ip = W1 | (W3 << 16) */
  99. smlad r5, r2, lr, r1
  100. smlsd r6, r2, lr, r1
  101. smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */
  102. smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */
  103. pkhtb r2, ip, r10,asr #16 /* r3 = W7 | (W3 << 16) */
  104. pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */
  105. smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */
  106. smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */
  107. .endm
  108. /*
  109. Compute final part of IDCT single row without shift.
  110. Input in registers r4--r11
  111. Output in registers ip, r4--r6, lr, r8--r10
  112. */
  113. .macro idct_finish
  114. add ip, r4, r8 /* r1 = A0 + B0 */
  115. sub lr, r4, r8 /* r2 = A0 - B0 */
  116. sub r4, r5, r9 /* r2 = A1 + B1 */
  117. add r8, r5, r9 /* r2 = A1 - B1 */
  118. add r5, r6, r10 /* r1 = A2 + B2 */
  119. sub r9, r6, r10 /* r1 = A2 - B2 */
  120. add r6, r7, r11 /* r2 = A3 + B3 */
  121. sub r10,r7, r11 /* r2 = A3 - B3 */
  122. .endm
  123. /*
  124. Compute final part of IDCT single row.
  125. shift = right-shift amount
  126. Input/output in registers r4--r11
  127. */
  128. .macro idct_finish_shift shift
  129. add r3, r4, r8 /* r3 = A0 + B0 */
  130. sub r2, r4, r8 /* r2 = A0 - B0 */
  131. mov r4, r3, asr #\shift
  132. mov r8, r2, asr #\shift
  133. sub r3, r5, r9 /* r3 = A1 + B1 */
  134. add r2, r5, r9 /* r2 = A1 - B1 */
  135. mov r5, r3, asr #\shift
  136. mov r9, r2, asr #\shift
  137. add r3, r6, r10 /* r3 = A2 + B2 */
  138. sub r2, r6, r10 /* r2 = A2 - B2 */
  139. mov r6, r3, asr #\shift
  140. mov r10,r2, asr #\shift
  141. add r3, r7, r11 /* r3 = A3 + B3 */
  142. sub r2, r7, r11 /* r2 = A3 - B3 */
  143. mov r7, r3, asr #\shift
  144. mov r11,r2, asr #\shift
  145. .endm
  146. /*
  147. Compute final part of IDCT single row, saturating results at 8 bits.
  148. shift = right-shift amount
  149. Input/output in registers r4--r11
  150. */
  151. .macro idct_finish_shift_sat shift
  152. add r3, r4, r8 /* r3 = A0 + B0 */
  153. sub ip, r4, r8 /* ip = A0 - B0 */
  154. usat r4, #8, r3, asr #\shift
  155. usat r8, #8, ip, asr #\shift
  156. sub r3, r5, r9 /* r3 = A1 + B1 */
  157. add ip, r5, r9 /* ip = A1 - B1 */
  158. usat r5, #8, r3, asr #\shift
  159. usat r9, #8, ip, asr #\shift
  160. add r3, r6, r10 /* r3 = A2 + B2 */
  161. sub ip, r6, r10 /* ip = A2 - B2 */
  162. usat r6, #8, r3, asr #\shift
  163. usat r10,#8, ip, asr #\shift
  164. add r3, r7, r11 /* r3 = A3 + B3 */
  165. sub ip, r7, r11 /* ip = A3 - B3 */
  166. usat r7, #8, r3, asr #\shift
  167. usat r11,#8, ip, asr #\shift
  168. .endm
  169. /*
  170. Compute IDCT of single row, storing as column.
  171. r0 = source
  172. r1 = dest
  173. */
  174. function idct_row_armv6
  175. push {lr}
  176. ldr lr, [r0, #12] /* lr = row[7,5] */
  177. ldr ip, [r0, #4] /* ip = row[6,4] */
  178. ldr r3, [r0, #8] /* r3 = row[3,1] */
  179. ldr r2, [r0] /* r2 = row[2,0] */
  180. orrs lr, lr, ip
  181. cmpeq lr, r3
  182. cmpeq lr, r2, lsr #16
  183. beq 1f
  184. push {r1}
  185. ldr ip, w42 /* ip = W4 | (W2 << 16) */
  186. cmp lr, #0
  187. beq 2f
  188. idct_row ROW_SHIFT
  189. b 3f
  190. 2: idct_row4 ROW_SHIFT
  191. 3: pop {r1}
  192. idct_finish_shift ROW_SHIFT
  193. strh r4, [r1]
  194. strh r5, [r1, #(16*2)]
  195. strh r6, [r1, #(16*4)]
  196. strh r7, [r1, #(16*6)]
  197. strh r11,[r1, #(16*1)]
  198. strh r10,[r1, #(16*3)]
  199. strh r9, [r1, #(16*5)]
  200. strh r8, [r1, #(16*7)]
  201. pop {pc}
  202. 1: mov r2, r2, lsl #3
  203. strh r2, [r1]
  204. strh r2, [r1, #(16*2)]
  205. strh r2, [r1, #(16*4)]
  206. strh r2, [r1, #(16*6)]
  207. strh r2, [r1, #(16*1)]
  208. strh r2, [r1, #(16*3)]
  209. strh r2, [r1, #(16*5)]
  210. strh r2, [r1, #(16*7)]
  211. pop {pc}
  212. .endfunc
  213. /*
  214. Compute IDCT of single column, read as row.
  215. r0 = source
  216. r1 = dest
  217. */
  218. function idct_col_armv6
  219. push {r1, lr}
  220. ldr r2, [r0] /* r2 = row[2,0] */
  221. ldr ip, w42 /* ip = W4 | (W2 << 16) */
  222. ldr r3, [r0, #8] /* r3 = row[3,1] */
  223. idct_row COL_SHIFT
  224. pop {r1}
  225. idct_finish_shift COL_SHIFT
  226. strh r4, [r1]
  227. strh r5, [r1, #(16*1)]
  228. strh r6, [r1, #(16*2)]
  229. strh r7, [r1, #(16*3)]
  230. strh r11,[r1, #(16*4)]
  231. strh r10,[r1, #(16*5)]
  232. strh r9, [r1, #(16*6)]
  233. strh r8, [r1, #(16*7)]
  234. pop {pc}
  235. .endfunc
  236. /*
  237. Compute IDCT of single column, read as row, store saturated 8-bit.
  238. r0 = source
  239. r1 = dest
  240. r2 = line size
  241. */
  242. function idct_col_put_armv6
  243. push {r1, r2, lr}
  244. ldr r2, [r0] /* r2 = row[2,0] */
  245. ldr ip, w42 /* ip = W4 | (W2 << 16) */
  246. ldr r3, [r0, #8] /* r3 = row[3,1] */
  247. idct_row COL_SHIFT
  248. pop {r1, r2}
  249. idct_finish_shift_sat COL_SHIFT
  250. strb r4, [r1], r2
  251. strb r5, [r1], r2
  252. strb r6, [r1], r2
  253. strb r7, [r1], r2
  254. strb r11,[r1], r2
  255. strb r10,[r1], r2
  256. strb r9, [r1], r2
  257. strb r8, [r1], r2
  258. sub r1, r1, r2, lsl #3
  259. pop {pc}
  260. .endfunc
  261. /*
  262. Compute IDCT of single column, read as row, add/store saturated 8-bit.
  263. r0 = source
  264. r1 = dest
  265. r2 = line size
  266. */
  267. function idct_col_add_armv6
  268. push {r1, r2, lr}
  269. ldr r2, [r0] /* r2 = row[2,0] */
  270. ldr ip, w42 /* ip = W4 | (W2 << 16) */
  271. ldr r3, [r0, #8] /* r3 = row[3,1] */
  272. idct_row COL_SHIFT
  273. pop {r1, r2}
  274. idct_finish
  275. ldrb r3, [r1]
  276. ldrb r7, [r1, r2]
  277. ldrb r11,[r1, r2, lsl #2]
  278. add ip, r3, ip, asr #COL_SHIFT
  279. usat ip, #8, ip
  280. add r4, r7, r4, asr #COL_SHIFT
  281. strb ip, [r1], r2
  282. ldrb ip, [r1, r2]
  283. usat r4, #8, r4
  284. ldrb r11,[r1, r2, lsl #2]
  285. add r5, ip, r5, asr #COL_SHIFT
  286. usat r5, #8, r5
  287. strb r4, [r1], r2
  288. ldrb r3, [r1, r2]
  289. ldrb ip, [r1, r2, lsl #2]
  290. strb r5, [r1], r2
  291. ldrb r7, [r1, r2]
  292. ldrb r4, [r1, r2, lsl #2]
  293. add r6, r3, r6, asr #COL_SHIFT
  294. usat r6, #8, r6
  295. add r10,r7, r10,asr #COL_SHIFT
  296. usat r10,#8, r10
  297. add r9, r11,r9, asr #COL_SHIFT
  298. usat r9, #8, r9
  299. add r8, ip, r8, asr #COL_SHIFT
  300. usat r8, #8, r8
  301. add lr, r4, lr, asr #COL_SHIFT
  302. usat lr, #8, lr
  303. strb r6, [r1], r2
  304. strb r10,[r1], r2
  305. strb r9, [r1], r2
  306. strb r8, [r1], r2
  307. strb lr, [r1], r2
  308. sub r1, r1, r2, lsl #3
  309. pop {pc}
  310. .endfunc
  311. /*
  312. Compute 8 IDCT row transforms.
  313. func = IDCT row->col function
  314. width = width of columns in bytes
  315. */
  316. .macro idct_rows func width
  317. bl \func
  318. add r0, r0, #(16*2)
  319. add r1, r1, #\width
  320. bl \func
  321. add r0, r0, #(16*2)
  322. add r1, r1, #\width
  323. bl \func
  324. add r0, r0, #(16*2)
  325. add r1, r1, #\width
  326. bl \func
  327. sub r0, r0, #(16*5)
  328. add r1, r1, #\width
  329. bl \func
  330. add r0, r0, #(16*2)
  331. add r1, r1, #\width
  332. bl \func
  333. add r0, r0, #(16*2)
  334. add r1, r1, #\width
  335. bl \func
  336. add r0, r0, #(16*2)
  337. add r1, r1, #\width
  338. bl \func
  339. sub r0, r0, #(16*7)
  340. .endm
  341. /* void ff_simple_idct_armv6(DCTELEM *data); */
  342. function ff_simple_idct_armv6, export=1
  343. push {r4-r11, lr}
  344. sub sp, sp, #128
  345. mov r1, sp
  346. idct_rows idct_row_armv6, 2
  347. mov r1, r0
  348. mov r0, sp
  349. idct_rows idct_col_armv6, 2
  350. add sp, sp, #128
  351. pop {r4-r11, pc}
  352. .endfunc
  353. /* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); */
  354. function ff_simple_idct_add_armv6, export=1
  355. push {r0, r1, r4-r11, lr}
  356. sub sp, sp, #128
  357. mov r0, r2
  358. mov r1, sp
  359. idct_rows idct_row_armv6, 2
  360. mov r0, sp
  361. ldr r1, [sp, #128]
  362. ldr r2, [sp, #(128+4)]
  363. idct_rows idct_col_add_armv6, 1
  364. add sp, sp, #(128+8)
  365. pop {r4-r11, pc}
  366. .endfunc
  367. /* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); */
  368. function ff_simple_idct_put_armv6, export=1
  369. push {r0, r1, r4-r11, lr}
  370. sub sp, sp, #128
  371. mov r0, r2
  372. mov r1, sp
  373. idct_rows idct_row_armv6, 2
  374. mov r0, sp
  375. ldr r1, [sp, #128]
  376. ldr r2, [sp, #(128+4)]
  377. idct_rows idct_col_put_armv6, 1
  378. add sp, sp, #(128+8)
  379. pop {r4-r11, pc}
  380. .endfunc