You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

456 lines
14KB

  1. /*
  2. * Simple IDCT
  3. *
  4. * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  5. * Copyright (c) 2007 Mans Rullgard <mru@inprovide.com>
  6. *
  7. * This file is part of FFmpeg.
  8. *
  9. * FFmpeg is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation; either
  12. * version 2.1 of the License, or (at your option) any later version.
  13. *
  14. * FFmpeg is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with FFmpeg; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. */
  /*
    Fixed-point IDCT coefficients:
      Wi = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5   (truncated to an integer)
    W4 is the one exception: the formula gives 16384 for i = 4, but the
    value used here is 16383.
  */
  #define W1 22725        /* i = 1 */
  #define W2 21407        /* i = 2 */
  #define W3 19266        /* i = 3 */
  #define W4 16383        /* i = 4, see note above */
  #define W5 12873        /* i = 5 */
  #define W6 8867         /* i = 6 */
  #define W7 4520         /* i = 7 */

  /* Shift/rounding parameters handed to the row and column passes. */
  #define ROW_SHIFT 11
  #define COL_SHIFT 20

  /* Pack two 16-bit values into a word: lo in bits 0-15, hi in bits 16-31. */
  #define PACK16(lo, hi) ((lo) | ((hi) << 16))

  /* Coefficient pairs in the layout consumed by the dual 16-bit multiplies. */
  #define W13  PACK16(W1, W3)
  #define W26  PACK16(W2, W6)
  #define W42  PACK16(W4, W2)
  #define W42n PACK16(-W4 & 0xffff, -W2)  /* negated pair; low half masked to 16 bits */
  #define W46  PACK16(W4, W6)
  #define W57  PACK16(W5, W7)

  /*
    Literal table.  It is placed in .text, ahead of the code, because the
    code below fetches these words with PC-relative loads.
  */
          .text
          .align
  w13:    .long   W13
  w26:    .long   W26
  w42:    .long   W42
  w42n:   .long   W42n
  w46:    .long   W46
  w57:    .long   W57
  /*
    Compute the even (A0..A3) and odd (B0..B3) partial sums of one 8-point
    IDCT row.  Registers are written as r0-r11 (APCS a1-a4 = r0-r3,
    v1-v8 = r4-r11, fp = r11).

    shift = shift amount the caller applies afterwards; only used here to
            pre-add the rounding term 1 << (shift-1) to A0..A3
    r0 = source address
    r2 = row[2,0]  (row[0] in the low half, row[2] in the high half)
                   original note: "<= 2 cycles"
    r3 = row[3,1]
    ip = w42       original note: "<= 2 cycles"

    Output in registers r4--r11:
      r4 = A0, r5 = A1, r6 = A2, r7 = A3,
      r8 = B0, r9 = -B1, r10 = B2, r11 = B3
    Clobbers r1, r2, r3, ip, lr.  r0 is not modified.
  */
          .macro  idct_row shift
          ldr     lr, w46                 /* lr = W4 | (W6 << 16), PC-relative load */
          mov     r1, #(1<<(\shift-1))    /* r1 = rounding term */
          smlad   r4, r2, ip, r1          /* A0 = W4*row[0] + W2*row[2] + round */
          smlsd   r7, r2, ip, r1          /* A3 = W4*row[0] - W2*row[2] + round */
          ldr     ip, w13                 /* ip = W1 | (W3 << 16) */
          ldr     r10, w57                /* r10 = W5 | (W7 << 16) */
          smlad   r5, r2, lr, r1          /* A1 = W4*row[0] + W6*row[2] + round */
          smlsd   r6, r2, lr, r1          /* A2 = W4*row[0] - W6*row[2] + round */

          smuad   r8, r3, ip              /* r8 = B0 = W1*row[1] + W3*row[3] */
          smusdx  r11, r3, r10            /* r11 = B3 = W7*row[1] - W5*row[3] */
          ldr     lr, [r0, #12]           /* lr = row[7,5] */
          pkhtb   r2, ip, r10, asr #16    /* r2 = W7 | (W3 << 16) */
          pkhbt   r1, ip, r10, lsl #16    /* r1 = W1 | (W5 << 16) */
          smusdx  r9, r2, r3              /* r9 = -B1 = W7*row[3] - W3*row[1] */
          smlad   r8, lr, r10, r8         /* B0 += W5*row[5] + W7*row[7] */
          smusdx  r10, r3, r1             /* r10 = B2 = W5*row[1] - W1*row[3] */

          ldr     r3, w42n                /* r3 = -W4 | (-W2 << 16) */
          smlad   r10, lr, r2, r10        /* B2 += W7*row[5] + W3*row[7] */
          ldr     r2, [r0, #4]            /* r2 = row[6,4] */
          smlsdx  r11, lr, ip, r11        /* B3 += W3*row[5] - W1*row[7] */
          ldr     ip, w46                 /* ip = W4 | (W6 << 16) */
          smlad   r9, lr, r1, r9          /* B1 -= W1*row[5] + W5*row[7] */

          smlad   r5, r2, r3, r5          /* A1 += -W4*row[4] - W2*row[6] */
          smlsd   r6, r2, r3, r6          /* A2 += -W4*row[4] + W2*row[6] */
          smlad   r4, r2, ip, r4          /* A0 += W4*row[4] + W6*row[6] */
          smlsd   r7, r2, ip, r7          /* A3 += W4*row[4] - W6*row[6] */
          .endm
  /*
    Compute the partial sums of one IDCT row using only row[0..3]
    (the variant for rows whose row[4..7] are all zero).  Registers are
    written as r0-r11 (APCS a1-a4 = r0-r3, v1-v8 = r4-r11, fp = r11).

    shift = shift amount the caller applies afterwards; only used here to
            pre-add the rounding term 1 << (shift-1) to A0..A3
    r2 = row[2,0]
    r3 = row[3,1]
    ip = w42

    Output in registers r4--r11:
      r4 = A0, r5 = A1, r6 = A2, r7 = A3,
      r8 = B0, r9 = -B1, r10 = B2, r11 = B3
    Clobbers r1, r2, ip, lr.
  */
          .macro  idct_row4 shift
          ldr     lr, w46                 /* lr = W4 | (W6 << 16), PC-relative load */
          ldr     r10, w57                /* r10 = W5 | (W7 << 16) */
          mov     r1, #(1<<(\shift-1))    /* r1 = rounding term */
          smlad   r4, r2, ip, r1          /* A0 = W4*row[0] + W2*row[2] + round */
          smlsd   r7, r2, ip, r1          /* A3 = W4*row[0] - W2*row[2] + round */
          ldr     ip, w13                 /* ip = W1 | (W3 << 16) */
          smlad   r5, r2, lr, r1          /* A1 = W4*row[0] + W6*row[2] + round */
          smlsd   r6, r2, lr, r1          /* A2 = W4*row[0] - W6*row[2] + round */
          smusdx  r11, r3, r10            /* r11 = B3 = W7*row[1] - W5*row[3] */
          smuad   r8, r3, ip              /* r8 = B0 = W1*row[1] + W3*row[3] */
          pkhtb   r2, ip, r10, asr #16    /* r2 = W7 | (W3 << 16) */
          pkhbt   r1, ip, r10, lsl #16    /* r1 = W1 | (W5 << 16) */
          smusdx  r9, r2, r3              /* r9 = -B1 = W7*row[3] - W3*row[1] */
          smusdx  r10, r3, r1             /* r10 = B2 = W5*row[1] - W1*row[3] */
          .endm
  /*
    Final butterflies of a single IDCT row, without any shift.
    Registers are written as r4-r11 (APCS v1-v8, fp = r11).

    Input in registers r4--r11 as produced by idct_row / idct_row4
    (r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3).

    Output:
      ip = A0 + B0    lr  = A0 - B0
      r4 = A1 + B1    r8  = A1 - B1
      r5 = A2 + B2    r9  = A2 - B2
      r6 = A3 + B3    r10 = A3 - B3
    r7 and r11 are left unchanged.
  */
          .macro  idct_finish
          add     ip, r4, r8              /* ip  = A0 + B0 */
          sub     lr, r4, r8              /* lr  = A0 - B0 */
          sub     r4, r5, r9              /* r4  = A1 + B1  (r9 holds -B1) */
          add     r8, r5, r9              /* r8  = A1 - B1 */
          add     r5, r6, r10             /* r5  = A2 + B2 */
          sub     r9, r6, r10             /* r9  = A2 - B2 */
          add     r6, r7, r11             /* r6  = A3 + B3 */
          sub     r10, r7, r11            /* r10 = A3 - B3 */
          .endm
  /*
    Final butterflies of a single IDCT row, each result arithmetically
    shifted right by \shift.  Registers are written as r3-r11
    (APCS a3/a4 = r2/r3, v1-v8 = r4-r11, fp = r11).

    shift = right-shift amount
    Input in registers r4--r11 as produced by idct_row / idct_row4
    (r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3).

    Output (all >> shift):
      r4 = A0 + B0    r8  = A0 - B0
      r5 = A1 + B1    r9  = A1 - B1
      r6 = A2 + B2    r10 = A2 - B2
      r7 = A3 + B3    r11 = A3 - B3
    Clobbers r2, r3.
  */
          .macro  idct_finish_shift shift
          add     r3, r4, r8              /* r3 = A0 + B0 */
          sub     r2, r4, r8              /* r2 = A0 - B0 */
          mov     r4, r3, asr #\shift
          mov     r8, r2, asr #\shift

          sub     r3, r5, r9              /* r3 = A1 + B1  (r9 holds -B1) */
          add     r2, r5, r9              /* r2 = A1 - B1 */
          mov     r5, r3, asr #\shift
          mov     r9, r2, asr #\shift

          add     r3, r6, r10             /* r3 = A2 + B2 */
          sub     r2, r6, r10             /* r2 = A2 - B2 */
          mov     r6, r3, asr #\shift
          mov     r10, r2, asr #\shift

          add     r3, r7, r11             /* r3 = A3 + B3 */
          sub     r2, r7, r11             /* r2 = A3 - B3 */
          mov     r7, r3, asr #\shift
          mov     r11, r2, asr #\shift
          .endm
  /*
    Final butterflies of a single IDCT row; each result is shifted right
    by \shift and saturated to an unsigned 8-bit value (usat #8).
    Registers are written as r3-r11 (APCS a4 = r3, v1-v8 = r4-r11,
    fp = r11).

    shift = right-shift amount
    Input in registers r4--r11 as produced by idct_row / idct_row4
    (r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3).

    Output (shifted and saturated):
      r4 = A0 + B0    r8  = A0 - B0
      r5 = A1 + B1    r9  = A1 - B1
      r6 = A2 + B2    r10 = A2 - B2
      r7 = A3 + B3    r11 = A3 - B3
    Clobbers r3, ip.
  */
          .macro  idct_finish_shift_sat shift
          add     r3, r4, r8              /* r3 = A0 + B0 */
          sub     ip, r4, r8              /* ip = A0 - B0 */
          usat    r4, #8, r3, asr #\shift
          usat    r8, #8, ip, asr #\shift

          sub     r3, r5, r9              /* r3 = A1 + B1  (r9 holds -B1) */
          add     ip, r5, r9              /* ip = A1 - B1 */
          usat    r5, #8, r3, asr #\shift
          usat    r9, #8, ip, asr #\shift

          add     r3, r6, r10             /* r3 = A2 + B2 */
          sub     ip, r6, r10             /* ip = A2 - B2 */
          usat    r6, #8, r3, asr #\shift
          usat    r10, #8, ip, asr #\shift

          add     r3, r7, r11             /* r3 = A3 + B3 */
          sub     ip, r7, r11             /* ip = A3 - B3 */
          usat    r7, #8, r3, asr #\shift
          usat    r11, #8, ip, asr #\shift
          .endm
  /*
    Compute the IDCT of a single row and store the eight results as a
    column: 16-bit values written 16 bytes apart, starting at dest.
    Registers are written as r0-r11 (APCS a1-a4 = r0-r3, v1-v8 = r4-r11).

    r0 = source (four words: row[2,0], row[6,4], row[3,1], row[7,5])
    r1 = dest

    Three paths:
      - only row[0] non-zero: row[0] << 3 is stored to all eight outputs;
      - row[4..7] all zero:   idct_row4;
      - otherwise:            idct_row.
    r0 and r1 are unchanged on return.  r2, r3, ip are clobbered, and so
    are r4-r11 except on the row[0]-only path.
  */
          .align
          .type   idct_row_armv6, %function
          .func   idct_row_armv6
  idct_row_armv6:
          push    {lr}

          ldr     lr, [r0, #12]           /* lr = row[7,5] */
          ldr     ip, [r0, #4]            /* ip = row[6,4] */
          ldr     r3, [r0, #8]            /* r3 = row[3,1] */
          ldr     r2, [r0]                /* r2 = row[2,0] */
          orrs    lr, lr, ip              /* lr = row[7,5] | row[6,4]; Z if rows 4-7 are zero */
          cmpeq   lr, r3                  /* ...and row[3,1] is zero too? */
          cmpeq   lr, r2, lsr #16         /* ...and row[2] is zero too? */
          beq     1f                      /* only row[0] can be non-zero */
          push    {r1}                    /* idct_row/idct_row4 use r1 as scratch */
          ldr     ip, w42                 /* ip = W4 | (W2 << 16) */
          cmp     lr, #0
          beq     2f                      /* rows 4-7 zero: short variant */

          idct_row ROW_SHIFT
          b       3f

  2:      idct_row4 ROW_SHIFT

  3:      pop     {r1}                    /* r1 = dest again */
          idct_finish_shift ROW_SHIFT

          strh    r4, [r1]                /* (A0 + B0) >> ROW_SHIFT */
          strh    r5, [r1, #(16*2)]       /* (A1 + B1) >> ROW_SHIFT */
          strh    r6, [r1, #(16*4)]       /* (A2 + B2) >> ROW_SHIFT */
          strh    r7, [r1, #(16*6)]       /* (A3 + B3) >> ROW_SHIFT */
          strh    r11, [r1, #(16*1)]      /* (A3 - B3) >> ROW_SHIFT */
          strh    r10, [r1, #(16*3)]      /* (A2 - B2) >> ROW_SHIFT */
          strh    r9, [r1, #(16*5)]       /* (A1 - B1) >> ROW_SHIFT */
          strh    r8, [r1, #(16*7)]       /* (A0 - B0) >> ROW_SHIFT */

          pop     {pc}

          /* Row with only row[0] set: every output is row[0] << 3. */
  1:      mov     r2, r2, lsl #3
          strh    r2, [r1]
          strh    r2, [r1, #(16*1)]
          strh    r2, [r1, #(16*2)]
          strh    r2, [r1, #(16*3)]
          strh    r2, [r1, #(16*4)]
          strh    r2, [r1, #(16*5)]
          strh    r2, [r1, #(16*6)]
          strh    r2, [r1, #(16*7)]
          pop     {pc}
          .endfunc
  /*
    Compute the IDCT of a single column (read from memory laid out as a
    row) and store the eight results as 16-bit values, 16 bytes apart.
    Registers are written as r0-r11 (APCS a1-a4 = r0-r3, v1-v8 = r4-r11).

    r0 = source
    r1 = dest

    r0 and r1 are unchanged on return; r2, r3, r4-r11 and ip are clobbered.
  */
          .align
          .type   idct_col_armv6, %function
          .func   idct_col_armv6
  idct_col_armv6:
          push    {r1, lr}                /* idct_row uses r1 as scratch */

          ldr     r2, [r0]                /* r2 = row[2,0] */
          ldr     ip, w42                 /* ip = W4 | (W2 << 16) */
          ldr     r3, [r0, #8]            /* r3 = row[3,1] */
          idct_row COL_SHIFT
          pop     {r1}                    /* r1 = dest again */
          idct_finish_shift COL_SHIFT

          strh    r4, [r1]                /* (A0 + B0) >> COL_SHIFT */
          strh    r5, [r1, #(16*1)]       /* (A1 + B1) >> COL_SHIFT */
          strh    r6, [r1, #(16*2)]       /* (A2 + B2) >> COL_SHIFT */
          strh    r7, [r1, #(16*3)]       /* (A3 + B3) >> COL_SHIFT */
          strh    r11, [r1, #(16*4)]      /* (A3 - B3) >> COL_SHIFT */
          strh    r10, [r1, #(16*5)]      /* (A2 - B2) >> COL_SHIFT */
          strh    r9, [r1, #(16*6)]       /* (A1 - B1) >> COL_SHIFT */
          strh    r8, [r1, #(16*7)]       /* (A0 - B0) >> COL_SHIFT */

          pop     {pc}
          .endfunc
  /*
    Compute the IDCT of a single column (read from memory laid out as a
    row), saturate each result to unsigned 8 bits and store the eight
    bytes one line apart.
    Registers are written as r0-r11 (APCS a1-a4 = r0-r3, v1-v8 = r4-r11).

    r0 = source
    r1 = dest
    r2 = line size

    r0, r1 and r2 are unchanged on return; r3, r4-r11 and ip are clobbered.
  */
          .align
          .type   idct_col_put_armv6, %function
          .func   idct_col_put_armv6
  idct_col_put_armv6:
          push    {r1, r2, lr}            /* idct_row uses r1/r2 as scratch */

          ldr     r2, [r0]                /* r2 = row[2,0] */
          ldr     ip, w42                 /* ip = W4 | (W2 << 16) */
          ldr     r3, [r0, #8]            /* r3 = row[3,1] */
          idct_row COL_SHIFT
          pop     {r1, r2}                /* r1 = dest, r2 = line size */
          idct_finish_shift_sat COL_SHIFT

          /* Each store post-increments r1 by one line. */
          strb    r4, [r1], r2            /* line 0: A0 + B0 */
          strb    r5, [r1], r2            /* line 1: A1 + B1 */
          strb    r6, [r1], r2            /* line 2: A2 + B2 */
          strb    r7, [r1], r2            /* line 3: A3 + B3 */
          strb    r11, [r1], r2           /* line 4: A3 - B3 */
          strb    r10, [r1], r2           /* line 5: A2 - B2 */
          strb    r9, [r1], r2            /* line 6: A1 - B1 */
          strb    r8, [r1], r2            /* line 7: A0 - B0 */

          sub     r1, r1, r2, lsl #3      /* undo the eight post-increments */

          pop     {pc}
          .endfunc
  /*
    Compute the IDCT of a single column (read from memory laid out as a
    row), add each result (>> COL_SHIFT) to the byte already at the
    destination, saturate to unsigned 8 bits and store it back.  The
    eight destination bytes are one line apart.
    Registers are written as r0-r11 (APCS a1-a4 = r0-r3, v1-v8 = r4-r11).

    r0 = source
    r1 = dest
    r2 = line size

    r0, r1 and r2 are unchanged on return; r3, r4-r11 and ip are clobbered.

    After idct_finish the results live in:
      ip = out0, r4 = out1, r5 = out2, r6 = out3,
      r10 = out4, r9 = out5, r8 = out6, lr = out7
    Destination loads are interleaved with the arithmetic; r1 advances by
    one line with every store, so "[r1, r2]" is always the next line and
    "[r1, r2, lsl #2]" the line four below the current one.

    An earlier load of the line-4 byte into r11 right after the first two
    loads was removed: r11 was overwritten by the line-5 load before it
    was ever read, so that load had no effect.
  */
          .align
          .type   idct_col_add_armv6, %function
          .func   idct_col_add_armv6
  idct_col_add_armv6:
          push    {r1, r2, lr}            /* idct_row uses r1/r2 as scratch */

          ldr     r2, [r0]                /* r2 = row[2,0] */
          ldr     ip, w42                 /* ip = W4 | (W2 << 16) */
          ldr     r3, [r0, #8]            /* r3 = row[3,1] */
          idct_row COL_SHIFT
          pop     {r1, r2}                /* r1 = dest, r2 = line size */
          idct_finish

          ldrb    r3, [r1]                /* r3 = dest line 0 */
          ldrb    r7, [r1, r2]            /* r7 = dest line 1 */
          add     ip, r3, ip, asr #COL_SHIFT
          usat    ip, #8, ip              /* line 0 result */
          add     r4, r7, r4, asr #COL_SHIFT
          strb    ip, [r1], r2            /* store line 0; r1 -> line 1 */
          ldrb    ip, [r1, r2]            /* ip = dest line 2 */
          usat    r4, #8, r4              /* line 1 result */
          ldrb    r11, [r1, r2, lsl #2]   /* r11 = dest line 5 */
          add     r5, ip, r5, asr #COL_SHIFT
          usat    r5, #8, r5              /* line 2 result */
          strb    r4, [r1], r2            /* store line 1; r1 -> line 2 */
          ldrb    r3, [r1, r2]            /* r3 = dest line 3 */
          ldrb    ip, [r1, r2, lsl #2]    /* ip = dest line 6 */
          strb    r5, [r1], r2            /* store line 2; r1 -> line 3 */
          ldrb    r7, [r1, r2]            /* r7 = dest line 4 */
          ldrb    r4, [r1, r2, lsl #2]    /* r4 = dest line 7 */
          add     r6, r3, r6, asr #COL_SHIFT
          usat    r6, #8, r6              /* line 3 result */
          add     r10, r7, r10, asr #COL_SHIFT
          usat    r10, #8, r10            /* line 4 result */
          add     r9, r11, r9, asr #COL_SHIFT
          usat    r9, #8, r9              /* line 5 result */
          add     r8, ip, r8, asr #COL_SHIFT
          usat    r8, #8, r8              /* line 6 result */
          add     lr, r4, lr, asr #COL_SHIFT
          usat    lr, #8, lr              /* line 7 result */
          strb    r6, [r1], r2            /* store lines 3..7 */
          strb    r10, [r1], r2
          strb    r9, [r1], r2
          strb    r8, [r1], r2
          strb    lr, [r1], r2

          sub     r1, r1, r2, lsl #3      /* undo the eight post-increments */

          pop     {pc}
          .endfunc
  /*
    Run eight IDCT row transforms by calling \func eight times.
    Registers are written as r0/r1 (APCS a1/a2).

    func  = IDCT row->col function, called with r0 = source, r1 = dest
    width = amount added to r1 (dest) between calls, in bytes

    The source rows are 16 bytes each and are visited in the order
    0, 2, 4, 6, 1, 3, 5, 7 (byte offsets 0, 32, 64, 96, 16, 48, 80, 112).
    On exit r0 is back at its initial value; r1 is left at its initial
    value + 7*width.
  */
          .macro  idct_rows func width
          /* rows 0, 2, 4 */
          .rept   3
          bl      \func
          add     r0, r0, #(16*2)
          add     r1, r1, #\width
          .endr

          /* row 6, then step back to row 1 */
          bl      \func
          sub     r0, r0, #(16*5)
          add     r1, r1, #\width

          /* rows 1, 3, 5 */
          .rept   3
          bl      \func
          add     r0, r0, #(16*2)
          add     r1, r1, #\width
          .endr

          /* row 7, then rewind r0 to row 0 */
          bl      \func
          sub     r0, r0, #(16*7)
          .endm
  /*
    void ff_simple_idct_armv6(DCTELEM *data);

    In-place 8x8 IDCT.  The row pass writes its output, transposed, into a
    128-byte temporary block on the stack; the column pass then reads that
    block and writes the final 16-bit results back over data.

    r0 = data
    Saves and restores r4-r11 (APCS v1-v8).
  */
          .align
          .global ff_simple_idct_armv6
          .type   ff_simple_idct_armv6, %function
          .func   ff_simple_idct_armv6
  ff_simple_idct_armv6:
          push    {r4-r11, lr}
          sub     sp, sp, #128            /* temporary 8x8 block of 16-bit values */

          mov     r1, sp                  /* row pass: data -> temp */
          idct_rows idct_row_armv6, 2
          mov     r1, r0                  /* column pass: temp -> data */
          mov     r0, sp
          idct_rows idct_col_armv6, 2

          add     sp, sp, #128
          pop     {r4-r11, pc}
          .endfunc
  /*
    ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);

    8x8 IDCT of data, with the result added to the bytes at dest and
    stored back saturated to 8 bits.  The row pass writes into a 128-byte
    temporary block on the stack; the column pass reads that block and
    updates dest through idct_col_add_armv6.

    r0 = dest, r1 = line_size, r2 = data
    Saves and restores r4-r11 (APCS v1-v8).
  */
          .align
          .global ff_simple_idct_add_armv6
          .type   ff_simple_idct_add_armv6, %function
          .func   ff_simple_idct_add_armv6
  ff_simple_idct_add_armv6:
          push    {r0, r1, r4-r11, lr}    /* dest and line_size are kept on the stack */
          sub     sp, sp, #128            /* temporary 8x8 block of 16-bit values */

          mov     r0, r2                  /* row pass: data -> temp */
          mov     r1, sp
          idct_rows idct_row_armv6, 2
          mov     r0, sp                  /* column pass: temp -> dest */
          ldr     r1, [sp, #128]          /* r1 = dest */
          ldr     r2, [sp, #(128+4)]      /* r2 = line_size */
          idct_rows idct_col_add_armv6, 1

          add     sp, sp, #(128+8)        /* drop temp block and the saved r0/r1 */
          pop     {r4-r11, pc}
          .endfunc
  /*
    ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);

    8x8 IDCT of data, with the result saturated to 8 bits and stored at
    dest.  The row pass writes into a 128-byte temporary block on the
    stack; the column pass reads that block and writes dest through
    idct_col_put_armv6.

    r0 = dest, r1 = line_size, r2 = data
    Saves and restores r4-r11 (APCS v1-v8).
  */
          .align
          .global ff_simple_idct_put_armv6
          .type   ff_simple_idct_put_armv6, %function
          .func   ff_simple_idct_put_armv6
  ff_simple_idct_put_armv6:
          push    {r0, r1, r4-r11, lr}    /* dest and line_size are kept on the stack */
          sub     sp, sp, #128            /* temporary 8x8 block of 16-bit values */

          mov     r0, r2                  /* row pass: data -> temp */
          mov     r1, sp
          idct_rows idct_row_armv6, 2
          mov     r0, sp                  /* column pass: temp -> dest */
          ldr     r1, [sp, #128]          /* r1 = dest */
          ldr     r2, [sp, #(128+4)]      /* r2 = line_size */
          idct_rows idct_col_put_armv6, 1

          add     sp, sp, #(128+8)        /* drop temp block and the saved r0/r1 */
          pop     {r4-r11, pc}
          .endfunc
  403. .endfunc