You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

434 lines
13KB

  1. /*
  2. * Simple IDCT
  3. *
  4. * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  5. * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
  6. *
  7. * This file is part of FFmpeg.
  8. *
  9. * FFmpeg is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation; either
  12. * version 2.1 of the License, or (at your option) any later version.
  13. *
  14. * FFmpeg is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with FFmpeg; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. */
  23. #include "asm.S"
  /*
   * Fixed-point IDCT weights, Wi = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5.
   * NOTE(review): W4 is 16383, one less than the 16384 the formula gives;
   * presumably deliberate -- confirm before changing it.
   */
  #define W1 22725
  #define W2 21407
  #define W3 19266
  #define W4 16383
  #define W5 12873
  #define W6 8867
  #define W7 4520

  /* Shift counts handed to the row pass and the column pass respectively. */
  #define ROW_SHIFT 11
  #define COL_SHIFT 20

  /*
   * Two weights packed into one word: first index in the low halfword,
   * second index in the high halfword.  W42n holds the negated W4/W2 pair;
   * the low half is masked so its sign bits do not leak into the high half.
   */
  #define W13  (W1 | (W3 << 16))
  #define W26  (W2 | (W6 << 16))
  #define W42  (W4 | (W2 << 16))
  #define W42n (((-W4) & 0xffff) | ((-W2) << 16))
  #define W46  (W4 | (W6 << 16))
  #define W57  (W5 | (W7 << 16))

          .text
          .align
  /* Literal table; the code below loads these words PC-relative. */
  w13:    .long   W13
  w26:    .long   W26
  w42:    .long   W42
  w42n:   .long   W42n
  w46:    .long   W46
  w57:    .long   W57
  /*
   * idct_row shift
   *
   * First stage of the 8-point IDCT on one row of packed 16-bit
   * coefficients: accumulates the even-part sums A0..A3 and the odd-part
   * sums B0..B3.
   *
   * shift  shift count of the pass; used here only to form the rounding
   *        bias 1 << (shift-1) that seeds A0..A3
   *
   * In:    a1 = row address (row[7,5] at +12 and row[6,4] at +4 are
   *             loaded inside the macro)
   *        a3 = row[2,0]  (row[0] in the low halfword), used right away
   *        a4 = row[3,1]
   *        ip = w42 = W4 | (W2 << 16), used right away
   * Out:   v1 = A0, v2 = A1, v3 = A2, v4 = A3
   *        v5 = B0, v6 = -B1, v7 = B2, fp = B3
   * Clobbers a2, a3, a4, ip, lr.  a1 is not modified.
   */
          .macro  idct_row shift
          ldr     lr, w46                 /* lr = W4 | (W6 << 16) */
          mov     a2, #(1<<(\shift-1))    /* a2 = rounding bias */
          smlad   v1, a3, ip, a2          /* A0 = bias + W4*row[0] + W2*row[2] */
          smlsd   v4, a3, ip, a2          /* A3 = bias + W4*row[0] - W2*row[2] */
          ldr     ip, w13                 /* ip = W1 | (W3 << 16) */
          ldr     v7, w57                 /* v7 = W5 | (W7 << 16) */
          smlad   v2, a3, lr, a2          /* A1 = bias + W4*row[0] + W6*row[2] */
          smlsd   v3, a3, lr, a2          /* A2 = bias + W4*row[0] - W6*row[2] */

          smuad   v5, a4, ip              /* B0 = W1*row[1] + W3*row[3] */
          smusdx  fp, a4, v7              /* B3 = W7*row[1] - W5*row[3] */
          ldr     lr, [a1, #12]           /* lr = row[7,5] */
          pkhtb   a3, ip, v7, asr #16     /* a3 = W7 | (W3 << 16) */
          pkhbt   a2, ip, v7, lsl #16     /* a2 = W1 | (W5 << 16) */
          smusdx  v6, a3, a4              /* -B1 = W7*row[3] - W3*row[1] */
          smlad   v5, lr, v7, v5          /* B0 += W5*row[5] + W7*row[7] */
          smusdx  v7, a4, a2              /* B2 = W5*row[1] - W1*row[3] */

          ldr     a4, w42n                /* a4 = -W4 | (-W2 << 16) */
          smlad   v7, lr, a3, v7          /* B2 += W7*row[5] + W3*row[7] */
          ldr     a3, [a1, #4]            /* a3 = row[6,4] */
          smlsdx  fp, lr, ip, fp          /* B3 += W3*row[5] - W1*row[7] */
          ldr     ip, w46                 /* ip = W4 | (W6 << 16) */
          smlad   v6, lr, a2, v6          /* -B1 += W1*row[5] + W5*row[7] */

          smlad   v2, a3, a4, v2          /* A1 += -W4*row[4] - W2*row[6] */
          smlsd   v3, a3, a4, v3          /* A2 += -W4*row[4] + W2*row[6] */
          smlad   v1, a3, ip, v1          /* A0 +=  W4*row[4] + W6*row[6] */
          smlsd   v4, a3, ip, v4          /* A3 +=  W4*row[4] - W6*row[6] */
          .endm
  /*
   * idct_row4 shift
   *
   * Reduced form of idct_row that reads only row[0..3]; no terms for
   * row[4..7] are accumulated and nothing is loaded from memory except
   * the weight literals.
   *
   * shift  shift count of the pass; used here only to form the rounding
   *        bias 1 << (shift-1) that seeds A0..A3
   *
   * In:    a3 = row[2,0]  (row[0] in the low halfword)
   *        a4 = row[3,1]
   *        ip = w42 = W4 | (W2 << 16)
   * Out:   v1 = A0, v2 = A1, v3 = A2, v4 = A3
   *        v5 = B0, v6 = -B1, v7 = B2, fp = B3
   * Clobbers a2, a3, ip, lr.
   */
          .macro  idct_row4 shift
          ldr     lr, w46                 /* lr = W4 | (W6 << 16) */
          ldr     v7, w57                 /* v7 = W5 | (W7 << 16) */
          mov     a2, #(1<<(\shift-1))    /* a2 = rounding bias */
          smlad   v1, a3, ip, a2          /* A0 = bias + W4*row[0] + W2*row[2] */
          smlsd   v4, a3, ip, a2          /* A3 = bias + W4*row[0] - W2*row[2] */
          ldr     ip, w13                 /* ip = W1 | (W3 << 16) */
          smlad   v2, a3, lr, a2          /* A1 = bias + W4*row[0] + W6*row[2] */
          smlsd   v3, a3, lr, a2          /* A2 = bias + W4*row[0] - W6*row[2] */
          smusdx  fp, a4, v7              /* B3 = W7*row[1] - W5*row[3] */
          smuad   v5, a4, ip              /* B0 = W1*row[1] + W3*row[3] */
          pkhtb   a3, ip, v7, asr #16     /* a3 = W7 | (W3 << 16) */
          pkhbt   a2, ip, v7, lsl #16     /* a2 = W1 | (W5 << 16) */
          smusdx  v6, a3, a4              /* -B1 = W7*row[3] - W3*row[1] */
          smusdx  v7, a4, a2              /* B2 = W5*row[1] - W1*row[3] */
          .endm
  /*
   * idct_finish
   *
   * Final butterflies of one row, leaving the eight results unshifted.
   *
   * In:    v1..v4 = A0..A3, v5 = B0, v6 = -B1, v7 = B2, fp = B3
   * Out:   ip = A0 + B0    lr = A0 - B0
   *        v1 = A1 + B1    v5 = A1 - B1
   *        v2 = A2 + B2    v6 = A2 - B2
   *        v3 = A3 + B3    v7 = A3 - B3
   *
   * v6 carries -B1 on entry, which is why the A1 pair uses sub for the sum
   * and add for the difference.
   */
          .macro  idct_finish
          sub     lr, v1, v5              /* lr = A0 - B0 */
          add     ip, v1, v5              /* ip = A0 + B0 */
          add     v5, v2, v6              /* v5 = A1 - B1 */
          sub     v1, v2, v6              /* v1 = A1 + B1 */
          sub     v6, v3, v7              /* v6 = A2 - B2 */
          add     v2, v3, v7              /* v2 = A2 + B2 */
          sub     v7, v4, fp              /* v7 = A3 - B3 */
          add     v3, v4, fp              /* v3 = A3 + B3 */
          .endm
  /*
   * idct_finish_shift shift
   *
   * Final butterflies of one row, each result arithmetically shifted right
   * by `shift` bits.
   *
   * In:    v1..v4 = A0..A3, v5 = B0, v6 = -B1, v7 = B2, fp = B3
   * Out:   v1 = (A0 + B0) >> shift    v5 = (A0 - B0) >> shift
   *        v2 = (A1 + B1) >> shift    v6 = (A1 - B1) >> shift
   *        v3 = (A2 + B2) >> shift    v7 = (A2 - B2) >> shift
   *        v4 = (A3 + B3) >> shift    fp = (A3 - B3) >> shift
   * Clobbers a3, a4.
   */
          .macro  idct_finish_shift shift
          sub     a3, v1, v5              /* a3 = A0 - B0 */
          add     a4, v1, v5              /* a4 = A0 + B0 */
          mov     v5, a3, asr #\shift
          mov     v1, a4, asr #\shift

          add     a3, v2, v6              /* a3 = A1 - B1  (v6 holds -B1) */
          sub     a4, v2, v6              /* a4 = A1 + B1 */
          mov     v6, a3, asr #\shift
          mov     v2, a4, asr #\shift

          sub     a3, v3, v7              /* a3 = A2 - B2 */
          add     a4, v3, v7              /* a4 = A2 + B2 */
          mov     v7, a3, asr #\shift
          mov     v3, a4, asr #\shift

          sub     a3, v4, fp              /* a3 = A3 - B3 */
          add     a4, v4, fp              /* a4 = A3 + B3 */
          mov     fp, a3, asr #\shift
          mov     v4, a4, asr #\shift
          .endm
  /*
   * idct_finish_shift_sat shift
   *
   * Final butterflies of one row; each result is shifted right by `shift`
   * bits and saturated to an unsigned 8-bit value by usat.
   *
   * In:    v1..v4 = A0..A3, v5 = B0, v6 = -B1, v7 = B2, fp = B3
   * Out:   v1 = sat8((A0 + B0) >> shift)    v5 = sat8((A0 - B0) >> shift)
   *        v2 = sat8((A1 + B1) >> shift)    v6 = sat8((A1 - B1) >> shift)
   *        v3 = sat8((A2 + B2) >> shift)    v7 = sat8((A2 - B2) >> shift)
   *        v4 = sat8((A3 + B3) >> shift)    fp = sat8((A3 - B3) >> shift)
   * Clobbers a4, ip.
   */
          .macro  idct_finish_shift_sat shift
          sub     ip, v1, v5              /* ip = A0 - B0 */
          add     a4, v1, v5              /* a4 = A0 + B0 */
          usat    v5, #8, ip, asr #\shift
          usat    v1, #8, a4, asr #\shift

          add     ip, v2, v6              /* ip = A1 - B1  (v6 holds -B1) */
          sub     a4, v2, v6              /* a4 = A1 + B1 */
          usat    v6, #8, ip, asr #\shift
          usat    v2, #8, a4, asr #\shift

          sub     ip, v3, v7              /* ip = A2 - B2 */
          add     a4, v3, v7              /* a4 = A2 + B2 */
          usat    v7, #8, ip, asr #\shift
          usat    v3, #8, a4, asr #\shift

          sub     ip, v4, fp              /* ip = A3 - B3 */
          add     a4, v4, fp              /* a4 = A3 + B3 */
          usat    fp, #8, ip, asr #\shift
          usat    v4, #8, a4, asr #\shift
          .endm
  /*
   * idct_row_armv6
   *
   * Transform one row and scatter the eight 16-bit results down a column
   * of the destination, 16 bytes apart.  Results 0..7 are written to byte
   * offsets 16*0, 16*2, 16*4, 16*6, 16*1, 16*3, 16*5, 16*7 respectively.
   *
   * In:    a1 = source row
   *        a2 = destination
   * a1 and a2 are unchanged on return.
   * Clobbers a3, a4, ip, v1-v7, fp.
   *
   * Three paths:
   *   - only row[0] non-zero: every output is row[0] << 3
   *   - row[4..7] all zero:   idct_row4
   *   - otherwise:            idct_row
   */
  function idct_row_armv6
          push    {lr}

          ldr     lr, [a1, #12]           /* lr = row[7,5] */
          ldr     ip, [a1, #4]            /* ip = row[6,4] */
          ldr     a4, [a1, #8]            /* a4 = row[3,1] */
          ldr     a3, [a1]                /* a3 = row[2,0] */
          orrs    lr, lr, ip              /* Z set if row[4..7] are all zero */
          cmpeq   lr, a4                  /* ... and row[1], row[3] are zero */
          cmpeq   lr, a3, lsr #16         /* ... and row[2] is zero */
          beq     .Lrow_dc_only

          push    {a2}                    /* a2 is scratch inside the macros */
          ldr     ip, w42                 /* ip = W4 | (W2 << 16) */
          cmp     lr, #0                  /* lr still holds row[7,5] | row[6,4] */
          beq     .Lrow_low_half

          idct_row   ROW_SHIFT
          b       .Lrow_finish

  .Lrow_low_half:
          idct_row4  ROW_SHIFT

  .Lrow_finish:
          pop     {a2}
          idct_finish_shift ROW_SHIFT

          strh    v1, [a2, #(16*0)]
          strh    fp, [a2, #(16*1)]
          strh    v2, [a2, #(16*2)]
          strh    v7, [a2, #(16*3)]
          strh    v3, [a2, #(16*4)]
          strh    v6, [a2, #(16*5)]
          strh    v4, [a2, #(16*6)]
          strh    v5, [a2, #(16*7)]

          pop     {pc}

  .Lrow_dc_only:
          mov     a3, a3, lsl #3          /* low halfword = row[0] << 3 */
          strh    a3, [a2, #(16*0)]
          strh    a3, [a2, #(16*1)]
          strh    a3, [a2, #(16*2)]
          strh    a3, [a2, #(16*3)]
          strh    a3, [a2, #(16*4)]
          strh    a3, [a2, #(16*5)]
          strh    a3, [a2, #(16*6)]
          strh    a3, [a2, #(16*7)]

          pop     {pc}
          .endfunc
  /*
   * idct_col_armv6
   *
   * Transform one column, which is read as a contiguous row of eight
   * 16-bit values, and store the eight 16-bit results 16 bytes apart
   * starting at the destination.
   *
   * In:    a1 = source
   *        a2 = destination
   * a1 and a2 are unchanged on return.
   * Clobbers a3, a4, ip, v1-v7, fp.
   */
  function idct_col_armv6
          push    {a2, lr}                /* a2 is scratch inside idct_row */

          ldr     a3, [a1]                /* a3 = row[2,0] */
          ldr     ip, w42                 /* ip = W4 | (W2 << 16) */
          ldr     a4, [a1, #8]            /* a4 = row[3,1] */
          idct_row COL_SHIFT
          pop     {a2}
          idct_finish_shift COL_SHIFT

          strh    v1, [a2, #(16*0)]
          strh    v2, [a2, #(16*1)]
          strh    v3, [a2, #(16*2)]
          strh    v4, [a2, #(16*3)]
          strh    fp, [a2, #(16*4)]
          strh    v7, [a2, #(16*5)]
          strh    v6, [a2, #(16*6)]
          strh    v5, [a2, #(16*7)]

          pop     {pc}
          .endfunc
  /*
   * idct_col_put_armv6
   *
   * Transform one column, which is read as a contiguous row of eight
   * 16-bit values, saturate each result to unsigned 8 bits and store the
   * bytes one line apart starting at the destination.
   *
   * In:    a1 = source
   *        a2 = destination
   *        a3 = line size
   * a1, a2 and a3 are unchanged on return.
   * Clobbers a4, ip, v1-v7, fp.
   */
  function idct_col_put_armv6
          push    {a2, a3, lr}            /* a2/a3 are scratch inside idct_row */

          ldr     a3, [a1]                /* a3 = row[2,0] */
          ldr     ip, w42                 /* ip = W4 | (W2 << 16) */
          ldr     a4, [a1, #8]            /* a4 = row[3,1] */
          idct_row COL_SHIFT
          pop     {a2, a3}
          idct_finish_shift_sat COL_SHIFT

          /* Each store post-increments a2 by one line. */
          strb    v1, [a2], a3
          strb    v2, [a2], a3
          strb    v3, [a2], a3
          strb    v4, [a2], a3
          strb    fp, [a2], a3
          strb    v7, [a2], a3
          strb    v6, [a2], a3
          strb    v5, [a2], a3

          sub     a2, a2, a3, lsl #3      /* undo the eight line steps */

          pop     {pc}
          .endfunc
  /*
   * idct_col_add_armv6
   *
   * Transform one column, which is read as a contiguous row of eight
   * 16-bit values, add each result (shifted right by COL_SHIFT) to the
   * byte already at the destination, saturate to unsigned 8 bits and
   * store it back.  Destination bytes are one line apart.
   *
   * In:    a1 = source
   *        a2 = destination
   *        a3 = line size
   * a1, a2 and a3 are unchanged on return.
   * Clobbers a4, ip, v1-v7, fp.
   *
   * After idct_finish the unshifted results for lines 0..7 are held in
   * ip, v1, v2, v3, v7, v6, v5, lr.  Loads of destination bytes are
   * interleaved with the arithmetic; a2 advances by one line per store,
   * so each load comment names the absolute line it fetches.
   *
   * Compared with the previous revision, a load of line 4 into fp that
   * was overwritten before being read has been dropped; line 4 is
   * fetched into v4 further down.
   */
  function idct_col_add_armv6
          stmfd   sp!, {a2, a3, lr}       /* a2/a3 are scratch inside idct_row */

          ldr     a3, [a1]                /* a3 = row[2,0] */
          ldr     ip, [pc, #(w42-.-8)]    /* ip = W4 | (W2 << 16) */
          ldr     a4, [a1, #8]            /* a4 = row[3,1] */
          idct_row COL_SHIFT
          ldmfd   sp!, {a2, a3}
          idct_finish

          ldrb    a4, [a2]                /* a4 = dest line 0 */
          ldrb    v4, [a2, a3]            /* v4 = dest line 1 */
          add     ip, a4, ip, asr #COL_SHIFT
          usat    ip, #8, ip
          add     v1, v4, v1, asr #COL_SHIFT
          strb    ip, [a2], a3            /* line 0 done, a2 -> line 1 */
          ldrb    ip, [a2, a3]            /* ip = dest line 2 */
          usat    v1, #8, v1
          ldrb    fp, [a2, a3, lsl #2]    /* fp = dest line 5 */
          add     v2, ip, v2, asr #COL_SHIFT
          usat    v2, #8, v2
          strb    v1, [a2], a3            /* line 1 done, a2 -> line 2 */
          ldrb    a4, [a2, a3]            /* a4 = dest line 3 */
          ldrb    ip, [a2, a3, lsl #2]    /* ip = dest line 6 */
          strb    v2, [a2], a3            /* line 2 done, a2 -> line 3 */
          ldrb    v4, [a2, a3]            /* v4 = dest line 4 */
          ldrb    v1, [a2, a3, lsl #2]    /* v1 = dest line 7 */
          add     v3, a4, v3, asr #COL_SHIFT
          usat    v3, #8, v3
          add     v7, v4, v7, asr #COL_SHIFT
          usat    v7, #8, v7
          add     v6, fp, v6, asr #COL_SHIFT
          usat    v6, #8, v6
          add     v5, ip, v5, asr #COL_SHIFT
          usat    v5, #8, v5
          add     lr, v1, lr, asr #COL_SHIFT
          usat    lr, #8, lr
          strb    v3, [a2], a3            /* line 3 */
          strb    v7, [a2], a3            /* line 4 */
          strb    v6, [a2], a3            /* line 5 */
          strb    v5, [a2], a3            /* line 6 */
          strb    lr, [a2], a3            /* line 7 */

          sub     a2, a2, a3, lsl #3      /* undo the eight line steps */

          ldr     pc, [sp], #4
          .endfunc
  /*
   * idct_rows func, width
   *
   * Call `func` eight times.  Between calls a2 advances by `width` bytes
   * and a1 steps through the eight 16-byte source rows in the order
   * 0, 2, 4, 6, 1, 3, 5, 7.
   *
   * func   routine taking a1 = source and a2 = destination; it must leave
   *        both unchanged
   * width  byte step applied to a2 between calls
   *
   * On exit a1 is back at its entry value and a2 is 7*width past its
   * entry value.  lr is overwritten by the bl instructions.
   */
          .macro  idct_rows func width
          .rept   3                       /* rows 0, 2, 4 */
          bl      \func
          add     a1, a1, #(16*2)
          add     a2, a2, #\width
          .endr

          bl      \func                   /* row 6 */
          sub     a1, a1, #(16*5)         /* step back from row 6 to row 1 */
          add     a2, a2, #\width

          .rept   3                       /* rows 1, 3, 5 */
          bl      \func
          add     a1, a1, #(16*2)
          add     a2, a2, #\width
          .endr

          bl      \func                   /* row 7 */

          sub     a1, a1, #(16*7)         /* rewind a1 to row 0 */
          .endm
  /*
   * void ff_simple_idct_armv6(DCTELEM *data);
   *
   * In-place 8x8 IDCT.  The row pass reads `data` and writes a 128-byte
   * scratch block on the stack; the column pass reads that scratch block
   * and writes 16-bit results back to `data`.
   *
   * In:    a1 = data
   * v1-v7 and fp are saved and restored here because the row and column
   * helpers overwrite them.
   */
  function ff_simple_idct_armv6, export=1
          push    {v1-v7, fp, lr}
          sub     sp, sp, #128            /* scratch block */

          mov     a2, sp                  /* row pass: data -> scratch */
          idct_rows idct_row_armv6, 2

          mov     a2, a1                  /* column pass: scratch -> data */
          mov     a1, sp
          idct_rows idct_col_armv6, 2

          add     sp, sp, #128
          pop     {v1-v7, fp, pc}
          .endfunc
  /*
   * ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);
   *
   * 8x8 IDCT of `data`, with the result added to the bytes at `dest` and
   * saturated to unsigned 8 bits.  The row pass writes a 128-byte scratch
   * block on the stack; the column pass adds into `dest`, one byte column
   * per call.
   *
   * In:    a1 = dest, a2 = line_size, a3 = data
   * dest and line_size are parked on the stack during the row pass and
   * discarded, not reloaded into a1/a2, on exit.
   */
  function ff_simple_idct_add_armv6, export=1
          push    {a1, a2, v1-v7, fp, lr}
          sub     sp, sp, #128            /* scratch block */

          mov     a1, a3                  /* row pass: data -> scratch */
          mov     a2, sp
          idct_rows idct_row_armv6, 2

          mov     a1, sp                  /* column pass: scratch -> dest */
          ldr     a2, [sp, #128]          /* a2 = saved dest */
          ldr     a3, [sp, #(128+4)]      /* a3 = saved line_size */
          idct_rows idct_col_add_armv6, 1

          add     sp, sp, #(128+8)        /* drop scratch and the saved a1/a2 */
          pop     {v1-v7, fp, pc}
          .endfunc
  /*
   * ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);
   *
   * 8x8 IDCT of `data`, with the result saturated to unsigned 8 bits and
   * stored at `dest`.  The row pass writes a 128-byte scratch block on the
   * stack; the column pass stores into `dest`, one byte column per call.
   *
   * In:    a1 = dest, a2 = line_size, a3 = data
   * dest and line_size are parked on the stack during the row pass and
   * discarded, not reloaded into a1/a2, on exit.
   */
  function ff_simple_idct_put_armv6, export=1
          push    {a1, a2, v1-v7, fp, lr}
          sub     sp, sp, #128            /* scratch block */

          mov     a1, a3                  /* row pass: data -> scratch */
          mov     a2, sp
          idct_rows idct_row_armv6, 2

          mov     a1, sp                  /* column pass: scratch -> dest */
          ldr     a2, [sp, #128]          /* a2 = saved dest */
          ldr     a3, [sp, #(128+4)]      /* a3 = saved line_size */
          idct_rows idct_col_put_armv6, 1

          add     sp, sp, #(128+8)        /* drop scratch and the saved a1/a2 */
          pop     {v1-v7, fp, pc}
          .endfunc