You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

435 lines
13KB

  1. /*
  2. * Simple IDCT
  3. *
  4. * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  5. * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
  6. *
  7. * This file is part of FFmpeg.
  8. *
  9. * FFmpeg is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation; either
  12. * version 2.1 of the License, or (at your option) any later version.
  13. *
  14. * FFmpeg is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with FFmpeg; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. */
  23. #include "asm.S"
  /*
   * IDCT weights in 14-bit fixed point.  The formula given for each one is
   *     Wi = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
   * W4 is kept at 16383, one below the 16384 that formula produces.
   */
  #define W1 22725
  #define W2 21407
  #define W3 19266
  #define W4 16383
  #define W5 12873
  #define W6 8867
  #define W7 4520

  /* Shift amounts handed to the row pass and the column pass. */
  #define ROW_SHIFT 11
  #define COL_SHIFT 20

  /* Two 16-bit weights in one word: lo in bits 0-15, hi in bits 16-31. */
  #define IDCT_PACK(lo, hi) ((lo) | ((hi) << 16))

  #define W13  IDCT_PACK(W1, W3)
  #define W26  IDCT_PACK(W2, W6)
  #define W42  IDCT_PACK(W4, W2)
  /* Same layout as W42 with both halves negated; low half masked to 16 bits. */
  #define W42n (((-W4) & 0xffff) | ((-W2) << 16))
  #define W46  IDCT_PACK(W4, W6)
  #define W57  IDCT_PACK(W5, W7)

  /* Packed weight words, read with label-relative ldr by the code below. */
          .text
          .align
  w13:    .long   W13
  w26:    .long   W26
  w42:    .long   W42
  w42n:   .long   W42n
  w46:    .long   W46
  w57:    .long   W57
  /*
   * idct_row: accumulate the even (A0-A3) and odd (B0-B3) partial sums of
   * one 8-coefficient row.
   *
   * shift = the right shift the callers in this file apply to the results
   *         afterwards; the macro itself only uses it to seed A0-A3 with
   *         the rounding bias 1 << (shift - 1)
   *
   * Inputs:
   *   r0 = source address (row[6,4] is read from +4, row[7,5] from +12)
   *   r2 = row[2,0]            (original note: "<= 2 cycles")
   *   r3 = row[3,1]
   *   ip = W4 | (W2 << 16)     (original note: "<= 2 cycles")
   *
   * Outputs:
   *   r4 = A0    r5 = A1     r6  = A2    r7  = A3
   *   r8 = B0    r9 = -B1    r10 = B2    r11 = B3
   *
   * r1, r2, r3, ip and lr are overwritten.
   */
  .macro idct_row shift
          ldr     lr, w46                 /* lr  = W4 | (W6 << 16) */
          mov     r1, #(1<<(\shift-1))    /* r1  = rounding bias */
          smlad   r4, r2, ip, r1          /* A0  = bias + W4*row[0] + W2*row[2] */
          smlsd   r7, r2, ip, r1          /* A3  = bias + W4*row[0] - W2*row[2] */
          ldr     ip, w13                 /* ip  = W1 | (W3 << 16) */
          ldr     r10, w57                /* r10 = W5 | (W7 << 16) */
          smlad   r5, r2, lr, r1          /* A1  = bias + W4*row[0] + W6*row[2] */
          smlsd   r6, r2, lr, r1          /* A2  = bias + W4*row[0] - W6*row[2] */

          smuad   r8, r3, ip              /* B0  = W1*row[1] + W3*row[3] */
          smusdx  r11, r3, r10            /* B3  = W7*row[1] - W5*row[3] */
          ldr     lr, [r0, #12]           /* lr  = row[7,5] */
          pkhtb   r2, ip, r10, asr #16    /* r2  = W7 | (W3 << 16) */
          pkhbt   r1, ip, r10, lsl #16    /* r1  = W1 | (W5 << 16) */
          smusdx  r9, r2, r3              /* -B1 = W7*row[3] - W3*row[1] */
          smlad   r8, lr, r10, r8         /* B0 += W5*row[5] + W7*row[7] */
          smusdx  r10, r3, r1             /* B2  = W5*row[1] - W1*row[3] */

          ldr     r3, w42n                /* r3  = -W4 | (-W2 << 16) */
          smlad   r10, lr, r2, r10        /* B2 += W7*row[5] + W3*row[7] */
          ldr     r2, [r0, #4]            /* r2  = row[6,4] */
          smlsdx  r11, lr, ip, r11        /* B3 += W3*row[5] - W1*row[7] */
          ldr     ip, w46                 /* ip  = W4 | (W6 << 16) */
          smlad   r9, lr, r1, r9          /* -B1 += W1*row[5] + W5*row[7] */

          smlad   r5, r2, r3, r5          /* A1 += -W4*row[4] - W2*row[6] */
          smlsd   r6, r2, r3, r6          /* A2 += -W4*row[4] + W2*row[6] */
          smlad   r4, r2, ip, r4          /* A0 +=  W4*row[4] + W6*row[6] */
          smlsd   r7, r2, ip, r7          /* A3 +=  W4*row[4] - W6*row[6] */
  .endm
  /*
   * idct_row4: partial IDCT of a half row.  Only row[0]-row[3] enter the
   * sums and no memory at r0 is read.
   *
   * shift = used only to seed A0-A3 with the rounding bias 1 << (shift - 1)
   *
   * Inputs:
   *   r2 = row[2,0]
   *   r3 = row[3,1]
   *   ip = W4 | (W2 << 16)
   *
   * Outputs:
   *   r4 = A0    r5 = A1     r6  = A2    r7  = A3
   *   r8 = B0    r9 = -B1    r10 = B2    r11 = B3
   *
   * r1, r2, ip and lr are overwritten.
   */
  .macro idct_row4 shift
          ldr     lr, w46                 /* lr  = W4 | (W6 << 16) */
          ldr     r10, w57                /* r10 = W5 | (W7 << 16) */
          mov     r1, #(1<<(\shift-1))    /* r1  = rounding bias */
          smlad   r4, r2, ip, r1          /* A0  = bias + W4*row[0] + W2*row[2] */
          smlsd   r7, r2, ip, r1          /* A3  = bias + W4*row[0] - W2*row[2] */
          ldr     ip, w13                 /* ip  = W1 | (W3 << 16) */
          smlad   r5, r2, lr, r1          /* A1  = bias + W4*row[0] + W6*row[2] */
          smlsd   r6, r2, lr, r1          /* A2  = bias + W4*row[0] - W6*row[2] */

          smusdx  r11, r3, r10            /* B3  = W7*row[1] - W5*row[3] */
          smuad   r8, r3, ip              /* B0  = W1*row[1] + W3*row[3] */
          pkhtb   r2, ip, r10, asr #16    /* r2  = W7 | (W3 << 16) */
          pkhbt   r1, ip, r10, lsl #16    /* r1  = W1 | (W5 << 16) */
          smusdx  r9, r2, r3              /* -B1 = W7*row[3] - W3*row[1] */
          smusdx  r10, r3, r1             /* B2  = W5*row[1] - W1*row[3] */
  .endm
  /*
   * idct_finish: combine the partial sums into the eight unshifted results.
   *
   * Inputs:
   *   r4 = A0    r5 = A1     r6  = A2    r7  = A3
   *   r8 = B0    r9 = -B1    r10 = B2    r11 = B3
   *
   * Outputs:
   *   ip = A0 + B0    r4 = A1 + B1    r5 = A2 + B2    r6  = A3 + B3
   *   lr = A0 - B0    r8 = A1 - B1    r9 = A2 - B2    r10 = A3 - B3
   *
   * Each input register is read by its pair of instructions before a later
   * instruction reuses it as a destination, so the order below matters.
   */
  .macro idct_finish
          add     ip, r4, r8              /* ip  = A0 + B0 */
          sub     lr, r4, r8              /* lr  = A0 - B0 */

          sub     r4, r5, r9              /* r4  = A1 + B1  (r9 holds -B1) */
          add     r8, r5, r9              /* r8  = A1 - B1 */

          add     r5, r6, r10             /* r5  = A2 + B2 */
          sub     r9, r6, r10             /* r9  = A2 - B2 */

          add     r6, r7, r11             /* r6  = A3 + B3 */
          sub     r10, r7, r11            /* r10 = A3 - B3 */
  .endm
  /*
   * idct_shift_pair: one butterfly of idct_finish_shift.
   *   first, second = mnemonics (add/sub) producing the two combinations
   *   ra, rb        = input registers, overwritten with the shifted results
   *   shift         = arithmetic right shift applied to both results
   * Uses r3 and r2 as temporaries.
   */
  .macro idct_shift_pair first second ra rb shift
          \first  r3, \ra, \rb
          \second r2, \ra, \rb
          mov     \ra, r3, asr #\shift
          mov     \rb, r2, asr #\shift
  .endm

  /*
   * idct_finish_shift: combine the partial sums and shift the results right.
   *
   * shift = arithmetic right-shift amount
   *
   * Inputs:
   *   r4 = A0    r5 = A1     r6  = A2    r7  = A3
   *   r8 = B0    r9 = -B1    r10 = B2    r11 = B3
   *
   * Outputs (each shifted right by `shift`):
   *   r4 = A0 + B0    r5 = A1 + B1    r6  = A2 + B2    r7  = A3 + B3
   *   r8 = A0 - B0    r9 = A1 - B1    r10 = A2 - B2    r11 = A3 - B3
   *
   * r2 and r3 are overwritten.
   */
  .macro idct_finish_shift shift
          idct_shift_pair add, sub, r4, r8, \shift    /* A0 +/- B0 */
          idct_shift_pair sub, add, r5, r9, \shift    /* A1 +/- B1, r9 = -B1 */
          idct_shift_pair add, sub, r6, r10, \shift   /* A2 +/- B2 */
          idct_shift_pair add, sub, r7, r11, \shift   /* A3 +/- B3 */
  .endm
  /*
   * idct_sat_pair: one butterfly of idct_finish_shift_sat.
   *   first, second = mnemonics (add/sub) producing the two combinations
   *   ra, rb        = input registers, overwritten with the results
   *   shift         = arithmetic right shift applied before saturation
   * Uses r3 and ip as temporaries.
   */
  .macro idct_sat_pair first second ra rb shift
          \first  r3, \ra, \rb
          \second ip, \ra, \rb
          usat    \ra, #8, r3, asr #\shift
          usat    \rb, #8, ip, asr #\shift
  .endm

  /*
   * idct_finish_shift_sat: combine the partial sums, shift right and
   * saturate every result to an unsigned 8-bit value.
   *
   * shift = arithmetic right-shift amount
   *
   * Inputs:
   *   r4 = A0    r5 = A1     r6  = A2    r7  = A3
   *   r8 = B0    r9 = -B1    r10 = B2    r11 = B3
   *
   * Outputs (shifted, then saturated to 8 bits):
   *   r4 = A0 + B0    r5 = A1 + B1    r6  = A2 + B2    r7  = A3 + B3
   *   r8 = A0 - B0    r9 = A1 - B1    r10 = A2 - B2    r11 = A3 - B3
   *
   * r3 and ip are overwritten.
   */
  .macro idct_finish_shift_sat shift
          idct_sat_pair add, sub, r4, r8, \shift      /* A0 +/- B0 */
          idct_sat_pair sub, add, r5, r9, \shift      /* A1 +/- B1, r9 = -B1 */
          idct_sat_pair add, sub, r6, r10, \shift     /* A2 +/- B2 */
          idct_sat_pair add, sub, r7, r11, \shift     /* A3 +/- B3 */
  .endm
  /*
   * idct_row_armv6: IDCT of one row, written out as a column.
   *
   *   r0 = source row
   *   r1 = destination; the eight halfword results are stored at byte
   *        offsets 16*k from r1
   *
   * Three paths are taken depending on which coefficients are zero:
   *   - everything except row[0] is zero: all eight outputs are row[0] << 3
   *   - row[4]-row[7] are zero:           idct_row4
   *   - otherwise:                        idct_row
   *
   * r0 and r1 are unchanged on return; r2, r3, ip and r4-r11 are overwritten.
   */
  function idct_row_armv6
          push    {lr}

          ldr     lr, [r0, #12]           /* lr = row[7,5] */
          ldr     ip, [r0, #4]            /* ip = row[6,4] */
          ldr     r3, [r0, #8]            /* r3 = row[3,1] */
          ldr     r2, [r0]                /* r2 = row[2,0] */
          orrs    lr, lr, ip              /* lr == 0  <=>  row[4..7] all zero */
          itt     eq
          cmpeq   lr, r3                  /* ... and row[1], row[3] zero */
          cmpeq   lr, r2, lsr #16         /* ... and row[2] zero */
          beq     .Lidct_row_dc_only

          push    {r1}                    /* the row macros overwrite r1 */
          ldr     ip, w42                 /* ip = W4 | (W2 << 16) */
          cmp     lr, #0
          beq     .Lidct_row_half

          idct_row ROW_SHIFT
          b       .Lidct_row_store

  .Lidct_row_half:
          idct_row4 ROW_SHIFT

  .Lidct_row_store:
          pop     {r1}
          idct_finish_shift ROW_SHIFT

          strh    r4, [r1]                /* A0 + B0 */
          strh    r5, [r1, #(16*2)]       /* A1 + B1 */
          strh    r6, [r1, #(16*4)]       /* A2 + B2 */
          strh    r7, [r1, #(16*6)]       /* A3 + B3 */
          strh    r11, [r1, #(16*1)]      /* A3 - B3 */
          strh    r10, [r1, #(16*3)]      /* A2 - B2 */
          strh    r9, [r1, #(16*5)]       /* A1 - B1 */
          strh    r8, [r1, #(16*7)]       /* A0 - B0 */

          pop     {pc}

  .Lidct_row_dc_only:
          mov     r2, r2, lsl #3          /* low halfword = row[0] << 3 */
          .irp    slot, 0, 2, 4, 6, 1, 3, 5, 7
          strh    r2, [r1, #(16*\slot)]
          .endr

          pop     {pc}
  endfunc
  /*
   * idct_col_armv6: IDCT of one column (read from memory laid out as a row),
   * storing eight halfword results.
   *
   *   r0 = source
   *   r1 = destination; results go to byte offsets 16*k from r1
   *
   * r0 and r1 are unchanged on return; r2, r3, ip and r4-r11 are overwritten.
   */
  function idct_col_armv6
          push    {r1, lr}                /* idct_row overwrites r1 */

          ldr     r2, [r0]                /* r2 = row[2,0] */
          ldr     ip, w42                 /* ip = W4 | (W2 << 16) */
          ldr     r3, [r0, #8]            /* r3 = row[3,1] */
          idct_row COL_SHIFT
          pop     {r1}
          idct_finish_shift COL_SHIFT

          strh    r4, [r1]                /* A0 + B0 */
          strh    r5, [r1, #(16*1)]       /* A1 + B1 */
          strh    r6, [r1, #(16*2)]       /* A2 + B2 */
          strh    r7, [r1, #(16*3)]       /* A3 + B3 */
          strh    r11, [r1, #(16*4)]      /* A3 - B3 */
          strh    r10, [r1, #(16*5)]      /* A2 - B2 */
          strh    r9, [r1, #(16*6)]       /* A1 - B1 */
          strh    r8, [r1, #(16*7)]       /* A0 - B0 */

          pop     {pc}                    /* return through the saved lr */
  endfunc
  /*
   * idct_col_put_armv6: IDCT of one column (read from memory laid out as a
   * row); the results are saturated to 8 bits and stored as bytes.
   *
   *   r0 = source
   *   r1 = destination; one byte is written on each of eight lines
   *   r2 = line size (distance between lines)
   *
   * r0, r1 and r2 are unchanged on return; r3, ip and r4-r11 are overwritten.
   */
  function idct_col_put_armv6
          push    {r1, r2, lr}            /* idct_row overwrites r1 and r2 */

          ldr     r2, [r0]                /* r2 = row[2,0] */
          ldr     ip, w42                 /* ip = W4 | (W2 << 16) */
          ldr     r3, [r0, #8]            /* r3 = row[3,1] */
          idct_row COL_SHIFT
          pop     {r1, r2}
          idct_finish_shift_sat COL_SHIFT

          /* Lines 0-7 in order; every store advances r1 by one line. */
          .irp    pixel, r4, r5, r6, r7, r11, r10, r9, r8
          strb_post \pixel, r1, r2
          .endr

          sub     r1, r1, r2, lsl #3      /* undo the eight line steps */

          pop     {pc}
  endfunc
  /*
   * idct_col_add_armv6: IDCT of one column (read from memory laid out as a
   * row); each result is shifted right by COL_SHIFT, added to the byte
   * already in the destination, saturated to 8 bits and stored back.
   *
   *   r0 = source
   *   r1 = destination; one byte is updated on each of eight lines
   *   r2 = line size (distance between lines)
   *
   * r0, r1 and r2 are unchanged on return; r3, ip and r4-r11 are overwritten.
   *
   * Loads of destination bytes are interleaved with the arithmetic.  r1
   * moves down one line with every strb_post, so the "line" numbers in the
   * comments are relative to the r1 value on entry.
   */
  function idct_col_add_armv6
          push    {r1, r2, lr}            /* idct_row overwrites r1 and r2 */

          ldr     r2, [r0]                /* r2 = row[2,0] */
          ldr     ip, w42                 /* ip = W4 | (W2 << 16) */
          ldr     r3, [r0, #8]            /* r3 = row[3,1] */
          idct_row COL_SHIFT
          pop     {r1, r2}
          idct_finish                     /* results in ip, r4, r5, r6, r10, r9, r8, lr */

          ldrb    r3, [r1]                /* r3  = dest line 0 */
          ldrb    r7, [r1, r2]            /* r7  = dest line 1 */
          /*
           * An extra load of line 4 into r11 used to sit here.  r11 was
           * reloaded (line 5) before anything read it, so it was removed.
           */
          add     ip, r3, ip, asr #COL_SHIFT
          usat    ip, #8, ip
          add     r4, r7, r4, asr #COL_SHIFT
          strb_post ip, r1, r2            /* line 0 stored, r1 -> line 1 */
          ldrb    ip, [r1, r2]            /* ip  = dest line 2 */
          usat    r4, #8, r4
          ldrb    r11, [r1, r2, lsl #2]   /* r11 = dest line 5 */
          add     r5, ip, r5, asr #COL_SHIFT
          usat    r5, #8, r5
          strb_post r4, r1, r2            /* line 1 stored, r1 -> line 2 */
          ldrb    r3, [r1, r2]            /* r3  = dest line 3 */
          ldrb    ip, [r1, r2, lsl #2]    /* ip  = dest line 6 */
          strb_post r5, r1, r2            /* line 2 stored, r1 -> line 3 */
          ldrb    r7, [r1, r2]            /* r7  = dest line 4 */
          ldrb    r4, [r1, r2, lsl #2]    /* r4  = dest line 7 */
          add     r6, r3, r6, asr #COL_SHIFT
          usat    r6, #8, r6
          add     r10, r7, r10, asr #COL_SHIFT
          usat    r10, #8, r10
          add     r9, r11, r9, asr #COL_SHIFT
          usat    r9, #8, r9
          add     r8, ip, r8, asr #COL_SHIFT
          usat    r8, #8, r8
          add     lr, r4, lr, asr #COL_SHIFT
          usat    lr, #8, lr
          strb_post r6, r1, r2            /* line 3 */
          strb_post r10, r1, r2           /* line 4 */
          strb_post r9, r1, r2            /* line 5 */
          strb_post r8, r1, r2            /* line 6 */
          strb_post lr, r1, r2            /* line 7 */

          sub     r1, r1, r2, lsl #3      /* undo the eight line steps */

          pop     {pc}
  endfunc
  /*
   * idct_rows: call \func eight times, once per row.
   *
   *   func  = row/column routine to call with bl; the macro relies on it
   *           returning with r0 and r1 as they were on entry
   *   width = number of bytes r1 advances between calls
   *
   * r0 visits byte offsets 0, 32, 64, 96, 16, 48, 80, 112 from its starting
   * value and is back at that value afterwards.  r1 ends 7*width bytes past
   * its starting value.
   */
  .macro idct_rows func width
          bl      \func

          .rept   3
          add     r0, r0, #(16*2)
          add     r1, r1, #\width
          bl      \func
          .endr

          sub     r0, r0, #(16*5)         /* from offset 96 back to offset 16 */
          add     r1, r1, #\width
          bl      \func

          .rept   3
          add     r0, r0, #(16*2)
          add     r1, r1, #\width
          bl      \func
          .endr

          sub     r0, r0, #(16*7)         /* from offset 112 back to the start */
  .endm
  /*
   * void ff_simple_idct_armv6(DCTELEM *data);
   *
   * In-place IDCT of the block at r0.  A row pass writes into a 128-byte
   * scratch buffer on the stack; a column pass then reads that buffer and
   * writes the results back over the block.
   */
  function ff_simple_idct_armv6, export=1
          push    {r4-r11, lr}
          sub     sp, sp, #128            /* scratch buffer */

          mov     r1, sp                  /* row pass: data -> scratch */
          idct_rows idct_row_armv6, 2

          mov     r1, r0                  /* column pass: scratch -> data */
          mov     r0, sp
          idct_rows idct_col_armv6, 2

          add     sp, sp, #128
          pop     {r4-r11, pc}
  endfunc
  /*
   * ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);
   *
   * IDCT of the block at r2, added to the bytes at dest with 8-bit
   * saturation.  dest (r0) and line_size (r1) are parked on the stack just
   * above a 128-byte scratch buffer while the row pass runs.
   */
  function ff_simple_idct_add_armv6, export=1
          push    {r0, r1, r4-r11, lr}
          sub     sp, sp, #128            /* scratch buffer */

          mov     r0, r2                  /* row pass: data -> scratch */
          mov     r1, sp
          idct_rows idct_row_armv6, 2

          mov     r0, sp                  /* column pass: scratch -> dest */
          ldr     r1, [sp, #128]          /* r1 = saved dest */
          ldr     r2, [sp, #(128+4)]      /* r2 = saved line_size */
          idct_rows idct_col_add_armv6, 1

          add     sp, sp, #(128+8)        /* drop scratch and saved r0/r1 */
          pop     {r4-r11, pc}
  endfunc
  /*
   * ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);
   *
   * IDCT of the block at r2, stored to dest as bytes saturated to 8 bits.
   * dest (r0) and line_size (r1) are parked on the stack just above a
   * 128-byte scratch buffer while the row pass runs.
   */
  function ff_simple_idct_put_armv6, export=1
          push    {r0, r1, r4-r11, lr}
          sub     sp, sp, #128            /* scratch buffer */

          mov     r0, r2                  /* row pass: data -> scratch */
          mov     r1, sp
          idct_rows idct_row_armv6, 2

          mov     r0, sp                  /* column pass: scratch -> dest */
          ldr     r1, [sp, #128]          /* r1 = saved dest */
          ldr     r2, [sp, #(128+4)]      /* r2 = saved line_size */
          idct_rows idct_col_put_armv6, 1

          add     sp, sp, #(128+8)        /* drop scratch and saved r0/r1 */
          pop     {r4-r11, pc}
  endfunc