You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

426 lines
13KB

  1. /*
  2. * Simple IDCT
  3. *
  4. * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  5. * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
  6. *
  7. * This file is part of Libav.
  8. *
  9. * Libav is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation; either
  12. * version 2.1 of the License, or (at your option) any later version.
  13. *
  14. * Libav is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with Libav; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. */
  23. #include "libavutil/arm/asm.S"
  24. #define W1 22725 /* i = 1: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  25. #define W2 21407 /* i = 2: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  26. #define W3 19266 /* i = 3: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  27. #define W4 16383 /* i = 4: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 evaluates to 16384; NOTE(review): the value here is one less, presumably on purpose -- confirm against the C reference */
  28. #define W5 12873 /* i = 5: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  29. #define W6 8867 /* i = 6: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  30. #define W7 4520 /* i = 7: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  31. #define ROW_SHIFT 11 /* shift argument passed to the row-pass macros by idct_row_armv6 */
  32. #define COL_SHIFT 20 /* shift argument / right shift used by the idct_col_*_armv6 column pass */
  33. #define W13 (W1 | (W3 << 16)) /* halfword pair: W1 in bits 0-15, W3 in bits 16-31 */
  34. #define W26 (W2 | (W6 << 16)) /* halfword pair: W2 low, W6 high */
  35. #define W42 (W4 | (W2 << 16)) /* halfword pair: W4 low, W2 high */
  36. #define W42n (-W4&0xffff | (-W2 << 16)) /* negated pair: -W4 masked to 16 bits in the low half, -W2 in the high half */
  37. #define W46 (W4 | (W6 << 16)) /* halfword pair: W4 low, W6 high */
  38. #define W57 (W5 | (W7 << 16)) /* halfword pair: W5 low, W7 high */
  39. /*
  40. Compute partial IDCT of single row.
      Produces the even-part sums A0..A3 and the odd-part sums B0..B3; combining and shifting them is left to one of the idct_finish* macros.
  41. shift = only used to form the rounding bias 1 << (shift-1) that is added to A0..A3; no shift is performed here
  42. r0 = source address
  43. r2 = row[2,0] <= 2 cycles
  44. r3 = row[3,1]
  45. ip = w42 <= 2 cycles
  46. Output in registers r4--r11: r4 = A0, r5 = A1, r6 = A2, r7 = A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3
      row[6,4] and row[7,5] are loaded from [r0, #4] and [r0, #12] inside the macro.
      Clobbers r1, r2, r3, ip, lr.
  47. */
  48. .macro idct_row shift
  49. ldr lr, =W46 /* lr = W4 | (W6 << 16) */
  50. mov r1, #(1<<(\shift-1)) /* r1 = rounding bias */
  51. smlad r4, r2, ip, r1 /* r4 = A0 = W4*row[0] + W2*row[2] + bias */
  52. smlsd r7, r2, ip, r1 /* r7 = A3 = W4*row[0] - W2*row[2] + bias */
  53. ldr ip, =W13 /* ip = W1 | (W3 << 16) */
  54. ldr r10,=W57 /* r10 = W5 | (W7 << 16) */
  55. smlad r5, r2, lr, r1 /* r5 = A1 = W4*row[0] + W6*row[2] + bias */
  56. smlsd r6, r2, lr, r1 /* r6 = A2 = W4*row[0] - W6*row[2] + bias */
  57. smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */
  58. smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */
  59. ldr lr, [r0, #12] /* lr = row[7,5] */
  60. pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */
  61. pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */
  62. smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */
  63. smlad r8, lr, r10,r8 /* B0 += W5*row[5] + W7*row[7] */
  64. smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */
  65. ldr r3, =W42n /* r3 = -W4 | (-W2 << 16) */
  66. smlad r10,lr, r2, r10 /* B2 += W7*row[5] + W3*row[7] */
  67. ldr r2, [r0, #4] /* r2 = row[6,4] */
  68. smlsdx r11,lr, ip, r11 /* B3 += W3*row[5] - W1*row[7] */
  69. ldr ip, =W46 /* ip = W4 | (W6 << 16) */
  70. smlad r9, lr, r1, r9 /* B1 -= W1*row[5] + W5*row[7] */
  71. smlad r5, r2, r3, r5 /* A1 += -W4*row[4] - W2*row[6] */
  72. smlsd r6, r2, r3, r6 /* A2 += -W4*row[4] + W2*row[6] */
  73. smlad r4, r2, ip, r4 /* A0 += W4*row[4] + W6*row[6] */
  74. smlsd r7, r2, ip, r7 /* A3 += W4*row[4] - W6*row[6] */
  75. .endm
  76. /*
  77. Compute partial IDCT of half row.
      Same sums as idct_row but built from row[0..3] only; nothing is loaded from memory, so r0 is not used.
  78. shift = only used to form the rounding bias 1 << (shift-1) that is added to A0..A3; no shift is performed here
  79. r2 = row[2,0]
  80. r3 = row[3,1]
  81. ip = w42
  82. Output in registers r4--r11: r4 = A0, r5 = A1, r6 = A2, r7 = A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3
      Clobbers r1, r2, ip, lr.
  83. */
  84. .macro idct_row4 shift
  85. ldr lr, =W46 /* lr = W4 | (W6 << 16) */
  86. ldr r10,=W57 /* r10 = W5 | (W7 << 16) */
  87. mov r1, #(1<<(\shift-1)) /* r1 = rounding bias */
  88. smlad r4, r2, ip, r1 /* r4 = A0 = W4*row[0] + W2*row[2] + bias */
  89. smlsd r7, r2, ip, r1 /* r7 = A3 = W4*row[0] - W2*row[2] + bias */
  90. ldr ip, =W13 /* ip = W1 | (W3 << 16) */
  91. smlad r5, r2, lr, r1 /* r5 = A1 = W4*row[0] + W6*row[2] + bias */
  92. smlsd r6, r2, lr, r1 /* r6 = A2 = W4*row[0] - W6*row[2] + bias */
  93. smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */
  94. smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */
  95. pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */
  96. pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */
  97. smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */
  98. smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */
  99. .endm
  100. /*
  101. Compute final part of IDCT single row without shift.
  102. Input in registers r4--r11 (r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3, as left by idct_row)
  103. Output in registers ip, r4--r6, lr, r8--r10
       r7 and r11 are read but not written. Because r9 arrives negated, the A1 pair uses sub for the sum and add for the difference.
  104. */
  105. .macro idct_finish
  106. add ip, r4, r8 /* ip = A0 + B0 */
  107. sub lr, r4, r8 /* lr = A0 - B0 */
  108. sub r4, r5, r9 /* r4 = A1 + B1 (r9 holds -B1) */
  109. add r8, r5, r9 /* r8 = A1 - B1 */
  110. add r5, r6, r10 /* r5 = A2 + B2 */
  111. sub r9, r6, r10 /* r9 = A2 - B2 */
  112. add r6, r7, r11 /* r6 = A3 + B3 */
  113. sub r10,r7, r11 /* r10 = A3 - B3 */
  114. .endm
  115. /*
  116. Compute final part of IDCT single row.
  117. shift = right-shift amount (arithmetic shift applied to every sum and difference)
  118. Input/output in registers r4--r11
       In:  r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3
       Out: r4 = A0+B0, r5 = A1+B1, r6 = A2+B2, r7 = A3+B3, r8 = A0-B0, r9 = A1-B1, r10 = A2-B2, r11 = A3-B3, each shifted right by shift
       Clobbers r2, r3.
  119. */
  120. .macro idct_finish_shift shift
  121. add r3, r4, r8 /* r3 = A0 + B0 */
  122. sub r2, r4, r8 /* r2 = A0 - B0 */
  123. mov r4, r3, asr #\shift /* r4 = (A0 + B0) >> shift */
  124. mov r8, r2, asr #\shift /* r8 = (A0 - B0) >> shift */
  125. sub r3, r5, r9 /* r3 = A1 + B1 */
  126. add r2, r5, r9 /* r2 = A1 - B1 */
  127. mov r5, r3, asr #\shift /* r5 = (A1 + B1) >> shift */
  128. mov r9, r2, asr #\shift /* r9 = (A1 - B1) >> shift */
  129. add r3, r6, r10 /* r3 = A2 + B2 */
  130. sub r2, r6, r10 /* r2 = A2 - B2 */
  131. mov r6, r3, asr #\shift /* r6 = (A2 + B2) >> shift */
  132. mov r10,r2, asr #\shift /* r10 = (A2 - B2) >> shift */
  133. add r3, r7, r11 /* r3 = A3 + B3 */
  134. sub r2, r7, r11 /* r2 = A3 - B3 */
  135. mov r7, r3, asr #\shift /* r7 = (A3 + B3) >> shift */
  136. mov r11,r2, asr #\shift /* r11 = (A3 - B3) >> shift */
  137. .endm
  138. /*
  139. Compute final part of IDCT single row, saturating results at 8 bits.
  140. shift = right-shift amount (applied by usat before the unsigned 8-bit saturation)
  141. Input/output in registers r4--r11
       In:  r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3
       Out: same register assignment as idct_finish_shift (r4..r7 = sums, r8..r11 = differences), each value shifted and saturated
       Clobbers r3, ip.
  142. */
  143. .macro idct_finish_shift_sat shift
  144. add r3, r4, r8 /* r3 = A0 + B0 */
  145. sub ip, r4, r8 /* ip = A0 - B0 */
  146. usat r4, #8, r3, asr #\shift /* r4 = usat8((A0 + B0) >> shift) */
  147. usat r8, #8, ip, asr #\shift /* r8 = usat8((A0 - B0) >> shift) */
  148. sub r3, r5, r9 /* r3 = A1 + B1 */
  149. add ip, r5, r9 /* ip = A1 - B1 */
  150. usat r5, #8, r3, asr #\shift /* r5 = usat8((A1 + B1) >> shift) */
  151. usat r9, #8, ip, asr #\shift /* r9 = usat8((A1 - B1) >> shift) */
  152. add r3, r6, r10 /* r3 = A2 + B2 */
  153. sub ip, r6, r10 /* ip = A2 - B2 */
  154. usat r6, #8, r3, asr #\shift /* r6 = usat8((A2 + B2) >> shift) */
  155. usat r10,#8, ip, asr #\shift /* r10 = usat8((A2 - B2) >> shift) */
  156. add r3, r7, r11 /* r3 = A3 + B3 */
  157. sub ip, r7, r11 /* ip = A3 - B3 */
  158. usat r7, #8, r3, asr #\shift /* r7 = usat8((A3 + B3) >> shift) */
  159. usat r11,#8, ip, asr #\shift /* r11 = usat8((A3 - B3) >> shift) */
  160. .endm
  161. /*
  162. Compute IDCT of single row, storing as column.
  163. r0 = source
  164. r1 = dest
       Three paths: all of row[1..7] zero -> DC shortcut; row[4..7] zero -> idct_row4; otherwise idct_row.
       The eight 16-bit results are written 16 bytes apart starting at r1.
       r0 and r1 are unchanged on return; r2, r3, ip and r4-r11 are overwritten (r4-r11 only on the non-shortcut paths).
  165. */
  166. function idct_row_armv6
  167. push {lr} /* lr is used as scratch below */
  168. ldr lr, [r0, #12] /* lr = row[7,5] */
  169. ldr ip, [r0, #4] /* ip = row[6,4] */
  170. ldr r3, [r0, #8] /* r3 = row[3,1] */
  171. ldr r2, [r0] /* r2 = row[2,0] */
  172. orrs lr, lr, ip /* lr = row[7,5] | row[6,4]; Z set when row[4..7] are all zero */
  173. itt eq /* the two compares below only run while everything tested so far was zero */
  174. cmpeq lr, r3 /* lr is 0 here: Z stays set only if row[1] and row[3] are zero */
  175. cmpeq lr, r2, lsr #16 /* lr is 0 here: Z stays set only if row[2] is zero */
  176. beq 1f /* row[1..7] all zero -> DC-only shortcut */
  177. push {r1} /* idct_row / idct_row4 overwrite r1 */
  178. ldr ip, =W42 /* ip = W4 | (W2 << 16) */
  179. cmp lr, #0 /* lr still holds row[7,5] | row[6,4] */
  180. beq 2f /* row[4..7] all zero -> half-row variant */
  181. idct_row ROW_SHIFT
  182. b 3f
  183. 2: idct_row4 ROW_SHIFT
  184. 3: pop {r1} /* restore dest */
  185. idct_finish_shift ROW_SHIFT
  186. strh r4, [r1] /* (A0 + B0) >> ROW_SHIFT */
  187. strh r5, [r1, #(16*2)] /* (A1 + B1) >> ROW_SHIFT */
  188. strh r6, [r1, #(16*4)] /* (A2 + B2) >> ROW_SHIFT */
  189. strh r7, [r1, #(16*6)] /* (A3 + B3) >> ROW_SHIFT */
  190. strh r11,[r1, #(16*1)] /* (A3 - B3) >> ROW_SHIFT */
  191. strh r10,[r1, #(16*3)] /* (A2 - B2) >> ROW_SHIFT */
  192. strh r9, [r1, #(16*5)] /* (A1 - B1) >> ROW_SHIFT */
  193. strh r8, [r1, #(16*7)] /* (A0 - B0) >> ROW_SHIFT */
  194. pop {pc}
  195. 1: mov r2, r2, lsl #3 /* DC shortcut: low halfword of r2 = row[0] << 3 */
  196. strh r2, [r1] /* the same value goes to all eight output slots */
  197. strh r2, [r1, #(16*2)]
  198. strh r2, [r1, #(16*4)]
  199. strh r2, [r1, #(16*6)]
  200. strh r2, [r1, #(16*1)]
  201. strh r2, [r1, #(16*3)]
  202. strh r2, [r1, #(16*5)]
  203. strh r2, [r1, #(16*7)]
  204. pop {pc}
  205. endfunc
  206. /*
  207. Compute IDCT of single column, read as row.
  208. r0 = source
  209. r1 = dest
       Always runs the full idct_row (no zero shortcuts) with COL_SHIFT.
       The eight 16-bit results are written 16 bytes apart starting at r1.
       r0 and r1 are unchanged on return; r2, r3, ip and r4-r11 are overwritten.
  210. */
  211. function idct_col_armv6
  212. push {r1, lr} /* idct_row overwrites both r1 and lr */
  213. ldr r2, [r0] /* r2 = row[2,0] */
  214. ldr ip, =W42 /* ip = W4 | (W2 << 16) */
  215. ldr r3, [r0, #8] /* r3 = row[3,1] */
  216. idct_row COL_SHIFT
  217. pop {r1} /* restore dest; the saved lr stays on the stack for the final pop */
  218. idct_finish_shift COL_SHIFT
  219. strh r4, [r1] /* (A0 + B0) >> COL_SHIFT */
  220. strh r5, [r1, #(16*1)] /* (A1 + B1) >> COL_SHIFT */
  221. strh r6, [r1, #(16*2)] /* (A2 + B2) >> COL_SHIFT */
  222. strh r7, [r1, #(16*3)] /* (A3 + B3) >> COL_SHIFT */
  223. strh r11,[r1, #(16*4)] /* (A3 - B3) >> COL_SHIFT */
  224. strh r10,[r1, #(16*5)] /* (A2 - B2) >> COL_SHIFT */
  225. strh r9, [r1, #(16*6)] /* (A1 - B1) >> COL_SHIFT */
  226. strh r8, [r1, #(16*7)] /* (A0 - B0) >> COL_SHIFT */
  227. pop {pc}
  228. endfunc
  229. /*
  230. Compute IDCT of single column, read as row, store saturated 8-bit.
  231. r0 = source
  232. r1 = dest
  233. r2 = line size
       NOTE(review): strb_post is defined outside this file; it presumably stores a byte at [r1] and then advances r1 by r2.
       The closing "sub r1, r1, r2, lsl #3" (eight line sizes back) is consistent with that -- confirm in libavutil/arm/asm.S.
       r3, ip and r4-r11 are overwritten.
  234. */
  235. function idct_col_put_armv6
  236. push {r1, r2, lr} /* idct_row overwrites r1, r2 and lr */
  237. ldr r2, [r0] /* r2 = row[2,0] */
  238. ldr ip, =W42 /* ip = W4 | (W2 << 16) */
  239. ldr r3, [r0, #8] /* r3 = row[3,1] */
  240. idct_row COL_SHIFT
  241. pop {r1, r2} /* restore dest and line size; the saved lr stays on the stack */
  242. idct_finish_shift_sat COL_SHIFT
  243. strb_post r4, r1, r2 /* usat8((A0 + B0) >> COL_SHIFT) */
  244. strb_post r5, r1, r2 /* usat8((A1 + B1) >> COL_SHIFT) */
  245. strb_post r6, r1, r2 /* usat8((A2 + B2) >> COL_SHIFT) */
  246. strb_post r7, r1, r2 /* usat8((A3 + B3) >> COL_SHIFT) */
  247. strb_post r11,r1, r2 /* usat8((A3 - B3) >> COL_SHIFT) */
  248. strb_post r10,r1, r2 /* usat8((A2 - B2) >> COL_SHIFT) */
  249. strb_post r9, r1, r2 /* usat8((A1 - B1) >> COL_SHIFT) */
  250. strb_post r8, r1, r2 /* usat8((A0 - B0) >> COL_SHIFT) */
  251. sub r1, r1, r2, lsl #3 /* r1 -= 8 * line size */
  252. pop {pc}
  253. endfunc
  254. /*
  255. Compute IDCT of single column, read as row, add/store saturated 8-bit.
  256. r0 = source
  257. r1 = dest
  258. r2 = line size
       Each result is shifted right by COL_SHIFT, added to the byte already at dest, saturated to unsigned 8 bits and stored back.
       NOTE(review): strb_post is defined outside this file; the "dest line N" comments below assume it stores a byte at [r1] and then advances r1 by r2 -- confirm in libavutil/arm/asm.S.
       r3, ip and r4-r11 are overwritten; lr is used as scratch and the return goes through the value saved on the stack.
  259. */
  260. function idct_col_add_armv6
  261. push {r1, r2, lr} /* idct_row overwrites r1, r2 and lr */
  262. ldr r2, [r0] /* r2 = row[2,0] */
  263. ldr ip, =W42 /* ip = W4 | (W2 << 16) */
  264. ldr r3, [r0, #8] /* r3 = row[3,1] */
  265. idct_row COL_SHIFT
  266. pop {r1, r2} /* restore dest and line size; the saved lr stays on the stack */
  267. idct_finish /* ip, r4, r5, r6 = sums 0..3; r10, r9, r8, lr = differences 3..0 */
  268. ldrb r3, [r1] /* r3 = dest line 0 */
  269. ldrb r7, [r1, r2] /* r7 = dest line 1 */
  270. ldrb r11,[r1, r2, lsl #2] /* dest line 4; NOTE(review): r11 is reloaded by the next ldrb into r11 before any use, so this value is never consumed -- leftover or deliberate cache touch? confirm before removing */
  271. add ip, r3, ip, asr #COL_SHIFT /* line 0 + ((A0 + B0) >> COL_SHIFT) */
  272. usat ip, #8, ip
  273. add r4, r7, r4, asr #COL_SHIFT /* line 1 + ((A1 + B1) >> COL_SHIFT) */
  274. strb_post ip, r1, r2 /* store line 0 */
  275. ldrb ip, [r1, r2] /* ip = dest line 2 */
  276. usat r4, #8, r4
  277. ldrb r11,[r1, r2, lsl #2] /* r11 = dest line 5 */
  278. add r5, ip, r5, asr #COL_SHIFT /* line 2 + ((A2 + B2) >> COL_SHIFT) */
  279. usat r5, #8, r5
  280. strb_post r4, r1, r2 /* store line 1 */
  281. ldrb r3, [r1, r2] /* r3 = dest line 3 */
  282. ldrb ip, [r1, r2, lsl #2] /* ip = dest line 6 */
  283. strb_post r5, r1, r2 /* store line 2 */
  284. ldrb r7, [r1, r2] /* r7 = dest line 4 */
  285. ldrb r4, [r1, r2, lsl #2] /* r4 = dest line 7 */
  286. add r6, r3, r6, asr #COL_SHIFT /* line 3 + ((A3 + B3) >> COL_SHIFT) */
  287. usat r6, #8, r6
  288. add r10,r7, r10,asr #COL_SHIFT /* line 4 + ((A3 - B3) >> COL_SHIFT) */
  289. usat r10,#8, r10
  290. add r9, r11,r9, asr #COL_SHIFT /* line 5 + ((A2 - B2) >> COL_SHIFT) */
  291. usat r9, #8, r9
  292. add r8, ip, r8, asr #COL_SHIFT /* line 6 + ((A1 - B1) >> COL_SHIFT) */
  293. usat r8, #8, r8
  294. add lr, r4, lr, asr #COL_SHIFT /* line 7 + ((A0 - B0) >> COL_SHIFT) */
  295. usat lr, #8, lr
  296. strb_post r6, r1, r2 /* store line 3 */
  297. strb_post r10,r1, r2 /* store line 4 */
  298. strb_post r9, r1, r2 /* store line 5 */
  299. strb_post r8, r1, r2 /* store line 6 */
  300. strb_post lr, r1, r2 /* store line 7 */
  301. sub r1, r1, r2, lsl #3 /* r1 -= 8 * line size */
  302. pop {pc}
  303. endfunc
  304. /*
  305. Compute 8 IDCT row transforms.
  306. func = IDCT row->col function
  307. width = width of columns in bytes
       r0 = source and r1 = dest, as expected by func.
       func is called with r0 at byte offsets 0, 32, 64, 96, 16, 48, 80, 112 from its entry value, in that order
       (the row/column functions in this file read 16 bytes at r0, so the even 16-byte rows are done first, then the odd ones).
       r1 is advanced by width between calls.
       On exit r0 is back at its entry value and r1 is entry + 7*width. lr is overwritten by bl.
  308. */
  309. .macro idct_rows func width
  310. bl \func /* source offset 0 */
  311. add r0, r0, #(16*2)
  312. add r1, r1, #\width
  313. bl \func /* source offset 32 */
  314. add r0, r0, #(16*2)
  315. add r1, r1, #\width
  316. bl \func /* source offset 64 */
  317. add r0, r0, #(16*2)
  318. add r1, r1, #\width
  319. bl \func /* source offset 96 */
  320. sub r0, r0, #(16*5) /* 96 - 80 = 16: switch to the odd rows */
  321. add r1, r1, #\width
  322. bl \func /* source offset 16 */
  323. add r0, r0, #(16*2)
  324. add r1, r1, #\width
  325. bl \func /* source offset 48 */
  326. add r0, r0, #(16*2)
  327. add r1, r1, #\width
  328. bl \func /* source offset 80 */
  329. add r0, r0, #(16*2)
  330. add r1, r1, #\width
  331. bl \func /* source offset 112 */
  332. sub r0, r0, #(16*7) /* restore r0 to its entry value */
  333. .endm
  334. /* void ff_simple_idct_armv6(int16_t *data);
       r0 = data. Two passes through a 128-byte stack buffer:
       idct_row_armv6 reads from data and writes to the buffer, then idct_col_armv6 reads the buffer and writes back to data,
       so the result overwrites the input block. */
  335. function ff_simple_idct_armv6, export=1
  336. push {r4-r11, lr} /* the helpers overwrite r4-r11; lr is overwritten by bl */
  337. sub sp, sp, #128 /* 128-byte temporary block */
  338. mov r1, sp /* row pass: source = data (r0), dest = temporary */
  339. idct_rows idct_row_armv6, 2
  340. mov r1, r0 /* column pass: dest = data (idct_rows left r0 at its entry value) */
  341. mov r0, sp /* column pass: source = temporary */
  342. idct_rows idct_col_armv6, 2
  343. add sp, sp, #128 /* release the temporary block */
  344. pop {r4-r11, pc}
  345. endfunc
  346. /* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data);
       r0 = dest, r1 = line_size, r2 = data.
       The row pass (idct_row_armv6) reads data into a 128-byte stack buffer; the column pass (idct_col_add_armv6)
       reads that buffer and updates dest, stepping one byte to the right per column. data is only read. */
  347. function ff_simple_idct_add_armv6, export=1
  348. push {r0, r1, r4-r11, lr} /* dest and line_size are kept on the stack for the column pass */
  349. sub sp, sp, #128 /* 128-byte temporary block; saved r0/r1 are now at sp+128 / sp+132 */
  350. mov r0, r2 /* row pass: source = data */
  351. mov r1, sp /* row pass: dest = temporary */
  352. idct_rows idct_row_armv6, 2
  353. mov r0, sp /* column pass: source = temporary */
  354. ldr r1, [sp, #128] /* r1 = dest (saved r0) */
  355. ldr r2, [sp, #(128+4)] /* r2 = line_size (saved r1) */
  356. idct_rows idct_col_add_armv6, 1
  357. add sp, sp, #(128+8) /* drop the temporary block and the saved r0/r1 */
  358. pop {r4-r11, pc}
  359. endfunc
  360. /* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data);
       r0 = dest, r1 = line_size, r2 = data.
       The row pass (idct_row_armv6) reads data into a 128-byte stack buffer; the column pass (idct_col_put_armv6)
       reads that buffer and stores saturated bytes to dest, stepping one byte to the right per column. data is only read. */
  361. function ff_simple_idct_put_armv6, export=1
  362. push {r0, r1, r4-r11, lr} /* dest and line_size are kept on the stack for the column pass */
  363. sub sp, sp, #128 /* 128-byte temporary block; saved r0/r1 are now at sp+128 / sp+132 */
  364. mov r0, r2 /* row pass: source = data */
  365. mov r1, sp /* row pass: dest = temporary */
  366. idct_rows idct_row_armv6, 2
  367. mov r0, sp /* column pass: source = temporary */
  368. ldr r1, [sp, #128] /* r1 = dest (saved r0) */
  369. ldr r2, [sp, #(128+4)] /* r2 = line_size (saved r1) */
  370. idct_rows idct_col_put_armv6, 1
  371. add sp, sp, #(128+8) /* drop the temporary block and the saved r0/r1 */
  372. pop {r4-r11, pc}
  373. endfunc
  373. endfunc