You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

800 lines
22KB

  1. @
  2. @ ARMv4 optimized DSP utils
  3. @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
  4. @
  5. @ This file is part of FFmpeg.
  6. @
  7. @ FFmpeg is free software; you can redistribute it and/or
  8. @ modify it under the terms of the GNU Lesser General Public
  9. @ License as published by the Free Software Foundation; either
  10. @ version 2.1 of the License, or (at your option) any later version.
  11. @
  12. @ FFmpeg is distributed in the hope that it will be useful,
  13. @ but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. @ Lesser General Public License for more details.
  16. @
  17. @ You should have received a copy of the GNU Lesser General Public
  18. @ License along with FFmpeg; if not, write to the Free Software
  19. @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. @
  21. #include "config.h"
  22. #include "asm.S"
  23. preserve8
  24. #if !HAVE_PLD
      @ Fallback used when HAVE_PLD is false: pld becomes a macro with an empty
      @ body, so every pld line in this file expands to nothing.
  25. .macro pld reg
  26. .endm
  27. #endif
  28. #if HAVE_ARMV5TE
      @ ff_prefetch_arm: issue one pld per iteration.
      @ In: r0 = address passed to pld, r1 = byte step added to r0 after each
      @     pld, r2 = iteration count.
      @ The loop branches back to the function entry itself. r0 and r2 are
      @ modified and nothing is stored to memory.
      @ NOTE(review): the count is decremented before it is tested, so r2 = 0
      @ on entry would wrap around instead of returning at once - confirm that
      @ callers never pass 0.
  29. function ff_prefetch_arm, export=1
  30. subs r2, r2, #1
  31. pld [r0]
  32. add r0, r0, r1
      @ flags still come from the subs above (pld and add do not set them)
  33. bne ff_prefetch_arm
  34. bx lr
  35. .endfunc
  36. #endif
  37. .macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
      @ Builds four words Rd0-Rd3 out of five source words Rn0-Rn4:
      @   Rd(i) = (Rn(i) >> (shift * 8)) | (Rn(i+1) << (32 - shift * 8))
      @ shift is an assemble-time constant; this file uses 1, 2 and 3.
      @ The Rn registers are only read. All four lsr moves are issued before
      @ the four orr instructions.
  38. mov \Rd0, \Rn0, lsr #(\shift * 8)
  39. mov \Rd1, \Rn1, lsr #(\shift * 8)
  40. mov \Rd2, \Rn2, lsr #(\shift * 8)
  41. mov \Rd3, \Rn3, lsr #(\shift * 8)
  42. orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
  43. orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
  44. orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
  45. orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
  46. .endm
  47. .macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2
      @ In-place variant: rewrites R0 and R1 from the three words R0, R1, R2.
      @   R0 = (R0 >> (shift * 8)) | (R1 << (32 - shift * 8))
      @   R1 = (R1 >> (shift * 8)) | (R2 << (32 - shift * 8))
      @ R2 is only read. R1 must still hold its original value when R0 is
      @ completed, which is why R0 is finished before R1 is touched.
  48. mov \R0, \R0, lsr #(\shift * 8)
  49. orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
  50. mov \R1, \R1, lsr #(\shift * 8)
  51. orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
  52. .endm
  53. .macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
      @ Variant with separate destinations:
      @   Rdst0 = (Rsrc0 >> (shift * 8)) | (Rsrc1 << (32 - shift * 8))
      @   Rdst1 = (Rsrc1 >> (shift * 8)) | (Rsrc2 << (32 - shift * 8))
      @ Rsrc0 is read only by the first mov, so Rdst1 may be the same register
      @ as Rsrc0 (RND_XY2_IT relies on this for align 3). Rsrc1 and Rsrc2 are
      @ still read after both destinations have been written, so they must
      @ not overlap Rdst0 or Rdst1.
  54. mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
  55. mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
  56. orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
  57. orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
  58. .endm
  59. .macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
  60. @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
  61. @ Rmask = 0xFEFEFEFE
  62. @ Rn = destroyed (left holding Rn | Rm); Rm and Rmask are only read
      @ Works on two words at once (Rd0/Rn0/Rm0 and Rd1/Rn1/Rm1). Masking
      @ with Rmask clears bit 0 of every byte before the shift, so no bit
      @ crosses into the neighbouring byte: each of the four bytes in a word
      @ becomes the average of the two input bytes, rounded up.
  63. eor \Rd0, \Rn0, \Rm0
  64. eor \Rd1, \Rn1, \Rm1
  65. orr \Rn0, \Rn0, \Rm0
  66. orr \Rn1, \Rn1, \Rm1
  67. and \Rd0, \Rd0, \Rmask
  68. and \Rd1, \Rd1, \Rmask
  69. sub \Rd0, \Rn0, \Rd0, lsr #1
  70. sub \Rd1, \Rn1, \Rd1, lsr #1
  71. .endm
  72. .macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
  73. @ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1)
  74. @ Rmask = 0xFEFEFEFE
  75. @ Rn = destroyed (left holding Rn & Rm); Rm and Rmask are only read
      @ Same layout as RND_AVG32, but the shared bits are taken with and and
      @ the halved difference is added, so each of the four bytes in a word
      @ becomes the average of the two input bytes, rounded down.
  76. eor \Rd0, \Rn0, \Rm0
  77. eor \Rd1, \Rn1, \Rm1
  78. and \Rn0, \Rn0, \Rm0
  79. and \Rn1, \Rn1, \Rm1
  80. and \Rd0, \Rd0, \Rmask
  81. and \Rd1, \Rd1, \Rmask
  82. add \Rd0, \Rn0, \Rd0, lsr #1
  83. add \Rd1, \Rn1, \Rd1, lsr #1
  84. .endm
  85. @ ----------------------------------------------------------------
  86. .align 8
  87. function put_pixels16_arm, export=1
  88. @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  89. @ block = word aligned, pixels = unaligned
      @ Copies h rows of 16 bytes from pixels to block; both pointers advance
      @ by line_size after each row.
      @ In: r0 = block, r1 = pixels, r2 = line_size, r3 = h (row counter)
      @ r4 = pixels & 3 picks the loop: 0 falls through to 1:, while 1, 2, 3
      @ jump through the table at 5: to 2:, 3:, 4:. r1 is rounded down to a
      @ word boundary first, so the unaligned loops load five words and shift
      @ them into four with ADJ_ALIGN_QUADWORD_D.
      @ NOTE(review): GNU as on ARM reads the .align operand as a power-of-two
      @ exponent, so .align 8 would mean 256-byte alignment - confirm intended.
  90. pld [r1]
  91. stmfd sp!, {r4-r11, lr} @ R14 is also called LR
  92. adr r5, 5f
  93. ands r4, r1, #3
  94. bic r1, r1, #3
  95. add r5, r5, r4, lsl #2
      @ Z still comes from the ands above: taken only for an unaligned source
  96. ldrne pc, [r5]
      @ source word aligned: plain four-word copy per row
  97. 1:
  98. ldmia r1, {r4-r7}
  99. add r1, r1, r2
  100. stmia r0, {r4-r7}
  101. pld [r1]
  102. subs r3, r3, #1
  103. add r0, r0, r2
  104. bne 1b
  105. ldmfd sp!, {r4-r11, pc}
  106. .align 8
       @ pixels & 3 == 1: five words in, shifted by one byte into r9-r12
  107. 2:
  108. ldmia r1, {r4-r8}
  109. add r1, r1, r2
  110. ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
  111. pld [r1]
  112. subs r3, r3, #1
  113. stmia r0, {r9-r12}
  114. add r0, r0, r2
  115. bne 2b
  116. ldmfd sp!, {r4-r11, pc}
  117. .align 8
       @ pixels & 3 == 2: shifted by two bytes
  118. 3:
  119. ldmia r1, {r4-r8}
  120. add r1, r1, r2
  121. ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
  122. pld [r1]
  123. subs r3, r3, #1
  124. stmia r0, {r9-r12}
  125. add r0, r0, r2
  126. bne 3b
  127. ldmfd sp!, {r4-r11, pc}
  128. .align 8
       @ pixels & 3 == 3: shifted by three bytes
  129. 4:
  130. ldmia r1, {r4-r8}
  131. add r1, r1, r2
  132. ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
  133. pld [r1]
  134. subs r3, r3, #1
  135. stmia r0, {r9-r12}
  136. add r0, r0, r2
  137. bne 4b
  138. ldmfd sp!, {r4-r11,pc}
  139. .align 8
       @ jump table indexed by pixels & 3; entry 0 is never loaded because the
       @ ldrne above is skipped when the source is aligned
  140. 5:
  141. .word 1b
  142. .word 2b
  143. .word 3b
  144. .word 4b
  145. .endfunc
  146. @ ----------------------------------------------------------------
  147. .align 8
  148. function put_pixels8_arm, export=1
  149. @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  150. @ block = word aligned, pixels = unaligned
       @ Copies h rows of 8 bytes from pixels to block; both pointers advance
       @ by line_size after each row.
       @ In: r0 = block, r1 = pixels, r2 = line_size, r3 = h (row counter)
       @ Dispatch is on pixels & 3 through the table at 5:. The unaligned
       @ loops load three words (r4, r5, r12) from the rounded-down address
       @ and shift them in place into r4, r5 with ADJ_ALIGN_DOUBLEWORD.
  151. pld [r1]
  152. stmfd sp!, {r4-r5,lr} @ R14 is also called LR
  153. adr r5, 5f
  154. ands r4, r1, #3
  155. bic r1, r1, #3
  156. add r5, r5, r4, lsl #2
       @ Z still comes from the ands above: taken only for an unaligned source
  157. ldrne pc, [r5]
       @ source word aligned: plain two-word copy per row
  158. 1:
  159. ldmia r1, {r4-r5}
  160. add r1, r1, r2
  161. subs r3, r3, #1
  162. pld [r1]
  163. stmia r0, {r4-r5}
  164. add r0, r0, r2
  165. bne 1b
  166. ldmfd sp!, {r4-r5,pc}
  167. .align 8
       @ pixels & 3 == 1
  168. 2:
  169. ldmia r1, {r4-r5, r12}
  170. add r1, r1, r2
  171. ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12
  172. pld [r1]
  173. subs r3, r3, #1
  174. stmia r0, {r4-r5}
  175. add r0, r0, r2
  176. bne 2b
  177. ldmfd sp!, {r4-r5,pc}
  178. .align 8
       @ pixels & 3 == 2
  179. 3:
  180. ldmia r1, {r4-r5, r12}
  181. add r1, r1, r2
  182. ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12
  183. pld [r1]
  184. subs r3, r3, #1
  185. stmia r0, {r4-r5}
  186. add r0, r0, r2
  187. bne 3b
  188. ldmfd sp!, {r4-r5,pc}
  189. .align 8
       @ pixels & 3 == 3
  190. 4:
  191. ldmia r1, {r4-r5, r12}
  192. add r1, r1, r2
  193. ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12
  194. pld [r1]
  195. subs r3, r3, #1
  196. stmia r0, {r4-r5}
  197. add r0, r0, r2
  198. bne 4b
  199. ldmfd sp!, {r4-r5,pc}
  200. .align 8
       @ jump table indexed by pixels & 3; entry 0 is never loaded
  201. 5:
  202. .word 1b
  203. .word 2b
  204. .word 3b
  205. .word 4b
  206. .endfunc
  207. @ ----------------------------------------------------------------
  208. .align 8
  209. function put_pixels8_x2_arm, export=1
  210. @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  211. @ block = word aligned, pixels = unaligned
       @ For each of h rows, stores 8 bytes to block where byte i is the
       @ rounded-up average (RND_AVG32) of source bytes i and i+1.
       @ In: r0 = block, r1 = pixels, r2 = line_size, r3 = h (row counter)
       @ r12 = 0xFEFEFEFE, loaded from the first word of the table at 5:
       @ before r5 is advanced to the entry for pixels & 3.
       @ Every loop loads three words from the rounded-down source address
       @ and derives the two 8-byte views, one byte apart, from them.
  212. pld [r1]
  213. stmfd sp!, {r4-r10,lr} @ R14 is also called LR
  214. adr r5, 5f
  215. ands r4, r1, #3
  216. ldr r12, [r5]
  217. add r5, r5, r4, lsl #2
  218. bic r1, r1, #3
       @ Z still comes from the ands above: taken only for an unaligned source
  219. ldrne pc, [r5]
       @ pixels & 3 == 0: r4, r5 are bytes 0-7, r6, r7 become bytes 1-8
  220. 1:
  221. ldmia r1, {r4-r5, r10}
  222. add r1, r1, r2
  223. ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
  224. pld [r1]
  225. RND_AVG32 r8, r9, r4, r5, r6, r7, r12
  226. subs r3, r3, #1
  227. stmia r0, {r8-r9}
  228. add r0, r0, r2
  229. bne 1b
  230. ldmfd sp!, {r4-r10,pc}
  231. .align 8
       @ pixels & 3 == 1: views at byte offsets 1 and 2 of the loaded words
  232. 2:
  233. ldmia r1, {r4-r5, r10}
  234. add r1, r1, r2
  235. ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
  236. ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
  237. pld [r1]
  238. RND_AVG32 r4, r5, r6, r7, r8, r9, r12
  239. subs r3, r3, #1
  240. stmia r0, {r4-r5}
  241. add r0, r0, r2
  242. bne 2b
  243. ldmfd sp!, {r4-r10,pc}
  244. .align 8
       @ pixels & 3 == 2: views at byte offsets 2 and 3
  245. 3:
  246. ldmia r1, {r4-r5, r10}
  247. add r1, r1, r2
  248. ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
  249. ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
  250. pld [r1]
  251. RND_AVG32 r4, r5, r6, r7, r8, r9, r12
  252. subs r3, r3, #1
  253. stmia r0, {r4-r5}
  254. add r0, r0, r2
  255. bne 3b
  256. ldmfd sp!, {r4-r10,pc}
  257. .align 8
       @ pixels & 3 == 3: view at offset 3 in r6, r7; the view at offset 4 is
       @ simply the second and third loaded words (r5, r10)
  258. 4:
  259. ldmia r1, {r4-r5, r10}
  260. add r1, r1, r2
  261. ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
  262. pld [r1]
  263. RND_AVG32 r8, r9, r6, r7, r5, r10, r12
  264. subs r3, r3, #1
  265. stmia r0, {r8-r9}
  266. add r0, r0, r2
  267. bne 4b
  268. ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
  269. .align 8
       @ word 0 is the averaging mask (read into r12); words 1-3 are the jump
       @ targets for pixels & 3 == 1, 2, 3
  270. 5:
  271. .word 0xFEFEFEFE
  272. .word 2b
  273. .word 3b
  274. .word 4b
  275. .endfunc
  276. .align 8
  277. function put_no_rnd_pixels8_x2_arm, export=1
  278. @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  279. @ block = word aligned, pixels = unaligned
       @ Same structure as put_pixels8_x2_arm, but uses NO_RND_AVG32: for each
       @ of h rows, byte i of the output is the rounded-down average of
       @ source bytes i and i+1.
       @ In: r0 = block, r1 = pixels, r2 = line_size, r3 = h (row counter)
       @ r12 = 0xFEFEFEFE, loaded from the first word of the table at 5:
       @ before r5 is advanced to the entry for pixels & 3.
  280. pld [r1]
  281. stmfd sp!, {r4-r10,lr} @ R14 is also called LR
  282. adr r5, 5f
  283. ands r4, r1, #3
  284. ldr r12, [r5]
  285. add r5, r5, r4, lsl #2
  286. bic r1, r1, #3
       @ Z still comes from the ands above: taken only for an unaligned source
  287. ldrne pc, [r5]
       @ pixels & 3 == 0: r4, r5 are bytes 0-7, r6, r7 become bytes 1-8
  288. 1:
  289. ldmia r1, {r4-r5, r10}
  290. add r1, r1, r2
  291. ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
  292. pld [r1]
  293. NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
  294. subs r3, r3, #1
  295. stmia r0, {r8-r9}
  296. add r0, r0, r2
  297. bne 1b
  298. ldmfd sp!, {r4-r10,pc}
  299. .align 8
       @ pixels & 3 == 1: views at byte offsets 1 and 2 of the loaded words
  300. 2:
  301. ldmia r1, {r4-r5, r10}
  302. add r1, r1, r2
  303. ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
  304. ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
  305. pld [r1]
  306. NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
  307. subs r3, r3, #1
  308. stmia r0, {r4-r5}
  309. add r0, r0, r2
  310. bne 2b
  311. ldmfd sp!, {r4-r10,pc}
  312. .align 8
       @ pixels & 3 == 2: views at byte offsets 2 and 3
  313. 3:
  314. ldmia r1, {r4-r5, r10}
  315. add r1, r1, r2
  316. ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
  317. ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
  318. pld [r1]
  319. NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
  320. subs r3, r3, #1
  321. stmia r0, {r4-r5}
  322. add r0, r0, r2
  323. bne 3b
  324. ldmfd sp!, {r4-r10,pc}
  325. .align 8
       @ pixels & 3 == 3: view at offset 3 in r6, r7; the view at offset 4 is
       @ simply the second and third loaded words (r5, r10)
  326. 4:
  327. ldmia r1, {r4-r5, r10}
  328. add r1, r1, r2
  329. ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
  330. pld [r1]
  331. NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
  332. subs r3, r3, #1
  333. stmia r0, {r8-r9}
  334. add r0, r0, r2
  335. bne 4b
  336. ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
  337. .align 8
       @ word 0 is the averaging mask (read into r12); words 1-3 are the jump
       @ targets for pixels & 3 == 1, 2, 3
  338. 5:
  339. .word 0xFEFEFEFE
  340. .word 2b
  341. .word 3b
  342. .word 4b
  343. .endfunc
  344. @ ----------------------------------------------------------------
  345. .align 8
  346. function put_pixels8_y2_arm, export=1
  347. @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  348. @ block = word aligned, pixels = unaligned
       @ Stores rows of 8 bytes where each output row is the rounded-up
       @ average (RND_AVG32) of a source row and the source row below it.
       @ In: r0 = block, r1 = pixels, r2 = line_size, r3 = h
       @ r3 is halved on entry and every 6: loop iteration emits two rows,
       @ so one more source row than output rows is read.
       @ NOTE(review): because of the lsr #1 an odd h loses its last row and
       @ h below 2 makes the counter start at 0 - presumably callers only
       @ pass even h of at least 2; confirm.
       @ r12 = 0xFEFEFEFE, loaded from the first word of the table at 5:.
  349. pld [r1]
  350. stmfd sp!, {r4-r11,lr} @ R14 is also called LR
  351. adr r5, 5f
  352. ands r4, r1, #3
  353. mov r3, r3, lsr #1
  354. ldr r12, [r5]
  355. add r5, r5, r4, lsl #2
  356. bic r1, r1, #3
       @ Z still comes from the ands above (mov, ldr, add, bic leave flags alone)
  357. ldrne pc, [r5]
       @ pixels & 3 == 0: rows alternate between r4, r5 and r6, r7; the macro
       @ destroys its Rn pair, which is the pair reloaded next
  358. 1:
  359. ldmia r1, {r4-r5}
  360. add r1, r1, r2
  361. 6: ldmia r1, {r6-r7}
  362. add r1, r1, r2
  363. pld [r1]
  364. RND_AVG32 r8, r9, r4, r5, r6, r7, r12
  365. ldmia r1, {r4-r5}
  366. add r1, r1, r2
  367. stmia r0, {r8-r9}
  368. add r0, r0, r2
  369. pld [r1]
  370. RND_AVG32 r8, r9, r6, r7, r4, r5, r12
  371. subs r3, r3, #1
  372. stmia r0, {r8-r9}
  373. add r0, r0, r2
  374. bne 6b
  375. ldmfd sp!, {r4-r11,pc}
  376. .align 8
       @ pixels & 3 == 1: each row is three words shifted in place by one
       @ byte; rows alternate between r4, r5 and r7, r8
  377. 2:
  378. ldmia r1, {r4-r6}
  379. add r1, r1, r2
  380. pld [r1]
  381. ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
  382. 6: ldmia r1, {r7-r9}
  383. add r1, r1, r2
  384. pld [r1]
  385. ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
  386. RND_AVG32 r10, r11, r4, r5, r7, r8, r12
  387. stmia r0, {r10-r11}
  388. add r0, r0, r2
  389. ldmia r1, {r4-r6}
  390. add r1, r1, r2
  391. pld [r1]
  392. ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
  393. subs r3, r3, #1
  394. RND_AVG32 r10, r11, r7, r8, r4, r5, r12
  395. stmia r0, {r10-r11}
  396. add r0, r0, r2
  397. bne 6b
  398. ldmfd sp!, {r4-r11,pc}
  399. .align 8
       @ pixels & 3 == 2: as above with a two-byte shift
  400. 3:
  401. ldmia r1, {r4-r6}
  402. add r1, r1, r2
  403. pld [r1]
  404. ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
  405. 6: ldmia r1, {r7-r9}
  406. add r1, r1, r2
  407. pld [r1]
  408. ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
  409. RND_AVG32 r10, r11, r4, r5, r7, r8, r12
  410. stmia r0, {r10-r11}
  411. add r0, r0, r2
  412. ldmia r1, {r4-r6}
  413. add r1, r1, r2
  414. pld [r1]
  415. ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
  416. subs r3, r3, #1
  417. RND_AVG32 r10, r11, r7, r8, r4, r5, r12
  418. stmia r0, {r10-r11}
  419. add r0, r0, r2
  420. bne 6b
  421. ldmfd sp!, {r4-r11,pc}
  422. .align 8
       @ pixels & 3 == 3: as above with a three-byte shift
  423. 4:
  424. ldmia r1, {r4-r6}
  425. add r1, r1, r2
  426. pld [r1]
  427. ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
  428. 6: ldmia r1, {r7-r9}
  429. add r1, r1, r2
  430. pld [r1]
  431. ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
  432. RND_AVG32 r10, r11, r4, r5, r7, r8, r12
  433. stmia r0, {r10-r11}
  434. add r0, r0, r2
  435. ldmia r1, {r4-r6}
  436. add r1, r1, r2
  437. pld [r1]
  438. ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
  439. subs r3, r3, #1
  440. RND_AVG32 r10, r11, r7, r8, r4, r5, r12
  441. stmia r0, {r10-r11}
  442. add r0, r0, r2
  443. bne 6b
  444. ldmfd sp!, {r4-r11,pc}
  445. .align 8
       @ word 0 is the averaging mask (read into r12); words 1-3 are the jump
       @ targets for pixels & 3 == 1, 2, 3
  446. 5:
  447. .word 0xFEFEFEFE
  448. .word 2b
  449. .word 3b
  450. .word 4b
  451. .endfunc
  452. .align 8
  453. function put_no_rnd_pixels8_y2_arm, export=1
  454. @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  455. @ block = word aligned, pixels = unaligned
       @ Same structure as put_pixels8_y2_arm, but uses NO_RND_AVG32: each
       @ output row is the rounded-down average of a source row and the
       @ source row below it.
       @ In: r0 = block, r1 = pixels, r2 = line_size, r3 = h
       @ r3 is halved on entry and every 6: loop iteration emits two rows.
       @ NOTE(review): because of the lsr #1 an odd h loses its last row and
       @ h below 2 makes the counter start at 0 - presumably callers only
       @ pass even h of at least 2; confirm.
       @ r12 = 0xFEFEFEFE, loaded from the first word of the table at 5:.
  456. pld [r1]
  457. stmfd sp!, {r4-r11,lr} @ R14 is also called LR
  458. adr r5, 5f
  459. ands r4, r1, #3
  460. mov r3, r3, lsr #1
  461. ldr r12, [r5]
  462. add r5, r5, r4, lsl #2
  463. bic r1, r1, #3
       @ Z still comes from the ands above (mov, ldr, add, bic leave flags alone)
  464. ldrne pc, [r5]
       @ pixels & 3 == 0: rows alternate between r4, r5 and r6, r7; the macro
       @ destroys its Rn pair, which is the pair reloaded next
  465. 1:
  466. ldmia r1, {r4-r5}
  467. add r1, r1, r2
  468. 6: ldmia r1, {r6-r7}
  469. add r1, r1, r2
  470. pld [r1]
  471. NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
  472. ldmia r1, {r4-r5}
  473. add r1, r1, r2
  474. stmia r0, {r8-r9}
  475. add r0, r0, r2
  476. pld [r1]
  477. NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
  478. subs r3, r3, #1
  479. stmia r0, {r8-r9}
  480. add r0, r0, r2
  481. bne 6b
  482. ldmfd sp!, {r4-r11,pc}
  483. .align 8
       @ pixels & 3 == 1: each row is three words shifted in place by one
       @ byte; rows alternate between r4, r5 and r7, r8
  484. 2:
  485. ldmia r1, {r4-r6}
  486. add r1, r1, r2
  487. pld [r1]
  488. ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
  489. 6: ldmia r1, {r7-r9}
  490. add r1, r1, r2
  491. pld [r1]
  492. ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
  493. NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
  494. stmia r0, {r10-r11}
  495. add r0, r0, r2
  496. ldmia r1, {r4-r6}
  497. add r1, r1, r2
  498. pld [r1]
  499. ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
  500. subs r3, r3, #1
  501. NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
  502. stmia r0, {r10-r11}
  503. add r0, r0, r2
  504. bne 6b
  505. ldmfd sp!, {r4-r11,pc}
  506. .align 8
       @ pixels & 3 == 2: as above with a two-byte shift
  507. 3:
  508. ldmia r1, {r4-r6}
  509. add r1, r1, r2
  510. pld [r1]
  511. ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
  512. 6: ldmia r1, {r7-r9}
  513. add r1, r1, r2
  514. pld [r1]
  515. ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
  516. NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
  517. stmia r0, {r10-r11}
  518. add r0, r0, r2
  519. ldmia r1, {r4-r6}
  520. add r1, r1, r2
  521. pld [r1]
  522. ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
  523. subs r3, r3, #1
  524. NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
  525. stmia r0, {r10-r11}
  526. add r0, r0, r2
  527. bne 6b
  528. ldmfd sp!, {r4-r11,pc}
  529. .align 8
       @ pixels & 3 == 3: as above with a three-byte shift
  530. 4:
  531. ldmia r1, {r4-r6}
  532. add r1, r1, r2
  533. pld [r1]
  534. ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
  535. 6: ldmia r1, {r7-r9}
  536. add r1, r1, r2
  537. pld [r1]
  538. ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
  539. NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
  540. stmia r0, {r10-r11}
  541. add r0, r0, r2
  542. ldmia r1, {r4-r6}
  543. add r1, r1, r2
  544. pld [r1]
  545. ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
  546. subs r3, r3, #1
  547. NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
  548. stmia r0, {r10-r11}
  549. add r0, r0, r2
  550. bne 6b
  551. ldmfd sp!, {r4-r11,pc}
  552. .align 8
       @ word 0 is the averaging mask (read into r12); words 1-3 are the jump
       @ targets for pixels & 3 == 1, 2, 3
  553. 5:
  554. .word 0xFEFEFEFE
  555. .word 2b
  556. .word 3b
  557. .word 4b
  558. .endfunc
  559. @ ----------------------------------------------------------------
  560. .macro RND_XY2_IT align
       @ One source row of the xy2 filter. align is the source byte offset
       @ (0-3) and selects registers and shifts at assembly time.
       @ In:  r1 = word-aligned source pointer (advanced by r2 here),
       @      r2 = line stride, r3 = row counter,
       @      r12 = address of the constant table at 5: in the expanding function
       @ Out: r8, r9   = per-byte sums of the low 2 bits of the two 8-byte
       @                 views of the row (one byte apart), plus the constant
       @                 at table offset 16 when bit 0 of r3 is clear
       @      r10, r11 = per-byte sums of the upper 6 bits (each shifted
       @                 right by 2) of the same two views
       @      r3 decremented, with the flags left from that subs
       @ Clobbers r4-r7 and r14.
  561. @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
  562. @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
  563. .if \align == 0
  564. ldmia r1, {r6-r8}
  565. .elseif \align == 3
  566. ldmia r1, {r5-r7}
  567. .else
  568. ldmia r1, {r8-r10}
  569. .endif
  570. add r1, r1, r2
  571. pld [r1]
       @ after this block r4, r5 and r6, r7 hold the two views of the row
  572. .if \align == 0
  573. ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8
  574. .elseif \align == 1
  575. ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10
  576. ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10
  577. .elseif \align == 2
  578. ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10
  579. ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10
  580. .elseif \align == 3
  581. ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7
  582. .endif
  583. ldr r14, [r12, #0] @ 0x03030303
       @ the eq-conditional instructions below depend on this tst
  584. tst r3, #1
  585. and r8, r4, r14
  586. and r9, r5, r14
  587. and r10, r6, r14
  588. and r11, r7, r14
  589. ldreq r14, [r12, #16] @ 0x02020202/0x01010101
  590. add r8, r8, r10
  591. add r9, r9, r11
  592. addeq r8, r8, r14
  593. addeq r9, r9, r14
  594. ldr r14, [r12, #20] @ 0xFCFCFCFC >> 2
  595. and r4, r14, r4, lsr #2
  596. and r5, r14, r5, lsr #2
  597. and r6, r14, r6, lsr #2
  598. and r7, r14, r7, lsr #2
  599. add r10, r4, r6
  600. add r11, r5, r7
       @ these flags are consumed by the bge in RND_XY2_EXPAND
  601. subs r3, r3, #1
  602. .endm
  603. .macro RND_XY2_EXPAND align
       @ Complete xy2 row loop for one source alignment, including the return.
       @ The first RND_XY2_IT primes r8-r11 with the sums of the first row.
       @ Each pass of the 6: loop then
       @   - pushes the previous row sums (r8-r11),
       @   - computes the sums of the next row with RND_XY2_IT,
       @   - pops the previous sums into r4-r7 and adds the new ones, so that
       @     r4, r5 hold the low-bit sums and r6, r7 the high-part sums,
       @   - stores ((low >> 2) & 0x0F0F0F0F) + high as 8 bytes at r0 and
       @     advances r0 by r2.
       @ The loop repeats while the subs inside RND_XY2_IT left r3 >= 0; no
       @ instruction between that subs and the bge changes the flags.
       @ The final ldmfd restores r4-r11 and returns, matching the
       @ stmfd sp!, {r4-r11,lr} in the functions that expand this macro.
  604. RND_XY2_IT \align
  605. 6: stmfd sp!, {r8-r11}
  606. RND_XY2_IT \align
  607. ldmfd sp!, {r4-r7}
  608. add r4, r4, r8
  609. add r5, r5, r9
  610. add r6, r6, r10
  611. add r7, r7, r11
  612. ldr r14, [r12, #24] @ 0x0F0F0F0F
  613. and r4, r14, r4, lsr #2
  614. and r5, r14, r5, lsr #2
  615. add r4, r4, r6
  616. add r5, r5, r7
  617. stmia r0, {r4-r5}
  618. add r0, r0, r2
  619. bge 6b
  620. ldmfd sp!, {r4-r11,pc}
  621. .endm
  622. .align 8
  623. function put_pixels8_xy2_arm, export=1
  624. @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  625. @ block = word aligned, pixels = unaligned
       @ In: r0 = block, r1 = pixels, r2 = line_size, r3 = h
       @ Each output row combines two consecutive source rows, using for each
       @ row two 8-byte views one byte apart (see RND_XY2_IT/RND_XY2_EXPAND).
       @ r12 is kept pointing at the table at 5: for the whole function; the
       @ macros read their constants from it. The constant at offset 16 is
       @ 0x02020202 in this variant.
       @ Each RND_XY2_EXPAND ends with its own return, so the four expansions
       @ do not fall through into one another.
  626. pld [r1]
  627. stmfd sp!, {r4-r11,lr} @ R14 is also called LR
  628. adrl r12, 5f
  629. ands r4, r1, #3
  630. add r5, r12, r4, lsl #2
  631. bic r1, r1, #3
       @ Z still comes from the ands above: taken only for an unaligned source
  632. ldrne pc, [r5]
  633. 1:
  634. RND_XY2_EXPAND 0
  635. .align 8
  636. 2:
  637. RND_XY2_EXPAND 1
  638. .align 8
  639. 3:
  640. RND_XY2_EXPAND 2
  641. .align 8
  642. 4:
  643. RND_XY2_EXPAND 3
       @ table: offset 0 low-bit mask, offsets 4-12 jump targets for
       @ pixels & 3 == 1, 2, 3, offset 16 constant added on alternate rows,
       @ offset 20 high-part mask, offset 24 final mask
  644. 5:
  645. .word 0x03030303
  646. .word 2b
  647. .word 3b
  648. .word 4b
  649. .word 0x02020202
  650. .word 0xFCFCFCFC >> 2
  651. .word 0x0F0F0F0F
  652. .endfunc
  653. .align 8
  654. function put_no_rnd_pixels8_xy2_arm, export=1
  655. @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  656. @ block = word aligned, pixels = unaligned
       @ In: r0 = block, r1 = pixels, r2 = line_size, r3 = h
       @ Identical in structure to put_pixels8_xy2_arm; the only difference
       @ visible here is the constant at table offset 16, which is 0x01010101
       @ in this variant.
       @ r12 is kept pointing at the table at 5: for the whole function; the
       @ macros read their constants from it.
       @ Each RND_XY2_EXPAND ends with its own return, so the four expansions
       @ do not fall through into one another.
  657. pld [r1]
  658. stmfd sp!, {r4-r11,lr} @ R14 is also called LR
  659. adrl r12, 5f
  660. ands r4, r1, #3
  661. add r5, r12, r4, lsl #2
  662. bic r1, r1, #3
       @ Z still comes from the ands above: taken only for an unaligned source
  663. ldrne pc, [r5]
  664. 1:
  665. RND_XY2_EXPAND 0
  666. .align 8
  667. 2:
  668. RND_XY2_EXPAND 1
  669. .align 8
  670. 3:
  671. RND_XY2_EXPAND 2
  672. .align 8
  673. 4:
  674. RND_XY2_EXPAND 3
       @ table: offset 0 low-bit mask, offsets 4-12 jump targets for
       @ pixels & 3 == 1, 2, 3, offset 16 constant added on alternate rows,
       @ offset 20 high-part mask, offset 24 final mask
  675. 5:
  676. .word 0x03030303
  677. .word 2b
  678. .word 3b
  679. .word 4b
  680. .word 0x01010101
  681. .word 0xFCFCFCFC >> 2
  682. .word 0x0F0F0F0F
  683. .endfunc
  684. @ void ff_add_pixels_clamped_ARM(int16_t *block, uint8_t *dest, int stride)
       @ In: r0 = block, r1 = dest, r2 = stride
       @ Runs 8 iterations (r10 counts down from 8). Each iteration reads eight
       @ signed halfwords at r0 and two words (8 bytes) at r1, adds every
       @ halfword to the matching dest byte, clamps, and writes the two words
       @ back; then r0 advances by 16 and r1 by stride.
       @ Clamp: when bit 8 of a sum is set, the byte is replaced by the top
       @ byte of the inverted coefficient, which is 0xFF for a non-negative
       @ coefficient and 0x00 for a negative one.
       @ NOTE(review): only bit 8 is tested, so sums outside -256..511 look
       @ unclamped here - presumably callers keep coefficients in range; confirm.
       @ The bracketed tags [A]-[F] mark where an instruction originally stood
       @ before it was moved earlier, as the existing comments say.
  685. function ff_add_pixels_clamped_ARM, export=1
  686. push {r4-r10}
  687. mov r10, #8
  688. 1:
  689. ldr r4, [r1] /* load dest */
  690. /* block[0] and block[1]*/
  691. ldrsh r5, [r0]
  692. ldrsh r7, [r0, #2]
  693. and r6, r4, #0xFF
  694. and r8, r4, #0xFF00
  695. add r6, r5, r6
  696. add r8, r7, r8, lsr #8
       /* r5, r7 = inverted coefficients, used only as the clamp value */
  697. mvn r5, r5
  698. mvn r7, r7
  699. tst r6, #0x100
  700. movne r6, r5, lsr #24
  701. tst r8, #0x100
  702. movne r8, r7, lsr #24
  703. mov r9, r6
  704. ldrsh r5, [r0, #4] /* moved from [A] */
  705. orr r9, r9, r8, lsl #8
  706. /* block[2] and block[3] */
  707. /* [A] */
  708. ldrsh r7, [r0, #6]
  709. and r6, r4, #0xFF0000
  710. and r8, r4, #0xFF000000
  711. add r6, r5, r6, lsr #16
  712. add r8, r7, r8, lsr #24
  713. mvn r5, r5
  714. mvn r7, r7
  715. tst r6, #0x100
  716. movne r6, r5, lsr #24
  717. tst r8, #0x100
  718. movne r8, r7, lsr #24
  719. orr r9, r9, r6, lsl #16
  720. ldr r4, [r1, #4] /* moved from [B] */
  721. orr r9, r9, r8, lsl #24
  722. /* store dest */
  723. ldrsh r5, [r0, #8] /* moved from [C] */
  724. str r9, [r1]
  725. /* load dest */
  726. /* [B] */
  727. /* block[4] and block[5] */
  728. /* [C] */
  729. ldrsh r7, [r0, #10]
  730. and r6, r4, #0xFF
  731. and r8, r4, #0xFF00
  732. add r6, r5, r6
  733. add r8, r7, r8, lsr #8
  734. mvn r5, r5
  735. mvn r7, r7
  736. tst r6, #0x100
  737. movne r6, r5, lsr #24
  738. tst r8, #0x100
  739. movne r8, r7, lsr #24
  740. mov r9, r6
  741. ldrsh r5, [r0, #12] /* moved from [D] */
  742. orr r9, r9, r8, lsl #8
  743. /* block[6] and block[7] */
  744. /* [D] */
  745. ldrsh r7, [r0, #14]
  746. and r6, r4, #0xFF0000
  747. and r8, r4, #0xFF000000
  748. add r6, r5, r6, lsr #16
  749. add r8, r7, r8, lsr #24
  750. mvn r5, r5
  751. mvn r7, r7
  752. tst r6, #0x100
  753. movne r6, r5, lsr #24
  754. tst r8, #0x100
  755. movne r8, r7, lsr #24
  756. orr r9, r9, r6, lsl #16
  757. add r0, r0, #16 /* moved from [E] */
  758. orr r9, r9, r8, lsl #24
  759. subs r10, r10, #1 /* moved from [F] */
  760. /* store dest */
  761. str r9, [r1, #4]
  762. /* [E] */
  763. /* [F] */
  764. add r1, r1, r2
       /* flags still come from the subs on r10 above (str and add keep them) */
  765. bne 1b
  766. pop {r4-r10}
  767. bx lr
  768. .endfunc