You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

710 lines
24KB

  1. @
  2. @ ARMv4 optimized DSP utils
  3. @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
  4. @
  5. @ This file is part of Libav.
  6. @
  7. @ Libav is free software; you can redistribute it and/or
  8. @ modify it under the terms of the GNU Lesser General Public
  9. @ License as published by the Free Software Foundation; either
  10. @ version 2.1 of the License, or (at your option) any later version.
  11. @
  12. @ Libav is distributed in the hope that it will be useful,
  13. @ but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. @ Lesser General Public License for more details.
  16. @
  17. @ You should have received a copy of the GNU Lesser General Public
  18. @ License along with Libav; if not, write to the Free Software
  19. @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. @
  21. #include "config.h"
  22. #include "libavutil/arm/asm.S"
  23. #if !HAVE_ARMV5TE_EXTERNAL
      /* pld is an ARMv5TE instruction. When HAVE_ARMV5TE_EXTERNAL is false,
       * define pld as the assembler comment character so that every
       * "pld [...]" line in this file is treated as a comment. */
  24. #define pld @
  25. #endif
  26. .macro ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
      @ Funnel-shift five source words Rn0..Rn4 into four destination words:
      @   Rdk = (Rnk >> (shift * 8)) | (Rn(k+1) << (32 - shift * 8)), k = 0..3
      @ shift is a byte count; the callers in this file pass 1, 2 or 3
      @ (shift = 0 would require an lsl by 32).
      @ Only Rd0-Rd3 are written. All four lsr results are produced before
      @ any orr, so the Rd registers must not alias Rn1-Rn4.
  27. mov \Rd0, \Rn0, lsr #(\shift * 8)
  28. mov \Rd1, \Rn1, lsr #(\shift * 8)
  29. mov \Rd2, \Rn2, lsr #(\shift * 8)
  30. mov \Rd3, \Rn3, lsr #(\shift * 8)
  31. orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
  32. orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
  33. orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
  34. orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
  35. .endm
  36. .macro ALIGN_DWORD shift, R0, R1, R2
      @ In-place funnel shift of three words down to two:
      @   R0 = (R0 >> (shift * 8)) | (R1 << (32 - shift * 8))
      @   R1 = (R1 >> (shift * 8)) | (R2 << (32 - shift * 8))
      @ R0 and R1 are overwritten, R2 is only read. R0 is completed before
      @ R1 is modified, so the statement order must be kept.
  37. mov \R0, \R0, lsr #(\shift * 8)
  38. orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
  39. mov \R1, \R1, lsr #(\shift * 8)
  40. orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
  41. .endm
  42. .macro ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
      @ Same funnel shift as ALIGN_DWORD, but with separate destinations:
      @   Rdst0 = (Rsrc0 >> (shift * 8)) | (Rsrc1 << (32 - shift * 8))
      @   Rdst1 = (Rsrc1 >> (shift * 8)) | (Rsrc2 << (32 - shift * 8))
      @ Rsrc0 is read only by the first instruction, so Rdst1 may be the
      @ same register as Rsrc0 (RND_XY2_IT relies on this for align == 3).
      @ Rdst0 and Rdst1 must not alias Rsrc1 or Rsrc2.
  43. mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
  44. mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
  45. orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
  46. orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
  47. .endm
  48. .macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
      @ Byte-wise average, rounded up, of two pairs of 32-bit words
      @ (four packed bytes per word), without carries between bytes.
  49. @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
  50. @ Rmask = 0xFEFEFEFE
  51. @ Rn = destroyed (overwritten with Rn | Rm); Rm is preserved
      @ Rd0/Rd1 must be distinct from the Rn and Rm registers: Rd receives
      @ Rn ^ Rm before Rn and Rm are read for the OR.
  52. eor \Rd0, \Rn0, \Rm0
  53. eor \Rd1, \Rn1, \Rm1
  54. orr \Rn0, \Rn0, \Rm0
  55. orr \Rn1, \Rn1, \Rm1
      @ Masking bit 0 of every byte keeps the following lsr from moving a
      @ bit into the neighbouring byte.
  56. and \Rd0, \Rd0, \Rmask
  57. and \Rd1, \Rd1, \Rmask
  58. sub \Rd0, \Rn0, \Rd0, lsr #1
  59. sub \Rd1, \Rn1, \Rd1, lsr #1
  60. .endm
  61. .macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
      @ Byte-wise average, rounded down, of two pairs of 32-bit words
      @ (four packed bytes per word), without carries between bytes.
  62. @ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1)
  63. @ Rmask = 0xFEFEFEFE
  64. @ Rn = destroyed (overwritten with Rn & Rm); Rm is preserved
      @ Rd0/Rd1 must be distinct from the Rn and Rm registers: Rd receives
      @ Rn ^ Rm before Rn and Rm are read for the AND.
  65. eor \Rd0, \Rn0, \Rm0
  66. eor \Rd1, \Rn1, \Rm1
  67. and \Rn0, \Rn0, \Rm0
  68. and \Rn1, \Rn1, \Rm1
      @ Masking bit 0 of every byte keeps the following lsr from moving a
      @ bit into the neighbouring byte.
  69. and \Rd0, \Rd0, \Rmask
  70. and \Rd1, \Rd1, \Rmask
  71. add \Rd0, \Rn0, \Rd0, lsr #1
  72. add \Rd1, \Rn1, \Rd1, lsr #1
  73. .endm
  74. .macro JMP_ALIGN tmp, reg
      @ Dispatch on the byte offset of the pointer in reg within its word.
      @ reg is rounded down to a multiple of 4, tmp and the flags are
      @ clobbered. Control continues at the next local label:
      @   1: offset 0   2: offset 1   3: offset 2   4: offset 3
      @ The user of the macro must define labels 1, 2, 3 and 4 after it.
  75. ands \tmp, \reg, #3
  76. bic \reg, \reg, #3
  77. beq 1f
  78. subs \tmp, \tmp, #1
  79. beq 2f
  80. subs \tmp, \tmp, #1
  81. beq 3f
  82. b 4f
  83. .endm
  84. @ ----------------------------------------------------------------
  85. .align 5
  86. function ff_put_pixels16_arm, export=1
  87. @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  88. @ block = word aligned, pixels = unaligned
      @ Copies rows of 16 bytes. r0 is the store pointer, r1 the load
      @ pointer, r2 is added to both after every row and r3 counts rows
      @ (presumably block, pixels, line_size and h in prototype order;
      @ confirm against the calling convention in use).
      @ JMP_ALIGN rounds r1 down to a word boundary and selects one loop:
      @   1: offset 0, plain four-word copy
      @   2, 3, 4: offset 1, 2, 3; five words are loaded and realigned
      @            with ALIGN_QWORD_D before the four-word store
      @ The row counter is tested after the loop body, so r3 = 0 on entry
      @ is not handled.
  89. pld [r1]
  90. push {r4-r11, lr}
  91. JMP_ALIGN r5, r1
  92. 1:
  93. ldm r1, {r4-r7}
  94. add r1, r1, r2
  95. stm r0, {r4-r7}
  96. pld [r1]
  97. subs r3, r3, #1
  98. add r0, r0, r2
  99. bne 1b
  100. pop {r4-r11, pc}
  101. .align 5
  102. 2:
  103. ldm r1, {r4-r8}
  104. add r1, r1, r2
  105. ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
  106. pld [r1]
  107. subs r3, r3, #1
  108. stm r0, {r9-r12}
  109. add r0, r0, r2
  110. bne 2b
  111. pop {r4-r11, pc}
  112. .align 5
  113. 3:
  114. ldm r1, {r4-r8}
  115. add r1, r1, r2
  116. ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
  117. pld [r1]
  118. subs r3, r3, #1
  119. stm r0, {r9-r12}
  120. add r0, r0, r2
  121. bne 3b
  122. pop {r4-r11, pc}
  123. .align 5
  124. 4:
  125. ldm r1, {r4-r8}
  126. add r1, r1, r2
  127. ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
  128. pld [r1]
  129. subs r3, r3, #1
  130. stm r0, {r9-r12}
  131. add r0, r0, r2
  132. bne 4b
  133. pop {r4-r11,pc}
  134. endfunc
  135. @ ----------------------------------------------------------------
  136. .align 5
  137. function ff_put_pixels8_arm, export=1
  138. @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  139. @ block = word aligned, pixels = unaligned
       @ Copies rows of 8 bytes. r0 is the store pointer, r1 the load
       @ pointer, r2 is added to both after every row and r3 counts rows
       @ (presumably block, pixels, line_size and h in prototype order;
       @ confirm against the calling convention in use).
       @ JMP_ALIGN rounds r1 down to a word boundary and selects one loop:
       @   1: offset 0, plain two-word copy
       @   2, 3, 4: offset 1, 2, 3; three words are loaded (r12 holds the
       @            third) and realigned in place with ALIGN_DWORD
       @ r12 is used without being saved. The row counter is tested after
       @ the loop body, so r3 = 0 on entry is not handled.
  140. pld [r1]
  141. push {r4-r5,lr}
  142. JMP_ALIGN r5, r1
  143. 1:
  144. ldm r1, {r4-r5}
  145. add r1, r1, r2
  146. subs r3, r3, #1
  147. pld [r1]
  148. stm r0, {r4-r5}
  149. add r0, r0, r2
  150. bne 1b
  151. pop {r4-r5,pc}
  152. .align 5
  153. 2:
  154. ldm r1, {r4-r5, r12}
  155. add r1, r1, r2
  156. ALIGN_DWORD 1, r4, r5, r12
  157. pld [r1]
  158. subs r3, r3, #1
  159. stm r0, {r4-r5}
  160. add r0, r0, r2
  161. bne 2b
  162. pop {r4-r5,pc}
  163. .align 5
  164. 3:
  165. ldm r1, {r4-r5, r12}
  166. add r1, r1, r2
  167. ALIGN_DWORD 2, r4, r5, r12
  168. pld [r1]
  169. subs r3, r3, #1
  170. stm r0, {r4-r5}
  171. add r0, r0, r2
  172. bne 3b
  173. pop {r4-r5,pc}
  174. .align 5
  175. 4:
  176. ldm r1, {r4-r5, r12}
  177. add r1, r1, r2
  178. ALIGN_DWORD 3, r4, r5, r12
  179. pld [r1]
  180. subs r3, r3, #1
  181. stm r0, {r4-r5}
  182. add r0, r0, r2
  183. bne 4b
  184. pop {r4-r5,pc}
  185. endfunc
  186. @ ----------------------------------------------------------------
  187. .align 5
  188. function ff_put_pixels8_x2_arm, export=1
  189. @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  190. @ block = word aligned, pixels = unaligned
       @ For every row, stores the byte-wise rounded-up average (RND_AVG32)
       @ of the 8 source bytes and the 8 source bytes one position later.
       @ r0 = store pointer, r1 = load pointer, r2 = stride added to both,
       @ r3 = row counter, r12 = 0xfefefefe mask for RND_AVG32.
       @ Each loop loads three words from the word-aligned address and
       @ builds both operands with ALIGN_DWORD_D. The row counter is tested
       @ after the loop body, so r3 = 0 on entry is not handled.
  191. pld [r1]
  192. push {r4-r10,lr}
  193. ldr r12, =0xfefefefe
  194. JMP_ALIGN r5, r1
       @ offset 0: first operand is r4,r5 as loaded, second is shifted by 1
  195. 1:
  196. ldm r1, {r4-r5, r10}
  197. add r1, r1, r2
  198. ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
  199. pld [r1]
  200. RND_AVG32 r8, r9, r4, r5, r6, r7, r12
  201. subs r3, r3, #1
  202. stm r0, {r8-r9}
  203. add r0, r0, r2
  204. bne 1b
  205. pop {r4-r10,pc}
  206. .align 5
       @ offset 1: operands are the loaded words shifted by 1 and by 2 bytes
  207. 2:
  208. ldm r1, {r4-r5, r10}
  209. add r1, r1, r2
  210. ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
  211. ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
  212. pld [r1]
  213. RND_AVG32 r4, r5, r6, r7, r8, r9, r12
  214. subs r3, r3, #1
  215. stm r0, {r4-r5}
  216. add r0, r0, r2
  217. bne 2b
  218. pop {r4-r10,pc}
  219. .align 5
       @ offset 2: operands are the loaded words shifted by 2 and by 3 bytes
  220. 3:
  221. ldm r1, {r4-r5, r10}
  222. add r1, r1, r2
  223. ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
  224. ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
  225. pld [r1]
  226. RND_AVG32 r4, r5, r6, r7, r8, r9, r12
  227. subs r3, r3, #1
  228. stm r0, {r4-r5}
  229. add r0, r0, r2
  230. bne 3b
  231. pop {r4-r10,pc}
  232. .align 5
       @ offset 3: first operand is shifted by 3 bytes; the second starts
       @ on the next word boundary, so r5,r10 are used as loaded
  233. 4:
  234. ldm r1, {r4-r5, r10}
  235. add r1, r1, r2
  236. ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
  237. pld [r1]
  238. RND_AVG32 r8, r9, r6, r7, r5, r10, r12
  239. subs r3, r3, #1
  240. stm r0, {r8-r9}
  241. add r0, r0, r2
  242. bne 4b
  243. pop {r4-r10,pc}
  244. endfunc
  245. .align 5
  246. function ff_put_no_rnd_pixels8_x2_arm, export=1
  247. @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  248. @ block = word aligned, pixels = unaligned
       @ Same structure as ff_put_pixels8_x2_arm, but the two operands are
       @ combined with NO_RND_AVG32 (byte-wise average rounded down).
       @ r0 = store pointer, r1 = load pointer, r2 = stride added to both,
       @ r3 = row counter, r12 = 0xfefefefe mask for NO_RND_AVG32.
       @ The row counter is tested after the loop body, so r3 = 0 on entry
       @ is not handled.
  249. pld [r1]
  250. push {r4-r10,lr}
  251. ldr r12, =0xfefefefe
  252. JMP_ALIGN r5, r1
       @ offset 0: first operand is r4,r5 as loaded, second is shifted by 1
  253. 1:
  254. ldm r1, {r4-r5, r10}
  255. add r1, r1, r2
  256. ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
  257. pld [r1]
  258. NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
  259. subs r3, r3, #1
  260. stm r0, {r8-r9}
  261. add r0, r0, r2
  262. bne 1b
  263. pop {r4-r10,pc}
  264. .align 5
       @ offset 1: operands are the loaded words shifted by 1 and by 2 bytes
  265. 2:
  266. ldm r1, {r4-r5, r10}
  267. add r1, r1, r2
  268. ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
  269. ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
  270. pld [r1]
  271. NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
  272. subs r3, r3, #1
  273. stm r0, {r4-r5}
  274. add r0, r0, r2
  275. bne 2b
  276. pop {r4-r10,pc}
  277. .align 5
       @ offset 2: operands are the loaded words shifted by 2 and by 3 bytes
  278. 3:
  279. ldm r1, {r4-r5, r10}
  280. add r1, r1, r2
  281. ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
  282. ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
  283. pld [r1]
  284. NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
  285. subs r3, r3, #1
  286. stm r0, {r4-r5}
  287. add r0, r0, r2
  288. bne 3b
  289. pop {r4-r10,pc}
  290. .align 5
       @ offset 3: first operand is shifted by 3 bytes; the second starts
       @ on the next word boundary, so r5,r10 are used as loaded
  291. 4:
  292. ldm r1, {r4-r5, r10}
  293. add r1, r1, r2
  294. ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
  295. pld [r1]
  296. NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
  297. subs r3, r3, #1
  298. stm r0, {r8-r9}
  299. add r0, r0, r2
  300. bne 4b
  301. pop {r4-r10,pc}
  302. endfunc
  303. @ ----------------------------------------------------------------
  304. .align 5
  305. function ff_put_pixels8_y2_arm, export=1
  306. @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  307. @ block = word aligned, pixels = unaligned
       @ Every stored row is the byte-wise rounded-up average (RND_AVG32)
       @ of two consecutive 8-byte source rows.
       @ r0 = store pointer, r1 = load pointer, r2 = stride added to both,
       @ r12 = 0xfefefefe mask for RND_AVG32.
       @ r3 is halved on entry and each pass of loop 6 stores two rows, so
       @ an odd row count drops its last row and a count below 2 is not
       @ handled (the counter is tested after the loop body).
       @ One source row is loaded before the loop and two per pass; the row
       @ loaded last in a pass is kept for the next pass, which works
       @ because RND_AVG32 destroys only its Rn operand.
  308. pld [r1]
  309. push {r4-r11,lr}
  310. mov r3, r3, lsr #1
  311. ldr r12, =0xfefefefe
  312. JMP_ALIGN r5, r1
       @ offset 0: rows are used as loaded
  313. 1:
  314. ldm r1, {r4-r5}
  315. add r1, r1, r2
  316. 6: ldm r1, {r6-r7}
  317. add r1, r1, r2
  318. pld [r1]
  319. RND_AVG32 r8, r9, r4, r5, r6, r7, r12
  320. ldm r1, {r4-r5}
  321. add r1, r1, r2
  322. stm r0, {r8-r9}
  323. add r0, r0, r2
  324. pld [r1]
  325. RND_AVG32 r8, r9, r6, r7, r4, r5, r12
  326. subs r3, r3, #1
  327. stm r0, {r8-r9}
  328. add r0, r0, r2
  329. bne 6b
  330. pop {r4-r11,pc}
  331. .align 5
       @ offset 1: every row is loaded as three words and realigned by 1
  332. 2:
  333. ldm r1, {r4-r6}
  334. add r1, r1, r2
  335. pld [r1]
  336. ALIGN_DWORD 1, r4, r5, r6
  337. 6: ldm r1, {r7-r9}
  338. add r1, r1, r2
  339. pld [r1]
  340. ALIGN_DWORD 1, r7, r8, r9
  341. RND_AVG32 r10, r11, r4, r5, r7, r8, r12
  342. stm r0, {r10-r11}
  343. add r0, r0, r2
  344. ldm r1, {r4-r6}
  345. add r1, r1, r2
  346. pld [r1]
  347. ALIGN_DWORD 1, r4, r5, r6
  348. subs r3, r3, #1
  349. RND_AVG32 r10, r11, r7, r8, r4, r5, r12
  350. stm r0, {r10-r11}
  351. add r0, r0, r2
  352. bne 6b
  353. pop {r4-r11,pc}
  354. .align 5
       @ offset 2: every row is loaded as three words and realigned by 2
  355. 3:
  356. ldm r1, {r4-r6}
  357. add r1, r1, r2
  358. pld [r1]
  359. ALIGN_DWORD 2, r4, r5, r6
  360. 6: ldm r1, {r7-r9}
  361. add r1, r1, r2
  362. pld [r1]
  363. ALIGN_DWORD 2, r7, r8, r9
  364. RND_AVG32 r10, r11, r4, r5, r7, r8, r12
  365. stm r0, {r10-r11}
  366. add r0, r0, r2
  367. ldm r1, {r4-r6}
  368. add r1, r1, r2
  369. pld [r1]
  370. ALIGN_DWORD 2, r4, r5, r6
  371. subs r3, r3, #1
  372. RND_AVG32 r10, r11, r7, r8, r4, r5, r12
  373. stm r0, {r10-r11}
  374. add r0, r0, r2
  375. bne 6b
  376. pop {r4-r11,pc}
  377. .align 5
       @ offset 3: every row is loaded as three words and realigned by 3
  378. 4:
  379. ldm r1, {r4-r6}
  380. add r1, r1, r2
  381. pld [r1]
  382. ALIGN_DWORD 3, r4, r5, r6
  383. 6: ldm r1, {r7-r9}
  384. add r1, r1, r2
  385. pld [r1]
  386. ALIGN_DWORD 3, r7, r8, r9
  387. RND_AVG32 r10, r11, r4, r5, r7, r8, r12
  388. stm r0, {r10-r11}
  389. add r0, r0, r2
  390. ldm r1, {r4-r6}
  391. add r1, r1, r2
  392. pld [r1]
  393. ALIGN_DWORD 3, r4, r5, r6
  394. subs r3, r3, #1
  395. RND_AVG32 r10, r11, r7, r8, r4, r5, r12
  396. stm r0, {r10-r11}
  397. add r0, r0, r2
  398. bne 6b
  399. pop {r4-r11,pc}
  400. endfunc
  401. .align 5
  402. function ff_put_no_rnd_pixels8_y2_arm, export=1
  403. @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  404. @ block = word aligned, pixels = unaligned
       @ Same structure as ff_put_pixels8_y2_arm, but consecutive source
       @ rows are combined with NO_RND_AVG32 (byte-wise average rounded
       @ down).
       @ r0 = store pointer, r1 = load pointer, r2 = stride added to both,
       @ r12 = 0xfefefefe mask for NO_RND_AVG32.
       @ r3 is halved on entry and each pass of loop 6 stores two rows, so
       @ an odd row count drops its last row and a count below 2 is not
       @ handled (the counter is tested after the loop body).
  405. pld [r1]
  406. push {r4-r11,lr}
  407. mov r3, r3, lsr #1
  408. ldr r12, =0xfefefefe
  409. JMP_ALIGN r5, r1
       @ offset 0: rows are used as loaded
  410. 1:
  411. ldm r1, {r4-r5}
  412. add r1, r1, r2
  413. 6: ldm r1, {r6-r7}
  414. add r1, r1, r2
  415. pld [r1]
  416. NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
  417. ldm r1, {r4-r5}
  418. add r1, r1, r2
  419. stm r0, {r8-r9}
  420. add r0, r0, r2
  421. pld [r1]
  422. NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
  423. subs r3, r3, #1
  424. stm r0, {r8-r9}
  425. add r0, r0, r2
  426. bne 6b
  427. pop {r4-r11,pc}
  428. .align 5
       @ offset 1: every row is loaded as three words and realigned by 1
  429. 2:
  430. ldm r1, {r4-r6}
  431. add r1, r1, r2
  432. pld [r1]
  433. ALIGN_DWORD 1, r4, r5, r6
  434. 6: ldm r1, {r7-r9}
  435. add r1, r1, r2
  436. pld [r1]
  437. ALIGN_DWORD 1, r7, r8, r9
  438. NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
  439. stm r0, {r10-r11}
  440. add r0, r0, r2
  441. ldm r1, {r4-r6}
  442. add r1, r1, r2
  443. pld [r1]
  444. ALIGN_DWORD 1, r4, r5, r6
  445. subs r3, r3, #1
  446. NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
  447. stm r0, {r10-r11}
  448. add r0, r0, r2
  449. bne 6b
  450. pop {r4-r11,pc}
  451. .align 5
       @ offset 2: every row is loaded as three words and realigned by 2
  452. 3:
  453. ldm r1, {r4-r6}
  454. add r1, r1, r2
  455. pld [r1]
  456. ALIGN_DWORD 2, r4, r5, r6
  457. 6: ldm r1, {r7-r9}
  458. add r1, r1, r2
  459. pld [r1]
  460. ALIGN_DWORD 2, r7, r8, r9
  461. NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
  462. stm r0, {r10-r11}
  463. add r0, r0, r2
  464. ldm r1, {r4-r6}
  465. add r1, r1, r2
  466. pld [r1]
  467. ALIGN_DWORD 2, r4, r5, r6
  468. subs r3, r3, #1
  469. NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
  470. stm r0, {r10-r11}
  471. add r0, r0, r2
  472. bne 6b
  473. pop {r4-r11,pc}
  474. .align 5
       @ offset 3: every row is loaded as three words and realigned by 3
  475. 4:
  476. ldm r1, {r4-r6}
  477. add r1, r1, r2
  478. pld [r1]
  479. ALIGN_DWORD 3, r4, r5, r6
  480. 6: ldm r1, {r7-r9}
  481. add r1, r1, r2
  482. pld [r1]
  483. ALIGN_DWORD 3, r7, r8, r9
  484. NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
  485. stm r0, {r10-r11}
  486. add r0, r0, r2
  487. ldm r1, {r4-r6}
  488. add r1, r1, r2
  489. pld [r1]
  490. ALIGN_DWORD 3, r4, r5, r6
  491. subs r3, r3, #1
  492. NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
  493. stm r0, {r10-r11}
  494. add r0, r0, r2
  495. bne 6b
  496. pop {r4-r11,pc}
  497. endfunc
       @ Emit the literal pool here for the "ldr rX, =constant" loads used
       @ by the functions above.
  498. .ltorg
  499. @ ----------------------------------------------------------------
  500. .macro RND_XY2_IT align, rnd
       @ One row step of the xy2 (2x2 neighbourhood) filter.
       @ Loads three words from r1, advances r1 by r2 and forms two 8-byte
       @ operands: a = the row at the original byte offset (align) and
       @ b = the same row one byte later. Results:
       @   r8,  r9  = l1, per-byte sum of the low two bits of a and b, plus
       @              a rounding constant when bit 0 of r3 is clear
       @   r10, r11 = h1, per-byte sum of the upper six bits of a and b
       @ rnd is the shift applied to 0x03030303 before it is ANDed with
       @ itself: lsl gives 0x02020202, lsr gives 0x01010101.
       @ r3 is decremented last and the flags of that subs are left for the
       @ user of the macro. Clobbers r4-r12 and r14.
  501. @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
  502. @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
       @ The load registers are chosen so that an operand that needs no
       @ shifting already sits in its final register pair.
  503. .if \align == 0
  504. ldm r1, {r6-r8}
  505. .elseif \align == 3
  506. ldm r1, {r5-r7}
  507. .else
  508. ldm r1, {r8-r10}
  509. .endif
  510. add r1, r1, r2
  511. pld [r1]
       @ After this block one operand is in r4,r5 and the other in r6,r7.
       @ For align 0 the unshifted row stays in r6,r7; for align 3 the
       @ row one byte later is word aligned and stays in r6,r7.
  512. .if \align == 0
  513. ALIGN_DWORD_D 1, r4, r5, r6, r7, r8
  514. .elseif \align == 1
  515. ALIGN_DWORD_D 1, r4, r5, r8, r9, r10
  516. ALIGN_DWORD_D 2, r6, r7, r8, r9, r10
  517. .elseif \align == 2
  518. ALIGN_DWORD_D 2, r4, r5, r8, r9, r10
  519. ALIGN_DWORD_D 3, r6, r7, r8, r9, r10
  520. .elseif \align == 3
  521. ALIGN_DWORD_D 3, r4, r5, r5, r6, r7
  522. .endif
  523. ldr r14, =0x03030303
  524. tst r3, #1
  525. and r8, r4, r14
  526. and r9, r5, r14
  527. and r10, r6, r14
  528. and r11, r7, r14
       @ The eq condition below still refers to the tst above; none of the
       @ and, add or ldr instructions in between updates the flags.
  529. it eq
  530. andeq r14, r14, r14, \rnd #1
  531. add r8, r8, r10
  532. add r9, r9, r11
  533. ldr r12, =0xfcfcfcfc >> 2
  534. itt eq
  535. addeq r8, r8, r14
  536. addeq r9, r9, r14
  537. and r4, r12, r4, lsr #2
  538. and r5, r12, r5, lsr #2
  539. and r6, r12, r6, lsr #2
  540. and r7, r12, r7, lsr #2
  541. add r10, r4, r6
  542. add r11, r5, r7
  543. subs r3, r3, #1
  544. .endm
  545. .macro RND_XY2_EXPAND align, rnd
       @ Complete xy2 loop for one source alignment, including the return.
       @ The first RND_XY2_IT primes r8-r11 with the sums of the first row.
       @ Every pass of loop 6 saves those sums on the stack, computes the
       @ sums of the next row, pops the saved ones into r4-r7 and stores
       @   ((l_prev + l_next) >> 2 & 0x0f0f0f0f) + h_prev + h_next
       @ as 8 output bytes at r0, then advances r0 by r2.
       @ The bge uses the flags of the subs at the end of RND_XY2_IT; no
       @ instruction in between updates them. r3 is decremented once by the
       @ priming step and once per pass, so a count of h in r3 stores h
       @ rows and reads h + 1 source rows.
       @ The final pop restores r4-r11 and loads pc, matching a
       @ push {r4-r11,lr} done by the function that uses the macro.
  546. RND_XY2_IT \align, \rnd
  547. 6: push {r8-r11}
  548. RND_XY2_IT \align, \rnd
  549. pop {r4-r7}
  550. add r4, r4, r8
  551. add r5, r5, r9
  552. ldr r14, =0x0f0f0f0f
  553. add r6, r6, r10
  554. add r7, r7, r11
  555. and r4, r14, r4, lsr #2
  556. and r5, r14, r5, lsr #2
  557. add r4, r4, r6
  558. add r5, r5, r7
  559. stm r0, {r4-r5}
  560. add r0, r0, r2
  561. bge 6b
  562. pop {r4-r11,pc}
  563. .endm
  564. .align 5
  565. function ff_put_pixels8_xy2_arm, export=1
  566. @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  567. @ block = word aligned, pixels = unaligned
       @ xy2 filter over rows of 8 bytes; all of the work is done by
       @ RND_XY2_EXPAND, instantiated once per source byte offset. Passing
       @ lsl selects the 0x02020202 rounding constant in RND_XY2_IT.
       @ r0 = store pointer, r1 = load pointer, r2 = stride, r3 = row count.
       @ Each expansion ends with its own pop {r4-r11,pc}, so control never
       @ falls through from one label to the next.
  568. pld [r1]
  569. push {r4-r11,lr} @ R14 is also called LR
  570. JMP_ALIGN r5, r1
  571. 1: RND_XY2_EXPAND 0, lsl
  572. .align 5
  573. 2: RND_XY2_EXPAND 1, lsl
  574. .align 5
  575. 3: RND_XY2_EXPAND 2, lsl
  576. .align 5
  577. 4: RND_XY2_EXPAND 3, lsl
  578. endfunc
  579. .align 5
  580. function ff_put_no_rnd_pixels8_xy2_arm, export=1
  581. @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  582. @ block = word aligned, pixels = unaligned
       @ Same as ff_put_pixels8_xy2_arm except that lsr is passed to
       @ RND_XY2_EXPAND, which selects the smaller 0x01010101 rounding
       @ constant in RND_XY2_IT.
       @ r0 = store pointer, r1 = load pointer, r2 = stride, r3 = row count.
       @ Each expansion ends with its own pop {r4-r11,pc}, so control never
       @ falls through from one label to the next.
  583. pld [r1]
  584. push {r4-r11,lr}
  585. JMP_ALIGN r5, r1
  586. 1: RND_XY2_EXPAND 0, lsr
  587. .align 5
  588. 2: RND_XY2_EXPAND 1, lsr
  589. .align 5
  590. 3: RND_XY2_EXPAND 2, lsr
  591. .align 5
  592. 4: RND_XY2_EXPAND 3, lsr
  593. endfunc
  594. .align 5
  595. @ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
  596. function ff_add_pixels_clamped_arm, export=1
       /* Adds 8 rows of 8 signed 16-bit values read from r0 to 8 rows of
        * 8 bytes at r1 and writes the clamped bytes back to r1.
        * r0 advances by 16 bytes per row, r1 by r2 (the stride); r10 counts
        * the 8 rows. The destination is read and written with word ldr/str.
        *
        * Clamp idiom: when bit 8 of a sum is set, the sum is replaced by
        * (~value) >> 24, where value is the sign-extended 16-bit addend.
        * That is 0xFF for a non-negative addend and 0x00 for a negative one.
        *
        * NOTE(review): only bit 8 is tested, so a sum outside -256..511 is
        * not clamped and its upper bits are ORed into the packed word.
        * Presumably callers guarantee a limited value range; confirm. */
  597. push {r4-r10}
  598. mov r10, #8
  599. 1:
  600. ldr r4, [r1] /* load dest */
  601. /* block[0] and block[1]*/
  602. ldrsh r5, [r0]
  603. ldrsh r7, [r0, #2]
  604. and r6, r4, #0xFF
  605. and r8, r4, #0xFF00
  606. add r6, r6, r5
  607. add r8, r7, r8, lsr #8
  608. mvn r5, r5
  609. mvn r7, r7
  610. tst r6, #0x100
  611. it ne
  612. movne r6, r5, lsr #24
  613. tst r8, #0x100
  614. it ne
  615. movne r8, r7, lsr #24
  616. mov r9, r6
  617. ldrsh r5, [r0, #4] /* moved from [A] */
  618. orr r9, r9, r8, lsl #8
  619. /* block[2] and block[3] */
  620. /* [A] */
  621. ldrsh r7, [r0, #6]
  622. and r6, r4, #0xFF0000
  623. and r8, r4, #0xFF000000
  624. add r6, r5, r6, lsr #16
  625. add r8, r7, r8, lsr #24
  626. mvn r5, r5
  627. mvn r7, r7
  628. tst r6, #0x100
  629. it ne
  630. movne r6, r5, lsr #24
  631. tst r8, #0x100
  632. it ne
  633. movne r8, r7, lsr #24
  634. orr r9, r9, r6, lsl #16
  635. ldr r4, [r1, #4] /* moved from [B] */
  636. orr r9, r9, r8, lsl #24
  637. /* store dest */
  638. ldrsh r5, [r0, #8] /* moved from [C] */
  639. str r9, [r1]
  640. /* load dest */
  641. /* [B] */
  642. /* block[4] and block[5] */
  643. /* [C] */
  644. ldrsh r7, [r0, #10]
  645. and r6, r4, #0xFF
  646. and r8, r4, #0xFF00
  647. add r6, r6, r5
  648. add r8, r7, r8, lsr #8
  649. mvn r5, r5
  650. mvn r7, r7
  651. tst r6, #0x100
  652. it ne
  653. movne r6, r5, lsr #24
  654. tst r8, #0x100
  655. it ne
  656. movne r8, r7, lsr #24
  657. mov r9, r6
  658. ldrsh r5, [r0, #12] /* moved from [D] */
  659. orr r9, r9, r8, lsl #8
  660. /* block[6] and block[7] */
  661. /* [D] */
  662. ldrsh r7, [r0, #14]
  663. and r6, r4, #0xFF0000
  664. and r8, r4, #0xFF000000
  665. add r6, r5, r6, lsr #16
  666. add r8, r7, r8, lsr #24
  667. mvn r5, r5
  668. mvn r7, r7
  669. tst r6, #0x100
  670. it ne
  671. movne r6, r5, lsr #24
  672. tst r8, #0x100
  673. it ne
  674. movne r8, r7, lsr #24
  675. orr r9, r9, r6, lsl #16
  676. add r0, r0, #16 /* moved from [E] */
  677. orr r9, r9, r8, lsl #24
  678. subs r10, r10, #1 /* moved from [F] */
  679. /* store dest */
  680. str r9, [r1, #4]
  681. /* [E] */
  682. /* [F] */
       /* The flags set by the subs above are still valid here: str and
        * add (without the s suffix) do not modify them. */
  683. add r1, r1, r2
  684. bne 1b
  685. pop {r4-r10}
  686. bx lr
  687. endfunc