@
@ ARMv4 optimized DSP utils
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
@ This file is part of FFmpeg.
@
@ FFmpeg is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2.1 of the License, or (at your option) any later version.
@
@ FFmpeg is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
@ License along with FFmpeg; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@
#include "config.h"
#include "asm.S"

        preserve8

@ On cores without PLD support, define pld as an empty macro so the
@ prefetch hints below assemble away harmlessly.
#if !HAVE_PLD
.macro pld reg
.endm
#endif

#if HAVE_ARMV5TE
function ff_prefetch_arm, export=1
        subs            r2, r2, #1
        pld             [r0]
        add             r0, r0, r1
        bne             ff_prefetch_arm
        bx              lr
        .endfunc
#endif
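
@ The ALIGN_* macros below realign data that was loaded with word-aligned
@ ldm from an unaligned source: each destination word is built from two
@ consecutive source words shifted by the byte offset `shift` (1..3).
@ Roughly the C equivalent (illustrative sketch, little-endian assumed):
@     dst[i] = (src[i] >> (shift * 8)) | (src[i + 1] << (32 - shift * 8));
@ ALIGN_QWORD_D produces four such words from five inputs; ALIGN_DWORD and
@ ALIGN_DWORD_D produce two words from three inputs (in place / into
@ separate destination registers, respectively).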
.macro  ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
        mov             \Rd0, \Rn0, lsr #(\shift * 8)
        mov             \Rd1, \Rn1, lsr #(\shift * 8)
        mov             \Rd2, \Rn2, lsr #(\shift * 8)
        mov             \Rd3, \Rn3, lsr #(\shift * 8)
        orr             \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
        orr             \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
        orr             \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
        orr             \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
.endm
.macro  ALIGN_DWORD shift, R0, R1, R2
        mov             \R0, \R0, lsr #(\shift * 8)
        orr             \R0, \R0, \R1, lsl #(32 - \shift * 8)
        mov             \R1, \R1, lsr #(\shift * 8)
        orr             \R1, \R1, \R2, lsl #(32 - \shift * 8)
.endm
.macro  ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
        mov             \Rdst0, \Rsrc0, lsr #(\shift * 8)
        mov             \Rdst1, \Rsrc1, lsr #(\shift * 8)
        orr             \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
        orr             \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
.endm
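
@ Packed byte-wise averaging of two words of four pixels each, without
@ unpacking.  Using a + b = (a ^ b) + 2 * (a & b), the per-byte averages are
@     avg_round(a, b) = (a | b) - (((a ^ b) & 0xFEFEFEFE) >> 1)   @ (a+b+1)>>1
@     avg_floor(a, b) = (a & b) + (((a ^ b) & 0xFEFEFEFE) >> 1)   @ (a+b)>>1
@ The 0xFEFEFEFE mask keeps the shifted low bit of each byte from leaking
@ into the neighbouring byte lane.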
.macro  RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
        @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
        @ Rmask = 0xFEFEFEFE
        @ Rn0/Rn1 are destroyed
        eor             \Rd0, \Rn0, \Rm0
        eor             \Rd1, \Rn1, \Rm1
        orr             \Rn0, \Rn0, \Rm0
        orr             \Rn1, \Rn1, \Rm1
        and             \Rd0, \Rd0, \Rmask
        and             \Rd1, \Rd1, \Rmask
        sub             \Rd0, \Rn0, \Rd0, lsr #1
        sub             \Rd1, \Rn1, \Rd1, lsr #1
.endm
.macro  NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
        @ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1)
        @ Rmask = 0xFEFEFEFE
        @ Rn0/Rn1 are destroyed
        eor             \Rd0, \Rn0, \Rm0
        eor             \Rd1, \Rn1, \Rm1
        and             \Rn0, \Rn0, \Rm0
        and             \Rn1, \Rn1, \Rm1
        and             \Rd0, \Rd0, \Rmask
        and             \Rd1, \Rd1, \Rmask
        add             \Rd0, \Rn0, \Rd0, lsr #1
        add             \Rd1, \Rn1, \Rd1, lsr #1
.endm
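
@ JMP_ALIGN word-aligns the source pointer in `reg` and branches to the
@ local label matching its original byte offset: 1f for offset 0, 2f for 1,
@ 3f for 2, 4f for 3.  Each function below provides those four labelled
@ variants, one per source alignment.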
.macro  JMP_ALIGN tmp, reg
        ands            \tmp, \reg, #3
        bic             \reg, \reg, #3
        beq             1f
        subs            \tmp, \tmp, #1
        beq             2f
        subs            \tmp, \tmp, #1
        beq             3f
        b               4f
.endm
@ ----------------------------------------------------------------
        .align 5
function ff_put_pixels16_arm, export=1
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld             [r1]
        push            {r4-r11, lr}
        JMP_ALIGN       r5, r1
1:
        ldm             r1, {r4-r7}
        add             r1, r1, r2
        stm             r0, {r4-r7}
        pld             [r1]
        subs            r3, r3, #1
        add             r0, r0, r2
        bne             1b
        pop             {r4-r11, pc}
        .align 5
2:
        ldm             r1, {r4-r8}
        add             r1, r1, r2
        ALIGN_QWORD_D   1, r9, r10, r11, r12, r4, r5, r6, r7, r8
        pld             [r1]
        subs            r3, r3, #1
        stm             r0, {r9-r12}
        add             r0, r0, r2
        bne             2b
        pop             {r4-r11, pc}
        .align 5
3:
        ldm             r1, {r4-r8}
        add             r1, r1, r2
        ALIGN_QWORD_D   2, r9, r10, r11, r12, r4, r5, r6, r7, r8
        pld             [r1]
        subs            r3, r3, #1
        stm             r0, {r9-r12}
        add             r0, r0, r2
        bne             3b
        pop             {r4-r11, pc}
        .align 5
4:
        ldm             r1, {r4-r8}
        add             r1, r1, r2
        ALIGN_QWORD_D   3, r9, r10, r11, r12, r4, r5, r6, r7, r8
        pld             [r1]
        subs            r3, r3, #1
        stm             r0, {r9-r12}
        add             r0, r0, r2
        bne             4b
        pop             {r4-r11, pc}
        .endfunc
@ ----------------------------------------------------------------
        .align 5
function ff_put_pixels8_arm, export=1
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld             [r1]
        push            {r4-r5, lr}
        JMP_ALIGN       r5, r1
1:
        ldm             r1, {r4-r5}
        add             r1, r1, r2
        subs            r3, r3, #1
        pld             [r1]
        stm             r0, {r4-r5}
        add             r0, r0, r2
        bne             1b
        pop             {r4-r5, pc}
        .align 5
2:
        ldm             r1, {r4-r5, r12}
        add             r1, r1, r2
        ALIGN_DWORD     1, r4, r5, r12
        pld             [r1]
        subs            r3, r3, #1
        stm             r0, {r4-r5}
        add             r0, r0, r2
        bne             2b
        pop             {r4-r5, pc}
        .align 5
3:
        ldm             r1, {r4-r5, r12}
        add             r1, r1, r2
        ALIGN_DWORD     2, r4, r5, r12
        pld             [r1]
        subs            r3, r3, #1
        stm             r0, {r4-r5}
        add             r0, r0, r2
        bne             3b
        pop             {r4-r5, pc}
        .align 5
4:
        ldm             r1, {r4-r5, r12}
        add             r1, r1, r2
        ALIGN_DWORD     3, r4, r5, r12
        pld             [r1]
        subs            r3, r3, #1
        stm             r0, {r4-r5}
        add             r0, r0, r2
        bne             4b
        pop             {r4-r5, pc}
        .endfunc
@ ----------------------------------------------------------------
        .align 5
function ff_put_pixels8_x2_arm, export=1
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld             [r1]
        push            {r4-r10, lr}
        ldr             r12, =0xfefefefe
        JMP_ALIGN       r5, r1
1:
        ldm             r1, {r4-r5, r10}
        add             r1, r1, r2
        ALIGN_DWORD_D   1, r6, r7, r4, r5, r10
        pld             [r1]
        RND_AVG32       r8, r9, r4, r5, r6, r7, r12
        subs            r3, r3, #1
        stm             r0, {r8-r9}
        add             r0, r0, r2
        bne             1b
        pop             {r4-r10, pc}
        .align 5
2:
        ldm             r1, {r4-r5, r10}
        add             r1, r1, r2
        ALIGN_DWORD_D   1, r6, r7, r4, r5, r10
        ALIGN_DWORD_D   2, r8, r9, r4, r5, r10
        pld             [r1]
        RND_AVG32       r4, r5, r6, r7, r8, r9, r12
        subs            r3, r3, #1
        stm             r0, {r4-r5}
        add             r0, r0, r2
        bne             2b
        pop             {r4-r10, pc}
        .align 5
3:
        ldm             r1, {r4-r5, r10}
        add             r1, r1, r2
        ALIGN_DWORD_D   2, r6, r7, r4, r5, r10
        ALIGN_DWORD_D   3, r8, r9, r4, r5, r10
        pld             [r1]
        RND_AVG32       r4, r5, r6, r7, r8, r9, r12
        subs            r3, r3, #1
        stm             r0, {r4-r5}
        add             r0, r0, r2
        bne             3b
        pop             {r4-r10, pc}
        .align 5
4:
        ldm             r1, {r4-r5, r10}
        add             r1, r1, r2
        ALIGN_DWORD_D   3, r6, r7, r4, r5, r10
        pld             [r1]
        RND_AVG32       r8, r9, r6, r7, r5, r10, r12
        subs            r3, r3, #1
        stm             r0, {r8-r9}
        add             r0, r0, r2
        bne             4b
        pop             {r4-r10, pc}
        .endfunc
        .align 5
function ff_put_no_rnd_pixels8_x2_arm, export=1
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld             [r1]
        push            {r4-r10, lr}
        ldr             r12, =0xfefefefe
        JMP_ALIGN       r5, r1
1:
        ldm             r1, {r4-r5, r10}
        add             r1, r1, r2
        ALIGN_DWORD_D   1, r6, r7, r4, r5, r10
        pld             [r1]
        NO_RND_AVG32    r8, r9, r4, r5, r6, r7, r12
        subs            r3, r3, #1
        stm             r0, {r8-r9}
        add             r0, r0, r2
        bne             1b
        pop             {r4-r10, pc}
        .align 5
2:
        ldm             r1, {r4-r5, r10}
        add             r1, r1, r2
        ALIGN_DWORD_D   1, r6, r7, r4, r5, r10
        ALIGN_DWORD_D   2, r8, r9, r4, r5, r10
        pld             [r1]
        NO_RND_AVG32    r4, r5, r6, r7, r8, r9, r12
        subs            r3, r3, #1
        stm             r0, {r4-r5}
        add             r0, r0, r2
        bne             2b
        pop             {r4-r10, pc}
        .align 5
3:
        ldm             r1, {r4-r5, r10}
        add             r1, r1, r2
        ALIGN_DWORD_D   2, r6, r7, r4, r5, r10
        ALIGN_DWORD_D   3, r8, r9, r4, r5, r10
        pld             [r1]
        NO_RND_AVG32    r4, r5, r6, r7, r8, r9, r12
        subs            r3, r3, #1
        stm             r0, {r4-r5}
        add             r0, r0, r2
        bne             3b
        pop             {r4-r10, pc}
        .align 5
4:
        ldm             r1, {r4-r5, r10}
        add             r1, r1, r2
        ALIGN_DWORD_D   3, r6, r7, r4, r5, r10
        pld             [r1]
        NO_RND_AVG32    r8, r9, r6, r7, r5, r10, r12
        subs            r3, r3, #1
        stm             r0, {r8-r9}
        add             r0, r0, r2
        bne             4b
        pop             {r4-r10, pc}
        .endfunc
@ ----------------------------------------------------------------
        .align 5
function ff_put_pixels8_y2_arm, export=1
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld             [r1]
        push            {r4-r11, lr}
        mov             r3, r3, lsr #1
        ldr             r12, =0xfefefefe
        JMP_ALIGN       r5, r1
1:
        ldm             r1, {r4-r5}
        add             r1, r1, r2
6:      ldm             r1, {r6-r7}
        add             r1, r1, r2
        pld             [r1]
        RND_AVG32       r8, r9, r4, r5, r6, r7, r12
        ldm             r1, {r4-r5}
        add             r1, r1, r2
        stm             r0, {r8-r9}
        add             r0, r0, r2
        pld             [r1]
        RND_AVG32       r8, r9, r6, r7, r4, r5, r12
        subs            r3, r3, #1
        stm             r0, {r8-r9}
        add             r0, r0, r2
        bne             6b
        pop             {r4-r11, pc}
        .align 5
2:
        ldm             r1, {r4-r6}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     1, r4, r5, r6
6:      ldm             r1, {r7-r9}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     1, r7, r8, r9
        RND_AVG32       r10, r11, r4, r5, r7, r8, r12
        stm             r0, {r10-r11}
        add             r0, r0, r2
        ldm             r1, {r4-r6}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     1, r4, r5, r6
        subs            r3, r3, #1
        RND_AVG32       r10, r11, r7, r8, r4, r5, r12
        stm             r0, {r10-r11}
        add             r0, r0, r2
        bne             6b
        pop             {r4-r11, pc}
        .align 5
3:
        ldm             r1, {r4-r6}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     2, r4, r5, r6
6:      ldm             r1, {r7-r9}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     2, r7, r8, r9
        RND_AVG32       r10, r11, r4, r5, r7, r8, r12
        stm             r0, {r10-r11}
        add             r0, r0, r2
        ldm             r1, {r4-r6}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     2, r4, r5, r6
        subs            r3, r3, #1
        RND_AVG32       r10, r11, r7, r8, r4, r5, r12
        stm             r0, {r10-r11}
        add             r0, r0, r2
        bne             6b
        pop             {r4-r11, pc}
        .align 5
4:
        ldm             r1, {r4-r6}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     3, r4, r5, r6
6:      ldm             r1, {r7-r9}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     3, r7, r8, r9
        RND_AVG32       r10, r11, r4, r5, r7, r8, r12
        stm             r0, {r10-r11}
        add             r0, r0, r2
        ldm             r1, {r4-r6}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     3, r4, r5, r6
        subs            r3, r3, #1
        RND_AVG32       r10, r11, r7, r8, r4, r5, r12
        stm             r0, {r10-r11}
        add             r0, r0, r2
        bne             6b
        pop             {r4-r11, pc}
        .endfunc
        .align 5
function ff_put_no_rnd_pixels8_y2_arm, export=1
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld             [r1]
        push            {r4-r11, lr}
        mov             r3, r3, lsr #1
        ldr             r12, =0xfefefefe
        JMP_ALIGN       r5, r1
1:
        ldm             r1, {r4-r5}
        add             r1, r1, r2
6:      ldm             r1, {r6-r7}
        add             r1, r1, r2
        pld             [r1]
        NO_RND_AVG32    r8, r9, r4, r5, r6, r7, r12
        ldm             r1, {r4-r5}
        add             r1, r1, r2
        stm             r0, {r8-r9}
        add             r0, r0, r2
        pld             [r1]
        NO_RND_AVG32    r8, r9, r6, r7, r4, r5, r12
        subs            r3, r3, #1
        stm             r0, {r8-r9}
        add             r0, r0, r2
        bne             6b
        pop             {r4-r11, pc}
        .align 5
2:
        ldm             r1, {r4-r6}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     1, r4, r5, r6
6:      ldm             r1, {r7-r9}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     1, r7, r8, r9
        NO_RND_AVG32    r10, r11, r4, r5, r7, r8, r12
        stm             r0, {r10-r11}
        add             r0, r0, r2
        ldm             r1, {r4-r6}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     1, r4, r5, r6
        subs            r3, r3, #1
        NO_RND_AVG32    r10, r11, r7, r8, r4, r5, r12
        stm             r0, {r10-r11}
        add             r0, r0, r2
        bne             6b
        pop             {r4-r11, pc}
        .align 5
3:
        ldm             r1, {r4-r6}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     2, r4, r5, r6
6:      ldm             r1, {r7-r9}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     2, r7, r8, r9
        NO_RND_AVG32    r10, r11, r4, r5, r7, r8, r12
        stm             r0, {r10-r11}
        add             r0, r0, r2
        ldm             r1, {r4-r6}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     2, r4, r5, r6
        subs            r3, r3, #1
        NO_RND_AVG32    r10, r11, r7, r8, r4, r5, r12
        stm             r0, {r10-r11}
        add             r0, r0, r2
        bne             6b
        pop             {r4-r11, pc}
        .align 5
4:
        ldm             r1, {r4-r6}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     3, r4, r5, r6
6:      ldm             r1, {r7-r9}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     3, r7, r8, r9
        NO_RND_AVG32    r10, r11, r4, r5, r7, r8, r12
        stm             r0, {r10-r11}
        add             r0, r0, r2
        ldm             r1, {r4-r6}
        add             r1, r1, r2
        pld             [r1]
        ALIGN_DWORD     3, r4, r5, r6
        subs            r3, r3, #1
        NO_RND_AVG32    r10, r11, r7, r8, r4, r5, r12
        stm             r0, {r10-r11}
        add             r0, r0, r2
        bne             6b
        pop             {r4-r11, pc}
        .endfunc
        .ltorg

@ ----------------------------------------------------------------
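@ One output row of put_pixels8_xy2: each destination pixel is the average
@ of four source pixels, dst = (a + b + c + d + bias) >> 2, with bias 2 for
@ the rounding variant (rnd=lsl) and 1 for the no_rnd variant (rnd=lsr).
@ To stay within 32-bit registers the pixels are split into their low 2 bits
@ and high 6 bits:
@     a + b + c + d = 4 * ((a >> 2) + ... + (d >> 2)) + ((a & 3) + ... + (d & 3))
@ RND_XY2_IT computes the horizontal pair sums for one source row: r8/r9
@ hold the packed low-bit sums, r10/r11 the packed high-bit sums; the bias
@ is folded into the low-bit sums on alternate rows so that each output row
@ receives it exactly once.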
.macro  RND_XY2_IT align, rnd
        @ l1 =  (a & 0x03030303) + (b & 0x03030303)  (+ 0x02020202 on alternate rows)
        @ h1 = ((a & 0xFCFCFCFC) >> 2) + ((b & 0xFCFCFCFC) >> 2)
.if \align == 0
        ldm             r1, {r6-r8}
.elseif \align == 3
        ldm             r1, {r5-r7}
.else
        ldm             r1, {r8-r10}
.endif
        add             r1, r1, r2
        pld             [r1]
.if \align == 0
        ALIGN_DWORD_D   1, r4, r5, r6, r7, r8
.elseif \align == 1
        ALIGN_DWORD_D   1, r4, r5, r8, r9, r10
        ALIGN_DWORD_D   2, r6, r7, r8, r9, r10
.elseif \align == 2
        ALIGN_DWORD_D   2, r4, r5, r8, r9, r10
        ALIGN_DWORD_D   3, r6, r7, r8, r9, r10
.elseif \align == 3
        ALIGN_DWORD_D   3, r4, r5, r5, r6, r7
.endif
        ldr             r14, =0x03030303
        tst             r3, #1
        and             r8, r4, r14
        and             r9, r5, r14
        and             r10, r6, r14
        and             r11, r7, r14
        andeq           r14, r14, r14, \rnd #1
        add             r8, r8, r10
        add             r9, r9, r11
        ldr             r12, =0xfcfcfcfc >> 2
        addeq           r8, r8, r14
        addeq           r9, r9, r14
        and             r4, r12, r4, lsr #2
        and             r5, r12, r5, lsr #2
        and             r6, r12, r6, lsr #2
        and             r7, r12, r7, lsr #2
        add             r10, r4, r6
        add             r11, r5, r7
        subs            r3, r3, #1
.endm
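
@ RND_XY2_EXPAND emits the main loop for one alignment case: the previous
@ row's partial sums are kept on the stack, combined with the current row's,
@ and the output row is ((l0 + l1) >> 2, masked per byte) + h0 + h1.
@ The loop runs until the subs in RND_XY2_IT makes r3 negative.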
.macro  RND_XY2_EXPAND align, rnd
        RND_XY2_IT      \align, \rnd
6:      push            {r8-r11}
        RND_XY2_IT      \align, \rnd
        pop             {r4-r7}
        add             r4, r4, r8
        add             r5, r5, r9
        ldr             r14, =0x0f0f0f0f
        add             r6, r6, r10
        add             r7, r7, r11
        and             r4, r14, r4, lsr #2
        and             r5, r14, r5, lsr #2
        add             r4, r4, r6
        add             r5, r5, r7
        stm             r0, {r4-r5}
        add             r0, r0, r2
        bge             6b
        pop             {r4-r11, pc}
.endm
        .align 5
function ff_put_pixels8_xy2_arm, export=1
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld             [r1]
        push            {r4-r11, lr}    @ R14 is also called LR
        JMP_ALIGN       r5, r1
1:      RND_XY2_EXPAND  0, lsl
        .align 5
2:      RND_XY2_EXPAND  1, lsl
        .align 5
3:      RND_XY2_EXPAND  2, lsl
        .align 5
4:      RND_XY2_EXPAND  3, lsl
        .endfunc
        .align 5
function ff_put_no_rnd_pixels8_xy2_arm, export=1
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld             [r1]
        push            {r4-r11, lr}
        JMP_ALIGN       r5, r1
1:      RND_XY2_EXPAND  0, lsr
        .align 5
2:      RND_XY2_EXPAND  1, lsr
        .align 5
3:      RND_XY2_EXPAND  2, lsr
        .align 5
4:      RND_XY2_EXPAND  3, lsr
        .endfunc
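
@ ff_add_pixels_clamped_arm adds an 8x8 block of int16_t residuals to the
@ destination pixels and clamps each result to 0..255.  The clamp below
@ tests bit 8 of the sum, which assumes the sum stays within a 9-bit range:
@ when it is set, the result has wrapped and the saturated value is taken
@ from the top byte of the inverted residual (~block[i] >> 24), which is
@ 0xFF for a positive residual (overflow) and 0x00 for a negative one
@ (underflow).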
        .align 5
@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
function ff_add_pixels_clamped_arm, export=1
        push            {r4-r10}
        mov             r10, #8
1:
        ldr             r4, [r1]                /* load dest */
        /* block[0] and block[1] */
        ldrsh           r5, [r0]
        ldrsh           r7, [r0, #2]
        and             r6, r4, #0xFF
        and             r8, r4, #0xFF00
        add             r6, r5, r6
        add             r8, r7, r8, lsr #8
        mvn             r5, r5
        mvn             r7, r7
        tst             r6, #0x100
        movne           r6, r5, lsr #24
        tst             r8, #0x100
        movne           r8, r7, lsr #24
        mov             r9, r6
        ldrsh           r5, [r0, #4]            /* moved from [A] */
        orr             r9, r9, r8, lsl #8
        /* block[2] and block[3] */
        /* [A] */
        ldrsh           r7, [r0, #6]
        and             r6, r4, #0xFF0000
        and             r8, r4, #0xFF000000
        add             r6, r5, r6, lsr #16
        add             r8, r7, r8, lsr #24
        mvn             r5, r5
        mvn             r7, r7
        tst             r6, #0x100
        movne           r6, r5, lsr #24
        tst             r8, #0x100
        movne           r8, r7, lsr #24
        orr             r9, r9, r6, lsl #16
        ldr             r4, [r1, #4]            /* moved from [B] */
        orr             r9, r9, r8, lsl #24
        /* store dest */
        ldrsh           r5, [r0, #8]            /* moved from [C] */
        str             r9, [r1]
        /* load dest */
        /* [B] */
        /* block[4] and block[5] */
        /* [C] */
        ldrsh           r7, [r0, #10]
        and             r6, r4, #0xFF
        and             r8, r4, #0xFF00
        add             r6, r5, r6
        add             r8, r7, r8, lsr #8
        mvn             r5, r5
        mvn             r7, r7
        tst             r6, #0x100
        movne           r6, r5, lsr #24
        tst             r8, #0x100
        movne           r8, r7, lsr #24
        mov             r9, r6
        ldrsh           r5, [r0, #12]           /* moved from [D] */
        orr             r9, r9, r8, lsl #8
        /* block[6] and block[7] */
        /* [D] */
        ldrsh           r7, [r0, #14]
        and             r6, r4, #0xFF0000
        and             r8, r4, #0xFF000000
        add             r6, r5, r6, lsr #16
        add             r8, r7, r8, lsr #24
        mvn             r5, r5
        mvn             r7, r7
        tst             r6, #0x100
        movne           r6, r5, lsr #24
        tst             r8, #0x100
        movne           r8, r7, lsr #24
        orr             r9, r9, r6, lsl #16
        add             r0, r0, #16             /* moved from [E] */
        orr             r9, r9, r8, lsl #24
        subs            r10, r10, #1            /* moved from [F] */
        /* store dest */
        str             r9, [r1, #4]
        /* [E] */
        /* [F] */
        add             r1, r1, r2
        bne             1b
        pop             {r4-r10}
        bx              lr
        .endfunc