You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

695 lines
18KB

  1. @
  2. @ ARMv4L optimized DSP utils
  3. @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
  4. @
  5. @ This library is free software; you can redistribute it and/or
  6. @ modify it under the terms of the GNU Lesser General Public
  7. @ License as published by the Free Software Foundation; either
  8. @ version 2 of the License, or (at your option) any later version.
  9. @
  10. @ This library is distributed in the hope that it will be useful,
  11. @ but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. @ Lesser General Public License for more details.
  14. @
  15. @ You should have received a copy of the GNU Lesser General Public
  16. @ License along with this library; if not, write to the Free Software
  17. @ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. @
  .macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
        @ Extract four words that start \shift bytes into the five
        @ consecutive words Rn0-Rn4:
        @   Rd[i] = (Rn[i] >> (8 * shift)) | (Rn[i+1] << (32 - 8 * shift))
        @ All four mov instructions run before the first orr, so an Rd
        @ register must not name a source that an orr still has to read.
        @ Callers in this file pass shift = 1, 2 or 3.
        mov     \Rd0, \Rn0, lsr #(\shift * 8)
        mov     \Rd1, \Rn1, lsr #(\shift * 8)
        mov     \Rd2, \Rn2, lsr #(\shift * 8)
        mov     \Rd3, \Rn3, lsr #(\shift * 8)
        orr     \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
        orr     \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
        orr     \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
        orr     \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
  .endm
  .macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2
        @ In-place variant: R0 and R1 are overwritten with the two words
        @ that start \shift bytes into the three consecutive words
        @ R0, R1, R2.  R2 is only read.
        @   R0 = (R0 >> (8 * shift)) | (R1 << (32 - 8 * shift))
        @   R1 = (R1 >> (8 * shift)) | (R2 << (32 - 8 * shift))
        @ R0 is completed before R1 is shifted because the first orr
        @ needs the unmodified R1.
        mov     \R0, \R0, lsr #(\shift * 8)
        orr     \R0, \R0, \R1, lsl #(32 - \shift * 8)
        mov     \R1, \R1, lsr #(\shift * 8)
        orr     \R1, \R1, \R2, lsl #(32 - \shift * 8)
  .endm
  .macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
        @ Separate-destination variant of ADJ_ALIGN_DOUBLEWORD:
        @   Rdst0 = (Rsrc0 >> (8 * shift)) | (Rsrc1 << (32 - 8 * shift))
        @   Rdst1 = (Rsrc1 >> (8 * shift)) | (Rsrc2 << (32 - 8 * shift))
        @ Rsrc0 is read only by the first mov, so Rdst1 may reuse that
        @ register (RND_XY2_IT relies on this).  Rsrc1 and Rsrc2 are
        @ read after both destinations have been written and must stay
        @ distinct from them.
        mov     \Rdst0, \Rsrc0, lsr #(\shift * 8)
        mov     \Rdst1, \Rsrc1, lsr #(\shift * 8)
        orr     \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
        orr     \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
  .endm
  .macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
        @ Two-word average of Rn and Rm, taken independently per byte,
        @ with halves rounded up:
        @   Rd = (Rn | Rm) - (((Rn ^ Rm) & Rmask) >> 1)
        @ Rmask must hold 0xFEFEFEFE (= ~0x01010101); clearing bit 0 of
        @ every byte keeps the shift from leaking into the byte below.
        @ Rn0/Rn1 are overwritten with Rn | Rm.  Rm and Rmask are kept.
        eor     \Rd0, \Rn0, \Rm0
        eor     \Rd1, \Rn1, \Rm1
        orr     \Rn0, \Rn0, \Rm0
        orr     \Rn1, \Rn1, \Rm1
        and     \Rd0, \Rd0, \Rmask
        and     \Rd1, \Rd1, \Rmask
        sub     \Rd0, \Rn0, \Rd0, lsr #1
        sub     \Rd1, \Rn1, \Rd1, lsr #1
  .endm
  .macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
        @ Two-word average of Rn and Rm, taken independently per byte,
        @ with halves rounded down:
        @   Rd = (Rn & Rm) + (((Rn ^ Rm) & Rmask) >> 1)
        @ Rmask must hold 0xFEFEFEFE (= ~0x01010101); clearing bit 0 of
        @ every byte keeps the shift from leaking into the byte below.
        @ Rn0/Rn1 are overwritten with Rn & Rm.  Rm and Rmask are kept.
        eor     \Rd0, \Rn0, \Rm0
        eor     \Rd1, \Rn1, \Rm1
        and     \Rn0, \Rn0, \Rm0
        and     \Rn1, \Rn1, \Rm1
        and     \Rd0, \Rd0, \Rmask
        and     \Rd1, \Rd1, \Rmask
        add     \Rd0, \Rn0, \Rd0, lsr #1
        add     \Rd1, \Rn1, \Rd1, lsr #1
  .endm
  67. @ ----------------------------------------------------------------
  .p2align 8                            @ 2^8 = 256-byte boundary
  .global put_pixels16_arm
  put_pixels16_arm:
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        @
        @ Copies h rows of 16 bytes from pixels to block.
        @   r0 = block (advanced by r2 per row)
        @   r1 = pixels, rounded down to a word boundary below
        @   r2 = line_size
        @   r3 = h, counted down to zero (the loops test after the
        @        first row, so h is presumably >= 1 -- confirm at callers)
        @ The low two bits of pixels pick one of four copy loops.
        pld     [r1]
        stmfd   sp!, {r4-r11, lr}       @ R14 is also called LR
        adr     r5, 5f                  @ r5 = dispatch table
        ands    r4, r1, #3              @ r4 = byte offset in word, Z if aligned
        bic     r1, r1, #3
        add     r5, r5, r4, lsl #2      @ r5 = &table[offset]
        ldrne   pc, [r5]                @ misaligned: jump to loop 2, 3 or 4
  1:
        @ offset 0: straight four-word copy
        ldmia   r1, {r4-r7}
        add     r1, r1, r2
        stmia   r0, {r4-r7}
        pld     [r1]
        subs    r3, r3, #1
        add     r0, r0, r2
        bne     1b
        ldmfd   sp!, {r4-r11, pc}
  .p2align 8
  2:
        @ offset 1: read five words, realign into r9-r12
        ldmia   r1, {r4-r8}
        add     r1, r1, r2
        ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
        pld     [r1]
        subs    r3, r3, #1
        stmia   r0, {r9-r12}
        add     r0, r0, r2
        bne     2b
        ldmfd   sp!, {r4-r11, pc}
  .p2align 8
  3:
        @ offset 2
        ldmia   r1, {r4-r8}
        add     r1, r1, r2
        ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
        pld     [r1]
        subs    r3, r3, #1
        stmia   r0, {r9-r12}
        add     r0, r0, r2
        bne     3b
        ldmfd   sp!, {r4-r11, pc}
  .p2align 8
  4:
        @ offset 3
        ldmia   r1, {r4-r8}
        add     r1, r1, r2
        ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
        pld     [r1]
        subs    r3, r3, #1
        stmia   r0, {r9-r12}
        add     r0, r0, r2
        bne     4b
        ldmfd   sp!, {r4-r11,pc}
  .p2align 8
  5:
        @ dispatch table, indexed by (pixels & 3)
        .word   1b
        .word   2b
        .word   3b
        .word   4b
  128. @ ----------------------------------------------------------------
  .p2align 8                            @ 2^8 = 256-byte boundary
  .global put_pixels8_arm
  put_pixels8_arm:
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        @
        @ Copies h rows of 8 bytes from pixels to block.
        @   r0 = block, r1 = pixels (word-aligned below), r2 = line_size,
        @   r3 = h (row counter), r12 = scratch third word on the
        @   misaligned paths.
        pld     [r1]
        stmfd   sp!, {r4-r5,lr}         @ R14 is also called LR
        adr     r5, 5f                  @ r5 = dispatch table
        ands    r4, r1, #3              @ r4 = byte offset in word, Z if aligned
        bic     r1, r1, #3
        add     r5, r5, r4, lsl #2      @ r5 = &table[offset]
        ldrne   pc, [r5]                @ misaligned: jump to loop 2, 3 or 4
  1:
        @ offset 0: straight two-word copy
        ldmia   r1, {r4-r5}
        add     r1, r1, r2
        subs    r3, r3, #1
        pld     [r1]
        stmia   r0, {r4-r5}
        add     r0, r0, r2
        bne     1b
        ldmfd   sp!, {r4-r5,pc}
  .p2align 8
  2:
        @ offset 1: read three words, realign r4/r5 in place
        ldmia   r1, {r4-r5, r12}
        add     r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12
        pld     [r1]
        subs    r3, r3, #1
        stmia   r0, {r4-r5}
        add     r0, r0, r2
        bne     2b
        ldmfd   sp!, {r4-r5,pc}
  .p2align 8
  3:
        @ offset 2
        ldmia   r1, {r4-r5, r12}
        add     r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12
        pld     [r1]
        subs    r3, r3, #1
        stmia   r0, {r4-r5}
        add     r0, r0, r2
        bne     3b
        ldmfd   sp!, {r4-r5,pc}
  .p2align 8
  4:
        @ offset 3
        ldmia   r1, {r4-r5, r12}
        add     r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12
        pld     [r1]
        subs    r3, r3, #1
        stmia   r0, {r4-r5}
        add     r0, r0, r2
        bne     4b
        ldmfd   sp!, {r4-r5,pc}
  .p2align 8
  5:
        @ dispatch table, indexed by (pixels & 3)
        .word   1b
        .word   2b
        .word   3b
        .word   4b
  189. @ ----------------------------------------------------------------
  .p2align 8                            @ 2^8 = 256-byte boundary
  .global put_pixels8_x2_arm
  put_pixels8_x2_arm:
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        @
        @ For each of h rows, stores the rounded-up per-byte average of
        @ the 8 bytes at pixels and the 8 bytes at pixels + 1.
        @   r0 = block, r1 = pixels (word-aligned below), r2 = line_size,
        @   r3 = h (row counter), r12 = 0xFEFEFEFE mask for RND_AVG32.
        pld     [r1]
        stmfd   sp!, {r4-r10,lr}        @ R14 is also called LR
        adr     r5, 5f                  @ r5 = table (slot 0 holds the mask)
        ands    r4, r1, #3              @ r4 = byte offset in word, Z if aligned
        ldr     r12, [r5]               @ r12 = 0xFEFEFEFE
        add     r5, r5, r4, lsl #2      @ r5 = &table[offset]
        bic     r1, r1, #3
        ldrne   pc, [r5]                @ misaligned: jump to loop 2, 3 or 4
  1:
        @ offset 0: r4/r5 = pixels, r6/r7 = pixels + 1
        ldmia   r1, {r4-r5, r10}
        add     r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
        pld     [r1]
        RND_AVG32 r8, r9, r4, r5, r6, r7, r12
        subs    r3, r3, #1
        stmia   r0, {r8-r9}
        add     r0, r0, r2
        bne     1b
        ldmfd   sp!, {r4-r10,pc}
  .p2align 8
  2:
        @ offset 1: r6/r7 = bytes at +1, r8/r9 = bytes at +2
        ldmia   r1, {r4-r5, r10}
        add     r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
        ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
        pld     [r1]
        RND_AVG32 r4, r5, r6, r7, r8, r9, r12
        subs    r3, r3, #1
        stmia   r0, {r4-r5}
        add     r0, r0, r2
        bne     2b
        ldmfd   sp!, {r4-r10,pc}
  .p2align 8
  3:
        @ offset 2: r6/r7 = bytes at +2, r8/r9 = bytes at +3
        ldmia   r1, {r4-r5, r10}
        add     r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
        ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
        pld     [r1]
        RND_AVG32 r4, r5, r6, r7, r8, r9, r12
        subs    r3, r3, #1
        stmia   r0, {r4-r5}
        add     r0, r0, r2
        bne     3b
        ldmfd   sp!, {r4-r10,pc}
  .p2align 8
  4:
        @ offset 3: r6/r7 = bytes at +3; bytes at +4 are already the
        @ whole words r5/r10, so no second realignment is needed
        ldmia   r1, {r4-r5, r10}
        add     r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
        pld     [r1]
        RND_AVG32 r8, r9, r6, r7, r5, r10, r12
        subs    r3, r3, #1
        stmia   r0, {r8-r9}
        add     r0, r0, r2
        bne     4b
        ldmfd   sp!, {r4-r10,pc}        @@ update PC with LR content.
  .p2align 8
  5:
        @ slot 0 doubles as the averaging mask (the aligned case falls
        @ through to 1: and never reads a target from here)
        .word   0xFEFEFEFE
        .word   2b
        .word   3b
        .word   4b
  .p2align 8                            @ 2^8 = 256-byte boundary
  .global put_no_rnd_pixels8_x2_arm
  put_no_rnd_pixels8_x2_arm:
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        @
        @ For each of h rows, stores the rounded-down per-byte average
        @ of the 8 bytes at pixels and the 8 bytes at pixels + 1.
        @   r0 = block, r1 = pixels (word-aligned below), r2 = line_size,
        @   r3 = h (row counter), r12 = 0xFEFEFEFE mask for NO_RND_AVG32.
        pld     [r1]
        stmfd   sp!, {r4-r10,lr}        @ R14 is also called LR
        adr     r5, 5f                  @ r5 = table (slot 0 holds the mask)
        ands    r4, r1, #3              @ r4 = byte offset in word, Z if aligned
        ldr     r12, [r5]               @ r12 = 0xFEFEFEFE
        add     r5, r5, r4, lsl #2      @ r5 = &table[offset]
        bic     r1, r1, #3
        ldrne   pc, [r5]                @ misaligned: jump to loop 2, 3 or 4
  1:
        @ offset 0: r4/r5 = pixels, r6/r7 = pixels + 1
        ldmia   r1, {r4-r5, r10}
        add     r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
        pld     [r1]
        NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
        subs    r3, r3, #1
        stmia   r0, {r8-r9}
        add     r0, r0, r2
        bne     1b
        ldmfd   sp!, {r4-r10,pc}
  .p2align 8
  2:
        @ offset 1: r6/r7 = bytes at +1, r8/r9 = bytes at +2
        ldmia   r1, {r4-r5, r10}
        add     r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
        ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
        pld     [r1]
        NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
        subs    r3, r3, #1
        stmia   r0, {r4-r5}
        add     r0, r0, r2
        bne     2b
        ldmfd   sp!, {r4-r10,pc}
  .p2align 8
  3:
        @ offset 2: r6/r7 = bytes at +2, r8/r9 = bytes at +3
        ldmia   r1, {r4-r5, r10}
        add     r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
        ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
        pld     [r1]
        NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
        subs    r3, r3, #1
        stmia   r0, {r4-r5}
        add     r0, r0, r2
        bne     3b
        ldmfd   sp!, {r4-r10,pc}
  .p2align 8
  4:
        @ offset 3: r6/r7 = bytes at +3; bytes at +4 are already the
        @ whole words r5/r10, so no second realignment is needed
        ldmia   r1, {r4-r5, r10}
        add     r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
        pld     [r1]
        NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
        subs    r3, r3, #1
        stmia   r0, {r8-r9}
        add     r0, r0, r2
        bne     4b
        ldmfd   sp!, {r4-r10,pc}        @@ update PC with LR content.
  .p2align 8
  5:
        @ slot 0 doubles as the averaging mask (the aligned case falls
        @ through to 1: and never reads a target from here)
        .word   0xFEFEFEFE
        .word   2b
        .word   3b
        .word   4b
  326. @ ----------------------------------------------------------------
  .p2align 8                            @ 2^8 = 256-byte boundary
  .global put_pixels8_y2_arm
  put_pixels8_y2_arm:
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        @
        @ Each output row is the rounded-up per-byte average of an
        @ 8-byte source row and the source row below it.
        @   r0 = block, r1 = pixels (word-aligned below), r2 = line_size,
        @   r3 = h / 2 (every loop pass emits two rows),
        @   r12 = 0xFEFEFEFE mask for RND_AVG32.
        @ NOTE(review): h is halved with a logical shift, so an odd h
        @ drops its last row and h < 2 never reaches zero cleanly --
        @ presumably callers always pass an even h >= 2; confirm.
        pld     [r1]
        stmfd   sp!, {r4-r11,lr}        @ R14 is also called LR
        adr     r5, 5f                  @ r5 = table (slot 0 holds the mask)
        ands    r4, r1, #3              @ r4 = byte offset in word, Z if aligned
        mov     r3, r3, lsr #1          @ r3 = number of row pairs
        ldr     r12, [r5]               @ r12 = 0xFEFEFEFE
        add     r5, r5, r4, lsl #2      @ r5 = &table[offset]
        bic     r1, r1, #3
        ldrne   pc, [r5]                @ misaligned: jump to path 2, 3 or 4
  1:
        @ offset 0: prime r4/r5 with the first source row
        ldmia   r1, {r4-r5}
        add     r1, r1, r2
  6:    ldmia   r1, {r6-r7}             @ next row
        add     r1, r1, r2
        pld     [r1]
        RND_AVG32 r8, r9, r4, r5, r6, r7, r12
        ldmia   r1, {r4-r5}             @ row after that; kept for next pass
        add     r1, r1, r2
        stmia   r0, {r8-r9}
        add     r0, r0, r2
        pld     [r1]
        RND_AVG32 r8, r9, r6, r7, r4, r5, r12
        subs    r3, r3, #1
        stmia   r0, {r8-r9}
        add     r0, r0, r2
        bne     6b
        ldmfd   sp!, {r4-r11,pc}
  .p2align 8
  2:
        @ offset 1: same scheme, each row realigned after loading
        ldmia   r1, {r4-r6}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
  6:    ldmia   r1, {r7-r9}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stmia   r0, {r10-r11}
        add     r0, r0, r2
        ldmia   r1, {r4-r6}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
        subs    r3, r3, #1              @ flags survive until bne: nothing below sets them
        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stmia   r0, {r10-r11}
        add     r0, r0, r2
        bne     6b
        ldmfd   sp!, {r4-r11,pc}
  .p2align 8
  3:
        @ offset 2
        ldmia   r1, {r4-r6}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
  6:    ldmia   r1, {r7-r9}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stmia   r0, {r10-r11}
        add     r0, r0, r2
        ldmia   r1, {r4-r6}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
        subs    r3, r3, #1
        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stmia   r0, {r10-r11}
        add     r0, r0, r2
        bne     6b
        ldmfd   sp!, {r4-r11,pc}
  .p2align 8
  4:
        @ offset 3
        ldmia   r1, {r4-r6}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
  6:    ldmia   r1, {r7-r9}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stmia   r0, {r10-r11}
        add     r0, r0, r2
        ldmia   r1, {r4-r6}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
        subs    r3, r3, #1
        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stmia   r0, {r10-r11}
        add     r0, r0, r2
        bne     6b
        ldmfd   sp!, {r4-r11,pc}
  .p2align 8
  5:
        @ slot 0 doubles as the averaging mask (the aligned case falls
        @ through to 1: and never reads a target from here)
        .word   0xFEFEFEFE
        .word   2b
        .word   3b
        .word   4b
  .p2align 8                            @ 2^8 = 256-byte boundary
  .global put_no_rnd_pixels8_y2_arm
  put_no_rnd_pixels8_y2_arm:
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        @
        @ Each output row is the rounded-down per-byte average of an
        @ 8-byte source row and the source row below it.
        @   r0 = block, r1 = pixels (word-aligned below), r2 = line_size,
        @   r3 = h / 2 (every loop pass emits two rows),
        @   r12 = 0xFEFEFEFE mask for NO_RND_AVG32.
        @ NOTE(review): h is halved with a logical shift, so an odd h
        @ drops its last row and h < 2 never reaches zero cleanly --
        @ presumably callers always pass an even h >= 2; confirm.
        pld     [r1]
        stmfd   sp!, {r4-r11,lr}        @ R14 is also called LR
        adr     r5, 5f                  @ r5 = table (slot 0 holds the mask)
        ands    r4, r1, #3              @ r4 = byte offset in word, Z if aligned
        mov     r3, r3, lsr #1          @ r3 = number of row pairs
        ldr     r12, [r5]               @ r12 = 0xFEFEFEFE
        add     r5, r5, r4, lsl #2      @ r5 = &table[offset]
        bic     r1, r1, #3
        ldrne   pc, [r5]                @ misaligned: jump to path 2, 3 or 4
  1:
        @ offset 0: prime r4/r5 with the first source row
        ldmia   r1, {r4-r5}
        add     r1, r1, r2
  6:    ldmia   r1, {r6-r7}             @ next row
        add     r1, r1, r2
        pld     [r1]
        NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
        ldmia   r1, {r4-r5}             @ row after that; kept for next pass
        add     r1, r1, r2
        stmia   r0, {r8-r9}
        add     r0, r0, r2
        pld     [r1]
        NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
        subs    r3, r3, #1
        stmia   r0, {r8-r9}
        add     r0, r0, r2
        bne     6b
        ldmfd   sp!, {r4-r11,pc}
  .p2align 8
  2:
        @ offset 1: same scheme, each row realigned after loading
        ldmia   r1, {r4-r6}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
  6:    ldmia   r1, {r7-r9}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stmia   r0, {r10-r11}
        add     r0, r0, r2
        ldmia   r1, {r4-r6}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
        subs    r3, r3, #1              @ flags survive until bne: nothing below sets them
        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stmia   r0, {r10-r11}
        add     r0, r0, r2
        bne     6b
        ldmfd   sp!, {r4-r11,pc}
  .p2align 8
  3:
        @ offset 2
        ldmia   r1, {r4-r6}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
  6:    ldmia   r1, {r7-r9}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stmia   r0, {r10-r11}
        add     r0, r0, r2
        ldmia   r1, {r4-r6}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
        subs    r3, r3, #1
        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stmia   r0, {r10-r11}
        add     r0, r0, r2
        bne     6b
        ldmfd   sp!, {r4-r11,pc}
  .p2align 8
  4:
        @ offset 3
        ldmia   r1, {r4-r6}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
  6:    ldmia   r1, {r7-r9}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stmia   r0, {r10-r11}
        add     r0, r0, r2
        ldmia   r1, {r4-r6}
        add     r1, r1, r2
        pld     [r1]
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
        subs    r3, r3, #1
        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stmia   r0, {r10-r11}
        add     r0, r0, r2
        bne     6b
        ldmfd   sp!, {r4-r11,pc}
  .p2align 8
  5:
        @ slot 0 doubles as the averaging mask (the aligned case falls
        @ through to 1: and never reads a target from here)
        .word   0xFEFEFEFE
        .word   2b
        .word   3b
        .word   4b
  541. @ ----------------------------------------------------------------
  .macro RND_XY2_IT align, rnd
        @ One source row of the xy2 (2x2 average) filter.  With a = the
        @ 8 bytes at pixels and b = the 8 bytes at pixels + 1:
        @   l1 = (a & 0x03030303) + (b & 0x03030303) ?(+ rounding)
        @   h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
        @ Out: r8/r9 = l1, r10/r11 = h1.
        @ In:  r1 = word-aligned row pointer (advanced by r2 here),
        @      r3 = row counter; the rounding term is added only when
        @           bit 0 of r3 is clear,
        @      r12 = constant table: +0 0x03030303, +16 0x02020202,
        @            +20 0xFCFCFCFC >> 2, +28 0x01010101.
        @ \align = byte offset of pixels within its word (0-3).
        @ \rnd   = 1 selects the 0x02020202 rounding term, anything
        @          else selects 0x01010101.
        @ Clobbers r4-r11 and r14.  The flags from tst are consumed by
        @ the ldreq/addeq below, so nothing in between may set flags.
  .if \align == 0
        ldmia   r1, {r6-r8}
  .elseif \align == 3
        ldmia   r1, {r5-r7}
  .else
        ldmia   r1, {r8-r10}
  .endif
        add     r1, r1, r2
        pld     [r1]
        @ Leave the two 8-byte operands in r4/r5 and r6/r7.
  .if \align == 0
        ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8
  .elseif \align == 1
        ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10
        ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10
  .elseif \align == 2
        ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10
        ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10
  .elseif \align == 3
        @ r5 is both a source and the second destination here; the
        @ macro reads it before overwriting it.
        ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7
  .endif
        ldr     r14, [r12, #0]          @ 0x03030303
        tst     r3, #1                  @ Z set when r3 is even
        and     r8, r4, r14
        and     r9, r5, r14
        and     r10, r6, r14
        and     r11, r7, r14
  .if \rnd == 1
        ldreq   r14, [r12, #16]         @ 0x02020202
  .else
        ldreq   r14, [r12, #28]         @ 0x01010101
  .endif
        add     r8, r8, r10
        add     r9, r9, r11
        addeq   r8, r8, r14
        addeq   r9, r9, r14
        ldr     r14, [r12, #20]         @ 0xFCFCFCFC >> 2
        and     r4, r14, r4, lsr #2
        and     r5, r14, r5, lsr #2
        and     r6, r14, r6, lsr #2
        and     r7, r14, r7, lsr #2
        add     r10, r4, r6
        add     r11, r5, r7
  .endm
  .macro RND_XY2_EXPAND align, rnd
        @ Body of the xy2 functions for one source alignment.  Each
        @ output row combines the l/h partial sums (see RND_XY2_IT) of
        @ two vertically adjacent source rows:
        @   out = h_prev + h_cur + (((l_prev + l_cur) >> 2) & 0x0F0F0F0F)
        @ In:  r0 = block, r1 = word-aligned pixels, r2 = line_size,
        @      r3 = h, r12 = constant table (+24 holds 0x0F0F0F0F).
        @ Ends by popping {r4-r11, pc}, so the caller must have pushed
        @ {r4-r11, lr} and nothing else.
        @
        @ RND_XY2_IT adds the rounding term only when r3 is even, and
        @ every output row needs that term exactly once across its two
        @ source rows.  Inside the loop r3 drops by one per row, so
        @ neighbouring rows alternate.  The priming row below, however,
        @ would see the same r3 as the first in-loop row and the first
        @ output row would get the term twice (or not at all for odd
        @ h).  Flip r3's parity for the priming call only.
        add     r3, r3, #1
        RND_XY2_IT \align, \rnd
        sub     r3, r3, #1
  6:    stmfd   sp!, {r8-r11}           @ park previous row's l/h sums
        RND_XY2_IT \align, \rnd
        ldmfd   sp!, {r4-r7}            @ r4/r5 = l_prev, r6/r7 = h_prev
        add     r4, r4, r8
        add     r5, r5, r9
        add     r6, r6, r10
        add     r7, r7, r11
        ldr     r14, [r12, #24]         @ 0x0F0F0F0F
        and     r4, r14, r4, lsr #2
        and     r5, r14, r5, lsr #2
        add     r4, r4, r6
        add     r5, r5, r7
        subs    r3, r3, #1
        stmia   r0, {r4-r5}
        add     r0, r0, r2
        bne     6b
        ldmfd   sp!, {r4-r11,pc}
  .endm
  .p2align 8                            @ 2^8 = 256-byte boundary
  .global put_pixels8_xy2_arm
  put_pixels8_xy2_arm:
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        @
        @ 8-byte-wide xy2 filter using the 0x02020202 rounding term
        @ (\rnd = 1).  All of the arithmetic lives in RND_XY2_EXPAND,
        @ which also performs the function return; this entry point
        @ only saves registers and dispatches on (pixels & 3).
        @   r12 = constant/dispatch table at 5: for the whole function.
        pld     [r1]
        stmfd   sp!, {r4-r11,lr}        @ R14 is also called LR
        adrl    r12, 5f                 @ table is beyond plain adr range
        ands    r4, r1, #3              @ r4 = byte offset in word, Z if aligned
        add     r5, r12, r4, lsl #2     @ r5 = &table[offset]
        bic     r1, r1, #3
        ldrne   pc, [r5]                @ misaligned: jump to path 2, 3 or 4
  1:
        RND_XY2_EXPAND 0, 1
  .p2align 8
  2:
        RND_XY2_EXPAND 1, 1
  .p2align 8
  3:
        RND_XY2_EXPAND 2, 1
  .p2align 8
  4:
        RND_XY2_EXPAND 3, 1
  5:
        @ Slot 0 is a constant, not a branch target: the aligned case
        @ falls through to 1:.  RND_XY2_IT and RND_XY2_EXPAND read
        @ these words by fixed byte offset, so keep the order.
        .word   0x03030303              @ +0
        .word   2b                      @ +4
        .word   3b                      @ +8
        .word   4b                      @ +12
        .word   0x02020202              @ +16
        .word   0xFCFCFCFC >> 2         @ +20
        .word   0x0F0F0F0F              @ +24
        .word   0x01010101              @ +28
  .p2align 8                            @ 2^8 = 256-byte boundary
  .global put_no_rnd_pixels8_xy2_arm
  put_no_rnd_pixels8_xy2_arm:
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        @
        @ 8-byte-wide xy2 filter using the 0x01010101 rounding term
        @ (\rnd = 0).  All of the arithmetic lives in RND_XY2_EXPAND,
        @ which also performs the function return; this entry point
        @ only saves registers and dispatches on (pixels & 3).
        @   r12 = constant/dispatch table at 5: for the whole function.
        pld     [r1]
        stmfd   sp!, {r4-r11,lr}        @ R14 is also called LR
        adrl    r12, 5f                 @ table is beyond plain adr range
        ands    r4, r1, #3              @ r4 = byte offset in word, Z if aligned
        add     r5, r12, r4, lsl #2     @ r5 = &table[offset]
        bic     r1, r1, #3
        ldrne   pc, [r5]                @ misaligned: jump to path 2, 3 or 4
  1:
        RND_XY2_EXPAND 0, 0
  .p2align 8
  2:
        RND_XY2_EXPAND 1, 0
  .p2align 8
  3:
        RND_XY2_EXPAND 2, 0
  .p2align 8
  4:
        RND_XY2_EXPAND 3, 0
  5:
        @ Slot 0 is a constant, not a branch target: the aligned case
        @ falls through to 1:.  RND_XY2_IT and RND_XY2_EXPAND read
        @ these words by fixed byte offset, so keep the order.
        .word   0x03030303              @ +0
        .word   2b                      @ +4
        .word   3b                      @ +8
        .word   4b                      @ +12
        .word   0x02020202              @ +16
        .word   0xFCFCFCFC >> 2         @ +20
        .word   0x0F0F0F0F              @ +24
        .word   0x01010101              @ +28