You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

501 lines
12KB

  1. ;******************************************************************************
  2. ;* MMX/SSE2-optimized functions for the RV40 decoder
  3. ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
  4. ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
  5. ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  6. ;*
  7. ;* This file is part of Libav.
  8. ;*
  9. ;* Libav is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* Libav is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with Libav; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23. %include "libavutil/x86/x86util.asm"
  SECTION_RODATA 16

  ; Eight words of 1 << (16 - 6) = 1024. The SSSE3 weight functions load this
  ; as the multiplier for their final pmulhrsw.
  pw_1024:
      times 8 dw 1024

  ; Byte coefficient pairs for the pmaddubsw-based (SSSE3) qpel filters.
  ; Three 32-byte entries; each entry is two 16-byte rows. The filters read
  ; the row at +0 for the two outer pixel pairs and the row at +16 for the
  ; centre pair.
  sixtap_filter_hb_m:
      times 8 db  1,  -5
      times 8 db 52,  20
      ; multiplied by 2 to have the same shift
      times 8 db  2, -10
      times 8 db 40,  40
      ; back to normal
      times 8 db  1,  -5
      times 8 db 20,  52

  ; Word coefficients for the pmullw-based qpel filters.
  ; Three 64-byte entries; each entry is four 16-byte rows which the filters
  ; read at +0, +16, +32 and +48 (COEFF05, COEFF14, COEFF2, COEFF3).
  sixtap_filter_v_m:
      times 8 dw   1
      times 8 dw  -5
      times 8 dw  52
      times 8 dw  20
      ; multiplied by 2 to have the same shift
      times 8 dw   2
      times 8 dw -10
      times 8 dw  40
      times 8 dw  40
      ; back to normal
      times 8 dw   1
      times 8 dw  -5
      times 8 dw  20
      times 8 dw  52
  ; Names used as the second argument of LOAD.
  ; Without PIC they are the table labels themselves; with PIC they are the
  ; extra register (picreg) that each function fills with the table address.
  ; npicregs is the number of additional registers requested from cglobal.
  ; NOTE(review): sixtap_filter_hw / sixtap_filter_hw_m are not referenced or
  ; defined anywhere else in the visible source - confirm they are still needed.
  %ifndef PIC
  %define npicregs 0
  %define sixtap_filter_hw sixtap_filter_hw_m
  %define sixtap_filter_hb sixtap_filter_hb_m
  %define sixtap_filter_v sixtap_filter_v_m
  %else
  %define npicregs 1
  %define sixtap_filter_hw picregq
  %define sixtap_filter_hb picregq
  %define sixtap_filter_v picregq
  %endif
  ; pshufb masks for the SSSE3 horizontal filter. The source vector is loaded
  ; from srcq-2, so byte index i is the pixel at x - 2 + i.
  ;   shuf1: pairs (i,   i+1)  -> pixels x-2, x-1
  ;   shuf2: pairs (i+2, i+3)  -> pixels x,   x+1
  ;   shuf3: pairs (i+5, i+4)  -> pixels x+3, x+2 (swapped, so the same
  ;                               coefficient pair as shuf1 can be reused)
  filter_h6_shuf1:
      db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,  5, 6,  6,  7,  7,  8
  filter_h6_shuf2:
      db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,  7, 8,  8,  9,  9, 10
  filter_h6_shuf3:
      db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11

  ; Word constants defined outside this file.
  cextern pw_16
  cextern pw_32
  cextern pw_512

  SECTION .text
  ;-----------------------------------------------------------------------------
  ; subpel MC functions:
  ;
  ; void ff_[put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int dststride,
  ;                                         uint8_t *src, int srcstride,
  ;                                         int len, int m);
  ;
  ; In the code below, len is named height and m is named mx (h) or my (v).
  ;-----------------------------------------------------------------------------
  ; LOAD arg, table
  ; Adds the base address of a coefficient table to the named argument, so
  ; that <arg>q can then be used directly as a pointer into that table.
  ;   arg   - argument name (mx or my); updated in place
  ;   table - table base, used only when PIC is not defined; with PIC the
  ;           base is taken from picregq, which the caller must set up first
  ; On WIN64 the argument is first sign-extended from 32 to 64 bits
  ; (presumably because its upper half is not guaranteed to be clean there).
  %macro LOAD 2
  %if WIN64
      movsxd    %1q, %1d
  %endif
  %ifndef PIC
      add       %1q, %2
  %else
      add       %1q, picregq
  %endif
  %endmacro
  ; STORE result, scratch, op
  ; Packs the words of <result> to unsigned saturated bytes and writes them
  ; to [dstq] with movh.
  ;   result  - register with the filtered words; clobbered
  ;   scratch - register used to fetch the existing destination (avg only)
  ;   op      - avg: average with the bytes already at [dstq] before storing
  ;             anything else (put): plain store
  %macro STORE 3
  %ifidn %3, avg
      movh      %2, [dstq]
      packuswb  %1, %1
      PAVGB     %1, %2
  %else
      packuswb  %1, %1
  %endif
      movh      [dstq], %1
  %endmacro
  ; FILTER_V op
  ; Emits <op>_rv40_qpel_v for the active instruction set (op = put or avg):
  ; a vertical six-tap filter using word multiplies.
  ;
  ; Arguments: dst, dststride, src, srcstride, height, my
  ;   my is an offset into sixtap_filter_v_m; LOAD turns it into a pointer.
  ; Per output row, with r(k) the source row k lines from the current one:
  ;   out = ((r(-2) + r(3)) * COEFF05 + (r(-1) + r(2)) * COEFF14
  ;          + r(0) * COEFF2 + r(1) * COEFF3 + pw_32) >> 6
  ; Register roles: m0-m4 = rows -2..+2 as words, m5 = new row, m6 = sum,
  ;                 m7 = zero.
  ; NOTE(review): the strides are used as full-width registers although the
  ; prototype comment declares them int - confirm against the C declaration.
  %macro FILTER_V 1
  cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
  %ifdef PIC
      lea       picregq, [sixtap_filter_v_m]
  %endif
      LOAD      my, sixtap_filter_v
      pxor      m7, m7                       ; zero, for byte -> word unpacking

      ; read 5 lines, starting two rows above src
      sub       srcq, srcstrideq
      sub       srcq, srcstrideq
      movh      m0, [srcq]
      movh      m1, [srcq+srcstrideq]
      movh      m2, [srcq+srcstrideq*2]
      punpcklbw m0, m7
      punpcklbw m1, m7
      punpcklbw m2, m7
      lea       srcq, [srcq+srcstrideq*2]
      add       srcq, srcstrideq             ; srcq is now one row below the original src
      movh      m3, [srcq]
      movh      m4, [srcq+srcstrideq]
      punpcklbw m3, m7
      punpcklbw m4, m7

      ; keep the coefficients in registers when m8 and up exist
  %ifdef m8
      mova      m8,  [myq+ 0]
      mova      m9,  [myq+16]
      mova      m10, [myq+32]
      mova      m11, [myq+48]
  %define COEFF05 m8
  %define COEFF14 m9
  %define COEFF2 m10
  %define COEFF3 m11
  %else
  %define COEFF05 [myq+ 0]
  %define COEFF14 [myq+16]
  %define COEFF2 [myq+32]
  %define COEFF3 [myq+48]
  %endif

  .nextrow:
      mova      m6, m1
      movh      m5, [srcq+2*srcstrideq]      ; read new row
      paddw     m6, m4                       ; r(-1) + r(2)
      punpcklbw m5, m7
      pmullw    m6, COEFF14
      paddw     m0, m5                       ; r(-2) + r(3)
      pmullw    m0, COEFF05
      paddw     m6, m0
      mova      m0, m1                       ; slide the row window down by one
      paddw     m6, [pw_32]
      mova      m1, m2
      pmullw    m2, COEFF2
      paddw     m6, m2
      mova      m2, m3
      pmullw    m3, COEFF3
      paddw     m6, m3

      ; round/clip/store
      mova      m3, m4
      psraw     m6, 6
      mova      m4, m5                       ; m5 is free to be STORE scratch after this
      STORE     m6, m5, %1

      ; go to next line
      add       dstq, dststrideq
      add       srcq, srcstrideq
      dec       heightd                      ; next row
      jg        .nextrow
      REP_RET
  %endmacro
  ; FILTER_H op
  ; Emits <op>_rv40_qpel_h for the active instruction set (op = put or avg):
  ; a horizontal six-tap filter using word multiplies.
  ;
  ; Arguments: dst, dststride, src, srcstride, height, mx
  ;   mx is an offset into sixtap_filter_v_m (the word table is shared with
  ;   the vertical filter); LOAD turns it into a pointer.
  ; Per output pixel, with p(k) the source pixel k bytes from the current one:
  ;   out = ((p(-2) + p(3)) * COEFF05 + (p(-1) + p(2)) * COEFF14
  ;          + p(0) * COEFF2 + p(1) * COEFF3 + pw_32) >> 6
  ; Register roles: m0-m5 = shifted source windows, m6 = pw_32, m7 = zero.
  ; NOTE(review): the strides are used as full-width registers although the
  ; prototype comment declares them int - confirm against the C declaration.
  %macro FILTER_H 1
  cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
  %ifdef PIC
      lea       picregq, [sixtap_filter_v_m]
  %endif
      LOAD      mx, sixtap_filter_v
      pxor      m7, m7                       ; zero, for byte -> word unpacking
      mova      m6, [pw_32]                  ; bias added before the shift

      ; keep the coefficients in registers when m8 and up exist
  %ifdef m8
      mova      m8,  [mxq+ 0]
      mova      m9,  [mxq+16]
      mova      m10, [mxq+32]
      mova      m11, [mxq+48]
  %define COEFF05 m8
  %define COEFF14 m9
  %define COEFF2 m10
  %define COEFF3 m11
  %else
  %define COEFF05 [mxq+ 0]
  %define COEFF14 [mxq+16]
  %define COEFF2 [mxq+32]
  %define COEFF3 [mxq+48]
  %endif

  .nextrow:
      movq      m0, [srcq-2]
      movq      m5, [srcq+3]
      movq      m1, [srcq-1]
      movq      m4, [srcq+2]
      punpcklbw m0, m7
      punpcklbw m5, m7
      punpcklbw m1, m7
      punpcklbw m4, m7
      movq      m2, [srcq-0]
      movq      m3, [srcq+1]
      paddw     m0, m5                       ; p(-2) + p(3)
      paddw     m1, m4                       ; p(-1) + p(2)
      punpcklbw m2, m7
      punpcklbw m3, m7
      pmullw    m0, COEFF05
      pmullw    m1, COEFF14
      pmullw    m2, COEFF2
      pmullw    m3, COEFF3
      paddw     m0, m6
      paddw     m1, m2
      paddw     m0, m3
      paddw     m0, m1
      psraw     m0, 6
      STORE     m0, m1, %1

      ; go to next line
      add       dstq, dststrideq
      add       srcq, srcstrideq
      dec       heightd                      ; next row
      jg        .nextrow
      REP_RET
  %endmacro
  ; Instantiate the word-multiply qpel filters.
  ; The MMX-register versions are only assembled for 32-bit x86: put for
  ; plain mmx, avg once for mmxext and once for 3dnow.
  %if ARCH_X86_32
  INIT_MMX mmx
      FILTER_V put
      FILTER_H put

  INIT_MMX mmxext
      FILTER_V avg
      FILTER_H avg

  INIT_MMX 3dnow
      FILTER_V avg
      FILTER_H avg
  %endif

  ; The SSE2 versions are assembled for every target.
  INIT_XMM sse2
      FILTER_H put
      FILTER_H avg
      FILTER_V put
      FILTER_V avg
  ; FILTER_SSSE3 op
  ; Emits <op>_rv40_qpel_v and <op>_rv40_qpel_h (op = put or avg) built on
  ; pmaddubsw with the byte-pair table sixtap_filter_hb_m.
  ;
  ; Arguments: dst, dststride, src, srcstride, height, my (v) / mx (h)
  ;   my/mx is an offset into sixtap_filter_hb_m; LOAD turns it into a
  ;   pointer. The 16 bytes at +0 weight the two outer sample pairs, the
  ;   16 bytes at +16 weight the centre pair.
  ; Both functions finish each row with pmulhrsw by pw_512 (declared
  ; externally; presumably eight words of 512, i.e. a rounded >> 6).
  ; NOTE(review): the strides are used as full-width registers although the
  ; prototype comment declares them int - confirm against the C declaration.
  %macro FILTER_SSSE3 1
  cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
  %ifdef PIC
      lea       picregq, [sixtap_filter_hb_m]
  %endif
      LOAD      my, sixtap_filter_hb
      mova      m5, [myq]                    ; outer-pair coefficients

      ; read 5 lines, starting two rows above src
      sub       srcq, srcstrideq
      sub       srcq, srcstrideq
      movh      m0, [srcq]
      movh      m1, [srcq+srcstrideq]
      movh      m2, [srcq+srcstrideq*2]
      lea       srcq, [srcq+srcstrideq*2]
      add       srcq, srcstrideq
      movh      m3, [srcq]
      movh      m4, [srcq+srcstrideq]
      lea       srcq, [srcq+2*srcstrideq]    ; srcq now addresses the next row to read

  .nextrow:
      mova      m6, m2
      punpcklbw m0, m1                       ; interleave rows -2 and -1
      punpcklbw m6, m3                       ; interleave rows 0 and +1
      pmaddubsw m0, m5
      pmaddubsw m6, [myq+16]
      movh      m7, [srcq]                   ; read new row
      paddw     m6, m0
      mova      m0, m1                       ; slide the row window down by one
      mova      m1, m2
      mova      m2, m3
      mova      m3, m4
      mova      m4, m7
      punpcklbw m7, m3                       ; interleave new row with the row above it
      pmaddubsw m7, m5
      paddw     m6, m7
      pmulhrsw  m6, [pw_512]
      STORE     m6, m7, %1

      ; go to next line
      add       dstq, dststrideq
      add       srcq, srcstrideq
      dec       heightd                      ; next row
      jg        .nextrow
      REP_RET

  cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
  %ifdef PIC
      lea       picregq, [sixtap_filter_hb_m]
  %endif
      LOAD      mx, sixtap_filter_hb
      mova      m5, [mxq]                    ; set up 6tap filter in bytes
      mova      m6, [mxq+16]
      mova      m7, [filter_h6_shuf1]
      mova      m3, [filter_h6_shuf2]
      mova      m4, [filter_h6_shuf3]

  .nextrow:
      movu      m0, [srcq-2]
      mova      m1, m0
      mova      m2, m0
      pshufb    m0, m7                       ; pixel pairs (x-2, x-1)
      pshufb    m1, m3                       ; pixel pairs (x,   x+1)
      pshufb    m2, m4                       ; pixel pairs (x+3, x+2)
      pmaddubsw m0, m5
      pmaddubsw m1, m6
      pmaddubsw m2, m5
      paddw     m0, m1
      paddw     m0, m2
      pmulhrsw  m0, [pw_512]
      STORE     m0, m1, %1

      ; go to next line
      add       dstq, dststrideq
      add       srcq, srcstrideq
      dec       heightd                      ; next row
      jg        .nextrow
      REP_RET
  %endmacro

  INIT_XMM ssse3
      FILTER_SSSE3 put
      FILTER_SSSE3 avg
  ; RV40_WCORE: weight and store one register's worth of output bytes.
  ; %1=5-bit weights?, %2=dst %3=src1 %4=src2 %5=stride if SSE2
  ;
  ; Uses r6 as the running offset added to every pointer expression.
  ; Without the fifth argument the two half-register loads are adjacent
  ; (offsets 0 and mmsize/2) and the result is written with one mova.
  ; With the fifth argument the second half is taken from the next line
  ; (r6 is advanced by r5) and the halves are stored to the two lines
  ; separately; r6 is left pointing at that second line. Note that the code
  ; adds r5 directly; the fifth argument only selects this mode.
  ; Expects: m0 = zero, m1 = rounding constant, m2/m3 = splatted weights.
  ; Clobbers m4-m7.
  %macro RV40_WCORE 4-5
      movh      m4, [%3 + r6 + 0]
      movh      m5, [%4 + r6 + 0]
  %if %0 == 5
      ; 8x8 block and SSE2, stride was provided
  %define OFFSET r6
      add       r6, r5
  %else
  %define OFFSET r6 + mmsize / 2
  %endif
      movh      m6, [%3 + OFFSET]
      movh      m7, [%4 + OFFSET]

  %if %1 == 0
      ; 14-bit weights
      punpcklbw m4, m0
      punpcklbw m5, m0
      punpcklbw m6, m0
      punpcklbw m7, m0
      psllw     m4, 7
      psllw     m5, 7
      psllw     m6, 7
      psllw     m7, 7
      pmulhw    m4, m3
      pmulhw    m5, m2
      pmulhw    m6, m3
      pmulhw    m7, m2
      paddw     m4, m5
      paddw     m6, m7
  %elif cpuflag(ssse3)
      ; 5-bit weights, both sources multiplied and summed by one pmaddubsw
      punpcklbw m4, m5
      punpcklbw m6, m7
      pmaddubsw m4, m3
      pmaddubsw m6, m3
  %else
      ; 5-bit weights
      punpcklbw m4, m0
      punpcklbw m5, m0
      punpcklbw m6, m0
      punpcklbw m7, m0
      pmullw    m4, m3
      pmullw    m5, m2
      pmullw    m6, m3
      pmullw    m7, m2
      paddw     m4, m5
      paddw     m6, m7
  %endif

      ; bias and shift down
  %if cpuflag(ssse3)
      pmulhrsw  m4, m1
      pmulhrsw  m6, m1
  %else
      paddw     m4, m1
      paddw     m6, m1
      psrlw     m4, 5
      psrlw     m6, 5
  %endif

      packuswb  m4, m6
  %if %0 == 5
      ; Only called for 8x8 blocks and SSE2
      sub       r6, r5
      movh      [%2 + r6], m4
      add       r6, r5
      movhps    [%2 + r6], m4
  %else
      mova      [%2 + r6], m4
  %endif
  %endmacro
  ; MAIN_LOOP size, five_bit
  ; One iteration of the weighting loop.
  ;   size     - block width, 8 or 16
  ;   five_bit - forwarded unchanged as the first argument of RV40_WCORE
  ; With 8-byte registers, one RV40_WCORE covers 8 bytes and a second one at
  ; +8 is added for 16-wide blocks. With wider registers, an 8-wide block
  ; passes the stride so that RV40_WCORE handles two lines at once, while a
  ; 16-wide block is a single plain call.
  ; Every variant ends by adding r5 to r6; the flags from that add are what
  ; the caller branches on.
  %macro MAIN_LOOP 2
  %if mmsize == 8
      RV40_WCORE %2, r0, r1, r2
  %if %1 == 16
      RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
  %endif
  %elifidn %1, 8
      RV40_WCORE %2, r0, r1, r2, r5
  %else
      RV40_WCORE %2, r0, r1, r2
  %endif
      ; Prepare for next loop
      add       r6, r5
  %endmacro
  ; void ff_rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
  ; %1=rnd or nornd  %2=block size (8 or 16)  %3=left shift applied to stride
  ; The emitted function is named rv40_weight_func_<%1>_<%2>.
  ;
  ; The weights are FP0.14 notation of fractions depending on pts.
  ; For timebases without rounding error (i.e. PAL), the fractions
  ; can be simplified, and several operations can be avoided.
  ; The rnd variants take the 14-bit weight path of RV40_WCORE, the nornd
  ; variants the 5-bit path.
  ; NOTE(review): the original comment says the multiple-of-2^9 check that
  ; chooses between the variants happens "here"; no such check is visible in
  ; this file - presumably it is done by the caller.
  ;
  ; Loop scheme: r6 = -(stride << %3) and the three pointers are advanced by
  ; the same amount, so [ptr + r6] walks the block while r6 counts up to
  ; zero; the loop exits when the add in MAIN_LOOP produces zero.
  ; NOTE(review): stride (r5) is used as a full-width register although the
  ; prototype above says int - confirm against the C declaration.
  %macro RV40_WEIGHT 3
  cglobal rv40_weight_func_%1_%2, 6, 7, 8
      ; Set loop counter and increments
      mov       r6, r5
      shl       r6, %3
      add       r0, r6
      add       r1, r6
      add       r2, r6
      neg       r6

      ; Constants: m0 = zero, m1 = rounding term for the final scaling
      pxor      m0, m0
  %if cpuflag(ssse3)
      mova      m1, [pw_1024]
  %else
      mova      m1, [pw_16]
  %endif

      ; Weights: m2 from w1, m3 from w2, broadcast to every word
      movd      m2, r3d
      movd      m3, r4d
  %ifnidn %1, rnd
  %define RND 1
  %if cpuflag(ssse3)
      punpcklbw m3, m2                       ; pack both weights as a byte pair for pmaddubsw
  %else
      SPLATW    m2, m2
  %endif
  %else
  %define RND 0
      SPLATW    m2, m2
  %endif
      SPLATW    m3, m3

  .loop:
      MAIN_LOOP %2, RND
      jnz       .loop
      REP_RET
  %endmacro
  ; Emits the four weight functions (rnd/nornd x 8/16) for the instruction
  ; set selected by the preceding INIT_* line.
  %macro RV40_WEIGHT_FUNCS 0
      RV40_WEIGHT rnd, 8, 3
      RV40_WEIGHT rnd, 16, 4
      RV40_WEIGHT nornd, 8, 3
      RV40_WEIGHT nornd, 16, 4
  %endmacro

  INIT_MMX mmxext
      RV40_WEIGHT_FUNCS

  INIT_XMM sse2
      RV40_WEIGHT_FUNCS

  INIT_XMM ssse3
      RV40_WEIGHT_FUNCS