You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

507 lines
12KB

  1. ;******************************************************************************
  2. ;* MMX/SSE2-optimized functions for the RV40 decoder
  3. ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
  4. ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
  5. ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  6. ;*
  7. ;* This file is part of Libav.
  8. ;*
  9. ;* Libav is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* Libav is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with Libav; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23. %include "x86inc.asm"
  24. %include "x86util.asm"
  25. SECTION_RODATA
  26. align 16
  ; Eight words of 1 << 10 (= 1024). Loaded by the ssse3 build of RV40_WEIGHT
  ; as the pmulhrsw multiplier of its "bias and shift down" step.
  27. pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024
  ; Byte coefficient table for the pmaddubsw-based (ssse3) filters.
  ; Three entries of 32 bytes each; an entry is two 16-byte rows of repeated
  ; byte pairs. The ssse3 code reads the rows at [base+off] and [base+off+16],
  ; where "off" is the mx/my argument added to the table base by LOAD
  ; (presumably 0, 32 or 64 - confirm against the C caller).
  28. sixtap_filter_hb_m: times 8 db 1, -5
  29. times 8 db 52, 20
  30. ; multiplied by 2 to have the same shift
  31. times 8 db 2, -10
  32. times 8 db 40, 40
  33. ; back to normal
  34. times 8 db 1, -5
  35. times 8 db 20, 52
  ; Word coefficient table for the pmullw-based (mmx/sse2) filters, used by
  ; both FILTER_V and FILTER_H. Three entries of 64 bytes each; an entry is
  ; four 16-byte rows read at +0, +16, +32 and +48 (COEFF05, COEFF14, COEFF2
  ; and COEFF3 in the filter macros).
  36. sixtap_filter_v_m: times 8 dw 1
  37. times 8 dw -5
  38. times 8 dw 52
  39. times 8 dw 20
  40. ; multiplied by 2 to have the same shift
  41. times 8 dw 2
  42. times 8 dw -10
  43. times 8 dw 40
  44. times 8 dw 40
  45. ; back to normal
  46. times 8 dw 1
  47. times 8 dw -5
  48. times 8 dw 20
  49. times 8 dw 52
  ; Under PIC the table names resolve to picregq, which every function loads
  ; with "lea picregq, [<table>_m]" before calling LOAD; otherwise they
  ; resolve directly to the *_m labels. npicregs is the number of extra
  ; general-purpose registers the cglobal declarations request for that.
  ; NOTE(review): no sixtap_filter_hw_m label is defined in the visible source
  ; and sixtap_filter_hw is not referenced by the visible code - looks like a
  ; leftover; confirm before relying on it.
  50. %ifdef PIC
  51. %define sixtap_filter_hw picregq
  52. %define sixtap_filter_hb picregq
  53. %define sixtap_filter_v picregq
  54. %define npicregs 1
  55. %else
  56. %define sixtap_filter_hw sixtap_filter_hw_m
  57. %define sixtap_filter_hb sixtap_filter_hb_m
  58. %define sixtap_filter_v sixtap_filter_v_m
  59. %define npicregs 0
  60. %endif
  ; pshufb masks for the ssse3 horizontal filter, applied to 16 bytes loaded
  ; from src-2. For output pixel i they build the byte pairs
  ;   shuf1: (src[i-2], src[i-1])
  ;   shuf2: (src[i+0], src[i+1])
  ;   shuf3: (src[i+3], src[i+2])   (reversed so the outer tap comes first)
  ; They are loaded with mova; they stay 16-byte aligned because every table
  ; above is a multiple of 16 bytes long.
  61. filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
  62. filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
  63. filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
  ; Constants defined outside this file (presumably words of 32, 16 and 512):
  ; pw_32 is added before the ">> 6" of the mmx/sse2 filters, pw_16 before the
  ; ">> 5" of the non-ssse3 weight functions, pw_512 is the pmulhrsw
  ; multiplier of the ssse3 filters.
  64. cextern pw_32
  65. cextern pw_16
  66. cextern pw_512
  67. SECTION .text
  68. ;-----------------------------------------------------------------------------
  69. ; subpel MC functions:
  70. ;
  71. ; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
  72. ; uint8_t *src, int srcstride,
  73. ; int len, int m);
  74. ;----------------------------------------------------------------------
  75. %macro LOAD 2
  ; LOAD reg, table
  ;   %1 = argument register name (mx or my) holding an offset into the table
  ;   %2 = coefficient table name (sixtap_filter_v or sixtap_filter_hb)
  ; On exit %1q = table base + original value of %1, i.e. a pointer to the
  ; selected coefficient set.
  76. %if WIN64
  ; Sign-extend the 32-bit argument to 64 bits before using it as an address.
  77. movsxd %1q, %1d
  78. %endif
  79. %ifdef PIC
  ; The caller has already loaded the table address into picregq.
  80. add %1q, picregq
  81. %else
  82. add %1q, %2
  83. %endif
  84. %endmacro
  85. %macro STORE 3
  ; STORE data, tmp, put|avg
  ;   %1 = register of 16-bit results; packed in place to unsigned bytes
  ;   %2 = scratch register, only written when %3 is "avg"
  ;   %3 = "put" to store the result, "avg" to average it with the bytes
  ;        already at [dstq] first
  ; Writes one movh-sized group of pixels to [dstq].
  86. %ifidn %3, avg
  ; Fetch the existing destination pixels for averaging.
  87. movh %2, [dstq]
  88. %endif
  ; Words -> bytes with unsigned saturation (this is the clip step).
  89. packuswb %1, %1
  90. %ifidn %3, avg
  91. %if cpuflag(3dnow)
  ; 3DNow! build uses its own byte-average instruction.
  92. pavgusb %1, %2
  93. %else
  94. pavgb %1, %2
  95. %endif
  96. %endif
  97. movh [dstq], %1
  98. %endmacro
  99. %macro FILTER_V 1
  ; FILTER_V put|avg
  ; Emits %1_rv40_qpel_v: vertical 6-tap filter built on 16-bit multiplies.
  ; For every output row it computes, per pixel,
  ;   (C05*(r[-2]+r[+3]) + C14*(r[-1]+r[+2]) + C2*r[0] + C3*r[+1] + pw_32) >> 6
  ; where r[k] is the source row k lines below the current one, then clips
  ; and stores (or averages) through STORE.
  ; Register roles inside the loop:
  ;   m0..m4 = rows -2..+2 as words, m5 = newly read row +3, m6 = accumulator,
  ;   m7 = zero (for byte->word unpacking), myq = pointer to coefficients.
  100. cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
  101. %ifdef PIC
  102. lea picregq, [sixtap_filter_v_m]
  103. %endif
  104. pxor m7, m7
  ; myq = address of the coefficient set selected by the my argument.
  105. LOAD my, sixtap_filter_v
  106. ; read 5 lines
  ; Step back two rows so the first load is row -2.
  107. sub srcq, srcstrideq
  108. sub srcq, srcstrideq
  109. movh m0, [srcq]
  110. movh m1, [srcq+srcstrideq]
  111. movh m2, [srcq+srcstrideq*2]
  ; Advance three rows in total: srcq now points at row +1.
  112. lea srcq, [srcq+srcstrideq*2]
  113. add srcq, srcstrideq
  114. movh m3, [srcq]
  115. movh m4, [srcq+srcstrideq]
  ; Widen the five rows from bytes to words.
  116. punpcklbw m0, m7
  117. punpcklbw m1, m7
  118. punpcklbw m2, m7
  119. punpcklbw m3, m7
  120. punpcklbw m4, m7
  ; When registers m8 and up exist, keep the four coefficient rows in
  ; registers; otherwise use them as memory operands.
  121. %ifdef m8
  122. mova m8, [myq+ 0]
  123. mova m9, [myq+16]
  124. mova m10, [myq+32]
  125. mova m11, [myq+48]
  126. %define COEFF05 m8
  127. %define COEFF14 m9
  128. %define COEFF2 m10
  129. %define COEFF3 m11
  130. %else
  131. %define COEFF05 [myq+ 0]
  132. %define COEFF14 [myq+16]
  133. %define COEFF2 [myq+32]
  134. %define COEFF3 [myq+48]
  135. %endif
  136. .nextrow:
  ; The arithmetic below is interleaved with the row rotation
  ; (m0<-m1<-m2<-m3<-m4<-m5); each copy is taken before the source register
  ; is overwritten by its multiply.
  137. mova m6, m1
  138. movh m5, [srcq+2*srcstrideq] ; read new row
  139. paddw m6, m4
  140. punpcklbw m5, m7
  141. pmullw m6, COEFF14
  142. paddw m0, m5
  143. pmullw m0, COEFF05
  144. paddw m6, m0
  145. mova m0, m1
  ; Rounding term for the shift by 6 below.
  146. paddw m6, [pw_32]
  147. mova m1, m2
  148. pmullw m2, COEFF2
  149. paddw m6, m2
  150. mova m2, m3
  151. pmullw m3, COEFF3
  152. paddw m6, m3
  153. ; round/clip/store
  154. mova m3, m4
  155. psraw m6, 6
  ; m5 has been saved into m4, so STORE may use it as scratch.
  156. mova m4, m5
  157. STORE m6, m5, %1
  158. ; go to next line
  159. add dstq, dststrideq
  160. add srcq, srcstrideq
  161. dec heightd ; next row
  ; Loop while the remaining row count is still positive.
  162. jg .nextrow
  163. REP_RET
  164. %endmacro
  165. %macro FILTER_H 1
  ; FILTER_H put|avg
  ; Emits %1_rv40_qpel_h: horizontal 6-tap filter built on 16-bit multiplies.
  ; For every row it computes, per pixel at position x,
  ;   (C05*(s[x-2]+s[x+3]) + C14*(s[x-1]+s[x+2]) + C2*s[x] + C3*s[x+1] + pw_32) >> 6
  ; then clips and stores (or averages) through STORE.
  ; It uses the word coefficient table sixtap_filter_v on purpose: the byte
  ; table sixtap_filter_hb is only suitable for the pmaddubsw (ssse3) code.
  ; Register roles: m6 = rounding constant, m7 = zero, mxq = coefficients.
  166. cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
  167. %ifdef PIC
  168. lea picregq, [sixtap_filter_v_m]
  169. %endif
  170. pxor m7, m7
  ; mxq = address of the coefficient set selected by the mx argument.
  171. LOAD mx, sixtap_filter_v
  172. mova m6, [pw_32]
  ; When registers m8 and up exist, keep the four coefficient rows in
  ; registers; otherwise use them as memory operands.
  173. %ifdef m8
  174. mova m8, [mxq+ 0]
  175. mova m9, [mxq+16]
  176. mova m10, [mxq+32]
  177. mova m11, [mxq+48]
  178. %define COEFF05 m8
  179. %define COEFF14 m9
  180. %define COEFF2 m10
  181. %define COEFF3 m11
  182. %else
  183. %define COEFF05 [mxq+ 0]
  184. %define COEFF14 [mxq+16]
  185. %define COEFF2 [mxq+32]
  186. %define COEFF3 [mxq+48]
  187. %endif
  188. .nextrow:
  ; Load the six shifted views of the row; symmetric taps are paired
  ; (m0 with m5, m1 with m4) so they can share one multiply.
  189. movq m0, [srcq-2]
  190. movq m5, [srcq+3]
  191. movq m1, [srcq-1]
  192. movq m4, [srcq+2]
  193. punpcklbw m0, m7
  194. punpcklbw m5, m7
  195. punpcklbw m1, m7
  196. punpcklbw m4, m7
  197. movq m2, [srcq-0]
  198. movq m3, [srcq+1]
  199. paddw m0, m5
  200. paddw m1, m4
  201. punpcklbw m2, m7
  202. punpcklbw m3, m7
  203. pmullw m0, COEFF05
  204. pmullw m1, COEFF14
  205. pmullw m2, COEFF2
  206. pmullw m3, COEFF3
  ; Sum the four products plus the rounding term, then shift down by 6.
  207. paddw m0, m6
  208. paddw m1, m2
  209. paddw m0, m3
  210. paddw m0, m1
  211. psraw m0, 6
  ; m1 is dead at this point and serves as STORE's scratch register.
  212. STORE m0, m1, %1
  213. ; go to next line
  214. add dstq, dststrideq
  215. add srcq, srcstrideq
  216. dec heightd ; next row
  ; Loop while the remaining row count is still positive.
  217. jg .nextrow
  218. REP_RET
  219. %endmacro
  ; Instantiate the pmullw-based filters. The MMX-register variants are only
  ; assembled for 32-bit x86.
  220. %if ARCH_X86_32
  ; "put" never averages, so plain MMX is enough.
  221. INIT_MMX mmx
  222. FILTER_V put
  223. FILTER_H put
  ; "avg" needs a byte average: STORE emits pavgb here ...
  224. INIT_MMX mmx2
  225. FILTER_V avg
  226. FILTER_H avg
  ; ... and pavgusb here, selected by cpuflag(3dnow).
  227. INIT_MMX 3dnow
  228. FILTER_V avg
  229. FILTER_H avg
  230. %endif
  ; XMM-register variants, assembled on every architecture.
  231. INIT_XMM sse2
  232. FILTER_H put
  233. FILTER_H avg
  234. FILTER_V put
  235. FILTER_V avg
  236. %macro FILTER_SSSE3 1
  ; FILTER_SSSE3 put|avg
  ; Emits both %1_rv40_qpel_v and %1_rv40_qpel_h using pmaddubsw on
  ; interleaved byte pairs and the byte coefficient table sixtap_filter_hb.
  ; A coefficient set is two 16-byte rows: [ptr] holds the pair applied to
  ; the outer taps, [ptr+16] the pair applied to the two centre taps.
  ; Rounding and the shift by 6 are done by one pmulhrsw with pw_512
  ; (equivalent to (x + 32) >> 6, assuming pw_512 holds words of 512).
  ;
  ; ---- vertical filter ----
  ; Register roles: m0..m4 = rows -2..+2 as bytes, m5 = outer coefficient
  ; pair, m6 = accumulator, m7 = newly read row / scratch.
  237. cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
  238. %ifdef PIC
  239. lea picregq, [sixtap_filter_hb_m]
  240. %endif
  241. ; read 5 lines
  242. sub srcq, srcstrideq
  ; myq = address of the coefficient set selected by the my argument.
  243. LOAD my, sixtap_filter_hb
  244. sub srcq, srcstrideq
  245. movh m0, [srcq]
  246. movh m1, [srcq+srcstrideq]
  247. movh m2, [srcq+srcstrideq*2]
  ; Advance three rows in total: srcq now points at row +1.
  248. lea srcq, [srcq+srcstrideq*2]
  249. add srcq, srcstrideq
  250. mova m5, [myq]
  251. movh m3, [srcq]
  252. movh m4, [srcq+srcstrideq]
  ; Leave srcq on row +3, the next row the loop has to read.
  253. lea srcq, [srcq+2*srcstrideq]
  254. .nextrow:
  255. mova m6, m2
  ; Pair rows (-2,-1) in m0 and rows (0,+1) in m6, then multiply-add each
  ; pair with its coefficient pair.
  256. punpcklbw m0, m1
  257. punpcklbw m6, m3
  258. pmaddubsw m0, m5
  259. pmaddubsw m6, [myq+16]
  260. movh m7, [srcq] ; read new row
  261. paddw m6, m0
  ; Rotate the row window down by one line.
  262. mova m0, m1
  263. mova m1, m2
  264. mova m2, m3
  265. mova m3, m4
  266. mova m4, m7
  ; m3 now holds the former row +2: pair (new row +3, row +2) so that the
  ; same outer coefficient pair in m5 applies.
  267. punpcklbw m7, m3
  268. pmaddubsw m7, m5
  269. paddw m6, m7
  270. pmulhrsw m6, [pw_512]
  271. STORE m6, m7, %1
  272. ; go to next line
  273. add dstq, dststrideq
  274. add srcq, srcstrideq
  275. dec heightd ; next row
  276. jg .nextrow
  277. REP_RET
  ; ---- horizontal filter ----
  ; Register roles: m7/m3/m4 = pshufb masks shuf1/shuf2/shuf3, m5 = outer
  ; coefficient pair, m6 = centre coefficient pair, m0..m2 = working copies
  ; of the source row.
  278. cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
  279. %ifdef PIC
  280. lea picregq, [sixtap_filter_hb_m]
  281. %endif
  282. mova m3, [filter_h6_shuf2]
  283. mova m4, [filter_h6_shuf3]
  ; mxq = address of the coefficient set selected by the mx argument.
  284. LOAD mx, sixtap_filter_hb
  285. mova m5, [mxq] ; set up 6tap filter in bytes
  286. mova m6, [mxq+16]
  287. mova m7, [filter_h6_shuf1]
  288. .nextrow:
  ; One unaligned load covers all six taps; the three shuffles build the
  ; (x-2,x-1), (x,x+1) and (x+3,x+2) byte pairs for every output pixel.
  289. movu m0, [srcq-2]
  290. mova m1, m0
  291. mova m2, m0
  292. pshufb m0, m7
  293. pshufb m1, m3
  294. pshufb m2, m4
  295. pmaddubsw m0, m5
  296. pmaddubsw m1, m6
  297. pmaddubsw m2, m5
  298. paddw m0, m1
  299. paddw m0, m2
  300. pmulhrsw m0, [pw_512]
  301. STORE m0, m1, %1
  302. ; go to next line
  303. add dstq, dststrideq
  304. add srcq, srcstrideq
  305. dec heightd ; next row
  306. jg .nextrow
  307. REP_RET
  308. %endmacro
  ; Instantiate the pmaddubsw-based filters; each expansion emits both the
  ; _v and the _h function for the given prefix.
  309. INIT_XMM ssse3
  310. FILTER_SSSE3 put
  311. FILTER_SSSE3 avg
  312. ; %1=5bits weights?, %2=dst %3=src1 %4=src2 %5=stride (optional)
  ; Weighted blend of one register's worth of pixels from src1 and src2.
  ; With 4 arguments the two halves come from the same row (offsets r6 and
  ; r6 + mmsize/2); with the 5th argument they come from two consecutive
  ; rows, and r6 is left advanced by one stride on exit.
  ; Expects, as set up by RV40_WEIGHT:
  ;   m0 = zero, m1 = rounding/bias constant, m2 = weight for src2,
  ;   m3 = weight for src1 (ssse3 5-bit case: both weights interleaved),
  ;   r5 = stride, r6 = running (negative) byte offset.
  313. %macro RV40_WCORE 4-5
  314. movh m4, [%3 + r6 + 0]
  315. movh m5, [%4 + r6 + 0]
  316. %if %0 == 4
  317. %define OFFSET r6 + mmsize / 2
  318. %else
  319. ; 8x8 block and sse2, stride was provided
  320. %define OFFSET r6
  321. add r6, r5
  322. %endif
  323. movh m6, [%3 + OFFSET]
  324. movh m7, [%4 + OFFSET]
  325. %if %1 == 0
  326. ; 14bits weights
  ; Widen to words, pre-shift by 7 and keep the high half of the product,
  ; i.e. (pixel * weight) >> 9 for each source.
  327. punpcklbw m4, m0
  328. punpcklbw m5, m0
  329. punpcklbw m6, m0
  330. punpcklbw m7, m0
  331. psllw m4, 7
  332. psllw m5, 7
  333. psllw m6, 7
  334. psllw m7, 7
  335. pmulhw m4, m3
  336. pmulhw m5, m2
  337. pmulhw m6, m3
  338. pmulhw m7, m2
  339. paddw m4, m5
  340. paddw m6, m7
  341. %else
  342. ; 5bits weights
  343. %if cpuflag(ssse3)
  ; Interleave src1/src2 bytes and let pmaddubsw do both multiplies and the
  ; add; m3 holds the matching interleaved weight bytes.
  344. punpcklbw m4, m5
  345. punpcklbw m6, m7
  346. pmaddubsw m4, m3
  347. pmaddubsw m6, m3
  348. %else
  349. punpcklbw m4, m0
  350. punpcklbw m5, m0
  351. punpcklbw m6, m0
  352. punpcklbw m7, m0
  353. pmullw m4, m3
  354. pmullw m5, m2
  355. pmullw m6, m3
  356. pmullw m7, m2
  357. paddw m4, m5
  358. paddw m6, m7
  359. %endif
  360. %endif
  361. ; bias and shift down
  362. %if cpuflag(ssse3)
  ; m1 = pw_1024 here: pmulhrsw then performs the rounded shift by 5.
  363. pmulhrsw m4, m1
  364. pmulhrsw m6, m1
  365. %else
  366. paddw m4, m1
  367. paddw m6, m1
  368. psrlw m4, 5
  369. psrlw m6, 5
  370. %endif
  ; Low half of m4 = first group of pixels, high half = second group.
  371. packuswb m4, m6
  372. %if %0 == 5
  373. ; Only called for 8-wide blocks with XMM registers (see MAIN_LOOP)
  ; Step back to the first row for the low half, then forward again so the
  ; high half lands on the second row and r6 keeps its advanced value.
  374. sub r6, r5
  375. movh [%2 + r6], m4
  376. add r6, r5
  377. movhps [%2 + r6], m4
  378. %else
  379. mova [%2 + r6], m4
  380. %endif
  381. %endmacro
  382. %macro MAIN_LOOP 2
  ; MAIN_LOOP size, rnd_flag
  ;   %1 = block width in pixels (8 or 16)
  ;   %2 = value forwarded as RV40_WCORE's first argument
  ; Emits one loop iteration's worth of RV40_WCORE calls on r0 (dst),
  ; r1 (src1) and r2 (src2), then advances r6 by the stride in r5.
  ; The flags of that last "add r6, r5" are what the caller's jnz tests.
  383. %if mmsize == 8
  ; MMX: each RV40_WCORE covers 8 pixels of one row; 16-wide blocks need a
  ; second call on the right half.
  384. RV40_WCORE %2, r0, r1, r2
  385. %if %1 == 16
  386. RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
  387. %endif
  388. ; Prepare for next loop
  389. add r6, r5
  390. %else
  391. %ifidn %1, 8
  ; XMM, 8-wide: passing the stride makes RV40_WCORE process two rows and
  ; advance r6 by one stride itself; the add below supplies the second.
  392. RV40_WCORE %2, r0, r1, r2, r5
  393. ; Prepare 2 next lines
  394. add r6, r5
  395. %else
  ; XMM, 16-wide: one call covers a whole row.
  396. RV40_WCORE %2, r0, r1, r2
  397. ; Prepare single next line
  398. add r6, r5
  399. %endif
  400. %endif
  401. %endmacro
  402. ; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
  403. ; %1=rnd|nornd %2=block size (8|16) %3=log2(%2), used to turn stride into the block's byte span
  404. ; The weights are FP0.14 notation of fractions depending on pts.
  405. ; For timebases without rounding error (i.e. PAL), the fractions
  406. ; can be simplified, and several operations can be avoided.
  407. ; Therefore, whether they are multiples of 2^9 decides which variant
  408. ; applies: "rnd" takes the 14-bit weight path, "nornd" the 5-bit one.
  ; NOTE(review): no such check is performed in this macro; presumably the
  ; caller selects between the rnd and nornd functions - confirm.
  ; NOTE(review): stride is documented as int but r5 is used at full register
  ; width below; confirm the caller guarantees its upper bits on 64-bit builds.
  409. %macro RV40_WEIGHT 3
  410. cglobal rv40_weight_func_%1_%2, 6, 7, 8
  ; m1 = constant for the final "bias and shift down" step of RV40_WCORE.
  411. %if cpuflag(ssse3)
  412. mova m1, [pw_1024]
  413. %else
  414. mova m1, [pw_16]
  415. %endif
  416. pxor m0, m0
  417. ; Set loop counter and increments
  ; r6 = stride << %3; the three pointers are moved past the end of the
  ; block and r6 is negated, so [ptr + r6] walks the block while r6 counts
  ; up towards zero.
  418. mov r6, r5
  419. shl r6, %3
  420. add r0, r6
  421. add r1, r6
  422. add r2, r6
  423. neg r6
  ; m2 = w1, m3 = w2 (RV40_WCORE applies m3 to src1 and m2 to src2).
  424. movd m2, r3d
  425. movd m3, r4d
  426. %ifidn %1,rnd
  427. %define RND 0
  428. SPLATW m2, m2
  429. %else
  430. %define RND 1
  431. %if cpuflag(ssse3)
  ; Interleave the two weight bytes so a single pmaddubsw can apply both.
  432. punpcklbw m3, m2
  433. %else
  434. SPLATW m2, m2
  435. %endif
  436. %endif
  437. SPLATW m3, m3
  438. .loop:
  439. MAIN_LOOP %2, RND
  ; MAIN_LOOP ends with "add r6, r5"; keep looping until r6 reaches zero.
  440. jnz .loop
  441. REP_RET
  442. %endmacro
  ; Instantiate rv40_weight_func_{rnd,nornd}_{8,16} for each instruction set.
  ; The third argument is log2 of the block size (3 for 8, 4 for 16).
  443. INIT_MMX mmx2
  444. RV40_WEIGHT rnd, 8, 3
  445. RV40_WEIGHT rnd, 16, 4
  446. RV40_WEIGHT nornd, 8, 3
  447. RV40_WEIGHT nornd, 16, 4
  448. INIT_XMM sse2
  449. RV40_WEIGHT rnd, 8, 3
  450. RV40_WEIGHT rnd, 16, 4
  451. RV40_WEIGHT nornd, 8, 3
  452. RV40_WEIGHT nornd, 16, 4
  453. INIT_XMM ssse3
  454. RV40_WEIGHT rnd, 8, 3
  455. RV40_WEIGHT rnd, 16, 4
  456. RV40_WEIGHT nornd, 8, 3
  457. RV40_WEIGHT nornd, 16, 4