You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

506 lines
12KB

  1. ;******************************************************************************
  2. ;* MMX/SSE2-optimized functions for the RV40 decoder
  3. ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
  4. ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
  5. ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  6. ;*
  7. ;* This file is part of FFmpeg.
  8. ;*
  9. ;* FFmpeg is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* FFmpeg is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with FFmpeg; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23. %include "libavutil/x86/x86util.asm"
  ; Read-only constants used by the RV40 qpel filters and weight functions below.
  24. SECTION_RODATA
  25. align 16
  ; Eight words of 1024. Loaded as the pmulhrsw multiplier in the SSSE3 build of
  ; RV40_WEIGHT; pmulhrsw by 1024 computes (x + 16) >> 5.
  26. pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024
  ; Byte-pair coefficient table for the pmaddubsw-based (SSSE3) filters.
  ; Each coefficient set is two 16-byte rows, read by FILTER_SSSE3 as [reg] and
  ; [reg+16]; three sets are stored back to back. The set is chosen by the byte
  ; offset that LOAD adds to the table base.
  27. sixtap_filter_hb_m: times 8 db 1, -5
  28. times 8 db 52, 20
  29. ; multiplied by 2 to have the same shift
  30. times 8 db 2, -10
  31. times 8 db 40, 40
  32. ; back to normal
  33. times 8 db 1, -5
  34. times 8 db 20, 52
  ; Word coefficient table for the pmullw-based filters (FILTER_V / FILTER_H).
  ; Each coefficient set is four 16-byte rows, read at offsets +0, +16, +32 and
  ; +48 as COEFF05, COEFF14, COEFF2 and COEFF3; three sets are stored back to back.
  35. sixtap_filter_v_m: times 8 dw 1
  36. times 8 dw -5
  37. times 8 dw 52
  38. times 8 dw 20
  39. ; multiplied by 2 to have the same shift
  40. times 8 dw 2
  41. times 8 dw -10
  42. times 8 dw 40
  43. times 8 dw 40
  44. ; back to normal
  45. times 8 dw 1
  46. times 8 dw -5
  47. times 8 dw 20
  48. times 8 dw 52
  ; Table addressing: in PIC builds the table names resolve to picregq, which
  ; each function loads with lea before calling LOAD; otherwise they resolve to
  ; the table symbols themselves. npicregs is the number of extra general
  ; purpose registers the functions request for that purpose.
  ; NOTE(review): sixtap_filter_hw is not used, and sixtap_filter_hw_m is not
  ; defined, anywhere in the visible source - confirm whether it can be dropped.
  49. %ifdef PIC
  50. %define sixtap_filter_hw picregq
  51. %define sixtap_filter_hb picregq
  52. %define sixtap_filter_v picregq
  53. %define npicregs 1
  54. %else
  55. %define sixtap_filter_hw sixtap_filter_hw_m
  56. %define sixtap_filter_hb sixtap_filter_hb_m
  57. %define sixtap_filter_v sixtap_filter_v_m
  58. %define npicregs 0
  59. %endif
  ; pshufb masks for the SSSE3 horizontal filter, applied to 16 bytes loaded
  ; from src-2. For output pixel x they build the byte pairs
  ;   shuf1: (src[x-2], src[x-1])
  ;   shuf2: (src[x],   src[x+1])
  ;   shuf3: (src[x+3], src[x+2])   ; reversed order, so it reuses the outer-tap row
  60. filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
  61. filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
  62. filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
  ; Word constants declared external to this file (used below as rounding terms
  ; and as a pmulhrsw multiplier).
  63. cextern pw_32
  64. cextern pw_16
  65. cextern pw_512
  66. SECTION .text
  67. ;-----------------------------------------------------------------------------
  68. ; subpel MC functions:
  69. ;
  70. ; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
  71. ; uint8_t *src, int srcstride,
  72. ; int len, int m);
  73. ;----------------------------------------------------------------------
  ; LOAD reg, table
  ; Converts the filter-selection argument held in register <reg> into a pointer
  ; into a coefficient table by adding the table base to it. The base is picregq
  ; in PIC builds (the caller must have loaded it), otherwise the symbol passed
  ; as the second argument.
  74. %macro LOAD 2
  75. %if WIN64
  ; On WIN64 only: sign-extend the 32-bit argument before using it as an address.
  76. movsxd %1q, %1d
  77. %endif
  78. %ifdef PIC
  79. add %1q, picregq
  80. %else
  81. add %1q, %2
  82. %endif
  83. %endmacro
  ; STORE res, tmp, op
  ; Packs the words in <res> to bytes with unsigned saturation and writes them
  ; to [dstq] with movh. When <op> is "avg", the bytes currently at [dstq] are
  ; first loaded into <tmp> and averaged with the packed result (pavgusb when
  ; assembling for 3dnow, pavgb otherwise). <tmp> is clobbered only for "avg".
  84. %macro STORE 3
  85. %ifidn %3, avg
  86. movh %2, [dstq]
  87. %endif
  88. packuswb %1, %1
  89. %ifidn %3, avg
  90. %if cpuflag(3dnow)
  91. pavgusb %1, %2
  92. %else
  93. pavgb %1, %2
  94. %endif
  95. %endif
  96. movh [dstq], %1
  97. %endmacro
  ; FILTER_V op   (op = put or avg)
  ; Emits <op>_rv40_qpel_v, a vertical 6-tap filter built on 16-bit multiplies.
  ; Arguments: dst, dststride, src, srcstride, height, my. my selects the
  ; coefficient set inside sixtap_filter_v_m (see LOAD). Each loop iteration
  ; produces one output row and passes it to STORE.
  ; Per output pixel, with r0..r5 the six source rows from two above to three
  ; below the output row:
  ;   ((r0 + r5) * COEFF05 + (r1 + r4) * COEFF14 + r2 * COEFF2 + r3 * COEFF3 + 32) >> 6
  98. %macro FILTER_V 1
  99. cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
  100. %ifdef PIC
  101. lea picregq, [sixtap_filter_v_m]
  102. %endif
  103. pxor m7, m7 ; m7 = 0, used to widen bytes to words
  104. LOAD my, sixtap_filter_v
  105. ; read 5 lines
  ; Rewind src by two rows, then load rows r0..r4 into m0..m4.
  106. sub srcq, srcstrideq
  107. sub srcq, srcstrideq
  108. movh m0, [srcq]
  109. movh m1, [srcq+srcstrideq]
  110. movh m2, [srcq+srcstrideq*2]
  ; Advance src by three rows in total (lea adds two, add adds one).
  111. lea srcq, [srcq+srcstrideq*2]
  112. add srcq, srcstrideq
  113. movh m3, [srcq]
  114. movh m4, [srcq+srcstrideq]
  115. punpcklbw m0, m7
  116. punpcklbw m1, m7
  117. punpcklbw m2, m7
  118. punpcklbw m3, m7
  119. punpcklbw m4, m7
  ; When registers m8 and up exist, keep the four coefficient rows in
  ; registers; otherwise use them as memory operands inside the loop.
  120. %ifdef m8
  121. mova m8, [myq+ 0]
  122. mova m9, [myq+16]
  123. mova m10, [myq+32]
  124. mova m11, [myq+48]
  125. %define COEFF05 m8
  126. %define COEFF14 m9
  127. %define COEFF2 m10
  128. %define COEFF3 m11
  129. %else
  130. %define COEFF05 [myq+ 0]
  131. %define COEFF14 [myq+16]
  132. %define COEFF2 [myq+32]
  133. %define COEFF3 [myq+48]
  134. %endif
  ; Loop invariant on entry: m0..m4 hold rows r0..r4 as words and srcq points
  ; at r3, so [srcq+2*srcstrideq] is r5. The sum is accumulated in m6 while
  ; the row window is slid down by one (m0<-r1, m1<-r2, m2<-r3, m3<-r4, m4<-r5).
  135. .nextrow:
  136. mova m6, m1
  137. movh m5, [srcq+2*srcstrideq] ; read new row
  138. paddw m6, m4
  139. punpcklbw m5, m7
  140. pmullw m6, COEFF14
  141. paddw m0, m5
  142. pmullw m0, COEFF05
  143. paddw m6, m0
  144. mova m0, m1
  145. paddw m6, [pw_32]
  146. mova m1, m2
  147. pmullw m2, COEFF2
  148. paddw m6, m2
  149. mova m2, m3
  150. pmullw m3, COEFF3
  151. paddw m6, m3
  152. ; round/clip/store
  153. mova m3, m4
  154. psraw m6, 6
  155. mova m4, m5
  ; m5 has been copied to m4, so STORE may use it as scratch for the avg load.
  156. STORE m6, m5, %1
  157. ; go to next line
  158. add dstq, dststrideq
  159. add srcq, srcstrideq
  160. dec heightd ; next row
  161. jg .nextrow
  162. REP_RET
  163. %endmacro
  ; FILTER_H op   (op = put or avg)
  ; Emits <op>_rv40_qpel_h, a horizontal 6-tap filter built on 16-bit multiplies.
  ; Arguments: dst, dststride, src, srcstride, height, mx. mx selects the
  ; coefficient set inside sixtap_filter_v_m (see LOAD); the word table is shared
  ; with FILTER_V. Each loop iteration produces one output row.
  ; Per output pixel x:
  ;   ((s[x-2] + s[x+3]) * COEFF05 + (s[x-1] + s[x+2]) * COEFF14
  ;    + s[x] * COEFF2 + s[x+1] * COEFF3 + 32) >> 6
  164. %macro FILTER_H 1
  165. cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
  166. %ifdef PIC
  167. lea picregq, [sixtap_filter_v_m]
  168. %endif
  169. pxor m7, m7 ; m7 = 0, used to widen bytes to words
  170. LOAD mx, sixtap_filter_v
  171. mova m6, [pw_32] ; rounding term, kept in a register for the whole loop
  ; When registers m8 and up exist, keep the four coefficient rows in
  ; registers; otherwise use them as memory operands inside the loop.
  172. %ifdef m8
  173. mova m8, [mxq+ 0]
  174. mova m9, [mxq+16]
  175. mova m10, [mxq+32]
  176. mova m11, [mxq+48]
  177. %define COEFF05 m8
  178. %define COEFF14 m9
  179. %define COEFF2 m10
  180. %define COEFF3 m11
  181. %else
  182. %define COEFF05 [mxq+ 0]
  183. %define COEFF14 [mxq+16]
  184. %define COEFF2 [mxq+32]
  185. %define COEFF3 [mxq+48]
  186. %endif
  187. .nextrow:
  ; Six loads of 8 bytes at offsets -2..+3 provide the six taps for each pixel.
  188. movq m0, [srcq-2]
  189. movq m5, [srcq+3]
  190. movq m1, [srcq-1]
  191. movq m4, [srcq+2]
  192. punpcklbw m0, m7
  193. punpcklbw m5, m7
  194. punpcklbw m1, m7
  195. punpcklbw m4, m7
  196. movq m2, [srcq-0]
  197. movq m3, [srcq+1]
  ; Taps that share a coefficient are summed before multiplying.
  198. paddw m0, m5
  199. paddw m1, m4
  200. punpcklbw m2, m7
  201. punpcklbw m3, m7
  202. pmullw m0, COEFF05
  203. pmullw m1, COEFF14
  204. pmullw m2, COEFF2
  205. pmullw m3, COEFF3
  206. paddw m0, m6
  207. paddw m1, m2
  208. paddw m0, m3
  209. paddw m0, m1
  210. psraw m0, 6
  ; m1 is dead at this point and serves as STORE's scratch register.
  211. STORE m0, m1, %1
  212. ; go to next line
  213. add dstq, dststrideq
  214. add srcq, srcstrideq
  215. dec heightd ; next row
  216. jg .nextrow
  217. REP_RET
  218. %endmacro
  ; Instantiate the pmullw-based filters.
  ; 32-bit x86 builds only: MMX "put", plus "avg" in an mmxext flavour and a
  ; 3dnow flavour (STORE picks pavgb or pavgusb from the active cpuflag).
  219. %if ARCH_X86_32
  220. INIT_MMX mmx
  221. FILTER_V put
  222. FILTER_H put
  223. INIT_MMX mmxext
  224. FILTER_V avg
  225. FILTER_H avg
  226. INIT_MMX 3dnow
  227. FILTER_V avg
  228. FILTER_H avg
  229. %endif
  ; SSE2 versions are assembled for every architecture.
  230. INIT_XMM sse2
  231. FILTER_H put
  232. FILTER_H avg
  233. FILTER_V put
  234. FILTER_V avg
  ; FILTER_SSSE3 op   (op = put or avg)
  ; Emits both <op>_rv40_qpel_v and <op>_rv40_qpel_h using pmaddubsw on
  ; interleaved byte pairs and the byte table sixtap_filter_hb_m. my / mx select
  ; the coefficient set (see LOAD): the row at +0 weights the two outer tap
  ; pairs, the row at +16 weights the middle pair. Rounding and the final shift
  ; are done by pmulhrsw with pw_512, which computes (x + 32) >> 6.
  235. %macro FILTER_SSSE3 1
  236. cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
  237. %ifdef PIC
  238. lea picregq, [sixtap_filter_hb_m]
  239. %endif
  240. ; read 5 lines
  ; Rewind src by two rows and load rows r0..r4 (kept as bytes) into m0..m4.
  241. sub srcq, srcstrideq
  242. LOAD my, sixtap_filter_hb
  243. sub srcq, srcstrideq
  244. movh m0, [srcq]
  245. movh m1, [srcq+srcstrideq]
  246. movh m2, [srcq+srcstrideq*2]
  247. lea srcq, [srcq+srcstrideq*2]
  248. add srcq, srcstrideq
  249. mova m5, [myq] ; outer-pair coefficients, kept in a register for the loop
  250. movh m3, [srcq]
  251. movh m4, [srcq+srcstrideq]
  ; Leave srcq on the sixth row (r5), which the loop loads as the new row.
  252. lea srcq, [srcq+2*srcstrideq]
  253. .nextrow:
  ; m6 accumulates (r0,r1)*[myq] + (r2,r3)*[myq+16] + (r5,r4)*[myq].
  ; m0 is overwritten by the interleave and then reloaded from m1 as the
  ; row window slides down by one.
  254. mova m6, m2
  255. punpcklbw m0, m1
  256. punpcklbw m6, m3
  257. pmaddubsw m0, m5
  258. pmaddubsw m6, [myq+16]
  259. movh m7, [srcq] ; read new row
  260. paddw m6, m0
  261. mova m0, m1
  262. mova m1, m2
  263. mova m2, m3
  264. mova m3, m4
  265. mova m4, m7
  ; m3 now holds the old r4, so this interleave is (r5, r4): the reversed
  ; order lets the last pair reuse the same coefficient row as the first.
  266. punpcklbw m7, m3
  267. pmaddubsw m7, m5
  268. paddw m6, m7
  269. pmulhrsw m6, [pw_512]
  270. STORE m6, m7, %1
  271. ; go to next line
  272. add dstq, dststrideq
  273. add srcq, srcstrideq
  274. dec heightd ; next row
  275. jg .nextrow
  276. REP_RET
  ; Horizontal variant: one unaligned 16-byte load per row, then three pshufb
  ; masks build the tap pairs (see filter_h6_shuf1..3).
  277. cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
  278. %ifdef PIC
  279. lea picregq, [sixtap_filter_hb_m]
  280. %endif
  281. mova m3, [filter_h6_shuf2]
  282. mova m4, [filter_h6_shuf3]
  283. LOAD mx, sixtap_filter_hb
  284. mova m5, [mxq] ; set up 6tap filter in bytes
  285. mova m6, [mxq+16]
  286. mova m7, [filter_h6_shuf1]
  287. .nextrow:
  288. movu m0, [srcq-2]
  289. mova m1, m0
  290. mova m2, m0
  291. pshufb m0, m7
  292. pshufb m1, m3
  293. pshufb m2, m4
  ; Outer pairs (m0, m2) use the row at +0; the middle pair (m1) the row at +16.
  294. pmaddubsw m0, m5
  295. pmaddubsw m1, m6
  296. pmaddubsw m2, m5
  297. paddw m0, m1
  298. paddw m0, m2
  299. pmulhrsw m0, [pw_512]
  300. STORE m0, m1, %1
  301. ; go to next line
  302. add dstq, dststrideq
  303. add srcq, srcstrideq
  304. dec heightd ; next row
  305. jg .nextrow
  306. REP_RET
  307. %endmacro
  ; Instantiate the pmaddubsw-based vertical and horizontal filters for put and avg.
  308. INIT_XMM ssse3
  309. FILTER_SSSE3 put
  310. FILTER_SSSE3 avg
  311. ; %1=5bits weights?, %2=dst %3=src1 %4=src2 %5=stride if sse2
  ; RV40_WCORE weights5bit, dst, srcA, srcB [, stride]
  ; Blends two movh-wide groups of pixels from srcA and srcB and stores one
  ; full register of bytes to dst, all addressed relative to r6.
  ; With 4 arguments the second group is the next mmsize/2 bytes of the same
  ; line; with 5 arguments it is the same columns of the next line, and r6 is
  ; left advanced by one stride on exit.
  ; Register contract on entry (set up by RV40_WEIGHT): m0 = 0, m1 = rounding
  ; constant, m3 = weight applied to srcA, m2 = weight applied to srcB (on the
  ; SSSE3 5-bit path both weights are packed as byte pairs in m3).
  ; Clobbers m4-m7.
  312. %macro RV40_WCORE 4-5
  313. movh m4, [%3 + r6 + 0]
  314. movh m5, [%4 + r6 + 0]
  315. %if %0 == 4
  316. %define OFFSET r6 + mmsize / 2
  317. %else
  318. ; 8x8 block and sse2, stride was provided
  319. %define OFFSET r6
  320. add r6, r5
  321. %endif
  322. movh m6, [%3 + OFFSET]
  323. movh m7, [%4 + OFFSET]
  324. %if %1 == 0
  325. ; 14bits weights
  ; Widen to words, pre-shift the pixels left by 7 and keep the high word of
  ; the product, i.e. (pixel * weight) >> 9 for each source.
  326. punpcklbw m4, m0
  327. punpcklbw m5, m0
  328. punpcklbw m6, m0
  329. punpcklbw m7, m0
  330. psllw m4, 7
  331. psllw m5, 7
  332. psllw m6, 7
  333. psllw m7, 7
  334. pmulhw m4, m3
  335. pmulhw m5, m2
  336. pmulhw m6, m3
  337. pmulhw m7, m2
  338. paddw m4, m5
  339. paddw m6, m7
  340. %else
  341. ; 5bits weights
  342. %if cpuflag(ssse3)
  ; Interleave srcA/srcB bytes and multiply-add against the packed weight
  ; byte pairs in m3, giving the weighted sum in a single instruction.
  343. punpcklbw m4, m5
  344. punpcklbw m6, m7
  345. pmaddubsw m4, m3
  346. pmaddubsw m6, m3
  347. %else
  348. punpcklbw m4, m0
  349. punpcklbw m5, m0
  350. punpcklbw m6, m0
  351. punpcklbw m7, m0
  352. pmullw m4, m3
  353. pmullw m5, m2
  354. pmullw m6, m3
  355. pmullw m7, m2
  356. paddw m4, m5
  357. paddw m6, m7
  358. %endif
  359. %endif
  360. ; bias and shift down
  ; With m1 = pw_1024, pmulhrsw computes (x + 16) >> 5; the non-SSSE3 path
  ; adds m1 (pw_16) and shifts right by 5 explicitly.
  361. %if cpuflag(ssse3)
  362. pmulhrsw m4, m1
  363. pmulhrsw m6, m1
  364. %else
  365. paddw m4, m1
  366. paddw m6, m1
  367. psrlw m4, 5
  368. psrlw m6, 5
  369. %endif
  ; Low half of m4 = first group, high half = second group.
  370. packuswb m4, m6
  371. %if %0 == 5
  372. ; Only called for 8x8 blocks and sse2
  ; Step back to the first line for the low half, then forward again for the
  ; high half; r6 ends up on the second line.
  373. sub r6, r5
  374. movh [%2 + r6], m4
  375. add r6, r5
  376. movhps [%2 + r6], m4
  377. %else
  ; Aligned full-register store: dst + r6 must be mmsize-aligned.
  378. mova [%2 + r6], m4
  379. %endif
  380. %endmacro
  ; MAIN_LOOP size, weights5bit
  ; Body of one loop iteration of the weight functions: invokes RV40_WCORE as
  ; many times as the register width and block size require, then advances the
  ; line offset r6 by the stride r5.
  ;   mmsize == 8          : one call per 8 pixels (two calls for size 16), one line
  ;   mmsize != 8, size 8  : one call with the stride argument, two lines
  ;   mmsize != 8, size 16 : one call, one line
  ; The final "add r6, r5" is the last flag-setting instruction in every path,
  ; so the caller's jnz tests whether r6 has reached zero.
  381. %macro MAIN_LOOP 2
  382. %if mmsize == 8
  383. RV40_WCORE %2, r0, r1, r2
  384. %if %1 == 16
  385. RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
  386. %endif
  387. ; Prepare for next loop
  388. add r6, r5
  389. %else
  390. %ifidn %1, 8
  391. RV40_WCORE %2, r0, r1, r2, r5
  392. ; Prepare 2 next lines
  393. add r6, r5
  394. %else
  395. RV40_WCORE %2, r0, r1, r2
  396. ; Prepare single next line
  397. add r6, r5
  398. %endif
  399. %endif
  400. %endmacro
  401. ; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
  402. ; %1=rnd|nornd %2=block size (8 or 16) %3=shift turning the stride into stride * block size
  403. ; The weights are FP0.14 notation of fractions depending on pts.
  404. ; For timebases without rounding error (i.e. PAL), the fractions
  405. ; can be simplified, and several operations can be avoided.
  406. ; Therefore, we check here whether they are multiples of 2^9 for
  407. ; those simplifications to occur.
  ; RV40_WEIGHT kind, size, shift
  ; Emits rv40_weight_func_<kind>_<size>. Arguments arrive as r0 = dst,
  ; r1 = src1, r2 = src2, r3 = w1, r4 = w2, r5 = stride. src1 is weighted by
  ; w2 and src2 by w1 (see the m2/m3 usage in RV40_WCORE).
  ;   kind  = rnd   -> RND 0, RV40_WCORE takes its "14bits weights" path
  ;   kind  = nornd -> RND 1, RV40_WCORE takes its "5bits weights" path
  ;   size  = block size handed to MAIN_LOOP
  ;   shift = left shift applied to the stride to get the total byte span of
  ;           the block (3 for size 8, 4 for size 16 at the call sites)
  408. %macro RV40_WEIGHT 3
  409. cglobal rv40_weight_func_%1_%2, 6, 7, 8
  ; m1 = rounding constant: the pmulhrsw multiplier on SSSE3, otherwise the
  ; bias that is added before the shift right by 5.
  410. %if cpuflag(ssse3)
  411. mova m1, [pw_1024]
  412. %else
  413. mova m1, [pw_16]
  414. %endif
  415. pxor m0, m0 ; m0 = 0, used to widen bytes to words
  416. ; Set loop counter and increments
  ; Point r0/r1/r2 at the end of the block and let r6 run from
  ; -(stride << shift) up to zero, so the loop needs no separate counter.
  417. mov r6, r5
  418. shl r6, %3
  419. add r0, r6
  420. add r1, r6
  421. add r2, r6
  422. neg r6
  423. movd m2, r3d
  424. movd m3, r4d
  425. %ifidn %1,rnd
  426. %define RND 0
  427. SPLATW m2, m2
  428. %else
  429. %define RND 1
  430. %if cpuflag(ssse3)
  ; Pack the low bytes of w2 and w1 into one word of m3 so that a single
  ; pmaddubsw can weight interleaved src1/src2 bytes.
  431. punpcklbw m3, m2
  432. %else
  433. SPLATW m2, m2
  434. %endif
  435. %endif
  436. SPLATW m3, m3
  437. .loop:
  438. MAIN_LOOP %2, RND
  ; Flags come from the last "add r6, r5" inside MAIN_LOOP.
  439. jnz .loop
  440. REP_RET
  441. %endmacro
  ; Instantiate the weight functions for each instruction set: rnd and nornd
  ; variants, block sizes 8 and 16 (third argument = shift for that size).
  442. INIT_MMX mmxext
  443. RV40_WEIGHT rnd, 8, 3
  444. RV40_WEIGHT rnd, 16, 4
  445. RV40_WEIGHT nornd, 8, 3
  446. RV40_WEIGHT nornd, 16, 4
  447. INIT_XMM sse2
  448. RV40_WEIGHT rnd, 8, 3
  449. RV40_WEIGHT rnd, 16, 4
  450. RV40_WEIGHT nornd, 8, 3
  451. RV40_WEIGHT nornd, 16, 4
  452. INIT_XMM ssse3
  453. RV40_WEIGHT rnd, 8, 3
  454. RV40_WEIGHT rnd, 16, 4
  455. RV40_WEIGHT nornd, 8, 3
  456. RV40_WEIGHT nornd, 16, 4