You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

570 lines
22KB

  1. ;******************************************************************************
  2. ;* Copyright (c) 2012 Michael Niedermayer
  3. ;* Copyright (c) 2014 James Almer <jamrial <at> gmail.com>
  4. ;* Copyright (c) 2014 Ronald S. Bultje <rsbultje@gmail.com>
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22. %include "libavutil/x86/x86util.asm"
  23. %if ARCH_X86_64
  24. %define pointer resq
  25. %else
  26. %define pointer resd
  27. %endif
  28. struc ResampleContext
  29. .av_class: pointer 1
  30. .filter_bank: pointer 1
  31. .filter_length: resd 1
  32. .filter_alloc: resd 1
  33. .ideal_dst_incr: resd 1
  34. .dst_incr: resd 1
  35. .dst_incr_div: resd 1
  36. .dst_incr_mod: resd 1
  37. .index: resd 1
  38. .frac: resd 1
  39. .src_incr: resd 1
  40. .compensation_distance: resd 1
  41. .phase_shift: resd 1
  42. .phase_mask: resd 1
  43. ; there's a few more here but we only care about the first few
  44. endstruc
  45. SECTION_RODATA
  46. pf_1: dd 1.0
  47. pd_0x4000: dd 0x4000
  48. SECTION .text
  49. %macro RESAMPLE_FNS 3 ; format [float or int16], bps, log2_bps
  50. ; int resample_common_$format(ResampleContext *ctx, $format *dst,
  51. ; const $format *src, int size, int update_ctx)
  52. %if ARCH_X86_64 ; unix64 and win64
  53. cglobal resample_common_%1, 0, 15, 2, ctx, dst, src, phase_shift, index, frac, \
  54. dst_incr_mod, size, min_filter_count_x4, \
  55. min_filter_len_x4, dst_incr_div, src_incr, \
  56. phase_mask, dst_end, filter_bank
  57. ; use red-zone for variable storage
  58. %define ctx_stackq [rsp-0x8]
  59. %define src_stackq [rsp-0x10]
  60. %if WIN64
  61. %define update_context_stackd r4m
  62. %else ; unix64
  63. %define update_context_stackd [rsp-0x14]
  64. %endif
  65. ; load as many variables in registers as possible; for the rest, store
  66. ; on stack so that we have 'ctx' available as one extra register
  67. mov sized, r3d
  68. mov phase_maskd, [ctxq+ResampleContext.phase_mask]
  69. %if UNIX64
  70. mov update_context_stackd, r4d
  71. %endif
  72. mov indexd, [ctxq+ResampleContext.index]
  73. mov fracd, [ctxq+ResampleContext.frac]
  74. mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
  75. mov filter_bankq, [ctxq+ResampleContext.filter_bank]
  76. mov src_incrd, [ctxq+ResampleContext.src_incr]
  77. mov ctx_stackq, ctxq
  78. mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
  79. mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
  80. shl min_filter_len_x4d, %3
  81. lea dst_endq, [dstq+sizeq*%2]
  82. %if UNIX64
  83. mov ecx, [ctxq+ResampleContext.phase_shift]
  84. mov edi, [ctxq+ResampleContext.filter_alloc]
  85. DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
  86. filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
  87. src_incr, phase_mask, dst_end, filter_bank
  88. %elif WIN64
  89. mov R9d, [ctxq+ResampleContext.filter_alloc]
  90. mov ecx, [ctxq+ResampleContext.phase_shift]
  91. DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
  92. filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
  93. src_incr, phase_mask, dst_end, filter_bank
  94. %endif
  95. neg min_filter_len_x4q
  96. sub filter_bankq, min_filter_len_x4q
  97. sub srcq, min_filter_len_x4q
  98. mov src_stackq, srcq
  99. %else ; x86-32
  100. cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
  101. index, min_filter_length_x4, filter_bank
  102. ; push temp variables to stack
  103. %define ctx_stackq r0mp
  104. %define src_stackq r2mp
  105. %define update_context_stackd r4m
  106. mov dstq, r1mp
  107. mov r3, r3mp
  108. lea r3, [dstq+r3*%2]
  109. PUSH dword [ctxq+ResampleContext.dst_incr_div]
  110. PUSH dword [ctxq+ResampleContext.dst_incr_mod]
  111. PUSH dword [ctxq+ResampleContext.filter_alloc]
  112. PUSH r3
  113. PUSH dword [ctxq+ResampleContext.phase_mask]
  114. PUSH dword [ctxq+ResampleContext.src_incr]
  115. mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
  116. mov indexd, [ctxq+ResampleContext.index]
  117. shl min_filter_length_x4d, %3
  118. mov fracd, [ctxq+ResampleContext.frac]
  119. neg min_filter_length_x4q
  120. mov filter_bankq, [ctxq+ResampleContext.filter_bank]
  121. sub r2mp, min_filter_length_x4q
  122. sub filter_bankq, min_filter_length_x4q
  123. PUSH min_filter_length_x4q
  124. PUSH filter_bankq
  125. mov phase_shiftd, [ctxq+ResampleContext.phase_shift]
  126. DEFINE_ARGS src, phase_shift, dst, frac, index, min_filter_count_x4, filter
  127. %define filter_bankq dword [rsp+0x0]
  128. %define min_filter_length_x4q dword [rsp+0x4]
  129. %define src_incrd dword [rsp+0x8]
  130. %define phase_maskd dword [rsp+0xc]
  131. %define dst_endq dword [rsp+0x10]
  132. %define filter_allocd dword [rsp+0x14]
  133. %define dst_incr_modd dword [rsp+0x18]
  134. %define dst_incr_divd dword [rsp+0x1c]
  135. mov srcq, r2mp
  136. %endif
  137. .loop:
  138. mov filterd, filter_allocd
  139. imul filterd, indexd
  140. %if ARCH_X86_64
  141. mov min_filter_count_x4q, min_filter_len_x4q
  142. lea filterq, [filter_bankq+filterq*%2]
  143. %else ; x86-32
  144. mov min_filter_count_x4q, filter_bankq
  145. lea filterq, [min_filter_count_x4q+filterq*%2]
  146. mov min_filter_count_x4q, min_filter_length_x4q
  147. %endif
  148. %ifidn %1, float
  149. xorps m0, m0, m0
  150. %else ; int16
  151. movd m0, [pd_0x4000]
  152. %endif
  153. align 16
  154. .inner_loop:
  155. movu m1, [srcq+min_filter_count_x4q*1]
  156. %ifidn %1, float
  157. mulps m1, m1, [filterq+min_filter_count_x4q*1]
  158. addps m0, m0, m1
  159. %else ; int16
  160. pmaddwd m1, [filterq+min_filter_count_x4q*1]
  161. paddd m0, m1
  162. %endif
  163. add min_filter_count_x4q, mmsize
  164. js .inner_loop
  165. %if cpuflag(avx)
  166. vextractf128 xm1, m0, 0x1
  167. addps xm0, xm1
  168. %endif
  169. ; horizontal sum & store
  170. %ifidn %1, float
  171. movhlps xm1, xm0
  172. addps xm0, xm1
  173. shufps xm1, xm0, xm0, q0001
  174. add fracd, dst_incr_modd
  175. addps xm0, xm1
  176. add indexd, dst_incr_divd
  177. movss [dstq], xm0
  178. %else ; int16
  179. %if mmsize == 16
  180. pshufd m1, m0, q0032
  181. paddd m0, m1
  182. pshufd m1, m0, q0001
  183. %else ; mmsize == 8
  184. pshufw m1, m0, q0032
  185. %endif
  186. paddd m0, m1
  187. psrad m0, 15
  188. add fracd, dst_incr_modd
  189. packssdw m0, m0
  190. add indexd, dst_incr_divd
  191. movd [dstq], m0
  192. %endif
  193. cmp fracd, src_incrd
  194. jl .skip
  195. sub fracd, src_incrd
  196. inc indexd
  197. %if UNIX64
  198. DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
  199. index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
  200. src_incr, phase_mask, dst_end, filter_bank
  201. %elif WIN64
  202. DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
  203. index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
  204. src_incr, phase_mask, dst_end, filter_bank
  205. %else ; x86-32
  206. DEFINE_ARGS src, phase_shift, dst, frac, index, index_incr
  207. %endif
  208. .skip:
  209. mov index_incrd, indexd
  210. add dstq, %2
  211. and indexd, phase_maskd
  212. sar index_incrd, phase_shiftb
  213. lea srcq, [srcq+index_incrq*%2]
  214. cmp dstq, dst_endq
  215. jne .loop
  216. %if ARCH_X86_64
  217. DEFINE_ARGS ctx, dst, src, phase_shift, index, frac
  218. %else ; x86-32
  219. DEFINE_ARGS src, ctx, update_context, frac, index
  220. %endif
  221. cmp dword update_context_stackd, 0
  222. jz .skip_store
  223. ; strictly speaking, the function should always return the consumed
  224. ; number of bytes; however, we only use the value if update_context
  225. ; is true, so let's just leave it uninitialized otherwise
  226. mov ctxq, ctx_stackq
  227. movifnidn rax, srcq
  228. mov [ctxq+ResampleContext.frac ], fracd
  229. sub rax, src_stackq
  230. mov [ctxq+ResampleContext.index], indexd
  231. shr rax, %3
  232. .skip_store:
  233. %if ARCH_X86_32
  234. ADD rsp, 0x20
  235. %endif
  236. RET
  237. ; int resample_linear_$format(ResampleContext *ctx, float *dst,
  238. ; const float *src, int size, int update_ctx)
  239. %if ARCH_X86_64 ; unix64 and win64
  240. %if UNIX64
  241. cglobal resample_linear_%1, 0, 15, 5, ctx, dst, phase_mask, phase_shift, index, frac, \
  242. size, dst_incr_mod, min_filter_count_x4, \
  243. min_filter_len_x4, dst_incr_div, src_incr, \
  244. src, dst_end, filter_bank
  245. mov srcq, r2mp
  246. %else ; win64
  247. cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_shift, index, frac, \
  248. size, dst_incr_mod, min_filter_count_x4, \
  249. min_filter_len_x4, dst_incr_div, src_incr, \
  250. dst, dst_end, filter_bank
  251. mov dstq, r1mp
  252. %endif
  253. ; use red-zone for variable storage
  254. %define ctx_stackq [rsp-0x8]
  255. %define src_stackq [rsp-0x10]
  256. %define phase_mask_stackd [rsp-0x14]
  257. %if WIN64
  258. %define update_context_stackd r4m
  259. %else ; unix64
  260. %define update_context_stackd [rsp-0x18]
  261. %endif
  262. ; load as many variables in registers as possible; for the rest, store
  263. ; on stack so that we have 'ctx' available as one extra register
  264. mov sized, r3d
  265. mov phase_maskd, [ctxq+ResampleContext.phase_mask]
  266. %if UNIX64
  267. mov update_context_stackd, r4d
  268. %endif
  269. mov indexd, [ctxq+ResampleContext.index]
  270. mov fracd, [ctxq+ResampleContext.frac]
  271. mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
  272. mov filter_bankq, [ctxq+ResampleContext.filter_bank]
  273. mov src_incrd, [ctxq+ResampleContext.src_incr]
  274. mov ctx_stackq, ctxq
  275. mov phase_mask_stackd, phase_maskd
  276. mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
  277. %ifidn %1, float
  278. cvtsi2ss xm0, src_incrd
  279. movss xm4, [pf_1]
  280. divss xm4, xm0
  281. %else ; int16
  282. movd m4, [pd_0x4000]
  283. %endif
  284. mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
  285. shl min_filter_len_x4d, %3
  286. lea dst_endq, [dstq+sizeq*%2]
  287. %if UNIX64
  288. mov ecx, [ctxq+ResampleContext.phase_shift]
  289. mov edi, [ctxq+ResampleContext.filter_alloc]
  290. DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, filter1, \
  291. dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
  292. dst_incr_div, src_incr, src, dst_end, filter_bank
  293. %elif WIN64
  294. mov R9d, [ctxq+ResampleContext.filter_alloc]
  295. mov ecx, [ctxq+ResampleContext.phase_shift]
  296. DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, filter1, \
  297. dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
  298. dst_incr_div, src_incr, dst, dst_end, filter_bank
  299. %endif
  300. neg min_filter_len_x4q
  301. sub filter_bankq, min_filter_len_x4q
  302. sub srcq, min_filter_len_x4q
  303. mov src_stackq, srcq
  304. %else ; x86-32
  305. cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
  306. frac, index, dst, filter_bank
  307. ; push temp variables to stack
  308. %define ctx_stackq r0mp
  309. %define src_stackq r2mp
  310. %define update_context_stackd r4m
  311. mov dstq, r1mp
  312. mov r3, r3mp
  313. lea r3, [dstq+r3*%2]
  314. PUSH dword [ctxq+ResampleContext.dst_incr_div]
  315. PUSH r3
  316. mov r3, dword [ctxq+ResampleContext.filter_alloc]
  317. PUSH dword [ctxq+ResampleContext.dst_incr_mod]
  318. PUSH r3
  319. shl r3, %3
  320. PUSH r3
  321. mov r3, dword [ctxq+ResampleContext.src_incr]
  322. PUSH dword [ctxq+ResampleContext.phase_mask]
  323. PUSH r3d
  324. %ifidn %1, float
  325. cvtsi2ss xm0, r3d
  326. movss xm4, [pf_1]
  327. divss xm4, xm0
  328. %else ; int16
  329. movd m4, [pd_0x4000]
  330. %endif
  331. mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
  332. mov indexd, [ctxq+ResampleContext.index]
  333. shl min_filter_length_x4d, %3
  334. mov fracd, [ctxq+ResampleContext.frac]
  335. neg min_filter_length_x4q
  336. mov filter_bankq, [ctxq+ResampleContext.filter_bank]
  337. sub r2mp, min_filter_length_x4q
  338. sub filter_bankq, min_filter_length_x4q
  339. PUSH min_filter_length_x4q
  340. PUSH filter_bankq
  341. PUSH dword [ctxq+ResampleContext.phase_shift]
  342. DEFINE_ARGS filter1, min_filter_count_x4, filter2, frac, index, dst, src
  343. %define phase_shift_stackd dword [rsp+0x0]
  344. %define filter_bankq dword [rsp+0x4]
  345. %define min_filter_length_x4q dword [rsp+0x8]
  346. %define src_incrd dword [rsp+0xc]
  347. %define phase_mask_stackd dword [rsp+0x10]
  348. %define filter_alloc_x4q dword [rsp+0x14]
  349. %define filter_allocd dword [rsp+0x18]
  350. %define dst_incr_modd dword [rsp+0x1c]
  351. %define dst_endq dword [rsp+0x20]
  352. %define dst_incr_divd dword [rsp+0x24]
  353. mov srcq, r2mp
  354. %endif
  355. .loop:
  356. mov filter1d, filter_allocd
  357. imul filter1d, indexd
  358. %if ARCH_X86_64
  359. mov min_filter_count_x4q, min_filter_len_x4q
  360. lea filter1q, [filter_bankq+filter1q*%2]
  361. lea filter2q, [filter1q+filter_allocq*%2]
  362. %else ; x86-32
  363. mov min_filter_count_x4q, filter_bankq
  364. lea filter1q, [min_filter_count_x4q+filter1q*%2]
  365. mov min_filter_count_x4q, min_filter_length_x4q
  366. mov filter2q, filter1q
  367. add filter2q, filter_alloc_x4q
  368. %endif
  369. %ifidn %1, float
  370. xorps m0, m0, m0
  371. xorps m2, m2, m2
  372. %else ; int16
  373. mova m0, m4
  374. mova m2, m4
  375. %endif
  376. align 16
  377. .inner_loop:
  378. movu m1, [srcq+min_filter_count_x4q*1]
  379. %ifidn %1, float
  380. mulps m3, m1, [filter2q+min_filter_count_x4q*1]
  381. mulps m1, m1, [filter1q+min_filter_count_x4q*1]
  382. addps m2, m2, m3
  383. addps m0, m0, m1
  384. %else ; int16
  385. pmaddwd m3, m1, [filter2q+min_filter_count_x4q*1]
  386. pmaddwd m1, [filter1q+min_filter_count_x4q*1]
  387. paddd m2, m3
  388. paddd m0, m1
  389. %endif
  390. add min_filter_count_x4q, mmsize
  391. js .inner_loop
  392. %if cpuflag(avx)
  393. vextractf128 xm1, m0, 0x1
  394. vextractf128 xm3, m2, 0x1
  395. addps xm0, xm1
  396. addps xm2, xm3
  397. %endif
  398. %ifidn %1, float
  399. ; val += (v2 - val) * (FELEML) frac / c->src_incr;
  400. cvtsi2ss xm1, fracd
  401. subps xm2, xm0
  402. mulps xm1, xm4
  403. shufps xm1, xm1, q0000
  404. mulps xm2, xm1
  405. addps xm0, xm2
  406. ; horizontal sum & store
  407. movhlps xm1, xm0
  408. addps xm0, xm1
  409. shufps xm1, xm0, xm0, q0001
  410. add fracd, dst_incr_modd
  411. addps xm0, xm1
  412. add indexd, dst_incr_divd
  413. movss [dstq], xm0
  414. %else ; int16
  415. %if mmsize == 16
  416. pshufd m3, m2, q0032
  417. pshufd m1, m0, q0032
  418. paddd m2, m3
  419. paddd m0, m1
  420. pshufd m3, m2, q0001
  421. pshufd m1, m0, q0001
  422. %else ; mmsize == 8
  423. pshufw m3, m2, q0032
  424. pshufw m1, m0, q0032
  425. %endif
  426. paddd m2, m3
  427. paddd m0, m1
  428. psubd m2, m0
  429. ; This is probably a really bad idea on atom and other machines with a
  430. ; long transfer latency between GPRs and XMMs (atom). However, it does
  431. ; make the clip a lot simpler...
  432. movd eax, m2
  433. add indexd, dst_incr_divd
  434. imul fracd
  435. idiv src_incrd
  436. movd m1, eax
  437. add fracd, dst_incr_modd
  438. paddd m0, m1
  439. psrad m0, 15
  440. packssdw m0, m0
  441. movd [dstq], m0
  442. ; note that for imul/idiv, I need to move filter to edx/eax for each:
  443. ; - 32bit: eax=r0[filter1], edx=r2[filter2]
  444. ; - win64: eax=r6[filter1], edx=r1[todo]
  445. ; - unix64: eax=r6[filter1], edx=r2[todo]
  446. %endif
  447. cmp fracd, src_incrd
  448. jl .skip
  449. sub fracd, src_incrd
  450. inc indexd
  451. %if UNIX64
  452. DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, index_incr, \
  453. dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
  454. dst_incr_div, src_incr, src, dst_end, filter_bank
  455. %elif WIN64
  456. DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, index_incr, \
  457. dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
  458. dst_incr_div, src_incr, dst, dst_end, filter_bank
  459. %else ; x86-32
  460. DEFINE_ARGS filter1, phase_shift, index_incr, frac, index, dst, src
  461. %endif
  462. .skip:
  463. %if ARCH_X86_32
  464. mov phase_shiftd, phase_shift_stackd
  465. %endif
  466. mov index_incrd, indexd
  467. add dstq, %2
  468. and indexd, phase_mask_stackd
  469. sar index_incrd, phase_shiftb
  470. lea srcq, [srcq+index_incrq*%2]
  471. cmp dstq, dst_endq
  472. jne .loop
  473. %if UNIX64
  474. DEFINE_ARGS ctx, dst, filter2, phase_shift, index, frac, index_incr, \
  475. dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
  476. dst_incr_div, src_incr, src, dst_end, filter_bank
  477. %elif WIN64
  478. DEFINE_ARGS ctx, filter2, src, phase_shift, index, frac, index_incr, \
  479. dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
  480. dst_incr_div, src_incr, dst, dst_end, filter_bank
  481. %else ; x86-32
  482. DEFINE_ARGS filter1, ctx, update_context, frac, index, dst, src
  483. %endif
  484. cmp dword update_context_stackd, 0
  485. jz .skip_store
  486. ; strictly speaking, the function should always return the consumed
  487. ; number of bytes; however, we only use the value if update_context
  488. ; is true, so let's just leave it uninitialized otherwise
  489. mov ctxq, ctx_stackq
  490. movifnidn rax, srcq
  491. mov [ctxq+ResampleContext.frac ], fracd
  492. sub rax, src_stackq
  493. mov [ctxq+ResampleContext.index], indexd
  494. shr rax, %3
  495. .skip_store:
  496. %if ARCH_X86_32
  497. ADD rsp, 0x28
  498. %endif
  499. RET
  500. %endmacro
  501. INIT_XMM sse
  502. RESAMPLE_FNS float, 4, 2
  503. %if HAVE_AVX_EXTERNAL
  504. INIT_YMM avx
  505. RESAMPLE_FNS float, 4, 2
  506. %endif
  507. %if ARCH_X86_32
  508. INIT_MMX mmxext
  509. RESAMPLE_FNS int16, 2, 1
  510. %endif
  511. INIT_XMM sse2
  512. RESAMPLE_FNS int16, 2, 1