/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "../swscale_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/mem_internal.h"

#define RET 0xC3 // near return opcode for x86
#define PREFETCH "prefetchnta"
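// Note: prefetchnta is the SSE non-temporal prefetch; the scale functions
// below use it to pull upcoming source cachelines in without polluting the
// cache. The RET opcode byte terminates the runtime-generated filter code so
// that it can be entered with a plain "call".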
#if HAVE_INLINE_ASM

av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
                                   int16_t *filter, int32_t *filterPos,
                                   int numSplits)
{
    uint8_t *fragmentA;
    x86_reg imm8OfPShufW1A;
    x86_reg imm8OfPShufW2A;
    x86_reg fragmentLengthA;
    uint8_t *fragmentB;
    x86_reg imm8OfPShufW1B;
    x86_reg imm8OfPShufW2B;
    x86_reg fragmentLengthB;
    int fragmentPos;

    int xpos, i;

    // create an optimized horizontal scaling routine
    /* This scaler is made of runtime-generated MMXEXT code using specially tuned
     * pshufw instructions. For every four output pixels, if four input pixels
     * are enough for the fast bilinear scaling, then a chunk of fragmentB is
     * used. If five input pixels are needed, then a chunk of fragmentA is used.
     */
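    /* Register convention assumed by both fragments (set up by the callers
     * further down in this file): FF_REG_c = current src, FF_REG_D = current
     * dst, FF_REG_d = filter coefficients, FF_REG_b = filterPos,
     * FF_REG_a = running byte offset, mm7 = 0 for the byte unpacks. */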
    // code fragment

    __asm__ volatile (
        "jmp                         9f                 \n\t"
        // Begin
        "0:                                             \n\t"
        "movq    (%%"FF_REG_d", %%"FF_REG_a"), %%mm3    \n\t"
        "movd    (%%"FF_REG_c", %%"FF_REG_S"), %%mm0    \n\t"
        "movd   1(%%"FF_REG_c", %%"FF_REG_S"), %%mm1    \n\t"
        "punpcklbw                %%mm7, %%mm1          \n\t"
        "punpcklbw                %%mm7, %%mm0          \n\t"
        "pshufw            $0xFF, %%mm1, %%mm1          \n\t"
        "1:                                             \n\t"
        "pshufw            $0xFF, %%mm0, %%mm0          \n\t"
        "2:                                             \n\t"
        "psubw                    %%mm1, %%mm0          \n\t"
        "movl   8(%%"FF_REG_b", %%"FF_REG_a"), %%esi    \n\t"
        "pmullw                   %%mm3, %%mm0          \n\t"
        "psllw                       $7, %%mm1          \n\t"
        "paddw                    %%mm1, %%mm0          \n\t"
        "movq    %%mm0, (%%"FF_REG_D", %%"FF_REG_a")    \n\t"
        "add                         $8, %%"FF_REG_a"   \n\t"
        // End
        "9:                                             \n\t"
        "lea       " LOCAL_MANGLE(0b) ", %0             \n\t"
        "lea       " LOCAL_MANGLE(1b) ", %1             \n\t"
        "lea       " LOCAL_MANGLE(2b) ", %2             \n\t"
        "dec                         %1                 \n\t"
        "dec                         %2                 \n\t"
        "sub                         %0, %1             \n\t"
        "sub                         %0, %2             \n\t"
        "lea       " LOCAL_MANGLE(9b) ", %3             \n\t"
        "sub                         %0, %3             \n\t"

        : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
          "=r" (fragmentLengthA)
    );
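    /* The fragment above is never executed here (the jmp skips straight to
     * 9:); the lea instructions only take the addresses of the local labels.
     * Each pshufw immediately precedes its label, so after dec the pointers
     * land on the instruction's imm8 byte, and the subs turn them into
     * offsets relative to the fragment start; %3 receives the total fragment
     * length. */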
    __asm__ volatile (
        "jmp                         9f                 \n\t"
        // Begin
        "0:                                             \n\t"
        "movq    (%%"FF_REG_d", %%"FF_REG_a"), %%mm3    \n\t"
        "movd    (%%"FF_REG_c", %%"FF_REG_S"), %%mm0    \n\t"
        "punpcklbw                %%mm7, %%mm0          \n\t"
        "pshufw            $0xFF, %%mm0, %%mm1          \n\t"
        "1:                                             \n\t"
        "pshufw            $0xFF, %%mm0, %%mm0          \n\t"
        "2:                                             \n\t"
        "psubw                    %%mm1, %%mm0          \n\t"
        "movl   8(%%"FF_REG_b", %%"FF_REG_a"), %%esi    \n\t"
        "pmullw                   %%mm3, %%mm0          \n\t"
        "psllw                       $7, %%mm1          \n\t"
        "paddw                    %%mm1, %%mm0          \n\t"
        "movq    %%mm0, (%%"FF_REG_D", %%"FF_REG_a")    \n\t"
        "add                         $8, %%"FF_REG_a"   \n\t"
        // End
        "9:                                             \n\t"
        "lea       " LOCAL_MANGLE(0b) ", %0             \n\t"
        "lea       " LOCAL_MANGLE(1b) ", %1             \n\t"
        "lea       " LOCAL_MANGLE(2b) ", %2             \n\t"
        "dec                         %1                 \n\t"
        "dec                         %2                 \n\t"
        "sub                         %0, %1             \n\t"
        "sub                         %0, %2             \n\t"
        "lea       " LOCAL_MANGLE(9b) ", %3             \n\t"
        "sub                         %0, %3             \n\t"

        : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
          "=r" (fragmentLengthB)
    );
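    /* Generation pass: for every group of four output pixels, compute the
     * source offsets a..d relative to the group's first input pixel, copy the
     * appropriate fragment, and patch its two pshufw immediates so the loaded
     * source bytes land in the correct word lanes. */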
    xpos        = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
    fragmentPos = 0;

    for (i = 0; i < dstW / numSplits; i++) {
        int xx = xpos >> 16;

        if ((i & 3) == 0) {
            int a                  = 0;
            int b                  = ((xpos + xInc) >> 16) - xx;
            int c                  = ((xpos + xInc * 2) >> 16) - xx;
            int d                  = ((xpos + xInc * 3) >> 16) - xx;
            int inc                = (d + 1 < 4);
            uint8_t *fragment      = inc ? fragmentB : fragmentA;
            x86_reg imm8OfPShufW1  = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
            x86_reg imm8OfPShufW2  = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
            x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
            int maxShift           = 3 - (d + inc);
            int shift              = 0;

            if (filterCode) {
                filter[i]        = ((xpos              & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 1]    = (((xpos + xInc)     & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 2]    = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 3]    = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
                filterPos[i / 2] = xx;

                memcpy(filterCode + fragmentPos, fragment, fragmentLength);

                filterCode[fragmentPos + imm8OfPShufW1] = (a + inc)        |
                                                          ((b + inc) << 2) |
                                                          ((c + inc) << 4) |
                                                          ((d + inc) << 6);
                filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
                                                          (c << 4)    |
                                                          (d << 6);

                if (i + 4 - inc >= dstW)
                    shift = maxShift;             // avoid overread
                else if ((filterPos[i / 2] & 3) <= maxShift)
                    shift = filterPos[i / 2] & 3; // align

                if (shift && i >= shift) {
                    filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
                    filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
                    filterPos[i / 2]                        -= shift;
                }
            }

            fragmentPos += fragmentLength;

            if (filterCode)
                filterCode[fragmentPos] = RET;
        }
        xpos += xInc;
    }
    if (filterCode)
        filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part
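    /* fragmentPos indexes the final RET byte here, so the generated code is
     * fragmentPos + 1 bytes long; a first call with filterCode == NULL can be
     * used purely to obtain this size before allocating executable memory. */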
    return fragmentPos + 1;
}
void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
                            int dstWidth, const uint8_t *src,
                            int srcW, int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    void *mmxextFilterCode = c->lumMmxextFilterCode;
    int i;

#if ARCH_X86_64
    uint64_t retsave;
#else
#if !HAVE_EBX_AVAILABLE
    uint64_t ebxsave;
#endif
#endif
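    /* The generated code is entered with "call" without reserving stack
     * space, so on x86-64 the pushed return address lands in the red zone at
     * -8(%rsp); retsave preserves whatever the compiler keeps there. On x86-32
     * builds where ebx is reserved (PIC), ebxsave lets the asm borrow it. */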
    __asm__ volatile(
#if ARCH_X86_64
        "mov               -8(%%rsp), %%"FF_REG_a"  \n\t"
        "mov            %%"FF_REG_a", %5            \n\t" // retsave
#else
#if !HAVE_EBX_AVAILABLE
        "mov            %%"FF_REG_b", %5            \n\t" // ebxsave
#endif
#endif
        "pxor                  %%mm7, %%mm7         \n\t"
        "mov                      %0, %%"FF_REG_c"  \n\t"
        "mov                      %1, %%"FF_REG_D"  \n\t"
        "mov                      %2, %%"FF_REG_d"  \n\t"
        "mov                      %3, %%"FF_REG_b"  \n\t"
        "xor            %%"FF_REG_a", %%"FF_REG_a"  \n\t" // i
        PREFETCH"   (%%"FF_REG_c")                  \n\t"
        PREFETCH" 32(%%"FF_REG_c")                  \n\t"
        PREFETCH" 64(%%"FF_REG_c")                  \n\t"

#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
        "movl         (%%"FF_REG_b"), %%esi         \n\t"\
        "call                     *%4               \n\t"\
        "movl (%%"FF_REG_b", %%"FF_REG_a"), %%esi   \n\t"\
        "add            %%"FF_REG_S", %%"FF_REG_c"  \n\t"\
        "add            %%"FF_REG_a", %%"FF_REG_D"  \n\t"\
        "xor            %%"FF_REG_a", %%"FF_REG_a"  \n\t"
#else
#define CALL_MMXEXT_FILTER_CODE \
        "movl         (%%"FF_REG_b"), %%esi         \n\t"\
        "call                     *%4               \n\t"\
        "addl (%%"FF_REG_b", %%"FF_REG_a"), %%"FF_REG_c" \n\t"\
        "add            %%"FF_REG_a", %%"FF_REG_D"  \n\t"\
        "xor            %%"FF_REG_a", %%"FF_REG_a"  \n\t"
#endif /* ARCH_X86_64 */
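        /* Each CALL_MMXEXT_FILTER_CODE runs one generated piece: load the
         * current filterPos into esi, call the code, then re-base the src
         * pointer from the next filterPos and the dst pointer from the byte
         * count in FF_REG_a before resetting it. The code was generated for
         * dstW / numSplits pixels, hence the eight calls for the luma plane. */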
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov                      %5, %%"FF_REG_a"  \n\t"
        "mov            %%"FF_REG_a", -8(%%rsp)     \n\t"
#else
#if !HAVE_EBX_AVAILABLE
        "mov                      %5, %%"FF_REG_b"  \n\t"
#endif
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode)
#if ARCH_X86_64
          ,"m"(retsave)
#else
#if !HAVE_EBX_AVAILABLE
          ,"m" (ebxsave)
#endif
#endif
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
         ,"%"FF_REG_b
#endif
    );
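    /* Pad the tail: for output pixels whose source position falls at or past
     * the last input pixel, replicate that pixel. The * 128 matches the 7-bit
     * coefficient scale of the generated code (psllw $7 / >> 9 above). */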
    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
        dst[i] = src[srcW - 1] * 128;
}
void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
                            int dstWidth, const uint8_t *src1,
                            const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    void *mmxextFilterCode = c->chrMmxextFilterCode;
    int i;

#if ARCH_X86_64
    DECLARE_ALIGNED(8, uint64_t, retsave);
#else
#if !HAVE_EBX_AVAILABLE
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#endif
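    /* Same save/restore dance as the luma path. The generated chroma code is
     * invoked four times per plane: first over src1/dst1, then the pointers
     * are re-based to src2/dst2 for the second plane. */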
    __asm__ volatile(
#if ARCH_X86_64
        "mov               -8(%%rsp), %%"FF_REG_a"  \n\t"
        "mov            %%"FF_REG_a", %7            \n\t" // retsave
#else
#if !HAVE_EBX_AVAILABLE
        "mov            %%"FF_REG_b", %7            \n\t" // ebxsave
#endif
#endif
        "pxor                  %%mm7, %%mm7         \n\t"
        "mov                      %0, %%"FF_REG_c"  \n\t"
        "mov                      %1, %%"FF_REG_D"  \n\t"
        "mov                      %2, %%"FF_REG_d"  \n\t"
        "mov                      %3, %%"FF_REG_b"  \n\t"
        "xor            %%"FF_REG_a", %%"FF_REG_a"  \n\t" // i
        PREFETCH"   (%%"FF_REG_c")                  \n\t"
        PREFETCH" 32(%%"FF_REG_c")                  \n\t"
        PREFETCH" 64(%%"FF_REG_c")                  \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        "xor            %%"FF_REG_a", %%"FF_REG_a"  \n\t" // i
        "mov                      %5, %%"FF_REG_c"  \n\t" // src2
        "mov                      %6, %%"FF_REG_D"  \n\t" // dst2
        PREFETCH"   (%%"FF_REG_c")                  \n\t"
        PREFETCH" 32(%%"FF_REG_c")                  \n\t"
        PREFETCH" 64(%%"FF_REG_c")                  \n\t"
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov                      %7, %%"FF_REG_a"  \n\t"
        "mov            %%"FF_REG_a", -8(%%rsp)     \n\t"
#else
#if !HAVE_EBX_AVAILABLE
        "mov                      %7, %%"FF_REG_b"  \n\t"
#endif
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode), "m" (src2), "m" (dst2)
#if ARCH_X86_64
          ,"m"(retsave)
#else
#if !HAVE_EBX_AVAILABLE
          ,"m" (ebxsave)
#endif
#endif
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
         ,"%"FF_REG_b
#endif
    );

    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
        dst1[i] = src1[srcW - 1] * 128;
        dst2[i] = src2[srcW - 1] * 128;
    }
}
#endif /* HAVE_INLINE_ASM */