You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

856 lines
36KB

  1. /*
  2. * DSP utils : average functions are compiled twice for 3dnow/mmxext
  3. * Copyright (c) 2000, 2001 Fabrice Bellard
  4. * Copyright (c) 2002-2004 Michael Niedermayer
  5. *
  6. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  7. * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  8. * and improved by Zdenek Kabelac <kabi@users.sf.net>
  9. *
  10. * This file is part of Libav.
  11. *
  12. * Libav is free software; you can redistribute it and/or
  13. * modify it under the terms of the GNU Lesser General Public
  14. * License as published by the Free Software Foundation; either
  15. * version 2.1 of the License, or (at your option) any later version.
  16. *
  17. * Libav is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  20. * Lesser General Public License for more details.
  21. *
  22. * You should have received a copy of the GNU Lesser General Public
  23. * License along with Libav; if not, write to the Free Software
  24. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25. */
  26. /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
  27. clobber bug - now it will work with 2.95.2 and also with -fPIC
  28. */
/* put_pixels8_x2: half-pel horizontal interpolation of an 8-byte-wide block:
 * block[y][x] = PAVGB(pixels[y][x], pixels[y][x+1]).
 *
 * PAVGB is a template macro that expands to the packed-byte-average
 * instruction of the flavour this file is compiled for (see the file
 * header: compiled twice, for 3DNow! and MMXEXT).
 * NOTE(review): mm0/mm1 are clobbered without being listed in the clobber
 * set; the surrounding dsputil code presumably issues emms elsewhere —
 * confirm against the including file.
 *
 * h is counted down by 4 per iteration ("subl $4" / "jnz"), so it must be
 * a positive multiple of 4.
 */
static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2*line_size */
        "1: \n\t"
        /* rows 0 and 1: average each row with itself shifted right by 1 byte */
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        PAVGB" 1(%1), %%mm0 \n\t"
        PAVGB" 1(%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        /* rows 2 and 3 */
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        PAVGB" 1(%1), %%mm0 \n\t"
        PAVGB" 1(%1, %3), %%mm1 \n\t"
        "add %%"REG_a", %1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((x86_reg)line_size)
        :"%"REG_a, "memory");
}
  56. #ifndef SKIP_FOR_3DNOW
/* put_pixels8_l2: dst[y] = PAVGB(src1[y], src2[y]) for an 8-byte-wide block.
 *
 * src1 is strided by src1Stride; src2 is read PACKED with a fixed stride of
 * 8 bytes per row (see "add $8, %2" / "add $32, %2"), so src2Stride is not
 * used inside the asm.  dst is strided by dstStride.
 *
 * One row is peeled up front when h is odd ("testl $1"); the main loop then
 * processes 4 rows per iteration, so after the peel h must be a multiple
 * of 4.
 *
 * When EBX is reserved (PIC on x86-32, !HAVE_EBX_AVAILABLE) h is kept in
 * memory instead of a register — see the constraint #if below.
 */
static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm__ volatile(
        /* peel one row if h is odd */
        "testl $1, %0 \n\t"
        " jz 1f \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%2), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "add $8, %2 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%3) \n\t"
        "add %5, %3 \n\t"
        "decl %0 \n\t"
        "1: \n\t"
        /* main loop: 4 rows (4 x 8 bytes) per iteration */
        "movq (%1), %%mm0 \n\t"
        "add %4, %1 \n\t"
        "movq (%1), %%mm1 \n\t"
        "add %4, %1 \n\t"
        PAVGB" (%2), %%mm0 \n\t"
        PAVGB" 8(%2), %%mm1 \n\t"
        "movq %%mm0, (%3) \n\t"
        "add %5, %3 \n\t"
        "movq %%mm1, (%3) \n\t"
        "add %5, %3 \n\t"
        "movq (%1), %%mm0 \n\t"
        "add %4, %1 \n\t"
        "movq (%1), %%mm1 \n\t"
        "add %4, %1 \n\t"
        PAVGB" 16(%2), %%mm0 \n\t"
        PAVGB" 24(%2), %%mm1 \n\t"
        "movq %%mm0, (%3) \n\t"
        "add %5, %3 \n\t"
        "movq %%mm1, (%3) \n\t"
        "add %5, %3 \n\t"
        "add $32, %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
/* put_no_rnd_pixels8_l2: like put_pixels8_l2 but with no-rounding (floor)
 * averaging.  mm6 is set to all-ones (pcmpeqb) and both inputs and the
 * result are complemented around PAVGB, using the identity
 *   floor_avg(a, b) = ~ceil_avg(~a, ~b).
 *
 * src2 is packed with a fixed 8-byte row stride; one row is peeled when h
 * is odd, then the loop handles 4 rows per iteration.
 */
static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm__ volatile(
        "pcmpeqb %%mm6, %%mm6 \n\t" /* mm6 = 0xFF..FF, used to complement */
        "testl $1, %0 \n\t"
        " jz 1f \n\t"
        /* odd-h peel */
        "movq (%1), %%mm0 \n\t"
        "movq (%2), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "add $8, %2 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "movq %%mm0, (%3) \n\t"
        "add %5, %3 \n\t"
        "decl %0 \n\t"
        "1: \n\t"
        /* main loop: 4 rows per iteration, complement / avg / complement */
        "movq (%1), %%mm0 \n\t"
        "add %4, %1 \n\t"
        "movq (%1), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "movq (%2), %%mm2 \n\t"
        "movq 8(%2), %%mm3 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        "pxor %%mm6, %%mm2 \n\t"
        "pxor %%mm6, %%mm3 \n\t"
        PAVGB" %%mm2, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm1 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        "movq %%mm0, (%3) \n\t"
        "add %5, %3 \n\t"
        "movq %%mm1, (%3) \n\t"
        "add %5, %3 \n\t"
        "movq (%1), %%mm0 \n\t"
        "add %4, %1 \n\t"
        "movq (%1), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "movq 16(%2), %%mm2 \n\t"
        "movq 24(%2), %%mm3 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        "pxor %%mm6, %%mm2 \n\t"
        "pxor %%mm6, %%mm3 \n\t"
        PAVGB" %%mm2, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm1 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        "movq %%mm0, (%3) \n\t"
        "add %5, %3 \n\t"
        "movq %%mm1, (%3) \n\t"
        "add %5, %3 \n\t"
        "add $32, %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
/* avg_pixels8_l2: dst[y] = PAVGB(PAVGB(src1[y], src2[y]), dst[y]) for an
 * 8-byte-wide block — i.e. the l2 average is blended into the existing
 * destination.
 *
 * src2 is packed with a fixed 8-byte row stride; one row is peeled when h
 * is odd, then 4 rows per loop iteration.  Note each PAVGB" (%3)" must read
 * dst before the movq to the same address overwrites it.
 */
static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm__ volatile(
        "testl $1, %0 \n\t"
        " jz 1f \n\t"
        /* odd-h peel */
        "movq (%1), %%mm0 \n\t"
        "movq (%2), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "add $8, %2 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" (%3), %%mm0 \n\t" /* blend with current dst row */
        "movq %%mm0, (%3) \n\t"
        "add %5, %3 \n\t"
        "decl %0 \n\t"
        "1: \n\t"
        /* main loop: 4 rows per iteration */
        "movq (%1), %%mm0 \n\t"
        "add %4, %1 \n\t"
        "movq (%1), %%mm1 \n\t"
        "add %4, %1 \n\t"
        PAVGB" (%2), %%mm0 \n\t"
        PAVGB" 8(%2), %%mm1 \n\t"
        PAVGB" (%3), %%mm0 \n\t"
        "movq %%mm0, (%3) \n\t"
        "add %5, %3 \n\t"
        PAVGB" (%3), %%mm1 \n\t"
        "movq %%mm1, (%3) \n\t"
        "add %5, %3 \n\t"
        "movq (%1), %%mm0 \n\t"
        "add %4, %1 \n\t"
        "movq (%1), %%mm1 \n\t"
        "add %4, %1 \n\t"
        PAVGB" 16(%2), %%mm0 \n\t"
        PAVGB" 24(%2), %%mm1 \n\t"
        PAVGB" (%3), %%mm0 \n\t"
        "movq %%mm0, (%3) \n\t"
        "add %5, %3 \n\t"
        PAVGB" (%3), %%mm1 \n\t"
        "movq %%mm1, (%3) \n\t"
        "add %5, %3 \n\t"
        "add $32, %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
  229. #endif /* SKIP_FOR_3DNOW */
/* put_pixels16_x2: half-pel horizontal interpolation of a 16-byte-wide
 * block; identical to put_pixels8_x2 but each row is handled as two movq
 * halves (offsets 0/1 and 8/9).  h must be a positive multiple of 4.
 */
static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2*line_size */
        "1: \n\t"
        /* rows 0 and 1, low and high 8-byte halves */
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1), %%mm2 \n\t"
        "movq 8(%1, %3), %%mm3 \n\t"
        PAVGB" 1(%1), %%mm0 \n\t"
        PAVGB" 1(%1, %3), %%mm1 \n\t"
        PAVGB" 9(%1), %%mm2 \n\t"
        PAVGB" 9(%1, %3), %%mm3 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm2, 8(%2) \n\t"
        "movq %%mm3, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        /* rows 2 and 3 */
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1), %%mm2 \n\t"
        "movq 8(%1, %3), %%mm3 \n\t"
        PAVGB" 1(%1), %%mm0 \n\t"
        PAVGB" 1(%1, %3), %%mm1 \n\t"
        PAVGB" 9(%1), %%mm2 \n\t"
        PAVGB" 9(%1, %3), %%mm3 \n\t"
        "add %%"REG_a", %1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm2, 8(%2) \n\t"
        "movq %%mm3, 8(%2, %3) \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((x86_reg)line_size)
        :"%"REG_a, "memory");
}
  269. #ifndef SKIP_FOR_3DNOW
/* put_pixels16_l2: dst[y] = PAVGB(src1[y], src2[y]) for a 16-byte-wide
 * block.  src2 is packed with a fixed 16-byte row stride ("add $16"/"add
 * $32" on %2).  One row is peeled when h is odd; the main loop then handles
 * 2 rows per iteration ("subl $2").
 */
static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm__ volatile(
        "testl $1, %0 \n\t"
        " jz 1f \n\t"
        /* odd-h peel */
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm1 \n\t"
        PAVGB" (%2), %%mm0 \n\t"
        PAVGB" 8(%2), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "add $16, %2 \n\t"
        "movq %%mm0, (%3) \n\t"
        "movq %%mm1, 8(%3) \n\t"
        "add %5, %3 \n\t"
        "decl %0 \n\t"
        "1: \n\t"
        /* main loop: 2 rows (2 x 16 bytes) per iteration */
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm1 \n\t"
        "add %4, %1 \n\t"
        PAVGB" (%2), %%mm0 \n\t"
        PAVGB" 8(%2), %%mm1 \n\t"
        "movq %%mm0, (%3) \n\t"
        "movq %%mm1, 8(%3) \n\t"
        "add %5, %3 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm1 \n\t"
        "add %4, %1 \n\t"
        PAVGB" 16(%2), %%mm0 \n\t"
        PAVGB" 24(%2), %%mm1 \n\t"
        "movq %%mm0, (%3) \n\t"
        "movq %%mm1, 8(%3) \n\t"
        "add %5, %3 \n\t"
        "add $32, %2 \n\t"
        "subl $2, %0 \n\t"
        "jnz 1b \n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
/* avg_pixels16_l2: dst[y] = PAVGB(PAVGB(src1[y], src2[y]), dst[y]) for a
 * 16-byte-wide block.  src2 is packed with a fixed 16-byte row stride.
 * One row peeled when h is odd, then 2 rows per loop iteration.
 */
static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm__ volatile(
        "testl $1, %0 \n\t"
        " jz 1f \n\t"
        /* odd-h peel */
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm1 \n\t"
        PAVGB" (%2), %%mm0 \n\t"
        PAVGB" 8(%2), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "add $16, %2 \n\t"
        PAVGB" (%3), %%mm0 \n\t" /* blend with current dst row */
        PAVGB" 8(%3), %%mm1 \n\t"
        "movq %%mm0, (%3) \n\t"
        "movq %%mm1, 8(%3) \n\t"
        "add %5, %3 \n\t"
        "decl %0 \n\t"
        "1: \n\t"
        /* main loop: 2 rows per iteration */
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm1 \n\t"
        "add %4, %1 \n\t"
        PAVGB" (%2), %%mm0 \n\t"
        PAVGB" 8(%2), %%mm1 \n\t"
        PAVGB" (%3), %%mm0 \n\t"
        PAVGB" 8(%3), %%mm1 \n\t"
        "movq %%mm0, (%3) \n\t"
        "movq %%mm1, 8(%3) \n\t"
        "add %5, %3 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm1 \n\t"
        "add %4, %1 \n\t"
        PAVGB" 16(%2), %%mm0 \n\t"
        PAVGB" 24(%2), %%mm1 \n\t"
        PAVGB" (%3), %%mm0 \n\t"
        PAVGB" 8(%3), %%mm1 \n\t"
        "movq %%mm0, (%3) \n\t"
        "movq %%mm1, 8(%3) \n\t"
        "add %5, %3 \n\t"
        "add $32, %2 \n\t"
        "subl $2, %0 \n\t"
        "jnz 1b \n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
/* put_no_rnd_pixels16_l2: 16-byte-wide l2 average with no-rounding (floor)
 * semantics: floor_avg(a, b) = ~ceil_avg(~a, ~b), implemented by
 * complementing with mm6 = all-ones around PAVGB.  src2 is packed with a
 * fixed 16-byte row stride; one row peeled when h is odd, then 2 rows per
 * loop iteration.
 */
static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm__ volatile(
        "pcmpeqb %%mm6, %%mm6 \n\t" /* mm6 = 0xFF..FF */
        "testl $1, %0 \n\t"
        " jz 1f \n\t"
        /* odd-h peel */
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm1 \n\t"
        "movq (%2), %%mm2 \n\t"
        "movq 8(%2), %%mm3 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        "pxor %%mm6, %%mm2 \n\t"
        "pxor %%mm6, %%mm3 \n\t"
        PAVGB" %%mm2, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm1 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        "add %4, %1 \n\t"
        "add $16, %2 \n\t"
        "movq %%mm0, (%3) \n\t"
        "movq %%mm1, 8(%3) \n\t"
        "add %5, %3 \n\t"
        "decl %0 \n\t"
        "1: \n\t"
        /* main loop: 2 rows per iteration, complement / avg / complement */
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "movq (%2), %%mm2 \n\t"
        "movq 8(%2), %%mm3 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        "pxor %%mm6, %%mm2 \n\t"
        "pxor %%mm6, %%mm3 \n\t"
        PAVGB" %%mm2, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm1 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        "movq %%mm0, (%3) \n\t"
        "movq %%mm1, 8(%3) \n\t"
        "add %5, %3 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "movq 16(%2), %%mm2 \n\t"
        "movq 24(%2), %%mm3 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        "pxor %%mm6, %%mm2 \n\t"
        "pxor %%mm6, %%mm3 \n\t"
        PAVGB" %%mm2, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm1 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        "movq %%mm0, (%3) \n\t"
        "movq %%mm1, 8(%3) \n\t"
        "add %5, %3 \n\t"
        "add $32, %2 \n\t"
        "subl $2, %0 \n\t"
        "jnz 1b \n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
  442. #endif /* SKIP_FOR_3DNOW */
/* GL: this function does incorrect rounding if overflow */
/* put_no_rnd_pixels8_x2: approximate no-rounding half-pel horizontal
 * interpolation.  MOVQ_BONE sets mm6 to 0x01 in every byte; subtracting it
 * (saturating, psubusb) from one PAVGB operand turns the round-up into a
 * round-down — except where the byte is already 0 and the subtraction
 * saturates, hence the "incorrect rounding" note above.
 * h must be a positive multiple of 4.
 */
static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BONE(mm6); /* mm6 = 0x0101010101010101 */
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2*line_size */
        "1: \n\t"
        /* rows 0 and 1 */
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        "add %%"REG_a", %1 \n\t"
        "psubusb %%mm6, %%mm0 \n\t" /* bias one operand down by 1 */
        "psubusb %%mm6, %%mm2 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm2, (%2, %3) \n\t"
        /* rows 2 and 3 */
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        "add %%"REG_a", %2 \n\t"
        "add %%"REG_a", %1 \n\t"
        "psubusb %%mm6, %%mm0 \n\t"
        "psubusb %%mm6, %%mm2 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm2, (%2, %3) \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((x86_reg)line_size)
        :"%"REG_a, "memory");
}
/* put_no_rnd_pixels8_x2_exact: exact no-rounding half-pel horizontal
 * interpolation, using floor_avg(a, b) = ~ceil_avg(~a, ~b) with
 * mm6 = all-ones.  Processes 4 rows per iteration using %3 (line_size),
 * (%1,%3,2) and %4 = 3*line_size; the loop uses "jg", and since it always
 * writes 4 rows, h is presumably a multiple of 4 — confirm with callers.
 */
static void DEF(put_no_rnd_pixels8_x2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile (
        "pcmpeqb %%mm6, %%mm6 \n\t" /* mm6 = 0xFF..FF */
        "1: \n\t"
        /* rows 0 and 1 */
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "pxor %%mm6, %%mm2 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        "pxor %%mm6, %%mm3 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "pxor %%mm6, %%mm2 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm2, (%2, %3) \n\t"
        /* rows 2 and 3 (%4 = 3*line_size) */
        "movq (%1, %3,2), %%mm0 \n\t"
        "movq 1(%1, %3,2), %%mm1 \n\t"
        "movq (%1, %4), %%mm2 \n\t"
        "movq 1(%1, %4), %%mm3 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        "pxor %%mm6, %%mm2 \n\t"
        "pxor %%mm6, %%mm3 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "pxor %%mm6, %%mm2 \n\t"
        "movq %%mm0, (%2, %3,2) \n\t"
        "movq %%mm2, (%2, %4) \n\t"
        "lea (%1, %3,4), %1 \n\t"
        "lea (%2, %3,4), %2 \n\t"
        "subl $4, %0 \n\t"
        "jg 1b \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r" ((x86_reg)line_size), "r"((x86_reg)3*line_size)
        : "memory"
    );
}
/* put_pixels8_y2: half-pel vertical interpolation of an 8-byte-wide block:
 * block[y][x] = PAVGB(pixels[y][x], pixels[y+1][x]).
 *
 * Software-pipelined: mm0/mm2 carry the bottom row of the previous step
 * across iterations, so each source row is loaded only once.  dst is
 * pre-decremented by one stride ("sub %3, %2") so the stores at (%2,%3)
 * and (%2,REG_a) land on the current two output rows.
 * h must be a positive multiple of 4.
 */
static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2*line_size */
        "movq (%1), %%mm0 \n\t" /* prime the pipeline with row 0 */
        "sub %3, %2 \n\t"
        "1: \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm2 \n\t"
        "add %%"REG_a", %1 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm2, %%mm1 \n\t"
        "movq %%mm0, (%2, %3) \n\t"
        "movq %%mm1, (%2, %%"REG_a") \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "add %%"REG_a", %2 \n\t"
        "add %%"REG_a", %1 \n\t"
        PAVGB" %%mm1, %%mm2 \n\t"
        PAVGB" %%mm0, %%mm1 \n\t"
        "movq %%mm2, (%2, %3) \n\t"
        "movq %%mm1, (%2, %%"REG_a") \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D" (block)
        :"r" ((x86_reg)line_size)
        :"%"REG_a, "memory");
}
/* GL: this function does incorrect rounding if overflow */
/* put_no_rnd_pixels8_y2: approximate no-rounding half-pel vertical
 * interpolation.  Same pipelined structure as put_pixels8_y2, but mm6
 * (= 0x01 bytes, MOVQ_BONE) is subtracted (saturating) from the middle row
 * before averaging to bias the PAVGB round-up down — inexact when a byte
 * is 0, hence the note above.  h must be a positive multiple of 4.
 */
static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BONE(mm6); /* mm6 = 0x0101010101010101 */
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2*line_size */
        "movq (%1), %%mm0 \n\t"
        "sub %3, %2 \n\t"
        "1: \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm2 \n\t"
        "add %%"REG_a", %1 \n\t"
        "psubusb %%mm6, %%mm1 \n\t" /* bias shared middle row down by 1 */
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm2, %%mm1 \n\t"
        "movq %%mm0, (%2, %3) \n\t"
        "movq %%mm1, (%2, %%"REG_a") \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "add %%"REG_a", %2 \n\t"
        "add %%"REG_a", %1 \n\t"
        "psubusb %%mm6, %%mm1 \n\t"
        PAVGB" %%mm1, %%mm2 \n\t"
        PAVGB" %%mm0, %%mm1 \n\t"
        "movq %%mm2, (%2, %3) \n\t"
        "movq %%mm1, (%2, %%"REG_a") \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D" (block)
        :"r" ((x86_reg)line_size)
        :"%"REG_a, "memory");
}
/* put_no_rnd_pixels8_y2_exact: exact no-rounding half-pel vertical
 * interpolation via floor_avg(a, b) = ~ceil_avg(~a, ~b) (mm6 = all-ones).
 * Pipelined like put_pixels8_y2: the carried row is kept in complemented
 * form between iterations.  4 rows per iteration, %4 = 3*line_size,
 * "jg" loop.
 */
static void DEF(put_no_rnd_pixels8_y2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile (
        "movq (%1), %%mm0 \n\t" /* prime pipeline with row 0 */
        "pcmpeqb %%mm6, %%mm6 \n\t" /* mm6 = 0xFF..FF */
        "add %3, %1 \n\t"
        "pxor %%mm6, %%mm0 \n\t" /* keep carried row complemented */
        "1: \n\t"
        "movq (%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        "pxor %%mm6, %%mm2 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm2, %%mm1 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq (%1, %3,2), %%mm1 \n\t"
        "movq (%1, %4), %%mm0 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        "pxor %%mm6, %%mm0 \n\t"
        PAVGB" %%mm1, %%mm2 \n\t"
        PAVGB" %%mm0, %%mm1 \n\t"
        "pxor %%mm6, %%mm2 \n\t"
        "pxor %%mm6, %%mm1 \n\t"
        "movq %%mm2, (%2, %3,2) \n\t"
        "movq %%mm1, (%2, %4) \n\t"
        "lea (%1, %3,4), %1 \n\t"
        "lea (%2, %3,4), %2 \n\t"
        "subl $4, %0 \n\t"
        "jg 1b \n\t"
        :"+g"(h), "+r"(pixels), "+r" (block)
        :"r" ((x86_reg)line_size), "r"((x86_reg)3*line_size)
        :"memory"
    );
}
/* avg_pixels8: blend an 8-byte-wide block into the destination:
 * block[y][x] = PAVGB(block[y][x], pixels[y][x]).
 * 4 rows per iteration; h must be a positive multiple of 4.
 */
static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2*line_size */
        "1: \n\t"
        /* rows 0 and 1: load dst, average in src, store back */
        "movq (%2), %%mm0 \n\t"
        "movq (%2, %3), %%mm1 \n\t"
        PAVGB" (%1), %%mm0 \n\t"
        PAVGB" (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        /* rows 2 and 3 */
        "movq (%2), %%mm0 \n\t"
        "movq (%2, %3), %%mm1 \n\t"
        PAVGB" (%1), %%mm0 \n\t"
        PAVGB" (%1, %3), %%mm1 \n\t"
        "add %%"REG_a", %1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((x86_reg)line_size)
        :"%"REG_a, "memory");
}
/* avg_pixels8_x2: blend the half-pel horizontal interpolation into dst:
 * block[y][x] = PAVGB(block[y][x], PAVGB(pixels[y][x], pixels[y][x+1])).
 * 4 rows per iteration; h must be a positive multiple of 4.
 */
static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2*line_size */
        "1: \n\t"
        /* rows 0 and 1: horizontal half-pel, then blend with dst */
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        PAVGB" 1(%1), %%mm0 \n\t"
        PAVGB" 1(%1, %3), %%mm2 \n\t"
        PAVGB" (%2), %%mm0 \n\t"
        PAVGB" (%2, %3), %%mm2 \n\t"
        "add %%"REG_a", %1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm2, (%2, %3) \n\t"
        /* rows 2 and 3 */
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        PAVGB" 1(%1), %%mm0 \n\t"
        PAVGB" 1(%1, %3), %%mm2 \n\t"
        "add %%"REG_a", %2 \n\t"
        "add %%"REG_a", %1 \n\t"
        PAVGB" (%2), %%mm0 \n\t"
        PAVGB" (%2, %3), %%mm2 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm2, (%2, %3) \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((x86_reg)line_size)
        :"%"REG_a, "memory");
}
/* avg_pixels8_y2: blend the half-pel vertical interpolation into dst:
 * block[y][x] = PAVGB(block[y][x], PAVGB(pixels[y][x], pixels[y+1][x])).
 * Pipelined like put_pixels8_y2 (mm0/mm2 carry the previous bottom row;
 * dst pre-decremented by one stride).  h must be a positive multiple of 4.
 */
static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2*line_size */
        "movq (%1), %%mm0 \n\t" /* prime pipeline with row 0 */
        "sub %3, %2 \n\t"
        "1: \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm2 \n\t"
        "add %%"REG_a", %1 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm2, %%mm1 \n\t"
        "movq (%2, %3), %%mm3 \n\t" /* current dst rows */
        "movq (%2, %%"REG_a"), %%mm4 \n\t"
        PAVGB" %%mm3, %%mm0 \n\t"
        PAVGB" %%mm4, %%mm1 \n\t"
        "movq %%mm0, (%2, %3) \n\t"
        "movq %%mm1, (%2, %%"REG_a") \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        PAVGB" %%mm1, %%mm2 \n\t"
        PAVGB" %%mm0, %%mm1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "add %%"REG_a", %1 \n\t"
        "movq (%2, %3), %%mm3 \n\t"
        "movq (%2, %%"REG_a"), %%mm4 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        PAVGB" %%mm4, %%mm1 \n\t"
        "movq %%mm2, (%2, %3) \n\t"
        "movq %%mm1, (%2, %%"REG_a") \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((x86_reg)line_size)
        :"%"REG_a, "memory");
}
/* Note this is not correctly rounded, but this function is only
 * used for B-frames so it does not matter. */
/* avg_pixels8_xy2: blend an approximate 2x2 (half-pel horizontal AND
 * vertical) interpolation into dst.  Horizontal half-pel averages are
 * formed with PAVGB against the +1 offset, then averaged vertically and
 * blended with dst.  mm6 = 0x01 bytes (MOVQ_BONE) is psubusb'ed from one
 * intermediate per pair to reduce the cumulative round-up bias of the
 * chained PAVGBs — an approximation, per the note above.
 * Pipelined across iterations; h must be a positive multiple of 4.
 */
static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BONE(mm6); /* mm6 = 0x0101010101010101 */
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2*line_size */
        "movq (%1), %%mm0 \n\t"
        PAVGB" 1(%1), %%mm0 \n\t" /* prime: horizontal avg of row 0 */
        ".p2align 3 \n\t" /* align loop head */
        "1: \n\t"
        "movq (%1, %%"REG_a"), %%mm2 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "psubusb %%mm6, %%mm2 \n\t" /* bias against round-up accumulation */
        PAVGB" 1(%1, %3), %%mm1 \n\t"
        PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
        "add %%"REG_a", %1 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t" /* vertical avg */
        PAVGB" %%mm2, %%mm1 \n\t"
        PAVGB" (%2), %%mm0 \n\t" /* blend with dst */
        PAVGB" (%2, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        PAVGB" 1(%1, %3), %%mm1 \n\t"
        PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
        "add %%"REG_a", %2 \n\t"
        "add %%"REG_a", %1 \n\t"
        PAVGB" %%mm1, %%mm2 \n\t"
        PAVGB" %%mm0, %%mm1 \n\t"
        PAVGB" (%2), %%mm2 \n\t"
        PAVGB" (%2, %3), %%mm1 \n\t"
        "movq %%mm2, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((x86_reg)line_size)
        :"%"REG_a, "memory");
}
//FIXME the following could be optimized too ...
/* 16-pixel-wide variants implemented as two independent 8-wide halves
 * (left half at offset 0, right half at offset +8). */
static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
    DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
}
static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(put_pixels8_y2)(block , pixels , line_size, h);
    DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
}
static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
    DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
}
static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg_pixels8)(block , pixels , line_size, h);
    DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
}
static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg_pixels8_x2)(block , pixels , line_size, h);
    DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
}
static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg_pixels8_y2)(block , pixels , line_size, h);
    DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
}
static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
    DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
}
/* QPEL_2TAP_L3(OPNAME): generates {OPNAME}2tap_qpel16_l3 and
 * {OPNAME}2tap_qpel8_l3, which average three source taps per row:
 * src[0], src[off1], src[off2] — and store at dst.  dst is passed to the
 * asm as the displacement dst-src in %4, so the store address is (%1,%4)
 * while %1 walks src by stride.  STORE_OP(addr, reg) must be #defined
 * before instantiating: empty for "put", a PAVGB against the destination
 * for "avg" (see the instantiations below).  One row per iteration
 * ("decl %0"), so any positive h works.
 * (Comments are kept outside the macro: a // inside a backslash-continued
 * #define would swallow the continuation.)
 */
#define QPEL_2TAP_L3(OPNAME) \
static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
    __asm__ volatile(\
        "1: \n\t"\
        "movq (%1,%2), %%mm0 \n\t"\
        "movq 8(%1,%2), %%mm1 \n\t"\
        PAVGB" (%1,%3), %%mm0 \n\t"\
        PAVGB" 8(%1,%3), %%mm1 \n\t"\
        PAVGB" (%1), %%mm0 \n\t"\
        PAVGB" 8(%1), %%mm1 \n\t"\
        STORE_OP( (%1,%4),%%mm0)\
        STORE_OP(8(%1,%4),%%mm1)\
        "movq %%mm0, (%1,%4) \n\t"\
        "movq %%mm1, 8(%1,%4) \n\t"\
        "add %5, %1 \n\t"\
        "decl %0 \n\t"\
        "jnz 1b \n\t"\
        :"+g"(h), "+r"(src)\
        :"r"((x86_reg)off1), "r"((x86_reg)off2),\
         "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
        :"memory"\
    );\
}\
static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
    __asm__ volatile(\
        "1: \n\t"\
        "movq (%1,%2), %%mm0 \n\t"\
        PAVGB" (%1,%3), %%mm0 \n\t"\
        PAVGB" (%1), %%mm0 \n\t"\
        STORE_OP((%1,%4),%%mm0)\
        "movq %%mm0, (%1,%4) \n\t"\
        "add %5, %1 \n\t"\
        "decl %0 \n\t"\
        "jnz 1b \n\t"\
        :"+g"(h), "+r"(src)\
        :"r"((x86_reg)off1), "r"((x86_reg)off2),\
         "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
        :"memory"\
    );\
}
#ifndef SKIP_FOR_3DNOW
/* avg_ variants: fold the existing destination into the result first */
#define STORE_OP(a,b) PAVGB" "#a","#b" \n\t"
QPEL_2TAP_L3(avg_)
#undef STORE_OP
/* put_ variants: plain store, no read-modify-write of the destination */
#define STORE_OP(a,b)
QPEL_2TAP_L3(put_)
#undef STORE_OP
#undef QPEL_2TAP_L3
#endif /* SKIP_FOR_3DNOW */