You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

978 lines
41KB

  1. /*
  2. * DSP utils : average functions are compiled twice for 3dnow/mmx2
  3. * Copyright (c) 2000, 2001 Fabrice Bellard
  4. * Copyright (c) 2002-2004 Michael Niedermayer
  5. *
  6. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  7. * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  8. * and improved by Zdenek Kabelac <kabi@users.sf.net>
  9. *
  10. * This file is part of Libav.
  11. *
  12. * Libav is free software; you can redistribute it and/or
  13. * modify it under the terms of the GNU Lesser General Public
  14. * License as published by the Free Software Foundation; either
  15. * version 2.1 of the License, or (at your option) any later version.
  16. *
  17. * Libav is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  20. * Lesser General Public License for more details.
  21. *
  22. * You should have received a copy of the GNU Lesser General Public
  23. * License along with Libav; if not, write to the Free Software
  24. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25. */
  26. /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
  27. clobber bug - now it will work with 2.95.2 and also with -fPIC
  28. */
  /**
   * Horizontal half-pel copy of an 8-byte-wide block: every output byte is
   * PAVGB(pixels[x], pixels[x + 1]).
   *
   * PAVGB is a macro supplied by the including file (presumably pavgb or
   * pavgusb, i.e. a rounded-up byte average -- TODO confirm).
   *
   * @param block     destination; 8 bytes are written per row
   * @param pixels    source; bytes 0..8 of each row are read
   * @param line_size byte stride, used for both block and pixels
   * @param h         row count; 4 rows are handled per pass and the loop
   *                  exits only when the counter reaches exactly 0 (jnz),
   *                  so h must be a positive multiple of 4
   */
  29. static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  30. {
  31. __asm__ volatile(
  /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
  /* REG_a = 2 * line_size: the step taken after each pair of rows */
  32. "lea (%3, %3), %%"REG_a" \n\t"
  33. "1: \n\t"
  /* rows 0 and 1 */
  34. "movq (%1), %%mm0 \n\t"
  35. "movq (%1, %3), %%mm1 \n\t"
  36. PAVGB" 1(%1), %%mm0 \n\t"
  37. PAVGB" 1(%1, %3), %%mm1 \n\t"
  38. "movq %%mm0, (%2) \n\t"
  39. "movq %%mm1, (%2, %3) \n\t"
  40. "add %%"REG_a", %1 \n\t"
  41. "add %%"REG_a", %2 \n\t"
  /* rows 2 and 3 */
  42. "movq (%1), %%mm0 \n\t"
  43. "movq (%1, %3), %%mm1 \n\t"
  44. PAVGB" 1(%1), %%mm0 \n\t"
  45. PAVGB" 1(%1, %3), %%mm1 \n\t"
  46. "add %%"REG_a", %1 \n\t"
  47. "movq %%mm0, (%2) \n\t"
  48. "movq %%mm1, (%2, %3) \n\t"
  49. "add %%"REG_a", %2 \n\t"
  50. "subl $4, %0 \n\t"
  51. "jnz 1b \n\t"
  52. :"+g"(h), "+S"(pixels), "+D"(block)
  53. :"r" ((x86_reg)line_size)
  54. :"%"REG_a, "memory");
  55. }
  /**
   * Store PAVGB(src1, src2) into dst for a 4-byte-wide block.
   *
   * src1 is walked with src1Stride. src2 is read as a packed buffer with
   * 4 bytes per row: its pointer is only ever advanced by 4 or 16.
   *
   * @param dst        destination, advanced by dstStride per row
   * @param src1       first source, advanced by src1Stride per row
   * @param src2       second source, 4 contiguous bytes per row
   * @param dstStride  byte stride of dst
   * @param src1Stride byte stride of src1
   * @param h          row count. An odd h is handled by a one-row
   *                   prologue; the 4-row loop that follows is always
   *                   entered and exits only when the counter is exactly
   *                   0, so the rows left after the prologue must be a
   *                   positive multiple of 4.
   */
  56. static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  57. {
  58. __asm__ volatile(
  /* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *           %4 = src1Stride, %5 = dstStride */
  /* odd h: emit a single row first */
  59. "testl $1, %0 \n\t"
  60. " jz 1f \n\t"
  61. "movd (%1), %%mm0 \n\t"
  62. "movd (%2), %%mm1 \n\t"
  63. "add %4, %1 \n\t"
  64. "add $4, %2 \n\t"
  65. PAVGB" %%mm1, %%mm0 \n\t"
  66. "movd %%mm0, (%3) \n\t"
  67. "add %5, %3 \n\t"
  68. "decl %0 \n\t"
  /* main loop: 4 rows per pass, src2 rows at offsets 0, 4, 8, 12 */
  69. "1: \n\t"
  70. "movd (%1), %%mm0 \n\t"
  71. "add %4, %1 \n\t"
  72. "movd (%1), %%mm1 \n\t"
  73. "movd (%2), %%mm2 \n\t"
  74. "movd 4(%2), %%mm3 \n\t"
  75. "add %4, %1 \n\t"
  76. PAVGB" %%mm2, %%mm0 \n\t"
  77. PAVGB" %%mm3, %%mm1 \n\t"
  78. "movd %%mm0, (%3) \n\t"
  79. "add %5, %3 \n\t"
  80. "movd %%mm1, (%3) \n\t"
  81. "add %5, %3 \n\t"
  82. "movd (%1), %%mm0 \n\t"
  83. "add %4, %1 \n\t"
  84. "movd (%1), %%mm1 \n\t"
  85. "movd 8(%2), %%mm2 \n\t"
  86. "movd 12(%2), %%mm3 \n\t"
  87. "add %4, %1 \n\t"
  88. PAVGB" %%mm2, %%mm0 \n\t"
  89. PAVGB" %%mm3, %%mm1 \n\t"
  90. "movd %%mm0, (%3) \n\t"
  91. "add %5, %3 \n\t"
  92. "movd %%mm1, (%3) \n\t"
  93. "add %5, %3 \n\t"
  94. "add $16, %2 \n\t"
  95. "subl $4, %0 \n\t"
  96. "jnz 1b \n\t"
  /* h is kept in memory when EBX is not available, otherwise in EBX */
  97. #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  98. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  99. #else
  100. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  101. #endif
  102. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  103. :"memory");
  104. }
  /**
   * Store PAVGB(src1, src2) into dst for an 8-byte-wide block.
   *
   * src1 is walked with src1Stride. src2 is read as a packed buffer with
   * 8 bytes per row (offsets 0, 8, 16, 24, then the pointer moves by 32).
   *
   * @param dst        destination, advanced by dstStride per row
   * @param src1       first source, advanced by src1Stride per row
   * @param src2       second source, 8 contiguous bytes per row
   * @param dstStride  byte stride of dst
   * @param src1Stride byte stride of src1
   * @param h          row count. An odd h is handled by a one-row
   *                   prologue; the 4-row loop that follows is always
   *                   entered and exits only when the counter is exactly
   *                   0, so the rows left after the prologue must be a
   *                   positive multiple of 4.
   */
  105. static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  106. {
  107. __asm__ volatile(
  /* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *           %4 = src1Stride, %5 = dstStride */
  /* odd h: emit a single row first */
  108. "testl $1, %0 \n\t"
  109. " jz 1f \n\t"
  110. "movq (%1), %%mm0 \n\t"
  111. "movq (%2), %%mm1 \n\t"
  112. "add %4, %1 \n\t"
  113. "add $8, %2 \n\t"
  114. PAVGB" %%mm1, %%mm0 \n\t"
  115. "movq %%mm0, (%3) \n\t"
  116. "add %5, %3 \n\t"
  117. "decl %0 \n\t"
  /* main loop: 4 rows per pass, handled as two pairs */
  118. "1: \n\t"
  119. "movq (%1), %%mm0 \n\t"
  120. "add %4, %1 \n\t"
  121. "movq (%1), %%mm1 \n\t"
  122. "add %4, %1 \n\t"
  123. PAVGB" (%2), %%mm0 \n\t"
  124. PAVGB" 8(%2), %%mm1 \n\t"
  125. "movq %%mm0, (%3) \n\t"
  126. "add %5, %3 \n\t"
  127. "movq %%mm1, (%3) \n\t"
  128. "add %5, %3 \n\t"
  129. "movq (%1), %%mm0 \n\t"
  130. "add %4, %1 \n\t"
  131. "movq (%1), %%mm1 \n\t"
  132. "add %4, %1 \n\t"
  133. PAVGB" 16(%2), %%mm0 \n\t"
  134. PAVGB" 24(%2), %%mm1 \n\t"
  135. "movq %%mm0, (%3) \n\t"
  136. "add %5, %3 \n\t"
  137. "movq %%mm1, (%3) \n\t"
  138. "add %5, %3 \n\t"
  139. "add $32, %2 \n\t"
  140. "subl $4, %0 \n\t"
  141. "jnz 1b \n\t"
  /* h is kept in memory when EBX is not available, otherwise in EBX */
  142. #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  143. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  144. #else
  145. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  146. #endif
  147. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  148. :"memory");
  149. //the following should be used, though better not with gcc ...
  150. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  151. :"r"(src1Stride), "r"(dstStride)
  152. :"memory");*/
  153. }
  /**
   * "No rounding" variant of the 8-byte-wide two-source average.
   *
   * Every output byte is ~PAVGB(~src1, ~src2): both inputs are inverted
   * with an all-ones mask (mm6), averaged, and the result is inverted
   * back. If PAVGB rounds up (TODO confirm against its definition) this
   * yields the rounded-down average.
   *
   * src1 is walked with src1Stride; src2 is read as a packed buffer with
   * 8 bytes per row.
   *
   * @param dst        destination, advanced by dstStride per row
   * @param src1       first source, advanced by src1Stride per row
   * @param src2       second source, 8 contiguous bytes per row
   * @param dstStride  byte stride of dst
   * @param src1Stride byte stride of src1
   * @param h          row count. An odd h is handled by a one-row
   *                   prologue; the 4-row loop that follows is always
   *                   entered and exits only when the counter is exactly
   *                   0, so the rows left after the prologue must be a
   *                   positive multiple of 4.
   */
  154. static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  155. {
  156. __asm__ volatile(
  /* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *           %4 = src1Stride, %5 = dstStride */
  /* mm6 = all ones; pxor with it is a bitwise NOT */
  157. "pcmpeqb %%mm6, %%mm6 \n\t"
  /* odd h: emit a single row first */
  158. "testl $1, %0 \n\t"
  159. " jz 1f \n\t"
  160. "movq (%1), %%mm0 \n\t"
  161. "movq (%2), %%mm1 \n\t"
  162. "add %4, %1 \n\t"
  163. "add $8, %2 \n\t"
  164. "pxor %%mm6, %%mm0 \n\t"
  165. "pxor %%mm6, %%mm1 \n\t"
  166. PAVGB" %%mm1, %%mm0 \n\t"
  167. "pxor %%mm6, %%mm0 \n\t"
  168. "movq %%mm0, (%3) \n\t"
  169. "add %5, %3 \n\t"
  170. "decl %0 \n\t"
  /* main loop: 4 rows per pass, handled as two pairs */
  171. "1: \n\t"
  172. "movq (%1), %%mm0 \n\t"
  173. "add %4, %1 \n\t"
  174. "movq (%1), %%mm1 \n\t"
  175. "add %4, %1 \n\t"
  176. "movq (%2), %%mm2 \n\t"
  177. "movq 8(%2), %%mm3 \n\t"
  178. "pxor %%mm6, %%mm0 \n\t"
  179. "pxor %%mm6, %%mm1 \n\t"
  180. "pxor %%mm6, %%mm2 \n\t"
  181. "pxor %%mm6, %%mm3 \n\t"
  182. PAVGB" %%mm2, %%mm0 \n\t"
  183. PAVGB" %%mm3, %%mm1 \n\t"
  184. "pxor %%mm6, %%mm0 \n\t"
  185. "pxor %%mm6, %%mm1 \n\t"
  186. "movq %%mm0, (%3) \n\t"
  187. "add %5, %3 \n\t"
  188. "movq %%mm1, (%3) \n\t"
  189. "add %5, %3 \n\t"
  190. "movq (%1), %%mm0 \n\t"
  191. "add %4, %1 \n\t"
  192. "movq (%1), %%mm1 \n\t"
  193. "add %4, %1 \n\t"
  194. "movq 16(%2), %%mm2 \n\t"
  195. "movq 24(%2), %%mm3 \n\t"
  196. "pxor %%mm6, %%mm0 \n\t"
  197. "pxor %%mm6, %%mm1 \n\t"
  198. "pxor %%mm6, %%mm2 \n\t"
  199. "pxor %%mm6, %%mm3 \n\t"
  200. PAVGB" %%mm2, %%mm0 \n\t"
  201. PAVGB" %%mm3, %%mm1 \n\t"
  202. "pxor %%mm6, %%mm0 \n\t"
  203. "pxor %%mm6, %%mm1 \n\t"
  204. "movq %%mm0, (%3) \n\t"
  205. "add %5, %3 \n\t"
  206. "movq %%mm1, (%3) \n\t"
  207. "add %5, %3 \n\t"
  208. "add $32, %2 \n\t"
  209. "subl $4, %0 \n\t"
  210. "jnz 1b \n\t"
  /* h is kept in memory when EBX is not available, otherwise in EBX */
  211. #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  212. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  213. #else
  214. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  215. #endif
  216. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  217. :"memory");
  218. //the following should be used, though better not with gcc ...
  219. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  220. :"r"(src1Stride), "r"(dstStride)
  221. :"memory");*/
  222. }
  /**
   * Blend PAVGB(src1, src2) into dst for a 4-byte-wide block:
   * dst = PAVGB(PAVGB(src1, src2), dst).
   *
   * src1 is walked with src1Stride; src2 is read as a packed buffer with
   * 4 bytes per row.
   *
   * NOTE(review): in the main loop PAVGB takes src2 and dst as memory
   * operands; if PAVGB is a 64-bit MMX instruction these accesses read
   * 8 bytes although only 4 are stored -- confirm this over-read is
   * acceptable for all callers.
   *
   * @param dst        destination, read and written, advanced by dstStride
   * @param src1       first source, advanced by src1Stride per row
   * @param src2       second source, 4 contiguous bytes per row
   * @param dstStride  byte stride of dst
   * @param src1Stride byte stride of src1
   * @param h          row count. An odd h is handled by a one-row
   *                   prologue; the 4-row loop that follows is always
   *                   entered and exits only when the counter is exactly
   *                   0, so the rows left after the prologue must be a
   *                   positive multiple of 4.
   */
  223. static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  224. {
  225. __asm__ volatile(
  /* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *           %4 = src1Stride, %5 = dstStride */
  /* odd h: emit a single row first */
  226. "testl $1, %0 \n\t"
  227. " jz 1f \n\t"
  228. "movd (%1), %%mm0 \n\t"
  229. "movd (%2), %%mm1 \n\t"
  230. "add %4, %1 \n\t"
  231. "add $4, %2 \n\t"
  232. PAVGB" %%mm1, %%mm0 \n\t"
  233. PAVGB" (%3), %%mm0 \n\t"
  234. "movd %%mm0, (%3) \n\t"
  235. "add %5, %3 \n\t"
  236. "decl %0 \n\t"
  /* main loop: 4 rows per pass, src2 rows at offsets 0, 4, 8, 12 */
  237. "1: \n\t"
  238. "movd (%1), %%mm0 \n\t"
  239. "add %4, %1 \n\t"
  240. "movd (%1), %%mm1 \n\t"
  241. "add %4, %1 \n\t"
  242. PAVGB" (%2), %%mm0 \n\t"
  243. PAVGB" 4(%2), %%mm1 \n\t"
  244. PAVGB" (%3), %%mm0 \n\t"
  245. "movd %%mm0, (%3) \n\t"
  246. "add %5, %3 \n\t"
  247. PAVGB" (%3), %%mm1 \n\t"
  248. "movd %%mm1, (%3) \n\t"
  249. "add %5, %3 \n\t"
  250. "movd (%1), %%mm0 \n\t"
  251. "add %4, %1 \n\t"
  252. "movd (%1), %%mm1 \n\t"
  253. "add %4, %1 \n\t"
  254. PAVGB" 8(%2), %%mm0 \n\t"
  255. PAVGB" 12(%2), %%mm1 \n\t"
  256. PAVGB" (%3), %%mm0 \n\t"
  257. "movd %%mm0, (%3) \n\t"
  258. "add %5, %3 \n\t"
  259. PAVGB" (%3), %%mm1 \n\t"
  260. "movd %%mm1, (%3) \n\t"
  261. "add %5, %3 \n\t"
  262. "add $16, %2 \n\t"
  263. "subl $4, %0 \n\t"
  264. "jnz 1b \n\t"
  /* h is kept in memory when EBX is not available, otherwise in EBX */
  265. #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  266. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  267. #else
  268. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  269. #endif
  270. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  271. :"memory");
  272. }
  /**
   * Blend PAVGB(src1, src2) into dst for an 8-byte-wide block:
   * dst = PAVGB(PAVGB(src1, src2), dst).
   *
   * src1 is walked with src1Stride; src2 is read as a packed buffer with
   * 8 bytes per row.
   *
   * @param dst        destination, read and written, advanced by dstStride
   * @param src1       first source, advanced by src1Stride per row
   * @param src2       second source, 8 contiguous bytes per row
   * @param dstStride  byte stride of dst
   * @param src1Stride byte stride of src1
   * @param h          row count. An odd h is handled by a one-row
   *                   prologue; the 4-row loop that follows is always
   *                   entered and exits only when the counter is exactly
   *                   0, so the rows left after the prologue must be a
   *                   positive multiple of 4.
   */
  273. static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  274. {
  275. __asm__ volatile(
  /* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *           %4 = src1Stride, %5 = dstStride */
  /* odd h: emit a single row first */
  276. "testl $1, %0 \n\t"
  277. " jz 1f \n\t"
  278. "movq (%1), %%mm0 \n\t"
  279. "movq (%2), %%mm1 \n\t"
  280. "add %4, %1 \n\t"
  281. "add $8, %2 \n\t"
  282. PAVGB" %%mm1, %%mm0 \n\t"
  283. PAVGB" (%3), %%mm0 \n\t"
  284. "movq %%mm0, (%3) \n\t"
  285. "add %5, %3 \n\t"
  286. "decl %0 \n\t"
  /* main loop: 4 rows per pass, handled as two pairs */
  287. "1: \n\t"
  288. "movq (%1), %%mm0 \n\t"
  289. "add %4, %1 \n\t"
  290. "movq (%1), %%mm1 \n\t"
  291. "add %4, %1 \n\t"
  292. PAVGB" (%2), %%mm0 \n\t"
  293. PAVGB" 8(%2), %%mm1 \n\t"
  294. PAVGB" (%3), %%mm0 \n\t"
  295. "movq %%mm0, (%3) \n\t"
  296. "add %5, %3 \n\t"
  297. PAVGB" (%3), %%mm1 \n\t"
  298. "movq %%mm1, (%3) \n\t"
  299. "add %5, %3 \n\t"
  300. "movq (%1), %%mm0 \n\t"
  301. "add %4, %1 \n\t"
  302. "movq (%1), %%mm1 \n\t"
  303. "add %4, %1 \n\t"
  304. PAVGB" 16(%2), %%mm0 \n\t"
  305. PAVGB" 24(%2), %%mm1 \n\t"
  306. PAVGB" (%3), %%mm0 \n\t"
  307. "movq %%mm0, (%3) \n\t"
  308. "add %5, %3 \n\t"
  309. PAVGB" (%3), %%mm1 \n\t"
  310. "movq %%mm1, (%3) \n\t"
  311. "add %5, %3 \n\t"
  312. "add $32, %2 \n\t"
  313. "subl $4, %0 \n\t"
  314. "jnz 1b \n\t"
  /* h is kept in memory when EBX is not available, otherwise in EBX */
  315. #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  316. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  317. #else
  318. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  319. #endif
  320. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  321. :"memory");
  322. //the following should be used, though better not with gcc ...
  323. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  324. :"r"(src1Stride), "r"(dstStride)
  325. :"memory");*/
  326. }
  /**
   * Horizontal half-pel copy of a 16-byte-wide block: every output byte is
   * PAVGB(pixels[x], pixels[x + 1]). Each row is handled as two 8-byte
   * halves (mm0/mm1 for the left half, mm2/mm3 for the right half).
   *
   * @param block     destination; 16 bytes are written per row
   * @param pixels    source; bytes 0..16 of each row are read
   * @param line_size byte stride, used for both block and pixels
   * @param h         row count; 4 rows are handled per pass and the loop
   *                  exits only when the counter reaches exactly 0 (jnz),
   *                  so h must be a positive multiple of 4
   */
  327. static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  328. {
  329. __asm__ volatile(
  /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
  /* REG_a = 2 * line_size: the step taken after each pair of rows */
  330. "lea (%3, %3), %%"REG_a" \n\t"
  331. "1: \n\t"
  /* rows 0 and 1 */
  332. "movq (%1), %%mm0 \n\t"
  333. "movq (%1, %3), %%mm1 \n\t"
  334. "movq 8(%1), %%mm2 \n\t"
  335. "movq 8(%1, %3), %%mm3 \n\t"
  336. PAVGB" 1(%1), %%mm0 \n\t"
  337. PAVGB" 1(%1, %3), %%mm1 \n\t"
  338. PAVGB" 9(%1), %%mm2 \n\t"
  339. PAVGB" 9(%1, %3), %%mm3 \n\t"
  340. "movq %%mm0, (%2) \n\t"
  341. "movq %%mm1, (%2, %3) \n\t"
  342. "movq %%mm2, 8(%2) \n\t"
  343. "movq %%mm3, 8(%2, %3) \n\t"
  344. "add %%"REG_a", %1 \n\t"
  345. "add %%"REG_a", %2 \n\t"
  /* rows 2 and 3 */
  346. "movq (%1), %%mm0 \n\t"
  347. "movq (%1, %3), %%mm1 \n\t"
  348. "movq 8(%1), %%mm2 \n\t"
  349. "movq 8(%1, %3), %%mm3 \n\t"
  350. PAVGB" 1(%1), %%mm0 \n\t"
  351. PAVGB" 1(%1, %3), %%mm1 \n\t"
  352. PAVGB" 9(%1), %%mm2 \n\t"
  353. PAVGB" 9(%1, %3), %%mm3 \n\t"
  354. "add %%"REG_a", %1 \n\t"
  355. "movq %%mm0, (%2) \n\t"
  356. "movq %%mm1, (%2, %3) \n\t"
  357. "movq %%mm2, 8(%2) \n\t"
  358. "movq %%mm3, 8(%2, %3) \n\t"
  359. "add %%"REG_a", %2 \n\t"
  360. "subl $4, %0 \n\t"
  361. "jnz 1b \n\t"
  362. :"+g"(h), "+S"(pixels), "+D"(block)
  363. :"r" ((x86_reg)line_size)
  364. :"%"REG_a, "memory");
  365. }
  /**
   * Store PAVGB(src1, src2) into dst for a 16-byte-wide block.
   *
   * src1 is walked with src1Stride. src2 is read as a packed buffer with
   * 16 bytes per row (offsets 0/8 and 16/24, then the pointer moves by 32).
   *
   * @param dst        destination, advanced by dstStride per row
   * @param src1       first source, advanced by src1Stride per row
   * @param src2       second source, 16 contiguous bytes per row
   * @param dstStride  byte stride of dst
   * @param src1Stride byte stride of src1
   * @param h          row count. An odd h is handled by a one-row
   *                   prologue; the 2-row loop that follows is always
   *                   entered and exits only when the counter is exactly
   *                   0, so the rows left after the prologue must be a
   *                   positive multiple of 2.
   */
  366. static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  367. {
  368. __asm__ volatile(
  /* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *           %4 = src1Stride, %5 = dstStride */
  /* odd h: emit a single row first */
  369. "testl $1, %0 \n\t"
  370. " jz 1f \n\t"
  371. "movq (%1), %%mm0 \n\t"
  372. "movq 8(%1), %%mm1 \n\t"
  373. PAVGB" (%2), %%mm0 \n\t"
  374. PAVGB" 8(%2), %%mm1 \n\t"
  375. "add %4, %1 \n\t"
  376. "add $16, %2 \n\t"
  377. "movq %%mm0, (%3) \n\t"
  378. "movq %%mm1, 8(%3) \n\t"
  379. "add %5, %3 \n\t"
  380. "decl %0 \n\t"
  /* main loop: 2 rows per pass */
  381. "1: \n\t"
  382. "movq (%1), %%mm0 \n\t"
  383. "movq 8(%1), %%mm1 \n\t"
  384. "add %4, %1 \n\t"
  385. PAVGB" (%2), %%mm0 \n\t"
  386. PAVGB" 8(%2), %%mm1 \n\t"
  387. "movq %%mm0, (%3) \n\t"
  388. "movq %%mm1, 8(%3) \n\t"
  389. "add %5, %3 \n\t"
  390. "movq (%1), %%mm0 \n\t"
  391. "movq 8(%1), %%mm1 \n\t"
  392. "add %4, %1 \n\t"
  393. PAVGB" 16(%2), %%mm0 \n\t"
  394. PAVGB" 24(%2), %%mm1 \n\t"
  395. "movq %%mm0, (%3) \n\t"
  396. "movq %%mm1, 8(%3) \n\t"
  397. "add %5, %3 \n\t"
  398. "add $32, %2 \n\t"
  399. "subl $2, %0 \n\t"
  400. "jnz 1b \n\t"
  /* h is kept in memory when EBX is not available, otherwise in EBX */
  401. #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  402. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  403. #else
  404. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  405. #endif
  406. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  407. :"memory");
  408. //the following should be used, though better not with gcc ...
  409. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  410. :"r"(src1Stride), "r"(dstStride)
  411. :"memory");*/
  412. }
  /**
   * Blend PAVGB(src1, src2) into dst for a 16-byte-wide block:
   * dst = PAVGB(PAVGB(src1, src2), dst).
   *
   * src1 is walked with src1Stride; src2 is read as a packed buffer with
   * 16 bytes per row.
   *
   * @param dst        destination, read and written, advanced by dstStride
   * @param src1       first source, advanced by src1Stride per row
   * @param src2       second source, 16 contiguous bytes per row
   * @param dstStride  byte stride of dst
   * @param src1Stride byte stride of src1
   * @param h          row count. An odd h is handled by a one-row
   *                   prologue; the 2-row loop that follows is always
   *                   entered and exits only when the counter is exactly
   *                   0, so the rows left after the prologue must be a
   *                   positive multiple of 2.
   */
  413. static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  414. {
  415. __asm__ volatile(
  /* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *           %4 = src1Stride, %5 = dstStride */
  /* odd h: emit a single row first */
  416. "testl $1, %0 \n\t"
  417. " jz 1f \n\t"
  418. "movq (%1), %%mm0 \n\t"
  419. "movq 8(%1), %%mm1 \n\t"
  420. PAVGB" (%2), %%mm0 \n\t"
  421. PAVGB" 8(%2), %%mm1 \n\t"
  422. "add %4, %1 \n\t"
  423. "add $16, %2 \n\t"
  424. PAVGB" (%3), %%mm0 \n\t"
  425. PAVGB" 8(%3), %%mm1 \n\t"
  426. "movq %%mm0, (%3) \n\t"
  427. "movq %%mm1, 8(%3) \n\t"
  428. "add %5, %3 \n\t"
  429. "decl %0 \n\t"
  /* main loop: 2 rows per pass */
  430. "1: \n\t"
  431. "movq (%1), %%mm0 \n\t"
  432. "movq 8(%1), %%mm1 \n\t"
  433. "add %4, %1 \n\t"
  434. PAVGB" (%2), %%mm0 \n\t"
  435. PAVGB" 8(%2), %%mm1 \n\t"
  436. PAVGB" (%3), %%mm0 \n\t"
  437. PAVGB" 8(%3), %%mm1 \n\t"
  438. "movq %%mm0, (%3) \n\t"
  439. "movq %%mm1, 8(%3) \n\t"
  440. "add %5, %3 \n\t"
  441. "movq (%1), %%mm0 \n\t"
  442. "movq 8(%1), %%mm1 \n\t"
  443. "add %4, %1 \n\t"
  444. PAVGB" 16(%2), %%mm0 \n\t"
  445. PAVGB" 24(%2), %%mm1 \n\t"
  446. PAVGB" (%3), %%mm0 \n\t"
  447. PAVGB" 8(%3), %%mm1 \n\t"
  448. "movq %%mm0, (%3) \n\t"
  449. "movq %%mm1, 8(%3) \n\t"
  450. "add %5, %3 \n\t"
  451. "add $32, %2 \n\t"
  452. "subl $2, %0 \n\t"
  453. "jnz 1b \n\t"
  /* h is kept in memory when EBX is not available, otherwise in EBX */
  454. #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  455. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  456. #else
  457. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  458. #endif
  459. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  460. :"memory");
  461. //the following should be used, though better not with gcc ...
  462. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  463. :"r"(src1Stride), "r"(dstStride)
  464. :"memory");*/
  465. }
  /**
   * "No rounding" variant of the 16-byte-wide two-source average.
   *
   * Every output byte is ~PAVGB(~src1, ~src2): both inputs are inverted
   * with an all-ones mask (mm6), averaged, and the result is inverted
   * back. If PAVGB rounds up (TODO confirm against its definition) this
   * yields the rounded-down average.
   *
   * src1 is walked with src1Stride; src2 is read as a packed buffer with
   * 16 bytes per row.
   *
   * @param dst        destination, advanced by dstStride per row
   * @param src1       first source, advanced by src1Stride per row
   * @param src2       second source, 16 contiguous bytes per row
   * @param dstStride  byte stride of dst
   * @param src1Stride byte stride of src1
   * @param h          row count. An odd h is handled by a one-row
   *                   prologue; the 2-row loop that follows is always
   *                   entered and exits only when the counter is exactly
   *                   0, so the rows left after the prologue must be a
   *                   positive multiple of 2.
   */
  466. static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  467. {
  468. __asm__ volatile(
  /* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *           %4 = src1Stride, %5 = dstStride */
  /* mm6 = all ones; pxor with it is a bitwise NOT */
  469. "pcmpeqb %%mm6, %%mm6 \n\t"
  /* odd h: emit a single row first */
  470. "testl $1, %0 \n\t"
  471. " jz 1f \n\t"
  472. "movq (%1), %%mm0 \n\t"
  473. "movq 8(%1), %%mm1 \n\t"
  474. "movq (%2), %%mm2 \n\t"
  475. "movq 8(%2), %%mm3 \n\t"
  476. "pxor %%mm6, %%mm0 \n\t"
  477. "pxor %%mm6, %%mm1 \n\t"
  478. "pxor %%mm6, %%mm2 \n\t"
  479. "pxor %%mm6, %%mm3 \n\t"
  480. PAVGB" %%mm2, %%mm0 \n\t"
  481. PAVGB" %%mm3, %%mm1 \n\t"
  482. "pxor %%mm6, %%mm0 \n\t"
  483. "pxor %%mm6, %%mm1 \n\t"
  484. "add %4, %1 \n\t"
  485. "add $16, %2 \n\t"
  486. "movq %%mm0, (%3) \n\t"
  487. "movq %%mm1, 8(%3) \n\t"
  488. "add %5, %3 \n\t"
  489. "decl %0 \n\t"
  /* main loop: 2 rows per pass */
  490. "1: \n\t"
  491. "movq (%1), %%mm0 \n\t"
  492. "movq 8(%1), %%mm1 \n\t"
  493. "add %4, %1 \n\t"
  494. "movq (%2), %%mm2 \n\t"
  495. "movq 8(%2), %%mm3 \n\t"
  496. "pxor %%mm6, %%mm0 \n\t"
  497. "pxor %%mm6, %%mm1 \n\t"
  498. "pxor %%mm6, %%mm2 \n\t"
  499. "pxor %%mm6, %%mm3 \n\t"
  500. PAVGB" %%mm2, %%mm0 \n\t"
  501. PAVGB" %%mm3, %%mm1 \n\t"
  502. "pxor %%mm6, %%mm0 \n\t"
  503. "pxor %%mm6, %%mm1 \n\t"
  504. "movq %%mm0, (%3) \n\t"
  505. "movq %%mm1, 8(%3) \n\t"
  506. "add %5, %3 \n\t"
  507. "movq (%1), %%mm0 \n\t"
  508. "movq 8(%1), %%mm1 \n\t"
  509. "add %4, %1 \n\t"
  510. "movq 16(%2), %%mm2 \n\t"
  511. "movq 24(%2), %%mm3 \n\t"
  512. "pxor %%mm6, %%mm0 \n\t"
  513. "pxor %%mm6, %%mm1 \n\t"
  514. "pxor %%mm6, %%mm2 \n\t"
  515. "pxor %%mm6, %%mm3 \n\t"
  516. PAVGB" %%mm2, %%mm0 \n\t"
  517. PAVGB" %%mm3, %%mm1 \n\t"
  518. "pxor %%mm6, %%mm0 \n\t"
  519. "pxor %%mm6, %%mm1 \n\t"
  520. "movq %%mm0, (%3) \n\t"
  521. "movq %%mm1, 8(%3) \n\t"
  522. "add %5, %3 \n\t"
  523. "add $32, %2 \n\t"
  524. "subl $2, %0 \n\t"
  525. "jnz 1b \n\t"
  /* h is kept in memory when EBX is not available, otherwise in EBX */
  526. #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  527. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  528. #else
  529. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  530. #endif
  531. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  532. :"memory");
  533. //the following should be used, though better not with gcc ...
  534. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  535. :"r"(src1Stride), "r"(dstStride)
  536. :"memory");*/
  537. }
  538. /* GL: this function does incorrect rounding if overflow */
  /**
   * Approximate "no rounding" horizontal half-pel copy, 8 bytes wide.
   *
   * Every output byte is PAVGB(sat(pixels[x] - mm6), pixels[x + 1]), where
   * mm6 is set up by MOVQ_BONE (defined outside this file; presumably it
   * loads 0x01 into every byte -- TODO confirm). Under that assumption a
   * left byte of 0 cannot be decremented by the saturating subtract, which
   * would be the inexact case the comment above refers to.
   *
   * mm6 is loaded by a separate statement before the asm block and is
   * relied upon to survive into it.
   *
   * @param block     destination; 8 bytes are written per row
   * @param pixels    source; bytes 0..8 of each row are read
   * @param line_size byte stride, used for both block and pixels
   * @param h         row count; 4 rows are handled per pass and the loop
   *                  exits only when the counter reaches exactly 0 (jnz),
   *                  so h must be a positive multiple of 4
   */
  539. static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  540. {
  541. MOVQ_BONE(mm6);
  542. __asm__ volatile(
  /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
  /* REG_a = 2 * line_size: the step taken after each pair of rows */
  543. "lea (%3, %3), %%"REG_a" \n\t"
  544. "1: \n\t"
  /* rows 0 and 1: mm0/mm2 = left bytes, mm1/mm3 = right bytes */
  545. "movq (%1), %%mm0 \n\t"
  546. "movq (%1, %3), %%mm2 \n\t"
  547. "movq 1(%1), %%mm1 \n\t"
  548. "movq 1(%1, %3), %%mm3 \n\t"
  549. "add %%"REG_a", %1 \n\t"
  550. "psubusb %%mm6, %%mm0 \n\t"
  551. "psubusb %%mm6, %%mm2 \n\t"
  552. PAVGB" %%mm1, %%mm0 \n\t"
  553. PAVGB" %%mm3, %%mm2 \n\t"
  554. "movq %%mm0, (%2) \n\t"
  555. "movq %%mm2, (%2, %3) \n\t"
  /* rows 2 and 3 (the source pointer has already been advanced) */
  556. "movq (%1), %%mm0 \n\t"
  557. "movq 1(%1), %%mm1 \n\t"
  558. "movq (%1, %3), %%mm2 \n\t"
  559. "movq 1(%1, %3), %%mm3 \n\t"
  560. "add %%"REG_a", %2 \n\t"
  561. "add %%"REG_a", %1 \n\t"
  562. "psubusb %%mm6, %%mm0 \n\t"
  563. "psubusb %%mm6, %%mm2 \n\t"
  564. PAVGB" %%mm1, %%mm0 \n\t"
  565. PAVGB" %%mm3, %%mm2 \n\t"
  566. "movq %%mm0, (%2) \n\t"
  567. "movq %%mm2, (%2, %3) \n\t"
  568. "add %%"REG_a", %2 \n\t"
  569. "subl $4, %0 \n\t"
  570. "jnz 1b \n\t"
  571. :"+g"(h), "+S"(pixels), "+D"(block)
  572. :"r" ((x86_reg)line_size)
  573. :"%"REG_a, "memory");
  574. }
  /**
   * Exact "no rounding" horizontal half-pel copy, 8 bytes wide.
   *
   * Every output byte is ~PAVGB(~pixels[x], ~pixels[x + 1]); mm6 holds
   * the all-ones mask used for the inversions. If PAVGB rounds up (TODO
   * confirm against its definition) this yields the rounded-down average.
   *
   * @param block     destination; 8 bytes are written per row
   * @param pixels    source; bytes 0..8 of each row are read
   * @param line_size byte stride, used for both block and pixels
   * @param h         row count; 4 rows are handled per pass and the loop
   *                  repeats while the counter stays above 0 (jg), so rows
   *                  are always processed in groups of 4
   */
  575. static void DEF(put_no_rnd_pixels8_x2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  576. {
  577. __asm__ volatile (
  /* operands: %0 = h, %1 = pixels, %2 = block,
   *           %3 = line_size, %4 = 3 * line_size */
  /* mm6 = all ones; pxor with it is a bitwise NOT */
  578. "pcmpeqb %%mm6, %%mm6 \n\t"
  579. "1: \n\t"
  /* rows 0 and 1 */
  580. "movq (%1), %%mm0 \n\t"
  581. "movq (%1, %3), %%mm2 \n\t"
  582. "movq 1(%1), %%mm1 \n\t"
  583. "movq 1(%1, %3), %%mm3 \n\t"
  584. "pxor %%mm6, %%mm0 \n\t"
  585. "pxor %%mm6, %%mm2 \n\t"
  586. "pxor %%mm6, %%mm1 \n\t"
  587. "pxor %%mm6, %%mm3 \n\t"
  588. PAVGB" %%mm1, %%mm0 \n\t"
  589. PAVGB" %%mm3, %%mm2 \n\t"
  590. "pxor %%mm6, %%mm0 \n\t"
  591. "pxor %%mm6, %%mm2 \n\t"
  592. "movq %%mm0, (%2) \n\t"
  593. "movq %%mm2, (%2, %3) \n\t"
  /* rows 2 and 3, addressed as base + 2*line_size and base + 3*line_size */
  594. "movq (%1, %3,2), %%mm0 \n\t"
  595. "movq 1(%1, %3,2), %%mm1 \n\t"
  596. "movq (%1, %4), %%mm2 \n\t"
  597. "movq 1(%1, %4), %%mm3 \n\t"
  598. "pxor %%mm6, %%mm0 \n\t"
  599. "pxor %%mm6, %%mm1 \n\t"
  600. "pxor %%mm6, %%mm2 \n\t"
  601. "pxor %%mm6, %%mm3 \n\t"
  602. PAVGB" %%mm1, %%mm0 \n\t"
  603. PAVGB" %%mm3, %%mm2 \n\t"
  604. "pxor %%mm6, %%mm0 \n\t"
  605. "pxor %%mm6, %%mm2 \n\t"
  606. "movq %%mm0, (%2, %3,2) \n\t"
  607. "movq %%mm2, (%2, %4) \n\t"
  /* advance both pointers by 4 rows */
  608. "lea (%1, %3,4), %1 \n\t"
  609. "lea (%2, %3,4), %2 \n\t"
  610. "subl $4, %0 \n\t"
  611. "jg 1b \n\t"
  612. : "+g"(h), "+r"(pixels), "+r"(block)
  613. : "r" ((x86_reg)line_size), "r"((x86_reg)3*line_size)
  614. : "memory"
  615. );
  616. }
  /**
   * Vertical half-pel copy of an 8-byte-wide block: output row y is
   * PAVGB(source row y, source row y + 1).
   *
   * The last source row loaded in one pass is kept in a register (mm0)
   * and reused as the first row of the next pass, so each source row is
   * loaded once. One source row beyond the h output rows is read.
   *
   * @param block     destination; 8 bytes are written per row
   * @param pixels    source; h + 1 rows of 8 bytes are read
   * @param line_size byte stride, used for both block and pixels
   * @param h         row count; 4 rows are handled per pass and the loop
   *                  exits only when the counter reaches exactly 0 (jnz),
   *                  so h must be a positive multiple of 4
   */
  617. static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  618. {
  619. __asm__ volatile(
  /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
  /* REG_a = 2 * line_size */
  620. "lea (%3, %3), %%"REG_a" \n\t"
  /* mm0 = source row 0 */
  621. "movq (%1), %%mm0 \n\t"
  /* bias block by -line_size so the stores below can use the same
   * (base + line_size) / (base + 2*line_size) addressing as the loads */
  622. "sub %3, %2 \n\t"
  623. "1: \n\t"
  /* mm1 = row 1, mm2 = row 2; write output rows 0 and 1 */
  624. "movq (%1, %3), %%mm1 \n\t"
  625. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  626. "add %%"REG_a", %1 \n\t"
  627. PAVGB" %%mm1, %%mm0 \n\t"
  628. PAVGB" %%mm2, %%mm1 \n\t"
  629. "movq %%mm0, (%2, %3) \n\t"
  630. "movq %%mm1, (%2, %%"REG_a") \n\t"
  /* mm1 = row 3, mm0 = row 4; write output rows 2 and 3 */
  631. "movq (%1, %3), %%mm1 \n\t"
  632. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  633. "add %%"REG_a", %2 \n\t"
  634. "add %%"REG_a", %1 \n\t"
  635. PAVGB" %%mm1, %%mm2 \n\t"
  636. PAVGB" %%mm0, %%mm1 \n\t"
  637. "movq %%mm2, (%2, %3) \n\t"
  638. "movq %%mm1, (%2, %%"REG_a") \n\t"
  639. "add %%"REG_a", %2 \n\t"
  640. "subl $4, %0 \n\t"
  641. "jnz 1b \n\t"
  642. :"+g"(h), "+S"(pixels), "+D" (block)
  643. :"r" ((x86_reg)line_size)
  644. :"%"REG_a, "memory");
  645. }
  646. /* GL: this function does incorrect rounding if overflow */
  /**
   * Approximate "no rounding" vertical half-pel copy, 8 bytes wide.
   *
   * Same row pipeline as the rounding y2 copy, except that every
   * odd-numbered source row (rows 1 and 3 of each pass) has mm6
   * subtracted with saturation before it takes part in the two averages
   * it contributes to. mm6 is set up by MOVQ_BONE (defined outside this
   * file; presumably 0x01 in every byte -- TODO confirm).
   *
   * mm6 is loaded by a separate statement before the asm block and is
   * relied upon to survive into it.
   *
   * @param block     destination; 8 bytes are written per row
   * @param pixels    source; h + 1 rows of 8 bytes are read
   * @param line_size byte stride, used for both block and pixels
   * @param h         row count; 4 rows are handled per pass and the loop
   *                  exits only when the counter reaches exactly 0 (jnz),
   *                  so h must be a positive multiple of 4
   */
  647. static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  648. {
  649. MOVQ_BONE(mm6);
  650. __asm__ volatile(
  /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
  /* REG_a = 2 * line_size */
  651. "lea (%3, %3), %%"REG_a" \n\t"
  /* mm0 = source row 0 */
  652. "movq (%1), %%mm0 \n\t"
  /* bias block by -line_size to match the addressing of the stores */
  653. "sub %3, %2 \n\t"
  654. "1: \n\t"
  /* mm1 = row 1 (decremented), mm2 = row 2; write output rows 0 and 1 */
  655. "movq (%1, %3), %%mm1 \n\t"
  656. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  657. "add %%"REG_a", %1 \n\t"
  658. "psubusb %%mm6, %%mm1 \n\t"
  659. PAVGB" %%mm1, %%mm0 \n\t"
  660. PAVGB" %%mm2, %%mm1 \n\t"
  661. "movq %%mm0, (%2, %3) \n\t"
  662. "movq %%mm1, (%2, %%"REG_a") \n\t"
  /* mm1 = row 3 (decremented), mm0 = row 4; write output rows 2 and 3 */
  663. "movq (%1, %3), %%mm1 \n\t"
  664. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  665. "add %%"REG_a", %2 \n\t"
  666. "add %%"REG_a", %1 \n\t"
  667. "psubusb %%mm6, %%mm1 \n\t"
  668. PAVGB" %%mm1, %%mm2 \n\t"
  669. PAVGB" %%mm0, %%mm1 \n\t"
  670. "movq %%mm2, (%2, %3) \n\t"
  671. "movq %%mm1, (%2, %%"REG_a") \n\t"
  672. "add %%"REG_a", %2 \n\t"
  673. "subl $4, %0 \n\t"
  674. "jnz 1b \n\t"
  675. :"+g"(h), "+S"(pixels), "+D" (block)
  676. :"r" ((x86_reg)line_size)
  677. :"%"REG_a, "memory");
  678. }
  /**
   * Exact "no rounding" vertical half-pel copy, 8 bytes wide.
   *
   * Output row y is ~PAVGB(~row[y], ~row[y + 1]); mm6 holds the all-ones
   * mask used for the inversions. If PAVGB rounds up (TODO confirm against
   * its definition) this yields the rounded-down average.
   *
   * Source rows are kept in inverted form while they sit in registers, so
   * the row carried from one pass to the next (mm0) is already inverted.
   * One source row beyond the h output rows is read.
   *
   * @param block     destination; 8 bytes are written per row
   * @param pixels    source; h + 1 rows of 8 bytes are read
   * @param line_size byte stride, used for both block and pixels
   * @param h         row count; 4 rows are handled per pass and the loop
   *                  repeats while the counter stays above 0 (jg), so rows
   *                  are always processed in groups of 4
   */
  679. static void DEF(put_no_rnd_pixels8_y2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  680. {
  681. __asm__ volatile (
  /* operands: %0 = h, %1 = pixels, %2 = block,
   *           %3 = line_size, %4 = 3 * line_size */
  /* mm0 = ~row 0; the source pointer then moves on to row 1 */
  682. "movq (%1), %%mm0 \n\t"
  683. "pcmpeqb %%mm6, %%mm6 \n\t"
  684. "add %3, %1 \n\t"
  685. "pxor %%mm6, %%mm0 \n\t"
  686. "1: \n\t"
  /* mm1 = ~row 1, mm2 = ~row 2; write output rows 0 and 1 */
  687. "movq (%1), %%mm1 \n\t"
  688. "movq (%1, %3), %%mm2 \n\t"
  689. "pxor %%mm6, %%mm1 \n\t"
  690. "pxor %%mm6, %%mm2 \n\t"
  691. PAVGB" %%mm1, %%mm0 \n\t"
  692. PAVGB" %%mm2, %%mm1 \n\t"
  693. "pxor %%mm6, %%mm0 \n\t"
  694. "pxor %%mm6, %%mm1 \n\t"
  695. "movq %%mm0, (%2) \n\t"
  696. "movq %%mm1, (%2, %3) \n\t"
  /* mm1 = ~row 3, mm0 = ~row 4 (kept for the next pass);
   * write output rows 2 and 3 */
  697. "movq (%1, %3,2), %%mm1 \n\t"
  698. "movq (%1, %4), %%mm0 \n\t"
  699. "pxor %%mm6, %%mm1 \n\t"
  700. "pxor %%mm6, %%mm0 \n\t"
  701. PAVGB" %%mm1, %%mm2 \n\t"
  702. PAVGB" %%mm0, %%mm1 \n\t"
  703. "pxor %%mm6, %%mm2 \n\t"
  704. "pxor %%mm6, %%mm1 \n\t"
  705. "movq %%mm2, (%2, %3,2) \n\t"
  706. "movq %%mm1, (%2, %4) \n\t"
  /* advance both pointers by 4 rows */
  707. "lea (%1, %3,4), %1 \n\t"
  708. "lea (%2, %3,4), %2 \n\t"
  709. "subl $4, %0 \n\t"
  710. "jg 1b \n\t"
  711. :"+g"(h), "+r"(pixels), "+r" (block)
  712. :"r" ((x86_reg)line_size), "r"((x86_reg)3*line_size)
  713. :"memory"
  714. );
  715. }
  /**
   * Blend an 8-byte-wide source block into the destination:
   * block = PAVGB(block, pixels).
   *
   * @param block     destination, read and written; 8 bytes per row
   * @param pixels    source; 8 bytes per row
   * @param line_size byte stride, used for both block and pixels
   * @param h         row count; 4 rows are handled per pass and the loop
   *                  exits only when the counter reaches exactly 0 (jnz),
   *                  so h must be a positive multiple of 4
   */
  716. static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  717. {
  718. __asm__ volatile(
  /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
  /* REG_a = 2 * line_size: the step taken after each pair of rows */
  719. "lea (%3, %3), %%"REG_a" \n\t"
  720. "1: \n\t"
  /* rows 0 and 1: load destination, average with source, store back */
  721. "movq (%2), %%mm0 \n\t"
  722. "movq (%2, %3), %%mm1 \n\t"
  723. PAVGB" (%1), %%mm0 \n\t"
  724. PAVGB" (%1, %3), %%mm1 \n\t"
  725. "movq %%mm0, (%2) \n\t"
  726. "movq %%mm1, (%2, %3) \n\t"
  727. "add %%"REG_a", %1 \n\t"
  728. "add %%"REG_a", %2 \n\t"
  /* rows 2 and 3 */
  729. "movq (%2), %%mm0 \n\t"
  730. "movq (%2, %3), %%mm1 \n\t"
  731. PAVGB" (%1), %%mm0 \n\t"
  732. PAVGB" (%1, %3), %%mm1 \n\t"
  733. "add %%"REG_a", %1 \n\t"
  734. "movq %%mm0, (%2) \n\t"
  735. "movq %%mm1, (%2, %3) \n\t"
  736. "add %%"REG_a", %2 \n\t"
  737. "subl $4, %0 \n\t"
  738. "jnz 1b \n\t"
  739. :"+g"(h), "+S"(pixels), "+D"(block)
  740. :"r" ((x86_reg)line_size)
  741. :"%"REG_a, "memory");
  742. }
  /**
   * Horizontal half-pel blend, 8 bytes wide:
   * block = PAVGB(PAVGB(pixels[x], pixels[x + 1]), block).
   *
   * @param block     destination, read and written; 8 bytes per row
   * @param pixels    source; bytes 0..8 of each row are read
   * @param line_size byte stride, used for both block and pixels
   * @param h         row count; 4 rows are handled per pass and the loop
   *                  exits only when the counter reaches exactly 0 (jnz),
   *                  so h must be a positive multiple of 4
   */
  743. static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  744. {
  745. __asm__ volatile(
  /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
  /* REG_a = 2 * line_size: the step taken after each pair of rows */
  746. "lea (%3, %3), %%"REG_a" \n\t"
  747. "1: \n\t"
  /* rows 0 and 1 */
  748. "movq (%1), %%mm0 \n\t"
  749. "movq (%1, %3), %%mm2 \n\t"
  750. PAVGB" 1(%1), %%mm0 \n\t"
  751. PAVGB" 1(%1, %3), %%mm2 \n\t"
  752. PAVGB" (%2), %%mm0 \n\t"
  753. PAVGB" (%2, %3), %%mm2 \n\t"
  754. "add %%"REG_a", %1 \n\t"
  755. "movq %%mm0, (%2) \n\t"
  756. "movq %%mm2, (%2, %3) \n\t"
  /* rows 2 and 3 */
  757. "movq (%1), %%mm0 \n\t"
  758. "movq (%1, %3), %%mm2 \n\t"
  759. PAVGB" 1(%1), %%mm0 \n\t"
  760. PAVGB" 1(%1, %3), %%mm2 \n\t"
  761. "add %%"REG_a", %2 \n\t"
  762. "add %%"REG_a", %1 \n\t"
  763. PAVGB" (%2), %%mm0 \n\t"
  764. PAVGB" (%2, %3), %%mm2 \n\t"
  765. "movq %%mm0, (%2) \n\t"
  766. "movq %%mm2, (%2, %3) \n\t"
  767. "add %%"REG_a", %2 \n\t"
  768. "subl $4, %0 \n\t"
  769. "jnz 1b \n\t"
  770. :"+g"(h), "+S"(pixels), "+D"(block)
  771. :"r" ((x86_reg)line_size)
  772. :"%"REG_a, "memory");
  773. }
  /**
   * Vertical half-pel blend, 8 bytes wide:
   * block row y = PAVGB(PAVGB(source row y, source row y + 1), block row y).
   *
   * The last source row loaded in one pass is kept in mm0 and reused as
   * the first row of the next pass. One source row beyond the h output
   * rows is read.
   *
   * @param block     destination, read and written; 8 bytes per row
   * @param pixels    source; h + 1 rows of 8 bytes are read
   * @param line_size byte stride, used for both block and pixels
   * @param h         row count; 4 rows are handled per pass and the loop
   *                  exits only when the counter reaches exactly 0 (jnz),
   *                  so h must be a positive multiple of 4
   */
  774. static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  775. {
  776. __asm__ volatile(
  /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
  /* REG_a = 2 * line_size */
  777. "lea (%3, %3), %%"REG_a" \n\t"
  /* mm0 = source row 0 */
  778. "movq (%1), %%mm0 \n\t"
  /* bias block by -line_size to match the addressing used below */
  779. "sub %3, %2 \n\t"
  780. "1: \n\t"
  /* mm1 = row 1, mm2 = row 2; mm3/mm4 = current destination rows 0/1 */
  781. "movq (%1, %3), %%mm1 \n\t"
  782. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  783. "add %%"REG_a", %1 \n\t"
  784. PAVGB" %%mm1, %%mm0 \n\t"
  785. PAVGB" %%mm2, %%mm1 \n\t"
  786. "movq (%2, %3), %%mm3 \n\t"
  787. "movq (%2, %%"REG_a"), %%mm4 \n\t"
  788. PAVGB" %%mm3, %%mm0 \n\t"
  789. PAVGB" %%mm4, %%mm1 \n\t"
  790. "movq %%mm0, (%2, %3) \n\t"
  791. "movq %%mm1, (%2, %%"REG_a") \n\t"
  /* mm1 = row 3, mm0 = row 4; mm3/mm4 = current destination rows 2/3 */
  792. "movq (%1, %3), %%mm1 \n\t"
  793. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  794. PAVGB" %%mm1, %%mm2 \n\t"
  795. PAVGB" %%mm0, %%mm1 \n\t"
  796. "add %%"REG_a", %2 \n\t"
  797. "add %%"REG_a", %1 \n\t"
  798. "movq (%2, %3), %%mm3 \n\t"
  799. "movq (%2, %%"REG_a"), %%mm4 \n\t"
  800. PAVGB" %%mm3, %%mm2 \n\t"
  801. PAVGB" %%mm4, %%mm1 \n\t"
  802. "movq %%mm2, (%2, %3) \n\t"
  803. "movq %%mm1, (%2, %%"REG_a") \n\t"
  804. "add %%"REG_a", %2 \n\t"
  805. "subl $4, %0 \n\t"
  806. "jnz 1b \n\t"
  807. :"+g"(h), "+S"(pixels), "+D"(block)
  808. :"r" ((x86_reg)line_size)
  809. :"%"REG_a, "memory");
  810. }
  811. /* Note this is not correctly rounded, but this function is only
  812. * used for B-frames so it does not matter. */
  /**
   * Approximate diagonal (x+y) half-pel blend, 8 bytes wide.
   *
   * For each source row a horizontal average
   * hv[y] = PAVGB(row[y][x], row[y][x + 1]) is formed; output row y is
   * PAVGB(PAVGB(hv[y], hv[y + 1]), block row y). The result is built from
   * nested pairwise averages rather than one four-tap sum.
   *
   * As a partial rounding correction, the left operand of row 2 of every
   * pass has mm6 subtracted with saturation before its horizontal average.
   * mm6 is set up by MOVQ_BONE (defined outside this file; presumably 0x01
   * in every byte -- TODO confirm) in a separate statement and is relied
   * upon to survive into the asm block.
   *
   * One source row beyond the h output rows is read.
   *
   * @param block     destination, read and written; 8 bytes per row
   * @param pixels    source; h + 1 rows, bytes 0..8 of each are read
   * @param line_size byte stride, used for both block and pixels
   * @param h         row count; 4 rows are handled per pass and the loop
   *                  exits only when the counter reaches exactly 0 (jnz),
   *                  so h must be a positive multiple of 4
   */
  813. static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  814. {
  815. MOVQ_BONE(mm6);
  816. __asm__ volatile(
  /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
  /* REG_a = 2 * line_size */
  817. "lea (%3, %3), %%"REG_a" \n\t"
  /* mm0 = hv[0] */
  818. "movq (%1), %%mm0 \n\t"
  819. PAVGB" 1(%1), %%mm0 \n\t"
  820. ".p2align 3 \n\t"
  821. "1: \n\t"
  /* mm1 = hv[1], mm2 = hv[2] (left operand decremented first) */
  822. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  823. "movq (%1, %3), %%mm1 \n\t"
  824. "psubusb %%mm6, %%mm2 \n\t"
  825. PAVGB" 1(%1, %3), %%mm1 \n\t"
  826. PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
  827. "add %%"REG_a", %1 \n\t"
  /* output rows 0 and 1, blended with the destination */
  828. PAVGB" %%mm1, %%mm0 \n\t"
  829. PAVGB" %%mm2, %%mm1 \n\t"
  830. PAVGB" (%2), %%mm0 \n\t"
  831. PAVGB" (%2, %3), %%mm1 \n\t"
  832. "movq %%mm0, (%2) \n\t"
  833. "movq %%mm1, (%2, %3) \n\t"
  /* mm1 = hv[3], mm0 = hv[4] (kept for the next pass) */
  834. "movq (%1, %3), %%mm1 \n\t"
  835. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  836. PAVGB" 1(%1, %3), %%mm1 \n\t"
  837. PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
  838. "add %%"REG_a", %2 \n\t"
  839. "add %%"REG_a", %1 \n\t"
  /* output rows 2 and 3, blended with the destination */
  840. PAVGB" %%mm1, %%mm2 \n\t"
  841. PAVGB" %%mm0, %%mm1 \n\t"
  842. PAVGB" (%2), %%mm2 \n\t"
  843. PAVGB" (%2, %3), %%mm1 \n\t"
  844. "movq %%mm2, (%2) \n\t"
  845. "movq %%mm1, (%2, %3) \n\t"
  846. "add %%"REG_a", %2 \n\t"
  847. "subl $4, %0 \n\t"
  848. "jnz 1b \n\t"
  849. :"+g"(h), "+S"(pixels), "+D"(block)
  850. :"r" ((x86_reg)line_size)
  851. :"%"REG_a, "memory");
  852. }
  /**
   * Blend a 4-byte-wide source block into the destination:
   * block = PAVGB(block, pixels).
   *
   * The asm statement handles 4 rows with fixed offsets; the pointers and
   * the row counter are advanced in C. The body always runs at least once
   * (do/while) and repeats while h stays above 0, so rows are processed in
   * groups of 4.
   *
   * NOTE(review): PAVGB takes the source as a memory operand; if PAVGB is
   * a 64-bit MMX instruction each access reads 8 bytes from pixels
   * although only 4 are stored -- confirm this over-read is acceptable.
   *
   * @param block     destination, read and written; 4 bytes per row
   * @param pixels    source; 4 bytes per row are used
   * @param line_size byte stride, used for both block and pixels
   * @param h         row count, consumed 4 rows at a time
   */
  853. static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  854. {
  855. do {
  856. __asm__ volatile(
  /* operands (inputs only): %0 = pixels, %1 = block,
   *                         %2 = line_size, %3 = 3 * line_size */
  /* load the four destination rows */
  857. "movd (%1), %%mm0 \n\t"
  858. "movd (%1, %2), %%mm1 \n\t"
  859. "movd (%1, %2, 2), %%mm2 \n\t"
  860. "movd (%1, %3), %%mm3 \n\t"
  /* average each with the matching source row */
  861. PAVGB" (%0), %%mm0 \n\t"
  862. PAVGB" (%0, %2), %%mm1 \n\t"
  863. PAVGB" (%0, %2, 2), %%mm2 \n\t"
  864. PAVGB" (%0, %3), %%mm3 \n\t"
  /* write the low 4 bytes of each result back */
  865. "movd %%mm0, (%1) \n\t"
  866. "movd %%mm1, (%1, %2) \n\t"
  867. "movd %%mm2, (%1, %2, 2) \n\t"
  868. "movd %%mm3, (%1, %3) \n\t"
  869. ::"S"(pixels), "D"(block),
  870. "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size)
  871. :"memory");
  872. block += 4*line_size;
  873. pixels += 4*line_size;
  874. h -= 4;
  875. } while(h > 0);
  876. }
  877. //FIXME the following could be optimized too ...
  878. static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  879. DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
  880. DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
  881. }
  882. static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  883. DEF(put_pixels8_y2)(block , pixels , line_size, h);
  884. DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
  885. }
  886. static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  887. DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
  888. DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
  889. }
  890. static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  891. DEF(avg_pixels8)(block , pixels , line_size, h);
  892. DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
  893. }
  894. static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  895. DEF(avg_pixels8_x2)(block , pixels , line_size, h);
  896. DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
  897. }
  898. static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  899. DEF(avg_pixels8_y2)(block , pixels , line_size, h);
  900. DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
  901. }
  902. static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  903. DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
  904. DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
  905. }
  /*
   * QPEL_2TAP_L3(OPNAME) defines two functions,
   * OPNAME##2tap_qpel16_l3 (16 bytes wide) and OPNAME##2tap_qpel8_l3
   * (8 bytes wide). For each row they compute
   *
   *     v = PAVGB(PAVGB(src[off1], src[off2]), src[0])
   *
   * then apply STORE_OP to v and the destination, and store v to dst.
   * One row is handled per loop pass; src advances by stride and the
   * destination is addressed as src + (dst - src), so only the source
   * pointer has to be stepped (dst therefore moves by the same stride).
   * The loop counts h down with decl/jnz, so h must be at least 1.
   *
   * asm operands: %0 = h, %1 = src, %2 = off1, %3 = off2,
   *               %4 = dst - src, %5 = stride
   *
   * STORE_OP(a,b) must be defined before the macro is instantiated.
   */
  906. #define QPEL_2TAP_L3(OPNAME) \
  907. static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
  908. __asm__ volatile(\
  909. "1: \n\t"\
  910. "movq (%1,%2), %%mm0 \n\t"\
  911. "movq 8(%1,%2), %%mm1 \n\t"\
  912. PAVGB" (%1,%3), %%mm0 \n\t"\
  913. PAVGB" 8(%1,%3), %%mm1 \n\t"\
  914. PAVGB" (%1), %%mm0 \n\t"\
  915. PAVGB" 8(%1), %%mm1 \n\t"\
  916. STORE_OP( (%1,%4),%%mm0)\
  917. STORE_OP(8(%1,%4),%%mm1)\
  918. "movq %%mm0, (%1,%4) \n\t"\
  919. "movq %%mm1, 8(%1,%4) \n\t"\
  920. "add %5, %1 \n\t"\
  921. "decl %0 \n\t"\
  922. "jnz 1b \n\t"\
  923. :"+g"(h), "+r"(src)\
  924. :"r"((x86_reg)off1), "r"((x86_reg)off2),\
  925. "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
  926. :"memory"\
  927. );\
  928. }\
  929. static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
  930. __asm__ volatile(\
  931. "1: \n\t"\
  932. "movq (%1,%2), %%mm0 \n\t"\
  933. PAVGB" (%1,%3), %%mm0 \n\t"\
  934. PAVGB" (%1), %%mm0 \n\t"\
  935. STORE_OP((%1,%4),%%mm0)\
  936. "movq %%mm0, (%1,%4) \n\t"\
  937. "add %5, %1 \n\t"\
  938. "decl %0 \n\t"\
  939. "jnz 1b \n\t"\
  940. :"+g"(h), "+r"(src)\
  941. :"r"((x86_reg)off1), "r"((x86_reg)off2),\
  942. "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
  943. :"memory"\
  944. );\
  945. }
  /* avg_ variants: STORE_OP averages the result with the bytes already
   * at the destination before they are stored */
  946. #define STORE_OP(a,b) PAVGB" "#a","#b" \n\t"
  947. QPEL_2TAP_L3(avg_)
  948. #undef STORE_OP
  /* put_ variants: STORE_OP expands to nothing, the result is stored as is */
  949. #define STORE_OP(a,b)
  950. QPEL_2TAP_L3(put_)
  951. #undef STORE_OP
  952. #undef QPEL_2TAP_L3