/*
 * DSP utils mmx functions are compiled twice for rnd/no_rnd
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
// put_pixels
  27. static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  28. {
  29. MOVQ_BFE(mm6);
  30. __asm__ volatile(
  31. "lea (%3, %3), %%"REG_a" \n\t"
  32. ".p2align 3 \n\t"
  33. "1: \n\t"
  34. "movq (%1), %%mm0 \n\t"
  35. "movq 1(%1), %%mm1 \n\t"
  36. "movq (%1, %3), %%mm2 \n\t"
  37. "movq 1(%1, %3), %%mm3 \n\t"
  38. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  39. "movq %%mm4, (%2) \n\t"
  40. "movq %%mm5, (%2, %3) \n\t"
  41. "add %%"REG_a", %1 \n\t"
  42. "add %%"REG_a", %2 \n\t"
  43. "movq (%1), %%mm0 \n\t"
  44. "movq 1(%1), %%mm1 \n\t"
  45. "movq (%1, %3), %%mm2 \n\t"
  46. "movq 1(%1, %3), %%mm3 \n\t"
  47. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  48. "movq %%mm4, (%2) \n\t"
  49. "movq %%mm5, (%2, %3) \n\t"
  50. "add %%"REG_a", %1 \n\t"
  51. "add %%"REG_a", %2 \n\t"
  52. "subl $4, %0 \n\t"
  53. "jnz 1b \n\t"
  54. :"+g"(h), "+S"(pixels), "+D"(block)
  55. :"r"((x86_reg)line_size)
  56. :REG_a, "memory");
  57. }
  58. static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  59. {
  60. MOVQ_BFE(mm6);
  61. __asm__ volatile(
  62. "testl $1, %0 \n\t"
  63. " jz 1f \n\t"
  64. "movq (%1), %%mm0 \n\t"
  65. "movq (%2), %%mm1 \n\t"
  66. "add %4, %1 \n\t"
  67. "add $8, %2 \n\t"
  68. PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
  69. "movq %%mm4, (%3) \n\t"
  70. "add %5, %3 \n\t"
  71. "decl %0 \n\t"
  72. ".p2align 3 \n\t"
  73. "1: \n\t"
  74. "movq (%1), %%mm0 \n\t"
  75. "movq (%2), %%mm1 \n\t"
  76. "add %4, %1 \n\t"
  77. "movq (%1), %%mm2 \n\t"
  78. "movq 8(%2), %%mm3 \n\t"
  79. "add %4, %1 \n\t"
  80. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  81. "movq %%mm4, (%3) \n\t"
  82. "add %5, %3 \n\t"
  83. "movq %%mm5, (%3) \n\t"
  84. "add %5, %3 \n\t"
  85. "movq (%1), %%mm0 \n\t"
  86. "movq 16(%2), %%mm1 \n\t"
  87. "add %4, %1 \n\t"
  88. "movq (%1), %%mm2 \n\t"
  89. "movq 24(%2), %%mm3 \n\t"
  90. "add %4, %1 \n\t"
  91. "add $32, %2 \n\t"
  92. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  93. "movq %%mm4, (%3) \n\t"
  94. "add %5, %3 \n\t"
  95. "movq %%mm5, (%3) \n\t"
  96. "add %5, %3 \n\t"
  97. "subl $4, %0 \n\t"
  98. "jnz 1b \n\t"
  99. #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  100. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  101. #else
  102. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  103. #endif
  104. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  105. :"memory");
  106. }
  107. static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  108. {
  109. MOVQ_BFE(mm6);
  110. __asm__ volatile(
  111. "lea (%3, %3), %%"REG_a" \n\t"
  112. ".p2align 3 \n\t"
  113. "1: \n\t"
  114. "movq (%1), %%mm0 \n\t"
  115. "movq 1(%1), %%mm1 \n\t"
  116. "movq (%1, %3), %%mm2 \n\t"
  117. "movq 1(%1, %3), %%mm3 \n\t"
  118. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  119. "movq %%mm4, (%2) \n\t"
  120. "movq %%mm5, (%2, %3) \n\t"
  121. "movq 8(%1), %%mm0 \n\t"
  122. "movq 9(%1), %%mm1 \n\t"
  123. "movq 8(%1, %3), %%mm2 \n\t"
  124. "movq 9(%1, %3), %%mm3 \n\t"
  125. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  126. "movq %%mm4, 8(%2) \n\t"
  127. "movq %%mm5, 8(%2, %3) \n\t"
  128. "add %%"REG_a", %1 \n\t"
  129. "add %%"REG_a", %2 \n\t"
  130. "movq (%1), %%mm0 \n\t"
  131. "movq 1(%1), %%mm1 \n\t"
  132. "movq (%1, %3), %%mm2 \n\t"
  133. "movq 1(%1, %3), %%mm3 \n\t"
  134. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  135. "movq %%mm4, (%2) \n\t"
  136. "movq %%mm5, (%2, %3) \n\t"
  137. "movq 8(%1), %%mm0 \n\t"
  138. "movq 9(%1), %%mm1 \n\t"
  139. "movq 8(%1, %3), %%mm2 \n\t"
  140. "movq 9(%1, %3), %%mm3 \n\t"
  141. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  142. "movq %%mm4, 8(%2) \n\t"
  143. "movq %%mm5, 8(%2, %3) \n\t"
  144. "add %%"REG_a", %1 \n\t"
  145. "add %%"REG_a", %2 \n\t"
  146. "subl $4, %0 \n\t"
  147. "jnz 1b \n\t"
  148. :"+g"(h), "+S"(pixels), "+D"(block)
  149. :"r"((x86_reg)line_size)
  150. :REG_a, "memory");
  151. }
  152. static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  153. {
  154. MOVQ_BFE(mm6);
  155. __asm__ volatile(
  156. "testl $1, %0 \n\t"
  157. " jz 1f \n\t"
  158. "movq (%1), %%mm0 \n\t"
  159. "movq (%2), %%mm1 \n\t"
  160. "movq 8(%1), %%mm2 \n\t"
  161. "movq 8(%2), %%mm3 \n\t"
  162. "add %4, %1 \n\t"
  163. "add $16, %2 \n\t"
  164. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  165. "movq %%mm4, (%3) \n\t"
  166. "movq %%mm5, 8(%3) \n\t"
  167. "add %5, %3 \n\t"
  168. "decl %0 \n\t"
  169. ".p2align 3 \n\t"
  170. "1: \n\t"
  171. "movq (%1), %%mm0 \n\t"
  172. "movq (%2), %%mm1 \n\t"
  173. "movq 8(%1), %%mm2 \n\t"
  174. "movq 8(%2), %%mm3 \n\t"
  175. "add %4, %1 \n\t"
  176. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  177. "movq %%mm4, (%3) \n\t"
  178. "movq %%mm5, 8(%3) \n\t"
  179. "add %5, %3 \n\t"
  180. "movq (%1), %%mm0 \n\t"
  181. "movq 16(%2), %%mm1 \n\t"
  182. "movq 8(%1), %%mm2 \n\t"
  183. "movq 24(%2), %%mm3 \n\t"
  184. "add %4, %1 \n\t"
  185. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  186. "movq %%mm4, (%3) \n\t"
  187. "movq %%mm5, 8(%3) \n\t"
  188. "add %5, %3 \n\t"
  189. "add $32, %2 \n\t"
  190. "subl $2, %0 \n\t"
  191. "jnz 1b \n\t"
  192. #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
  193. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  194. #else
  195. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  196. #endif
  197. :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  198. :"memory");
  199. }
  200. static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  201. {
  202. MOVQ_BFE(mm6);
  203. __asm__ volatile(
  204. "lea (%3, %3), %%"REG_a" \n\t"
  205. "movq (%1), %%mm0 \n\t"
  206. ".p2align 3 \n\t"
  207. "1: \n\t"
  208. "movq (%1, %3), %%mm1 \n\t"
  209. "movq (%1, %%"REG_a"),%%mm2 \n\t"
  210. PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
  211. "movq %%mm4, (%2) \n\t"
  212. "movq %%mm5, (%2, %3) \n\t"
  213. "add %%"REG_a", %1 \n\t"
  214. "add %%"REG_a", %2 \n\t"
  215. "movq (%1, %3), %%mm1 \n\t"
  216. "movq (%1, %%"REG_a"),%%mm0 \n\t"
  217. PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
  218. "movq %%mm4, (%2) \n\t"
  219. "movq %%mm5, (%2, %3) \n\t"
  220. "add %%"REG_a", %1 \n\t"
  221. "add %%"REG_a", %2 \n\t"
  222. "subl $4, %0 \n\t"
  223. "jnz 1b \n\t"
  224. :"+g"(h), "+S"(pixels), "+D"(block)
  225. :"r"((x86_reg)line_size)
  226. :REG_a, "memory");
  227. }
  228. static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  229. {
  230. MOVQ_ZERO(mm7);
  231. SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
  232. __asm__ volatile(
  233. "movq (%1), %%mm0 \n\t"
  234. "movq 1(%1), %%mm4 \n\t"
  235. "movq %%mm0, %%mm1 \n\t"
  236. "movq %%mm4, %%mm5 \n\t"
  237. "punpcklbw %%mm7, %%mm0 \n\t"
  238. "punpcklbw %%mm7, %%mm4 \n\t"
  239. "punpckhbw %%mm7, %%mm1 \n\t"
  240. "punpckhbw %%mm7, %%mm5 \n\t"
  241. "paddusw %%mm0, %%mm4 \n\t"
  242. "paddusw %%mm1, %%mm5 \n\t"
  243. "xor %%"REG_a", %%"REG_a" \n\t"
  244. "add %3, %1 \n\t"
  245. ".p2align 3 \n\t"
  246. "1: \n\t"
  247. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  248. "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
  249. "movq %%mm0, %%mm1 \n\t"
  250. "movq %%mm2, %%mm3 \n\t"
  251. "punpcklbw %%mm7, %%mm0 \n\t"
  252. "punpcklbw %%mm7, %%mm2 \n\t"
  253. "punpckhbw %%mm7, %%mm1 \n\t"
  254. "punpckhbw %%mm7, %%mm3 \n\t"
  255. "paddusw %%mm2, %%mm0 \n\t"
  256. "paddusw %%mm3, %%mm1 \n\t"
  257. "paddusw %%mm6, %%mm4 \n\t"
  258. "paddusw %%mm6, %%mm5 \n\t"
  259. "paddusw %%mm0, %%mm4 \n\t"
  260. "paddusw %%mm1, %%mm5 \n\t"
  261. "psrlw $2, %%mm4 \n\t"
  262. "psrlw $2, %%mm5 \n\t"
  263. "packuswb %%mm5, %%mm4 \n\t"
  264. "movq %%mm4, (%2, %%"REG_a") \n\t"
  265. "add %3, %%"REG_a" \n\t"
  266. "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
  267. "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
  268. "movq %%mm2, %%mm3 \n\t"
  269. "movq %%mm4, %%mm5 \n\t"
  270. "punpcklbw %%mm7, %%mm2 \n\t"
  271. "punpcklbw %%mm7, %%mm4 \n\t"
  272. "punpckhbw %%mm7, %%mm3 \n\t"
  273. "punpckhbw %%mm7, %%mm5 \n\t"
  274. "paddusw %%mm2, %%mm4 \n\t"
  275. "paddusw %%mm3, %%mm5 \n\t"
  276. "paddusw %%mm6, %%mm0 \n\t"
  277. "paddusw %%mm6, %%mm1 \n\t"
  278. "paddusw %%mm4, %%mm0 \n\t"
  279. "paddusw %%mm5, %%mm1 \n\t"
  280. "psrlw $2, %%mm0 \n\t"
  281. "psrlw $2, %%mm1 \n\t"
  282. "packuswb %%mm1, %%mm0 \n\t"
  283. "movq %%mm0, (%2, %%"REG_a") \n\t"
  284. "add %3, %%"REG_a" \n\t"
  285. "subl $2, %0 \n\t"
  286. "jnz 1b \n\t"
  287. :"+g"(h), "+S"(pixels)
  288. :"D"(block), "r"((x86_reg)line_size)
  289. :REG_a, "memory");
  290. }
// avg_pixels
  292. static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  293. {
  294. MOVQ_BFE(mm6);
  295. JUMPALIGN();
  296. do {
  297. __asm__ volatile(
  298. "movd %0, %%mm0 \n\t"
  299. "movd %1, %%mm1 \n\t"
  300. OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
  301. "movd %%mm2, %0 \n\t"
  302. :"+m"(*block)
  303. :"m"(*pixels)
  304. :"memory");
  305. pixels += line_size;
  306. block += line_size;
  307. }
  308. while (--h);
  309. }
  310. #ifndef NO_RND
  311. // in case more speed is needed - unroling would certainly help
  312. static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  313. {
  314. MOVQ_BFE(mm6);
  315. JUMPALIGN();
  316. do {
  317. __asm__ volatile(
  318. "movq %0, %%mm0 \n\t"
  319. "movq %1, %%mm1 \n\t"
  320. OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
  321. "movq %%mm2, %0 \n\t"
  322. :"+m"(*block)
  323. :"m"(*pixels)
  324. :"memory");
  325. pixels += line_size;
  326. block += line_size;
  327. }
  328. while (--h);
  329. }
  330. #endif // NO_RND
  331. static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  332. {
  333. MOVQ_BFE(mm6);
  334. JUMPALIGN();
  335. do {
  336. __asm__ volatile(
  337. "movq %0, %%mm0 \n\t"
  338. "movq %1, %%mm1 \n\t"
  339. OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
  340. "movq %%mm2, %0 \n\t"
  341. "movq 8%0, %%mm0 \n\t"
  342. "movq 8%1, %%mm1 \n\t"
  343. OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
  344. "movq %%mm2, 8%0 \n\t"
  345. :"+m"(*block)
  346. :"m"(*pixels)
  347. :"memory");
  348. pixels += line_size;
  349. block += line_size;
  350. }
  351. while (--h);
  352. }
  353. #ifndef NO_RND
  354. static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  355. {
  356. MOVQ_BFE(mm6);
  357. JUMPALIGN();
  358. do {
  359. __asm__ volatile(
  360. "movq %1, %%mm0 \n\t"
  361. "movq 1%1, %%mm1 \n\t"
  362. "movq %0, %%mm3 \n\t"
  363. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  364. OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
  365. "movq %%mm0, %0 \n\t"
  366. :"+m"(*block)
  367. :"m"(*pixels)
  368. :"memory");
  369. pixels += line_size;
  370. block += line_size;
  371. } while (--h);
  372. }
  373. #endif // NO_RND
  374. static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  375. {
  376. MOVQ_BFE(mm6);
  377. JUMPALIGN();
  378. do {
  379. __asm__ volatile(
  380. "movq %1, %%mm0 \n\t"
  381. "movq %2, %%mm1 \n\t"
  382. "movq %0, %%mm3 \n\t"
  383. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  384. OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
  385. "movq %%mm0, %0 \n\t"
  386. :"+m"(*dst)
  387. :"m"(*src1), "m"(*src2)
  388. :"memory");
  389. dst += dstStride;
  390. src1 += src1Stride;
  391. src2 += 8;
  392. } while (--h);
  393. }
  394. static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  395. {
  396. MOVQ_BFE(mm6);
  397. JUMPALIGN();
  398. do {
  399. __asm__ volatile(
  400. "movq %1, %%mm0 \n\t"
  401. "movq 1%1, %%mm1 \n\t"
  402. "movq %0, %%mm3 \n\t"
  403. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  404. OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
  405. "movq %%mm0, %0 \n\t"
  406. "movq 8%1, %%mm0 \n\t"
  407. "movq 9%1, %%mm1 \n\t"
  408. "movq 8%0, %%mm3 \n\t"
  409. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  410. OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
  411. "movq %%mm0, 8%0 \n\t"
  412. :"+m"(*block)
  413. :"m"(*pixels)
  414. :"memory");
  415. pixels += line_size;
  416. block += line_size;
  417. } while (--h);
  418. }
  419. static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  420. {
  421. MOVQ_BFE(mm6);
  422. JUMPALIGN();
  423. do {
  424. __asm__ volatile(
  425. "movq %1, %%mm0 \n\t"
  426. "movq %2, %%mm1 \n\t"
  427. "movq %0, %%mm3 \n\t"
  428. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  429. OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
  430. "movq %%mm0, %0 \n\t"
  431. "movq 8%1, %%mm0 \n\t"
  432. "movq 8%2, %%mm1 \n\t"
  433. "movq 8%0, %%mm3 \n\t"
  434. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  435. OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
  436. "movq %%mm0, 8%0 \n\t"
  437. :"+m"(*dst)
  438. :"m"(*src1), "m"(*src2)
  439. :"memory");
  440. dst += dstStride;
  441. src1 += src1Stride;
  442. src2 += 16;
  443. } while (--h);
  444. }
  445. static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  446. {
  447. MOVQ_BFE(mm6);
  448. __asm__ volatile(
  449. "lea (%3, %3), %%"REG_a" \n\t"
  450. "movq (%1), %%mm0 \n\t"
  451. ".p2align 3 \n\t"
  452. "1: \n\t"
  453. "movq (%1, %3), %%mm1 \n\t"
  454. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  455. PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
  456. "movq (%2), %%mm3 \n\t"
  457. OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6)
  458. "movq (%2, %3), %%mm3 \n\t"
  459. OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
  460. "movq %%mm0, (%2) \n\t"
  461. "movq %%mm1, (%2, %3) \n\t"
  462. "add %%"REG_a", %1 \n\t"
  463. "add %%"REG_a", %2 \n\t"
  464. "movq (%1, %3), %%mm1 \n\t"
  465. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  466. PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
  467. "movq (%2), %%mm3 \n\t"
  468. OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6)
  469. "movq (%2, %3), %%mm3 \n\t"
  470. OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
  471. "movq %%mm2, (%2) \n\t"
  472. "movq %%mm1, (%2, %3) \n\t"
  473. "add %%"REG_a", %1 \n\t"
  474. "add %%"REG_a", %2 \n\t"
  475. "subl $4, %0 \n\t"
  476. "jnz 1b \n\t"
  477. :"+g"(h), "+S"(pixels), "+D"(block)
  478. :"r"((x86_reg)line_size)
  479. :REG_a, "memory");
  480. }
// this routine is 'slightly' suboptimal but mostly unused
  482. static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  483. {
  484. MOVQ_ZERO(mm7);
  485. SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
  486. __asm__ volatile(
  487. "movq (%1), %%mm0 \n\t"
  488. "movq 1(%1), %%mm4 \n\t"
  489. "movq %%mm0, %%mm1 \n\t"
  490. "movq %%mm4, %%mm5 \n\t"
  491. "punpcklbw %%mm7, %%mm0 \n\t"
  492. "punpcklbw %%mm7, %%mm4 \n\t"
  493. "punpckhbw %%mm7, %%mm1 \n\t"
  494. "punpckhbw %%mm7, %%mm5 \n\t"
  495. "paddusw %%mm0, %%mm4 \n\t"
  496. "paddusw %%mm1, %%mm5 \n\t"
  497. "xor %%"REG_a", %%"REG_a" \n\t"
  498. "add %3, %1 \n\t"
  499. ".p2align 3 \n\t"
  500. "1: \n\t"
  501. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  502. "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
  503. "movq %%mm0, %%mm1 \n\t"
  504. "movq %%mm2, %%mm3 \n\t"
  505. "punpcklbw %%mm7, %%mm0 \n\t"
  506. "punpcklbw %%mm7, %%mm2 \n\t"
  507. "punpckhbw %%mm7, %%mm1 \n\t"
  508. "punpckhbw %%mm7, %%mm3 \n\t"
  509. "paddusw %%mm2, %%mm0 \n\t"
  510. "paddusw %%mm3, %%mm1 \n\t"
  511. "paddusw %%mm6, %%mm4 \n\t"
  512. "paddusw %%mm6, %%mm5 \n\t"
  513. "paddusw %%mm0, %%mm4 \n\t"
  514. "paddusw %%mm1, %%mm5 \n\t"
  515. "psrlw $2, %%mm4 \n\t"
  516. "psrlw $2, %%mm5 \n\t"
  517. "movq (%2, %%"REG_a"), %%mm3 \n\t"
  518. "packuswb %%mm5, %%mm4 \n\t"
  519. "pcmpeqd %%mm2, %%mm2 \n\t"
  520. "paddb %%mm2, %%mm2 \n\t"
  521. OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2)
  522. "movq %%mm5, (%2, %%"REG_a") \n\t"
  523. "add %3, %%"REG_a" \n\t"
  524. "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
  525. "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
  526. "movq %%mm2, %%mm3 \n\t"
  527. "movq %%mm4, %%mm5 \n\t"
  528. "punpcklbw %%mm7, %%mm2 \n\t"
  529. "punpcklbw %%mm7, %%mm4 \n\t"
  530. "punpckhbw %%mm7, %%mm3 \n\t"
  531. "punpckhbw %%mm7, %%mm5 \n\t"
  532. "paddusw %%mm2, %%mm4 \n\t"
  533. "paddusw %%mm3, %%mm5 \n\t"
  534. "paddusw %%mm6, %%mm0 \n\t"
  535. "paddusw %%mm6, %%mm1 \n\t"
  536. "paddusw %%mm4, %%mm0 \n\t"
  537. "paddusw %%mm5, %%mm1 \n\t"
  538. "psrlw $2, %%mm0 \n\t"
  539. "psrlw $2, %%mm1 \n\t"
  540. "movq (%2, %%"REG_a"), %%mm3 \n\t"
  541. "packuswb %%mm1, %%mm0 \n\t"
  542. "pcmpeqd %%mm2, %%mm2 \n\t"
  543. "paddb %%mm2, %%mm2 \n\t"
  544. OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2)
  545. "movq %%mm1, (%2, %%"REG_a") \n\t"
  546. "add %3, %%"REG_a" \n\t"
  547. "subl $2, %0 \n\t"
  548. "jnz 1b \n\t"
  549. :"+g"(h), "+S"(pixels)
  550. :"D"(block), "r"((x86_reg)line_size)
  551. :REG_a, "memory");
  552. }
//FIXME optimize
  554. static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  555. DEF(put, pixels8_y2)(block , pixels , line_size, h);
  556. DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
  557. }
  558. static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  559. DEF(put, pixels8_xy2)(block , pixels , line_size, h);
  560. DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
  561. }
  562. static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  563. DEF(avg, pixels8_y2)(block , pixels , line_size, h);
  564. DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
  565. }
  566. static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  567. DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
  568. DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
  569. }