You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

571 lines
16KB

  1. /*
  2. * DSP utils mmx functions are compiled twice for rnd/no_rnd
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with this library; if not, write to the Free Software
  17. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. *
  19. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  20. * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  21. * and improved by Zdenek Kabelac <kabi@users.sf.net>
  22. */
  23. // put_pixels
  24. static void DEF(put, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  25. {
  26. MOVQ_BFE(mm6);
  27. __asm __volatile(
  28. "lea (%3, %3), %%eax \n\t"
  29. ".balign 8 \n\t"
  30. "1: \n\t"
  31. "movq (%1), %%mm0 \n\t"
  32. "movq 1(%1), %%mm1 \n\t"
  33. "movq (%1, %3), %%mm2 \n\t"
  34. "movq 1(%1, %3), %%mm3 \n\t"
  35. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  36. "movq %%mm4, (%2) \n\t"
  37. "movq %%mm5, (%2, %3) \n\t"
  38. "addl %%eax, %1 \n\t"
  39. "addl %%eax, %2 \n\t"
  40. "movq (%1), %%mm0 \n\t"
  41. "movq 1(%1), %%mm1 \n\t"
  42. "movq (%1, %3), %%mm2 \n\t"
  43. "movq 1(%1, %3), %%mm3 \n\t"
  44. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  45. "movq %%mm4, (%2) \n\t"
  46. "movq %%mm5, (%2, %3) \n\t"
  47. "addl %%eax, %1 \n\t"
  48. "addl %%eax, %2 \n\t"
  49. "subl $4, %0 \n\t"
  50. "jnz 1b \n\t"
  51. :"+g"(h), "+S"(pixels), "+D"(block)
  52. :"r"(line_size)
  53. :"eax", "memory");
  54. }
  55. static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  56. {
  57. MOVQ_BFE(mm6);
  58. __asm __volatile(
  59. "testl $1, %0 \n\t"
  60. " jz 1f \n\t"
  61. "movq (%1), %%mm0 \n\t"
  62. "movq (%2), %%mm1 \n\t"
  63. "addl %4, %1 \n\t"
  64. "addl $8, %2 \n\t"
  65. PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
  66. "movq %%mm4, (%3) \n\t"
  67. "addl %5, %3 \n\t"
  68. "decl %0 \n\t"
  69. ".balign 8 \n\t"
  70. "1: \n\t"
  71. "movq (%1), %%mm0 \n\t"
  72. "movq (%2), %%mm1 \n\t"
  73. "addl %4, %1 \n\t"
  74. "movq (%1), %%mm2 \n\t"
  75. "movq 8(%2), %%mm3 \n\t"
  76. "addl %4, %1 \n\t"
  77. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  78. "movq %%mm4, (%3) \n\t"
  79. "addl %5, %3 \n\t"
  80. "movq %%mm5, (%3) \n\t"
  81. "addl %5, %3 \n\t"
  82. "movq (%1), %%mm0 \n\t"
  83. "movq 16(%2), %%mm1 \n\t"
  84. "addl %4, %1 \n\t"
  85. "movq (%1), %%mm2 \n\t"
  86. "movq 24(%2), %%mm3 \n\t"
  87. "addl %4, %1 \n\t"
  88. "addl $32, %2 \n\t"
  89. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  90. "movq %%mm4, (%3) \n\t"
  91. "addl %5, %3 \n\t"
  92. "movq %%mm5, (%3) \n\t"
  93. "addl %5, %3 \n\t"
  94. "subl $4, %0 \n\t"
  95. "jnz 1b \n\t"
  96. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  97. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  98. #else
  99. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  100. #endif
  101. :"S"(src1Stride), "D"(dstStride)
  102. :"memory");
  103. }
  104. static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  105. {
  106. MOVQ_BFE(mm6);
  107. __asm __volatile(
  108. "lea (%3, %3), %%eax \n\t"
  109. ".balign 8 \n\t"
  110. "1: \n\t"
  111. "movq (%1), %%mm0 \n\t"
  112. "movq 1(%1), %%mm1 \n\t"
  113. "movq (%1, %3), %%mm2 \n\t"
  114. "movq 1(%1, %3), %%mm3 \n\t"
  115. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  116. "movq %%mm4, (%2) \n\t"
  117. "movq %%mm5, (%2, %3) \n\t"
  118. "movq 8(%1), %%mm0 \n\t"
  119. "movq 9(%1), %%mm1 \n\t"
  120. "movq 8(%1, %3), %%mm2 \n\t"
  121. "movq 9(%1, %3), %%mm3 \n\t"
  122. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  123. "movq %%mm4, 8(%2) \n\t"
  124. "movq %%mm5, 8(%2, %3) \n\t"
  125. "addl %%eax, %1 \n\t"
  126. "addl %%eax, %2 \n\t"
  127. "movq (%1), %%mm0 \n\t"
  128. "movq 1(%1), %%mm1 \n\t"
  129. "movq (%1, %3), %%mm2 \n\t"
  130. "movq 1(%1, %3), %%mm3 \n\t"
  131. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  132. "movq %%mm4, (%2) \n\t"
  133. "movq %%mm5, (%2, %3) \n\t"
  134. "movq 8(%1), %%mm0 \n\t"
  135. "movq 9(%1), %%mm1 \n\t"
  136. "movq 8(%1, %3), %%mm2 \n\t"
  137. "movq 9(%1, %3), %%mm3 \n\t"
  138. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  139. "movq %%mm4, 8(%2) \n\t"
  140. "movq %%mm5, 8(%2, %3) \n\t"
  141. "addl %%eax, %1 \n\t"
  142. "addl %%eax, %2 \n\t"
  143. "subl $4, %0 \n\t"
  144. "jnz 1b \n\t"
  145. :"+g"(h), "+S"(pixels), "+D"(block)
  146. :"r"(line_size)
  147. :"eax", "memory");
  148. }
  149. static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  150. {
  151. MOVQ_BFE(mm6);
  152. __asm __volatile(
  153. "testl $1, %0 \n\t"
  154. " jz 1f \n\t"
  155. "movq (%1), %%mm0 \n\t"
  156. "movq (%2), %%mm1 \n\t"
  157. "movq 8(%1), %%mm2 \n\t"
  158. "movq 8(%2), %%mm3 \n\t"
  159. "addl %4, %1 \n\t"
  160. "addl $16, %2 \n\t"
  161. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  162. "movq %%mm4, (%3) \n\t"
  163. "movq %%mm5, 8(%3) \n\t"
  164. "addl %5, %3 \n\t"
  165. "decl %0 \n\t"
  166. ".balign 8 \n\t"
  167. "1: \n\t"
  168. "movq (%1), %%mm0 \n\t"
  169. "movq (%2), %%mm1 \n\t"
  170. "movq 8(%1), %%mm2 \n\t"
  171. "movq 8(%2), %%mm3 \n\t"
  172. "addl %4, %1 \n\t"
  173. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  174. "movq %%mm4, (%3) \n\t"
  175. "movq %%mm5, 8(%3) \n\t"
  176. "addl %5, %3 \n\t"
  177. "movq (%1), %%mm0 \n\t"
  178. "movq 16(%2), %%mm1 \n\t"
  179. "movq 8(%1), %%mm2 \n\t"
  180. "movq 24(%2), %%mm3 \n\t"
  181. "addl %4, %1 \n\t"
  182. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  183. "movq %%mm4, (%3) \n\t"
  184. "movq %%mm5, 8(%3) \n\t"
  185. "addl %5, %3 \n\t"
  186. "addl $32, %2 \n\t"
  187. "subl $2, %0 \n\t"
  188. "jnz 1b \n\t"
  189. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  190. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  191. #else
  192. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  193. #endif
  194. :"S"(src1Stride), "D"(dstStride)
  195. :"memory");
  196. }
  197. static void DEF(put, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  198. {
  199. MOVQ_BFE(mm6);
  200. __asm __volatile(
  201. "lea (%3, %3), %%eax \n\t"
  202. "movq (%1), %%mm0 \n\t"
  203. ".balign 8 \n\t"
  204. "1: \n\t"
  205. "movq (%1, %3), %%mm1 \n\t"
  206. "movq (%1, %%eax),%%mm2 \n\t"
  207. PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
  208. "movq %%mm4, (%2) \n\t"
  209. "movq %%mm5, (%2, %3) \n\t"
  210. "addl %%eax, %1 \n\t"
  211. "addl %%eax, %2 \n\t"
  212. "movq (%1, %3), %%mm1 \n\t"
  213. "movq (%1, %%eax),%%mm0 \n\t"
  214. PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
  215. "movq %%mm4, (%2) \n\t"
  216. "movq %%mm5, (%2, %3) \n\t"
  217. "addl %%eax, %1 \n\t"
  218. "addl %%eax, %2 \n\t"
  219. "subl $4, %0 \n\t"
  220. "jnz 1b \n\t"
  221. :"+g"(h), "+S"(pixels), "+D"(block)
  222. :"r"(line_size)
  223. :"eax", "memory");
  224. }
  225. static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  226. {
  227. MOVQ_ZERO(mm7);
  228. SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
  229. __asm __volatile(
  230. "movq (%1), %%mm0 \n\t"
  231. "movq 1(%1), %%mm4 \n\t"
  232. "movq %%mm0, %%mm1 \n\t"
  233. "movq %%mm4, %%mm5 \n\t"
  234. "punpcklbw %%mm7, %%mm0 \n\t"
  235. "punpcklbw %%mm7, %%mm4 \n\t"
  236. "punpckhbw %%mm7, %%mm1 \n\t"
  237. "punpckhbw %%mm7, %%mm5 \n\t"
  238. "paddusw %%mm0, %%mm4 \n\t"
  239. "paddusw %%mm1, %%mm5 \n\t"
  240. "xorl %%eax, %%eax \n\t"
  241. "addl %3, %1 \n\t"
  242. ".balign 8 \n\t"
  243. "1: \n\t"
  244. "movq (%1, %%eax), %%mm0 \n\t"
  245. "movq 1(%1, %%eax), %%mm2 \n\t"
  246. "movq %%mm0, %%mm1 \n\t"
  247. "movq %%mm2, %%mm3 \n\t"
  248. "punpcklbw %%mm7, %%mm0 \n\t"
  249. "punpcklbw %%mm7, %%mm2 \n\t"
  250. "punpckhbw %%mm7, %%mm1 \n\t"
  251. "punpckhbw %%mm7, %%mm3 \n\t"
  252. "paddusw %%mm2, %%mm0 \n\t"
  253. "paddusw %%mm3, %%mm1 \n\t"
  254. "paddusw %%mm6, %%mm4 \n\t"
  255. "paddusw %%mm6, %%mm5 \n\t"
  256. "paddusw %%mm0, %%mm4 \n\t"
  257. "paddusw %%mm1, %%mm5 \n\t"
  258. "psrlw $2, %%mm4 \n\t"
  259. "psrlw $2, %%mm5 \n\t"
  260. "packuswb %%mm5, %%mm4 \n\t"
  261. "movq %%mm4, (%2, %%eax) \n\t"
  262. "addl %3, %%eax \n\t"
  263. "movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
  264. "movq 1(%1, %%eax), %%mm4 \n\t"
  265. "movq %%mm2, %%mm3 \n\t"
  266. "movq %%mm4, %%mm5 \n\t"
  267. "punpcklbw %%mm7, %%mm2 \n\t"
  268. "punpcklbw %%mm7, %%mm4 \n\t"
  269. "punpckhbw %%mm7, %%mm3 \n\t"
  270. "punpckhbw %%mm7, %%mm5 \n\t"
  271. "paddusw %%mm2, %%mm4 \n\t"
  272. "paddusw %%mm3, %%mm5 \n\t"
  273. "paddusw %%mm6, %%mm0 \n\t"
  274. "paddusw %%mm6, %%mm1 \n\t"
  275. "paddusw %%mm4, %%mm0 \n\t"
  276. "paddusw %%mm5, %%mm1 \n\t"
  277. "psrlw $2, %%mm0 \n\t"
  278. "psrlw $2, %%mm1 \n\t"
  279. "packuswb %%mm1, %%mm0 \n\t"
  280. "movq %%mm0, (%2, %%eax) \n\t"
  281. "addl %3, %%eax \n\t"
  282. "subl $2, %0 \n\t"
  283. "jnz 1b \n\t"
  284. :"+g"(h), "+S"(pixels)
  285. :"D"(block), "r"(line_size)
  286. :"eax", "memory");
  287. }
  288. // avg_pixels
  289. // in case more speed is needed - unroling would certainly help
  290. static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  291. {
  292. MOVQ_BFE(mm6);
  293. JUMPALIGN();
  294. do {
  295. __asm __volatile(
  296. "movq %0, %%mm0 \n\t"
  297. "movq %1, %%mm1 \n\t"
  298. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  299. "movq %%mm2, %0 \n\t"
  300. :"+m"(*block)
  301. :"m"(*pixels)
  302. :"memory");
  303. pixels += line_size;
  304. block += line_size;
  305. }
  306. while (--h);
  307. }
  308. static void DEF(avg, pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  309. {
  310. MOVQ_BFE(mm6);
  311. JUMPALIGN();
  312. do {
  313. __asm __volatile(
  314. "movq %0, %%mm0 \n\t"
  315. "movq %1, %%mm1 \n\t"
  316. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  317. "movq %%mm2, %0 \n\t"
  318. "movq 8%0, %%mm0 \n\t"
  319. "movq 8%1, %%mm1 \n\t"
  320. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  321. "movq %%mm2, 8%0 \n\t"
  322. :"+m"(*block)
  323. :"m"(*pixels)
  324. :"memory");
  325. pixels += line_size;
  326. block += line_size;
  327. }
  328. while (--h);
  329. }
  330. static void DEF(avg, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  331. {
  332. MOVQ_BFE(mm6);
  333. JUMPALIGN();
  334. do {
  335. __asm __volatile(
  336. "movq %1, %%mm0 \n\t"
  337. "movq 1%1, %%mm1 \n\t"
  338. "movq %0, %%mm3 \n\t"
  339. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  340. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  341. "movq %%mm0, %0 \n\t"
  342. :"+m"(*block)
  343. :"m"(*pixels)
  344. :"memory");
  345. pixels += line_size;
  346. block += line_size;
  347. } while (--h);
  348. }
  349. static void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  350. {
  351. MOVQ_BFE(mm6);
  352. JUMPALIGN();
  353. do {
  354. __asm __volatile(
  355. "movq %1, %%mm0 \n\t"
  356. "movq %2, %%mm1 \n\t"
  357. "movq %0, %%mm3 \n\t"
  358. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  359. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  360. "movq %%mm0, %0 \n\t"
  361. :"+m"(*dst)
  362. :"m"(*src1), "m"(*src2)
  363. :"memory");
  364. dst += dstStride;
  365. src1 += src1Stride;
  366. src2 += 8;
  367. } while (--h);
  368. }
  369. static void DEF(avg, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  370. {
  371. MOVQ_BFE(mm6);
  372. JUMPALIGN();
  373. do {
  374. __asm __volatile(
  375. "movq %1, %%mm0 \n\t"
  376. "movq 1%1, %%mm1 \n\t"
  377. "movq %0, %%mm3 \n\t"
  378. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  379. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  380. "movq %%mm0, %0 \n\t"
  381. "movq 8%1, %%mm0 \n\t"
  382. "movq 9%1, %%mm1 \n\t"
  383. "movq 8%0, %%mm3 \n\t"
  384. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  385. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  386. "movq %%mm0, 8%0 \n\t"
  387. :"+m"(*block)
  388. :"m"(*pixels)
  389. :"memory");
  390. pixels += line_size;
  391. block += line_size;
  392. } while (--h);
  393. }
  394. static void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  395. {
  396. MOVQ_BFE(mm6);
  397. JUMPALIGN();
  398. do {
  399. __asm __volatile(
  400. "movq %1, %%mm0 \n\t"
  401. "movq %2, %%mm1 \n\t"
  402. "movq %0, %%mm3 \n\t"
  403. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  404. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  405. "movq %%mm0, %0 \n\t"
  406. "movq 8%1, %%mm0 \n\t"
  407. "movq 8%2, %%mm1 \n\t"
  408. "movq 8%0, %%mm3 \n\t"
  409. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  410. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  411. "movq %%mm0, 8%0 \n\t"
  412. :"+m"(*dst)
  413. :"m"(*src1), "m"(*src2)
  414. :"memory");
  415. dst += dstStride;
  416. src1 += src1Stride;
  417. src2 += 16;
  418. } while (--h);
  419. }
  420. static void DEF(avg, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  421. {
  422. MOVQ_BFE(mm6);
  423. __asm __volatile(
  424. "lea (%3, %3), %%eax \n\t"
  425. "movq (%1), %%mm0 \n\t"
  426. ".balign 8 \n\t"
  427. "1: \n\t"
  428. "movq (%1, %3), %%mm1 \n\t"
  429. "movq (%1, %%eax), %%mm2 \n\t"
  430. PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
  431. "movq (%2), %%mm3 \n\t"
  432. PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
  433. "movq (%2, %3), %%mm3 \n\t"
  434. PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
  435. "movq %%mm0, (%2) \n\t"
  436. "movq %%mm1, (%2, %3) \n\t"
  437. "addl %%eax, %1 \n\t"
  438. "addl %%eax, %2 \n\t"
  439. "movq (%1, %3), %%mm1 \n\t"
  440. "movq (%1, %%eax), %%mm0 \n\t"
  441. PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
  442. "movq (%2), %%mm3 \n\t"
  443. PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
  444. "movq (%2, %3), %%mm3 \n\t"
  445. PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
  446. "movq %%mm2, (%2) \n\t"
  447. "movq %%mm1, (%2, %3) \n\t"
  448. "addl %%eax, %1 \n\t"
  449. "addl %%eax, %2 \n\t"
  450. "subl $4, %0 \n\t"
  451. "jnz 1b \n\t"
  452. :"+g"(h), "+S"(pixels), "+D"(block)
  453. :"r"(line_size)
  454. :"eax", "memory");
  455. }
  456. // this routine is 'slightly' suboptimal but mostly unused
  457. static void DEF(avg, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  458. {
  459. MOVQ_ZERO(mm7);
  460. SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
  461. __asm __volatile(
  462. "movq (%1), %%mm0 \n\t"
  463. "movq 1(%1), %%mm4 \n\t"
  464. "movq %%mm0, %%mm1 \n\t"
  465. "movq %%mm4, %%mm5 \n\t"
  466. "punpcklbw %%mm7, %%mm0 \n\t"
  467. "punpcklbw %%mm7, %%mm4 \n\t"
  468. "punpckhbw %%mm7, %%mm1 \n\t"
  469. "punpckhbw %%mm7, %%mm5 \n\t"
  470. "paddusw %%mm0, %%mm4 \n\t"
  471. "paddusw %%mm1, %%mm5 \n\t"
  472. "xorl %%eax, %%eax \n\t"
  473. "addl %3, %1 \n\t"
  474. ".balign 8 \n\t"
  475. "1: \n\t"
  476. "movq (%1, %%eax), %%mm0 \n\t"
  477. "movq 1(%1, %%eax), %%mm2 \n\t"
  478. "movq %%mm0, %%mm1 \n\t"
  479. "movq %%mm2, %%mm3 \n\t"
  480. "punpcklbw %%mm7, %%mm0 \n\t"
  481. "punpcklbw %%mm7, %%mm2 \n\t"
  482. "punpckhbw %%mm7, %%mm1 \n\t"
  483. "punpckhbw %%mm7, %%mm3 \n\t"
  484. "paddusw %%mm2, %%mm0 \n\t"
  485. "paddusw %%mm3, %%mm1 \n\t"
  486. "paddusw %%mm6, %%mm4 \n\t"
  487. "paddusw %%mm6, %%mm5 \n\t"
  488. "paddusw %%mm0, %%mm4 \n\t"
  489. "paddusw %%mm1, %%mm5 \n\t"
  490. "psrlw $2, %%mm4 \n\t"
  491. "psrlw $2, %%mm5 \n\t"
  492. "movq (%2, %%eax), %%mm3 \n\t"
  493. "packuswb %%mm5, %%mm4 \n\t"
  494. "pcmpeqd %%mm2, %%mm2 \n\t"
  495. "paddb %%mm2, %%mm2 \n\t"
  496. PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
  497. "movq %%mm5, (%2, %%eax) \n\t"
  498. "addl %3, %%eax \n\t"
  499. "movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
  500. "movq 1(%1, %%eax), %%mm4 \n\t"
  501. "movq %%mm2, %%mm3 \n\t"
  502. "movq %%mm4, %%mm5 \n\t"
  503. "punpcklbw %%mm7, %%mm2 \n\t"
  504. "punpcklbw %%mm7, %%mm4 \n\t"
  505. "punpckhbw %%mm7, %%mm3 \n\t"
  506. "punpckhbw %%mm7, %%mm5 \n\t"
  507. "paddusw %%mm2, %%mm4 \n\t"
  508. "paddusw %%mm3, %%mm5 \n\t"
  509. "paddusw %%mm6, %%mm0 \n\t"
  510. "paddusw %%mm6, %%mm1 \n\t"
  511. "paddusw %%mm4, %%mm0 \n\t"
  512. "paddusw %%mm5, %%mm1 \n\t"
  513. "psrlw $2, %%mm0 \n\t"
  514. "psrlw $2, %%mm1 \n\t"
  515. "movq (%2, %%eax), %%mm3 \n\t"
  516. "packuswb %%mm1, %%mm0 \n\t"
  517. "pcmpeqd %%mm2, %%mm2 \n\t"
  518. "paddb %%mm2, %%mm2 \n\t"
  519. PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
  520. "movq %%mm1, (%2, %%eax) \n\t"
  521. "addl %3, %%eax \n\t"
  522. "subl $2, %0 \n\t"
  523. "jnz 1b \n\t"
  524. :"+g"(h), "+S"(pixels)
  525. :"D"(block), "r"(line_size)
  526. :"eax", "memory");
  527. }
  528. //FIXME optimize
  529. static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  530. DEF(put, pixels8_y2)(block , pixels , line_size, h);
  531. DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
  532. }
  533. static void DEF(put, pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  534. DEF(put, pixels8_xy2)(block , pixels , line_size, h);
  535. DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
  536. }
  537. static void DEF(avg, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  538. DEF(avg, pixels8_y2)(block , pixels , line_size, h);
  539. DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
  540. }
  541. static void DEF(avg, pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  542. DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
  543. DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
  544. }