You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

572 lines
16KB

  1. /*
  2. * DSP utils mmx functions are compiled twice for rnd/no_rnd
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
  5. *
  6. * This library is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2 of the License, or (at your option) any later version.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  19. *
  20. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  21. * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  22. * and improved by Zdenek Kabelac <kabi@users.sf.net>
  23. */
  24. // put_pixels
  25. static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  26. {
  27. MOVQ_BFE(mm6);
  28. __asm __volatile(
  29. "lea (%3, %3), %%eax \n\t"
  30. ".balign 8 \n\t"
  31. "1: \n\t"
  32. "movq (%1), %%mm0 \n\t"
  33. "movq 1(%1), %%mm1 \n\t"
  34. "movq (%1, %3), %%mm2 \n\t"
  35. "movq 1(%1, %3), %%mm3 \n\t"
  36. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  37. "movq %%mm4, (%2) \n\t"
  38. "movq %%mm5, (%2, %3) \n\t"
  39. "addl %%eax, %1 \n\t"
  40. "addl %%eax, %2 \n\t"
  41. "movq (%1), %%mm0 \n\t"
  42. "movq 1(%1), %%mm1 \n\t"
  43. "movq (%1, %3), %%mm2 \n\t"
  44. "movq 1(%1, %3), %%mm3 \n\t"
  45. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  46. "movq %%mm4, (%2) \n\t"
  47. "movq %%mm5, (%2, %3) \n\t"
  48. "addl %%eax, %1 \n\t"
  49. "addl %%eax, %2 \n\t"
  50. "subl $4, %0 \n\t"
  51. "jnz 1b \n\t"
  52. :"+g"(h), "+S"(pixels), "+D"(block)
  53. :"r"(line_size)
  54. :"eax", "memory");
  55. }
  56. static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  57. {
  58. MOVQ_BFE(mm6);
  59. __asm __volatile(
  60. "testl $1, %0 \n\t"
  61. " jz 1f \n\t"
  62. "movq (%1), %%mm0 \n\t"
  63. "movq (%2), %%mm1 \n\t"
  64. "addl %4, %1 \n\t"
  65. "addl $8, %2 \n\t"
  66. PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
  67. "movq %%mm4, (%3) \n\t"
  68. "addl %5, %3 \n\t"
  69. "decl %0 \n\t"
  70. ".balign 8 \n\t"
  71. "1: \n\t"
  72. "movq (%1), %%mm0 \n\t"
  73. "movq (%2), %%mm1 \n\t"
  74. "addl %4, %1 \n\t"
  75. "movq (%1), %%mm2 \n\t"
  76. "movq 8(%2), %%mm3 \n\t"
  77. "addl %4, %1 \n\t"
  78. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  79. "movq %%mm4, (%3) \n\t"
  80. "addl %5, %3 \n\t"
  81. "movq %%mm5, (%3) \n\t"
  82. "addl %5, %3 \n\t"
  83. "movq (%1), %%mm0 \n\t"
  84. "movq 16(%2), %%mm1 \n\t"
  85. "addl %4, %1 \n\t"
  86. "movq (%1), %%mm2 \n\t"
  87. "movq 24(%2), %%mm3 \n\t"
  88. "addl %4, %1 \n\t"
  89. "addl $32, %2 \n\t"
  90. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  91. "movq %%mm4, (%3) \n\t"
  92. "addl %5, %3 \n\t"
  93. "movq %%mm5, (%3) \n\t"
  94. "addl %5, %3 \n\t"
  95. "subl $4, %0 \n\t"
  96. "jnz 1b \n\t"
  97. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  98. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  99. #else
  100. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  101. #endif
  102. :"S"(src1Stride), "D"(dstStride)
  103. :"memory");
  104. }
  105. static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  106. {
  107. MOVQ_BFE(mm6);
  108. __asm __volatile(
  109. "lea (%3, %3), %%eax \n\t"
  110. ".balign 8 \n\t"
  111. "1: \n\t"
  112. "movq (%1), %%mm0 \n\t"
  113. "movq 1(%1), %%mm1 \n\t"
  114. "movq (%1, %3), %%mm2 \n\t"
  115. "movq 1(%1, %3), %%mm3 \n\t"
  116. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  117. "movq %%mm4, (%2) \n\t"
  118. "movq %%mm5, (%2, %3) \n\t"
  119. "movq 8(%1), %%mm0 \n\t"
  120. "movq 9(%1), %%mm1 \n\t"
  121. "movq 8(%1, %3), %%mm2 \n\t"
  122. "movq 9(%1, %3), %%mm3 \n\t"
  123. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  124. "movq %%mm4, 8(%2) \n\t"
  125. "movq %%mm5, 8(%2, %3) \n\t"
  126. "addl %%eax, %1 \n\t"
  127. "addl %%eax, %2 \n\t"
  128. "movq (%1), %%mm0 \n\t"
  129. "movq 1(%1), %%mm1 \n\t"
  130. "movq (%1, %3), %%mm2 \n\t"
  131. "movq 1(%1, %3), %%mm3 \n\t"
  132. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  133. "movq %%mm4, (%2) \n\t"
  134. "movq %%mm5, (%2, %3) \n\t"
  135. "movq 8(%1), %%mm0 \n\t"
  136. "movq 9(%1), %%mm1 \n\t"
  137. "movq 8(%1, %3), %%mm2 \n\t"
  138. "movq 9(%1, %3), %%mm3 \n\t"
  139. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  140. "movq %%mm4, 8(%2) \n\t"
  141. "movq %%mm5, 8(%2, %3) \n\t"
  142. "addl %%eax, %1 \n\t"
  143. "addl %%eax, %2 \n\t"
  144. "subl $4, %0 \n\t"
  145. "jnz 1b \n\t"
  146. :"+g"(h), "+S"(pixels), "+D"(block)
  147. :"r"(line_size)
  148. :"eax", "memory");
  149. }
  150. static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  151. {
  152. MOVQ_BFE(mm6);
  153. __asm __volatile(
  154. "testl $1, %0 \n\t"
  155. " jz 1f \n\t"
  156. "movq (%1), %%mm0 \n\t"
  157. "movq (%2), %%mm1 \n\t"
  158. "movq 8(%1), %%mm2 \n\t"
  159. "movq 8(%2), %%mm3 \n\t"
  160. "addl %4, %1 \n\t"
  161. "addl $16, %2 \n\t"
  162. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  163. "movq %%mm4, (%3) \n\t"
  164. "movq %%mm5, 8(%3) \n\t"
  165. "addl %5, %3 \n\t"
  166. "decl %0 \n\t"
  167. ".balign 8 \n\t"
  168. "1: \n\t"
  169. "movq (%1), %%mm0 \n\t"
  170. "movq (%2), %%mm1 \n\t"
  171. "movq 8(%1), %%mm2 \n\t"
  172. "movq 8(%2), %%mm3 \n\t"
  173. "addl %4, %1 \n\t"
  174. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  175. "movq %%mm4, (%3) \n\t"
  176. "movq %%mm5, 8(%3) \n\t"
  177. "addl %5, %3 \n\t"
  178. "movq (%1), %%mm0 \n\t"
  179. "movq 16(%2), %%mm1 \n\t"
  180. "movq 8(%1), %%mm2 \n\t"
  181. "movq 24(%2), %%mm3 \n\t"
  182. "addl %4, %1 \n\t"
  183. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  184. "movq %%mm4, (%3) \n\t"
  185. "movq %%mm5, 8(%3) \n\t"
  186. "addl %5, %3 \n\t"
  187. "addl $32, %2 \n\t"
  188. "subl $2, %0 \n\t"
  189. "jnz 1b \n\t"
  190. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  191. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  192. #else
  193. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  194. #endif
  195. :"S"(src1Stride), "D"(dstStride)
  196. :"memory");
  197. }
  198. static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  199. {
  200. MOVQ_BFE(mm6);
  201. __asm __volatile(
  202. "lea (%3, %3), %%eax \n\t"
  203. "movq (%1), %%mm0 \n\t"
  204. ".balign 8 \n\t"
  205. "1: \n\t"
  206. "movq (%1, %3), %%mm1 \n\t"
  207. "movq (%1, %%eax),%%mm2 \n\t"
  208. PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
  209. "movq %%mm4, (%2) \n\t"
  210. "movq %%mm5, (%2, %3) \n\t"
  211. "addl %%eax, %1 \n\t"
  212. "addl %%eax, %2 \n\t"
  213. "movq (%1, %3), %%mm1 \n\t"
  214. "movq (%1, %%eax),%%mm0 \n\t"
  215. PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
  216. "movq %%mm4, (%2) \n\t"
  217. "movq %%mm5, (%2, %3) \n\t"
  218. "addl %%eax, %1 \n\t"
  219. "addl %%eax, %2 \n\t"
  220. "subl $4, %0 \n\t"
  221. "jnz 1b \n\t"
  222. :"+g"(h), "+S"(pixels), "+D"(block)
  223. :"r"(line_size)
  224. :"eax", "memory");
  225. }
  226. static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  227. {
  228. MOVQ_ZERO(mm7);
  229. SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
  230. __asm __volatile(
  231. "movq (%1), %%mm0 \n\t"
  232. "movq 1(%1), %%mm4 \n\t"
  233. "movq %%mm0, %%mm1 \n\t"
  234. "movq %%mm4, %%mm5 \n\t"
  235. "punpcklbw %%mm7, %%mm0 \n\t"
  236. "punpcklbw %%mm7, %%mm4 \n\t"
  237. "punpckhbw %%mm7, %%mm1 \n\t"
  238. "punpckhbw %%mm7, %%mm5 \n\t"
  239. "paddusw %%mm0, %%mm4 \n\t"
  240. "paddusw %%mm1, %%mm5 \n\t"
  241. "xorl %%eax, %%eax \n\t"
  242. "addl %3, %1 \n\t"
  243. ".balign 8 \n\t"
  244. "1: \n\t"
  245. "movq (%1, %%eax), %%mm0 \n\t"
  246. "movq 1(%1, %%eax), %%mm2 \n\t"
  247. "movq %%mm0, %%mm1 \n\t"
  248. "movq %%mm2, %%mm3 \n\t"
  249. "punpcklbw %%mm7, %%mm0 \n\t"
  250. "punpcklbw %%mm7, %%mm2 \n\t"
  251. "punpckhbw %%mm7, %%mm1 \n\t"
  252. "punpckhbw %%mm7, %%mm3 \n\t"
  253. "paddusw %%mm2, %%mm0 \n\t"
  254. "paddusw %%mm3, %%mm1 \n\t"
  255. "paddusw %%mm6, %%mm4 \n\t"
  256. "paddusw %%mm6, %%mm5 \n\t"
  257. "paddusw %%mm0, %%mm4 \n\t"
  258. "paddusw %%mm1, %%mm5 \n\t"
  259. "psrlw $2, %%mm4 \n\t"
  260. "psrlw $2, %%mm5 \n\t"
  261. "packuswb %%mm5, %%mm4 \n\t"
  262. "movq %%mm4, (%2, %%eax) \n\t"
  263. "addl %3, %%eax \n\t"
  264. "movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
  265. "movq 1(%1, %%eax), %%mm4 \n\t"
  266. "movq %%mm2, %%mm3 \n\t"
  267. "movq %%mm4, %%mm5 \n\t"
  268. "punpcklbw %%mm7, %%mm2 \n\t"
  269. "punpcklbw %%mm7, %%mm4 \n\t"
  270. "punpckhbw %%mm7, %%mm3 \n\t"
  271. "punpckhbw %%mm7, %%mm5 \n\t"
  272. "paddusw %%mm2, %%mm4 \n\t"
  273. "paddusw %%mm3, %%mm5 \n\t"
  274. "paddusw %%mm6, %%mm0 \n\t"
  275. "paddusw %%mm6, %%mm1 \n\t"
  276. "paddusw %%mm4, %%mm0 \n\t"
  277. "paddusw %%mm5, %%mm1 \n\t"
  278. "psrlw $2, %%mm0 \n\t"
  279. "psrlw $2, %%mm1 \n\t"
  280. "packuswb %%mm1, %%mm0 \n\t"
  281. "movq %%mm0, (%2, %%eax) \n\t"
  282. "addl %3, %%eax \n\t"
  283. "subl $2, %0 \n\t"
  284. "jnz 1b \n\t"
  285. :"+g"(h), "+S"(pixels)
  286. :"D"(block), "r"(line_size)
  287. :"eax", "memory");
  288. }
  289. // avg_pixels
  290. // in case more speed is needed - unroling would certainly help
  291. static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  292. {
  293. MOVQ_BFE(mm6);
  294. JUMPALIGN();
  295. do {
  296. __asm __volatile(
  297. "movq %0, %%mm0 \n\t"
  298. "movq %1, %%mm1 \n\t"
  299. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  300. "movq %%mm2, %0 \n\t"
  301. :"+m"(*block)
  302. :"m"(*pixels)
  303. :"memory");
  304. pixels += line_size;
  305. block += line_size;
  306. }
  307. while (--h);
  308. }
  309. static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  310. {
  311. MOVQ_BFE(mm6);
  312. JUMPALIGN();
  313. do {
  314. __asm __volatile(
  315. "movq %0, %%mm0 \n\t"
  316. "movq %1, %%mm1 \n\t"
  317. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  318. "movq %%mm2, %0 \n\t"
  319. "movq 8%0, %%mm0 \n\t"
  320. "movq 8%1, %%mm1 \n\t"
  321. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  322. "movq %%mm2, 8%0 \n\t"
  323. :"+m"(*block)
  324. :"m"(*pixels)
  325. :"memory");
  326. pixels += line_size;
  327. block += line_size;
  328. }
  329. while (--h);
  330. }
  331. static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  332. {
  333. MOVQ_BFE(mm6);
  334. JUMPALIGN();
  335. do {
  336. __asm __volatile(
  337. "movq %1, %%mm0 \n\t"
  338. "movq 1%1, %%mm1 \n\t"
  339. "movq %0, %%mm3 \n\t"
  340. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  341. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  342. "movq %%mm0, %0 \n\t"
  343. :"+m"(*block)
  344. :"m"(*pixels)
  345. :"memory");
  346. pixels += line_size;
  347. block += line_size;
  348. } while (--h);
  349. }
  350. static void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  351. {
  352. MOVQ_BFE(mm6);
  353. JUMPALIGN();
  354. do {
  355. __asm __volatile(
  356. "movq %1, %%mm0 \n\t"
  357. "movq %2, %%mm1 \n\t"
  358. "movq %0, %%mm3 \n\t"
  359. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  360. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  361. "movq %%mm0, %0 \n\t"
  362. :"+m"(*dst)
  363. :"m"(*src1), "m"(*src2)
  364. :"memory");
  365. dst += dstStride;
  366. src1 += src1Stride;
  367. src2 += 8;
  368. } while (--h);
  369. }
  370. static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  371. {
  372. MOVQ_BFE(mm6);
  373. JUMPALIGN();
  374. do {
  375. __asm __volatile(
  376. "movq %1, %%mm0 \n\t"
  377. "movq 1%1, %%mm1 \n\t"
  378. "movq %0, %%mm3 \n\t"
  379. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  380. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  381. "movq %%mm0, %0 \n\t"
  382. "movq 8%1, %%mm0 \n\t"
  383. "movq 9%1, %%mm1 \n\t"
  384. "movq 8%0, %%mm3 \n\t"
  385. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  386. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  387. "movq %%mm0, 8%0 \n\t"
  388. :"+m"(*block)
  389. :"m"(*pixels)
  390. :"memory");
  391. pixels += line_size;
  392. block += line_size;
  393. } while (--h);
  394. }
  395. static void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  396. {
  397. MOVQ_BFE(mm6);
  398. JUMPALIGN();
  399. do {
  400. __asm __volatile(
  401. "movq %1, %%mm0 \n\t"
  402. "movq %2, %%mm1 \n\t"
  403. "movq %0, %%mm3 \n\t"
  404. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  405. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  406. "movq %%mm0, %0 \n\t"
  407. "movq 8%1, %%mm0 \n\t"
  408. "movq 8%2, %%mm1 \n\t"
  409. "movq 8%0, %%mm3 \n\t"
  410. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  411. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  412. "movq %%mm0, 8%0 \n\t"
  413. :"+m"(*dst)
  414. :"m"(*src1), "m"(*src2)
  415. :"memory");
  416. dst += dstStride;
  417. src1 += src1Stride;
  418. src2 += 16;
  419. } while (--h);
  420. }
  421. static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  422. {
  423. MOVQ_BFE(mm6);
  424. __asm __volatile(
  425. "lea (%3, %3), %%eax \n\t"
  426. "movq (%1), %%mm0 \n\t"
  427. ".balign 8 \n\t"
  428. "1: \n\t"
  429. "movq (%1, %3), %%mm1 \n\t"
  430. "movq (%1, %%eax), %%mm2 \n\t"
  431. PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
  432. "movq (%2), %%mm3 \n\t"
  433. PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
  434. "movq (%2, %3), %%mm3 \n\t"
  435. PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
  436. "movq %%mm0, (%2) \n\t"
  437. "movq %%mm1, (%2, %3) \n\t"
  438. "addl %%eax, %1 \n\t"
  439. "addl %%eax, %2 \n\t"
  440. "movq (%1, %3), %%mm1 \n\t"
  441. "movq (%1, %%eax), %%mm0 \n\t"
  442. PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
  443. "movq (%2), %%mm3 \n\t"
  444. PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
  445. "movq (%2, %3), %%mm3 \n\t"
  446. PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
  447. "movq %%mm2, (%2) \n\t"
  448. "movq %%mm1, (%2, %3) \n\t"
  449. "addl %%eax, %1 \n\t"
  450. "addl %%eax, %2 \n\t"
  451. "subl $4, %0 \n\t"
  452. "jnz 1b \n\t"
  453. :"+g"(h), "+S"(pixels), "+D"(block)
  454. :"r"(line_size)
  455. :"eax", "memory");
  456. }
  457. // this routine is 'slightly' suboptimal but mostly unused
  458. static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  459. {
  460. MOVQ_ZERO(mm7);
  461. SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
  462. __asm __volatile(
  463. "movq (%1), %%mm0 \n\t"
  464. "movq 1(%1), %%mm4 \n\t"
  465. "movq %%mm0, %%mm1 \n\t"
  466. "movq %%mm4, %%mm5 \n\t"
  467. "punpcklbw %%mm7, %%mm0 \n\t"
  468. "punpcklbw %%mm7, %%mm4 \n\t"
  469. "punpckhbw %%mm7, %%mm1 \n\t"
  470. "punpckhbw %%mm7, %%mm5 \n\t"
  471. "paddusw %%mm0, %%mm4 \n\t"
  472. "paddusw %%mm1, %%mm5 \n\t"
  473. "xorl %%eax, %%eax \n\t"
  474. "addl %3, %1 \n\t"
  475. ".balign 8 \n\t"
  476. "1: \n\t"
  477. "movq (%1, %%eax), %%mm0 \n\t"
  478. "movq 1(%1, %%eax), %%mm2 \n\t"
  479. "movq %%mm0, %%mm1 \n\t"
  480. "movq %%mm2, %%mm3 \n\t"
  481. "punpcklbw %%mm7, %%mm0 \n\t"
  482. "punpcklbw %%mm7, %%mm2 \n\t"
  483. "punpckhbw %%mm7, %%mm1 \n\t"
  484. "punpckhbw %%mm7, %%mm3 \n\t"
  485. "paddusw %%mm2, %%mm0 \n\t"
  486. "paddusw %%mm3, %%mm1 \n\t"
  487. "paddusw %%mm6, %%mm4 \n\t"
  488. "paddusw %%mm6, %%mm5 \n\t"
  489. "paddusw %%mm0, %%mm4 \n\t"
  490. "paddusw %%mm1, %%mm5 \n\t"
  491. "psrlw $2, %%mm4 \n\t"
  492. "psrlw $2, %%mm5 \n\t"
  493. "movq (%2, %%eax), %%mm3 \n\t"
  494. "packuswb %%mm5, %%mm4 \n\t"
  495. "pcmpeqd %%mm2, %%mm2 \n\t"
  496. "paddb %%mm2, %%mm2 \n\t"
  497. PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
  498. "movq %%mm5, (%2, %%eax) \n\t"
  499. "addl %3, %%eax \n\t"
  500. "movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
  501. "movq 1(%1, %%eax), %%mm4 \n\t"
  502. "movq %%mm2, %%mm3 \n\t"
  503. "movq %%mm4, %%mm5 \n\t"
  504. "punpcklbw %%mm7, %%mm2 \n\t"
  505. "punpcklbw %%mm7, %%mm4 \n\t"
  506. "punpckhbw %%mm7, %%mm3 \n\t"
  507. "punpckhbw %%mm7, %%mm5 \n\t"
  508. "paddusw %%mm2, %%mm4 \n\t"
  509. "paddusw %%mm3, %%mm5 \n\t"
  510. "paddusw %%mm6, %%mm0 \n\t"
  511. "paddusw %%mm6, %%mm1 \n\t"
  512. "paddusw %%mm4, %%mm0 \n\t"
  513. "paddusw %%mm5, %%mm1 \n\t"
  514. "psrlw $2, %%mm0 \n\t"
  515. "psrlw $2, %%mm1 \n\t"
  516. "movq (%2, %%eax), %%mm3 \n\t"
  517. "packuswb %%mm1, %%mm0 \n\t"
  518. "pcmpeqd %%mm2, %%mm2 \n\t"
  519. "paddb %%mm2, %%mm2 \n\t"
  520. PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
  521. "movq %%mm1, (%2, %%eax) \n\t"
  522. "addl %3, %%eax \n\t"
  523. "subl $2, %0 \n\t"
  524. "jnz 1b \n\t"
  525. :"+g"(h), "+S"(pixels)
  526. :"D"(block), "r"(line_size)
  527. :"eax", "memory");
  528. }
  529. //FIXME optimize
  530. static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  531. DEF(put, pixels8_y2)(block , pixels , line_size, h);
  532. DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
  533. }
  534. static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  535. DEF(put, pixels8_xy2)(block , pixels , line_size, h);
  536. DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
  537. }
  538. static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  539. DEF(avg, pixels8_y2)(block , pixels , line_size, h);
  540. DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
  541. }
  542. static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  543. DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
  544. DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
  545. }