You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

591 lines
16KB

  1. /*
  2. * DSP utils mmx functions are compiled twice for rnd/no_rnd
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
  5. *
  6. * This library is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2 of the License, or (at your option) any later version.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  19. *
  20. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  21. * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  22. * and improved by Zdenek Kabelac <kabi@users.sf.net>
  23. */
  24. // put_pixels
  /*
   * put pixels8_x2: for every row, combine the 8 source bytes at offset 0
   * with the 8 source bytes at offset 1 (PAVGBP) and store the result to
   * block.
   *
   * block     - destination; rows are line_size bytes apart
   * pixels    - source; 9 bytes per row are read (offsets 0..8)
   * line_size - byte stride used for both source and destination
   * h         - row count; four rows are handled per loop pass and the loop
   *             only exits when the counter reaches exactly 0, so h is
   *             expected to be a positive multiple of 4
   *
   * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   *
   * NOTE(review): MOVQ_BFE and PAVGBP are macros defined outside this view.
   * Presumably MOVQ_BFE loads a byte mask into mm6 and PAVGBP produces two
   * byte-wise averages (with or without rounding, depending on which of the
   * rnd/no_rnd builds this is) - confirm against their definitions.
   */
  25. static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  26. {
  27. MOVQ_BFE(mm6);
  28. __asm __volatile(
  29. "lea (%3, %3), %%"REG_a" \n\t" // REG_a = 2 * line_size
  30. ".balign 8 \n\t"
  31. "1: \n\t"
  32. "movq (%1), %%mm0 \n\t" // row 0, bytes 0..7
  33. "movq 1(%1), %%mm1 \n\t" // row 0, bytes 1..8
  34. "movq (%1, %3), %%mm2 \n\t" // row 1, bytes 0..7
  35. "movq 1(%1, %3), %%mm3 \n\t" // row 1, bytes 1..8
  36. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  37. "movq %%mm4, (%2) \n\t" // store row 0
  38. "movq %%mm5, (%2, %3) \n\t" // store row 1
  39. "add %%"REG_a", %1 \n\t" // advance source by two rows
  40. "add %%"REG_a", %2 \n\t" // advance destination by two rows
  41. "movq (%1), %%mm0 \n\t" // same again for rows 2 and 3
  42. "movq 1(%1), %%mm1 \n\t"
  43. "movq (%1, %3), %%mm2 \n\t"
  44. "movq 1(%1, %3), %%mm3 \n\t"
  45. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  46. "movq %%mm4, (%2) \n\t"
  47. "movq %%mm5, (%2, %3) \n\t"
  48. "add %%"REG_a", %1 \n\t"
  49. "add %%"REG_a", %2 \n\t"
  50. "subl $4, %0 \n\t" // four rows done in this pass
  51. "jnz 1b \n\t"
  52. :"+g"(h), "+S"(pixels), "+D"(block)
  53. :"r"((long)line_size)
  54. :REG_a, "memory");
  55. }
  /*
   * put pixels8_l2: combine 8-byte rows from two sources (PAVGB / PAVGBP)
   * and store the result to dst.
   *
   * dst        - destination; advanced by dstStride per row
   * src1       - first source; advanced by src1Stride per row
   * src2       - second source; read as tightly packed rows, 8 bytes apart
   *              (offsets 0, 8, 16, 24, then the pointer moves on by 32)
   * dstStride  - byte stride of dst
   * src1Stride - byte stride of src1
   * h          - row count. If h is odd, one row is handled by the prologue
   *              before the loop. The loop then handles four rows per pass
   *              and only exits when the counter reaches exactly 0; it is
   *              entered without a zero check, so the count left after the
   *              prologue must be a non-zero multiple of 4.
   *
   * asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *               %4 = src1Stride, %5 = dstStride.
   *
   * NOTE(review): MOVQ_BFE, PAVGB and PAVGBP are macros defined outside this
   * view; presumably mm6 holds the byte mask PAVGB needs - confirm.
   */
  56. static void attribute_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  57. {
  58. MOVQ_BFE(mm6);
  59. __asm __volatile(
  60. "testl $1, %0 \n\t" // odd row count?
  61. " jz 1f \n\t" // even: go straight to the main loop
  62. "movq (%1), %%mm0 \n\t" // prologue: one single row
  63. "movq (%2), %%mm1 \n\t"
  64. "add %4, %1 \n\t"
  65. "add $8, %2 \n\t"
  66. PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
  67. "movq %%mm4, (%3) \n\t"
  68. "add %5, %3 \n\t"
  69. "decl %0 \n\t" // the count is even from here on
  70. ".balign 8 \n\t"
  71. "1: \n\t"
  72. "movq (%1), %%mm0 \n\t" // rows 0 and 1 of this pass
  73. "movq (%2), %%mm1 \n\t"
  74. "add %4, %1 \n\t"
  75. "movq (%1), %%mm2 \n\t"
  76. "movq 8(%2), %%mm3 \n\t"
  77. "add %4, %1 \n\t"
  78. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  79. "movq %%mm4, (%3) \n\t"
  80. "add %5, %3 \n\t"
  81. "movq %%mm5, (%3) \n\t"
  82. "add %5, %3 \n\t"
  83. "movq (%1), %%mm0 \n\t" // rows 2 and 3 of this pass
  84. "movq 16(%2), %%mm1 \n\t"
  85. "add %4, %1 \n\t"
  86. "movq (%1), %%mm2 \n\t"
  87. "movq 24(%2), %%mm3 \n\t"
  88. "add %4, %1 \n\t"
  89. "add $32, %2 \n\t" // four 8-byte src2 rows consumed
  90. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  91. "movq %%mm4, (%3) \n\t"
  92. "add %5, %3 \n\t"
  93. "movq %%mm5, (%3) \n\t"
  94. "add %5, %3 \n\t"
  95. "subl $4, %0 \n\t"
  96. "jnz 1b \n\t"
  /* Under PIC the counter lives in memory instead of the b register.
     NOTE(review): presumably because that register is reserved in PIC code - confirm. */
  97. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  98. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  99. #else
  100. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  101. #endif
  102. :"S"((long)src1Stride), "D"((long)dstStride)
  103. :"memory");
  104. }
  /*
   * put pixels16_x2: 16-byte-wide version of the x2 routine. For every row
   * the bytes at offsets 0..15 are combined with the bytes at offsets 1..16
   * (PAVGBP), processed as two 8-byte halves, and stored to block.
   *
   * block     - destination; rows are line_size bytes apart
   * pixels    - source; 17 bytes per row are read (offsets 0..16)
   * line_size - byte stride used for both source and destination
   * h         - row count; four rows are handled per loop pass and the loop
   *             only exits when the counter reaches exactly 0, so h is
   *             expected to be a positive multiple of 4
   *
   * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   *
   * NOTE(review): MOVQ_BFE and PAVGBP are macros defined outside this view;
   * their exact arithmetic should be confirmed against their definitions.
   */
  105. static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  106. {
  107. MOVQ_BFE(mm6);
  108. __asm __volatile(
  109. "lea (%3, %3), %%"REG_a" \n\t" // REG_a = 2 * line_size
  110. ".balign 8 \n\t"
  111. "1: \n\t"
  112. "movq (%1), %%mm0 \n\t" // rows 0 and 1, left half
  113. "movq 1(%1), %%mm1 \n\t"
  114. "movq (%1, %3), %%mm2 \n\t"
  115. "movq 1(%1, %3), %%mm3 \n\t"
  116. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  117. "movq %%mm4, (%2) \n\t"
  118. "movq %%mm5, (%2, %3) \n\t"
  119. "movq 8(%1), %%mm0 \n\t" // rows 0 and 1, right half
  120. "movq 9(%1), %%mm1 \n\t"
  121. "movq 8(%1, %3), %%mm2 \n\t"
  122. "movq 9(%1, %3), %%mm3 \n\t"
  123. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  124. "movq %%mm4, 8(%2) \n\t"
  125. "movq %%mm5, 8(%2, %3) \n\t"
  126. "add %%"REG_a", %1 \n\t" // advance both pointers by two rows
  127. "add %%"REG_a", %2 \n\t"
  128. "movq (%1), %%mm0 \n\t" // rows 2 and 3, left half
  129. "movq 1(%1), %%mm1 \n\t"
  130. "movq (%1, %3), %%mm2 \n\t"
  131. "movq 1(%1, %3), %%mm3 \n\t"
  132. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  133. "movq %%mm4, (%2) \n\t"
  134. "movq %%mm5, (%2, %3) \n\t"
  135. "movq 8(%1), %%mm0 \n\t" // rows 2 and 3, right half
  136. "movq 9(%1), %%mm1 \n\t"
  137. "movq 8(%1, %3), %%mm2 \n\t"
  138. "movq 9(%1, %3), %%mm3 \n\t"
  139. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  140. "movq %%mm4, 8(%2) \n\t"
  141. "movq %%mm5, 8(%2, %3) \n\t"
  142. "add %%"REG_a", %1 \n\t"
  143. "add %%"REG_a", %2 \n\t"
  144. "subl $4, %0 \n\t" // four rows done in this pass
  145. "jnz 1b \n\t"
  146. :"+g"(h), "+S"(pixels), "+D"(block)
  147. :"r"((long)line_size)
  148. :REG_a, "memory");
  149. }
  /*
   * put pixels16_l2: combine 16-byte rows from two sources (PAVGBP) and
   * store the result to dst. Each row is handled as two 8-byte halves.
   *
   * dst        - destination; advanced by dstStride per row
   * src1       - first source; advanced by src1Stride per row
   * src2       - second source; read as tightly packed rows, 16 bytes apart
   *              (offsets 0/8 for one row, 16/24 for the next, then the
   *              pointer moves on by 32)
   * dstStride  - byte stride of dst
   * src1Stride - byte stride of src1
   * h          - row count. If h is odd, one row is handled by the prologue
   *              before the loop. The loop then handles two rows per pass
   *              and only exits when the counter reaches exactly 0; it is
   *              entered without a zero check, so the count left after the
   *              prologue must be a non-zero multiple of 2.
   *
   * asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *               %4 = src1Stride, %5 = dstStride.
   *
   * NOTE(review): MOVQ_BFE and PAVGBP are macros defined outside this view;
   * their exact arithmetic should be confirmed against their definitions.
   */
  150. static void attribute_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  151. {
  152. MOVQ_BFE(mm6);
  153. __asm __volatile(
  154. "testl $1, %0 \n\t" // odd row count?
  155. " jz 1f \n\t" // even: go straight to the main loop
  156. "movq (%1), %%mm0 \n\t" // prologue: one single 16-byte row
  157. "movq (%2), %%mm1 \n\t"
  158. "movq 8(%1), %%mm2 \n\t"
  159. "movq 8(%2), %%mm3 \n\t"
  160. "add %4, %1 \n\t"
  161. "add $16, %2 \n\t"
  162. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  163. "movq %%mm4, (%3) \n\t"
  164. "movq %%mm5, 8(%3) \n\t"
  165. "add %5, %3 \n\t"
  166. "decl %0 \n\t" // the count is even from here on
  167. ".balign 8 \n\t"
  168. "1: \n\t"
  169. "movq (%1), %%mm0 \n\t" // first row of this pass
  170. "movq (%2), %%mm1 \n\t"
  171. "movq 8(%1), %%mm2 \n\t"
  172. "movq 8(%2), %%mm3 \n\t"
  173. "add %4, %1 \n\t"
  174. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  175. "movq %%mm4, (%3) \n\t"
  176. "movq %%mm5, 8(%3) \n\t"
  177. "add %5, %3 \n\t"
  178. "movq (%1), %%mm0 \n\t" // second row of this pass
  179. "movq 16(%2), %%mm1 \n\t"
  180. "movq 8(%1), %%mm2 \n\t"
  181. "movq 24(%2), %%mm3 \n\t"
  182. "add %4, %1 \n\t"
  183. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  184. "movq %%mm4, (%3) \n\t"
  185. "movq %%mm5, 8(%3) \n\t"
  186. "add %5, %3 \n\t"
  187. "add $32, %2 \n\t" // two 16-byte src2 rows consumed
  188. "subl $2, %0 \n\t"
  189. "jnz 1b \n\t"
  /* Under PIC the counter lives in memory instead of the b register.
     NOTE(review): presumably because that register is reserved in PIC code - confirm. */
  190. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  191. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  192. #else
  193. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  194. #endif
  195. :"S"((long)src1Stride), "D"((long)dstStride)
  196. :"memory");
  197. }
  /*
   * put pixels8_y2: for every output row, combine an 8-byte source row with
   * the source row directly below it (PAVGBP) and store the result to block.
   *
   * block     - destination; rows are line_size bytes apart
   * pixels    - source; rows 0..h are read (one more row than is written)
   * line_size - byte stride used for both source and destination
   * h         - row count; four rows are handled per loop pass and the loop
   *             only exits when the counter reaches exactly 0, so h is
   *             expected to be a positive multiple of 4
   *
   * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   *
   * The most recently loaded source row is carried over between steps
   * instead of being reloaded: it sits in mm0 at the top of the loop and in
   * mm2 between the two halves of a pass.
   *
   * NOTE(review): MOVQ_BFE and PAVGBP are macros defined outside this view;
   * their exact arithmetic should be confirmed against their definitions.
   */
  198. static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  199. {
  200. MOVQ_BFE(mm6);
  201. __asm __volatile(
  202. "lea (%3, %3), %%"REG_a" \n\t" // REG_a = 2 * line_size
  203. "movq (%1), %%mm0 \n\t" // preload source row 0
  204. ".balign 8 \n\t"
  205. "1: \n\t"
  206. "movq (%1, %3), %%mm1 \n\t" // source row +1
  207. "movq (%1, %%"REG_a"),%%mm2 \n\t" // source row +2
  208. PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
  209. "movq %%mm4, (%2) \n\t" // rows 0/+1 combined
  210. "movq %%mm5, (%2, %3) \n\t" // rows +1/+2 combined
  211. "add %%"REG_a", %1 \n\t"
  212. "add %%"REG_a", %2 \n\t"
  213. "movq (%1, %3), %%mm1 \n\t" // second half: mm2 is now the carried row
  214. "movq (%1, %%"REG_a"),%%mm0 \n\t" // and mm0 receives the newest row
  215. PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
  216. "movq %%mm4, (%2) \n\t"
  217. "movq %%mm5, (%2, %3) \n\t"
  218. "add %%"REG_a", %1 \n\t"
  219. "add %%"REG_a", %2 \n\t"
  220. "subl $4, %0 \n\t" // four rows done in this pass
  221. "jnz 1b \n\t"
  222. :"+g"(h), "+S"(pixels), "+D"(block)
  223. :"r"((long)line_size)
  224. :REG_a, "memory");
  225. }
  /*
   * put pixels8_xy2: every output byte is built from a 2x2 neighbourhood of
   * source bytes: (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + mm6) >> 2,
   * computed on 16-bit words and packed back to bytes.
   *
   * block     - destination; rows are line_size bytes apart
   * pixels    - source; 9 bytes per row and rows 0..h are read
   * line_size - byte stride used for both source and destination
   * h         - row count; two rows are handled per loop pass and the loop
   *             only exits when the counter reaches exactly 0, so h is
   *             expected to be a positive multiple of 2
   *
   * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * REG_a is the running byte offset of the current row. pixels is advanced
   * by one row before the loop, so (%1, REG_a) addresses the source row
   * below the destination row (%2, REG_a).
   *
   * Register roles: mm7 is the unpack partner for punpck*bw, mm6 is the
   * rounding term. The horizontal pair sums of the previous source row are
   * kept in mm4/mm5 (low/high four pixels) or mm0/mm1, alternating between
   * the two halves of a pass.
   *
   * NOTE(review): MOVQ_ZERO and SET_RND are macros defined outside this view;
   * presumably MOVQ_ZERO clears mm7 so that the unpacks zero-extend bytes to
   * words - confirm.
   */
  226. static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  227. {
  228. MOVQ_ZERO(mm7);
  229. SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
  230. __asm __volatile(
  231. "movq (%1), %%mm0 \n\t" // row 0, bytes 0..7
  232. "movq 1(%1), %%mm4 \n\t" // row 0, bytes 1..8
  233. "movq %%mm0, %%mm1 \n\t"
  234. "movq %%mm4, %%mm5 \n\t"
  235. "punpcklbw %%mm7, %%mm0 \n\t" // widen low four bytes to words
  236. "punpcklbw %%mm7, %%mm4 \n\t"
  237. "punpckhbw %%mm7, %%mm1 \n\t" // widen high four bytes to words
  238. "punpckhbw %%mm7, %%mm5 \n\t"
  239. "paddusw %%mm0, %%mm4 \n\t" // mm4/mm5 = horizontal pair sums of row 0
  240. "paddusw %%mm1, %%mm5 \n\t"
  241. "xor %%"REG_a", %%"REG_a" \n\t" // row offset = 0
  242. "add %3, %1 \n\t" // source now points one row below destination
  243. ".balign 8 \n\t"
  244. "1: \n\t"
  245. "movq (%1, %%"REG_a"), %%mm0 \n\t" // next source row
  246. "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
  247. "movq %%mm0, %%mm1 \n\t"
  248. "movq %%mm2, %%mm3 \n\t"
  249. "punpcklbw %%mm7, %%mm0 \n\t"
  250. "punpcklbw %%mm7, %%mm2 \n\t"
  251. "punpckhbw %%mm7, %%mm1 \n\t"
  252. "punpckhbw %%mm7, %%mm3 \n\t"
  253. "paddusw %%mm2, %%mm0 \n\t" // mm0/mm1 = pair sums of this row
  254. "paddusw %%mm3, %%mm1 \n\t"
  255. "paddusw %%mm6, %%mm4 \n\t" // previous-row sums + rounding term
  256. "paddusw %%mm6, %%mm5 \n\t"
  257. "paddusw %%mm0, %%mm4 \n\t" // + this row's sums
  258. "paddusw %%mm1, %%mm5 \n\t"
  259. "psrlw $2, %%mm4 \n\t" // divide by four
  260. "psrlw $2, %%mm5 \n\t"
  261. "packuswb %%mm5, %%mm4 \n\t" // back to eight bytes
  262. "movq %%mm4, (%2, %%"REG_a") \n\t"
  263. "add %3, %%"REG_a" \n\t" // next row
  264. "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
  265. "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
  266. "movq %%mm2, %%mm3 \n\t"
  267. "movq %%mm4, %%mm5 \n\t"
  268. "punpcklbw %%mm7, %%mm2 \n\t"
  269. "punpcklbw %%mm7, %%mm4 \n\t"
  270. "punpckhbw %%mm7, %%mm3 \n\t"
  271. "punpckhbw %%mm7, %%mm5 \n\t"
  272. "paddusw %%mm2, %%mm4 \n\t" // mm4/mm5 = pair sums, carried into the next pass
  273. "paddusw %%mm3, %%mm5 \n\t"
  274. "paddusw %%mm6, %%mm0 \n\t"
  275. "paddusw %%mm6, %%mm1 \n\t"
  276. "paddusw %%mm4, %%mm0 \n\t"
  277. "paddusw %%mm5, %%mm1 \n\t"
  278. "psrlw $2, %%mm0 \n\t"
  279. "psrlw $2, %%mm1 \n\t"
  280. "packuswb %%mm1, %%mm0 \n\t"
  281. "movq %%mm0, (%2, %%"REG_a") \n\t"
  282. "add %3, %%"REG_a" \n\t"
  283. "subl $2, %0 \n\t" // two rows done in this pass
  284. "jnz 1b \n\t"
  285. :"+g"(h), "+S"(pixels)
  286. :"D"(block), "r"((long)line_size)
  287. :REG_a, "memory");
  288. }
  289. // avg_pixels
  /*
   * avg pixels4: for every row, combine the 4 bytes already in block with
   * the 4 bytes from pixels (PAVGB) and write the result back to block.
   *
   * block     - destination, read and written; advanced by line_size per row
   * pixels    - source; advanced by line_size per row
   * line_size - byte stride used for both pointers
   * h         - row count; must be at least 1, because the loop body runs
   *             before the counter is decremented and tested
   *
   * asm operands: %0 = *block (read/write memory), %1 = *pixels (memory).
   * The operands are declared as single bytes although 4 bytes are moved;
   * the "memory" clobber is listed as well.
   *
   * NOTE(review): MOVQ_BFE, JUMPALIGN and PAVGB are macros defined outside
   * this view; presumably mm6 holds the byte mask PAVGB needs - confirm.
   */
  290. static void attribute_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  291. {
  292. MOVQ_BFE(mm6);
  293. JUMPALIGN();
  294. do {
  295. __asm __volatile(
  296. "movd %0, %%mm0 \n\t" // current destination bytes
  297. "movd %1, %%mm1 \n\t" // source bytes
  298. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  299. "movd %%mm2, %0 \n\t" // write the combined bytes back
  300. :"+m"(*block)
  301. :"m"(*pixels)
  302. :"memory");
  303. pixels += line_size;
  304. block += line_size;
  305. }
  306. while (--h);
  307. }
  308. // in case more speed is needed - unrolling would certainly help
  /*
   * avg pixels8: for every row, combine the 8 bytes already in block with
   * the 8 bytes from pixels (PAVGB) and write the result back to block.
   *
   * block     - destination, read and written; advanced by line_size per row
   * pixels    - source; advanced by line_size per row
   * line_size - byte stride used for both pointers
   * h         - row count; must be at least 1, because the loop body runs
   *             before the counter is decremented and tested
   *
   * asm operands: %0 = *block (read/write memory), %1 = *pixels (memory).
   * The operands are declared as single bytes although 8 bytes are moved;
   * the "memory" clobber is listed as well.
   *
   * NOTE(review): MOVQ_BFE, JUMPALIGN and PAVGB are macros defined outside
   * this view; presumably mm6 holds the byte mask PAVGB needs - confirm.
   */
  309. static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  310. {
  311. MOVQ_BFE(mm6);
  312. JUMPALIGN();
  313. do {
  314. __asm __volatile(
  315. "movq %0, %%mm0 \n\t" // current destination bytes
  316. "movq %1, %%mm1 \n\t" // source bytes
  317. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  318. "movq %%mm2, %0 \n\t" // write the combined bytes back
  319. :"+m"(*block)
  320. :"m"(*pixels)
  321. :"memory");
  322. pixels += line_size;
  323. block += line_size;
  324. }
  325. while (--h);
  326. }
  /*
   * avg pixels16: for every row, combine the 16 bytes already in block with
   * the 16 bytes from pixels (PAVGB) and write the result back to block.
   * Each row is handled as two 8-byte halves.
   *
   * block     - destination, read and written; advanced by line_size per row
   * pixels    - source; advanced by line_size per row
   * line_size - byte stride used for both pointers
   * h         - row count; must be at least 1, because the loop body runs
   *             before the counter is decremented and tested
   *
   * asm operands: %0 = *block (read/write memory), %1 = *pixels (memory).
   * "8%0" / "8%1" put a displacement of 8 in front of the operand text to
   * reach the second half of the row.
   * NOTE(review): this relies on the compiler printing the memory operand in
   * a form that accepts a leading displacement - confirm for the toolchains
   * in use.
   *
   * NOTE(review): MOVQ_BFE, JUMPALIGN and PAVGB are macros defined outside
   * this view; presumably mm6 holds the byte mask PAVGB needs - confirm.
   */
  327. static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  328. {
  329. MOVQ_BFE(mm6);
  330. JUMPALIGN();
  331. do {
  332. __asm __volatile(
  333. "movq %0, %%mm0 \n\t" // left half: destination bytes 0..7
  334. "movq %1, %%mm1 \n\t" // left half: source bytes 0..7
  335. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  336. "movq %%mm2, %0 \n\t"
  337. "movq 8%0, %%mm0 \n\t" // right half: destination bytes 8..15
  338. "movq 8%1, %%mm1 \n\t" // right half: source bytes 8..15
  339. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  340. "movq %%mm2, 8%0 \n\t"
  341. :"+m"(*block)
  342. :"m"(*pixels)
  343. :"memory");
  344. pixels += line_size;
  345. block += line_size;
  346. }
  347. while (--h);
  348. }
  /*
   * avg pixels8_x2: for every row, first combine the 8 source bytes at
   * offset 0 with the 8 source bytes at offset 1 (PAVGB), then combine that
   * intermediate result with the 8 bytes already in block (PAVGB again) and
   * write it back to block.
   *
   * block     - destination, read and written; advanced by line_size per row
   * pixels    - source; 9 bytes per row are read (offsets 0..8)
   * line_size - byte stride used for both pointers
   * h         - row count; must be at least 1, because the loop body runs
   *             before the counter is decremented and tested
   *
   * asm operands: %0 = *block (read/write memory), %1 = *pixels (memory).
   * "1%1" puts a displacement of 1 in front of the operand text.
   *
   * NOTE(review): MOVQ_BFE, JUMPALIGN and PAVGB are macros defined outside
   * this view; presumably mm6 holds the byte mask PAVGB needs - confirm.
   */
  349. static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  350. {
  351. MOVQ_BFE(mm6);
  352. JUMPALIGN();
  353. do {
  354. __asm __volatile(
  355. "movq %1, %%mm0 \n\t" // source bytes 0..7
  356. "movq 1%1, %%mm1 \n\t" // source bytes 1..8
  357. "movq %0, %%mm3 \n\t" // current destination bytes
  358. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  359. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  360. "movq %%mm0, %0 \n\t"
  361. :"+m"(*block)
  362. :"m"(*pixels)
  363. :"memory");
  364. pixels += line_size;
  365. block += line_size;
  366. } while (--h);
  367. }
  /*
   * avg pixels8_l2: for every row, first combine 8 bytes of src1 with
   * 8 bytes of src2 (PAVGB), then combine that intermediate result with the
   * 8 bytes already in dst (PAVGB again) and write it back to dst.
   *
   * dst        - destination, read and written; advanced by dstStride per row
   * src1       - first source; advanced by src1Stride per row
   * src2       - second source; advanced by a fixed 8 bytes per row
   * dstStride  - byte stride of dst
   * src1Stride - byte stride of src1
   * h          - row count; must be at least 1, because the loop body runs
   *              before the counter is decremented and tested
   *
   * asm operands: %0 = *dst (read/write memory), %1 = *src1, %2 = *src2.
   *
   * NOTE(review): MOVQ_BFE, JUMPALIGN and PAVGB are macros defined outside
   * this view; presumably mm6 holds the byte mask PAVGB needs - confirm.
   */
  368. static __attribute__((unused)) void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  369. {
  370. MOVQ_BFE(mm6);
  371. JUMPALIGN();
  372. do {
  373. __asm __volatile(
  374. "movq %1, %%mm0 \n\t" // src1 row
  375. "movq %2, %%mm1 \n\t" // src2 row
  376. "movq %0, %%mm3 \n\t" // current destination row
  377. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  378. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  379. "movq %%mm0, %0 \n\t"
  380. :"+m"(*dst)
  381. :"m"(*src1), "m"(*src2)
  382. :"memory");
  383. dst += dstStride;
  384. src1 += src1Stride;
  385. src2 += 8;
  386. } while (--h);
  387. }
  /*
   * avg pixels16_x2: 16-byte-wide version of the avg x2 routine. For every
   * row and for each 8-byte half, the source bytes are combined with the
   * source bytes one position to the right (PAVGB), that intermediate result
   * is combined with the bytes already in block (PAVGB again) and written
   * back to block.
   *
   * block     - destination, read and written; advanced by line_size per row
   * pixels    - source; 17 bytes per row are read (offsets 0..16)
   * line_size - byte stride used for both pointers
   * h         - row count; must be at least 1, because the loop body runs
   *             before the counter is decremented and tested
   *
   * asm operands: %0 = *block (read/write memory), %1 = *pixels (memory).
   * The digits in front of %0 / %1 are displacements prepended to the
   * operand text.
   *
   * NOTE(review): MOVQ_BFE, JUMPALIGN and PAVGB are macros defined outside
   * this view; presumably mm6 holds the byte mask PAVGB needs - confirm.
   */
  388. static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  389. {
  390. MOVQ_BFE(mm6);
  391. JUMPALIGN();
  392. do {
  393. __asm __volatile(
  394. "movq %1, %%mm0 \n\t" // left half: source bytes 0..7
  395. "movq 1%1, %%mm1 \n\t" // left half: source bytes 1..8
  396. "movq %0, %%mm3 \n\t" // left half: destination bytes
  397. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  398. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  399. "movq %%mm0, %0 \n\t"
  400. "movq 8%1, %%mm0 \n\t" // right half: source bytes 8..15
  401. "movq 9%1, %%mm1 \n\t" // right half: source bytes 9..16
  402. "movq 8%0, %%mm3 \n\t" // right half: destination bytes
  403. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  404. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  405. "movq %%mm0, 8%0 \n\t"
  406. :"+m"(*block)
  407. :"m"(*pixels)
  408. :"memory");
  409. pixels += line_size;
  410. block += line_size;
  411. } while (--h);
  412. }
  /*
   * avg pixels16_l2: for every row and for each 8-byte half, first combine
   * src1 with src2 (PAVGB), then combine that intermediate result with the
   * bytes already in dst (PAVGB again) and write it back to dst.
   *
   * dst        - destination, read and written; advanced by dstStride per row
   * src1       - first source; advanced by src1Stride per row
   * src2       - second source; advanced by a fixed 16 bytes per row
   * dstStride  - byte stride of dst
   * src1Stride - byte stride of src1
   * h          - row count; must be at least 1, because the loop body runs
   *              before the counter is decremented and tested
   *
   * asm operands: %0 = *dst (read/write memory), %1 = *src1, %2 = *src2.
   * The "8" in front of an operand is a displacement prepended to the
   * operand text to reach the second half of the row.
   *
   * NOTE(review): MOVQ_BFE, JUMPALIGN and PAVGB are macros defined outside
   * this view; presumably mm6 holds the byte mask PAVGB needs - confirm.
   */
  413. static __attribute__((unused)) void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  414. {
  415. MOVQ_BFE(mm6);
  416. JUMPALIGN();
  417. do {
  418. __asm __volatile(
  419. "movq %1, %%mm0 \n\t" // left half
  420. "movq %2, %%mm1 \n\t"
  421. "movq %0, %%mm3 \n\t"
  422. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  423. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  424. "movq %%mm0, %0 \n\t"
  425. "movq 8%1, %%mm0 \n\t" // right half
  426. "movq 8%2, %%mm1 \n\t"
  427. "movq 8%0, %%mm3 \n\t"
  428. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  429. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  430. "movq %%mm0, 8%0 \n\t"
  431. :"+m"(*dst)
  432. :"m"(*src1), "m"(*src2)
  433. :"memory");
  434. dst += dstStride;
  435. src1 += src1Stride;
  436. src2 += 16;
  437. } while (--h);
  438. }
  /*
   * avg pixels8_y2: for every output row, combine an 8-byte source row with
   * the source row directly below it (PAVGBP), then combine that
   * intermediate result with the 8 bytes already in block (PAVGB) and write
   * it back to block.
   *
   * block     - destination, read and written; rows are line_size bytes apart
   * pixels    - source; rows 0..h are read (one more row than is written)
   * line_size - byte stride used for both source and destination
   * h         - row count; four rows are handled per loop pass and the loop
   *             only exits when the counter reaches exactly 0, so h is
   *             expected to be a positive multiple of 4
   *
   * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   *
   * The most recently loaded source row is carried over between steps
   * instead of being reloaded: it sits in mm0 at the top of the loop and in
   * mm2 between the two halves of a pass. For that reason the first half
   * writes its results via mm0/mm1 only after mm0's old contents have been
   * consumed, and the second half uses mm2/mm1 so that mm0 survives.
   *
   * NOTE(review): MOVQ_BFE, PAVGB and PAVGBP are macros defined outside this
   * view; presumably mm6 holds the byte mask they need - confirm.
   */
  439. static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  440. {
  441. MOVQ_BFE(mm6);
  442. __asm __volatile(
  443. "lea (%3, %3), %%"REG_a" \n\t" // REG_a = 2 * line_size
  444. "movq (%1), %%mm0 \n\t" // preload source row 0
  445. ".balign 8 \n\t"
  446. "1: \n\t"
  447. "movq (%1, %3), %%mm1 \n\t" // source row +1
  448. "movq (%1, %%"REG_a"), %%mm2 \n\t" // source row +2
  449. PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
  450. "movq (%2), %%mm3 \n\t" // existing destination row 0
  451. PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
  452. "movq (%2, %3), %%mm3 \n\t" // existing destination row 1
  453. PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
  454. "movq %%mm0, (%2) \n\t"
  455. "movq %%mm1, (%2, %3) \n\t"
  456. "add %%"REG_a", %1 \n\t"
  457. "add %%"REG_a", %2 \n\t"
  458. "movq (%1, %3), %%mm1 \n\t" // second half: mm2 is now the carried row
  459. "movq (%1, %%"REG_a"), %%mm0 \n\t" // and mm0 receives the newest row
  460. PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
  461. "movq (%2), %%mm3 \n\t"
  462. PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
  463. "movq (%2, %3), %%mm3 \n\t"
  464. PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
  465. "movq %%mm2, (%2) \n\t"
  466. "movq %%mm1, (%2, %3) \n\t"
  467. "add %%"REG_a", %1 \n\t"
  468. "add %%"REG_a", %2 \n\t"
  469. "subl $4, %0 \n\t" // four rows done in this pass
  470. "jnz 1b \n\t"
  471. :"+g"(h), "+S"(pixels), "+D"(block)
  472. :"r"((long)line_size)
  473. :REG_a, "memory");
  474. }
  475. // this routine is 'slightly' suboptimal but mostly unused
  /*
   * avg pixels8_xy2: every intermediate byte is built from a 2x2
   * neighbourhood of source bytes:
   * (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + mm6) >> 2,
   * computed on 16-bit words and packed back to bytes. The intermediate row
   * is then combined with the 8 bytes already in block (PAVGB) and written
   * back to block.
   *
   * block     - destination, read and written; rows are line_size bytes apart
   * pixels    - source; 9 bytes per row and rows 0..h are read
   * line_size - byte stride used for both source and destination
   * h         - row count; two rows are handled per loop pass and the loop
   *             only exits when the counter reaches exactly 0, so h is
   *             expected to be a positive multiple of 2
   *
   * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * REG_a is the running byte offset of the current row. pixels is advanced
   * by one row before the loop, so (%1, REG_a) addresses the source row
   * below the destination row (%2, REG_a).
   *
   * Register roles: mm7 is the unpack partner for punpck*bw, mm6 is the
   * rounding term. Because mm6 is taken, the byte mask handed to PAVGB is
   * rebuilt in mm2 before each use: pcmpeqd sets all bits and paddb doubles
   * every byte, leaving 0xFE in each byte.
   *
   * NOTE(review): MOVQ_ZERO, SET_RND and PAVGB are macros defined outside
   * this view; presumably MOVQ_ZERO clears mm7 so that the unpacks
   * zero-extend bytes to words - confirm.
   */
  476. static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  477. {
  478. MOVQ_ZERO(mm7);
  479. SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
  480. __asm __volatile(
  481. "movq (%1), %%mm0 \n\t" // row 0, bytes 0..7
  482. "movq 1(%1), %%mm4 \n\t" // row 0, bytes 1..8
  483. "movq %%mm0, %%mm1 \n\t"
  484. "movq %%mm4, %%mm5 \n\t"
  485. "punpcklbw %%mm7, %%mm0 \n\t" // widen low four bytes to words
  486. "punpcklbw %%mm7, %%mm4 \n\t"
  487. "punpckhbw %%mm7, %%mm1 \n\t" // widen high four bytes to words
  488. "punpckhbw %%mm7, %%mm5 \n\t"
  489. "paddusw %%mm0, %%mm4 \n\t" // mm4/mm5 = horizontal pair sums of row 0
  490. "paddusw %%mm1, %%mm5 \n\t"
  491. "xor %%"REG_a", %%"REG_a" \n\t" // row offset = 0
  492. "add %3, %1 \n\t" // source now points one row below destination
  493. ".balign 8 \n\t"
  494. "1: \n\t"
  495. "movq (%1, %%"REG_a"), %%mm0 \n\t" // next source row
  496. "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
  497. "movq %%mm0, %%mm1 \n\t"
  498. "movq %%mm2, %%mm3 \n\t"
  499. "punpcklbw %%mm7, %%mm0 \n\t"
  500. "punpcklbw %%mm7, %%mm2 \n\t"
  501. "punpckhbw %%mm7, %%mm1 \n\t"
  502. "punpckhbw %%mm7, %%mm3 \n\t"
  503. "paddusw %%mm2, %%mm0 \n\t" // mm0/mm1 = pair sums of this row
  504. "paddusw %%mm3, %%mm1 \n\t"
  505. "paddusw %%mm6, %%mm4 \n\t" // previous-row sums + rounding term
  506. "paddusw %%mm6, %%mm5 \n\t"
  507. "paddusw %%mm0, %%mm4 \n\t" // + this row's sums
  508. "paddusw %%mm1, %%mm5 \n\t"
  509. "psrlw $2, %%mm4 \n\t" // divide by four
  510. "psrlw $2, %%mm5 \n\t"
  511. "movq (%2, %%"REG_a"), %%mm3 \n\t" // existing destination row
  512. "packuswb %%mm5, %%mm4 \n\t" // back to eight bytes
  513. "pcmpeqd %%mm2, %%mm2 \n\t" // mm2 = all ones
  514. "paddb %%mm2, %%mm2 \n\t" // mm2 = 0xFE in every byte
  515. PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
  516. "movq %%mm5, (%2, %%"REG_a") \n\t"
  517. "add %3, %%"REG_a" \n\t" // next row
  518. "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
  519. "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
  520. "movq %%mm2, %%mm3 \n\t"
  521. "movq %%mm4, %%mm5 \n\t"
  522. "punpcklbw %%mm7, %%mm2 \n\t"
  523. "punpcklbw %%mm7, %%mm4 \n\t"
  524. "punpckhbw %%mm7, %%mm3 \n\t"
  525. "punpckhbw %%mm7, %%mm5 \n\t"
  526. "paddusw %%mm2, %%mm4 \n\t" // mm4/mm5 = pair sums, carried into the next pass
  527. "paddusw %%mm3, %%mm5 \n\t"
  528. "paddusw %%mm6, %%mm0 \n\t"
  529. "paddusw %%mm6, %%mm1 \n\t"
  530. "paddusw %%mm4, %%mm0 \n\t"
  531. "paddusw %%mm5, %%mm1 \n\t"
  532. "psrlw $2, %%mm0 \n\t"
  533. "psrlw $2, %%mm1 \n\t"
  534. "movq (%2, %%"REG_a"), %%mm3 \n\t" // existing destination row
  535. "packuswb %%mm1, %%mm0 \n\t"
  536. "pcmpeqd %%mm2, %%mm2 \n\t" // rebuild the 0xFE byte mask
  537. "paddb %%mm2, %%mm2 \n\t"
  538. PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
  539. "movq %%mm1, (%2, %%"REG_a") \n\t"
  540. "add %3, %%"REG_a" \n\t"
  541. "subl $2, %0 \n\t" // two rows done in this pass
  542. "jnz 1b \n\t"
  543. :"+g"(h), "+S"(pixels)
  544. :"D"(block), "r"((long)line_size)
  545. :REG_a, "memory");
  546. }
  547. //FIXME optimize
  548. static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  549. DEF(put, pixels8_y2)(block , pixels , line_size, h);
  550. DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
  551. }
  552. static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  553. DEF(put, pixels8_xy2)(block , pixels , line_size, h);
  554. DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
  555. }
  556. static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  557. DEF(avg, pixels8_y2)(block , pixels , line_size, h);
  558. DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
  559. }
  560. static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  561. DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
  562. DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
  563. }