You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

821 lines
23KB

  1. /*
  2. * DSP utils : average functions are compiled twice for 3dnow/mmx2
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. * Copyright (c) 2002-2004 Michael Niedermayer
  5. *
  6. * This library is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2 of the License, or (at your option) any later version.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  19. *
  20. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  21. * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  22. * and improved by Zdenek Kabelac <kabi@users.sf.net>
  23. */
  24. /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
  25. clobber bug - now it will work with 2.95.2 and also with -fPIC
  26. */
  /*
   * Horizontal half-pel put, 8 bytes wide: each output byte is
   * PAVGB(pixels[x], pixels[x + 1]).
   *
   * block     - destination, advanced by line_size per row
   * pixels    - source; bytes 0..8 of every row are read
   * line_size - stride in bytes, used for both source and destination
   * h         - row count; four rows are written per loop pass and the loop
   *             only exits when the count reaches exactly 0, so h is expected
   *             to be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (S register), %2 = block (D register),
   * %3 = line_size.  REG_a is scratch and is listed as clobbered.
   * PAVGB and REG_a are macros supplied by the including file; PAVGB is
   * presumably the rounding-up byte average (pavgb / pavgusb) - TODO confirm.
   */
  27. static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  28. {
  29. __asm __volatile(
  /* REG_a = 2 * line_size, the step applied after each pair of rows. */
  30. "lea (%3, %3), %%"REG_a" \n\t"
  31. "1: \n\t"
  /* Rows 0 and 1 of this pass: average each row with itself shifted by one byte. */
  32. "movq (%1), %%mm0 \n\t"
  33. "movq (%1, %3), %%mm1 \n\t"
  34. PAVGB" 1(%1), %%mm0 \n\t"
  35. PAVGB" 1(%1, %3), %%mm1 \n\t"
  36. "movq %%mm0, (%2) \n\t"
  37. "movq %%mm1, (%2, %3) \n\t"
  38. "add %%"REG_a", %1 \n\t"
  39. "add %%"REG_a", %2 \n\t"
  /* Rows 2 and 3 of this pass. */
  40. "movq (%1), %%mm0 \n\t"
  41. "movq (%1, %3), %%mm1 \n\t"
  42. PAVGB" 1(%1), %%mm0 \n\t"
  43. PAVGB" 1(%1, %3), %%mm1 \n\t"
  44. "add %%"REG_a", %1 \n\t"
  45. "movq %%mm0, (%2) \n\t"
  46. "movq %%mm1, (%2, %3) \n\t"
  47. "add %%"REG_a", %2 \n\t"
  /* Four rows done; repeat until the count is exactly 0. */
  48. "subl $4, %0 \n\t"
  49. "jnz 1b \n\t"
  50. :"+g"(h), "+S"(pixels), "+D"(block)
  51. :"r" ((long)line_size)
  52. :"%"REG_a, "memory");
  53. }
  /*
   * Store the byte-wise PAVGB of two sources into dst, 4 bytes per row.
   *
   * dst / dstStride   - destination and its stride in bytes
   * src1 / src1Stride - first source and its stride in bytes
   * src2              - second source, read as a packed buffer: it advances
   *                     by 4 bytes per row and has no stride argument
   * h                 - row count.  When h is odd one row is handled first;
   *                     the main loop is then entered unconditionally and
   *                     handles 4 rows per pass until the count is exactly 0,
   *                     so the remaining count must be a non-zero multiple
   *                     of 4
   *
   * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride,
   * %5 = dstStride.  PAVGB is a macro supplied by the including file
   * (presumably the rounding-up byte average - TODO confirm).
   */
  54. static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  55. {
  56. __asm __volatile(
  /* Odd row count: process a single row up front, then fall into the loop. */
  57. "testl $1, %0 \n\t"
  58. " jz 1f \n\t"
  59. "movd (%1), %%mm0 \n\t"
  60. "movd (%2), %%mm1 \n\t"
  61. "add %4, %1 \n\t"
  62. "add $4, %2 \n\t"
  63. PAVGB" %%mm1, %%mm0 \n\t"
  64. "movd %%mm0, (%3) \n\t"
  65. "add %5, %3 \n\t"
  66. "decl %0 \n\t"
  67. "1: \n\t"
  /* Rows 0 and 1 of this pass; src2 rows sit at offsets 0 and 4. */
  68. "movd (%1), %%mm0 \n\t"
  69. "add %4, %1 \n\t"
  70. "movd (%1), %%mm1 \n\t"
  71. "movd (%2), %%mm2 \n\t"
  72. "movd 4(%2), %%mm3 \n\t"
  73. "add %4, %1 \n\t"
  74. PAVGB" %%mm2, %%mm0 \n\t"
  75. PAVGB" %%mm3, %%mm1 \n\t"
  76. "movd %%mm0, (%3) \n\t"
  77. "add %5, %3 \n\t"
  78. "movd %%mm1, (%3) \n\t"
  79. "add %5, %3 \n\t"
  /* Rows 2 and 3 of this pass; src2 rows sit at offsets 8 and 12. */
  80. "movd (%1), %%mm0 \n\t"
  81. "add %4, %1 \n\t"
  82. "movd (%1), %%mm1 \n\t"
  83. "movd 8(%2), %%mm2 \n\t"
  84. "movd 12(%2), %%mm3 \n\t"
  85. "add %4, %1 \n\t"
  86. PAVGB" %%mm2, %%mm0 \n\t"
  87. PAVGB" %%mm3, %%mm1 \n\t"
  88. "movd %%mm0, (%3) \n\t"
  89. "add %5, %3 \n\t"
  90. "movd %%mm1, (%3) \n\t"
  91. "add %5, %3 \n\t"
  /* Four packed src2 rows consumed; loop until the count is exactly 0. */
  92. "add $16, %2 \n\t"
  93. "subl $4, %0 \n\t"
  94. "jnz 1b \n\t"
  /* Under PIC the row count is kept in memory instead of the b register
     (presumably because that register is reserved there - TODO confirm). */
  95. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  96. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  97. #else
  98. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  99. #endif
  100. :"S"((long)src1Stride), "D"((long)dstStride)
  101. :"memory");
  102. }
  /*
   * Store the byte-wise PAVGB of two sources into dst, 8 bytes per row.
   *
   * dst / dstStride   - destination and its stride in bytes
   * src1 / src1Stride - first source and its stride in bytes
   * src2              - second source, read as a packed buffer: it advances
   *                     by 8 bytes per row and has no stride argument
   * h                 - row count.  When h is odd one row is handled first;
   *                     the main loop is then entered unconditionally and
   *                     handles 4 rows per pass until the count is exactly 0,
   *                     so the remaining count must be a non-zero multiple
   *                     of 4
   *
   * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride,
   * %5 = dstStride.  PAVGB is a macro supplied by the including file
   * (presumably the rounding-up byte average - TODO confirm).
   */
  103. static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  104. {
  105. __asm __volatile(
  /* Odd row count: process a single row up front, then fall into the loop. */
  106. "testl $1, %0 \n\t"
  107. " jz 1f \n\t"
  108. "movq (%1), %%mm0 \n\t"
  109. "movq (%2), %%mm1 \n\t"
  110. "add %4, %1 \n\t"
  111. "add $8, %2 \n\t"
  112. PAVGB" %%mm1, %%mm0 \n\t"
  113. "movq %%mm0, (%3) \n\t"
  114. "add %5, %3 \n\t"
  115. "decl %0 \n\t"
  116. "1: \n\t"
  /* Rows 0 and 1 of this pass; src2 rows sit at offsets 0 and 8. */
  117. "movq (%1), %%mm0 \n\t"
  118. "add %4, %1 \n\t"
  119. "movq (%1), %%mm1 \n\t"
  120. "add %4, %1 \n\t"
  121. PAVGB" (%2), %%mm0 \n\t"
  122. PAVGB" 8(%2), %%mm1 \n\t"
  123. "movq %%mm0, (%3) \n\t"
  124. "add %5, %3 \n\t"
  125. "movq %%mm1, (%3) \n\t"
  126. "add %5, %3 \n\t"
  /* Rows 2 and 3 of this pass; src2 rows sit at offsets 16 and 24. */
  127. "movq (%1), %%mm0 \n\t"
  128. "add %4, %1 \n\t"
  129. "movq (%1), %%mm1 \n\t"
  130. "add %4, %1 \n\t"
  131. PAVGB" 16(%2), %%mm0 \n\t"
  132. PAVGB" 24(%2), %%mm1 \n\t"
  133. "movq %%mm0, (%3) \n\t"
  134. "add %5, %3 \n\t"
  135. "movq %%mm1, (%3) \n\t"
  136. "add %5, %3 \n\t"
  /* Four packed src2 rows consumed; loop until the count is exactly 0. */
  137. "add $32, %2 \n\t"
  138. "subl $4, %0 \n\t"
  139. "jnz 1b \n\t"
  /* Under PIC the row count is kept in memory instead of the b register
     (presumably because that register is reserved there - TODO confirm). */
  140. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  141. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  142. #else
  143. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  144. #endif
  145. :"S"((long)src1Stride), "D"((long)dstStride)
  146. :"memory");
  147. //the following should be used, though better not with gcc ...
  148. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  149. :"r"(src1Stride), "r"(dstStride)
  150. :"memory");*/
  151. }
  /*
   * No-rounding variant of the two-source put, 8 bytes per row.
   *
   * mm6 is set to all ones; both inputs are complemented (pxor with mm6)
   * before PAVGB and the result is complemented again afterwards.  If PAVGB
   * rounds the average up (presumably pavgb / pavgusb - TODO confirm), the
   * net effect is an average that rounds down.
   *
   * dst / dstStride   - destination and its stride in bytes
   * src1 / src1Stride - first source and its stride in bytes
   * src2              - second source, packed: advances by 8 bytes per row
   * h                 - row count.  When h is odd one row is handled first;
   *                     the main loop is then entered unconditionally and
   *                     handles 4 rows per pass until the count is exactly 0,
   *                     so the remaining count must be a non-zero multiple
   *                     of 4
   *
   * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride,
   * %5 = dstStride.
   */
  152. static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  153. {
  154. __asm __volatile(
  /* mm6 = all ones, used as the complement mask. */
  155. "pcmpeqb %%mm6, %%mm6 \n\t"
  /* Odd row count: process a single row up front, then fall into the loop. */
  156. "testl $1, %0 \n\t"
  157. " jz 1f \n\t"
  158. "movq (%1), %%mm0 \n\t"
  159. "movq (%2), %%mm1 \n\t"
  160. "add %4, %1 \n\t"
  161. "add $8, %2 \n\t"
  162. "pxor %%mm6, %%mm0 \n\t"
  163. "pxor %%mm6, %%mm1 \n\t"
  164. PAVGB" %%mm1, %%mm0 \n\t"
  165. "pxor %%mm6, %%mm0 \n\t"
  166. "movq %%mm0, (%3) \n\t"
  167. "add %5, %3 \n\t"
  168. "decl %0 \n\t"
  169. "1: \n\t"
  /* Rows 0 and 1 of this pass; src2 rows sit at offsets 0 and 8. */
  170. "movq (%1), %%mm0 \n\t"
  171. "add %4, %1 \n\t"
  172. "movq (%1), %%mm1 \n\t"
  173. "add %4, %1 \n\t"
  174. "movq (%2), %%mm2 \n\t"
  175. "movq 8(%2), %%mm3 \n\t"
  176. "pxor %%mm6, %%mm0 \n\t"
  177. "pxor %%mm6, %%mm1 \n\t"
  178. "pxor %%mm6, %%mm2 \n\t"
  179. "pxor %%mm6, %%mm3 \n\t"
  180. PAVGB" %%mm2, %%mm0 \n\t"
  181. PAVGB" %%mm3, %%mm1 \n\t"
  182. "pxor %%mm6, %%mm0 \n\t"
  183. "pxor %%mm6, %%mm1 \n\t"
  184. "movq %%mm0, (%3) \n\t"
  185. "add %5, %3 \n\t"
  186. "movq %%mm1, (%3) \n\t"
  187. "add %5, %3 \n\t"
  /* Rows 2 and 3 of this pass; src2 rows sit at offsets 16 and 24. */
  188. "movq (%1), %%mm0 \n\t"
  189. "add %4, %1 \n\t"
  190. "movq (%1), %%mm1 \n\t"
  191. "add %4, %1 \n\t"
  192. "movq 16(%2), %%mm2 \n\t"
  193. "movq 24(%2), %%mm3 \n\t"
  194. "pxor %%mm6, %%mm0 \n\t"
  195. "pxor %%mm6, %%mm1 \n\t"
  196. "pxor %%mm6, %%mm2 \n\t"
  197. "pxor %%mm6, %%mm3 \n\t"
  198. PAVGB" %%mm2, %%mm0 \n\t"
  199. PAVGB" %%mm3, %%mm1 \n\t"
  200. "pxor %%mm6, %%mm0 \n\t"
  201. "pxor %%mm6, %%mm1 \n\t"
  202. "movq %%mm0, (%3) \n\t"
  203. "add %5, %3 \n\t"
  204. "movq %%mm1, (%3) \n\t"
  205. "add %5, %3 \n\t"
  /* Four packed src2 rows consumed; loop until the count is exactly 0. */
  206. "add $32, %2 \n\t"
  207. "subl $4, %0 \n\t"
  208. "jnz 1b \n\t"
  /* Under PIC the row count is kept in memory instead of the b register
     (presumably because that register is reserved there - TODO confirm). */
  209. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  210. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  211. #else
  212. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  213. #endif
  214. :"S"((long)src1Stride), "D"((long)dstStride)
  215. :"memory");
  216. //the following should be used, though better not with gcc ...
  217. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  218. :"r"(src1Stride), "r"(dstStride)
  219. :"memory");*/
  220. }
  /*
   * Two-source average blended into the destination, 4 bytes per row:
   * dst = PAVGB(PAVGB(src1, src2), dst).
   *
   * dst / dstStride   - destination (read and written) and its stride
   * src1 / src1Stride - first source and its stride in bytes
   * src2              - second source, packed: advances by 4 bytes per row
   * h                 - row count.  When h is odd one row is handled first;
   *                     the main loop is then entered unconditionally and
   *                     handles 4 rows per pass until the count is exactly 0,
   *                     so the remaining count must be a non-zero multiple
   *                     of 4
   *
   * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride,
   * %5 = dstStride.
   *
   * NOTE(review): only 4 bytes are loaded from src1 and stored to dst (movd),
   * but PAVGB is also applied directly to memory at src2 and dst; that form
   * presumably reads a full 8 bytes - confirm the over-read is acceptable
   * for all callers.
   */
  221. static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  222. {
  223. __asm __volatile(
  /* Odd row count: process a single row up front, then fall into the loop. */
  224. "testl $1, %0 \n\t"
  225. " jz 1f \n\t"
  226. "movd (%1), %%mm0 \n\t"
  227. "movd (%2), %%mm1 \n\t"
  228. "add %4, %1 \n\t"
  229. "add $4, %2 \n\t"
  230. PAVGB" %%mm1, %%mm0 \n\t"
  231. PAVGB" (%3), %%mm0 \n\t"
  232. "movd %%mm0, (%3) \n\t"
  233. "add %5, %3 \n\t"
  234. "decl %0 \n\t"
  235. "1: \n\t"
  /* Rows 0 and 1 of this pass; src2 rows sit at offsets 0 and 4. */
  236. "movd (%1), %%mm0 \n\t"
  237. "add %4, %1 \n\t"
  238. "movd (%1), %%mm1 \n\t"
  239. "add %4, %1 \n\t"
  240. PAVGB" (%2), %%mm0 \n\t"
  241. PAVGB" 4(%2), %%mm1 \n\t"
  242. PAVGB" (%3), %%mm0 \n\t"
  243. "movd %%mm0, (%3) \n\t"
  244. "add %5, %3 \n\t"
  245. PAVGB" (%3), %%mm1 \n\t"
  246. "movd %%mm1, (%3) \n\t"
  247. "add %5, %3 \n\t"
  /* Rows 2 and 3 of this pass; src2 rows sit at offsets 8 and 12. */
  248. "movd (%1), %%mm0 \n\t"
  249. "add %4, %1 \n\t"
  250. "movd (%1), %%mm1 \n\t"
  251. "add %4, %1 \n\t"
  252. PAVGB" 8(%2), %%mm0 \n\t"
  253. PAVGB" 12(%2), %%mm1 \n\t"
  254. PAVGB" (%3), %%mm0 \n\t"
  255. "movd %%mm0, (%3) \n\t"
  256. "add %5, %3 \n\t"
  257. PAVGB" (%3), %%mm1 \n\t"
  258. "movd %%mm1, (%3) \n\t"
  259. "add %5, %3 \n\t"
  /* Four packed src2 rows consumed; loop until the count is exactly 0. */
  260. "add $16, %2 \n\t"
  261. "subl $4, %0 \n\t"
  262. "jnz 1b \n\t"
  /* Under PIC the row count is kept in memory instead of the b register
     (presumably because that register is reserved there - TODO confirm). */
  263. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  264. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  265. #else
  266. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  267. #endif
  268. :"S"((long)src1Stride), "D"((long)dstStride)
  269. :"memory");
  270. }
  /*
   * Two-source average blended into the destination, 8 bytes per row:
   * dst = PAVGB(PAVGB(src1, src2), dst).
   *
   * dst / dstStride   - destination (read and written) and its stride
   * src1 / src1Stride - first source and its stride in bytes
   * src2              - second source, packed: advances by 8 bytes per row
   * h                 - row count.  When h is odd one row is handled first;
   *                     the main loop is then entered unconditionally and
   *                     handles 4 rows per pass until the count is exactly 0,
   *                     so the remaining count must be a non-zero multiple
   *                     of 4
   *
   * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride,
   * %5 = dstStride.  PAVGB is a macro supplied by the including file
   * (presumably the rounding-up byte average - TODO confirm).
   */
  271. static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  272. {
  273. __asm __volatile(
  /* Odd row count: process a single row up front, then fall into the loop. */
  274. "testl $1, %0 \n\t"
  275. " jz 1f \n\t"
  276. "movq (%1), %%mm0 \n\t"
  277. "movq (%2), %%mm1 \n\t"
  278. "add %4, %1 \n\t"
  279. "add $8, %2 \n\t"
  280. PAVGB" %%mm1, %%mm0 \n\t"
  281. PAVGB" (%3), %%mm0 \n\t"
  282. "movq %%mm0, (%3) \n\t"
  283. "add %5, %3 \n\t"
  284. "decl %0 \n\t"
  285. "1: \n\t"
  /* Rows 0 and 1 of this pass; src2 rows sit at offsets 0 and 8. */
  286. "movq (%1), %%mm0 \n\t"
  287. "add %4, %1 \n\t"
  288. "movq (%1), %%mm1 \n\t"
  289. "add %4, %1 \n\t"
  290. PAVGB" (%2), %%mm0 \n\t"
  291. PAVGB" 8(%2), %%mm1 \n\t"
  292. PAVGB" (%3), %%mm0 \n\t"
  293. "movq %%mm0, (%3) \n\t"
  294. "add %5, %3 \n\t"
  295. PAVGB" (%3), %%mm1 \n\t"
  296. "movq %%mm1, (%3) \n\t"
  297. "add %5, %3 \n\t"
  /* Rows 2 and 3 of this pass; src2 rows sit at offsets 16 and 24. */
  298. "movq (%1), %%mm0 \n\t"
  299. "add %4, %1 \n\t"
  300. "movq (%1), %%mm1 \n\t"
  301. "add %4, %1 \n\t"
  302. PAVGB" 16(%2), %%mm0 \n\t"
  303. PAVGB" 24(%2), %%mm1 \n\t"
  304. PAVGB" (%3), %%mm0 \n\t"
  305. "movq %%mm0, (%3) \n\t"
  306. "add %5, %3 \n\t"
  307. PAVGB" (%3), %%mm1 \n\t"
  308. "movq %%mm1, (%3) \n\t"
  309. "add %5, %3 \n\t"
  /* Four packed src2 rows consumed; loop until the count is exactly 0. */
  310. "add $32, %2 \n\t"
  311. "subl $4, %0 \n\t"
  312. "jnz 1b \n\t"
  /* Under PIC the row count is kept in memory instead of the b register
     (presumably because that register is reserved there - TODO confirm). */
  313. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  314. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  315. #else
  316. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  317. #endif
  318. :"S"((long)src1Stride), "D"((long)dstStride)
  319. :"memory");
  320. //the following should be used, though better not with gcc ...
  321. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  322. :"r"(src1Stride), "r"(dstStride)
  323. :"memory");*/
  324. }
  /*
   * Horizontal half-pel put, 16 bytes wide: each output byte is
   * PAVGB(pixels[x], pixels[x + 1]).  Every row is handled as two 8-byte
   * halves (offsets 0 and 8).
   *
   * block     - destination, advanced by line_size per row
   * pixels    - source; bytes 0..16 of every row are read
   * line_size - stride in bytes, used for both source and destination
   * h         - row count; four rows are written per loop pass and the loop
   *             only exits when the count reaches exactly 0, so h is expected
   *             to be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (S register), %2 = block (D register),
   * %3 = line_size.  REG_a is scratch and is listed as clobbered.
   * PAVGB and REG_a are macros supplied by the including file.
   */
  325. static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  326. {
  327. __asm __volatile(
  /* REG_a = 2 * line_size, the step applied after each pair of rows. */
  328. "lea (%3, %3), %%"REG_a" \n\t"
  329. "1: \n\t"
  /* Rows 0 and 1 of this pass: mm0/mm1 hold the left halves, mm2/mm3 the right halves. */
  330. "movq (%1), %%mm0 \n\t"
  331. "movq (%1, %3), %%mm1 \n\t"
  332. "movq 8(%1), %%mm2 \n\t"
  333. "movq 8(%1, %3), %%mm3 \n\t"
  334. PAVGB" 1(%1), %%mm0 \n\t"
  335. PAVGB" 1(%1, %3), %%mm1 \n\t"
  336. PAVGB" 9(%1), %%mm2 \n\t"
  337. PAVGB" 9(%1, %3), %%mm3 \n\t"
  338. "movq %%mm0, (%2) \n\t"
  339. "movq %%mm1, (%2, %3) \n\t"
  340. "movq %%mm2, 8(%2) \n\t"
  341. "movq %%mm3, 8(%2, %3) \n\t"
  342. "add %%"REG_a", %1 \n\t"
  343. "add %%"REG_a", %2 \n\t"
  /* Rows 2 and 3 of this pass. */
  344. "movq (%1), %%mm0 \n\t"
  345. "movq (%1, %3), %%mm1 \n\t"
  346. "movq 8(%1), %%mm2 \n\t"
  347. "movq 8(%1, %3), %%mm3 \n\t"
  348. PAVGB" 1(%1), %%mm0 \n\t"
  349. PAVGB" 1(%1, %3), %%mm1 \n\t"
  350. PAVGB" 9(%1), %%mm2 \n\t"
  351. PAVGB" 9(%1, %3), %%mm3 \n\t"
  352. "add %%"REG_a", %1 \n\t"
  353. "movq %%mm0, (%2) \n\t"
  354. "movq %%mm1, (%2, %3) \n\t"
  355. "movq %%mm2, 8(%2) \n\t"
  356. "movq %%mm3, 8(%2, %3) \n\t"
  357. "add %%"REG_a", %2 \n\t"
  /* Four rows done; repeat until the count is exactly 0. */
  358. "subl $4, %0 \n\t"
  359. "jnz 1b \n\t"
  360. :"+g"(h), "+S"(pixels), "+D"(block)
  361. :"r" ((long)line_size)
  362. :"%"REG_a, "memory");
  363. }
  /*
   * Store the byte-wise PAVGB of two sources into dst, 16 bytes per row
   * (two 8-byte halves).
   *
   * dst / dstStride   - destination and its stride in bytes
   * src1 / src1Stride - first source and its stride in bytes
   * src2              - second source, packed: advances by 16 bytes per row
   * h                 - row count.  When h is odd one row is handled first;
   *                     the main loop is then entered unconditionally and
   *                     handles 2 rows per pass until the count is exactly 0,
   *                     so the remaining count must be non-zero
   *
   * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride,
   * %5 = dstStride.  PAVGB is a macro supplied by the including file
   * (presumably the rounding-up byte average - TODO confirm).
   */
  364. static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  365. {
  366. __asm __volatile(
  /* Odd row count: process a single row up front, then fall into the loop. */
  367. "testl $1, %0 \n\t"
  368. " jz 1f \n\t"
  369. "movq (%1), %%mm0 \n\t"
  370. "movq 8(%1), %%mm1 \n\t"
  371. PAVGB" (%2), %%mm0 \n\t"
  372. PAVGB" 8(%2), %%mm1 \n\t"
  373. "add %4, %1 \n\t"
  374. "add $16, %2 \n\t"
  375. "movq %%mm0, (%3) \n\t"
  376. "movq %%mm1, 8(%3) \n\t"
  377. "add %5, %3 \n\t"
  378. "decl %0 \n\t"
  379. "1: \n\t"
  /* Row 0 of this pass; its src2 data sits at offsets 0 and 8. */
  380. "movq (%1), %%mm0 \n\t"
  381. "movq 8(%1), %%mm1 \n\t"
  382. "add %4, %1 \n\t"
  383. PAVGB" (%2), %%mm0 \n\t"
  384. PAVGB" 8(%2), %%mm1 \n\t"
  385. "movq %%mm0, (%3) \n\t"
  386. "movq %%mm1, 8(%3) \n\t"
  387. "add %5, %3 \n\t"
  /* Row 1 of this pass; its src2 data sits at offsets 16 and 24. */
  388. "movq (%1), %%mm0 \n\t"
  389. "movq 8(%1), %%mm1 \n\t"
  390. "add %4, %1 \n\t"
  391. PAVGB" 16(%2), %%mm0 \n\t"
  392. PAVGB" 24(%2), %%mm1 \n\t"
  393. "movq %%mm0, (%3) \n\t"
  394. "movq %%mm1, 8(%3) \n\t"
  395. "add %5, %3 \n\t"
  /* Two packed src2 rows consumed; loop until the count is exactly 0. */
  396. "add $32, %2 \n\t"
  397. "subl $2, %0 \n\t"
  398. "jnz 1b \n\t"
  /* Under PIC the row count is kept in memory instead of the b register
     (presumably because that register is reserved there - TODO confirm). */
  399. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  400. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  401. #else
  402. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  403. #endif
  404. :"S"((long)src1Stride), "D"((long)dstStride)
  405. :"memory");
  406. //the following should be used, though better not with gcc ...
  407. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  408. :"r"(src1Stride), "r"(dstStride)
  409. :"memory");*/
  410. }
  /*
   * Two-source average blended into the destination, 16 bytes per row
   * (two 8-byte halves): dst = PAVGB(PAVGB(src1, src2), dst).
   *
   * dst / dstStride   - destination (read and written) and its stride
   * src1 / src1Stride - first source and its stride in bytes
   * src2              - second source, packed: advances by 16 bytes per row
   * h                 - row count.  When h is odd one row is handled first;
   *                     the main loop is then entered unconditionally and
   *                     handles 2 rows per pass until the count is exactly 0,
   *                     so the remaining count must be non-zero
   *
   * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride,
   * %5 = dstStride.  PAVGB is a macro supplied by the including file
   * (presumably the rounding-up byte average - TODO confirm).
   */
  411. static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  412. {
  413. __asm __volatile(
  /* Odd row count: process a single row up front, then fall into the loop. */
  414. "testl $1, %0 \n\t"
  415. " jz 1f \n\t"
  416. "movq (%1), %%mm0 \n\t"
  417. "movq 8(%1), %%mm1 \n\t"
  418. PAVGB" (%2), %%mm0 \n\t"
  419. PAVGB" 8(%2), %%mm1 \n\t"
  420. "add %4, %1 \n\t"
  421. "add $16, %2 \n\t"
  422. PAVGB" (%3), %%mm0 \n\t"
  423. PAVGB" 8(%3), %%mm1 \n\t"
  424. "movq %%mm0, (%3) \n\t"
  425. "movq %%mm1, 8(%3) \n\t"
  426. "add %5, %3 \n\t"
  427. "decl %0 \n\t"
  428. "1: \n\t"
  /* Row 0 of this pass; its src2 data sits at offsets 0 and 8. */
  429. "movq (%1), %%mm0 \n\t"
  430. "movq 8(%1), %%mm1 \n\t"
  431. "add %4, %1 \n\t"
  432. PAVGB" (%2), %%mm0 \n\t"
  433. PAVGB" 8(%2), %%mm1 \n\t"
  434. PAVGB" (%3), %%mm0 \n\t"
  435. PAVGB" 8(%3), %%mm1 \n\t"
  436. "movq %%mm0, (%3) \n\t"
  437. "movq %%mm1, 8(%3) \n\t"
  438. "add %5, %3 \n\t"
  /* Row 1 of this pass; its src2 data sits at offsets 16 and 24. */
  439. "movq (%1), %%mm0 \n\t"
  440. "movq 8(%1), %%mm1 \n\t"
  441. "add %4, %1 \n\t"
  442. PAVGB" 16(%2), %%mm0 \n\t"
  443. PAVGB" 24(%2), %%mm1 \n\t"
  444. PAVGB" (%3), %%mm0 \n\t"
  445. PAVGB" 8(%3), %%mm1 \n\t"
  446. "movq %%mm0, (%3) \n\t"
  447. "movq %%mm1, 8(%3) \n\t"
  448. "add %5, %3 \n\t"
  /* Two packed src2 rows consumed; loop until the count is exactly 0. */
  449. "add $32, %2 \n\t"
  450. "subl $2, %0 \n\t"
  451. "jnz 1b \n\t"
  /* Under PIC the row count is kept in memory instead of the b register
     (presumably because that register is reserved there - TODO confirm). */
  452. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  453. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  454. #else
  455. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  456. #endif
  457. :"S"((long)src1Stride), "D"((long)dstStride)
  458. :"memory");
  459. //the following should be used, though better not with gcc ...
  460. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  461. :"r"(src1Stride), "r"(dstStride)
  462. :"memory");*/
  463. }
  /*
   * No-rounding variant of the two-source put, 16 bytes per row (two 8-byte
   * halves).
   *
   * mm6 is set to all ones; both inputs are complemented (pxor with mm6)
   * before PAVGB and the result is complemented again afterwards.  If PAVGB
   * rounds the average up (presumably pavgb / pavgusb - TODO confirm), the
   * net effect is an average that rounds down.
   *
   * dst / dstStride   - destination and its stride in bytes
   * src1 / src1Stride - first source and its stride in bytes
   * src2              - second source, packed: advances by 16 bytes per row
   * h                 - row count.  When h is odd one row is handled first;
   *                     the main loop is then entered unconditionally and
   *                     handles 2 rows per pass until the count is exactly 0,
   *                     so the remaining count must be non-zero
   *
   * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride,
   * %5 = dstStride.
   */
  464. static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  465. {
  466. __asm __volatile(
  /* mm6 = all ones, used as the complement mask. */
  467. "pcmpeqb %%mm6, %%mm6\n\t"
  /* Odd row count: process a single row up front, then fall into the loop. */
  468. "testl $1, %0 \n\t"
  469. " jz 1f \n\t"
  470. "movq (%1), %%mm0 \n\t"
  471. "movq 8(%1), %%mm1 \n\t"
  472. "movq (%2), %%mm2 \n\t"
  473. "movq 8(%2), %%mm3 \n\t"
  474. "pxor %%mm6, %%mm0 \n\t"
  475. "pxor %%mm6, %%mm1 \n\t"
  476. "pxor %%mm6, %%mm2 \n\t"
  477. "pxor %%mm6, %%mm3 \n\t"
  478. PAVGB" %%mm2, %%mm0 \n\t"
  479. PAVGB" %%mm3, %%mm1 \n\t"
  480. "pxor %%mm6, %%mm0 \n\t"
  481. "pxor %%mm6, %%mm1 \n\t"
  482. "add %4, %1 \n\t"
  483. "add $16, %2 \n\t"
  484. "movq %%mm0, (%3) \n\t"
  485. "movq %%mm1, 8(%3) \n\t"
  486. "add %5, %3 \n\t"
  487. "decl %0 \n\t"
  488. "1: \n\t"
  /* Row 0 of this pass; its src2 data sits at offsets 0 and 8. */
  489. "movq (%1), %%mm0 \n\t"
  490. "movq 8(%1), %%mm1 \n\t"
  491. "add %4, %1 \n\t"
  492. "movq (%2), %%mm2 \n\t"
  493. "movq 8(%2), %%mm3 \n\t"
  494. "pxor %%mm6, %%mm0 \n\t"
  495. "pxor %%mm6, %%mm1 \n\t"
  496. "pxor %%mm6, %%mm2 \n\t"
  497. "pxor %%mm6, %%mm3 \n\t"
  498. PAVGB" %%mm2, %%mm0 \n\t"
  499. PAVGB" %%mm3, %%mm1 \n\t"
  500. "pxor %%mm6, %%mm0 \n\t"
  501. "pxor %%mm6, %%mm1 \n\t"
  502. "movq %%mm0, (%3) \n\t"
  503. "movq %%mm1, 8(%3) \n\t"
  504. "add %5, %3 \n\t"
  /* Row 1 of this pass; its src2 data sits at offsets 16 and 24. */
  505. "movq (%1), %%mm0 \n\t"
  506. "movq 8(%1), %%mm1 \n\t"
  507. "add %4, %1 \n\t"
  508. "movq 16(%2), %%mm2 \n\t"
  509. "movq 24(%2), %%mm3 \n\t"
  510. "pxor %%mm6, %%mm0 \n\t"
  511. "pxor %%mm6, %%mm1 \n\t"
  512. "pxor %%mm6, %%mm2 \n\t"
  513. "pxor %%mm6, %%mm3 \n\t"
  514. PAVGB" %%mm2, %%mm0 \n\t"
  515. PAVGB" %%mm3, %%mm1 \n\t"
  516. "pxor %%mm6, %%mm0 \n\t"
  517. "pxor %%mm6, %%mm1 \n\t"
  518. "movq %%mm0, (%3) \n\t"
  519. "movq %%mm1, 8(%3) \n\t"
  520. "add %5, %3 \n\t"
  /* Two packed src2 rows consumed; loop until the count is exactly 0. */
  521. "add $32, %2 \n\t"
  522. "subl $2, %0 \n\t"
  523. "jnz 1b \n\t"
  /* Under PIC the row count is kept in memory instead of the b register
     (presumably because that register is reserved there - TODO confirm). */
  524. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  525. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  526. #else
  527. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  528. #endif
  529. :"S"((long)src1Stride), "D"((long)dstStride)
  530. :"memory");
  531. //the following should be used, though better not with gcc ...
  532. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  533. :"r"(src1Stride), "r"(dstStride)
  534. :"memory");*/
  535. }
  536. /* GL: this function does incorrect rounding if overflow */
  /*
   * Horizontal half-pel put without rounding up, 8 bytes wide.
   *
   * For each row the bytes at pixels[x] have mm6 subtracted with unsigned
   * saturation (psubusb) before being averaged with pixels[x + 1] by PAVGB.
   * mm6 is prepared by MOVQ_BONE, a macro from the including file
   * (presumably 0x01 in every byte - TODO confirm).
   *
   * block     - destination, advanced by line_size per row
   * pixels    - source; bytes 0..8 of every row are read
   * line_size - stride in bytes, used for both source and destination
   * h         - row count; four rows are written per loop pass and the loop
   *             only exits when the count reaches exactly 0, so h is expected
   *             to be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (S register), %2 = block (D register),
   * %3 = line_size.  REG_a is scratch and is listed as clobbered.
   */
  537. static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  538. {
  539. MOVQ_BONE(mm6);
  540. __asm __volatile(
  /* REG_a = 2 * line_size, the step applied after each pair of rows. */
  541. "lea (%3, %3), %%"REG_a" \n\t"
  542. "1: \n\t"
  /* Rows 0 and 1 of this pass: mm0/mm2 = pixels[x], mm1/mm3 = pixels[x + 1]. */
  543. "movq (%1), %%mm0 \n\t"
  544. "movq (%1, %3), %%mm2 \n\t"
  545. "movq 1(%1), %%mm1 \n\t"
  546. "movq 1(%1, %3), %%mm3 \n\t"
  547. "add %%"REG_a", %1 \n\t"
  /* Bias one operand down by mm6 (saturating at 0) before averaging. */
  548. "psubusb %%mm6, %%mm0 \n\t"
  549. "psubusb %%mm6, %%mm2 \n\t"
  550. PAVGB" %%mm1, %%mm0 \n\t"
  551. PAVGB" %%mm3, %%mm2 \n\t"
  552. "movq %%mm0, (%2) \n\t"
  553. "movq %%mm2, (%2, %3) \n\t"
  /* Rows 2 and 3 of this pass. */
  554. "movq (%1), %%mm0 \n\t"
  555. "movq 1(%1), %%mm1 \n\t"
  556. "movq (%1, %3), %%mm2 \n\t"
  557. "movq 1(%1, %3), %%mm3 \n\t"
  558. "add %%"REG_a", %2 \n\t"
  559. "add %%"REG_a", %1 \n\t"
  560. "psubusb %%mm6, %%mm0 \n\t"
  561. "psubusb %%mm6, %%mm2 \n\t"
  562. PAVGB" %%mm1, %%mm0 \n\t"
  563. PAVGB" %%mm3, %%mm2 \n\t"
  564. "movq %%mm0, (%2) \n\t"
  565. "movq %%mm2, (%2, %3) \n\t"
  566. "add %%"REG_a", %2 \n\t"
  /* Four rows done; repeat until the count is exactly 0. */
  567. "subl $4, %0 \n\t"
  568. "jnz 1b \n\t"
  569. :"+g"(h), "+S"(pixels), "+D"(block)
  570. :"r" ((long)line_size)
  571. :"%"REG_a, "memory");
  572. }
  /*
   * Vertical half-pel put, 8 bytes wide: output row n is
   * PAVGB(source row n, source row n + 1).
   *
   * block     - destination, advanced by line_size per row
   * pixels    - source; one row more than h is read (rows 0..h)
   * line_size - stride in bytes, used for both source and destination
   * h         - row count; four rows are written per loop pass and the loop
   *             only exits when the count reaches exactly 0, so h is expected
   *             to be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (S register), %2 = block (D register),
   * %3 = line_size.  REG_a is scratch and is listed as clobbered.
   * PAVGB and REG_a are macros supplied by the including file.
   */
  573. static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  574. {
  575. __asm __volatile(
  /* REG_a = 2 * line_size; mm0 = source row 0, carried into the loop. */
  576. "lea (%3, %3), %%"REG_a" \n\t"
  577. "movq (%1), %%mm0 \n\t"
  /* Bias block back by one row so that the stores below can use the same
     (base, line_size) and (base, REG_a) addressing as the source loads. */
  578. "sub %3, %2 \n\t"
  579. "1: \n\t"
  /* mm1 = row 1, mm2 = row 2 relative to the current source position. */
  580. "movq (%1, %3), %%mm1 \n\t"
  581. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  582. "add %%"REG_a", %1 \n\t"
  583. PAVGB" %%mm1, %%mm0 \n\t"
  584. PAVGB" %%mm2, %%mm1 \n\t"
  585. "movq %%mm0, (%2, %3) \n\t"
  586. "movq %%mm1, (%2, %%"REG_a") \n\t"
  /* Second pair: mm2 still holds the previous last row; mm0 receives the
     new last row and is carried into the next pass. */
  587. "movq (%1, %3), %%mm1 \n\t"
  588. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  589. "add %%"REG_a", %2 \n\t"
  590. "add %%"REG_a", %1 \n\t"
  591. PAVGB" %%mm1, %%mm2 \n\t"
  592. PAVGB" %%mm0, %%mm1 \n\t"
  593. "movq %%mm2, (%2, %3) \n\t"
  594. "movq %%mm1, (%2, %%"REG_a") \n\t"
  595. "add %%"REG_a", %2 \n\t"
  /* Four rows done; repeat until the count is exactly 0. */
  596. "subl $4, %0 \n\t"
  597. "jnz 1b \n\t"
  598. :"+g"(h), "+S"(pixels), "+D" (block)
  599. :"r" ((long)line_size)
  600. :"%"REG_a, "memory");
  601. }
  602. /* GL: this function does incorrect rounding if overflow */
  /*
   * Vertical half-pel put without rounding up, 8 bytes wide.
   *
   * Same row pipeline as the rounding y2 put, except that in each pair the
   * middle source row (mm1) has mm6 subtracted with unsigned saturation
   * before it takes part in both PAVGB operations.  mm6 is prepared by
   * MOVQ_BONE, a macro from the including file (presumably 0x01 in every
   * byte - TODO confirm).
   *
   * block     - destination, advanced by line_size per row
   * pixels    - source; one row more than h is read (rows 0..h)
   * line_size - stride in bytes, used for both source and destination
   * h         - row count; four rows are written per loop pass and the loop
   *             only exits when the count reaches exactly 0, so h is expected
   *             to be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (S register), %2 = block (D register),
   * %3 = line_size.  REG_a is scratch and is listed as clobbered.
   */
  603. static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  604. {
  605. MOVQ_BONE(mm6);
  606. __asm __volatile(
  /* REG_a = 2 * line_size; mm0 = source row 0, carried into the loop. */
  607. "lea (%3, %3), %%"REG_a" \n\t"
  608. "movq (%1), %%mm0 \n\t"
  /* Bias block back by one row to match the indexed stores below. */
  609. "sub %3, %2 \n\t"
  610. "1: \n\t"
  611. "movq (%1, %3), %%mm1 \n\t"
  612. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  613. "add %%"REG_a", %1 \n\t"
  /* The biased middle row is shared by both averages of this pair. */
  614. "psubusb %%mm6, %%mm1 \n\t"
  615. PAVGB" %%mm1, %%mm0 \n\t"
  616. PAVGB" %%mm2, %%mm1 \n\t"
  617. "movq %%mm0, (%2, %3) \n\t"
  618. "movq %%mm1, (%2, %%"REG_a") \n\t"
  /* Second pair: mm2 is reused from above, mm0 is carried to the next pass. */
  619. "movq (%1, %3), %%mm1 \n\t"
  620. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  621. "add %%"REG_a", %2 \n\t"
  622. "add %%"REG_a", %1 \n\t"
  623. "psubusb %%mm6, %%mm1 \n\t"
  624. PAVGB" %%mm1, %%mm2 \n\t"
  625. PAVGB" %%mm0, %%mm1 \n\t"
  626. "movq %%mm2, (%2, %3) \n\t"
  627. "movq %%mm1, (%2, %%"REG_a") \n\t"
  628. "add %%"REG_a", %2 \n\t"
  /* Four rows done; repeat until the count is exactly 0. */
  629. "subl $4, %0 \n\t"
  630. "jnz 1b \n\t"
  631. :"+g"(h), "+S"(pixels), "+D" (block)
  632. :"r" ((long)line_size)
  633. :"%"REG_a, "memory");
  634. }
  /*
   * Blend an 8-byte-wide source into the destination:
   * block = PAVGB(block, pixels) for every row.
   *
   * block     - destination (read and written), advanced by line_size per row
   * pixels    - source, advanced by line_size per row
   * line_size - stride in bytes, used for both source and destination
   * h         - row count; four rows are written per loop pass and the loop
   *             only exits when the count reaches exactly 0, so h is expected
   *             to be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (S register), %2 = block (D register),
   * %3 = line_size.  REG_a is scratch and is listed as clobbered.
   * PAVGB and REG_a are macros supplied by the including file.
   */
  635. static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  636. {
  637. __asm __volatile(
  /* REG_a = 2 * line_size, the step applied after each pair of rows. */
  638. "lea (%3, %3), %%"REG_a" \n\t"
  639. "1: \n\t"
  /* Rows 0 and 1 of this pass: load destination, average with source, store. */
  640. "movq (%2), %%mm0 \n\t"
  641. "movq (%2, %3), %%mm1 \n\t"
  642. PAVGB" (%1), %%mm0 \n\t"
  643. PAVGB" (%1, %3), %%mm1 \n\t"
  644. "movq %%mm0, (%2) \n\t"
  645. "movq %%mm1, (%2, %3) \n\t"
  646. "add %%"REG_a", %1 \n\t"
  647. "add %%"REG_a", %2 \n\t"
  /* Rows 2 and 3 of this pass. */
  648. "movq (%2), %%mm0 \n\t"
  649. "movq (%2, %3), %%mm1 \n\t"
  650. PAVGB" (%1), %%mm0 \n\t"
  651. PAVGB" (%1, %3), %%mm1 \n\t"
  652. "add %%"REG_a", %1 \n\t"
  653. "movq %%mm0, (%2) \n\t"
  654. "movq %%mm1, (%2, %3) \n\t"
  655. "add %%"REG_a", %2 \n\t"
  /* Four rows done; repeat until the count is exactly 0. */
  656. "subl $4, %0 \n\t"
  657. "jnz 1b \n\t"
  658. :"+g"(h), "+S"(pixels), "+D"(block)
  659. :"r" ((long)line_size)
  660. :"%"REG_a, "memory");
  661. }
  /*
   * Horizontal half-pel average blended into the destination, 8 bytes wide:
   * block = PAVGB(PAVGB(pixels[x], pixels[x + 1]), block).
   *
   * block     - destination (read and written), advanced by line_size per row
   * pixels    - source; bytes 0..8 of every row are read
   * line_size - stride in bytes, used for both source and destination
   * h         - row count; four rows are written per loop pass and the loop
   *             only exits when the count reaches exactly 0, so h is expected
   *             to be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (S register), %2 = block (D register),
   * %3 = line_size.  REG_a is scratch and is listed as clobbered.
   * PAVGB and REG_a are macros supplied by the including file.
   */
  662. static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  663. {
  664. __asm __volatile(
  /* REG_a = 2 * line_size, the step applied after each pair of rows. */
  665. "lea (%3, %3), %%"REG_a" \n\t"
  666. "1: \n\t"
  /* Rows 0 and 1 of this pass: horizontal average, then blend with block. */
  667. "movq (%1), %%mm0 \n\t"
  668. "movq (%1, %3), %%mm2 \n\t"
  669. PAVGB" 1(%1), %%mm0 \n\t"
  670. PAVGB" 1(%1, %3), %%mm2 \n\t"
  671. PAVGB" (%2), %%mm0 \n\t"
  672. PAVGB" (%2, %3), %%mm2 \n\t"
  673. "add %%"REG_a", %1 \n\t"
  674. "movq %%mm0, (%2) \n\t"
  675. "movq %%mm2, (%2, %3) \n\t"
  /* Rows 2 and 3 of this pass; block is advanced before it is read again. */
  676. "movq (%1), %%mm0 \n\t"
  677. "movq (%1, %3), %%mm2 \n\t"
  678. PAVGB" 1(%1), %%mm0 \n\t"
  679. PAVGB" 1(%1, %3), %%mm2 \n\t"
  680. "add %%"REG_a", %2 \n\t"
  681. "add %%"REG_a", %1 \n\t"
  682. PAVGB" (%2), %%mm0 \n\t"
  683. PAVGB" (%2, %3), %%mm2 \n\t"
  684. "movq %%mm0, (%2) \n\t"
  685. "movq %%mm2, (%2, %3) \n\t"
  686. "add %%"REG_a", %2 \n\t"
  /* Four rows done; repeat until the count is exactly 0. */
  687. "subl $4, %0 \n\t"
  688. "jnz 1b \n\t"
  689. :"+g"(h), "+S"(pixels), "+D"(block)
  690. :"r" ((long)line_size)
  691. :"%"REG_a, "memory");
  692. }
  /*
   * Vertical half-pel average blended into the destination, 8 bytes wide:
   * block row n = PAVGB(PAVGB(source row n, source row n + 1), block row n).
   *
   * block     - destination (read and written), advanced by line_size per row
   * pixels    - source; one row more than h is read (rows 0..h)
   * line_size - stride in bytes, used for both source and destination
   * h         - row count; four rows are written per loop pass and the loop
   *             only exits when the count reaches exactly 0, so h is expected
   *             to be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (S register), %2 = block (D register),
   * %3 = line_size.  REG_a is scratch and is listed as clobbered.
   * PAVGB and REG_a are macros supplied by the including file.
   */
  693. static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  694. {
  695. __asm __volatile(
  /* REG_a = 2 * line_size; mm0 = source row 0, carried into the loop. */
  696. "lea (%3, %3), %%"REG_a" \n\t"
  697. "movq (%1), %%mm0 \n\t"
  /* Bias block back by one row to match the indexed loads and stores below. */
  698. "sub %3, %2 \n\t"
  699. "1: \n\t"
  /* First pair: vertical averages of rows (0,1) and (1,2). */
  700. "movq (%1, %3), %%mm1 \n\t"
  701. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  702. "add %%"REG_a", %1 \n\t"
  703. PAVGB" %%mm1, %%mm0 \n\t"
  704. PAVGB" %%mm2, %%mm1 \n\t"
  /* Blend with the existing destination rows (mm3, mm4) and store. */
  705. "movq (%2, %3), %%mm3 \n\t"
  706. "movq (%2, %%"REG_a"), %%mm4 \n\t"
  707. PAVGB" %%mm3, %%mm0 \n\t"
  708. PAVGB" %%mm4, %%mm1 \n\t"
  709. "movq %%mm0, (%2, %3) \n\t"
  710. "movq %%mm1, (%2, %%"REG_a") \n\t"
  /* Second pair: mm2 is reused from above, mm0 is carried to the next pass. */
  711. "movq (%1, %3), %%mm1 \n\t"
  712. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  713. PAVGB" %%mm1, %%mm2 \n\t"
  714. PAVGB" %%mm0, %%mm1 \n\t"
  715. "add %%"REG_a", %2 \n\t"
  716. "add %%"REG_a", %1 \n\t"
  717. "movq (%2, %3), %%mm3 \n\t"
  718. "movq (%2, %%"REG_a"), %%mm4 \n\t"
  719. PAVGB" %%mm3, %%mm2 \n\t"
  720. PAVGB" %%mm4, %%mm1 \n\t"
  721. "movq %%mm2, (%2, %3) \n\t"
  722. "movq %%mm1, (%2, %%"REG_a") \n\t"
  723. "add %%"REG_a", %2 \n\t"
  /* Four rows done; repeat until the count is exactly 0. */
  724. "subl $4, %0 \n\t"
  725. "jnz 1b \n\t"
  726. :"+g"(h), "+S"(pixels), "+D"(block)
  727. :"r" ((long)line_size)
  728. :"%"REG_a, "memory");
  729. }
  730. // Note: this is not correctly rounded, but this function is only used for B-frames, so it does not matter.
  /*
   * Diagonal (x and y) half-pel average blended into the destination,
   * 8 bytes wide, built from chained PAVGB operations:
   *   1. each source row is averaged with itself shifted by one byte,
   *   2. consecutive horizontally averaged rows are averaged together,
   *   3. the result is averaged with the existing destination row.
   * In the first half of each pass one source row has mm6 subtracted with
   * unsigned saturation before step 1.  mm6 is prepared by MOVQ_BONE, a
   * macro from the including file (presumably 0x01 in every byte - TODO
   * confirm).
   *
   * block     - destination (read and written), advanced by line_size per row
   * pixels    - source; one row more than h is read, bytes 0..8 of each row
   * line_size - stride in bytes, used for both source and destination
   * h         - row count; four rows are written per loop pass and the loop
   *             only exits when the count reaches exactly 0, so h is expected
   *             to be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (S register), %2 = block (D register),
   * %3 = line_size.  REG_a is scratch and is listed as clobbered.
   */
  731. static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  732. {
  733. MOVQ_BONE(mm6);
  734. __asm __volatile(
  /* REG_a = 2 * line_size; mm0 = horizontal average of source row 0. */
  735. "lea (%3, %3), %%"REG_a" \n\t"
  736. "movq (%1), %%mm0 \n\t"
  737. PAVGB" 1(%1), %%mm0 \n\t"
  /* Align the loop entry on an 8-byte boundary. */
  738. ".balign 8 \n\t"
  739. "1: \n\t"
  /* mm1 = horizontal average of row 1, mm2 = that of row 2 (biased by mm6). */
  740. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  741. "movq (%1, %3), %%mm1 \n\t"
  742. "psubusb %%mm6, %%mm2 \n\t"
  743. PAVGB" 1(%1, %3), %%mm1 \n\t"
  744. PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
  745. "add %%"REG_a", %1 \n\t"
  /* Vertical step, then blend with destination rows 0 and 1 of this pass. */
  746. PAVGB" %%mm1, %%mm0 \n\t"
  747. PAVGB" %%mm2, %%mm1 \n\t"
  748. PAVGB" (%2), %%mm0 \n\t"
  749. PAVGB" (%2, %3), %%mm1 \n\t"
  750. "movq %%mm0, (%2) \n\t"
  751. "movq %%mm1, (%2, %3) \n\t"
  /* Second pair: mm2 is reused from above; mm0 becomes the horizontal
     average of the new last row and is carried into the next pass. */
  752. "movq (%1, %3), %%mm1 \n\t"
  753. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  754. PAVGB" 1(%1, %3), %%mm1 \n\t"
  755. PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
  756. "add %%"REG_a", %2 \n\t"
  757. "add %%"REG_a", %1 \n\t"
  758. PAVGB" %%mm1, %%mm2 \n\t"
  759. PAVGB" %%mm0, %%mm1 \n\t"
  760. PAVGB" (%2), %%mm2 \n\t"
  761. PAVGB" (%2, %3), %%mm1 \n\t"
  762. "movq %%mm2, (%2) \n\t"
  763. "movq %%mm1, (%2, %3) \n\t"
  764. "add %%"REG_a", %2 \n\t"
  /* Four rows done; repeat until the count is exactly 0. */
  765. "subl $4, %0 \n\t"
  766. "jnz 1b \n\t"
  767. :"+g"(h), "+S"(pixels), "+D"(block)
  768. :"r" ((long)line_size)
  769. :"%"REG_a, "memory");
  770. }
  771. //FIXME the following could be optimized too ...
  /*
   * 16-byte-wide variant: runs the 8-byte-wide put_no_rnd_pixels8_x2 once on
   * the left half and once on the right half (block + 8, pixels + 8), with
   * the same line_size and h.
   */
  772. static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  773. DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
  774. DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
  775. }
  /*
   * 16-byte-wide variant: runs the 8-byte-wide put_pixels8_y2 once on the
   * left half and once on the right half (block + 8, pixels + 8), with the
   * same line_size and h.
   */
  776. static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  777. DEF(put_pixels8_y2)(block , pixels , line_size, h);
  778. DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
  779. }
  /*
   * 16-byte-wide variant: runs the 8-byte-wide put_no_rnd_pixels8_y2 once on
   * the left half and once on the right half (block + 8, pixels + 8), with
   * the same line_size and h.
   */
  780. static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  781. DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
  782. DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
  783. }
  /*
   * 16-byte-wide variant: runs the 8-byte-wide avg_pixels8 once on the left
   * half and once on the right half (block + 8, pixels + 8), with the same
   * line_size and h.
   */
  784. static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  785. DEF(avg_pixels8)(block , pixels , line_size, h);
  786. DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
  787. }
  /*
   * 16-byte-wide variant: runs the 8-byte-wide avg_pixels8_x2 once on the
   * left half and once on the right half (block + 8, pixels + 8), with the
   * same line_size and h.
   */
  788. static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  789. DEF(avg_pixels8_x2)(block , pixels , line_size, h);
  790. DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
  791. }
  /*
   * 16-byte-wide variant: runs the 8-byte-wide avg_pixels8_y2 once on the
   * left half and once on the right half (block + 8, pixels + 8), with the
   * same line_size and h.
   */
  792. static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  793. DEF(avg_pixels8_y2)(block , pixels , line_size, h);
  794. DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
  795. }
  /*
   * 16-byte-wide variant: runs the 8-byte-wide avg_pixels8_xy2 once on the
   * left half and once on the right half (block + 8, pixels + 8), with the
   * same line_size and h.
   */
  796. static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  797. DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
  798. DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
  799. }