You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

817 lines
23KB

  1. /*
  2. * DSP utils : average functions are compiled twice for 3dnow/mmx2
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. * Copyright (c) 2002-2004 Michael Niedermayer
  5. *
  6. * This library is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2 of the License, or (at your option) any later version.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  19. *
  20. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  21. * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  22. * and improved by Zdenek Kabelac <kabi@users.sf.net>
  23. */
  24. /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
  25. clobber bug - now it will work with 2.95.2 and also with -fPIC
  26. */
  /*
   * Writes to block the PAVGB average of each 8-byte source row with the same
   * row shifted right by one byte (bytes x and x+1 of pixels).
   * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * Four rows are handled per loop pass and the loop exits only when h hits
   * exactly 0, so h is expected to be a positive multiple of 4.
   * PAVGB comes from the including file (presumably the MMX2 / 3DNow!
   * per-byte average instruction -- confirm there).
   */
  27. static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  28. {
  29. __asm __volatile(
  /* REG_a = 2 * line_size: the step used to advance by a pair of rows */
  30. "lea (%3, %3), %%"REG_a" \n\t"
  31. "1: \n\t"
  /* rows 0 and 1 of this pass */
  32. "movq (%1), %%mm0 \n\t"
  33. "movq (%1, %3), %%mm1 \n\t"
  34. PAVGB" 1(%1), %%mm0 \n\t"
  35. PAVGB" 1(%1, %3), %%mm1 \n\t"
  36. "movq %%mm0, (%2) \n\t"
  37. "movq %%mm1, (%2, %3) \n\t"
  38. "add %%"REG_a", %1 \n\t"
  39. "add %%"REG_a", %2 \n\t"
  /* rows 2 and 3 of this pass */
  40. "movq (%1), %%mm0 \n\t"
  41. "movq (%1, %3), %%mm1 \n\t"
  42. PAVGB" 1(%1), %%mm0 \n\t"
  43. PAVGB" 1(%1, %3), %%mm1 \n\t"
  44. "add %%"REG_a", %1 \n\t"
  45. "movq %%mm0, (%2) \n\t"
  46. "movq %%mm1, (%2, %3) \n\t"
  47. "add %%"REG_a", %2 \n\t"
  48. "subl $4, %0 \n\t"
  49. "jnz 1b \n\t"
  50. :"+g"(h), "+S"(pixels), "+D"(block)
  51. :"r" ((long)line_size)
  52. :"%"REG_a, "memory");
  53. }
  /*
   * Writes to dst the PAVGB average of 4-byte rows taken from src1 and src2.
   * asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *               %4 = src1Stride, %5 = dstStride.
   * src1 advances by src1Stride and dst by dstStride per row; src2 is read as
   * tightly packed rows (4 bytes per row, no stride argument).
   * When h is odd, one row is processed before the loop. The loop then does
   * four rows per pass, always runs at least once, and stops only when the
   * remaining count reaches exactly 0.
   */
  54. static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  55. {
  56. __asm __volatile(
  /* odd h: peel off a single row first */
  57. "testl $1, %0 \n\t"
  58. " jz 1f \n\t"
  59. "movd (%1), %%mm0 \n\t"
  60. "movd (%2), %%mm1 \n\t"
  61. "add %4, %1 \n\t"
  62. "add $4, %2 \n\t"
  63. PAVGB" %%mm1, %%mm0 \n\t"
  64. "movd %%mm0, (%3) \n\t"
  65. "add %5, %3 \n\t"
  66. "decl %0 \n\t"
  /* main loop: four rows per pass */
  67. "1: \n\t"
  68. "movd (%1), %%mm0 \n\t"
  69. "add %4, %1 \n\t"
  70. "movd (%1), %%mm1 \n\t"
  71. "add %4, %1 \n\t"
  72. PAVGB" (%2), %%mm0 \n\t"
  73. PAVGB" 4(%2), %%mm1 \n\t"
  74. "movd %%mm0, (%3) \n\t"
  75. "add %5, %3 \n\t"
  76. "movd %%mm1, (%3) \n\t"
  77. "add %5, %3 \n\t"
  78. "movd (%1), %%mm0 \n\t"
  79. "add %4, %1 \n\t"
  80. "movd (%1), %%mm1 \n\t"
  81. "add %4, %1 \n\t"
  82. PAVGB" 8(%2), %%mm0 \n\t"
  83. PAVGB" 12(%2), %%mm1 \n\t"
  84. "movd %%mm0, (%3) \n\t"
  85. "add %5, %3 \n\t"
  86. "movd %%mm1, (%3) \n\t"
  87. "add %5, %3 \n\t"
  /* four packed src2 rows (4 * 4 bytes) consumed */
  88. "add $16, %2 \n\t"
  89. "subl $4, %0 \n\t"
  90. "jnz 1b \n\t"
  91. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  92. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  93. #else
  94. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  95. #endif
  96. :"S"((long)src1Stride), "D"((long)dstStride)
  97. :"memory");
  98. }
  /*
   * Writes to dst the PAVGB average of 8-byte rows taken from src1 and src2.
   * asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *               %4 = src1Stride, %5 = dstStride.
   * src1 advances by src1Stride and dst by dstStride per row; src2 is read as
   * tightly packed rows (8 bytes per row, no stride argument).
   * When h is odd, one row is processed before the loop. The loop then does
   * four rows per pass, always runs at least once, and stops only when the
   * remaining count reaches exactly 0.
   */
  99. static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  100. {
  101. __asm __volatile(
  /* odd h: peel off a single row first */
  102. "testl $1, %0 \n\t"
  103. " jz 1f \n\t"
  104. "movq (%1), %%mm0 \n\t"
  105. "movq (%2), %%mm1 \n\t"
  106. "add %4, %1 \n\t"
  107. "add $8, %2 \n\t"
  108. PAVGB" %%mm1, %%mm0 \n\t"
  109. "movq %%mm0, (%3) \n\t"
  110. "add %5, %3 \n\t"
  111. "decl %0 \n\t"
  /* main loop: four rows per pass */
  112. "1: \n\t"
  113. "movq (%1), %%mm0 \n\t"
  114. "add %4, %1 \n\t"
  115. "movq (%1), %%mm1 \n\t"
  116. "add %4, %1 \n\t"
  117. PAVGB" (%2), %%mm0 \n\t"
  118. PAVGB" 8(%2), %%mm1 \n\t"
  119. "movq %%mm0, (%3) \n\t"
  120. "add %5, %3 \n\t"
  121. "movq %%mm1, (%3) \n\t"
  122. "add %5, %3 \n\t"
  123. "movq (%1), %%mm0 \n\t"
  124. "add %4, %1 \n\t"
  125. "movq (%1), %%mm1 \n\t"
  126. "add %4, %1 \n\t"
  127. PAVGB" 16(%2), %%mm0 \n\t"
  128. PAVGB" 24(%2), %%mm1 \n\t"
  129. "movq %%mm0, (%3) \n\t"
  130. "add %5, %3 \n\t"
  131. "movq %%mm1, (%3) \n\t"
  132. "add %5, %3 \n\t"
  /* four packed src2 rows (4 * 8 bytes) consumed */
  133. "add $32, %2 \n\t"
  134. "subl $4, %0 \n\t"
  135. "jnz 1b \n\t"
  136. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  137. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  138. #else
  139. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  140. #endif
  141. :"S"((long)src1Stride), "D"((long)dstStride)
  142. :"memory");
  143. //the following should be used, though better not with gcc ...
  144. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  145. :"r"(src1Stride), "r"(dstStride)
  146. :"memory");*/
  147. }
  /*
   * "No rounding" variant of the 8-byte two-source average: both inputs are
   * bitwise inverted (pxor with all-ones mm6), averaged with PAVGB, and the
   * result is inverted again before being stored to dst.
   * NOTE(review): assuming PAVGB rounds halves up, this yields the
   * rounded-down average -- confirm against the PAVGB definition.
   * asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *               %4 = src1Stride, %5 = dstStride.
   * src2 is read as tightly packed 8-byte rows. When h is odd, one row is
   * processed before the loop; the loop does four rows per pass, always runs
   * at least once, and stops only when the count reaches exactly 0.
   */
  148. static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  149. {
  150. __asm __volatile(
  /* mm6 = all bits set (a register always compares equal to itself) */
  151. "pcmpeqb %%mm6, %%mm6 \n\t"
  /* odd h: peel off a single row first */
  152. "testl $1, %0 \n\t"
  153. " jz 1f \n\t"
  154. "movq (%1), %%mm0 \n\t"
  155. "movq (%2), %%mm1 \n\t"
  156. "add %4, %1 \n\t"
  157. "add $8, %2 \n\t"
  158. "pxor %%mm6, %%mm0 \n\t"
  159. "pxor %%mm6, %%mm1 \n\t"
  160. PAVGB" %%mm1, %%mm0 \n\t"
  161. "pxor %%mm6, %%mm0 \n\t"
  162. "movq %%mm0, (%3) \n\t"
  163. "add %5, %3 \n\t"
  164. "decl %0 \n\t"
  /* main loop: four rows per pass */
  165. "1: \n\t"
  166. "movq (%1), %%mm0 \n\t"
  167. "add %4, %1 \n\t"
  168. "movq (%1), %%mm1 \n\t"
  169. "add %4, %1 \n\t"
  170. "movq (%2), %%mm2 \n\t"
  171. "movq 8(%2), %%mm3 \n\t"
  172. "pxor %%mm6, %%mm0 \n\t"
  173. "pxor %%mm6, %%mm1 \n\t"
  174. "pxor %%mm6, %%mm2 \n\t"
  175. "pxor %%mm6, %%mm3 \n\t"
  176. PAVGB" %%mm2, %%mm0 \n\t"
  177. PAVGB" %%mm3, %%mm1 \n\t"
  178. "pxor %%mm6, %%mm0 \n\t"
  179. "pxor %%mm6, %%mm1 \n\t"
  180. "movq %%mm0, (%3) \n\t"
  181. "add %5, %3 \n\t"
  182. "movq %%mm1, (%3) \n\t"
  183. "add %5, %3 \n\t"
  184. "movq (%1), %%mm0 \n\t"
  185. "add %4, %1 \n\t"
  186. "movq (%1), %%mm1 \n\t"
  187. "add %4, %1 \n\t"
  188. "movq 16(%2), %%mm2 \n\t"
  189. "movq 24(%2), %%mm3 \n\t"
  190. "pxor %%mm6, %%mm0 \n\t"
  191. "pxor %%mm6, %%mm1 \n\t"
  192. "pxor %%mm6, %%mm2 \n\t"
  193. "pxor %%mm6, %%mm3 \n\t"
  194. PAVGB" %%mm2, %%mm0 \n\t"
  195. PAVGB" %%mm3, %%mm1 \n\t"
  196. "pxor %%mm6, %%mm0 \n\t"
  197. "pxor %%mm6, %%mm1 \n\t"
  198. "movq %%mm0, (%3) \n\t"
  199. "add %5, %3 \n\t"
  200. "movq %%mm1, (%3) \n\t"
  201. "add %5, %3 \n\t"
  202. "add $32, %2 \n\t"
  203. "subl $4, %0 \n\t"
  204. "jnz 1b \n\t"
  205. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  206. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  207. #else
  208. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  209. #endif
  210. :"S"((long)src1Stride), "D"((long)dstStride)
  211. :"memory");
  212. //the following should be used, though better not with gcc ...
  213. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  214. :"r"(src1Stride), "r"(dstStride)
  215. :"memory");*/
  216. }
  /*
   * For each 4-byte row: PAVGB-averages src1 with src2, then PAVGB-averages
   * that result with the bytes already in dst, and stores it back to dst.
   * asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *               %4 = src1Stride, %5 = dstStride.
   * src2 is read as tightly packed rows (4 bytes per row).
   * When h is odd, one row is processed before the loop. The loop then does
   * four rows per pass, always runs at least once, and stops only when the
   * remaining count reaches exactly 0.
   */
  217. static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  218. {
  219. __asm __volatile(
  /* odd h: peel off a single row first */
  220. "testl $1, %0 \n\t"
  221. " jz 1f \n\t"
  222. "movd (%1), %%mm0 \n\t"
  223. "movd (%2), %%mm1 \n\t"
  224. "add %4, %1 \n\t"
  225. "add $4, %2 \n\t"
  226. PAVGB" %%mm1, %%mm0 \n\t"
  227. PAVGB" (%3), %%mm0 \n\t"
  228. "movd %%mm0, (%3) \n\t"
  229. "add %5, %3 \n\t"
  230. "decl %0 \n\t"
  /* main loop: four rows per pass */
  231. "1: \n\t"
  232. "movd (%1), %%mm0 \n\t"
  233. "add %4, %1 \n\t"
  234. "movd (%1), %%mm1 \n\t"
  235. "add %4, %1 \n\t"
  236. PAVGB" (%2), %%mm0 \n\t"
  237. PAVGB" 4(%2), %%mm1 \n\t"
  238. PAVGB" (%3), %%mm0 \n\t"
  239. "movd %%mm0, (%3) \n\t"
  240. "add %5, %3 \n\t"
  241. PAVGB" (%3), %%mm1 \n\t"
  242. "movd %%mm1, (%3) \n\t"
  243. "add %5, %3 \n\t"
  244. "movd (%1), %%mm0 \n\t"
  245. "add %4, %1 \n\t"
  246. "movd (%1), %%mm1 \n\t"
  247. "add %4, %1 \n\t"
  248. PAVGB" 8(%2), %%mm0 \n\t"
  249. PAVGB" 12(%2), %%mm1 \n\t"
  250. PAVGB" (%3), %%mm0 \n\t"
  251. "movd %%mm0, (%3) \n\t"
  252. "add %5, %3 \n\t"
  253. PAVGB" (%3), %%mm1 \n\t"
  254. "movd %%mm1, (%3) \n\t"
  255. "add %5, %3 \n\t"
  256. "add $16, %2 \n\t"
  257. "subl $4, %0 \n\t"
  258. "jnz 1b \n\t"
  259. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  260. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  261. #else
  262. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  263. #endif
  264. :"S"((long)src1Stride), "D"((long)dstStride)
  265. :"memory");
  266. }
  /*
   * For each 8-byte row: PAVGB-averages src1 with src2, then PAVGB-averages
   * that result with the bytes already in dst, and stores it back to dst.
   * asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *               %4 = src1Stride, %5 = dstStride.
   * src2 is read as tightly packed rows (8 bytes per row).
   * When h is odd, one row is processed before the loop. The loop then does
   * four rows per pass, always runs at least once, and stops only when the
   * remaining count reaches exactly 0.
   */
  267. static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  268. {
  269. __asm __volatile(
  /* odd h: peel off a single row first */
  270. "testl $1, %0 \n\t"
  271. " jz 1f \n\t"
  272. "movq (%1), %%mm0 \n\t"
  273. "movq (%2), %%mm1 \n\t"
  274. "add %4, %1 \n\t"
  275. "add $8, %2 \n\t"
  276. PAVGB" %%mm1, %%mm0 \n\t"
  277. PAVGB" (%3), %%mm0 \n\t"
  278. "movq %%mm0, (%3) \n\t"
  279. "add %5, %3 \n\t"
  280. "decl %0 \n\t"
  /* main loop: four rows per pass */
  281. "1: \n\t"
  282. "movq (%1), %%mm0 \n\t"
  283. "add %4, %1 \n\t"
  284. "movq (%1), %%mm1 \n\t"
  285. "add %4, %1 \n\t"
  286. PAVGB" (%2), %%mm0 \n\t"
  287. PAVGB" 8(%2), %%mm1 \n\t"
  288. PAVGB" (%3), %%mm0 \n\t"
  289. "movq %%mm0, (%3) \n\t"
  290. "add %5, %3 \n\t"
  291. PAVGB" (%3), %%mm1 \n\t"
  292. "movq %%mm1, (%3) \n\t"
  293. "add %5, %3 \n\t"
  294. "movq (%1), %%mm0 \n\t"
  295. "add %4, %1 \n\t"
  296. "movq (%1), %%mm1 \n\t"
  297. "add %4, %1 \n\t"
  298. PAVGB" 16(%2), %%mm0 \n\t"
  299. PAVGB" 24(%2), %%mm1 \n\t"
  300. PAVGB" (%3), %%mm0 \n\t"
  301. "movq %%mm0, (%3) \n\t"
  302. "add %5, %3 \n\t"
  303. PAVGB" (%3), %%mm1 \n\t"
  304. "movq %%mm1, (%3) \n\t"
  305. "add %5, %3 \n\t"
  306. "add $32, %2 \n\t"
  307. "subl $4, %0 \n\t"
  308. "jnz 1b \n\t"
  309. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  310. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  311. #else
  312. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  313. #endif
  314. :"S"((long)src1Stride), "D"((long)dstStride)
  315. :"memory");
  316. //the following should be used, though better not with gcc ...
  317. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  318. :"r"(src1Stride), "r"(dstStride)
  319. :"memory");*/
  320. }
  /*
   * 16-byte-wide version of the horizontal average: each row is handled as
   * two 8-byte halves (offsets 0 and 8), each PAVGB-averaged with the same
   * bytes shifted right by one (offsets 1 and 9), and stored to block.
   * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * Four rows are handled per loop pass and the loop exits only when h hits
   * exactly 0, so h is expected to be a positive multiple of 4.
   */
  321. static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  322. {
  323. __asm __volatile(
  /* REG_a = 2 * line_size: the step used to advance by a pair of rows */
  324. "lea (%3, %3), %%"REG_a" \n\t"
  325. "1: \n\t"
  /* rows 0 and 1 of this pass */
  326. "movq (%1), %%mm0 \n\t"
  327. "movq (%1, %3), %%mm1 \n\t"
  328. "movq 8(%1), %%mm2 \n\t"
  329. "movq 8(%1, %3), %%mm3 \n\t"
  330. PAVGB" 1(%1), %%mm0 \n\t"
  331. PAVGB" 1(%1, %3), %%mm1 \n\t"
  332. PAVGB" 9(%1), %%mm2 \n\t"
  333. PAVGB" 9(%1, %3), %%mm3 \n\t"
  334. "movq %%mm0, (%2) \n\t"
  335. "movq %%mm1, (%2, %3) \n\t"
  336. "movq %%mm2, 8(%2) \n\t"
  337. "movq %%mm3, 8(%2, %3) \n\t"
  338. "add %%"REG_a", %1 \n\t"
  339. "add %%"REG_a", %2 \n\t"
  /* rows 2 and 3 of this pass */
  340. "movq (%1), %%mm0 \n\t"
  341. "movq (%1, %3), %%mm1 \n\t"
  342. "movq 8(%1), %%mm2 \n\t"
  343. "movq 8(%1, %3), %%mm3 \n\t"
  344. PAVGB" 1(%1), %%mm0 \n\t"
  345. PAVGB" 1(%1, %3), %%mm1 \n\t"
  346. PAVGB" 9(%1), %%mm2 \n\t"
  347. PAVGB" 9(%1, %3), %%mm3 \n\t"
  348. "add %%"REG_a", %1 \n\t"
  349. "movq %%mm0, (%2) \n\t"
  350. "movq %%mm1, (%2, %3) \n\t"
  351. "movq %%mm2, 8(%2) \n\t"
  352. "movq %%mm3, 8(%2, %3) \n\t"
  353. "add %%"REG_a", %2 \n\t"
  354. "subl $4, %0 \n\t"
  355. "jnz 1b \n\t"
  356. :"+g"(h), "+S"(pixels), "+D"(block)
  357. :"r" ((long)line_size)
  358. :"%"REG_a, "memory");
  359. }
  /*
   * Writes to dst the PAVGB average of 16-byte rows taken from src1 and src2
   * (each row handled as two 8-byte halves).
   * asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *               %4 = src1Stride, %5 = dstStride.
   * src1 advances by src1Stride and dst by dstStride per row; src2 is read as
   * tightly packed rows (16 bytes per row, no stride argument).
   * When h is odd, one row is processed before the loop. Unlike the 4/8-wide
   * variants, the loop here does two rows per pass; it always runs at least
   * once and stops only when the remaining count reaches exactly 0.
   */
  360. static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  361. {
  362. __asm __volatile(
  /* odd h: peel off a single row first */
  363. "testl $1, %0 \n\t"
  364. " jz 1f \n\t"
  365. "movq (%1), %%mm0 \n\t"
  366. "movq 8(%1), %%mm1 \n\t"
  367. PAVGB" (%2), %%mm0 \n\t"
  368. PAVGB" 8(%2), %%mm1 \n\t"
  369. "add %4, %1 \n\t"
  370. "add $16, %2 \n\t"
  371. "movq %%mm0, (%3) \n\t"
  372. "movq %%mm1, 8(%3) \n\t"
  373. "add %5, %3 \n\t"
  374. "decl %0 \n\t"
  /* main loop: two rows per pass */
  375. "1: \n\t"
  376. "movq (%1), %%mm0 \n\t"
  377. "movq 8(%1), %%mm1 \n\t"
  378. "add %4, %1 \n\t"
  379. PAVGB" (%2), %%mm0 \n\t"
  380. PAVGB" 8(%2), %%mm1 \n\t"
  381. "movq %%mm0, (%3) \n\t"
  382. "movq %%mm1, 8(%3) \n\t"
  383. "add %5, %3 \n\t"
  384. "movq (%1), %%mm0 \n\t"
  385. "movq 8(%1), %%mm1 \n\t"
  386. "add %4, %1 \n\t"
  387. PAVGB" 16(%2), %%mm0 \n\t"
  388. PAVGB" 24(%2), %%mm1 \n\t"
  389. "movq %%mm0, (%3) \n\t"
  390. "movq %%mm1, 8(%3) \n\t"
  391. "add %5, %3 \n\t"
  /* two packed src2 rows (2 * 16 bytes) consumed */
  392. "add $32, %2 \n\t"
  393. "subl $2, %0 \n\t"
  394. "jnz 1b \n\t"
  395. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  396. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  397. #else
  398. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  399. #endif
  400. :"S"((long)src1Stride), "D"((long)dstStride)
  401. :"memory");
  402. //the following should be used, though better not with gcc ...
  403. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  404. :"r"(src1Stride), "r"(dstStride)
  405. :"memory");*/
  406. }
  /*
   * For each 16-byte row (two 8-byte halves): PAVGB-averages src1 with src2,
   * then PAVGB-averages that result with the bytes already in dst, and stores
   * it back to dst.
   * asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *               %4 = src1Stride, %5 = dstStride.
   * src2 is read as tightly packed rows (16 bytes per row).
   * When h is odd, one row is processed before the loop. The loop then does
   * two rows per pass, always runs at least once, and stops only when the
   * remaining count reaches exactly 0.
   */
  407. static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  408. {
  409. __asm __volatile(
  /* odd h: peel off a single row first */
  410. "testl $1, %0 \n\t"
  411. " jz 1f \n\t"
  412. "movq (%1), %%mm0 \n\t"
  413. "movq 8(%1), %%mm1 \n\t"
  414. PAVGB" (%2), %%mm0 \n\t"
  415. PAVGB" 8(%2), %%mm1 \n\t"
  416. "add %4, %1 \n\t"
  417. "add $16, %2 \n\t"
  418. PAVGB" (%3), %%mm0 \n\t"
  419. PAVGB" 8(%3), %%mm1 \n\t"
  420. "movq %%mm0, (%3) \n\t"
  421. "movq %%mm1, 8(%3) \n\t"
  422. "add %5, %3 \n\t"
  423. "decl %0 \n\t"
  /* main loop: two rows per pass */
  424. "1: \n\t"
  425. "movq (%1), %%mm0 \n\t"
  426. "movq 8(%1), %%mm1 \n\t"
  427. "add %4, %1 \n\t"
  428. PAVGB" (%2), %%mm0 \n\t"
  429. PAVGB" 8(%2), %%mm1 \n\t"
  430. PAVGB" (%3), %%mm0 \n\t"
  431. PAVGB" 8(%3), %%mm1 \n\t"
  432. "movq %%mm0, (%3) \n\t"
  433. "movq %%mm1, 8(%3) \n\t"
  434. "add %5, %3 \n\t"
  435. "movq (%1), %%mm0 \n\t"
  436. "movq 8(%1), %%mm1 \n\t"
  437. "add %4, %1 \n\t"
  438. PAVGB" 16(%2), %%mm0 \n\t"
  439. PAVGB" 24(%2), %%mm1 \n\t"
  440. PAVGB" (%3), %%mm0 \n\t"
  441. PAVGB" 8(%3), %%mm1 \n\t"
  442. "movq %%mm0, (%3) \n\t"
  443. "movq %%mm1, 8(%3) \n\t"
  444. "add %5, %3 \n\t"
  445. "add $32, %2 \n\t"
  446. "subl $2, %0 \n\t"
  447. "jnz 1b \n\t"
  448. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  449. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  450. #else
  451. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  452. #endif
  453. :"S"((long)src1Stride), "D"((long)dstStride)
  454. :"memory");
  455. //the following should be used, though better not with gcc ...
  456. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  457. :"r"(src1Stride), "r"(dstStride)
  458. :"memory");*/
  459. }
  /*
   * "No rounding" variant of the 16-byte two-source average: both inputs are
   * bitwise inverted (pxor with all-ones mm6), averaged with PAVGB, and the
   * result is inverted again before being stored to dst.
   * NOTE(review): assuming PAVGB rounds halves up, this yields the
   * rounded-down average -- confirm against the PAVGB definition.
   * asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *               %4 = src1Stride, %5 = dstStride.
   * src2 is read as tightly packed 16-byte rows. When h is odd, one row is
   * processed before the loop; the loop does two rows per pass, always runs
   * at least once, and stops only when the count reaches exactly 0.
   */
  460. static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  461. {
  462. __asm __volatile(
  /* mm6 = all bits set (a register always compares equal to itself) */
  463. "pcmpeqb %%mm6, %%mm6\n\t"
  /* odd h: peel off a single row first */
  464. "testl $1, %0 \n\t"
  465. " jz 1f \n\t"
  466. "movq (%1), %%mm0 \n\t"
  467. "movq 8(%1), %%mm1 \n\t"
  468. "movq (%2), %%mm2 \n\t"
  469. "movq 8(%2), %%mm3 \n\t"
  470. "pxor %%mm6, %%mm0 \n\t"
  471. "pxor %%mm6, %%mm1 \n\t"
  472. "pxor %%mm6, %%mm2 \n\t"
  473. "pxor %%mm6, %%mm3 \n\t"
  474. PAVGB" %%mm2, %%mm0 \n\t"
  475. PAVGB" %%mm3, %%mm1 \n\t"
  476. "pxor %%mm6, %%mm0 \n\t"
  477. "pxor %%mm6, %%mm1 \n\t"
  478. "add %4, %1 \n\t"
  479. "add $16, %2 \n\t"
  480. "movq %%mm0, (%3) \n\t"
  481. "movq %%mm1, 8(%3) \n\t"
  482. "add %5, %3 \n\t"
  483. "decl %0 \n\t"
  /* main loop: two rows per pass */
  484. "1: \n\t"
  485. "movq (%1), %%mm0 \n\t"
  486. "movq 8(%1), %%mm1 \n\t"
  487. "add %4, %1 \n\t"
  488. "movq (%2), %%mm2 \n\t"
  489. "movq 8(%2), %%mm3 \n\t"
  490. "pxor %%mm6, %%mm0 \n\t"
  491. "pxor %%mm6, %%mm1 \n\t"
  492. "pxor %%mm6, %%mm2 \n\t"
  493. "pxor %%mm6, %%mm3 \n\t"
  494. PAVGB" %%mm2, %%mm0 \n\t"
  495. PAVGB" %%mm3, %%mm1 \n\t"
  496. "pxor %%mm6, %%mm0 \n\t"
  497. "pxor %%mm6, %%mm1 \n\t"
  498. "movq %%mm0, (%3) \n\t"
  499. "movq %%mm1, 8(%3) \n\t"
  500. "add %5, %3 \n\t"
  501. "movq (%1), %%mm0 \n\t"
  502. "movq 8(%1), %%mm1 \n\t"
  503. "add %4, %1 \n\t"
  504. "movq 16(%2), %%mm2 \n\t"
  505. "movq 24(%2), %%mm3 \n\t"
  506. "pxor %%mm6, %%mm0 \n\t"
  507. "pxor %%mm6, %%mm1 \n\t"
  508. "pxor %%mm6, %%mm2 \n\t"
  509. "pxor %%mm6, %%mm3 \n\t"
  510. PAVGB" %%mm2, %%mm0 \n\t"
  511. PAVGB" %%mm3, %%mm1 \n\t"
  512. "pxor %%mm6, %%mm0 \n\t"
  513. "pxor %%mm6, %%mm1 \n\t"
  514. "movq %%mm0, (%3) \n\t"
  515. "movq %%mm1, 8(%3) \n\t"
  516. "add %5, %3 \n\t"
  517. "add $32, %2 \n\t"
  518. "subl $2, %0 \n\t"
  519. "jnz 1b \n\t"
  520. #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
  521. :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  522. #else
  523. :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  524. #endif
  525. :"S"((long)src1Stride), "D"((long)dstStride)
  526. :"memory");
  527. //the following should be used, though better not with gcc ...
  528. /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  529. :"r"(src1Stride), "r"(dstStride)
  530. :"memory");*/
  531. }
  532. /* GL: this function does incorrect rounding if overflow */
  /*
   * Horizontal (x and x+1) 8-byte average without upward rounding, done
   * approximately: the constant in mm6 is subtracted with unsigned saturation
   * from the unshifted row before PAVGB combines it with the row shifted by
   * one byte. The result is stored to block.
   * NOTE(review): MOVQ_BONE is defined elsewhere; presumably it loads 0x01
   * into every byte of mm6 -- confirm.
   * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * Four rows are handled per loop pass and the loop exits only when h hits
   * exactly 0, so h is expected to be a positive multiple of 4.
   */
  533. static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  534. {
  535. MOVQ_BONE(mm6);
  536. __asm __volatile(
  /* REG_a = 2 * line_size: the step used to advance by a pair of rows */
  537. "lea (%3, %3), %%"REG_a" \n\t"
  538. "1: \n\t"
  /* rows 0 and 1 of this pass */
  539. "movq (%1), %%mm0 \n\t"
  540. "movq (%1, %3), %%mm2 \n\t"
  541. "movq 1(%1), %%mm1 \n\t"
  542. "movq 1(%1, %3), %%mm3 \n\t"
  543. "add %%"REG_a", %1 \n\t"
  544. "psubusb %%mm6, %%mm0 \n\t"
  545. "psubusb %%mm6, %%mm2 \n\t"
  546. PAVGB" %%mm1, %%mm0 \n\t"
  547. PAVGB" %%mm3, %%mm2 \n\t"
  548. "movq %%mm0, (%2) \n\t"
  549. "movq %%mm2, (%2, %3) \n\t"
  /* rows 2 and 3 of this pass */
  550. "movq (%1), %%mm0 \n\t"
  551. "movq 1(%1), %%mm1 \n\t"
  552. "movq (%1, %3), %%mm2 \n\t"
  553. "movq 1(%1, %3), %%mm3 \n\t"
  554. "add %%"REG_a", %2 \n\t"
  555. "add %%"REG_a", %1 \n\t"
  556. "psubusb %%mm6, %%mm0 \n\t"
  557. "psubusb %%mm6, %%mm2 \n\t"
  558. PAVGB" %%mm1, %%mm0 \n\t"
  559. PAVGB" %%mm3, %%mm2 \n\t"
  560. "movq %%mm0, (%2) \n\t"
  561. "movq %%mm2, (%2, %3) \n\t"
  562. "add %%"REG_a", %2 \n\t"
  563. "subl $4, %0 \n\t"
  564. "jnz 1b \n\t"
  565. :"+g"(h), "+S"(pixels), "+D"(block)
  566. :"r" ((long)line_size)
  567. :"%"REG_a, "memory");
  568. }
  /*
   * Vertical average: each 8-byte output row is the PAVGB average of source
   * rows y and y+1, stored to block. One more source row than h is read
   * (the first row is loaded before the loop, four new rows per pass).
   * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * Four rows are written per loop pass and the loop exits only when h hits
   * exactly 0, so h is expected to be a positive multiple of 4.
   */
  569. static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  570. {
  571. __asm __volatile(
  /* REG_a = 2 * line_size */
  572. "lea (%3, %3), %%"REG_a" \n\t"
  573. "movq (%1), %%mm0 \n\t"
  /* bias block back one row so stores can use the same (base, %3) and
     (base, REG_a) addressing as the source loads */
  574. "sub %3, %2 \n\t"
  575. "1: \n\t"
  576. "movq (%1, %3), %%mm1 \n\t"
  577. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  578. "add %%"REG_a", %1 \n\t"
  579. PAVGB" %%mm1, %%mm0 \n\t"
  580. PAVGB" %%mm2, %%mm1 \n\t"
  581. "movq %%mm0, (%2, %3) \n\t"
  582. "movq %%mm1, (%2, %%"REG_a") \n\t"
  583. "movq (%1, %3), %%mm1 \n\t"
  /* mm0 receives the last row of this pass and carries it into the next */
  584. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  585. "add %%"REG_a", %2 \n\t"
  586. "add %%"REG_a", %1 \n\t"
  587. PAVGB" %%mm1, %%mm2 \n\t"
  588. PAVGB" %%mm0, %%mm1 \n\t"
  589. "movq %%mm2, (%2, %3) \n\t"
  590. "movq %%mm1, (%2, %%"REG_a") \n\t"
  591. "add %%"REG_a", %2 \n\t"
  592. "subl $4, %0 \n\t"
  593. "jnz 1b \n\t"
  594. :"+g"(h), "+S"(pixels), "+D" (block)
  595. :"r" ((long)line_size)
  596. :"%"REG_a, "memory");
  597. }
  598. /* GL: this function does incorrect rounding if overflow */
  /*
   * Vertical (rows y and y+1) 8-byte average without upward rounding, done
   * approximately: every second loaded row (the one held in mm1) has the
   * constant in mm6 subtracted with unsigned saturation before it takes part
   * in the two PAVGB averages with its neighbours. Results go to block.
   * NOTE(review): MOVQ_BONE is defined elsewhere; presumably it loads 0x01
   * into every byte of mm6 -- confirm.
   * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * One more source row than h is read. Four rows are written per loop pass
   * and the loop exits only when h hits exactly 0, so h is expected to be a
   * positive multiple of 4.
   */
  599. static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  600. {
  601. MOVQ_BONE(mm6);
  602. __asm __volatile(
  /* REG_a = 2 * line_size */
  603. "lea (%3, %3), %%"REG_a" \n\t"
  604. "movq (%1), %%mm0 \n\t"
  /* bias block back one row so stores can use (base, %3) / (base, REG_a) */
  605. "sub %3, %2 \n\t"
  606. "1: \n\t"
  607. "movq (%1, %3), %%mm1 \n\t"
  608. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  609. "add %%"REG_a", %1 \n\t"
  610. "psubusb %%mm6, %%mm1 \n\t"
  611. PAVGB" %%mm1, %%mm0 \n\t"
  612. PAVGB" %%mm2, %%mm1 \n\t"
  613. "movq %%mm0, (%2, %3) \n\t"
  614. "movq %%mm1, (%2, %%"REG_a") \n\t"
  615. "movq (%1, %3), %%mm1 \n\t"
  616. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  617. "add %%"REG_a", %2 \n\t"
  618. "add %%"REG_a", %1 \n\t"
  619. "psubusb %%mm6, %%mm1 \n\t"
  620. PAVGB" %%mm1, %%mm2 \n\t"
  621. PAVGB" %%mm0, %%mm1 \n\t"
  622. "movq %%mm2, (%2, %3) \n\t"
  623. "movq %%mm1, (%2, %%"REG_a") \n\t"
  624. "add %%"REG_a", %2 \n\t"
  625. "subl $4, %0 \n\t"
  626. "jnz 1b \n\t"
  627. :"+g"(h), "+S"(pixels), "+D" (block)
  628. :"r" ((long)line_size)
  629. :"%"REG_a, "memory");
  630. }
  /*
   * Replaces each 8-byte row of block with the PAVGB average of that row and
   * the corresponding row of pixels.
   * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size
   * (the same line_size is used to step both pointers).
   * Four rows are handled per loop pass and the loop exits only when h hits
   * exactly 0, so h is expected to be a positive multiple of 4.
   */
  631. static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  632. {
  633. __asm __volatile(
  /* REG_a = 2 * line_size: the step used to advance by a pair of rows */
  634. "lea (%3, %3), %%"REG_a" \n\t"
  635. "1: \n\t"
  /* rows 0 and 1: load current block content, average with pixels */
  636. "movq (%2), %%mm0 \n\t"
  637. "movq (%2, %3), %%mm1 \n\t"
  638. PAVGB" (%1), %%mm0 \n\t"
  639. PAVGB" (%1, %3), %%mm1 \n\t"
  640. "movq %%mm0, (%2) \n\t"
  641. "movq %%mm1, (%2, %3) \n\t"
  642. "add %%"REG_a", %1 \n\t"
  643. "add %%"REG_a", %2 \n\t"
  /* rows 2 and 3 */
  644. "movq (%2), %%mm0 \n\t"
  645. "movq (%2, %3), %%mm1 \n\t"
  646. PAVGB" (%1), %%mm0 \n\t"
  647. PAVGB" (%1, %3), %%mm1 \n\t"
  648. "add %%"REG_a", %1 \n\t"
  649. "movq %%mm0, (%2) \n\t"
  650. "movq %%mm1, (%2, %3) \n\t"
  651. "add %%"REG_a", %2 \n\t"
  652. "subl $4, %0 \n\t"
  653. "jnz 1b \n\t"
  654. :"+g"(h), "+S"(pixels), "+D"(block)
  655. :"r" ((long)line_size)
  656. :"%"REG_a, "memory");
  657. }
  /*
   * For each 8-byte row: PAVGB-averages the source row with itself shifted
   * right by one byte, then PAVGB-averages that with the bytes already in
   * block, and stores the result back to block.
   * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * Four rows are handled per loop pass and the loop exits only when h hits
   * exactly 0, so h is expected to be a positive multiple of 4.
   */
  658. static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  659. {
  660. __asm __volatile(
  /* REG_a = 2 * line_size: the step used to advance by a pair of rows */
  661. "lea (%3, %3), %%"REG_a" \n\t"
  662. "1: \n\t"
  /* rows 0 and 1 of this pass */
  663. "movq (%1), %%mm0 \n\t"
  664. "movq (%1, %3), %%mm2 \n\t"
  665. PAVGB" 1(%1), %%mm0 \n\t"
  666. PAVGB" 1(%1, %3), %%mm2 \n\t"
  667. PAVGB" (%2), %%mm0 \n\t"
  668. PAVGB" (%2, %3), %%mm2 \n\t"
  669. "add %%"REG_a", %1 \n\t"
  670. "movq %%mm0, (%2) \n\t"
  671. "movq %%mm2, (%2, %3) \n\t"
  /* rows 2 and 3 of this pass */
  672. "movq (%1), %%mm0 \n\t"
  673. "movq (%1, %3), %%mm2 \n\t"
  674. PAVGB" 1(%1), %%mm0 \n\t"
  675. PAVGB" 1(%1, %3), %%mm2 \n\t"
  676. "add %%"REG_a", %2 \n\t"
  677. "add %%"REG_a", %1 \n\t"
  678. PAVGB" (%2), %%mm0 \n\t"
  679. PAVGB" (%2, %3), %%mm2 \n\t"
  680. "movq %%mm0, (%2) \n\t"
  681. "movq %%mm2, (%2, %3) \n\t"
  682. "add %%"REG_a", %2 \n\t"
  683. "subl $4, %0 \n\t"
  684. "jnz 1b \n\t"
  685. :"+g"(h), "+S"(pixels), "+D"(block)
  686. :"r" ((long)line_size)
  687. :"%"REG_a, "memory");
  688. }
  /*
   * For each 8-byte row: PAVGB-averages source rows y and y+1, then
   * PAVGB-averages that with the bytes already in block (loaded into
   * mm3/mm4), and stores the result back to block.
   * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * One more source row than h is read. Four rows are written per loop pass
   * and the loop exits only when h hits exactly 0, so h is expected to be a
   * positive multiple of 4.
   */
  689. static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  690. {
  691. __asm __volatile(
  /* REG_a = 2 * line_size */
  692. "lea (%3, %3), %%"REG_a" \n\t"
  693. "movq (%1), %%mm0 \n\t"
  /* bias block back one row so block accesses use (base, %3) / (base, REG_a) */
  694. "sub %3, %2 \n\t"
  695. "1: \n\t"
  696. "movq (%1, %3), %%mm1 \n\t"
  697. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  698. "add %%"REG_a", %1 \n\t"
  699. PAVGB" %%mm1, %%mm0 \n\t"
  700. PAVGB" %%mm2, %%mm1 \n\t"
  /* fetch the existing destination rows and blend them in */
  701. "movq (%2, %3), %%mm3 \n\t"
  702. "movq (%2, %%"REG_a"), %%mm4 \n\t"
  703. PAVGB" %%mm3, %%mm0 \n\t"
  704. PAVGB" %%mm4, %%mm1 \n\t"
  705. "movq %%mm0, (%2, %3) \n\t"
  706. "movq %%mm1, (%2, %%"REG_a") \n\t"
  707. "movq (%1, %3), %%mm1 \n\t"
  708. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  709. PAVGB" %%mm1, %%mm2 \n\t"
  710. PAVGB" %%mm0, %%mm1 \n\t"
  711. "add %%"REG_a", %2 \n\t"
  712. "add %%"REG_a", %1 \n\t"
  713. "movq (%2, %3), %%mm3 \n\t"
  714. "movq (%2, %%"REG_a"), %%mm4 \n\t"
  715. PAVGB" %%mm3, %%mm2 \n\t"
  716. PAVGB" %%mm4, %%mm1 \n\t"
  717. "movq %%mm2, (%2, %3) \n\t"
  718. "movq %%mm1, (%2, %%"REG_a") \n\t"
  719. "add %%"REG_a", %2 \n\t"
  720. "subl $4, %0 \n\t"
  721. "jnz 1b \n\t"
  722. :"+g"(h), "+S"(pixels), "+D"(block)
  723. :"r" ((long)line_size)
  724. :"%"REG_a, "memory");
  725. }
  726. // Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter
  /*
   * Approximates the 2x2 neighbourhood average (x, x+1 on rows y, y+1) by
   * chaining PAVGB operations: each source row is first averaged with itself
   * shifted right by one byte, adjacent rows are then averaged together, and
   * the result is finally averaged with the bytes already in block.
   * On every second row pair, the constant in mm6 is subtracted (unsigned
   * saturating) from one of the rows before averaging.
   * NOTE(review): MOVQ_BONE is defined elsewhere; presumably it loads 0x01
   * into every byte of mm6 -- confirm.
   * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * One more source row than h is read. Four rows are written per loop pass
   * and the loop exits only when h hits exactly 0, so h is expected to be a
   * positive multiple of 4.
   */
  727. static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  728. {
  729. MOVQ_BONE(mm6);
  730. __asm __volatile(
  /* REG_a = 2 * line_size */
  731. "lea (%3, %3), %%"REG_a" \n\t"
  /* mm0 = horizontal average of the first source row */
  732. "movq (%1), %%mm0 \n\t"
  733. PAVGB" 1(%1), %%mm0 \n\t"
  /* align the loop entry point to an 8-byte boundary */
  734. ".balign 8 \n\t"
  735. "1: \n\t"
  736. "movq (%1, %%"REG_a"), %%mm2 \n\t"
  737. "movq (%1, %3), %%mm1 \n\t"
  738. "psubusb %%mm6, %%mm2 \n\t"
  739. PAVGB" 1(%1, %3), %%mm1 \n\t"
  740. PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
  741. "add %%"REG_a", %1 \n\t"
  742. PAVGB" %%mm1, %%mm0 \n\t"
  743. PAVGB" %%mm2, %%mm1 \n\t"
  /* blend with the existing destination rows */
  744. PAVGB" (%2), %%mm0 \n\t"
  745. PAVGB" (%2, %3), %%mm1 \n\t"
  746. "movq %%mm0, (%2) \n\t"
  747. "movq %%mm1, (%2, %3) \n\t"
  748. "movq (%1, %3), %%mm1 \n\t"
  749. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  750. PAVGB" 1(%1, %3), %%mm1 \n\t"
  751. PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
  752. "add %%"REG_a", %2 \n\t"
  753. "add %%"REG_a", %1 \n\t"
  754. PAVGB" %%mm1, %%mm2 \n\t"
  755. PAVGB" %%mm0, %%mm1 \n\t"
  756. PAVGB" (%2), %%mm2 \n\t"
  757. PAVGB" (%2, %3), %%mm1 \n\t"
  758. "movq %%mm2, (%2) \n\t"
  759. "movq %%mm1, (%2, %3) \n\t"
  760. "add %%"REG_a", %2 \n\t"
  761. "subl $4, %0 \n\t"
  762. "jnz 1b \n\t"
  763. :"+g"(h), "+S"(pixels), "+D"(block)
  764. :"r" ((long)line_size)
  765. :"%"REG_a, "memory");
  766. }
  767. //FIXME the following could be optimized too ...
  768. static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  769. DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
  770. DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
  771. }
  772. static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  773. DEF(put_pixels8_y2)(block , pixels , line_size, h);
  774. DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
  775. }
  776. static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  777. DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
  778. DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
  779. }
  780. static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  781. DEF(avg_pixels8)(block , pixels , line_size, h);
  782. DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
  783. }
  784. static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  785. DEF(avg_pixels8_x2)(block , pixels , line_size, h);
  786. DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
  787. }
  788. static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  789. DEF(avg_pixels8_y2)(block , pixels , line_size, h);
  790. DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
  791. }
  792. static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
  793. DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
  794. DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
  795. }