You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

297 lines
8.2KB

  1. /*
  2. * DSP utils : average functions are compiled twice for 3dnow/mmx2
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. * Copyright (c) 2002 Michael Niedermayer
  5. *
  6. * This library is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2 of the License, or (at your option) any later version.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  19. *
  20. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  21. * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  22. * and improved by Zdenek Kabelac <kabi@users.sf.net>
  23. */
  24. /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
  25. clobber bug - now it will work with 2.95.2 and also with -fPIC
  26. */
  /*
   * put_pixels_x2: horizontal half-pel "put".
   *
   * For every row, writes to block the PAVGB of the 8 bytes at pixels and the
   * 8 bytes at pixels + 1 (so 9 source bytes are read per row).
   *
   * block     - destination, advanced by line_size per row
   * pixels    - source, advanced by line_size per row
   * line_size - byte stride used for both source and destination
   * h         - row count; four rows are handled per loop pass and the loop
   *             only exits when h reaches exactly 0 after "subl $4", so h is
   *             expected to be a positive multiple of 4
   *
   * DEF() and PAVGB are macros defined outside this view; per the file header
   * the file is compiled once per 3dnow/mmx2 variant, so PAVGB is presumably
   * that variant's packed byte-average instruction - confirm in the includer.
   * NOTE(review): pointers are updated with 32-bit addl, so this is 32-bit
   * x86 code.
   */
  27. static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  28. {
  29. __asm __volatile(
  /* eax = 2 * line_size: the step used to advance two rows at a time */
  30. "lea (%3, %3), %%eax \n\t"
  31. "1: \n\t"
  /* rows 0 and 1: load, average with the bytes one position to the right, store */
  32. "movq (%1), %%mm0 \n\t"
  33. "movq (%1, %3), %%mm1 \n\t"
  34. PAVGB" 1(%1), %%mm0 \n\t"
  35. PAVGB" 1(%1, %3), %%mm1 \n\t"
  36. "movq %%mm0, (%2) \n\t"
  37. "movq %%mm1, (%2, %3) \n\t"
  38. "addl %%eax, %1 \n\t"
  39. "addl %%eax, %2 \n\t"
  /* rows 2 and 3: same operation on the next pair of rows */
  40. "movq (%1), %%mm0 \n\t"
  41. "movq (%1, %3), %%mm1 \n\t"
  42. PAVGB" 1(%1), %%mm0 \n\t"
  43. PAVGB" 1(%1, %3), %%mm1 \n\t"
  44. "addl %%eax, %1 \n\t"
  45. "movq %%mm0, (%2) \n\t"
  46. "movq %%mm1, (%2, %3) \n\t"
  47. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h is exactly 0 */
  48. "subl $4, %0 \n\t"
  49. "jnz 1b \n\t"
  /* operands: %0 = h, %1 = pixels, %2 = block (all read-write), %3 = line_size */
  50. :"+g"(h), "+S"(pixels), "+D"(block)
  51. :"r" (line_size)
  /* eax is used as scratch; "memory" because block is written through %2 */
  52. :"%eax", "memory");
  53. }
  54. /* GL: this function does incorrect rounding if overflow */
  /*
   * put_no_rnd_pixels_x2: horizontal half-pel "put", no-rounding variant.
   *
   * For every row, subtracts mm6 (unsigned, saturating) from the 8 bytes at
   * pixels, then PAVGBs the result with the 8 bytes at pixels + 1 and stores
   * it to block.
   *
   * Assuming MOVQ_BONE(mm6) fills every byte of mm6 with 1 and PAVGB rounds
   * up (both are defined outside this view - confirm), avg(a - 1, b) rounded
   * up equals (a + b) >> 1, i.e. a round-down average. Because psubusb
   * saturates at 0, a source byte of 0 does not receive the -1 bias; that is
   * presumably the incorrect-rounding case the GL note refers to.
   *
   * block     - destination, advanced by line_size per row
   * pixels    - source, advanced by line_size per row (9 bytes read per row)
   * line_size - byte stride used for both source and destination
   * h         - row count; four rows per loop pass, loop exits only when h
   *             reaches exactly 0, so a positive multiple of 4 is expected
   *
   * NOTE(review): mm6 is set up in a separate statement and is relied on to
   * survive into the asm block below.
   */
  55. static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  56. {
  57. MOVQ_BONE(mm6);
  58. __asm __volatile(
  /* eax = 2 * line_size */
  59. "lea (%3, %3), %%eax \n\t"
  60. "1: \n\t"
  /* rows 0 and 1: mm0/mm2 = bytes at x, mm1/mm3 = bytes at x + 1 */
  61. "movq (%1), %%mm0 \n\t"
  62. "movq (%1, %3), %%mm2 \n\t"
  63. "movq 1(%1), %%mm1 \n\t"
  64. "movq 1(%1, %3), %%mm3 \n\t"
  65. "addl %%eax, %1 \n\t"
  /* bias the left-hand bytes by -mm6 (saturating) before averaging */
  66. "psubusb %%mm6, %%mm0 \n\t"
  67. "psubusb %%mm6, %%mm2 \n\t"
  68. PAVGB" %%mm1, %%mm0 \n\t"
  69. PAVGB" %%mm3, %%mm2 \n\t"
  70. "movq %%mm0, (%2) \n\t"
  71. "movq %%mm2, (%2, %3) \n\t"
  /* rows 2 and 3: same sequence; pixels was already advanced above */
  72. "movq (%1), %%mm0 \n\t"
  73. "movq 1(%1), %%mm1 \n\t"
  74. "movq (%1, %3), %%mm2 \n\t"
  75. "movq 1(%1, %3), %%mm3 \n\t"
  76. "addl %%eax, %2 \n\t"
  77. "addl %%eax, %1 \n\t"
  78. "psubusb %%mm6, %%mm0 \n\t"
  79. "psubusb %%mm6, %%mm2 \n\t"
  80. PAVGB" %%mm1, %%mm0 \n\t"
  81. PAVGB" %%mm3, %%mm2 \n\t"
  82. "movq %%mm0, (%2) \n\t"
  83. "movq %%mm2, (%2, %3) \n\t"
  84. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h is exactly 0 */
  85. "subl $4, %0 \n\t"
  86. "jnz 1b \n\t"
  /* operands: %0 = h, %1 = pixels, %2 = block (all read-write), %3 = line_size */
  87. :"+g"(h), "+S"(pixels), "+D"(block)
  88. :"r" (line_size)
  /* eax is used as scratch; "memory" because block is written through %2 */
  89. :"%eax", "memory");
  90. }
  /*
   * put_pixels_y2: vertical half-pel "put".
   *
   * Output row r is the PAVGB of source rows r and r + 1 (8 bytes each), so
   * h + 1 source rows are read in total. Each source row is loaded once: the
   * last row loaded in a loop pass stays in mm0 and becomes the first row of
   * the next pass.
   *
   * block     - destination, advanced by line_size per row
   * pixels    - source, advanced by line_size per row
   * line_size - byte stride used for both source and destination
   * h         - row count; four rows per loop pass, loop exits only when h
   *             reaches exactly 0, so a positive multiple of 4 is expected
   *
   * DEF() and PAVGB are macros defined outside this view; PAVGB is presumably
   * the packed byte-average instruction of the build variant - confirm.
   */
  91. static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  92. {
  93. __asm __volatile(
  /* eax = 2 * line_size */
  94. "lea (%3, %3), %%eax \n\t"
  /* preload source row 0 */
  95. "movq (%1), %%mm0 \n\t"
  /* bias block back one row so stores can use the same (+line_size, +eax)
     offsets as the source loads */
  96. "subl %3, %2 \n\t"
  97. "1: \n\t"
  /* mm1 = source row 1, mm2 = source row 2 */
  98. "movq (%1, %3), %%mm1 \n\t"
  99. "movq (%1, %%eax), %%mm2 \n\t"
  100. "addl %%eax, %1 \n\t"
  /* out row 0 = avg(row 0, row 1), out row 1 = avg(row 1, row 2) */
  101. PAVGB" %%mm1, %%mm0 \n\t"
  102. PAVGB" %%mm2, %%mm1 \n\t"
  103. "movq %%mm0, (%2, %3) \n\t"
  104. "movq %%mm1, (%2, %%eax) \n\t"
  /* mm1 = source row 3, mm0 = source row 4 (carried into the next pass) */
  105. "movq (%1, %3), %%mm1 \n\t"
  106. "movq (%1, %%eax), %%mm0 \n\t"
  107. "addl %%eax, %2 \n\t"
  108. "addl %%eax, %1 \n\t"
  /* out row 2 = avg(row 2, row 3), out row 3 = avg(row 3, row 4) */
  109. PAVGB" %%mm1, %%mm2 \n\t"
  110. PAVGB" %%mm0, %%mm1 \n\t"
  111. "movq %%mm2, (%2, %3) \n\t"
  112. "movq %%mm1, (%2, %%eax) \n\t"
  113. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h is exactly 0 */
  114. "subl $4, %0 \n\t"
  115. "jnz 1b \n\t"
  /* operands: %0 = h, %1 = pixels, %2 = block (all read-write), %3 = line_size */
  116. :"+g"(h), "+S"(pixels), "+D" (block)
  117. :"r" (line_size)
  /* eax is used as scratch; "memory" because block is written through %2 */
  118. :"%eax", "memory");
  119. }
  120. /* GL: this function does incorrect rounding if overflow */
  /*
   * put_no_rnd_pixels_y2: vertical half-pel "put", no-rounding variant.
   *
   * Output row r is the PAVGB of source rows r and r + 1 (8 bytes each), with
   * mm6 subtracted (unsigned, saturating) from every odd source row before it
   * is used. Each odd row takes part in both averages of its pair, so one
   * subtraction biases two output rows.
   *
   * Assuming MOVQ_BONE(mm6) fills every byte of mm6 with 1 and PAVGB rounds
   * up (both are defined outside this view - confirm), this turns the
   * round-up average into a round-down one. psubusb saturates at 0, so a
   * source byte of 0 is not biased; that is presumably the incorrect-rounding
   * case the GL note refers to.
   *
   * block     - destination, advanced by line_size per row
   * pixels    - source, advanced by line_size per row (h + 1 rows are read)
   * line_size - byte stride used for both source and destination
   * h         - row count; four rows per loop pass, loop exits only when h
   *             reaches exactly 0, so a positive multiple of 4 is expected
   */
  121. static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  122. {
  123. MOVQ_BONE(mm6);
  124. __asm __volatile(
  /* eax = 2 * line_size */
  125. "lea (%3, %3), %%eax \n\t"
  /* preload source row 0 */
  126. "movq (%1), %%mm0 \n\t"
  /* bias block back one row so stores can use (+line_size, +eax) offsets */
  127. "subl %3, %2 \n\t"
  128. "1: \n\t"
  /* mm1 = source row 1, mm2 = source row 2 */
  129. "movq (%1, %3), %%mm1 \n\t"
  130. "movq (%1, %%eax), %%mm2 \n\t"
  131. "addl %%eax, %1 \n\t"
  /* bias row 1, then out row 0 = avg(row 0, row 1'), out row 1 = avg(row 1', row 2) */
  132. "psubusb %%mm6, %%mm1 \n\t"
  133. PAVGB" %%mm1, %%mm0 \n\t"
  134. PAVGB" %%mm2, %%mm1 \n\t"
  135. "movq %%mm0, (%2, %3) \n\t"
  136. "movq %%mm1, (%2, %%eax) \n\t"
  /* mm1 = source row 3, mm0 = source row 4 (carried into the next pass) */
  137. "movq (%1, %3), %%mm1 \n\t"
  138. "movq (%1, %%eax), %%mm0 \n\t"
  139. "addl %%eax, %2 \n\t"
  140. "addl %%eax, %1 \n\t"
  /* bias row 3, then out row 2 = avg(row 2, row 3'), out row 3 = avg(row 3', row 4) */
  141. "psubusb %%mm6, %%mm1 \n\t"
  142. PAVGB" %%mm1, %%mm2 \n\t"
  143. PAVGB" %%mm0, %%mm1 \n\t"
  144. "movq %%mm2, (%2, %3) \n\t"
  145. "movq %%mm1, (%2, %%eax) \n\t"
  146. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h is exactly 0 */
  147. "subl $4, %0 \n\t"
  148. "jnz 1b \n\t"
  /* operands: %0 = h, %1 = pixels, %2 = block (all read-write), %3 = line_size */
  149. :"+g"(h), "+S"(pixels), "+D" (block)
  150. :"r" (line_size)
  /* eax is used as scratch; "memory" because block is written through %2 */
  151. :"%eax", "memory");
  152. }
  /*
   * avg_pixels: full-pel "avg".
   *
   * For every row, replaces the 8 bytes at block with the PAVGB of those
   * bytes and the 8 bytes at pixels.
   *
   * block     - destination; read and then overwritten, advanced by line_size
   * pixels    - source, advanced by line_size per row
   * line_size - byte stride used for both source and destination
   * h         - row count; four rows per loop pass, loop exits only when h
   *             reaches exactly 0, so a positive multiple of 4 is expected
   *
   * DEF() and PAVGB are macros defined outside this view; PAVGB is presumably
   * the packed byte-average instruction of the build variant - confirm.
   */
  153. static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  154. {
  155. __asm __volatile(
  /* eax = 2 * line_size */
  156. "lea (%3, %3), %%eax \n\t"
  157. "1: \n\t"
  /* rows 0 and 1: load the current destination, average with the source, store back */
  158. "movq (%2), %%mm0 \n\t"
  159. "movq (%2, %3), %%mm1 \n\t"
  160. PAVGB" (%1), %%mm0 \n\t"
  161. PAVGB" (%1, %3), %%mm1 \n\t"
  162. "movq %%mm0, (%2) \n\t"
  163. "movq %%mm1, (%2, %3) \n\t"
  164. "addl %%eax, %1 \n\t"
  165. "addl %%eax, %2 \n\t"
  /* rows 2 and 3: same operation on the next pair of rows */
  166. "movq (%2), %%mm0 \n\t"
  167. "movq (%2, %3), %%mm1 \n\t"
  168. PAVGB" (%1), %%mm0 \n\t"
  169. PAVGB" (%1, %3), %%mm1 \n\t"
  170. "addl %%eax, %1 \n\t"
  171. "movq %%mm0, (%2) \n\t"
  172. "movq %%mm1, (%2, %3) \n\t"
  173. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h is exactly 0 */
  174. "subl $4, %0 \n\t"
  175. "jnz 1b \n\t"
  /* operands: %0 = h, %1 = pixels, %2 = block (all read-write), %3 = line_size */
  176. :"+g"(h), "+S"(pixels), "+D"(block)
  177. :"r" (line_size)
  /* eax is used as scratch; "memory" because block is written through %2 */
  178. :"%eax", "memory");
  179. }
  /*
   * avg_pixels_x2: horizontal half-pel "avg".
   *
   * For every row, computes the PAVGB of the 8 bytes at pixels and the 8
   * bytes at pixels + 1, then PAVGBs that with the 8 bytes already at block
   * and stores the result back to block.
   *
   * block     - destination; read and then overwritten, advanced by line_size
   * pixels    - source, advanced by line_size per row (9 bytes read per row)
   * line_size - byte stride used for both source and destination
   * h         - row count; four rows per loop pass, loop exits only when h
   *             reaches exactly 0, so a positive multiple of 4 is expected
   *
   * DEF() and PAVGB are macros defined outside this view; PAVGB is presumably
   * the packed byte-average instruction of the build variant - confirm.
   */
  180. static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  181. {
  182. __asm __volatile(
  /* eax = 2 * line_size */
  183. "lea (%3, %3), %%eax \n\t"
  184. "1: \n\t"
  /* rows 0 and 1: horizontal average of the source ... */
  185. "movq (%1), %%mm0 \n\t"
  186. "movq (%1, %3), %%mm2 \n\t"
  187. PAVGB" 1(%1), %%mm0 \n\t"
  188. PAVGB" 1(%1, %3), %%mm2 \n\t"
  /* ... then average with the existing destination and store */
  189. PAVGB" (%2), %%mm0 \n\t"
  190. PAVGB" (%2, %3), %%mm2 \n\t"
  191. "addl %%eax, %1 \n\t"
  192. "movq %%mm0, (%2) \n\t"
  193. "movq %%mm2, (%2, %3) \n\t"
  /* rows 2 and 3: same operation; block is advanced before it is read */
  194. "movq (%1), %%mm0 \n\t"
  195. "movq (%1, %3), %%mm2 \n\t"
  196. PAVGB" 1(%1), %%mm0 \n\t"
  197. PAVGB" 1(%1, %3), %%mm2 \n\t"
  198. "addl %%eax, %2 \n\t"
  199. "addl %%eax, %1 \n\t"
  200. PAVGB" (%2), %%mm0 \n\t"
  201. PAVGB" (%2, %3), %%mm2 \n\t"
  202. "movq %%mm0, (%2) \n\t"
  203. "movq %%mm2, (%2, %3) \n\t"
  204. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h is exactly 0 */
  205. "subl $4, %0 \n\t"
  206. "jnz 1b \n\t"
  /* operands: %0 = h, %1 = pixels, %2 = block (all read-write), %3 = line_size */
  207. :"+g"(h), "+S"(pixels), "+D"(block)
  208. :"r" (line_size)
  /* eax is used as scratch; "memory" because block is written through %2 */
  209. :"%eax", "memory");
  210. }
  /*
   * avg_pixels_y2: vertical half-pel "avg".
   *
   * Output row r is the PAVGB of source rows r and r + 1 (8 bytes each),
   * PAVGBed again with the 8 bytes already at block row r. h + 1 source rows
   * are read; the last row loaded in a loop pass stays in mm0 and becomes the
   * first row of the next pass.
   *
   * block     - destination; read and then overwritten, advanced by line_size
   * pixels    - source, advanced by line_size per row
   * line_size - byte stride used for both source and destination
   * h         - row count; four rows per loop pass, loop exits only when h
   *             reaches exactly 0, so a positive multiple of 4 is expected
   *
   * DEF() and PAVGB are macros defined outside this view; PAVGB is presumably
   * the packed byte-average instruction of the build variant - confirm.
   */
  211. static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  212. {
  213. __asm __volatile(
  /* eax = 2 * line_size */
  214. "lea (%3, %3), %%eax \n\t"
  /* preload source row 0 */
  215. "movq (%1), %%mm0 \n\t"
  /* bias block back one row so destination accesses can use
     (+line_size, +eax) offsets */
  216. "subl %3, %2 \n\t"
  217. "1: \n\t"
  /* mm1 = source row 1, mm2 = source row 2 */
  218. "movq (%1, %3), %%mm1 \n\t"
  219. "movq (%1, %%eax), %%mm2 \n\t"
  220. "addl %%eax, %1 \n\t"
  /* vertical averages: mm0 = avg(row 0, row 1), mm1 = avg(row 1, row 2) */
  221. PAVGB" %%mm1, %%mm0 \n\t"
  222. PAVGB" %%mm2, %%mm1 \n\t"
  /* blend with the existing destination rows 0 and 1, then store */
  223. "movq (%2, %3), %%mm3 \n\t"
  224. "movq (%2, %%eax), %%mm4 \n\t"
  225. PAVGB" %%mm3, %%mm0 \n\t"
  226. PAVGB" %%mm4, %%mm1 \n\t"
  227. "movq %%mm0, (%2, %3) \n\t"
  228. "movq %%mm1, (%2, %%eax) \n\t"
  /* mm1 = source row 3, mm0 = source row 4 (carried into the next pass) */
  229. "movq (%1, %3), %%mm1 \n\t"
  230. "movq (%1, %%eax), %%mm0 \n\t"
  /* vertical averages: mm2 = avg(row 2, row 3), mm1 = avg(row 3, row 4) */
  231. PAVGB" %%mm1, %%mm2 \n\t"
  232. PAVGB" %%mm0, %%mm1 \n\t"
  233. "addl %%eax, %2 \n\t"
  234. "addl %%eax, %1 \n\t"
  /* blend with the existing destination rows 2 and 3, then store */
  235. "movq (%2, %3), %%mm3 \n\t"
  236. "movq (%2, %%eax), %%mm4 \n\t"
  237. PAVGB" %%mm3, %%mm2 \n\t"
  238. PAVGB" %%mm4, %%mm1 \n\t"
  239. "movq %%mm2, (%2, %3) \n\t"
  240. "movq %%mm1, (%2, %%eax) \n\t"
  241. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h is exactly 0 */
  242. "subl $4, %0 \n\t"
  243. "jnz 1b \n\t"
  /* operands: %0 = h, %1 = pixels, %2 = block (all read-write), %3 = line_size */
  244. :"+g"(h), "+S"(pixels), "+D"(block)
  245. :"r" (line_size)
  /* eax is used as scratch; "memory" because block is written through %2 */
  246. :"%eax", "memory");
  247. }
  248. // Note: this is not correctly rounded, but this function is only used for B-frames so it doesn't matter
  /*
   * avg_pixels_xy2: diagonal (x and y) half-pel "avg", approximated with
   * cascaded byte averages.
   *
   * Each source row is first averaged horizontally (bytes at x with bytes at
   * x + 1); output row r is then the PAVGB of the horizontal averages of
   * source rows r and r + 1, PAVGBed once more with the 8 bytes already at
   * block row r. h + 1 source rows are read, 9 bytes each.
   *
   * mm6 is subtracted (unsigned, saturating) from the raw bytes of every
   * fourth source row only (the row held in mm2), before its horizontal
   * average. Assuming MOVQ_BONE(mm6) fills every byte of mm6 with 1 (macro
   * defined outside this view - confirm), this is a partial compensation for
   * the upward bias of stacking averages; see the rounding note above.
   *
   * block     - destination; read and then overwritten, advanced by line_size
   * pixels    - source, advanced by line_size per row
   * line_size - byte stride used for both source and destination
   * h         - row count; four rows per loop pass, loop exits only when h
   *             reaches exactly 0, so a positive multiple of 4 is expected
   */
  249. static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  250. {
  251. MOVQ_BONE(mm6);
  252. __asm __volatile(
  /* eax = 2 * line_size */
  253. "lea (%3, %3), %%eax \n\t"
  /* mm0 = horizontal average of source row 0 */
  254. "movq (%1), %%mm0 \n\t"
  255. PAVGB" 1(%1), %%mm0 \n\t"
  /* align the loop entry to an 8-byte boundary */
  256. ".balign 8 \n\t"
  257. "1: \n\t"
  /* mm2 = source row 2 (biased by -mm6), mm1 = source row 1 */
  258. "movq (%1, %%eax), %%mm2 \n\t"
  259. "movq (%1, %3), %%mm1 \n\t"
  260. "psubusb %%mm6, %%mm2 \n\t"
  /* horizontal averages of rows 1 and 2 */
  261. PAVGB" 1(%1, %3), %%mm1 \n\t"
  262. PAVGB" 1(%1, %%eax), %%mm2 \n\t"
  263. "addl %%eax, %1 \n\t"
  /* vertical averages: mm0 = rows 0/1, mm1 = rows 1/2 */
  264. PAVGB" %%mm1, %%mm0 \n\t"
  265. PAVGB" %%mm2, %%mm1 \n\t"
  /* blend with the existing destination rows 0 and 1, then store */
  266. PAVGB" (%2), %%mm0 \n\t"
  267. PAVGB" (%2, %3), %%mm1 \n\t"
  268. "movq %%mm0, (%2) \n\t"
  269. "movq %%mm1, (%2, %3) \n\t"
  /* mm1 = source row 3, mm0 = source row 4 (carried into the next pass) */
  270. "movq (%1, %3), %%mm1 \n\t"
  271. "movq (%1, %%eax), %%mm0 \n\t"
  /* horizontal averages of rows 3 and 4 (no -mm6 bias on these rows) */
  272. PAVGB" 1(%1, %3), %%mm1 \n\t"
  273. PAVGB" 1(%1, %%eax), %%mm0 \n\t"
  274. "addl %%eax, %2 \n\t"
  275. "addl %%eax, %1 \n\t"
  /* vertical averages: mm2 = rows 2/3, mm1 = rows 3/4 */
  276. PAVGB" %%mm1, %%mm2 \n\t"
  277. PAVGB" %%mm0, %%mm1 \n\t"
  /* blend with the existing destination rows 2 and 3, then store */
  278. PAVGB" (%2), %%mm2 \n\t"
  279. PAVGB" (%2, %3), %%mm1 \n\t"
  280. "movq %%mm2, (%2) \n\t"
  281. "movq %%mm1, (%2, %3) \n\t"
  282. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h is exactly 0 */
  283. "subl $4, %0 \n\t"
  284. "jnz 1b \n\t"
  /* operands: %0 = h, %1 = pixels, %2 = block (all read-write), %3 = line_size */
  285. :"+g"(h), "+S"(pixels), "+D"(block)
  286. :"r" (line_size)
  /* eax is used as scratch; "memory" because block is written through %2 */
  287. :"%eax", "memory");
  288. }