You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

368 lines
11KB

  1. /*
  2. * DSP utils : average functions are compiled twice for 3dnow/mmx2
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. * Copyright (c) 2002 Michael Niedermayer
  5. *
  6. * This library is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2 of the License, or (at your option) any later version.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  19. *
  20. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  21. * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  22. * and improved by Zdenek Kabelac <kabi@users.sf.net>
  23. */
  24. /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
  25. clobber bug - now it will work with 2.95.2 and also with -fPIC
  26. */
  /*
   * put_pixels8_x2: store into block, row by row, the per-byte PAVGB of each
   * 8-byte source row with the same row read one byte further right.
   *
   * block     - destination; 8 bytes are written per row
   * pixels    - source; bytes 0..8 of each row are read
   * line_size - byte distance between consecutive rows, used for both buffers
   * h         - row count; four rows are handled per loop pass and the loop
   *             only exits when the counter reaches exactly zero, so h must
   *             be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (esi), %2 = block (edi), %3 = line_size.
   * PAVGB is defined outside this chunk; per the file header it is presumably
   * the 3DNow!/MMX2 packed unsigned-byte average - confirm at its definition.
   */
  27. static void DEF(put_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  28. {
  29. __asm __volatile(
  30. "lea (%3, %3), %%eax \n\t" /* eax = 2 * line_size (two-row step) */
  31. "1: \n\t"
  32. "movq (%1), %%mm0 \n\t" /* first row pair of this pass */
  33. "movq (%1, %3), %%mm1 \n\t"
  34. PAVGB" 1(%1), %%mm0 \n\t" /* combine with the row shifted by one byte */
  35. PAVGB" 1(%1, %3), %%mm1 \n\t"
  36. "movq %%mm0, (%2) \n\t"
  37. "movq %%mm1, (%2, %3) \n\t"
  38. "addl %%eax, %1 \n\t" /* advance source and destination by two rows */
  39. "addl %%eax, %2 \n\t"
  40. "movq (%1), %%mm0 \n\t" /* second row pair of this pass */
  41. "movq (%1, %3), %%mm1 \n\t"
  42. PAVGB" 1(%1), %%mm0 \n\t"
  43. PAVGB" 1(%1, %3), %%mm1 \n\t"
  44. "addl %%eax, %1 \n\t"
  45. "movq %%mm0, (%2) \n\t"
  46. "movq %%mm1, (%2, %3) \n\t"
  47. "addl %%eax, %2 \n\t"
  48. "subl $4, %0 \n\t" /* four rows done */
  49. "jnz 1b \n\t"
  50. :"+g"(h), "+S"(pixels), "+D"(block)
  51. :"r" (line_size)
  52. :"%eax", "memory");
  53. }
  /*
   * put_pixels16_x2: 16-byte-wide form of the horizontal kernel. For every
   * row, both 8-byte halves of the source are combined via PAVGB with the
   * same data read one byte further right, and the result is stored to block.
   *
   * block     - destination; 16 bytes are written per row
   * pixels    - source; bytes 0..16 of each row are read
   * line_size - byte distance between consecutive rows, used for both buffers
   * h         - row count; the loop consumes four rows per pass and only
   *             exits on an exact zero, so h must be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (esi), %2 = block (edi), %3 = line_size.
   * PAVGB is defined outside this chunk; presumably the 3DNow!/MMX2 packed
   * unsigned-byte average - confirm at its definition.
   */
  54. static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  55. {
  56. __asm __volatile(
  57. "lea (%3, %3), %%eax \n\t" /* eax = 2 * line_size */
  58. "1: \n\t"
  59. "movq (%1), %%mm0 \n\t" /* left halves of two rows */
  60. "movq (%1, %3), %%mm1 \n\t"
  61. "movq 8(%1), %%mm2 \n\t" /* right halves of the same two rows */
  62. "movq 8(%1, %3), %%mm3 \n\t"
  63. PAVGB" 1(%1), %%mm0 \n\t" /* offsets 1 and 9: same data, one byte right */
  64. PAVGB" 1(%1, %3), %%mm1 \n\t"
  65. PAVGB" 9(%1), %%mm2 \n\t"
  66. PAVGB" 9(%1, %3), %%mm3 \n\t"
  67. "movq %%mm0, (%2) \n\t"
  68. "movq %%mm1, (%2, %3) \n\t"
  69. "movq %%mm2, 8(%2) \n\t"
  70. "movq %%mm3, 8(%2, %3) \n\t"
  71. "addl %%eax, %1 \n\t" /* advance both pointers by two rows */
  72. "addl %%eax, %2 \n\t"
  73. "movq (%1), %%mm0 \n\t" /* second row pair of this pass */
  74. "movq (%1, %3), %%mm1 \n\t"
  75. "movq 8(%1), %%mm2 \n\t"
  76. "movq 8(%1, %3), %%mm3 \n\t"
  77. PAVGB" 1(%1), %%mm0 \n\t"
  78. PAVGB" 1(%1, %3), %%mm1 \n\t"
  79. PAVGB" 9(%1), %%mm2 \n\t"
  80. PAVGB" 9(%1, %3), %%mm3 \n\t"
  81. "addl %%eax, %1 \n\t"
  82. "movq %%mm0, (%2) \n\t"
  83. "movq %%mm1, (%2, %3) \n\t"
  84. "movq %%mm2, 8(%2) \n\t"
  85. "movq %%mm3, 8(%2, %3) \n\t"
  86. "addl %%eax, %2 \n\t"
  87. "subl $4, %0 \n\t" /* four rows done */
  88. "jnz 1b \n\t"
  89. :"+g"(h), "+S"(pixels), "+D"(block)
  90. :"r" (line_size)
  91. :"%eax", "memory");
  92. }
  93. /* GL: this function does incorrect rounding if overflow */
  /*
   * put_no_rnd_pixels8_x2: horizontal kernel like put_pixels8_x2, except that
   * mm6 is subtracted (unsigned, saturating) from the unshifted row before it
   * is combined via PAVGB with the row read one byte further right.
   *
   * MOVQ_BONE and PAVGB are defined outside this chunk. Presumably MOVQ_BONE
   * fills every byte of mm6 with 1 and PAVGB is a round-up byte average, so
   * the subtraction turns the result into a round-down average; a source byte
   * of 0 cannot be decremented (psubusb saturates), which is presumably the
   * rounding error the note above refers to - confirm at the definitions.
   *
   * block     - destination; 8 bytes are written per row
   * pixels    - source; bytes 0..8 of each row are read
   * line_size - byte distance between consecutive rows, used for both buffers
   * h         - row count; four rows per loop pass, loop exits only on an
   *             exact zero, so h must be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (esi), %2 = block (edi), %3 = line_size.
   */
  94. static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  95. {
  96. MOVQ_BONE(mm6);
  97. __asm __volatile(
  98. "lea (%3, %3), %%eax \n\t" /* eax = 2 * line_size */
  99. "1: \n\t"
  100. "movq (%1), %%mm0 \n\t" /* mm0/mm2: two rows; mm1/mm3: same rows, one byte right */
  101. "movq (%1, %3), %%mm2 \n\t"
  102. "movq 1(%1), %%mm1 \n\t"
  103. "movq 1(%1, %3), %%mm3 \n\t"
  104. "addl %%eax, %1 \n\t" /* source advanced early; data is already in registers */
  105. "psubusb %%mm6, %%mm0 \n\t" /* bias only the unshifted operand */
  106. "psubusb %%mm6, %%mm2 \n\t"
  107. PAVGB" %%mm1, %%mm0 \n\t"
  108. PAVGB" %%mm3, %%mm2 \n\t"
  109. "movq %%mm0, (%2) \n\t"
  110. "movq %%mm2, (%2, %3) \n\t"
  111. "movq (%1), %%mm0 \n\t" /* second row pair of this pass */
  112. "movq 1(%1), %%mm1 \n\t"
  113. "movq (%1, %3), %%mm2 \n\t"
  114. "movq 1(%1, %3), %%mm3 \n\t"
  115. "addl %%eax, %2 \n\t" /* destination moves on only after the first pair was stored */
  116. "addl %%eax, %1 \n\t"
  117. "psubusb %%mm6, %%mm0 \n\t"
  118. "psubusb %%mm6, %%mm2 \n\t"
  119. PAVGB" %%mm1, %%mm0 \n\t"
  120. PAVGB" %%mm3, %%mm2 \n\t"
  121. "movq %%mm0, (%2) \n\t"
  122. "movq %%mm2, (%2, %3) \n\t"
  123. "addl %%eax, %2 \n\t"
  124. "subl $4, %0 \n\t" /* four rows done */
  125. "jnz 1b \n\t"
  126. :"+g"(h), "+S"(pixels), "+D"(block)
  127. :"r" (line_size)
  128. :"%eax", "memory");
  129. }
  /*
   * put_pixels8_y2: vertical kernel. Destination row r receives the per-byte
   * PAVGB of source rows r and r+1 (8 bytes wide).
   *
   * The loop is software-pipelined: source row 0 is loaded before the loop
   * and the last row loaded in a pass stays in mm0 as the first operand of
   * the next pass. In total h + 1 source rows are read.
   *
   * block     - destination; 8 bytes are written per row
   * pixels    - source; 8 bytes are read from each of h + 1 rows
   * line_size - byte distance between consecutive rows, used for both buffers
   * h         - row count; four rows per loop pass, loop exits only on an
   *             exact zero, so h must be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (esi), %2 = block (edi), %3 = line_size.
   * PAVGB is defined outside this chunk; presumably the 3DNow!/MMX2 packed
   * unsigned-byte average - confirm at its definition.
   */
  130. static void DEF(put_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  131. {
  132. __asm __volatile(
  133. "lea (%3, %3), %%eax \n\t" /* eax = 2 * line_size */
  134. "movq (%1), %%mm0 \n\t" /* preload source row 0 */
  135. "subl %3, %2 \n\t" /* bias block by -line_size so stores can use (%2,%3) and (%2,eax) */
  136. "1: \n\t"
  137. "movq (%1, %3), %%mm1 \n\t" /* source row +1 */
  138. "movq (%1, %%eax), %%mm2 \n\t" /* source row +2 */
  139. "addl %%eax, %1 \n\t"
  140. PAVGB" %%mm1, %%mm0 \n\t" /* mm0 = rows +0 and +1 combined */
  141. PAVGB" %%mm2, %%mm1 \n\t" /* mm1 = rows +1 and +2 combined */
  142. "movq %%mm0, (%2, %3) \n\t"
  143. "movq %%mm1, (%2, %%eax) \n\t"
  144. "movq (%1, %3), %%mm1 \n\t" /* source row +3 */
  145. "movq (%1, %%eax), %%mm0 \n\t" /* source row +4, kept in mm0 for the next pass */
  146. "addl %%eax, %2 \n\t"
  147. "addl %%eax, %1 \n\t"
  148. PAVGB" %%mm1, %%mm2 \n\t" /* mm2 = rows +2 and +3 combined */
  149. PAVGB" %%mm0, %%mm1 \n\t" /* mm1 = rows +3 and +4 combined */
  150. "movq %%mm2, (%2, %3) \n\t"
  151. "movq %%mm1, (%2, %%eax) \n\t"
  152. "addl %%eax, %2 \n\t"
  153. "subl $4, %0 \n\t" /* four rows done */
  154. "jnz 1b \n\t"
  155. :"+g"(h), "+S"(pixels), "+D" (block)
  156. :"r" (line_size)
  157. :"%eax", "memory");
  158. }
  159. /* GL: this function does incorrect rounding if overflow */
  /*
   * put_no_rnd_pixels8_y2: vertical kernel like put_pixels8_y2 (destination
   * row r is built from source rows r and r+1), except that mm6 is subtracted
   * (unsigned, saturating) from every odd source row (+1, +3) before use.
   * Each odd row feeds the two PAVGB operations around it, so every output
   * row ends up with exactly one biased operand.
   *
   * MOVQ_BONE and PAVGB are defined outside this chunk. Presumably MOVQ_BONE
   * fills every byte of mm6 with 1 and PAVGB is a round-up byte average, so
   * the bias yields a round-down average; a byte of 0 cannot be decremented
   * (psubusb saturates), which is presumably the rounding error noted above
   * - confirm at the definitions.
   *
   * block     - destination; 8 bytes are written per row
   * pixels    - source; 8 bytes are read from each of h + 1 rows
   * line_size - byte distance between consecutive rows, used for both buffers
   * h         - row count; four rows per loop pass, loop exits only on an
   *             exact zero, so h must be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (esi), %2 = block (edi), %3 = line_size.
   */
  160. static void DEF(put_no_rnd_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  161. {
  162. MOVQ_BONE(mm6);
  163. __asm __volatile(
  164. "lea (%3, %3), %%eax \n\t" /* eax = 2 * line_size */
  165. "movq (%1), %%mm0 \n\t" /* preload source row 0 */
  166. "subl %3, %2 \n\t" /* bias block by -line_size so stores can use (%2,%3) and (%2,eax) */
  167. "1: \n\t"
  168. "movq (%1, %3), %%mm1 \n\t" /* source row +1 */
  169. "movq (%1, %%eax), %%mm2 \n\t" /* source row +2 */
  170. "addl %%eax, %1 \n\t"
  171. "psubusb %%mm6, %%mm1 \n\t" /* bias row +1; it is an operand of both averages below */
  172. PAVGB" %%mm1, %%mm0 \n\t"
  173. PAVGB" %%mm2, %%mm1 \n\t"
  174. "movq %%mm0, (%2, %3) \n\t"
  175. "movq %%mm1, (%2, %%eax) \n\t"
  176. "movq (%1, %3), %%mm1 \n\t" /* source row +3 */
  177. "movq (%1, %%eax), %%mm0 \n\t" /* source row +4, kept in mm0 for the next pass */
  178. "addl %%eax, %2 \n\t"
  179. "addl %%eax, %1 \n\t"
  180. "psubusb %%mm6, %%mm1 \n\t" /* bias row +3 */
  181. PAVGB" %%mm1, %%mm2 \n\t"
  182. PAVGB" %%mm0, %%mm1 \n\t"
  183. "movq %%mm2, (%2, %3) \n\t"
  184. "movq %%mm1, (%2, %%eax) \n\t"
  185. "addl %%eax, %2 \n\t"
  186. "subl $4, %0 \n\t" /* four rows done */
  187. "jnz 1b \n\t"
  188. :"+g"(h), "+S"(pixels), "+D" (block)
  189. :"r" (line_size)
  190. :"%eax", "memory");
  191. }
  /*
   * avg_pixels8: read-modify-write kernel. Each 8-byte destination row is
   * replaced by the per-byte PAVGB of its current contents and the matching
   * source row.
   *
   * block     - destination; 8 bytes per row are read and then overwritten
   * pixels    - source; 8 bytes are read per row
   * line_size - byte distance between consecutive rows, used for both buffers
   * h         - row count; four rows per loop pass, loop exits only on an
   *             exact zero, so h must be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (esi), %2 = block (edi), %3 = line_size.
   * PAVGB is defined outside this chunk; presumably the 3DNow!/MMX2 packed
   * unsigned-byte average - confirm at its definition.
   */
  192. static void DEF(avg_pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  193. {
  194. __asm __volatile(
  195. "lea (%3, %3), %%eax \n\t" /* eax = 2 * line_size */
  196. "1: \n\t"
  197. "movq (%2), %%mm0 \n\t" /* current destination rows */
  198. "movq (%2, %3), %%mm1 \n\t"
  199. PAVGB" (%1), %%mm0 \n\t" /* combine with the source rows */
  200. PAVGB" (%1, %3), %%mm1 \n\t"
  201. "movq %%mm0, (%2) \n\t"
  202. "movq %%mm1, (%2, %3) \n\t"
  203. "addl %%eax, %1 \n\t" /* advance both pointers by two rows */
  204. "addl %%eax, %2 \n\t"
  205. "movq (%2), %%mm0 \n\t" /* second row pair of this pass */
  206. "movq (%2, %3), %%mm1 \n\t"
  207. PAVGB" (%1), %%mm0 \n\t"
  208. PAVGB" (%1, %3), %%mm1 \n\t"
  209. "addl %%eax, %1 \n\t"
  210. "movq %%mm0, (%2) \n\t"
  211. "movq %%mm1, (%2, %3) \n\t"
  212. "addl %%eax, %2 \n\t"
  213. "subl $4, %0 \n\t" /* four rows done */
  214. "jnz 1b \n\t"
  215. :"+g"(h), "+S"(pixels), "+D"(block)
  216. :"r" (line_size)
  217. :"%eax", "memory");
  218. }
  /*
   * avg_pixels8_x2: horizontal kernel with read-modify-write of block. For
   * each row, the 8-byte source row is combined via PAVGB with the same row
   * read one byte further right, then combined via PAVGB with the current
   * destination row, and the result is stored back to block.
   *
   * block     - destination; 8 bytes per row are read and then overwritten
   * pixels    - source; bytes 0..8 of each row are read
   * line_size - byte distance between consecutive rows, used for both buffers
   * h         - row count; four rows per loop pass, loop exits only on an
   *             exact zero, so h must be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (esi), %2 = block (edi), %3 = line_size.
   * PAVGB is defined outside this chunk; presumably the 3DNow!/MMX2 packed
   * unsigned-byte average - confirm at its definition.
   */
  219. static void DEF(avg_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  220. {
  221. __asm __volatile(
  222. "lea (%3, %3), %%eax \n\t" /* eax = 2 * line_size */
  223. "1: \n\t"
  224. "movq (%1), %%mm0 \n\t" /* two source rows */
  225. "movq (%1, %3), %%mm2 \n\t"
  226. PAVGB" 1(%1), %%mm0 \n\t" /* horizontal step: same rows, one byte right */
  227. PAVGB" 1(%1, %3), %%mm2 \n\t"
  228. PAVGB" (%2), %%mm0 \n\t" /* then blend with what block already holds */
  229. PAVGB" (%2, %3), %%mm2 \n\t"
  230. "addl %%eax, %1 \n\t"
  231. "movq %%mm0, (%2) \n\t"
  232. "movq %%mm2, (%2, %3) \n\t"
  233. "movq (%1), %%mm0 \n\t" /* second row pair of this pass */
  234. "movq (%1, %3), %%mm2 \n\t"
  235. PAVGB" 1(%1), %%mm0 \n\t"
  236. PAVGB" 1(%1, %3), %%mm2 \n\t"
  237. "addl %%eax, %2 \n\t" /* block must point at the second pair before it is read below */
  238. "addl %%eax, %1 \n\t"
  239. PAVGB" (%2), %%mm0 \n\t"
  240. PAVGB" (%2, %3), %%mm2 \n\t"
  241. "movq %%mm0, (%2) \n\t"
  242. "movq %%mm2, (%2, %3) \n\t"
  243. "addl %%eax, %2 \n\t"
  244. "subl $4, %0 \n\t" /* four rows done */
  245. "jnz 1b \n\t"
  246. :"+g"(h), "+S"(pixels), "+D"(block)
  247. :"r" (line_size)
  248. :"%eax", "memory");
  249. }
  /*
   * avg_pixels8_y2: vertical kernel with read-modify-write of block. For
   * destination row r, source rows r and r+1 are combined via PAVGB, the
   * result is combined via PAVGB with the current destination row, and the
   * outcome is stored back (8 bytes wide).
   *
   * The loop is software-pipelined: source row 0 is loaded before the loop
   * and the last row loaded in a pass stays in mm0 for the next pass. In
   * total h + 1 source rows are read. mm3/mm4 hold the destination rows.
   *
   * block     - destination; 8 bytes per row are read and then overwritten
   * pixels    - source; 8 bytes are read from each of h + 1 rows
   * line_size - byte distance between consecutive rows, used for both buffers
   * h         - row count; four rows per loop pass, loop exits only on an
   *             exact zero, so h must be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (esi), %2 = block (edi), %3 = line_size.
   * PAVGB is defined outside this chunk; presumably the 3DNow!/MMX2 packed
   * unsigned-byte average - confirm at its definition.
   */
  250. static void DEF(avg_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  251. {
  252. __asm __volatile(
  253. "lea (%3, %3), %%eax \n\t" /* eax = 2 * line_size */
  254. "movq (%1), %%mm0 \n\t" /* preload source row 0 */
  255. "subl %3, %2 \n\t" /* bias block by -line_size so rows are addressed as (%2,%3) and (%2,eax) */
  256. "1: \n\t"
  257. "movq (%1, %3), %%mm1 \n\t" /* source row +1 */
  258. "movq (%1, %%eax), %%mm2 \n\t" /* source row +2 */
  259. "addl %%eax, %1 \n\t"
  260. PAVGB" %%mm1, %%mm0 \n\t" /* rows +0 and +1 combined */
  261. PAVGB" %%mm2, %%mm1 \n\t" /* rows +1 and +2 combined */
  262. "movq (%2, %3), %%mm3 \n\t" /* current destination rows */
  263. "movq (%2, %%eax), %%mm4 \n\t"
  264. PAVGB" %%mm3, %%mm0 \n\t"
  265. PAVGB" %%mm4, %%mm1 \n\t"
  266. "movq %%mm0, (%2, %3) \n\t"
  267. "movq %%mm1, (%2, %%eax) \n\t"
  268. "movq (%1, %3), %%mm1 \n\t" /* source row +3 */
  269. "movq (%1, %%eax), %%mm0 \n\t" /* source row +4, kept in mm0 for the next pass */
  270. PAVGB" %%mm1, %%mm2 \n\t" /* rows +2 and +3 combined */
  271. PAVGB" %%mm0, %%mm1 \n\t" /* rows +3 and +4 combined */
  272. "addl %%eax, %2 \n\t"
  273. "addl %%eax, %1 \n\t"
  274. "movq (%2, %3), %%mm3 \n\t"
  275. "movq (%2, %%eax), %%mm4 \n\t"
  276. PAVGB" %%mm3, %%mm2 \n\t"
  277. PAVGB" %%mm4, %%mm1 \n\t"
  278. "movq %%mm2, (%2, %3) \n\t"
  279. "movq %%mm1, (%2, %%eax) \n\t"
  280. "addl %%eax, %2 \n\t"
  281. "subl $4, %0 \n\t" /* four rows done */
  282. "jnz 1b \n\t"
  283. :"+g"(h), "+S"(pixels), "+D"(block)
  284. :"r" (line_size)
  285. :"%eax", "memory");
  286. }
  287. // Note: this is not correctly rounded, but this function is only used for B-frames, so it doesn't matter
  /*
   * avg_pixels8_xy2: two-dimensional kernel with read-modify-write of block.
   * Every source row is first combined via PAVGB with itself read one byte
   * further right; destination row r is then built from the horizontally
   * combined rows r and r+1 via PAVGB, combined via PAVGB with the current
   * destination row, and stored back (8 bytes wide).
   *
   * Only the row loaded at (%1, eax) in the first half of each pass has mm6
   * subtracted (unsigned, saturating) before use; the second half applies no
   * such bias. MOVQ_BONE and PAVGB are defined outside this chunk; presumably
   * MOVQ_BONE fills every byte of mm6 with 1 and PAVGB is a round-up byte
   * average - confirm at the definitions.
   *
   * The loop is software-pipelined: row 0 is combined before the loop and
   * the last combined row of a pass stays in mm0 for the next pass. In total
   * h + 1 source rows are read, bytes 0..8 of each.
   *
   * block     - destination; 8 bytes per row are read and then overwritten
   * pixels    - source; bytes 0..8 of each of h + 1 rows are read
   * line_size - byte distance between consecutive rows, used for both buffers
   * h         - row count; four rows per loop pass, loop exits only on an
   *             exact zero, so h must be a positive multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels (esi), %2 = block (edi), %3 = line_size.
   */
  288. static void DEF(avg_pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  289. {
  290. MOVQ_BONE(mm6);
  291. __asm __volatile(
  292. "lea (%3, %3), %%eax \n\t" /* eax = 2 * line_size */
  293. "movq (%1), %%mm0 \n\t" /* preload and horizontally combine row 0 */
  294. PAVGB" 1(%1), %%mm0 \n\t"
  295. ".balign 8 \n\t" /* align the loop entry */
  296. "1: \n\t"
  297. "movq (%1, %%eax), %%mm2 \n\t" /* source row +2 */
  298. "movq (%1, %3), %%mm1 \n\t" /* source row +1 */
  299. "psubusb %%mm6, %%mm2 \n\t" /* bias row +2 only */
  300. PAVGB" 1(%1, %3), %%mm1 \n\t" /* horizontal step for rows +1 and +2 */
  301. PAVGB" 1(%1, %%eax), %%mm2 \n\t"
  302. "addl %%eax, %1 \n\t"
  303. PAVGB" %%mm1, %%mm0 \n\t" /* vertical step: rows +0/+1 and +1/+2 */
  304. PAVGB" %%mm2, %%mm1 \n\t"
  305. PAVGB" (%2), %%mm0 \n\t" /* blend with what block already holds */
  306. PAVGB" (%2, %3), %%mm1 \n\t"
  307. "movq %%mm0, (%2) \n\t"
  308. "movq %%mm1, (%2, %3) \n\t"
  309. "movq (%1, %3), %%mm1 \n\t" /* source row +3 */
  310. "movq (%1, %%eax), %%mm0 \n\t" /* source row +4, kept in mm0 for the next pass */
  311. PAVGB" 1(%1, %3), %%mm1 \n\t"
  312. PAVGB" 1(%1, %%eax), %%mm0 \n\t"
  313. "addl %%eax, %2 \n\t"
  314. "addl %%eax, %1 \n\t"
  315. PAVGB" %%mm1, %%mm2 \n\t" /* vertical step: rows +2/+3 and +3/+4 */
  316. PAVGB" %%mm0, %%mm1 \n\t"
  317. PAVGB" (%2), %%mm2 \n\t"
  318. PAVGB" (%2, %3), %%mm1 \n\t"
  319. "movq %%mm2, (%2) \n\t"
  320. "movq %%mm1, (%2, %3) \n\t"
  321. "addl %%eax, %2 \n\t"
  322. "subl $4, %0 \n\t" /* four rows done */
  323. "jnz 1b \n\t"
  324. :"+g"(h), "+S"(pixels), "+D"(block)
  325. :"r" (line_size)
  326. :"%eax", "memory");
  327. }
  328. //FIXME the following could be optimized too ...
  329. static void DEF(put_no_rnd_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  330. DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
  331. DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
  332. }
  333. static void DEF(put_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  334. DEF(put_pixels8_y2)(block , pixels , line_size, h);
  335. DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
  336. }
  337. static void DEF(put_no_rnd_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  338. DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
  339. DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
  340. }
  341. static void DEF(avg_pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  342. DEF(avg_pixels8)(block , pixels , line_size, h);
  343. DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
  344. }
  345. static void DEF(avg_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  346. DEF(avg_pixels8_x2)(block , pixels , line_size, h);
  347. DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
  348. }
  349. static void DEF(avg_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  350. DEF(avg_pixels8_y2)(block , pixels , line_size, h);
  351. DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
  352. }
  353. static void DEF(avg_pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  354. DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
  355. DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
  356. }