You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

352 lines
11KB

  1. /*
  2. * DSP utils : average functions are compiled twice for 3dnow/mmx2
  3. * Copyright (c) 2000, 2001 Gerard Lantau.
  4. * Copyright (c) 2002 Michael Niedermayer
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program; if not, write to the Free Software
  18. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19. *
  20. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  21. * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  22. */
  23. /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
  24. clobber bug - now it will work with 2.95.2 and also with -fPIC
  25. */
/*
 * put_pixels_x2: horizontal half-pel copy.
 * For every row, the 8 source bytes at pixels[x] are averaged (PAVGB) with
 * the 8 bytes at pixels[x + 1] and the 8 results are stored to block.
 *
 * block     - destination; 8 bytes are written per row
 * pixels    - source; 9 bytes (offsets 0..8) are read per row
 * line_size - byte distance between rows, applied to both block and pixels
 * h         - row count; 4 rows are handled per loop iteration and the loop
 *             only exits when h reaches exactly 0, so h is expected to be a
 *             positive multiple of 4
 *
 * PAVGB is a macro defined outside this chunk; per the file header it
 * presumably selects the 3DNow!/MMX2 byte-average instruction -- confirm
 * against the including file.
 * NOTE(review): no emms/femms is issued here; presumably the caller is
 * responsible for leaving MMX state -- confirm.
 */
static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    /*
     * Operands:
     *   %0 = h (read/write row counter)
     *   %1 = pixels               (edi)  source row n
     *   %2 = pixels + line_size   (esi)  source row n + 1
     *   %3 = block                       destination row n
     *   %4 = block + line_size           destination row n + 1
     *   %5 = line_size * 2               step for a pair of rows
     *   eax = running byte offset added to all four base pointers
     */
    __asm __volatile(
        "xorl %%eax, %%eax \n\t"            /* offset = 0 */
        ".balign 16 \n\t"
        "1: \n\t"
        /* rows n and n + 1 */
        "movq (%1, %%eax), %%mm0 \n\t"
        "movq 1(%1, %%eax), %%mm1 \n\t"     /* same row, shifted by one byte */
        "movq (%2, %%eax), %%mm2 \n\t"
        "movq 1(%2, %%eax), %%mm3 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"           /* mm0 = avg(p[x], p[x + 1]) */
        PAVGB" %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%eax) \n\t"
        "movq %%mm2, (%4, %%eax) \n\t"
        "addl %5, %%eax \n\t"               /* advance two rows */
        /* rows n + 2 and n + 3 (loop body is unrolled twice) */
        "movq (%1, %%eax), %%mm0 \n\t"
        "movq 1(%1, %%eax), %%mm1 \n\t"
        "movq (%2, %%eax), %%mm2 \n\t"
        "movq 1(%2, %%eax), %%mm3 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%eax) \n\t"
        "movq %%mm2, (%4, %%eax) \n\t"
        "addl %5, %%eax \n\t"
        "subl $4, %0 \n\t"                  /* four rows done */
        " jnz 1b \n\t"
        :"+g"(h)
        :"D"(pixels), "S"(pixels+line_size), "r" (block), "r" (block+line_size),
         "g"(line_size<<1)
        :"%eax", "memory");
}
  57. /* GL: this function does incorrect rounding if overflow */
  58. static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  59. {
  60. __asm __volatile(
  61. "xorl %%eax, %%eax \n\t"
  62. MOVQ_BONE(%%mm7)
  63. ".balign 16 \n\t"
  64. "1: \n\t"
  65. "movq (%1, %%eax), %%mm0 \n\t"
  66. "movq 1(%1, %%eax), %%mm1 \n\t"
  67. "movq (%2, %%eax), %%mm2 \n\t"
  68. "movq 1(%2, %%eax), %%mm3 \n\t"
  69. "psubusb %%mm7, %%mm0 \n\t"
  70. "psubusb %%mm7, %%mm2 \n\t"
  71. PAVGB" %%mm1, %%mm0 \n\t"
  72. PAVGB" %%mm3, %%mm2 \n\t"
  73. "movq %%mm0, (%3, %%eax) \n\t"
  74. "movq %%mm2, (%4, %%eax) \n\t"
  75. "addl %5, %%eax \n\t"
  76. "movq (%1, %%eax), %%mm0 \n\t"
  77. "movq 1(%1, %%eax), %%mm1 \n\t"
  78. "movq (%2, %%eax), %%mm2 \n\t"
  79. "movq 1(%2, %%eax), %%mm3 \n\t"
  80. "psubusb %%mm7, %%mm0 \n\t"
  81. "psubusb %%mm7, %%mm2 \n\t"
  82. PAVGB" %%mm1, %%mm0 \n\t"
  83. PAVGB" %%mm3, %%mm2 \n\t"
  84. "movq %%mm0, (%3, %%eax) \n\t"
  85. "movq %%mm2, (%4, %%eax) \n\t"
  86. "addl %5, %%eax \n\t"
  87. "subl $4, %0 \n\t"
  88. " jnz 1b \n\t"
  89. :"+g"(h)
  90. :"D"(pixels), "S"(pixels+line_size), "r" (block), "r" (block+line_size),
  91. "r"(line_size<<1)
  92. :"%eax", "memory");
  93. }
/*
 * put_pixels_y2: vertical half-pel copy.
 * Each destination row is the PAVGB byte average of source row n and source
 * row n + 1 (8 bytes wide).
 *
 * block     - destination; 8 bytes are written per row
 * pixels    - source; rows 0..h are read (one more row than is written)
 * line_size - byte distance between rows, applied to both block and pixels
 * h         - row count; 4 rows are handled per loop iteration and the loop
 *             only exits when h reaches exactly 0, so h is expected to be a
 *             positive multiple of 4
 *
 * PAVGB is a macro defined outside this chunk (presumably the byte-average
 * instruction for the current build -- confirm).
 * Two implementations are kept; the "#if 1" selects the first one.
 */
static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
#if 1
    // Michael - measure me
    /*
     * Operands:
     *   %0 = h, %1 = pixels (edi, advanced in place),
     *   %2 = block (esi, advanced in place), %3 = line_size (ecx)
     *   eax = line_size * 2
     * The previously loaded source row is carried across steps in a
     * register, so each source row is loaded only once.
     */
    __asm __volatile(
        "lea (%3, %3), %%eax \n\t"          /* eax = 2 * line_size */
        "movq (%1), %%mm0 \n\t"             /* mm0 = source row 0 */
        "subl %3, %2 \n\t"                  /* bias block so (%2,%3) is the current row */
        ".balign 16 \n\t"
        "1: \n\t"
        "movq (%1, %3), %%mm1 \n\t"         /* source row n + 1 */
        "movq (%1, %%eax), %%mm2 \n\t"      /* source row n + 2 */
        PAVGB" %%mm1, %%mm0 \n\t"           /* avg(row n, row n + 1) */
        PAVGB" %%mm2, %%mm1 \n\t"           /* avg(row n + 1, row n + 2) */
        "addl %%eax, %1 \n\t"               /* pixels += 2 rows */
        "movq %%mm0, (%2, %3) \n\t"
        "movq %%mm1, (%2, %%eax) \n\t"
        "movq (%1, %3), %%mm1 \n\t"         /* source row n + 3 */
        "movq (%1, %%eax), %%mm0 \n\t"      /* source row n + 4, reused next iteration */
        PAVGB" %%mm1, %%mm2 \n\t"           /* avg(row n + 2, row n + 3) */
        PAVGB" %%mm0, %%mm1 \n\t"           /* avg(row n + 3, row n + 4) */
        "addl %%eax, %2 \n\t"               /* block += 2 rows */
        "addl %%eax, %1 \n\t"               /* pixels += 2 rows */
        "movq %%mm2, (%2, %3) \n\t"
        "movq %%mm1, (%2, %%eax) \n\t"
        "addl %%eax, %2 \n\t"               /* block += 2 rows */
        "subl $4, %0 \n\t"                  /* four rows done */
        "jnz 1b \n\t"
        :"+g"(h), "+D"(pixels), "+S" (block)
        :"c"(line_size)
        :"%eax", "memory");
#else
    // kabi measure me
    /*
     * Alternative implementation, compiled out by the "#if 1" above.
     * Operands: %0 = h, %1 = line_size (doubled in place), %2 = pixels
     * (advanced by one row in place), %3 = block.
     * edi/esi hold the second source/destination row bases and eax is the
     * running byte offset.
     */
    __asm __volatile(
        "movq (%2), %%mm0 \n\t"             /* mm0 = source row 0 */
        "addl %1, %2 \n\t"                  /* pixels -> source row 1 */
        "xorl %%eax, %%eax \n\t"            /* offset = 0 */
        "leal (%1, %2), %%edi \n\t"         /* edi -> source row 2 */
        "leal (%1, %3), %%esi \n\t"         /* esi -> destination row 1 */
        "addl %1, %1 \n\t"                  /* line_size *= 2 */
        ".balign 16 \n\t"
        "1: \n\t"
        "movq (%2 , %%eax), %%mm1 \n\t"
        "movq (%%edi, %%eax), %%mm2 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm2, %%mm1 \n\t"
        "movq %%mm0, (%3 , %%eax) \n\t"
        "movq %%mm1, (%%esi, %%eax) \n\t"
        "addl %1, %%eax \n\t"               /* advance two rows */
        "movq (%2 , %%eax), %%mm1 \n\t"
        "movq (%%edi, %%eax), %%mm0 \n\t"
        PAVGB" %%mm1, %%mm2 \n\t"
        PAVGB" %%mm0, %%mm1 \n\t"
        "movq %%mm2, (%3 , %%eax) \n\t"
        "movq %%mm1, (%%esi, %%eax) \n\t"
        "addl %1, %%eax \n\t"
        "subl $4, %0 \n\t"                  /* four rows done */
        " jnz 1b \n\t"
        :"+g"(h), "+r"(line_size), "+r"(pixels)
        : "r" (block)
        : "%eax", "%esi", "%edi", "memory");
#endif
}
/* GL: this function does incorrect rounding if overflow */
/*
 * put_no_rnd_pixels_y2: vertical half-pel copy, "no rounding" variant.
 * Each destination row is the PAVGB byte average of two vertically adjacent
 * source rows; mm7 is subtracted (unsigned, saturating) from the rows loaded
 * through %2 before they take part in an average.
 *
 * block     - destination; 8 bytes are written per row
 * pixels    - source; rows 0..h are read (one more row than is written)
 * line_size - byte distance between rows, applied to both block and pixels
 * h         - row count; 4 rows are handled per loop iteration and the loop
 *             only exits when h reaches exactly 0, so h is expected to be a
 *             positive multiple of 4
 *
 * MOVQ_BONE and PAVGB are macros defined outside this chunk.
 * NOTE(review): MOVQ_BONE presumably loads 0x01 into every byte of the given
 * register -- confirm.
 */
static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    /*
     * Operands:
     *   %0 = h (read/write row counter)
     *   %1 = pixels                   (edi)  source row 0, read once
     *   %2 = pixels + line_size       (esi)  odd source rows
     *   %3 = pixels + line_size * 2          even source rows
     *   %4 = block                           even destination rows
     *   %5 = block + line_size               odd destination rows
     *   %6 = line_size * 2                   step for a pair of rows
     *   eax = running byte offset
     */
    __asm __volatile(
        MOVQ_BONE(%%mm7)
        "xorl %%eax, %%eax \n\t"            /* offset = 0 */
        "movq (%1), %%mm0 \n\t"             /* mm0 = source row 0 */
        ".balign 16 \n\t"
        "1: \n\t"
        "movq (%2, %%eax), %%mm1 \n\t"      /* source row n + 1 */
        "movq (%3, %%eax), %%mm2 \n\t"      /* source row n + 2 */
        "psubusb %%mm7, %%mm1 \n\t"         /* bias only the middle row */
        PAVGB" %%mm1, %%mm0 \n\t"           /* avg(row n, row n + 1) */
        PAVGB" %%mm2, %%mm1 \n\t"           /* avg(row n + 1, row n + 2) */
        "movq %%mm0, (%4, %%eax) \n\t"
        "movq %%mm1, (%5, %%eax) \n\t"
        "addl %6, %%eax \n\t"               /* advance two rows */
        "movq (%2, %%eax), %%mm1 \n\t"      /* source row n + 3 */
        "movq (%3, %%eax), %%mm0 \n\t"      /* source row n + 4, reused next iteration */
        "psubusb %%mm7, %%mm1 \n\t"
        PAVGB" %%mm1, %%mm2 \n\t"           /* avg(row n + 2, row n + 3) */
        PAVGB" %%mm0, %%mm1 \n\t"           /* avg(row n + 3, row n + 4) */
        "movq %%mm2, (%4, %%eax) \n\t"
        "movq %%mm1, (%5, %%eax) \n\t"
        "addl %6, %%eax \n\t"
        "subl $4, %0 \n\t"                  /* four rows done */
        " jnz 1b \n\t"
        :"+g"(h)
        :"D"(pixels), "S"(pixels+line_size), "r"(pixels+line_size*2), "r" (block),
         "r" (block+line_size), "g"(line_size<<1)
        :"%eax", "memory");
}
/*
 * avg_pixels: averages the source into the destination.
 * For every row, the 8 source bytes are combined with the 8 bytes already in
 * block using PAVGB and the result is written back to block.
 *
 * block     - destination; 8 bytes per row are read and then overwritten
 * pixels    - source; 8 bytes are read per row
 * line_size - byte distance between rows, applied to both block and pixels
 * h         - row count; 4 rows are handled per loop iteration and the loop
 *             only exits when h reaches exactly 0, so h is expected to be a
 *             positive multiple of 4
 *
 * PAVGB is a macro defined outside this chunk (presumably the byte-average
 * instruction for the current build -- confirm).
 */
static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    /*
     * Operands:
     *   %0 = h (read/write row counter)
     *   %1 = pixels               (edi)  source row n
     *   %2 = pixels + line_size   (esi)  source row n + 1
     *   %3 = block                       destination row n
     *   %4 = block + line_size           destination row n + 1
     *   %5 = line_size * 2               step for a pair of rows
     *   eax = running byte offset
     */
    __asm __volatile(
        "xorl %%eax, %%eax \n\t"            /* offset = 0 */
        ".balign 16 \n\t"
        "1: \n\t"
        /* rows n and n + 1 */
        "movq (%1, %%eax), %%mm0 \n\t"      /* source rows */
        "movq (%2, %%eax), %%mm2 \n\t"
        "movq (%3, %%eax), %%mm3 \n\t"      /* current destination rows */
        "movq (%4, %%eax), %%mm4 \n\t"
        PAVGB" %%mm3, %%mm0 \n\t"           /* avg(source, destination) */
        PAVGB" %%mm4, %%mm2 \n\t"
        "movq %%mm0, (%3, %%eax) \n\t"
        "movq %%mm2, (%4, %%eax) \n\t"
        "addl %5, %%eax \n\t"               /* advance two rows */
        /* rows n + 2 and n + 3 (loop body is unrolled twice) */
        "movq (%1, %%eax), %%mm0 \n\t"
        "movq (%2, %%eax), %%mm2 \n\t"
        "movq (%3, %%eax), %%mm3 \n\t"
        "movq (%4, %%eax), %%mm4 \n\t"
        PAVGB" %%mm3, %%mm0 \n\t"
        PAVGB" %%mm4, %%mm2 \n\t"
        "movq %%mm0, (%3, %%eax) \n\t"
        "movq %%mm2, (%4, %%eax) \n\t"
        "addl %5, %%eax \n\t"
        "subl $4, %0 \n\t"                  /* four rows done */
        " jnz 1b \n\t"
        :"+g"(h)
        :"D"(pixels), "S"(pixels+line_size), "r" (block), "r" (block+line_size),
         "g"(line_size<<1)
        :"%eax", "memory");
}
/*
 * avg_pixels_x2: horizontal half-pel interpolation averaged into block.
 * For every row, pixels[x] and pixels[x + 1] are averaged with PAVGB, that
 * result is averaged with the 8 bytes already in block, and the outcome is
 * written back to block.
 *
 * block     - destination; 8 bytes per row are read and then overwritten
 * pixels    - source; 9 bytes (offsets 0..8) are read per row
 * line_size - byte distance between rows, applied to both block and pixels
 * h         - row count; 4 rows are handled per loop iteration and the loop
 *             only exits when h reaches exactly 0, so h is expected to be a
 *             positive multiple of 4
 *
 * PAVGB is a macro defined outside this chunk (presumably the byte-average
 * instruction for the current build -- confirm).
 */
static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    /*
     * Operands:
     *   %0 = h (read/write row counter)
     *   %1 = pixels               (edi)  source row n
     *   %2 = pixels + line_size   (esi)  source row n + 1
     *   %3 = block                       destination row n
     *   %4 = block + line_size           destination row n + 1
     *   %5 = line_size * 2               step for a pair of rows
     *   eax = running byte offset
     */
    __asm __volatile(
        "xorl %%eax, %%eax \n\t"            /* offset = 0 */
        ".balign 16 \n\t"
        "1: \n\t"
        /* rows n and n + 1 */
        "movq (%1, %%eax), %%mm0 \n\t"
        "movq 1(%1, %%eax), %%mm1 \n\t"     /* same row, shifted by one byte */
        "movq (%2, %%eax), %%mm2 \n\t"
        "movq 1(%2, %%eax), %%mm3 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"           /* horizontal average */
        PAVGB" %%mm3, %%mm2 \n\t"
        "movq (%3, %%eax), %%mm3 \n\t"      /* current destination rows */
        "movq (%4, %%eax), %%mm4 \n\t"
        PAVGB" %%mm3, %%mm0 \n\t"           /* average with destination */
        PAVGB" %%mm4, %%mm2 \n\t"
        "movq %%mm0, (%3, %%eax) \n\t"
        "movq %%mm2, (%4, %%eax) \n\t"
        "addl %5, %%eax \n\t"               /* advance two rows */
        /* rows n + 2 and n + 3 (loop body is unrolled twice) */
        "movq (%1, %%eax), %%mm0 \n\t"
        "movq 1(%1, %%eax), %%mm1 \n\t"
        "movq (%2, %%eax), %%mm2 \n\t"
        "movq 1(%2, %%eax), %%mm3 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "movq (%3, %%eax), %%mm3 \n\t"
        "movq (%4, %%eax), %%mm4 \n\t"
        PAVGB" %%mm3, %%mm0 \n\t"
        PAVGB" %%mm4, %%mm2 \n\t"
        "movq %%mm0, (%3, %%eax) \n\t"
        "movq %%mm2, (%4, %%eax) \n\t"
        "addl %5, %%eax \n\t"
        "subl $4, %0 \n\t"                  /* four rows done */
        " jnz 1b \n\t"
        :"+g"(h)
        :"D"(pixels), "S"(pixels+line_size), "r" (block), "r" (block+line_size),
         "g"(line_size<<1)
        :"%eax", "memory");
}
/*
 * avg_pixels_y2: vertical half-pel interpolation averaged into block.
 * For every row, source rows n and n + 1 are averaged with PAVGB, that
 * result is averaged with the 8 bytes already in block, and the outcome is
 * written back to block.
 *
 * block     - destination; 8 bytes per row are read and then overwritten
 * pixels    - source; rows 0..h are read (one more row than is written)
 * line_size - byte distance between rows, applied to both block and pixels
 * h         - row count; 4 rows are handled per loop iteration and the loop
 *             only exits when h reaches exactly 0, so h is expected to be a
 *             positive multiple of 4
 *
 * PAVGB is a macro defined outside this chunk (presumably the byte-average
 * instruction for the current build -- confirm).
 */
static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    /*
     * Operands:
     *   %0 = h (read/write row counter)
     *   %1 = pixels                   (edi)  source row 0, read once
     *   %2 = pixels + line_size       (esi)  odd source rows
     *   %3 = pixels + line_size * 2          even source rows
     *   %4 = block                           even destination rows
     *   %5 = block + line_size               odd destination rows
     *   %6 = line_size * 2                   step for a pair of rows
     *   eax = running byte offset
     */
    __asm __volatile(
        "xorl %%eax, %%eax \n\t"            /* offset = 0 */
        "movq (%1), %%mm0 \n\t"             /* mm0 = source row 0 */
        ".balign 16 \n\t"
        "1: \n\t"
        "movq (%2, %%eax), %%mm1 \n\t"      /* source row n + 1 */
        "movq (%3, %%eax), %%mm2 \n\t"      /* source row n + 2 */
        PAVGB" %%mm1, %%mm0 \n\t"           /* avg(row n, row n + 1) */
        PAVGB" %%mm2, %%mm1 \n\t"           /* avg(row n + 1, row n + 2) */
        "movq (%4, %%eax), %%mm3 \n\t"      /* current destination rows */
        "movq (%5, %%eax), %%mm4 \n\t"
        PAVGB" %%mm3, %%mm0 \n\t"           /* average with destination */
        PAVGB" %%mm4, %%mm1 \n\t"
        "movq %%mm0, (%4, %%eax) \n\t"
        "movq %%mm1, (%5, %%eax) \n\t"
        "addl %6, %%eax \n\t"               /* advance two rows */
        "movq (%2, %%eax), %%mm1 \n\t"      /* source row n + 3 */
        "movq (%3, %%eax), %%mm0 \n\t"      /* source row n + 4, reused next iteration */
        PAVGB" %%mm1, %%mm2 \n\t"           /* avg(row n + 2, row n + 3) */
        PAVGB" %%mm0, %%mm1 \n\t"           /* avg(row n + 3, row n + 4) */
        "movq (%4, %%eax), %%mm3 \n\t"
        "movq (%5, %%eax), %%mm4 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        PAVGB" %%mm4, %%mm1 \n\t"
        "movq %%mm2, (%4, %%eax) \n\t"
        "movq %%mm1, (%5, %%eax) \n\t"
        "addl %6, %%eax \n\t"
        "subl $4, %0 \n\t"                  /* four rows done */
        " jnz 1b \n\t"
        :"+g"(h)
        :"D"(pixels), "S"(pixels+line_size), "r"(pixels+line_size*2), "r" (block),
         "r" (block+line_size), "g"(line_size<<1)
        :"%eax", "memory");
}
// Note: this is not correctly rounded, but this function is only used for B-frames, so it doesn't matter.
/*
 * avg_pixels_xy2: horizontal + vertical half-pel interpolation averaged into
 * block. Each source row is first averaged with itself shifted by one byte
 * (PAVGB), two vertically adjacent horizontally averaged rows are then
 * averaged together, and that result is averaged with the 8 bytes already in
 * block before being written back.
 *
 * block     - destination; 8 bytes per row are read and then overwritten
 * pixels    - source; 9 bytes (offsets 0..8) of rows 0..h are read
 * line_size - byte distance between rows, applied to both block and pixels
 * h         - row count; 4 rows are handled per loop iteration and the loop
 *             only exits when h reaches exactly 0, so h is expected to be a
 *             positive multiple of 4
 *
 * MOVQ_BONE and PAVGB are macros defined outside this chunk.
 * NOTE(review): MOVQ_BONE presumably loads 0x01 into every byte of the given
 * register -- confirm. The psubusb bias is applied only in the first half of
 * the unrolled loop body (to the row loaded through %3), not in the second
 * half.
 */
static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    /*
     * Operands:
     *   %0 = h (read/write row counter)
     *   %1 = pixels                   (edi)  source row 0, read once
     *   %2 = pixels + line_size       (esi)  odd source rows
     *   %3 = pixels + line_size * 2          even source rows
     *   %4 = block                           even destination rows
     *   %5 = block + line_size               odd destination rows
     *   %6 = line_size * 2                   step for a pair of rows
     *   eax = running byte offset
     */
    __asm __volatile(
        MOVQ_BONE(%%mm7)
        "xorl %%eax, %%eax \n\t"            /* offset = 0 */
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"           /* mm0 = horizontal average of row 0 */
        ".balign 16 \n\t"
        "1: \n\t"
        "movq (%2, %%eax), %%mm1 \n\t"      /* source row n + 1 */
        "movq (%3, %%eax), %%mm2 \n\t"      /* source row n + 2 */
        "movq 1(%2, %%eax), %%mm3 \n\t"     /* same rows, shifted by one byte */
        "movq 1(%3, %%eax), %%mm4 \n\t"
        "psubusb %%mm7, %%mm2 \n\t"         /* bias row n + 2 before averaging */
        PAVGB" %%mm3, %%mm1 \n\t"           /* horizontal average, row n + 1 */
        PAVGB" %%mm4, %%mm2 \n\t"           /* horizontal average, row n + 2 */
        PAVGB" %%mm1, %%mm0 \n\t"           /* vertical: rows n and n + 1 */
        PAVGB" %%mm2, %%mm1 \n\t"           /* vertical: rows n + 1 and n + 2 */
        "movq (%4, %%eax), %%mm3 \n\t"      /* current destination rows */
        "movq (%5, %%eax), %%mm4 \n\t"
        PAVGB" %%mm3, %%mm0 \n\t"           /* average with destination */
        PAVGB" %%mm4, %%mm1 \n\t"
        "movq %%mm0, (%4, %%eax) \n\t"
        "movq %%mm1, (%5, %%eax) \n\t"
        "addl %6, %%eax \n\t"               /* advance two rows */
        "movq (%2, %%eax), %%mm1 \n\t"      /* source row n + 3 */
        "movq (%3, %%eax), %%mm0 \n\t"      /* source row n + 4, reused next iteration */
        "movq 1(%2, %%eax), %%mm3 \n\t"
        "movq 1(%3, %%eax), %%mm4 \n\t"
        PAVGB" %%mm3, %%mm1 \n\t"           /* horizontal average, row n + 3 */
        PAVGB" %%mm4, %%mm0 \n\t"           /* horizontal average, row n + 4 */
        PAVGB" %%mm1, %%mm2 \n\t"           /* vertical: rows n + 2 and n + 3 */
        PAVGB" %%mm0, %%mm1 \n\t"           /* vertical: rows n + 3 and n + 4 */
        "movq (%4, %%eax), %%mm3 \n\t"
        "movq (%5, %%eax), %%mm4 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        PAVGB" %%mm4, %%mm1 \n\t"
        "movq %%mm2, (%4, %%eax) \n\t"
        "movq %%mm1, (%5, %%eax) \n\t"
        "addl %6, %%eax \n\t"
        "subl $4, %0 \n\t"                  /* four rows done */
        " jnz 1b \n\t"
        :"+g"(h)
        :"D"(pixels), "S"(pixels+line_size), "r"(pixels+line_size*2), "r" (block),
         "r" (block+line_size), "g"(line_size<<1)
        :"%eax", "memory");
}