You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

428 lines
12KB

  1. /*
  2. * DSP utils : average functions are compiled twice for 3dnow/mmx2
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. * Copyright (c) 2002 Michael Niedermayer
  5. *
  6. * This library is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2 of the License, or (at your option) any later version.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  19. *
  20. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  21. * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  22. * and improved by Zdenek Kabelac <kabi@users.sf.net>
  23. */
  24. /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
  25. clobber bug - now it will work with 2.95.2 and also with -fPIC
  26. */
  /*
   * put_pixels8_x2: for each row, stores to `block` the PAVGB of the 8 source
   * bytes at `pixels` with the 8 bytes that start one byte further right.
   * PAVGB and DEF are macros defined outside this view; PAVGB is presumably
   * the packed-byte average instruction of the CPU flavour being built
   * (TODO confirm against the including file).
   *
   * block     - destination; 8 bytes are written per row
   * pixels    - source; bytes 0..8 of each row are read
   * line_size - byte distance between rows, used for both source and destination
   * h         - row count; 4 rows are handled per loop pass and the loop exits
   *             only when h reaches exactly 0, so h must be a positive
   *             multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * NOTE(review): the pointers are advanced with 32-bit "addl", and the MMX
   * registers written here are not listed as clobbers.
   */
  27. static void DEF(put_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  28. {
  29. __asm __volatile(
  /* eax = 2 * line_size: source and destination advance two rows at a time */
  30. "lea (%3, %3), %%eax \n\t"
  31. "1: \n\t"
  /* rows 0 and 1 of this pass */
  32. "movq (%1), %%mm0 \n\t"
  33. "movq (%1, %3), %%mm1 \n\t"
  34. PAVGB" 1(%1), %%mm0 \n\t"
  35. PAVGB" 1(%1, %3), %%mm1 \n\t"
  36. "movq %%mm0, (%2) \n\t"
  37. "movq %%mm1, (%2, %3) \n\t"
  38. "addl %%eax, %1 \n\t"
  39. "addl %%eax, %2 \n\t"
  /* rows 2 and 3 of this pass */
  40. "movq (%1), %%mm0 \n\t"
  41. "movq (%1, %3), %%mm1 \n\t"
  42. PAVGB" 1(%1), %%mm0 \n\t"
  43. PAVGB" 1(%1, %3), %%mm1 \n\t"
  44. "addl %%eax, %1 \n\t"
  45. "movq %%mm0, (%2) \n\t"
  46. "movq %%mm1, (%2, %3) \n\t"
  47. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h hits 0 */
  48. "subl $4, %0 \n\t"
  49. "jnz 1b \n\t"
  50. :"+g"(h), "+S"(pixels), "+D"(block)
  51. :"r" (line_size)
  52. :"%eax", "memory");
  53. }
  /*
   * put_pixels8_l2: for each row, stores to `dst` the PAVGB of 8 bytes from
   * `src1` with 8 bytes from `src2`.
   * PAVGB and DEF are macros defined outside this view; PAVGB is presumably
   * the packed-byte average instruction of the CPU flavour being built
   * (TODO confirm).
   *
   * dst        - destination, advanced by dstStride after every row
   * src1       - first source, advanced by src1Stride after every row
   * src2       - second source, read as tightly packed rows of 8 bytes
   *              (offsets 0, 8, 16, 24, then the pointer moves on by 32)
   * dstStride  - byte distance between destination rows
   * src1Stride - byte distance between src1 rows
   * h          - row count; 4 rows are handled per loop pass and the loop exits
   *              only when h reaches exactly 0, so h must be a positive
   *              multiple of 4
   *
   * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *               %4 = src1Stride, %5 = dstStride.
   * NOTE(review): the pointers are advanced with 32-bit "addl", and the MMX
   * registers written here are not listed as clobbers.
   */
  54. static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  55. {
  56. __asm __volatile(
  57. "1: \n\t"
  /* rows 0 and 1 of this pass: load two src1 rows, average with src2 */
  58. "movq (%1), %%mm0 \n\t"
  59. "addl %4, %1 \n\t"
  60. "movq (%1), %%mm1 \n\t"
  61. "addl %4, %1 \n\t"
  62. PAVGB" (%2), %%mm0 \n\t"
  63. PAVGB" 8(%2), %%mm1 \n\t"
  64. "movq %%mm0, (%3) \n\t"
  65. "addl %5, %3 \n\t"
  66. "movq %%mm1, (%3) \n\t"
  67. "addl %5, %3 \n\t"
  /* rows 2 and 3 of this pass */
  68. "movq (%1), %%mm0 \n\t"
  69. "addl %4, %1 \n\t"
  70. "movq (%1), %%mm1 \n\t"
  71. "addl %4, %1 \n\t"
  72. PAVGB" 16(%2), %%mm0 \n\t"
  73. PAVGB" 24(%2), %%mm1 \n\t"
  74. "movq %%mm0, (%3) \n\t"
  75. "addl %5, %3 \n\t"
  76. "movq %%mm1, (%3) \n\t"
  77. "addl %5, %3 \n\t"
  /* src2 consumed 4 rows * 8 bytes */
  78. "addl $32, %2 \n\t"
  79. "subl $4, %0 \n\t"
  80. "jnz 1b \n\t"
  81. :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  82. :"r"(src1Stride), "r"(dstStride)
  83. :"memory");
  84. }
  /*
   * put_pixels16_x2: 16-byte-wide version of the horizontal average. For each
   * row, stores to `block` the PAVGB of the 16 source bytes at `pixels` with
   * the 16 bytes that start one byte further right, handled as two 8-byte
   * halves (offsets 0 and 8).
   * PAVGB and DEF are macros defined outside this view; PAVGB is presumably
   * the packed-byte average instruction of the CPU flavour being built
   * (TODO confirm).
   *
   * block     - destination; 16 bytes are written per row
   * pixels    - source; bytes 0..16 of each row are read
   * line_size - byte distance between rows, used for both source and destination
   * h         - row count; 4 rows are handled per loop pass and the loop exits
   *             only when h reaches exactly 0, so h must be a positive
   *             multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * NOTE(review): the pointers are advanced with 32-bit "addl", and the MMX
   * registers written here are not listed as clobbers.
   */
  85. static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  86. {
  87. __asm __volatile(
  /* eax = 2 * line_size: source and destination advance two rows at a time */
  88. "lea (%3, %3), %%eax \n\t"
  89. "1: \n\t"
  /* rows 0 and 1: mm0/mm1 hold the left halves, mm2/mm3 the right halves */
  90. "movq (%1), %%mm0 \n\t"
  91. "movq (%1, %3), %%mm1 \n\t"
  92. "movq 8(%1), %%mm2 \n\t"
  93. "movq 8(%1, %3), %%mm3 \n\t"
  94. PAVGB" 1(%1), %%mm0 \n\t"
  95. PAVGB" 1(%1, %3), %%mm1 \n\t"
  96. PAVGB" 9(%1), %%mm2 \n\t"
  97. PAVGB" 9(%1, %3), %%mm3 \n\t"
  98. "movq %%mm0, (%2) \n\t"
  99. "movq %%mm1, (%2, %3) \n\t"
  100. "movq %%mm2, 8(%2) \n\t"
  101. "movq %%mm3, 8(%2, %3) \n\t"
  102. "addl %%eax, %1 \n\t"
  103. "addl %%eax, %2 \n\t"
  /* rows 2 and 3 */
  104. "movq (%1), %%mm0 \n\t"
  105. "movq (%1, %3), %%mm1 \n\t"
  106. "movq 8(%1), %%mm2 \n\t"
  107. "movq 8(%1, %3), %%mm3 \n\t"
  108. PAVGB" 1(%1), %%mm0 \n\t"
  109. PAVGB" 1(%1, %3), %%mm1 \n\t"
  110. PAVGB" 9(%1), %%mm2 \n\t"
  111. PAVGB" 9(%1, %3), %%mm3 \n\t"
  112. "addl %%eax, %1 \n\t"
  113. "movq %%mm0, (%2) \n\t"
  114. "movq %%mm1, (%2, %3) \n\t"
  115. "movq %%mm2, 8(%2) \n\t"
  116. "movq %%mm3, 8(%2, %3) \n\t"
  117. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h hits 0 */
  118. "subl $4, %0 \n\t"
  119. "jnz 1b \n\t"
  120. :"+g"(h), "+S"(pixels), "+D"(block)
  121. :"r" (line_size)
  122. :"%eax", "memory");
  123. }
  /*
   * put_pixels16_l2: for each row, stores to `dst` the PAVGB of 16 bytes from
   * `src1` with 16 bytes from `src2`, handled as two 8-byte halves.
   * PAVGB and DEF are macros defined outside this view; PAVGB is presumably
   * the packed-byte average instruction of the CPU flavour being built
   * (TODO confirm).
   *
   * dst        - destination, advanced by dstStride after every row
   * src1       - first source, advanced by src1Stride after every row
   * src2       - second source, read as tightly packed rows of 16 bytes
   *              (offsets 0/8 for one row, 16/24 for the next, then +32)
   * dstStride  - byte distance between destination rows
   * src1Stride - byte distance between src1 rows
   * h          - row count; 2 rows are handled per loop pass and the loop exits
   *              only when h reaches exactly 0, so h must be a positive
   *              multiple of 2
   *
   * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
   *               %4 = src1Stride, %5 = dstStride.
   * NOTE(review): the pointers are advanced with 32-bit "addl", and the MMX
   * registers written here are not listed as clobbers.
   */
  124. static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  125. {
  126. __asm __volatile(
  127. "1: \n\t"
  /* first row of this pass */
  128. "movq (%1), %%mm0 \n\t"
  129. "movq 8(%1), %%mm1 \n\t"
  130. "addl %4, %1 \n\t"
  131. PAVGB" (%2), %%mm0 \n\t"
  132. PAVGB" 8(%2), %%mm1 \n\t"
  133. "movq %%mm0, (%3) \n\t"
  134. "movq %%mm1, 8(%3) \n\t"
  135. "addl %5, %3 \n\t"
  /* second row of this pass */
  136. "movq (%1), %%mm0 \n\t"
  137. "movq 8(%1), %%mm1 \n\t"
  138. "addl %4, %1 \n\t"
  139. PAVGB" 16(%2), %%mm0 \n\t"
  140. PAVGB" 24(%2), %%mm1 \n\t"
  141. "movq %%mm0, (%3) \n\t"
  142. "movq %%mm1, 8(%3) \n\t"
  143. "addl %5, %3 \n\t"
  /* src2 consumed 2 rows * 16 bytes */
  144. "addl $32, %2 \n\t"
  145. "subl $2, %0 \n\t"
  146. "jnz 1b \n\t"
  147. :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  148. :"r"(src1Stride), "r"(dstStride)
  149. :"memory");
  150. }
  /*
   * put_no_rnd_pixels8_x2: horizontal average of pixels[x] and pixels[x+1],
   * 8 bytes per row, where the left operand has mm6 subtracted (unsigned
   * saturating, psubusb) before PAVGB is applied.
   * MOVQ_BONE, PAVGB and DEF are macros defined outside this view;
   * MOVQ_BONE(mm6) presumably loads the byte value 1 into every lane of mm6 so
   * that the subtraction cancels PAVGB's upward rounding (TODO confirm). The
   * saturation in psubusb is what the "GL" caveat below refers to.
   *
   * block     - destination; 8 bytes are written per row
   * pixels    - source; bytes 0..8 of each row are read
   * line_size - byte distance between rows, used for both source and destination
   * h         - row count; 4 rows are handled per loop pass and the loop exits
   *             only when h reaches exactly 0, so h must be a positive
   *             multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * NOTE(review): mm6 is set up by a separate statement before the asm block,
   * the pointers are advanced with 32-bit "addl", and the MMX registers
   * written here are not listed as clobbers.
   */
  151. /* GL: this function does incorrect rounding if overflow */
  152. static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  153. {
  154. MOVQ_BONE(mm6);
  155. __asm __volatile(
  /* eax = 2 * line_size: source and destination advance two rows at a time */
  156. "lea (%3, %3), %%eax \n\t"
  157. "1: \n\t"
  /* rows 0 and 1: mm0/mm2 = pixels[x], mm1/mm3 = pixels[x+1] */
  158. "movq (%1), %%mm0 \n\t"
  159. "movq (%1, %3), %%mm2 \n\t"
  160. "movq 1(%1), %%mm1 \n\t"
  161. "movq 1(%1, %3), %%mm3 \n\t"
  162. "addl %%eax, %1 \n\t"
  /* bias the left operand by mm6 before averaging */
  163. "psubusb %%mm6, %%mm0 \n\t"
  164. "psubusb %%mm6, %%mm2 \n\t"
  165. PAVGB" %%mm1, %%mm0 \n\t"
  166. PAVGB" %%mm3, %%mm2 \n\t"
  167. "movq %%mm0, (%2) \n\t"
  168. "movq %%mm2, (%2, %3) \n\t"
  /* rows 2 and 3 */
  169. "movq (%1), %%mm0 \n\t"
  170. "movq 1(%1), %%mm1 \n\t"
  171. "movq (%1, %3), %%mm2 \n\t"
  172. "movq 1(%1, %3), %%mm3 \n\t"
  173. "addl %%eax, %2 \n\t"
  174. "addl %%eax, %1 \n\t"
  175. "psubusb %%mm6, %%mm0 \n\t"
  176. "psubusb %%mm6, %%mm2 \n\t"
  177. PAVGB" %%mm1, %%mm0 \n\t"
  178. PAVGB" %%mm3, %%mm2 \n\t"
  179. "movq %%mm0, (%2) \n\t"
  180. "movq %%mm2, (%2, %3) \n\t"
  181. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h hits 0 */
  182. "subl $4, %0 \n\t"
  183. "jnz 1b \n\t"
  184. :"+g"(h), "+S"(pixels), "+D"(block)
  185. :"r" (line_size)
  186. :"%eax", "memory");
  187. }
  /*
   * put_pixels8_y2: vertical average. Destination row r receives the PAVGB of
   * source rows r and r+1, 8 bytes per row.
   * PAVGB and DEF are macros defined outside this view; PAVGB is presumably
   * the packed-byte average instruction of the CPU flavour being built
   * (TODO confirm).
   *
   * block     - destination; 8 bytes are written per row
   * pixels    - source; h + 1 rows of 8 bytes are read (row 0 before the loop,
   *             then four further rows per pass)
   * line_size - byte distance between rows, used for both source and destination
   * h         - row count; 4 rows are handled per loop pass and the loop exits
   *             only when h reaches exactly 0, so h must be a positive
   *             multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * The most recently loaded source row is carried from one pass to the next
   * in mm0, so every source row is loaded only once.
   * NOTE(review): the pointers are adjusted with 32-bit "addl"/"subl", and the
   * MMX registers written here are not listed as clobbers.
   */
  188. static void DEF(put_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  189. {
  190. __asm __volatile(
  /* eax = 2 * line_size */
  191. "lea (%3, %3), %%eax \n\t"
  /* preload source row 0 */
  192. "movq (%1), %%mm0 \n\t"
  /* bias block back one row so (%2, %3) / (%2, %%eax) address the two rows being written */
  193. "subl %3, %2 \n\t"
  194. "1: \n\t"
  /* mm1 = next source row, mm2 = the row after it */
  195. "movq (%1, %3), %%mm1 \n\t"
  196. "movq (%1, %%eax), %%mm2 \n\t"
  197. "addl %%eax, %1 \n\t"
  198. PAVGB" %%mm1, %%mm0 \n\t"
  199. PAVGB" %%mm2, %%mm1 \n\t"
  200. "movq %%mm0, (%2, %3) \n\t"
  201. "movq %%mm1, (%2, %%eax) \n\t"
  /* second half of the pass: mm2 is now the carried row, mm0 ends up holding the newest row */
  202. "movq (%1, %3), %%mm1 \n\t"
  203. "movq (%1, %%eax), %%mm0 \n\t"
  204. "addl %%eax, %2 \n\t"
  205. "addl %%eax, %1 \n\t"
  206. PAVGB" %%mm1, %%mm2 \n\t"
  207. PAVGB" %%mm0, %%mm1 \n\t"
  208. "movq %%mm2, (%2, %3) \n\t"
  209. "movq %%mm1, (%2, %%eax) \n\t"
  210. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h hits 0 */
  211. "subl $4, %0 \n\t"
  212. "jnz 1b \n\t"
  213. :"+g"(h), "+S"(pixels), "+D" (block)
  214. :"r" (line_size)
  215. :"%eax", "memory");
  216. }
  /*
   * put_no_rnd_pixels8_y2: vertical average of adjacent source rows, 8 bytes
   * per row, where every second source row (the one held in mm1) has mm6
   * subtracted (unsigned saturating, psubusb) before it takes part in the two
   * PAVGB operations that use it.
   * MOVQ_BONE, PAVGB and DEF are macros defined outside this view;
   * MOVQ_BONE(mm6) presumably loads the byte value 1 into every lane of mm6 so
   * that the subtraction cancels PAVGB's upward rounding (TODO confirm). The
   * saturation in psubusb is what the "GL" caveat below refers to.
   *
   * block     - destination; 8 bytes are written per row
   * pixels    - source; h + 1 rows of 8 bytes are read (row 0 before the loop,
   *             then four further rows per pass)
   * line_size - byte distance between rows, used for both source and destination
   * h         - row count; 4 rows are handled per loop pass and the loop exits
   *             only when h reaches exactly 0, so h must be a positive
   *             multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * NOTE(review): mm6 is set up by a separate statement before the asm block,
   * the pointers are adjusted with 32-bit "addl"/"subl", and the MMX registers
   * written here are not listed as clobbers.
   */
  217. /* GL: this function does incorrect rounding if overflow */
  218. static void DEF(put_no_rnd_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  219. {
  220. MOVQ_BONE(mm6);
  221. __asm __volatile(
  /* eax = 2 * line_size */
  222. "lea (%3, %3), %%eax \n\t"
  /* preload source row 0 */
  223. "movq (%1), %%mm0 \n\t"
  /* bias block back one row so (%2, %3) / (%2, %%eax) address the two rows being written */
  224. "subl %3, %2 \n\t"
  225. "1: \n\t"
  226. "movq (%1, %3), %%mm1 \n\t"
  227. "movq (%1, %%eax), %%mm2 \n\t"
  228. "addl %%eax, %1 \n\t"
  /* the biased middle row is shared by both averages below */
  229. "psubusb %%mm6, %%mm1 \n\t"
  230. PAVGB" %%mm1, %%mm0 \n\t"
  231. PAVGB" %%mm2, %%mm1 \n\t"
  232. "movq %%mm0, (%2, %3) \n\t"
  233. "movq %%mm1, (%2, %%eax) \n\t"
  /* second half of the pass: mm2 is the carried row, mm0 ends up holding the newest row */
  234. "movq (%1, %3), %%mm1 \n\t"
  235. "movq (%1, %%eax), %%mm0 \n\t"
  236. "addl %%eax, %2 \n\t"
  237. "addl %%eax, %1 \n\t"
  238. "psubusb %%mm6, %%mm1 \n\t"
  239. PAVGB" %%mm1, %%mm2 \n\t"
  240. PAVGB" %%mm0, %%mm1 \n\t"
  241. "movq %%mm2, (%2, %3) \n\t"
  242. "movq %%mm1, (%2, %%eax) \n\t"
  243. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h hits 0 */
  244. "subl $4, %0 \n\t"
  245. "jnz 1b \n\t"
  246. :"+g"(h), "+S"(pixels), "+D" (block)
  247. :"r" (line_size)
  248. :"%eax", "memory");
  249. }
  /*
   * avg_pixels8: for each row, replaces the 8 bytes at `block` with the PAVGB
   * of their current contents and the 8 bytes at `pixels`.
   * PAVGB and DEF are macros defined outside this view; PAVGB is presumably
   * the packed-byte average instruction of the CPU flavour being built
   * (TODO confirm).
   *
   * block     - destination; read and then overwritten, 8 bytes per row
   * pixels    - source; 8 bytes are read per row
   * line_size - byte distance between rows, used for both source and destination
   * h         - row count; 4 rows are handled per loop pass and the loop exits
   *             only when h reaches exactly 0, so h must be a positive
   *             multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * NOTE(review): the pointers are advanced with 32-bit "addl", and the MMX
   * registers written here are not listed as clobbers.
   */
  250. static void DEF(avg_pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  251. {
  252. __asm __volatile(
  /* eax = 2 * line_size: source and destination advance two rows at a time */
  253. "lea (%3, %3), %%eax \n\t"
  254. "1: \n\t"
  /* rows 0 and 1: load the existing destination, average the source into it */
  255. "movq (%2), %%mm0 \n\t"
  256. "movq (%2, %3), %%mm1 \n\t"
  257. PAVGB" (%1), %%mm0 \n\t"
  258. PAVGB" (%1, %3), %%mm1 \n\t"
  259. "movq %%mm0, (%2) \n\t"
  260. "movq %%mm1, (%2, %3) \n\t"
  261. "addl %%eax, %1 \n\t"
  262. "addl %%eax, %2 \n\t"
  /* rows 2 and 3 */
  263. "movq (%2), %%mm0 \n\t"
  264. "movq (%2, %3), %%mm1 \n\t"
  265. PAVGB" (%1), %%mm0 \n\t"
  266. PAVGB" (%1, %3), %%mm1 \n\t"
  267. "addl %%eax, %1 \n\t"
  268. "movq %%mm0, (%2) \n\t"
  269. "movq %%mm1, (%2, %3) \n\t"
  270. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h hits 0 */
  271. "subl $4, %0 \n\t"
  272. "jnz 1b \n\t"
  273. :"+g"(h), "+S"(pixels), "+D"(block)
  274. :"r" (line_size)
  275. :"%eax", "memory");
  276. }
  /*
   * avg_pixels8_x2: for each row, first forms the PAVGB of the 8 source bytes
   * at `pixels` with the 8 bytes one byte further right, then PAVGBs that
   * result with the current contents of `block` and stores it back.
   * PAVGB and DEF are macros defined outside this view; PAVGB is presumably
   * the packed-byte average instruction of the CPU flavour being built
   * (TODO confirm).
   *
   * block     - destination; read and then overwritten, 8 bytes per row
   * pixels    - source; bytes 0..8 of each row are read
   * line_size - byte distance between rows, used for both source and destination
   * h         - row count; 4 rows are handled per loop pass and the loop exits
   *             only when h reaches exactly 0, so h must be a positive
   *             multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * NOTE(review): the pointers are advanced with 32-bit "addl", and the MMX
   * registers written here are not listed as clobbers.
   */
  277. static void DEF(avg_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  278. {
  279. __asm __volatile(
  /* eax = 2 * line_size: source and destination advance two rows at a time */
  280. "lea (%3, %3), %%eax \n\t"
  281. "1: \n\t"
  /* rows 0 and 1: horizontal average, then average with the destination */
  282. "movq (%1), %%mm0 \n\t"
  283. "movq (%1, %3), %%mm2 \n\t"
  284. PAVGB" 1(%1), %%mm0 \n\t"
  285. PAVGB" 1(%1, %3), %%mm2 \n\t"
  286. PAVGB" (%2), %%mm0 \n\t"
  287. PAVGB" (%2, %3), %%mm2 \n\t"
  288. "addl %%eax, %1 \n\t"
  289. "movq %%mm0, (%2) \n\t"
  290. "movq %%mm2, (%2, %3) \n\t"
  /* rows 2 and 3 */
  291. "movq (%1), %%mm0 \n\t"
  292. "movq (%1, %3), %%mm2 \n\t"
  293. PAVGB" 1(%1), %%mm0 \n\t"
  294. PAVGB" 1(%1, %3), %%mm2 \n\t"
  295. "addl %%eax, %2 \n\t"
  296. "addl %%eax, %1 \n\t"
  297. PAVGB" (%2), %%mm0 \n\t"
  298. PAVGB" (%2, %3), %%mm2 \n\t"
  299. "movq %%mm0, (%2) \n\t"
  300. "movq %%mm2, (%2, %3) \n\t"
  301. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h hits 0 */
  302. "subl $4, %0 \n\t"
  303. "jnz 1b \n\t"
  304. :"+g"(h), "+S"(pixels), "+D"(block)
  305. :"r" (line_size)
  306. :"%eax", "memory");
  307. }
  /*
   * avg_pixels8_y2: for each destination row r, forms the PAVGB of source rows
   * r and r+1 (8 bytes), then PAVGBs that result with the current contents of
   * `block` and stores it back.
   * PAVGB and DEF are macros defined outside this view; PAVGB is presumably
   * the packed-byte average instruction of the CPU flavour being built
   * (TODO confirm).
   *
   * block     - destination; read and then overwritten, 8 bytes per row
   * pixels    - source; h + 1 rows of 8 bytes are read (row 0 before the loop,
   *             then four further rows per pass)
   * line_size - byte distance between rows, used for both source and destination
   * h         - row count; 4 rows are handled per loop pass and the loop exits
   *             only when h reaches exactly 0, so h must be a positive
   *             multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * The most recently loaded source row is carried from one pass to the next
   * in mm0; mm3/mm4 hold the existing destination rows.
   * NOTE(review): the pointers are adjusted with 32-bit "addl"/"subl", and the
   * MMX registers written here are not listed as clobbers.
   */
  308. static void DEF(avg_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  309. {
  310. __asm __volatile(
  /* eax = 2 * line_size */
  311. "lea (%3, %3), %%eax \n\t"
  /* preload source row 0 */
  312. "movq (%1), %%mm0 \n\t"
  /* bias block back one row so (%2, %3) / (%2, %%eax) address the two rows being updated */
  313. "subl %3, %2 \n\t"
  314. "1: \n\t"
  /* vertical averages for the first two rows of this pass */
  315. "movq (%1, %3), %%mm1 \n\t"
  316. "movq (%1, %%eax), %%mm2 \n\t"
  317. "addl %%eax, %1 \n\t"
  318. PAVGB" %%mm1, %%mm0 \n\t"
  319. PAVGB" %%mm2, %%mm1 \n\t"
  /* blend with what is already in the destination */
  320. "movq (%2, %3), %%mm3 \n\t"
  321. "movq (%2, %%eax), %%mm4 \n\t"
  322. PAVGB" %%mm3, %%mm0 \n\t"
  323. PAVGB" %%mm4, %%mm1 \n\t"
  324. "movq %%mm0, (%2, %3) \n\t"
  325. "movq %%mm1, (%2, %%eax) \n\t"
  /* second half of the pass: mm2 is the carried row, mm0 ends up holding the newest row */
  326. "movq (%1, %3), %%mm1 \n\t"
  327. "movq (%1, %%eax), %%mm0 \n\t"
  328. PAVGB" %%mm1, %%mm2 \n\t"
  329. PAVGB" %%mm0, %%mm1 \n\t"
  330. "addl %%eax, %2 \n\t"
  331. "addl %%eax, %1 \n\t"
  332. "movq (%2, %3), %%mm3 \n\t"
  333. "movq (%2, %%eax), %%mm4 \n\t"
  334. PAVGB" %%mm3, %%mm2 \n\t"
  335. PAVGB" %%mm4, %%mm1 \n\t"
  336. "movq %%mm2, (%2, %3) \n\t"
  337. "movq %%mm1, (%2, %%eax) \n\t"
  338. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h hits 0 */
  339. "subl $4, %0 \n\t"
  340. "jnz 1b \n\t"
  341. :"+g"(h), "+S"(pixels), "+D"(block)
  342. :"r" (line_size)
  343. :"%eax", "memory");
  344. }
  /*
   * avg_pixels8_xy2: approximate 2x2 average blended into the destination.
   * Each source row is first PAVGB'd with itself shifted right by one byte;
   * vertically adjacent results are then PAVGB'd together, and that value is
   * PAVGB'd with the current contents of `block` and stored back. Because the
   * result is built from chained two-input averages it is not an exactly
   * rounded four-pixel mean, as the existing note below says.
   * MOVQ_BONE, PAVGB and DEF are macros defined outside this view;
   * MOVQ_BONE(mm6) presumably loads the byte value 1 into every lane of mm6
   * (TODO confirm). mm6 is subtracted (psubusb) only from the row loaded into
   * mm2 in the first half of each pass.
   *
   * block     - destination; read and then overwritten, 8 bytes per row
   * pixels    - source; h + 1 rows are read, bytes 0..8 of each
   * line_size - byte distance between rows, used for both source and destination
   * h         - row count; 4 rows are handled per loop pass and the loop exits
   *             only when h reaches exactly 0, so h must be a positive
   *             multiple of 4
   *
   * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
   * NOTE(review): mm6 is set up by a separate statement before the asm block,
   * the pointers are advanced with 32-bit "addl", and the MMX registers
   * written here are not listed as clobbers.
   */
  345. // Note this is not correctly rounded, but this function is only used for b frames so it doesn't matter
  346. static void DEF(avg_pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  347. {
  348. MOVQ_BONE(mm6);
  349. __asm __volatile(
  /* eax = 2 * line_size */
  350. "lea (%3, %3), %%eax \n\t"
  /* mm0 = horizontal average of source row 0, carried into the loop */
  351. "movq (%1), %%mm0 \n\t"
  352. PAVGB" 1(%1), %%mm0 \n\t"
  /* align the loop entry to an 8-byte boundary */
  353. ".balign 8 \n\t"
  354. "1: \n\t"
  /* horizontal averages of the next two source rows (mm1, mm2) */
  355. "movq (%1, %%eax), %%mm2 \n\t"
  356. "movq (%1, %3), %%mm1 \n\t"
  357. "psubusb %%mm6, %%mm2 \n\t"
  358. PAVGB" 1(%1, %3), %%mm1 \n\t"
  359. PAVGB" 1(%1, %%eax), %%mm2 \n\t"
  360. "addl %%eax, %1 \n\t"
  /* vertical averages, then blend with the destination */
  361. PAVGB" %%mm1, %%mm0 \n\t"
  362. PAVGB" %%mm2, %%mm1 \n\t"
  363. PAVGB" (%2), %%mm0 \n\t"
  364. PAVGB" (%2, %3), %%mm1 \n\t"
  365. "movq %%mm0, (%2) \n\t"
  366. "movq %%mm1, (%2, %3) \n\t"
  /* second half of the pass: mm2 is the carried row, mm0 ends up holding the newest row */
  367. "movq (%1, %3), %%mm1 \n\t"
  368. "movq (%1, %%eax), %%mm0 \n\t"
  369. PAVGB" 1(%1, %3), %%mm1 \n\t"
  370. PAVGB" 1(%1, %%eax), %%mm0 \n\t"
  371. "addl %%eax, %2 \n\t"
  372. "addl %%eax, %1 \n\t"
  373. PAVGB" %%mm1, %%mm2 \n\t"
  374. PAVGB" %%mm0, %%mm1 \n\t"
  375. PAVGB" (%2), %%mm2 \n\t"
  376. PAVGB" (%2, %3), %%mm1 \n\t"
  377. "movq %%mm2, (%2) \n\t"
  378. "movq %%mm1, (%2, %3) \n\t"
  379. "addl %%eax, %2 \n\t"
  /* four rows done; loop until h hits 0 */
  380. "subl $4, %0 \n\t"
  381. "jnz 1b \n\t"
  382. :"+g"(h), "+S"(pixels), "+D"(block)
  383. :"r" (line_size)
  384. :"%eax", "memory");
  385. }
  386. //FIXME the following could be optimized too ...
  387. static void DEF(put_no_rnd_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  388. DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
  389. DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
  390. }
  391. static void DEF(put_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  392. DEF(put_pixels8_y2)(block , pixels , line_size, h);
  393. DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
  394. }
  395. static void DEF(put_no_rnd_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  396. DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
  397. DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
  398. }
  399. static void DEF(avg_pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  400. DEF(avg_pixels8)(block , pixels , line_size, h);
  401. DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
  402. }
  403. static void DEF(avg_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  404. DEF(avg_pixels8_x2)(block , pixels , line_size, h);
  405. DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
  406. }
  407. static void DEF(avg_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  408. DEF(avg_pixels8_y2)(block , pixels , line_size, h);
  409. DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
  410. }
  411. static void DEF(avg_pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  412. DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
  413. DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
  414. }