You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

345 lines
7.7KB

  1. /*
  2. * DSP utils : average functions are compiled twice for 3dnow/mmx2
  3. * Copyright (c) 2000, 2001 Gerard Lantau.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, write to the Free Software
  17. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18. *
  19. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  20. */
  21. static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  22. {
  23. int dh, hh;
  24. UINT8 *p;
  25. const UINT8 *pix;
  26. p = block;
  27. pix = pixels;
  28. hh=h>>2;
  29. dh=h&3;
  30. while(hh--) {
  31. __asm __volatile(
  32. "movq %4, %%mm0\n\t"
  33. "movq 1%4, %%mm1\n\t"
  34. "movq %5, %%mm2\n\t"
  35. "movq 1%5, %%mm3\n\t"
  36. "movq %6, %%mm4\n\t"
  37. "movq 1%6, %%mm5\n\t"
  38. "movq %7, %%mm6\n\t"
  39. "movq 1%7, %%mm7\n\t"
  40. PAVGB" %%mm1, %%mm0\n\t"
  41. PAVGB" %%mm3, %%mm2\n\t"
  42. PAVGB" %%mm5, %%mm4\n\t"
  43. PAVGB" %%mm7, %%mm6\n\t"
  44. "movq %%mm0, %0\n\t"
  45. "movq %%mm2, %1\n\t"
  46. "movq %%mm4, %2\n\t"
  47. "movq %%mm6, %3\n\t"
  48. :"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3))
  49. :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3))
  50. :"memory");
  51. pix += line_size*4; p += line_size*4;
  52. }
  53. while(dh--) {
  54. __asm __volatile(
  55. "movq %1, %%mm0\n\t"
  56. "movq 1%1, %%mm1\n\t"
  57. PAVGB" %%mm1, %%mm0\n\t"
  58. "movq %%mm0, %0\n\t"
  59. :"=m"(*p)
  60. :"m"(*pix)
  61. :"memory");
  62. pix += line_size; p += line_size;
  63. }
  64. }
  65. static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  66. {
  67. int dh, hh;
  68. UINT8 *p;
  69. const UINT8 *pix;
  70. p = block;
  71. pix = pixels;
  72. hh=h>>1;
  73. dh=h&1;
  74. while(hh--) {
  75. __asm __volatile(
  76. "movq %2, %%mm0\n\t"
  77. "movq %3, %%mm1\n\t"
  78. "movq %4, %%mm2\n\t"
  79. PAVGB" %%mm1, %%mm0\n\t"
  80. PAVGB" %%mm2, %%mm1\n\t"
  81. "movq %%mm0, %0\n\t"
  82. "movq %%mm1, %1\n\t"
  83. :"=m"(*p), "=m"(*(p+line_size))
  84. :"m"(*pix), "m"(*(pix+line_size)),
  85. "m"(*(pix+line_size*2))
  86. :"memory");
  87. pix += line_size*2;
  88. p += line_size*2;
  89. }
  90. if(dh) {
  91. __asm __volatile(
  92. "movq %1, %%mm0\n\t"
  93. "movq %2, %%mm1\n\t"
  94. PAVGB" %%mm1, %%mm0\n\t"
  95. "movq %%mm0, %0\n\t"
  96. :"=m"(*p)
  97. :"m"(*pix),
  98. "m"(*(pix+line_size))
  99. :"memory");
  100. }
  101. }
  102. static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  103. {
  104. int dh, hh;
  105. UINT8 *p;
  106. const UINT8 *pix;
  107. p = block;
  108. pix = pixels;
  109. hh=h>>2;
  110. dh=h&3;
  111. while(hh--) {
  112. __asm __volatile(
  113. "movq %0, %%mm0\n\t"
  114. "movq %4, %%mm1\n\t"
  115. "movq %1, %%mm2\n\t"
  116. "movq %5, %%mm3\n\t"
  117. "movq %2, %%mm4\n\t"
  118. "movq %6, %%mm5\n\t"
  119. "movq %3, %%mm6\n\t"
  120. "movq %7, %%mm7\n\t"
  121. PAVGB" %%mm1, %%mm0\n\t"
  122. PAVGB" %%mm3, %%mm2\n\t"
  123. PAVGB" %%mm5, %%mm4\n\t"
  124. PAVGB" %%mm7, %%mm6\n\t"
  125. "movq %%mm0, %0\n\t"
  126. "movq %%mm2, %1\n\t"
  127. "movq %%mm4, %2\n\t"
  128. "movq %%mm6, %3\n\t"
  129. :"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3))
  130. :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3))
  131. :"memory");
  132. pix += line_size*4; p += line_size*4;
  133. }
  134. while(dh--) {
  135. __asm __volatile(
  136. "movq %0, %%mm0\n\t"
  137. "movq %1, %%mm1\n\t"
  138. PAVGB" %%mm1, %%mm0\n\t"
  139. "movq %%mm0, %0\n\t"
  140. :"=m"(*p)
  141. :"m"(*pix)
  142. :"memory");
  143. pix += line_size; p += line_size;
  144. }
  145. }
  146. static void DEF(avg_pixels_x2)( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  147. {
  148. int dh, hh;
  149. UINT8 *p;
  150. const UINT8 *pix;
  151. p = block;
  152. pix = pixels;
  153. hh=h>>1;
  154. dh=h&1;
  155. while(hh--) {
  156. __asm __volatile(
  157. "movq %2, %%mm2\n\t"
  158. "movq 1%2, %%mm3\n\t"
  159. "movq %3, %%mm4\n\t"
  160. "movq 1%3, %%mm5\n\t"
  161. "movq %0, %%mm0\n\t"
  162. "movq %1, %%mm1\n\t"
  163. PAVGB" %%mm3, %%mm2\n\t"
  164. PAVGB" %%mm2, %%mm0\n\t"
  165. PAVGB" %%mm5, %%mm4\n\t"
  166. PAVGB" %%mm4, %%mm1\n\t"
  167. "movq %%mm0, %0\n\t"
  168. "movq %%mm1, %1\n\t"
  169. :"=m"(*p), "=m"(*(p+line_size))
  170. :"m"(*pix), "m"(*(pix+line_size))
  171. :"memory");
  172. pix += line_size*2;
  173. p += line_size*2;
  174. }
  175. if(dh) {
  176. __asm __volatile(
  177. "movq %1, %%mm1\n\t"
  178. "movq 1%1, %%mm2\n\t"
  179. "movq %0, %%mm0\n\t"
  180. PAVGB" %%mm2, %%mm1\n\t"
  181. PAVGB" %%mm1, %%mm0\n\t"
  182. "movq %%mm0, %0\n\t"
  183. :"=m"(*p)
  184. :"m"(*pix)
  185. :"memory");
  186. }
  187. }
  188. static void DEF(avg_pixels_y2)( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  189. {
  190. int dh, hh;
  191. UINT8 *p;
  192. const UINT8 *pix;
  193. p = block;
  194. pix = pixels;
  195. hh=h>>1;
  196. dh=h&1;
  197. while(hh--) {
  198. __asm __volatile(
  199. "movq %2, %%mm2\n\t"
  200. "movq %3, %%mm3\n\t"
  201. "movq %3, %%mm4\n\t"
  202. "movq %4, %%mm5\n\t"
  203. "movq %0, %%mm0\n\t"
  204. "movq %1, %%mm1\n\t"
  205. PAVGB" %%mm3, %%mm2\n\t"
  206. PAVGB" %%mm2, %%mm0\n\t"
  207. PAVGB" %%mm5, %%mm4\n\t"
  208. PAVGB" %%mm4, %%mm1\n\t"
  209. "movq %%mm0, %0\n\t"
  210. "movq %%mm1, %1\n\t"
  211. :"=m"(*p), "=m"(*(p+line_size))
  212. :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2))
  213. :"memory");
  214. pix += line_size*2;
  215. p += line_size*2;
  216. }
  217. if(dh) {
  218. __asm __volatile(
  219. "movq %1, %%mm1\n\t"
  220. "movq %2, %%mm2\n\t"
  221. "movq %0, %%mm0\n\t"
  222. PAVGB" %%mm2, %%mm1\n\t"
  223. PAVGB" %%mm1, %%mm0\n\t"
  224. "movq %%mm0, %0\n\t"
  225. :"=m"(*p)
  226. :"m"(*pix), "m"(*(pix+line_size))
  227. :"memory");
  228. }
  229. }
  230. static void DEF(avg_pixels_xy2)( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  231. {
  232. UINT8 *p;
  233. const UINT8 *pix;
  234. p = block;
  235. pix = pixels;
  236. __asm __volatile(
  237. "pxor %%mm7, %%mm7\n\t"
  238. "movq %0, %%mm6\n\t"
  239. ::"m"(mm_wtwo):"memory");
  240. do {
  241. __asm __volatile(
  242. "movq %1, %%mm0\n\t"
  243. "movq %2, %%mm1\n\t"
  244. "movq 1%1, %%mm4\n\t"
  245. "movq 1%2, %%mm5\n\t"
  246. "movq %%mm0, %%mm2\n\t"
  247. "movq %%mm1, %%mm3\n\t"
  248. "punpcklbw %%mm7, %%mm0\n\t"
  249. "punpcklbw %%mm7, %%mm1\n\t"
  250. "punpckhbw %%mm7, %%mm2\n\t"
  251. "punpckhbw %%mm7, %%mm3\n\t"
  252. "paddusw %%mm1, %%mm0\n\t"
  253. "paddusw %%mm3, %%mm2\n\t"
  254. "movq %%mm4, %%mm1\n\t"
  255. "movq %%mm5, %%mm3\n\t"
  256. "punpcklbw %%mm7, %%mm4\n\t"
  257. "punpcklbw %%mm7, %%mm5\n\t"
  258. "punpckhbw %%mm7, %%mm1\n\t"
  259. "punpckhbw %%mm7, %%mm3\n\t"
  260. "paddusw %%mm5, %%mm4\n\t"
  261. "paddusw %%mm3, %%mm1\n\t"
  262. "paddusw %%mm6, %%mm4\n\t"
  263. "paddusw %%mm6, %%mm1\n\t"
  264. "paddusw %%mm4, %%mm0\n\t"
  265. "paddusw %%mm1, %%mm2\n\t"
  266. "psrlw $2, %%mm0\n\t"
  267. "psrlw $2, %%mm2\n\t"
  268. "packuswb %%mm2, %%mm0\n\t"
  269. PAVGB" %0, %%mm0\n\t"
  270. "movq %%mm0, %0\n\t"
  271. :"=m"(*p)
  272. :"m"(*pix),
  273. "m"(*(pix+line_size))
  274. :"memory");
  275. pix += line_size;
  276. p += line_size ;
  277. } while(--h);
  278. }
  279. static void DEF(sub_pixels_x2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  280. {
  281. DCTELEM *p;
  282. const UINT8 *pix;
  283. p = block;
  284. pix = pixels;
  285. __asm __volatile(
  286. "pxor %%mm7, %%mm7":::"memory");
  287. do {
  288. __asm __volatile(
  289. "movq 1%1, %%mm2\n\t"
  290. "movq %0, %%mm0\n\t"
  291. PAVGB" %1, %%mm2\n\t"
  292. "movq 8%0, %%mm1\n\t"
  293. "movq %%mm2, %%mm3\n\t"
  294. "punpcklbw %%mm7, %%mm2\n\t"
  295. "punpckhbw %%mm7, %%mm3\n\t"
  296. "psubsw %%mm2, %%mm0\n\t"
  297. "psubsw %%mm3, %%mm1\n\t"
  298. "movq %%mm0, %0\n\t"
  299. "movq %%mm1, 8%0\n\t"
  300. :"=m"(*p)
  301. :"m"(*pix)
  302. :"memory");
  303. pix += line_size;
  304. p += 8;
  305. } while (--h);
  306. }
  307. static void DEF(sub_pixels_y2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  308. {
  309. DCTELEM *p;
  310. const UINT8 *pix;
  311. p = block;
  312. pix = pixels;
  313. __asm __volatile(
  314. "pxor %%mm7, %%mm7":::"memory");
  315. do {
  316. __asm __volatile(
  317. "movq %2, %%mm2\n\t"
  318. "movq %0, %%mm0\n\t"
  319. PAVGB" %1, %%mm2\n\t"
  320. "movq 8%0, %%mm1\n\t"
  321. "movq %%mm2, %%mm3\n\t"
  322. "punpcklbw %%mm7, %%mm2\n\t"
  323. "punpckhbw %%mm7, %%mm3\n\t"
  324. "psubsw %%mm2, %%mm0\n\t"
  325. "psubsw %%mm3, %%mm1\n\t"
  326. "movq %%mm0, %0\n\t"
  327. "movq %%mm1, 8%0\n\t"
  328. :"=m"(*p)
  329. :"m"(*pix), "m"(*(pix+line_size))
  330. :"memory");
  331. pix += line_size;
  332. p += 8;
  333. } while (--h);
  334. }