You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

424 lines
12KB

  1. /*
  2. * DSP utils mmx functions are compiled twice for rnd/no_rnd
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with this library; if not, write to the Free Software
  17. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. *
  19. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  20. * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  21. * and improved by Zdenek Kabelac <kabi@users.sf.net>
  22. */
  23. // put_pixels
  /*
   * DEF(put, pixels8_x2): fills an 8-byte-wide destination block where every
   * output row is built from the source bytes at offset 0 and offset 1 of the
   * same source row (9 source bytes are read per row).
   *
   * block     - destination; advanced inside the asm ("+D")
   * pixels    - source; advanced inside the asm ("+S")
   * line_size - byte distance between consecutive rows (used for both buffers)
   * h         - row count; the loop subtracts 4 per pass and exits only when
   *             the counter reaches exactly zero, so it is expected to be a
   *             positive multiple of 4
   *
   * PAVGBP and MOVQ_BFE are macros defined outside this view. Presumably
   * PAVGBP(a, b, c, d, e, f) averages a with b into c and d with e into f,
   * with rounding selected by the rnd/no_rnd build - TODO confirm.
   * NOTE(review): pointers are advanced with 32-bit addl/%eax, so as written
   * this targets 32-bit x86. MMX registers are used but not listed as clobbers.
   */
  24. static void DEF(put, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  25. {
  26. MOVQ_BFE(mm6); // presumably loads the constant PAVGBP expects in mm6 - confirm against the macro
  27. __asm __volatile(
  28. "lea (%3, %3), %%eax \n\t" // eax = 2 * line_size (step for two rows)
  29. ".balign 8 \n\t"
  30. "1: \n\t"
  31. "movq (%1), %%mm0 \n\t" // row 0, bytes 0..7
  32. "movq 1(%1), %%mm1 \n\t" // row 0, bytes 1..8
  33. "movq (%1, %3), %%mm2 \n\t" // row 1, bytes 0..7
  34. "movq 1(%1, %3), %%mm3 \n\t" // row 1, bytes 1..8
  35. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  36. "movq %%mm4, (%2) \n\t" // store output row 0
  37. "movq %%mm5, (%2, %3) \n\t" // store output row 1
  38. "addl %%eax, %1 \n\t" // advance source by two rows
  39. "addl %%eax, %2 \n\t" // advance destination by two rows
  40. "movq (%1), %%mm0 \n\t" // same sequence again for rows 2 and 3
  41. "movq 1(%1), %%mm1 \n\t"
  42. "movq (%1, %3), %%mm2 \n\t"
  43. "movq 1(%1, %3), %%mm3 \n\t"
  44. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  45. "movq %%mm4, (%2) \n\t"
  46. "movq %%mm5, (%2, %3) \n\t"
  47. "addl %%eax, %1 \n\t"
  48. "addl %%eax, %2 \n\t"
  49. "subl $4, %0 \n\t" // four rows were produced in this pass
  50. "jnz 1b \n\t"
  51. :"+g"(h), "+S"(pixels), "+D"(block)
  52. :"r"(line_size)
  53. :"eax", "memory");
  54. }
  /*
   * DEF(put, pixels16_x2): 16-byte-wide variant of the x2 routine. For every
   * row it combines the source bytes at offsets 0/1 (left 8 columns) and at
   * offsets 8/9 (right 8 columns), so 17 source bytes are read per row.
   *
   * block     - destination; advanced inside the asm ("+D")
   * pixels    - source; advanced inside the asm ("+S")
   * line_size - byte distance between consecutive rows (used for both buffers)
   * h         - row count; decremented by 4 per pass and tested for exactly
   *             zero, so it is expected to be a positive multiple of 4
   *
   * PAVGBP and MOVQ_BFE are defined outside this view; PAVGBP presumably
   * averages (a, b) into c and (d, e) into f - TODO confirm.
   * NOTE(review): uses 32-bit addl/%eax on the pointers (32-bit x86 only as
   * written); MMX registers are not listed as clobbers.
   */
  55. static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  56. {
  57. MOVQ_BFE(mm6); // presumably loads the constant PAVGBP expects in mm6 - confirm against the macro
  58. __asm __volatile(
  59. "lea (%3, %3), %%eax \n\t" // eax = 2 * line_size
  60. ".balign 8 \n\t"
  61. "1: \n\t"
  62. "movq (%1), %%mm0 \n\t" // rows 0/1, left half: bytes 0..7 and 1..8
  63. "movq 1(%1), %%mm1 \n\t"
  64. "movq (%1, %3), %%mm2 \n\t"
  65. "movq 1(%1, %3), %%mm3 \n\t"
  66. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  67. "movq %%mm4, (%2) \n\t"
  68. "movq %%mm5, (%2, %3) \n\t"
  69. "movq 8(%1), %%mm0 \n\t" // rows 0/1, right half: bytes 8..15 and 9..16
  70. "movq 9(%1), %%mm1 \n\t"
  71. "movq 8(%1, %3), %%mm2 \n\t"
  72. "movq 9(%1, %3), %%mm3 \n\t"
  73. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  74. "movq %%mm4, 8(%2) \n\t"
  75. "movq %%mm5, 8(%2, %3) \n\t"
  76. "addl %%eax, %1 \n\t" // advance both pointers by two rows
  77. "addl %%eax, %2 \n\t"
  78. "movq (%1), %%mm0 \n\t" // rows 2/3, left half
  79. "movq 1(%1), %%mm1 \n\t"
  80. "movq (%1, %3), %%mm2 \n\t"
  81. "movq 1(%1, %3), %%mm3 \n\t"
  82. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  83. "movq %%mm4, (%2) \n\t"
  84. "movq %%mm5, (%2, %3) \n\t"
  85. "movq 8(%1), %%mm0 \n\t" // rows 2/3, right half
  86. "movq 9(%1), %%mm1 \n\t"
  87. "movq 8(%1, %3), %%mm2 \n\t"
  88. "movq 9(%1, %3), %%mm3 \n\t"
  89. PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
  90. "movq %%mm4, 8(%2) \n\t"
  91. "movq %%mm5, 8(%2, %3) \n\t"
  92. "addl %%eax, %1 \n\t"
  93. "addl %%eax, %2 \n\t"
  94. "subl $4, %0 \n\t" // four rows were produced in this pass
  95. "jnz 1b \n\t"
  96. :"+g"(h), "+S"(pixels), "+D"(block)
  97. :"r"(line_size)
  98. :"eax", "memory");
  99. }
  /*
   * DEF(put, pixels8_y2): fills an 8-byte-wide destination block where every
   * output row is built from a source row and the row directly below it.
   * Source rows 0..h are read (h + 1 rows in total).
   *
   * block     - destination; advanced inside the asm ("+D")
   * pixels    - source; advanced inside the asm ("+S")
   * line_size - byte distance between consecutive rows (used for both buffers)
   * h         - row count; decremented by 4 per pass and tested for exactly
   *             zero, so it is expected to be a positive multiple of 4
   *
   * The bottom row of one pair is kept in a register and reused as the top
   * row of the next pair, so each source row is loaded only once.
   * PAVGBP and MOVQ_BFE are defined outside this view; PAVGBP presumably
   * averages (a, b) into c and (d, e) into f - TODO confirm.
   * NOTE(review): uses 32-bit addl/%eax on the pointers (32-bit x86 only as
   * written); MMX registers are not listed as clobbers.
   */
  100. static void DEF(put, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  101. {
  102. MOVQ_BFE(mm6); // presumably loads the constant PAVGBP expects in mm6 - confirm against the macro
  103. __asm __volatile(
  104. "lea (%3, %3), %%eax \n\t" // eax = 2 * line_size
  105. "movq (%1), %%mm0 \n\t" // preload row 0 before entering the loop
  106. ".balign 8 \n\t"
  107. "1: \n\t"
  108. "movq (%1, %3), %%mm1 \n\t" // row 1
  109. "movq (%1, %%eax),%%mm2 \n\t" // row 2
  110. PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) // mm4 from rows 1/0, mm5 from rows 2/1
  111. "movq %%mm4, (%2) \n\t"
  112. "movq %%mm5, (%2, %3) \n\t"
  113. "addl %%eax, %1 \n\t" // advance both pointers by two rows
  114. "addl %%eax, %2 \n\t"
  115. "movq (%1, %3), %%mm1 \n\t" // row 3
  116. "movq (%1, %%eax),%%mm0 \n\t" // row 4, left in mm0 for the next pass
  117. PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) // mm4 from rows 3/2, mm5 from rows 4/3
  118. "movq %%mm4, (%2) \n\t"
  119. "movq %%mm5, (%2, %3) \n\t"
  120. "addl %%eax, %1 \n\t"
  121. "addl %%eax, %2 \n\t"
  122. "subl $4, %0 \n\t" // four rows were produced in this pass
  123. "jnz 1b \n\t"
  124. :"+g"(h), "+S"(pixels), "+D"(block)
  125. :"r"(line_size)
  126. :"eax", "memory");
  127. }
  /*
   * DEF(put, pixels8_xy2): fills an 8-byte-wide destination block where every
   * output byte is (a + b + c + d + rnd) >> 2, with a/b the source bytes at
   * offsets 0 and 1 of one row, c/d the same offsets of the row below, and
   * rnd the per-word constant placed in mm6 by SET_RND.
   * Nine bytes are read from each of source rows 0..h (h + 1 rows).
   *
   * block     - destination; not advanced, rows are addressed through the
   *             running offset in eax
   * pixels    - source; moved forward by one row before the loop ("+S")
   * line_size - byte distance between consecutive rows (used for both buffers)
   * h         - row count; decremented by 2 per pass and tested for exactly
   *             zero, so it is expected to be a positive even number
   *
   * Bytes are widened to 16-bit words by interleaving with mm7, which
   * MOVQ_ZERO presumably clears (macro defined outside this view - confirm).
   * The horizontal sum of each source row is computed once and reused for the
   * output row above and the output row below it.
   * NOTE(review): uses 32-bit addl/%eax on the pointers (32-bit x86 only as
   * written); MMX registers are not listed as clobbers.
   */
  128. static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  129. {
  130. MOVQ_ZERO(mm7);
  131. SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
  132. __asm __volatile(
  133. "movq (%1), %%mm0 \n\t" // row 0, bytes 0..7
  134. "movq 1(%1), %%mm4 \n\t" // row 0, bytes 1..8
  135. "movq %%mm0, %%mm1 \n\t"
  136. "movq %%mm4, %%mm5 \n\t"
  137. "punpcklbw %%mm7, %%mm0 \n\t" // low four bytes -> words
  138. "punpcklbw %%mm7, %%mm4 \n\t"
  139. "punpckhbw %%mm7, %%mm1 \n\t" // high four bytes -> words
  140. "punpckhbw %%mm7, %%mm5 \n\t"
  141. "paddusw %%mm0, %%mm4 \n\t" // mm4 = row 0 horizontal sums, low half
  142. "paddusw %%mm1, %%mm5 \n\t" // mm5 = row 0 horizontal sums, high half
  143. "xorl %%eax, %%eax \n\t" // eax = running byte offset of the current row
  144. "addl %3, %1 \n\t" // source pointer now refers to row 1
  145. ".balign 8 \n\t"
  146. "1: \n\t"
  147. "movq (%1, %%eax), %%mm0 \n\t" // next source row, bytes 0..7
  148. "movq 1(%1, %%eax), %%mm2 \n\t" // next source row, bytes 1..8
  149. "movq %%mm0, %%mm1 \n\t"
  150. "movq %%mm2, %%mm3 \n\t"
  151. "punpcklbw %%mm7, %%mm0 \n\t"
  152. "punpcklbw %%mm7, %%mm2 \n\t"
  153. "punpckhbw %%mm7, %%mm1 \n\t"
  154. "punpckhbw %%mm7, %%mm3 \n\t"
  155. "paddusw %%mm2, %%mm0 \n\t" // mm0/mm1 = horizontal sums of this row
  156. "paddusw %%mm3, %%mm1 \n\t"
  157. "paddusw %%mm6, %%mm4 \n\t" // add the rounding constant to the previous row's sums
  158. "paddusw %%mm6, %%mm5 \n\t"
  159. "paddusw %%mm0, %%mm4 \n\t" // previous row + this row
  160. "paddusw %%mm1, %%mm5 \n\t"
  161. "psrlw $2, %%mm4 \n\t" // divide the four-sample sum by 4
  162. "psrlw $2, %%mm5 \n\t"
  163. "packuswb %%mm5, %%mm4 \n\t" // back to eight bytes
  164. "movq %%mm4, (%2, %%eax) \n\t"
  165. "addl %3, %%eax \n\t" // step to the next row
  166. "movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
  167. "movq 1(%1, %%eax), %%mm4 \n\t"
  168. "movq %%mm2, %%mm3 \n\t"
  169. "movq %%mm4, %%mm5 \n\t"
  170. "punpcklbw %%mm7, %%mm2 \n\t"
  171. "punpcklbw %%mm7, %%mm4 \n\t"
  172. "punpckhbw %%mm7, %%mm3 \n\t"
  173. "punpckhbw %%mm7, %%mm5 \n\t"
  174. "paddusw %%mm2, %%mm4 \n\t" // mm4/mm5 = horizontal sums of this row, kept for the next pass
  175. "paddusw %%mm3, %%mm5 \n\t"
  176. "paddusw %%mm6, %%mm0 \n\t" // add the rounding constant to the previous row's sums
  177. "paddusw %%mm6, %%mm1 \n\t"
  178. "paddusw %%mm4, %%mm0 \n\t"
  179. "paddusw %%mm5, %%mm1 \n\t"
  180. "psrlw $2, %%mm0 \n\t"
  181. "psrlw $2, %%mm1 \n\t"
  182. "packuswb %%mm1, %%mm0 \n\t"
  183. "movq %%mm0, (%2, %%eax) \n\t"
  184. "addl %3, %%eax \n\t"
  185. "subl $2, %0 \n\t" // two rows were produced in this pass
  186. "jnz 1b \n\t"
  187. :"+g"(h), "+S"(pixels)
  188. :"D"(block), "r"(line_size)
  189. :"eax", "memory");
  190. }
  191. // avg_pixels
  192. // in case more speed is needed - unrolling would certainly help
  193. static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  194. {
  195. MOVQ_BFE(mm6);
  196. JUMPALIGN();
  197. do {
  198. __asm __volatile(
  199. "movq %0, %%mm0 \n\t"
  200. "movq %1, %%mm1 \n\t"
  201. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  202. "movq %%mm2, %0 \n\t"
  203. :"+m"(*block)
  204. :"m"(*pixels)
  205. :"memory");
  206. pixels += line_size;
  207. block += line_size;
  208. }
  209. while (--h);
  210. }
  211. static void DEF(avg, pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  212. {
  213. MOVQ_BFE(mm6);
  214. JUMPALIGN();
  215. do {
  216. __asm __volatile(
  217. "movq %0, %%mm0 \n\t"
  218. "movq %1, %%mm1 \n\t"
  219. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  220. "movq %%mm2, %0 \n\t"
  221. "movq 8%0, %%mm0 \n\t"
  222. "movq 8%1, %%mm1 \n\t"
  223. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  224. "movq %%mm2, 8%0 \n\t"
  225. :"+m"(*block)
  226. :"m"(*pixels)
  227. :"memory");
  228. pixels += line_size;
  229. block += line_size;
  230. }
  231. while (--h);
  232. }
  233. static void DEF(avg, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  234. {
  235. MOVQ_BFE(mm6);
  236. JUMPALIGN();
  237. do {
  238. __asm __volatile(
  239. "movq %1, %%mm0 \n\t"
  240. "movq 1%1, %%mm1 \n\t"
  241. "movq %0, %%mm3 \n\t"
  242. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  243. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  244. "movq %%mm0, %0 \n\t"
  245. :"+m"(*block)
  246. :"m"(*pixels)
  247. :"memory");
  248. pixels += line_size;
  249. block += line_size;
  250. } while (--h);
  251. }
  252. static void DEF(avg, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  253. {
  254. MOVQ_BFE(mm6);
  255. JUMPALIGN();
  256. do {
  257. __asm __volatile(
  258. "movq %1, %%mm0 \n\t"
  259. "movq 1%1, %%mm1 \n\t"
  260. "movq %0, %%mm3 \n\t"
  261. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  262. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  263. "movq %%mm0, %0 \n\t"
  264. "movq 8%1, %%mm0 \n\t"
  265. "movq 9%1, %%mm1 \n\t"
  266. "movq 8%0, %%mm3 \n\t"
  267. PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
  268. PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
  269. "movq %%mm0, 8%0 \n\t"
  270. :"+m"(*block)
  271. :"m"(*pixels)
  272. :"memory");
  273. pixels += line_size;
  274. block += line_size;
  275. } while (--h);
  276. }
  /*
   * DEF(avg, pixels8_y2): like the put y2 routine, each intermediate row is
   * built from a source row and the row directly below it; that intermediate
   * row is then combined (PAVGB) with the 8 bytes already in block before
   * being stored. Source rows 0..h are read (h + 1 rows in total).
   *
   * block     - destination, read and rewritten; advanced inside the asm ("+D")
   * pixels    - source; advanced inside the asm ("+S")
   * line_size - byte distance between consecutive rows (used for both buffers)
   * h         - row count; decremented by 4 per pass and tested for exactly
   *             zero, so it is expected to be a positive multiple of 4
   *
   * PAVGBP, PAVGB and MOVQ_BFE are defined outside this view. Presumably
   * PAVGBP averages (a, b) into c and (d, e) into f, and PAVGB(a, b, r, fe)
   * averages a and b into r - TODO confirm.
   * NOTE(review): uses 32-bit addl/%eax on the pointers (32-bit x86 only as
   * written); MMX registers are not listed as clobbers.
   */
  277. static void DEF(avg, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  278. {
  279. MOVQ_BFE(mm6); // presumably loads the constant the averaging macros expect in mm6 - confirm
  280. __asm __volatile(
  281. "lea (%3, %3), %%eax \n\t" // eax = 2 * line_size
  282. "movq (%1), %%mm0 \n\t" // preload row 0 before entering the loop
  283. ".balign 8 \n\t"
  284. "1: \n\t"
  285. "movq (%1, %3), %%mm1 \n\t" // row 1
  286. "movq (%1, %%eax), %%mm2 \n\t" // row 2, still needed in the second half
  287. PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) // mm4 from rows 1/0, mm5 from rows 2/1
  288. "movq (%2), %%mm3 \n\t" // existing destination row 0
  289. PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
  290. "movq (%2, %3), %%mm3 \n\t" // existing destination row 1
  291. PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
  292. "movq %%mm0, (%2) \n\t"
  293. "movq %%mm1, (%2, %3) \n\t"
  294. "addl %%eax, %1 \n\t" // advance both pointers by two rows
  295. "addl %%eax, %2 \n\t"
  296. "movq (%1, %3), %%mm1 \n\t" // row 3
  297. "movq (%1, %%eax), %%mm0 \n\t" // row 4, left in mm0 for the next pass
  298. PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) // mm4 from rows 3/2, mm5 from rows 4/3
  299. "movq (%2), %%mm3 \n\t"
  300. PAVGB(%%mm3, %%mm4, %%mm2, %%mm6) // result goes to mm2 so that mm0 (row 4) survives
  301. "movq (%2, %3), %%mm3 \n\t"
  302. PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
  303. "movq %%mm2, (%2) \n\t"
  304. "movq %%mm1, (%2, %3) \n\t"
  305. "addl %%eax, %1 \n\t"
  306. "addl %%eax, %2 \n\t"
  307. "subl $4, %0 \n\t" // four rows were produced in this pass
  308. "jnz 1b \n\t"
  309. :"+g"(h), "+S"(pixels), "+D"(block)
  310. :"r"(line_size)
  311. :"eax", "memory");
  312. }
  313. // this routine is 'slightly' suboptimal but mostly unused
  /*
   * DEF(avg, pixels8_xy2): computes the same four-sample value as the put xy2
   * routine, (a + b + c + d + rnd) >> 2 over a 2x2 source neighbourhood, and
   * then combines it (PAVGB) with the 8 bytes already in block before storing.
   * Nine bytes are read from each of source rows 0..h (h + 1 rows).
   *
   * block     - destination, read and rewritten; not advanced, rows are
   *             addressed through the running offset in eax
   * pixels    - source; moved forward by one row before the loop ("+S")
   * line_size - byte distance between consecutive rows (used for both buffers)
   * h         - row count; decremented by 2 per pass and tested for exactly
   *             zero, so it is expected to be a positive even number
   *
   * mm6 holds the rounding constant from SET_RND, so the constant handed to
   * PAVGB is rebuilt in mm2 before each use: pcmpeqd sets all bits and paddb
   * doubles every byte, leaving 0xFE in each byte.
   * MOVQ_ZERO, SET_RND and PAVGB are defined outside this view; mm7 is
   * presumably zero so that the unpack instructions widen bytes to words -
   * TODO confirm.
   * NOTE(review): uses 32-bit addl/%eax on the pointers (32-bit x86 only as
   * written); MMX registers are not listed as clobbers.
   */
  314. static void DEF(avg, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  315. {
  316. MOVQ_ZERO(mm7);
  317. SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
  318. __asm __volatile(
  319. "movq (%1), %%mm0 \n\t" // row 0, bytes 0..7
  320. "movq 1(%1), %%mm4 \n\t" // row 0, bytes 1..8
  321. "movq %%mm0, %%mm1 \n\t"
  322. "movq %%mm4, %%mm5 \n\t"
  323. "punpcklbw %%mm7, %%mm0 \n\t" // low four bytes -> words
  324. "punpcklbw %%mm7, %%mm4 \n\t"
  325. "punpckhbw %%mm7, %%mm1 \n\t" // high four bytes -> words
  326. "punpckhbw %%mm7, %%mm5 \n\t"
  327. "paddusw %%mm0, %%mm4 \n\t" // mm4 = row 0 horizontal sums, low half
  328. "paddusw %%mm1, %%mm5 \n\t" // mm5 = row 0 horizontal sums, high half
  329. "xorl %%eax, %%eax \n\t" // eax = running byte offset of the current row
  330. "addl %3, %1 \n\t" // source pointer now refers to row 1
  331. ".balign 8 \n\t"
  332. "1: \n\t"
  333. "movq (%1, %%eax), %%mm0 \n\t" // next source row, bytes 0..7
  334. "movq 1(%1, %%eax), %%mm2 \n\t" // next source row, bytes 1..8
  335. "movq %%mm0, %%mm1 \n\t"
  336. "movq %%mm2, %%mm3 \n\t"
  337. "punpcklbw %%mm7, %%mm0 \n\t"
  338. "punpcklbw %%mm7, %%mm2 \n\t"
  339. "punpckhbw %%mm7, %%mm1 \n\t"
  340. "punpckhbw %%mm7, %%mm3 \n\t"
  341. "paddusw %%mm2, %%mm0 \n\t" // mm0/mm1 = horizontal sums of this row
  342. "paddusw %%mm3, %%mm1 \n\t"
  343. "paddusw %%mm6, %%mm4 \n\t" // add the rounding constant to the previous row's sums
  344. "paddusw %%mm6, %%mm5 \n\t"
  345. "paddusw %%mm0, %%mm4 \n\t" // previous row + this row
  346. "paddusw %%mm1, %%mm5 \n\t"
  347. "psrlw $2, %%mm4 \n\t" // divide the four-sample sum by 4
  348. "psrlw $2, %%mm5 \n\t"
  349. "movq (%2, %%eax), %%mm3 \n\t" // existing destination row
  350. "packuswb %%mm5, %%mm4 \n\t" // back to eight bytes
  351. "pcmpeqd %%mm2, %%mm2 \n\t" // mm2 = all bits set
  352. "paddb %%mm2, %%mm2 \n\t" // mm2 = 0xFE in every byte
  353. PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
  354. "movq %%mm5, (%2, %%eax) \n\t"
  355. "addl %3, %%eax \n\t" // step to the next row
  356. "movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
  357. "movq 1(%1, %%eax), %%mm4 \n\t"
  358. "movq %%mm2, %%mm3 \n\t"
  359. "movq %%mm4, %%mm5 \n\t"
  360. "punpcklbw %%mm7, %%mm2 \n\t"
  361. "punpcklbw %%mm7, %%mm4 \n\t"
  362. "punpckhbw %%mm7, %%mm3 \n\t"
  363. "punpckhbw %%mm7, %%mm5 \n\t"
  364. "paddusw %%mm2, %%mm4 \n\t" // mm4/mm5 = horizontal sums of this row, kept for the next pass
  365. "paddusw %%mm3, %%mm5 \n\t"
  366. "paddusw %%mm6, %%mm0 \n\t" // add the rounding constant to the previous row's sums
  367. "paddusw %%mm6, %%mm1 \n\t"
  368. "paddusw %%mm4, %%mm0 \n\t"
  369. "paddusw %%mm5, %%mm1 \n\t"
  370. "psrlw $2, %%mm0 \n\t"
  371. "psrlw $2, %%mm1 \n\t"
  372. "movq (%2, %%eax), %%mm3 \n\t" // existing destination row
  373. "packuswb %%mm1, %%mm0 \n\t"
  374. "pcmpeqd %%mm2, %%mm2 \n\t" // rebuild the 0xFE byte constant in mm2
  375. "paddb %%mm2, %%mm2 \n\t"
  376. PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
  377. "movq %%mm1, (%2, %%eax) \n\t"
  378. "addl %3, %%eax \n\t"
  379. "subl $2, %0 \n\t" // two rows were produced in this pass
  380. "jnz 1b \n\t"
  381. :"+g"(h), "+S"(pixels)
  382. :"D"(block), "r"(line_size)
  383. :"eax", "memory");
  384. }
  385. //FIXME optimize
  386. static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  387. DEF(put, pixels8_y2)(block , pixels , line_size, h);
  388. DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
  389. }
  390. static void DEF(put, pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  391. DEF(put, pixels8_xy2)(block , pixels , line_size, h);
  392. DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
  393. }
  394. static void DEF(avg, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  395. DEF(avg, pixels8_y2)(block , pixels , line_size, h);
  396. DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
  397. }
  398. static void DEF(avg, pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
  399. DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
  400. DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
  401. }