/*
 * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
 * Loren Merritt
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/**
 * MMX optimized version of (put|avg)_h264_chroma_mc8.
 * H264_CHROMA_MC8_TMPL must be defined to the desired function name.
 * H264_CHROMA_OP must be defined to empty for put and to pavgb/pavgusb for avg.
 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function.
 * An instantiation sketch is given at the end of this file.
 */
static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    DECLARE_ALIGNED_8(uint64_t, AA);
    DECLARE_ALIGNED_8(uint64_t, DD);
    int i;

    if(y==0 && x==0) {
        /* no filter needed */
        H264_CHROMA_MC8_MV0(dst, src, stride, h);
        return;
    }

    assert(x<8 && y<8 && x>=0 && y>=0);

    if(y==0)
    {
        /* horizontal filter only */
        asm volatile("movd %0, %%mm5\n\t"
                     "punpcklwd %%mm5, %%mm5\n\t"
                     "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
                     "movq %1, %%mm4\n\t"
                     "pxor %%mm7, %%mm7\n\t"
                     "psubw %%mm5, %%mm4\n\t"     /* mm4 = A = 8-x */
                     : : "rm" (x), "m" (ff_pw_8));

        for(i=0; i<h; i++) {
            asm volatile(
                /* mm0 = src[0..7], mm1 = src[1..8] */
                "movq %0, %%mm0\n\t"
                "movq %1, %%mm1\n\t"
                : : "m" (src[0]), "m" (src[1]));

            asm volatile(
                /* [mm2,mm3] = A * src[0..7] */
                "movq %%mm0, %%mm2\n\t"
                "punpcklbw %%mm7, %%mm2\n\t"
                "pmullw %%mm4, %%mm2\n\t"
                "movq %%mm0, %%mm3\n\t"
                "punpckhbw %%mm7, %%mm3\n\t"
                "pmullw %%mm4, %%mm3\n\t"

                /* [mm2,mm3] += B * src[1..8] */
                "movq %%mm1, %%mm0\n\t"
                "punpcklbw %%mm7, %%mm0\n\t"
                "pmullw %%mm5, %%mm0\n\t"
                "punpckhbw %%mm7, %%mm1\n\t"
                "pmullw %%mm5, %%mm1\n\t"
                "paddw %%mm0, %%mm2\n\t"
                "paddw %%mm1, %%mm3\n\t"
                /* dst[0..7] = pack(([mm2,mm3] + 4) >> 3); A+B = 8, so 3 bits of rounding */
                "paddw %1, %%mm2\n\t"
                "paddw %1, %%mm3\n\t"
                "psrlw $3, %%mm2\n\t"
                "psrlw $3, %%mm3\n\t"
                "packuswb %%mm3, %%mm2\n\t"
                H264_CHROMA_OP(%0, %%mm2)
                "movq %%mm2, %0\n\t"
                : "=m" (dst[0]) : "m" (ff_pw_4));

            src += stride;
            dst += stride;
        }
        return;
    }
    if(x==0)
    {
        /* vertical filter only */
        asm volatile("movd %0, %%mm6\n\t"
                     "punpcklwd %%mm6, %%mm6\n\t"
                     "punpckldq %%mm6, %%mm6\n\t" /* mm6 = C = y */
                     "movq %1, %%mm4\n\t"
                     "pxor %%mm7, %%mm7\n\t"
                     "psubw %%mm6, %%mm4\n\t"     /* mm4 = A = 8-y */
                     : : "rm" (y), "m" (ff_pw_8));

        asm volatile(
            /* mm0 = src[0..7] */
            "movq %0, %%mm0\n\t"
            : : "m" (src[0]));

        for(i=0; i<h; i++) {
            asm volatile(
                /* [mm2,mm3] = A * src[0..7] */
                "movq %mm0, %mm2\n\t"
                "punpcklbw %mm7, %mm2\n\t"
                "pmullw %mm4, %mm2\n\t"
                "movq %mm0, %mm3\n\t"
                "punpckhbw %mm7, %mm3\n\t"
                "pmullw %mm4, %mm3\n\t");

            src += stride;
            asm volatile(
                /* mm0 = src[0..7] */
                "movq %0, %%mm0\n\t"
                : : "m" (src[0]));

            asm volatile(
                /* [mm2,mm3] += C * src[0..7] */
                "movq %mm0, %mm1\n\t"
                "punpcklbw %mm7, %mm1\n\t"
                "pmullw %mm6, %mm1\n\t"
                "paddw %mm1, %mm2\n\t"
                "movq %mm0, %mm5\n\t"
                "punpckhbw %mm7, %mm5\n\t"
                "pmullw %mm6, %mm5\n\t"
                "paddw %mm5, %mm3\n\t");

            asm volatile(
                /* dst[0..7] = pack(([mm2,mm3] + 4) >> 3); A+C = 8, so 3 bits of rounding */
                "paddw %1, %%mm2\n\t"
                "paddw %1, %%mm3\n\t"
                "psrlw $3, %%mm2\n\t"
                "psrlw $3, %%mm3\n\t"
                "packuswb %%mm3, %%mm2\n\t"
                H264_CHROMA_OP(%0, %%mm2)
                "movq %%mm2, %0\n\t"
                : "=m" (dst[0]) : "m" (ff_pw_4));

            dst += stride;
        }
        return;
    }
    /* general case, bilinear */
    asm volatile("movd %2, %%mm4\n\t"
                 "movd %3, %%mm6\n\t"
                 "punpcklwd %%mm4, %%mm4\n\t"
                 "punpcklwd %%mm6, %%mm6\n\t"
                 "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
                 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
                 "movq %%mm4, %%mm5\n\t"
                 "pmullw %%mm6, %%mm4\n\t"    /* mm4 = x * y */
                 "psllw $3, %%mm5\n\t"
                 "psllw $3, %%mm6\n\t"
                 "movq %%mm5, %%mm7\n\t"
                 "paddw %%mm6, %%mm7\n\t"
                 "movq %%mm4, %1\n\t"         /* DD = x * y */
                 "psubw %%mm4, %%mm5\n\t"     /* mm5 = B = 8x - xy */
                 "psubw %%mm4, %%mm6\n\t"     /* mm6 = C = 8y - xy */
                 "paddw %4, %%mm4\n\t"
                 "psubw %%mm7, %%mm4\n\t"     /* mm4 = A = xy - (8x+8y) + 64 */
                 "pxor %%mm7, %%mm7\n\t"
                 "movq %%mm4, %0\n\t"
                 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));

    asm volatile(
        /* mm0 = src[0..7], mm1 = src[1..8] */
        "movq %0, %%mm0\n\t"
        "movq %1, %%mm1\n\t"
        : : "m" (src[0]), "m" (src[1]));

    for(i=0; i<h; i++) {
        asm volatile(
            /* [mm2,mm3] = A * src[0..7] */
            "movq %%mm0, %%mm2\n\t"
            "punpcklbw %%mm7, %%mm2\n\t"
            "pmullw %0, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "pmullw %0, %%mm3\n\t"

            /* [mm2,mm3] += B * src[1..8] */
            "movq %%mm1, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "pmullw %%mm5, %%mm0\n\t"
            "punpckhbw %%mm7, %%mm1\n\t"
            "pmullw %%mm5, %%mm1\n\t"
            "paddw %%mm0, %%mm2\n\t"
            "paddw %%mm1, %%mm3\n\t"
            : : "m" (AA));

        src += stride;
        asm volatile(
            /* mm0 = src[0..7], mm1 = src[1..8] */
            "movq %0, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            : : "m" (src[0]), "m" (src[1]));

        asm volatile(
            /* [mm2,mm3] += C * src[0..7] */
            "movq %mm0, %mm4\n\t"
            "punpcklbw %mm7, %mm4\n\t"
            "pmullw %mm6, %mm4\n\t"
            "paddw %mm4, %mm2\n\t"
            "movq %mm0, %mm4\n\t"
            "punpckhbw %mm7, %mm4\n\t"
            "pmullw %mm6, %mm4\n\t"
            "paddw %mm4, %mm3\n\t");

        asm volatile(
            /* [mm2,mm3] += D * src[1..8] */
            "movq %%mm1, %%mm4\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "pmullw %0, %%mm4\n\t"
            "paddw %%mm4, %%mm2\n\t"
            "movq %%mm1, %%mm4\n\t"
            "punpckhbw %%mm7, %%mm4\n\t"
            "pmullw %0, %%mm4\n\t"
            "paddw %%mm4, %%mm3\n\t"
            : : "m" (DD));

        asm volatile(
            /* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */
            "paddw %1, %%mm2\n\t"
            "paddw %1, %%mm3\n\t"
            "psrlw $6, %%mm2\n\t"
            "psrlw $6, %%mm3\n\t"
            "packuswb %%mm3, %%mm2\n\t"
            H264_CHROMA_OP(%0, %%mm2)
            "movq %%mm2, %0\n\t"
            : "=m" (dst[0]) : "m" (ff_pw_32));

        dst += stride;
    }
}
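
/* Reference for the bilinear weight setup used above and in
 * H264_CHROMA_MC4_TMPL below.  For fractional chroma offsets x,y in [0,8),
 * the interpolation weights are
 *     A = (8-x)*(8-y),  B = x*(8-y),  C = (8-x)*y,  D = x*y,
 * with A+B+C+D = 64, and each output pixel is
 *     dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6.
 * Expanding the products gives exactly what the register setup computes:
 *     B = 8x - xy,  C = 8y - xy,  D = xy,  A = xy - (8x + 8y) + 64.
 * A plain-C sketch of one 8-wide row, for illustration only (not part of the
 * template; the avg variants additionally average the result with dst):
 *
 *     for(j=0; j<8; j++)
 *         dst[j] = (A*src[j] + B*src[j+1]
 *                 + C*src[j+stride] + D*src[j+stride+1] + 32) >> 6;
 */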
static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    DECLARE_ALIGNED_8(uint64_t, AA);
    DECLARE_ALIGNED_8(uint64_t, DD);
    int i;

    /* no special case for mv=(0,0) in 4x*, since it's much less common than in 8x*.
     * could still save a few cycles, but maybe not worth the complexity. */

    assert(x<8 && y<8 && x>=0 && y>=0);

    asm volatile("movd %2, %%mm4\n\t"
                 "movd %3, %%mm6\n\t"
                 "punpcklwd %%mm4, %%mm4\n\t"
                 "punpcklwd %%mm6, %%mm6\n\t"
                 "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
                 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
                 "movq %%mm4, %%mm5\n\t"
                 "pmullw %%mm6, %%mm4\n\t"    /* mm4 = x * y */
                 "psllw $3, %%mm5\n\t"
                 "psllw $3, %%mm6\n\t"
                 "movq %%mm5, %%mm7\n\t"
                 "paddw %%mm6, %%mm7\n\t"
                 "movq %%mm4, %1\n\t"         /* DD = x * y */
                 "psubw %%mm4, %%mm5\n\t"     /* mm5 = B = 8x - xy */
                 "psubw %%mm4, %%mm6\n\t"     /* mm6 = C = 8y - xy */
                 "paddw %4, %%mm4\n\t"
                 "psubw %%mm7, %%mm4\n\t"     /* mm4 = A = xy - (8x+8y) + 64 */
                 "pxor %%mm7, %%mm7\n\t"
                 "movq %%mm4, %0\n\t"
                 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));

    asm volatile(
        /* mm0 = src[0..3], mm1 = src[1..4] */
        "movd %0, %%mm0\n\t"
        "movd %1, %%mm1\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        : : "m" (src[0]), "m" (src[1]));

    for(i=0; i<h; i++) {
        asm volatile(
            /* mm2 = A * src[0..3] + B * src[1..4] */
            "movq %%mm0, %%mm2\n\t"
            "pmullw %0, %%mm2\n\t"
            "pmullw %%mm5, %%mm1\n\t"
            "paddw %%mm1, %%mm2\n\t"
            : : "m" (AA));

        src += stride;
        asm volatile(
            /* mm0 = src[0..3], mm1 = src[1..4] */
            "movd %0, %%mm0\n\t"
            "movd %1, %%mm1\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            : : "m" (src[0]), "m" (src[1]));

        asm volatile(
            /* mm2 += C * src[0..3] + D * src[1..4] */
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "pmullw %%mm6, %%mm3\n\t"
            "pmullw %0, %%mm4\n\t"
            "paddw %%mm3, %%mm2\n\t"
            "paddw %%mm4, %%mm2\n\t"
            : : "m" (DD));

        asm volatile(
            /* dst[0..3] = pack((mm2 + 32) >> 6) */
            "paddw %1, %%mm2\n\t"
            "psrlw $6, %%mm2\n\t"
            "packuswb %%mm7, %%mm2\n\t"
            H264_CHROMA_OP4(%0, %%mm2, %%mm3)
            "movd %%mm2, %0\n\t"
            : "=m" (dst[0]) : "m" (ff_pw_32));

        dst += stride;
    }
}

#ifdef H264_CHROMA_MC2_TMPL
static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    int CD=((1<<16)-1)*x*y + 8*y;
    int AB=((8<<16)-8)*x + 64 - CD;
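    /* AB and CD pack the bilinear weights A = (8-x)*(8-y), B = x*(8-y),
     * C = (8-x)*y, D = x*y into 16-bit pairs for pmaddwd:
     *     CD = (D<<16) | C = ((x*y)<<16) + (8*y - x*y) = ((1<<16)-1)*x*y + 8*y
     *     AB = (B<<16) | A = ((8*x - x*y)<<16) + (64 - 8*x - 8*y + x*y)
     *        = ((8<<16)-8)*x + 64 - CD
     * After punpckldq below, mm5 = {A,B,A,B} and mm6 = {C,D,C,D} as words, so
     * each pmaddwd yields A*src[n] + B*src[n+1] (or the C,D pair) per dword. */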
    int i;

    asm volatile(
        /* mm5 = {A,B,A,B} */
        /* mm6 = {C,D,C,D} */
        "movd %0, %%mm5\n\t"
        "movd %1, %%mm6\n\t"
        "punpckldq %%mm5, %%mm5\n\t"
        "punpckldq %%mm6, %%mm6\n\t"
        "pxor %%mm7, %%mm7\n\t"
        :: "r"(AB), "r"(CD));

    asm volatile(
        /* mm0 = src[0,1,1,2] */
        "movd %0, %%mm0\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "pshufw $0x94, %%mm0, %%mm0\n\t"
        :: "m"(src[0]));

    for(i=0; i<h; i++) {
        asm volatile(
            /* mm1 = A * src[0,1] + B * src[1,2] */
            "movq %%mm0, %%mm1\n\t"
            "pmaddwd %%mm5, %%mm1\n\t"
            ::);

        src += stride;
        asm volatile(
            /* mm0 = src[0,1,1,2] */
            "movd %0, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "pshufw $0x94, %%mm0, %%mm0\n\t"
            :: "m"(src[0]));

        asm volatile(
            /* mm1 += C * src[0,1] + D * src[1,2] */
            "movq %%mm0, %%mm2\n\t"
            "pmaddwd %%mm6, %%mm2\n\t"
            "paddw %%mm2, %%mm1\n\t"
            ::);

        asm volatile(
            /* dst[0,1] = pack((mm1 + 32) >> 6) */
            "paddw %1, %%mm1\n\t"
            "psrlw $6, %%mm1\n\t"
            "packssdw %%mm7, %%mm1\n\t"
            "packuswb %%mm7, %%mm1\n\t"
            /* writes garbage to the right of dst.
             * ok because partitions are processed from left to right. */
            H264_CHROMA_OP4(%0, %%mm1, %%mm3)
            "movd %%mm1, %0\n\t"
            : "=m" (dst[0]) : "m" (ff_pw_32) : "eax");

        dst += stride;
    }
}
#endif
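
/* Instantiation sketch (illustrative only; the real wrappers live in the
 * per-architecture dsputil code and their exact names may differ).  A "put"
 * MMX variant would be generated roughly like this, with the chroma ops
 * expanding to nothing; an "avg" variant would instead define them to emit
 * a pavgb (MMX2) or pavgusb (3DNow!) against the destination:
 *
 *     #define H264_CHROMA_OP(S,D)
 *     #define H264_CHROMA_OP4(S,D,T)
 *     #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
 *     #define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
 *     #define H264_CHROMA_MC8_MV0  put_pixels8_mmx
 *     #include "dsputil_h264_template_mmx.c"   // i.e. this template file
 *     #undef H264_CHROMA_OP
 *     #undef H264_CHROMA_OP4
 *     #undef H264_CHROMA_MC8_TMPL
 *     #undef H264_CHROMA_MC4_TMPL
 *     #undef H264_CHROMA_MC8_MV0
 */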