/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/***********************************/
/* IDCT */
/* in/out: mma=mma+mmb, mmb=mmb-mma */
#define SUMSUB_BA( a, b ) \
"paddw "#b", "#a" \n\t"\
"paddw "#b", "#b" \n\t"\
"psubw "#a", "#b" \n\t"
#define SUMSUB_BADC( a, b, c, d ) \
"paddw "#b", "#a" \n\t"\
"paddw "#d", "#c" \n\t"\
"paddw "#b", "#b" \n\t"\
"paddw "#d", "#d" \n\t"\
"psubw "#a", "#b" \n\t"\
"psubw "#c", "#d" \n\t"
#define SUMSUBD2_AB( a, b, t ) \
"movq "#b", "#t" \n\t"\
"psraw $1 , "#b" \n\t"\
"paddw "#a", "#b" \n\t"\
"psraw $1 , "#a" \n\t"\
"psubw "#t", "#a" \n\t"
#define IDCT4_1D( s02, s13, d02, d13, t ) \
SUMSUB_BA ( s02, d02 )\
SUMSUBD2_AB( s13, d13, t )\
SUMSUB_BADC( d13, s02, s13, d02 )
#define SBUTTERFLY(a,b,t,n)\
"movq " #a ", " #t " \n\t" /* abcd */\
"punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
"punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\

#define TRANSPOSE4(a,b,c,d,t)\
SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */
#define STORE_DIFF_4P( p, t, z ) \
"psraw $6, "#p" \n\t"\
"movd (%0), "#t" \n\t"\
"punpcklbw "#z", "#t" \n\t"\
"paddsw "#t", "#p" \n\t"\
"packuswb "#z", "#p" \n\t"\
"movd "#p", (%0) \n\t"
static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
{
/* Load dct coeffs */
asm volatile(
"movq (%0), %%mm0 \n\t"
"movq 8(%0), %%mm1 \n\t"
"movq 16(%0), %%mm2 \n\t"
"movq 24(%0), %%mm3 \n\t"
:: "r"(block) );
asm volatile(
/* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */
IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
"movq %0, %%mm6 \n\t"
/* in: 1,4,0,2 out: 1,2,3,0 */
TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
"paddw %%mm6, %%mm3 \n\t"
/* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */
IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
"pxor %%mm7, %%mm7 \n\t"
:: "m"(ff_pw_32));
asm volatile(
STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
"add %1, %0 \n\t"
STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
"add %1, %0 \n\t"
STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
"add %1, %0 \n\t"
STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
: "+r"(dst)
: "r" ((long)stride)
);
}
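/* Reference only: a scalar sketch of what ff_h264_idct_add_mmx() computes,
 * following the usual H.264 4x4 integer inverse transform.  The function and
 * helper names here are illustrative and the block is not compiled. */
#if 0
static inline uint8_t clip_uint8_sketch(int v){
    return v < 0 ? 0 : v > 255 ? 255 : v;
}
static void h264_idct_add_c_sketch(uint8_t *dst, int16_t *block, int stride)
{
    int i, tmp[16];
    block[0] += 32;                      /* rounding term, added once as in the MMX path */
    for(i=0; i<4; i++){                  /* 1st pass (rows) */
        const int z0 =  block[i*4+0]     +  block[i*4+2];
        const int z1 =  block[i*4+0]     -  block[i*4+2];
        const int z2 = (block[i*4+1]>>1) -  block[i*4+3];
        const int z3 =  block[i*4+1]     + (block[i*4+3]>>1);
        tmp[i*4+0] = z0 + z3;
        tmp[i*4+1] = z1 + z2;
        tmp[i*4+2] = z1 - z2;
        tmp[i*4+3] = z0 - z3;
    }
    for(i=0; i<4; i++){                  /* 2nd pass (columns) + add to prediction */
        const int z0 =  tmp[0*4+i]     +  tmp[2*4+i];
        const int z1 =  tmp[0*4+i]     -  tmp[2*4+i];
        const int z2 = (tmp[1*4+i]>>1) -  tmp[3*4+i];
        const int z3 =  tmp[1*4+i]     + (tmp[3*4+i]>>1);
        dst[0*stride+i] = clip_uint8_sketch(dst[0*stride+i] + ((z0 + z3) >> 6));
        dst[1*stride+i] = clip_uint8_sketch(dst[1*stride+i] + ((z1 + z2) >> 6));
        dst[2*stride+i] = clip_uint8_sketch(dst[2*stride+i] + ((z1 - z2) >> 6));
        dst[3*stride+i] = clip_uint8_sketch(dst[3*stride+i] + ((z0 - z3) >> 6));
    }
}
#endif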
static inline void h264_idct8_1d(int16_t *block)
{
asm volatile(
"movq 112(%0), %%mm7 \n\t"
"movq 80(%0), %%mm5 \n\t"
"movq 48(%0), %%mm3 \n\t"
"movq 16(%0), %%mm1 \n\t"
"movq %%mm7, %%mm4 \n\t"
"movq %%mm3, %%mm6 \n\t"
"movq %%mm5, %%mm0 \n\t"
"movq %%mm7, %%mm2 \n\t"
"psraw $1, %%mm4 \n\t"
"psraw $1, %%mm6 \n\t"
"psubw %%mm7, %%mm0 \n\t"
"psubw %%mm6, %%mm2 \n\t"
"psubw %%mm4, %%mm0 \n\t"
"psubw %%mm3, %%mm2 \n\t"
"psubw %%mm3, %%mm0 \n\t"
"paddw %%mm1, %%mm2 \n\t"
"movq %%mm5, %%mm4 \n\t"
"movq %%mm1, %%mm6 \n\t"
"psraw $1, %%mm4 \n\t"
"psraw $1, %%mm6 \n\t"
"paddw %%mm5, %%mm4 \n\t"
"paddw %%mm1, %%mm6 \n\t"
"paddw %%mm7, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"psubw %%mm1, %%mm4 \n\t"
"paddw %%mm3, %%mm6 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm4, %%mm3 \n\t"
"movq %%mm2, %%mm5 \n\t"
"movq %%mm6, %%mm7 \n\t"
"psraw $2, %%mm6 \n\t"
"psraw $2, %%mm3 \n\t"
"psraw $2, %%mm5 \n\t"
"psraw $2, %%mm0 \n\t"
"paddw %%mm6, %%mm1 \n\t"
"paddw %%mm2, %%mm3 \n\t"
"psubw %%mm4, %%mm5 \n\t"
"psubw %%mm0, %%mm7 \n\t"
"movq 32(%0), %%mm2 \n\t"
"movq 96(%0), %%mm6 \n\t"
"movq %%mm2, %%mm4 \n\t"
"movq %%mm6, %%mm0 \n\t"
"psraw $1, %%mm4 \n\t"
"psraw $1, %%mm6 \n\t"
"psubw %%mm0, %%mm4 \n\t"
"paddw %%mm2, %%mm6 \n\t"
"movq (%0), %%mm2 \n\t"
"movq 64(%0), %%mm0 \n\t"
SUMSUB_BA( %%mm0, %%mm2 )
SUMSUB_BA( %%mm6, %%mm0 )
SUMSUB_BA( %%mm4, %%mm2 )
SUMSUB_BA( %%mm7, %%mm6 )
SUMSUB_BA( %%mm5, %%mm4 )
SUMSUB_BA( %%mm3, %%mm2 )
SUMSUB_BA( %%mm1, %%mm0 )
:: "r"(block)
);
}
static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
{
int i;
int16_t __attribute__ ((aligned(8))) b2[64];
block[0] += 32;
for(i=0; i<2; i++){
uint64_t tmp;
h264_idct8_1d(block+4*i);
asm volatile(
"movq %%mm7, %0 \n\t"
TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
"movq %%mm0, 8(%1) \n\t"
"movq %%mm6, 24(%1) \n\t"
"movq %%mm7, 40(%1) \n\t"
"movq %%mm4, 56(%1) \n\t"
"movq %0, %%mm7 \n\t"
TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
"movq %%mm7, (%1) \n\t"
"movq %%mm1, 16(%1) \n\t"
"movq %%mm0, 32(%1) \n\t"
"movq %%mm3, 48(%1) \n\t"
: "=m"(tmp)
: "r"(b2+32*i)
: "memory"
);
}
for(i=0; i<2; i++){
h264_idct8_1d(b2+4*i);
asm volatile(
"psraw $6, %%mm7 \n\t"
"psraw $6, %%mm6 \n\t"
"psraw $6, %%mm5 \n\t"
"psraw $6, %%mm4 \n\t"
"psraw $6, %%mm3 \n\t"
"psraw $6, %%mm2 \n\t"
"psraw $6, %%mm1 \n\t"
"psraw $6, %%mm0 \n\t"
"movq %%mm7, (%0) \n\t"
"movq %%mm5, 16(%0) \n\t"
"movq %%mm3, 32(%0) \n\t"
"movq %%mm1, 48(%0) \n\t"
"movq %%mm0, 64(%0) \n\t"
"movq %%mm2, 80(%0) \n\t"
"movq %%mm4, 96(%0) \n\t"
"movq %%mm6, 112(%0) \n\t"
:: "r"(b2+4*i)
: "memory"
);
}
add_pixels_clamped_mmx(b2, dst, stride);
}
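/* The two *_dc_add functions below add one rounded DC value to every pixel
 * of a 4x4 (resp. 8x8) block.  MMX has no signed saturating byte add, so the
 * value is split into a non-negative part (packed into mm0) and the negated
 * part (packed into mm1): paddusb with max(dc,0) followed by psubusb with
 * max(-dc,0) behaves like a signed add clamped to [0,255]. */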
static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
{
int dc = (block[0] + 32) >> 6;
asm volatile(
"movd %0, %%mm0 \n\t"
"pshufw $0, %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"psubw %%mm0, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
::"r"(dc)
);
asm volatile(
"movd %0, %%mm2 \n\t"
"movd %1, %%mm3 \n\t"
"movd %2, %%mm4 \n\t"
"movd %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movd %%mm2, %0 \n\t"
"movd %%mm3, %1 \n\t"
"movd %%mm4, %2 \n\t"
"movd %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dst+0*stride)),
"+m"(*(uint32_t*)(dst+1*stride)),
"+m"(*(uint32_t*)(dst+2*stride)),
"+m"(*(uint32_t*)(dst+3*stride))
);
}
static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
{
int dc = (block[0] + 32) >> 6;
int y;
asm volatile(
"movd %0, %%mm0 \n\t"
"pshufw $0, %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"psubw %%mm0, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
::"r"(dc)
);
for(y=2; y--; dst += 4*stride){
asm volatile(
"movq %0, %%mm2 \n\t"
"movq %1, %%mm3 \n\t"
"movq %2, %%mm4 \n\t"
"movq %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movq %%mm2, %0 \n\t"
"movq %%mm3, %1 \n\t"
"movq %%mm4, %2 \n\t"
"movq %%mm5, %3 \n\t"
:"+m"(*(uint64_t*)(dst+0*stride)),
"+m"(*(uint64_t*)(dst+1*stride)),
"+m"(*(uint64_t*)(dst+2*stride)),
"+m"(*(uint64_t*)(dst+3*stride))
);
}
}
/***********************************/
/* deblocking */
// out: o = |x-y|>a
// clobbers: t
#define DIFF_GT_MMX(x,y,a,o,t)\
"movq "#y", "#t" \n\t"\
"movq "#x", "#o" \n\t"\
"psubusb "#x", "#t" \n\t"\
"psubusb "#y", "#o" \n\t"\
"por "#t", "#o" \n\t"\
"psubusb "#a", "#o" \n\t"
// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1
// out: mm5=beta-1, mm7=mask
// clobbers: mm4,mm6
#define H264_DEBLOCK_MASK(alpha1, beta1) \
"pshufw $0, "#alpha1", %%mm4 \n\t"\
"pshufw $0, "#beta1 ", %%mm5 \n\t"\
"packuswb %%mm4, %%mm4 \n\t"\
"packuswb %%mm5, %%mm5 \n\t"\
DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
"por %%mm4, %%mm7 \n\t"\
DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
"por %%mm4, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"pcmpeqb %%mm6, %%mm7 \n\t"
// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
// out: mm1=p0' mm2=q0'
// clobbers: mm0,3-6
#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
/* a = q0^p0^((p1-q1)>>2) */\
"movq %%mm0, %%mm4 \n\t"\
"psubb %%mm3, %%mm4 \n\t"\
"psrlw $2, %%mm4 \n\t"\
"pxor %%mm1, %%mm4 \n\t"\
"pxor %%mm2, %%mm4 \n\t"\
/* b = p0^(q1>>2) */\
"psrlw $2, %%mm3 \n\t"\
"pand "#pb_3f", %%mm3 \n\t"\
"movq %%mm1, %%mm5 \n\t"\
"pxor %%mm3, %%mm5 \n\t"\
/* c = q0^(p1>>2) */\
"psrlw $2, %%mm0 \n\t"\
"pand "#pb_3f", %%mm0 \n\t"\
"movq %%mm2, %%mm6 \n\t"\
"pxor %%mm0, %%mm6 \n\t"\
/* d = (c^b) & ~(b^a) & 1 */\
"pxor %%mm5, %%mm6 \n\t"\
"pxor %%mm4, %%mm5 \n\t"\
"pandn %%mm6, %%mm5 \n\t"\
"pand "#pb_01", %%mm5 \n\t"\
/* delta = (avg(q0, p1>>2) + (d&a))
 *       - (avg(p0, q1>>2) + (d&~a)) */\
"pavgb %%mm2, %%mm0 \n\t"\
"pand %%mm5, %%mm4 \n\t"\
"paddusb %%mm4, %%mm0 \n\t"\
"pavgb %%mm1, %%mm3 \n\t"\
"pxor %%mm5, %%mm4 \n\t"\
"paddusb %%mm4, %%mm3 \n\t"\
/* p0 += clip(delta, -tc0, tc0)
 * q0 -= clip(delta, -tc0, tc0) */\
"movq %%mm0, %%mm4 \n\t"\
"psubusb %%mm3, %%mm0 \n\t"\
"psubusb %%mm4, %%mm3 \n\t"\
"pminub %%mm7, %%mm0 \n\t"\
"pminub %%mm7, %%mm3 \n\t"\
"paddusb %%mm0, %%mm1 \n\t"\
"paddusb %%mm3, %%mm2 \n\t"\
"psubusb %%mm3, %%mm1 \n\t"\
"psubusb %%mm0, %%mm2 \n\t"
// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=mm_bone
// out: (q1addr) = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
// clobbers: q2, tmp, tc0
#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
"movq %%mm1, "#tmp" \n\t"\
"pavgb %%mm2, "#tmp" \n\t"\
"pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\
"pxor "q2addr", "#tmp" \n\t"\
"pand %8, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\
"psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
"movq "#p1", "#tmp" \n\t"\
"psubusb "#tc0", "#tmp" \n\t"\
"paddusb "#p1", "#tc0" \n\t"\
"pmaxub "#tmp", "#q2" \n\t"\
"pminub "#tc0", "#q2" \n\t"\
"movq "#q2", "q1addr" \n\t"
static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
{
uint64_t tmp0;
uint64_t tc = (uint8_t)tc0[1]*0x01010000 | (uint8_t)tc0[0]*0x0101;
// with luma, tc0=0 doesn't mean no filtering, so we need a separate input mask
uint32_t mask[2] = { (tc0[0]>=0)*0xffffffff, (tc0[1]>=0)*0xffffffff };
asm volatile(
"movq (%1,%3), %%mm0 \n\t" //p1
"movq (%1,%3,2), %%mm1 \n\t" //p0
"movq (%2), %%mm2 \n\t" //q0
"movq (%2,%3), %%mm3 \n\t" //q1
H264_DEBLOCK_MASK(%6, %7)
"pand %5, %%mm7 \n\t"
"movq %%mm7, %0 \n\t"
/* filter p1 */
"movq (%1), %%mm3 \n\t" //p2
DIFF_GT_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
"pandn %%mm7, %%mm6 \n\t"
"pcmpeqb %%mm7, %%mm6 \n\t"
"pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta
"pshufw $80, %4, %%mm4 \n\t"
"pand %%mm7, %%mm4 \n\t" // mask & tc0
"movq %8, %%mm7 \n\t"
"pand %%mm6, %%mm7 \n\t" // mask & |p2-p0|<beta & 1
"pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0
"paddb %%mm4, %%mm7 \n\t" // tc++
H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4)
/* filter q1 */
"movq (%2,%3,2), %%mm4 \n\t" //q2
DIFF_GT_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
"pandn %0, %%mm6 \n\t"
"pcmpeqb %0, %%mm6 \n\t"
"pand %0, %%mm6 \n\t"
"pshufw $80, %4, %%mm5 \n\t"
"pand %%mm6, %%mm5 \n\t"
"pand %8, %%mm6 \n\t"
"paddb %%mm6, %%mm7 \n\t"
"movq (%2,%3), %%mm3 \n\t"
H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)
/* filter p0, q0 */
H264_DEBLOCK_P0_Q0(%8, %9)
"movq %%mm1, (%1,%3,2) \n\t"
"movq %%mm2, (%2) \n\t"
: "=m"(tmp0)
: "r"(pix-3*stride), "r"(pix), "r"((long)stride),
"m"(tc), "m"(*(uint64_t*)mask), "m"(alpha1), "m"(beta1),
"m"(mm_bone), "m"(ff_pb_3F)
);
}
static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
if((tc0[0] & tc0[1]) >= 0)
h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
if((tc0[2] & tc0[3]) >= 0)
h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
}
static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
//FIXME: could cut some load/stores by merging transpose with filter
// also, it only needs to transpose 6x8
uint8_t trans[8*8];
int i;
for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
if((tc0[0] & tc0[1]) < 0)
continue;
transpose4x4(trans, pix-4, 8, stride);
transpose4x4(trans +4*8, pix, 8, stride);
transpose4x4(trans+4, pix-4+4*stride, 8, stride);
transpose4x4(trans+4+4*8, pix +4*stride, 8, stride);
h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
transpose4x4(pix-2, trans +2*8, stride, 8);
transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
}
}
static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
{
asm volatile(
"movq (%0), %%mm0 \n\t" //p1
"movq (%0,%2), %%mm1 \n\t" //p0
"movq (%1), %%mm2 \n\t" //q0
"movq (%1,%2), %%mm3 \n\t" //q1
H264_DEBLOCK_MASK(%4, %5)
"movd %3, %%mm6 \n\t"
"punpcklbw %%mm6, %%mm6 \n\t"
"pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask
H264_DEBLOCK_P0_Q0(%6, %7)
"movq %%mm1, (%0,%2) \n\t"
"movq %%mm2, (%1) \n\t"
:: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
"r"(*(uint32_t*)tc0),
"m"(alpha1), "m"(beta1), "m"(mm_bone), "m"(ff_pb_3F)
);
}
static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
}
static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
//FIXME: could cut some load/stores by merging transpose with filter
uint8_t trans[8*4];
transpose4x4(trans, pix-2, 8, stride);
transpose4x4(trans+4, pix-2+4*stride, 8, stride);
h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
transpose4x4(pix-2, trans, stride, 8);
transpose4x4(pix-2+4*stride, trans+4, stride, 8);
}
// p0 = (p0 + q1 + 2*p1 + 2) >> 2
#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
"movq "#p0", %%mm4 \n\t"\
"pxor "#q1", %%mm4 \n\t"\
"pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\
"pavgb "#q1", "#p0" \n\t"\
"psubusb %%mm4, "#p0" \n\t"\
"pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\

static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
{
asm volatile(
"movq (%0), %%mm0 \n\t"
"movq (%0,%2), %%mm1 \n\t"
"movq (%1), %%mm2 \n\t"
"movq (%1,%2), %%mm3 \n\t"
H264_DEBLOCK_MASK(%3, %4)
"movq %%mm1, %%mm5 \n\t"
"movq %%mm2, %%mm6 \n\t"
H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
"psubb %%mm5, %%mm1 \n\t"
"psubb %%mm6, %%mm2 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pand %%mm7, %%mm2 \n\t"
"paddb %%mm5, %%mm1 \n\t"
"paddb %%mm6, %%mm2 \n\t"
"movq %%mm1, (%0,%2) \n\t"
"movq %%mm2, (%1) \n\t"
:: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
"m"(alpha1), "m"(beta1), "m"(mm_bone)
);
}
static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
{
h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
}
static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
{
//FIXME: could cut some load/stores by merging transpose with filter
uint8_t trans[8*4];
transpose4x4(trans, pix-2, 8, stride);
transpose4x4(trans+4, pix-2+4*stride, 8, stride);
h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
transpose4x4(pix-2, trans, stride, 8);
transpose4x4(pix-2+4*stride, trans+4, stride, 8);
}
/***********************************/
/* motion compensation */
#define QPEL_H264V(A,B,C,D,E,F,OP)\
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"paddw "#D", %%mm6 \n\t"\
"psllw $2, %%mm6 \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psubw "#E", %%mm6 \n\t"\
"pmullw %4, %%mm6 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"paddw %5, "#A" \n\t"\
"paddw "#F", "#A" \n\t"\
"paddw "#A", %%mm6 \n\t"\
"psraw $5, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d)\
"add %3, %1 \n\t"
#define QPEL_H264HV(A,B,C,D,E,F,OF)\
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"paddw "#D", %%mm6 \n\t"\
"psllw $2, %%mm6 \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psubw "#E", %%mm6 \n\t"\
"pmullw %3, %%mm6 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"paddw "#F", "#A" \n\t"\
"paddw "#A", %%mm6 \n\t"\
"movq %%mm6, "#OF"(%1) \n\t"
#define QPEL_H264(OPNAME, OP, MMX)\
static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=4;\
\
asm volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq %5, %%mm4 \n\t"\
"movq %6, %%mm5 \n\t"\
"1: \n\t"\
"movd -1(%0), %%mm1 \n\t"\
"movd (%0), %%mm2 \n\t"\
"movd 1(%0), %%mm3 \n\t"\
"movd 2(%0), %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"paddw %%mm0, %%mm1 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"movd -2(%0), %%mm0 \n\t"\
"movd 3(%0), %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"paddw %%mm3, %%mm0 \n\t"\
"psllw $2, %%mm2 \n\t"\
"psubw %%mm1, %%mm2 \n\t"\
"pmullw %%mm4, %%mm2 \n\t"\
"paddw %%mm5, %%mm0 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"psraw $5, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm6, d)\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+m"(h)\
: "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
static void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=4;\
asm volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq %0, %%mm4 \n\t"\
"movq %1, %%mm5 \n\t"\
:: "m"(ff_pw_5), "m"(ff_pw_16)\
);\
do{\
asm volatile(\
"movd -1(%0), %%mm1 \n\t"\
"movd (%0), %%mm2 \n\t"\
"movd 1(%0), %%mm3 \n\t"\
"movd 2(%0), %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"paddw %%mm0, %%mm1 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"movd -2(%0), %%mm0 \n\t"\
"movd 3(%0), %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"paddw %%mm3, %%mm0 \n\t"\
"psllw $2, %%mm2 \n\t"\
"psubw %%mm1, %%mm2 \n\t"\
"pmullw %%mm4, %%mm2 \n\t"\
"paddw %%mm5, %%mm0 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"movd (%2), %%mm3 \n\t"\
"psraw $5, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
PAVGB" %%mm3, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm6, d)\
"add %4, %0 \n\t"\
"add %4, %1 \n\t"\
"add %3, %2 \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2)\
: "D"((long)src2Stride), "S"((long)dstStride)\
: "memory"\
);\
}while(--h);\
}\
static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
src -= 2*srcStride;\
asm volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
int h=4;\
int w=3;\
src -= 2*srcStride+2;\
while(w--){\
asm volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
\
: "+a"(src)\
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
: "memory"\
);\
tmp += 4;\
src += 4 - 9*srcStride;\
}\
tmp -= 3*4;\
asm volatile(\
"movq %4, %%mm6 \n\t"\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"paddw 10(%0), %%mm0 \n\t"\
"movq 2(%0), %%mm1 \n\t"\
"paddw 8(%0), %%mm1 \n\t"\
"movq 4(%0), %%mm2 \n\t"\
"paddw 6(%0), %%mm2 \n\t"\
"psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\
"psraw $2, %%mm0 \n\t"/*(a-b)/4 */\
"psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\
"paddsw %%mm2, %%mm0 \n\t"\
"psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\
"paddw %%mm6, %%mm2 \n\t"\
"paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 +32 */\
"psraw $6, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, d)\
"add $24, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+m"(h)\
: "S"((long)dstStride), "m"(ff_pw_32)\
: "memory"\
);\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=8;\
asm volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq %5, %%mm6 \n\t"\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"psllw $2, %%mm0 \n\t"\
"psllw $2, %%mm1 \n\t"\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
"movd -2(%0), %%mm2 \n\t"\
"movd 7(%0), %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"movq %6, %%mm5 \n\t"\
"paddw %%mm5, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm4, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q)\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+m"(h)\
: "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=8;\
asm volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq %0, %%mm6 \n\t"\
:: "m"(ff_pw_5)\
);\
do{\
asm volatile(\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"psllw $2, %%mm0 \n\t"\
"psllw $2, %%mm1 \n\t"\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
"movd -2(%0), %%mm2 \n\t"\
"movd 7(%0), %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"movq %5, %%mm5 \n\t"\
"paddw %%mm5, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm4, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"movq (%2), %%mm4 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
PAVGB" %%mm4, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q)\
"add %4, %0 \n\t"\
"add %4, %1 \n\t"\
"add %3, %2 \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2)\
: "D"((long)src2Stride), "S"((long)dstStride),\
"m"(ff_pw_16)\
: "memory"\
);\
}while(--h);\
}\
\
static inline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
int w= 2;\
src -= 2*srcStride;\
\
while(w--){\
asm volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
if(h==16){\
asm volatile(\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
src += 4-(h+5)*srcStride;\
dst += 4-h*dstStride;\
}\
}\
static inline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
int h = size;\
int w = (size+8)>>2;\
src -= 2*srcStride+2;\
while(w--){\
asm volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
: "+a"(src)\
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
: "memory"\
);\
if(size==16){\
asm volatile(\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
: "+a"(src)\
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
: "memory"\
);\
}\
tmp += 4;\
src += 4 - (size+5)*srcStride;\
}\
tmp -= size+8;\
w = size>>4;\
do{\
h = size;\
asm volatile(\
"movq %4, %%mm6 \n\t"\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 8(%0), %%mm3 \n\t"\
"movq 2(%0), %%mm1 \n\t"\
"movq 10(%0), %%mm4 \n\t"\
"paddw %%mm4, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"paddw 18(%0), %%mm3 \n\t"\
"paddw 16(%0), %%mm4 \n\t"\
"movq 4(%0), %%mm2 \n\t"\
"movq 12(%0), %%mm5 \n\t"\
"paddw 6(%0), %%mm2 \n\t"\
"paddw 14(%0), %%mm5 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"psubw %%mm4, %%mm3 \n\t"\
"psraw $2, %%mm0 \n\t"\
"psraw $2, %%mm3 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"psubw %%mm4, %%mm3 \n\t"\
"paddsw %%mm2, %%mm0 \n\t"\
"paddsw %%mm5, %%mm3 \n\t"\
"psraw $2, %%mm0 \n\t"\
"psraw $2, %%mm3 \n\t"\
"paddw %%mm6, %%mm2 \n\t"\
"paddw %%mm6, %%mm5 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm5, %%mm3 \n\t"\
"psraw $6, %%mm0 \n\t"\
"psraw $6, %%mm3 \n\t"\
"packuswb %%mm3, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, q)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+m"(h)\
: "S"((long)dstStride), "m"(ff_pw_32)\
: "memory"\
);\
tmp += 8 - size*24;\
dst += 8 - size*dstStride;\
}while(w--);\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
src += 8*dstStride;\
dst += 8*dstStride;\
src2 += 8*src2Stride;\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
}\
\
static void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
asm volatile(\
"movq %5, %%mm6 \n\t"\
"movq (%1), %%mm0 \n\t"\
"movq 24(%1), %%mm1 \n\t"\
"paddw %%mm6, %%mm0 \n\t"\
"paddw %%mm6, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
"packuswb %%mm1, %%mm1 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm1 \n\t"\
OP(%%mm0, (%2), %%mm4, d)\
OP(%%mm1, (%2,%4), %%mm5, d)\
"lea (%0,%3,2), %0 \n\t"\
"lea (%2,%4,2), %2 \n\t"\
"movq 48(%1), %%mm0 \n\t"\
"movq 72(%1), %%mm1 \n\t"\
"paddw %%mm6, %%mm0 \n\t"\
"paddw %%mm6, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
"packuswb %%mm1, %%mm1 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm1 \n\t"\
OP(%%mm0, (%2), %%mm4, d)\
OP(%%mm1, (%2,%4), %%mm5, d)\
:"+a"(src8), "+c"(src16), "+d"(dst)\
:"S"((long)src8Stride), "D"((long)dstStride), "m"(ff_pw_16)\
:"memory");\
}\
static void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
asm volatile(\
"movq %0, %%mm6 \n\t"\
::"m"(ff_pw_16)\
);\
while(h--){\
asm volatile(\
"movq (%1), %%mm0 \n\t"\
"movq 8(%1), %%mm1 \n\t"\
"paddw %%mm6, %%mm0 \n\t"\
"paddw %%mm6, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
OP(%%mm0, (%2), %%mm5, q)\
::"a"(src8), "c"(src16), "d"(dst)\
:"memory");\
src8 += src8Stride;\
src16 += 24;\
dst += dstStride;\
}\
}\
static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\

#define H264_MC(OPNAME, SIZE, MMX) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels ## SIZE ## _mmx(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
uint64_t temp[SIZE*SIZE/8];\
uint8_t * const half= (uint8_t*)temp;\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
uint64_t temp[SIZE*SIZE/8];\
uint8_t * const half= (uint8_t*)temp;\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
uint64_t temp[SIZE*SIZE/8];\
uint8_t * const halfV= (uint8_t*)temp;\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
uint64_t temp[SIZE*SIZE/8];\
uint8_t * const halfV= (uint8_t*)temp;\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
uint64_t temp[SIZE*SIZE/8];\
uint8_t * const halfV= (uint8_t*)temp;\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
uint64_t temp[SIZE*SIZE/8];\
uint8_t * const halfV= (uint8_t*)temp;\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
uint64_t temp[SIZE*(SIZE<8?12:24)/4];\
int16_t * const tmp= (int16_t*)temp;\
OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
uint8_t * const halfHV= (uint8_t*)temp;\
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE/2;\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
uint8_t * const halfHV= (uint8_t*)temp;\
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE/2;\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
int16_t * const halfV= ((int16_t*)temp) + SIZE*SIZE/2;\
uint8_t * const halfHV= ((uint8_t*)temp);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
int16_t * const halfV= ((int16_t*)temp) + SIZE*SIZE/2;\
uint8_t * const halfHV= ((uint8_t*)temp);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\

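/* H264_MC() generates one function per quarter-pel position: in _mcXY_ the
 * digits X and Y are the horizontal and vertical quarter-pel offsets, so
 * _mc00_ is a plain copy, _mc20_/_mc02_ are the horizontal/vertical half-pel
 * filters, _mc22_ is the 2-D filter, and the remaining positions average two
 * of these intermediates (or the source itself). */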
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#define PAVGB "pavgusb"
QPEL_H264(put_, PUT_OP, 3dnow)
QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
#undef PAVGB
#define PAVGB "pavgb"
QPEL_H264(put_, PUT_OP, mmx2)
QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
#undef PAVGB
H264_MC(put_, 4, 3dnow)
H264_MC(put_, 8, 3dnow)
H264_MC(put_, 16,3dnow)
H264_MC(avg_, 4, 3dnow)
H264_MC(avg_, 8, 3dnow)
H264_MC(avg_, 16,3dnow)
H264_MC(put_, 4, mmx2)
H264_MC(put_, 8, mmx2)
H264_MC(put_, 16,mmx2)
H264_MC(avg_, 4, mmx2)
H264_MC(avg_, 8, mmx2)
H264_MC(avg_, 16,mmx2)
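/* The chroma MC functions come from a shared template: each block below
 * defines the averaging primitives and the output function names, then
 * includes dsputil_h264_template_mmx.c to instantiate the put (MMX),
 * avg (MMX2) and avg (3DNow!) variants. */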
#define H264_CHROMA_OP(S,D)
#define H264_CHROMA_OP4(S,D,T)
#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
#define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
#include "dsputil_h264_template_mmx.c"
#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC2_TMPL
#undef H264_CHROMA_MC8_MV0
#define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
"pavgb " #T ", " #D " \n\t"
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2
#define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
#include "dsputil_h264_template_mmx.c"
#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC2_TMPL
#undef H264_CHROMA_MC8_MV0
#define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
"pavgusb " #T ", " #D " \n\t"
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_3dnow
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_3dnow
#define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
#include "dsputil_h264_template_mmx.c"
#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0
/***********************************/
/* weighted prediction */
static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
{
int x, y;
offset <<= log2_denom;
offset += (1 << log2_denom) >> 1;
asm volatile(
"movd %0, %%mm4 \n\t"
"movd %1, %%mm5 \n\t"
"movd %2, %%mm6 \n\t"
"pshufw $0, %%mm4, %%mm4 \n\t"
"pshufw $0, %%mm5, %%mm5 \n\t"
"pxor %%mm7, %%mm7 \n\t"
:: "g"(weight), "g"(offset), "g"(log2_denom)
);
for(y=0; y<h; y+=2){
for(x=0; x<w; x+=4){
asm volatile(
"movd %0, %%mm0 \n\t"
"movd %1, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"pmullw %%mm4, %%mm0 \n\t"
"pmullw %%mm4, %%mm1 \n\t"
"paddsw %%mm5, %%mm0 \n\t"
"paddsw %%mm5, %%mm1 \n\t"
"psraw %%mm6, %%mm0 \n\t"
"psraw %%mm6, %%mm1 \n\t"
"packuswb %%mm7, %%mm0 \n\t"
"packuswb %%mm7, %%mm1 \n\t"
"movd %%mm0, %0 \n\t"
"movd %%mm1, %1 \n\t"
: "+m"(*(uint32_t*)(dst+x)),
"+m"(*(uint32_t*)(dst+x+stride))
);
}
dst += 2*stride;
}
}
static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
{
int x, y;
offset = ((offset + 1) | 1) << log2_denom;
asm volatile(
"movd %0, %%mm3 \n\t"
"movd %1, %%mm4 \n\t"
"movd %2, %%mm5 \n\t"
"movd %3, %%mm6 \n\t"
"pshufw $0, %%mm3, %%mm3 \n\t"
"pshufw $0, %%mm4, %%mm4 \n\t"
"pshufw $0, %%mm5, %%mm5 \n\t"
"pxor %%mm7, %%mm7 \n\t"
:: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
);
for(y=0; y<h; y++){
for(x=0; x<w; x+=4){
asm volatile(
"movd %0, %%mm0 \n\t"
"movd %1, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"pmullw %%mm3, %%mm0 \n\t"
"pmullw %%mm4, %%mm1 \n\t"
"paddsw %%mm1, %%mm0 \n\t"
"paddsw %%mm5, %%mm0 \n\t"
"psraw %%mm6, %%mm0 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"movd %%mm0, %0 \n\t"
: "+m"(*(uint32_t*)(dst+x))
: "m"(*(uint32_t*)(src+x))
);
}
src += stride;
dst += stride;
}
}
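/* For reference, scalar sketches of the two weighting kernels above
 * (illustrative only, not compiled; they mirror what the MMX loops do): */
#if 0
static void h264_weight_sketch(uint8_t *dst, int stride, int log2_denom,
                               int weight, int offset, int w, int h)
{
    int x, y;
    offset = (offset << log2_denom) + ((1 << log2_denom) >> 1);
    for(y=0; y<h; y++, dst += stride)
        for(x=0; x<w; x++){
            int v = (dst[x] * weight + offset) >> log2_denom;
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
}
static void h264_biweight_sketch(uint8_t *dst, uint8_t *src, int stride,
                                 int log2_denom, int weightd, int weights,
                                 int offset, int w, int h)
{
    int x, y;
    offset = ((offset + 1) | 1) << log2_denom;
    for(y=0; y<h; y++, dst += stride, src += stride)
        for(x=0; x<w; x++){
            int v = (dst[x] * weightd + src[x] * weights + offset) >> (log2_denom + 1);
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
}
#endif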
#define H264_WEIGHT(W,H) \
static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
} \
static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
}
H264_WEIGHT(16,16)
H264_WEIGHT(16, 8)
H264_WEIGHT( 8,16)
H264_WEIGHT( 8, 8)
H264_WEIGHT( 8, 4)
H264_WEIGHT( 4, 8)
H264_WEIGHT( 4, 4)
H264_WEIGHT( 4, 2)