/*
 * MMX and SSE2 optimized snow DSP utils
 * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../avcodec.h"
#include "../snow.h"
#include "x86_cpu.h"

void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width){
    const int w2= (width+1)>>1;
    // The aligned movdqa loads/stores below require temp to sit on a 16-byte boundary.
    DWTELEM temp_buf[(width>>1) + 4];
    DWTELEM * const temp = temp_buf + 4 - (((long)temp_buf & 0xF) >> 2);
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;

    { // Lift 0
        DWTELEM * const ref = b + w2 - 1;
        DWTELEM b_0 = b[0]; // By allowing the first entry of b[] to be calculated twice
        // (the first time erroneously), the SSE2 loop can run one extra pass.
        // The savings in code and time are well worth having to store this value and
        // recalculate b[0] correctly afterwards.
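        // Scalar equivalent of the vectorized loop below (the shifts and adds
        // hard-code the Snow lifting constants, i.e. W_DM=3, W_DO=4, W_DS=3):
        //     b[i] -= (3*(ref[i] + ref[i+1]) + 4) >> 3;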
        i = 0;
        asm volatile(
            "pcmpeqd %%xmm7, %%xmm7 \n\t"
            "pslld $31, %%xmm7 \n\t"
            "psrld $29, %%xmm7 \n\t"
        ::);
        for(; i<w_l-7; i+=8){
            asm volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 4(%1), %%xmm2 \n\t"
                "movdqu 20(%1), %%xmm6 \n\t"
                "paddd %%xmm1, %%xmm2 \n\t"
                "paddd %%xmm5, %%xmm6 \n\t"
                "movdqa %%xmm2, %%xmm0 \n\t"
                "movdqa %%xmm6, %%xmm4 \n\t"
                "paddd %%xmm2, %%xmm2 \n\t"
                "paddd %%xmm6, %%xmm6 \n\t"
                "paddd %%xmm0, %%xmm2 \n\t"
                "paddd %%xmm4, %%xmm6 \n\t"
                "paddd %%xmm7, %%xmm2 \n\t"
                "paddd %%xmm7, %%xmm6 \n\t"
                "psrad $3, %%xmm2 \n\t"
                "psrad $3, %%xmm6 \n\t"
                "movdqa (%0), %%xmm0 \n\t"
                "movdqa 16(%0), %%xmm4 \n\t"
                "psubd %%xmm2, %%xmm0 \n\t"
                "psubd %%xmm6, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
    }
    { // Lift 1
        DWTELEM * const dst = b+w2;
        i = 0;
        for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }
        for(; i<w_r-7; i+=8){
            asm volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 4(%1), %%xmm2 \n\t"
                "movdqu 20(%1), %%xmm6 \n\t"
                "paddd %%xmm1, %%xmm2 \n\t"
                "paddd %%xmm5, %%xmm6 \n\t"
                "movdqa (%0), %%xmm0 \n\t"
                "movdqa 16(%0), %%xmm4 \n\t"
                "psubd %%xmm2, %%xmm0 \n\t"
                "psubd %%xmm6, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }
    { // Lift 2
        DWTELEM * const ref = b+w2 - 1;
        DWTELEM b_0 = b[0];
        i = 0;
        asm volatile(
            "pslld $1, %%xmm7 \n\t" /* xmm7 already holds a '4' from 2 lifts ago. */
        ::);
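        // Scalar equivalent of the loop below (xmm7 now holds the rounding
        // constant 8; the constants correspond to W_BO and W_BS from snow.h):
        //     b[i] -= (8 - (ref[i] + ref[i+1]) - 4*b[i]) >> 4;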
        for(; i<w_l-7; i+=8){
            asm volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 4(%1), %%xmm0 \n\t"
                "movdqu 20(%1), %%xmm4 \n\t"
                "paddd %%xmm1, %%xmm0 \n\t"
                "paddd %%xmm5, %%xmm4 \n\t"
                "movdqa %%xmm7, %%xmm1 \n\t"
                "movdqa %%xmm7, %%xmm5 \n\t"
                "psubd %%xmm0, %%xmm1 \n\t"
                "psubd %%xmm4, %%xmm5 \n\t"
                "movdqa (%0), %%xmm0 \n\t"
                "movdqa 16(%0), %%xmm4 \n\t"
                "pslld $2, %%xmm0 \n\t"
                "pslld $2, %%xmm4 \n\t"
                "psubd %%xmm0, %%xmm1 \n\t"
                "psubd %%xmm4, %%xmm5 \n\t"
                "psrad $4, %%xmm1 \n\t"
                "psrad $4, %%xmm5 \n\t"
                "movdqa (%0), %%xmm0 \n\t"
                "movdqa 16(%0), %%xmm4 \n\t"
                "psubd %%xmm1, %%xmm0 \n\t"
                "psubd %%xmm5, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
        b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS);
    }
    { // Lift 3
        DWTELEM * const src = b+w2;
        i = 0;
        for(; (((long)&temp[i]) & 0xF) && i<w_r; i++){
            temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
        }
        for(; i<w_r-7; i+=8){
            asm volatile(
                "movdqu 4(%1), %%xmm2 \n\t"
                "movdqu 20(%1), %%xmm6 \n\t"
                "paddd (%1), %%xmm2 \n\t"
                "paddd 16(%1), %%xmm6 \n\t"
                "movdqa %%xmm2, %%xmm0 \n\t"
                "movdqa %%xmm6, %%xmm4 \n\t"
                "pslld $2, %%xmm2 \n\t"
                "pslld $2, %%xmm6 \n\t"
                "psubd %%xmm2, %%xmm0 \n\t"
                "psubd %%xmm6, %%xmm4 \n\t"
                "psrad $1, %%xmm0 \n\t"
                "psrad $1, %%xmm4 \n\t"
                "movdqu (%0), %%xmm2 \n\t"
                "movdqu 16(%0), %%xmm6 \n\t"
                "psubd %%xmm0, %%xmm2 \n\t"
                "psubd %%xmm4, %%xmm6 \n\t"
                "movdqa %%xmm2, (%2) \n\t"
                "movdqa %%xmm6, 16(%2) \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
    }
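    // Lift 3 wrote its results to temp. The block below interleaves the
    // low-pass half (b[0..w2-1]) with the high-pass half (temp) back into b,
    // two coefficients per scalar iteration and 32 per SSE2 iteration.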
    {
        snow_interleave_line_header(&i, width, b, temp);
        for (; (i & 0x1E) != 0x1E; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=30; i>=0; i-=32){
            asm volatile(
                "movdqa (%1), %%xmm0 \n\t"
                "movdqa 16(%1), %%xmm2 \n\t"
                "movdqa 32(%1), %%xmm4 \n\t"
                "movdqa 48(%1), %%xmm6 \n\t"
                "movdqa (%1), %%xmm1 \n\t"
                "movdqa 16(%1), %%xmm3 \n\t"
                "movdqa 32(%1), %%xmm5 \n\t"
                "movdqa 48(%1), %%xmm7 \n\t"
                "punpckldq (%2), %%xmm0 \n\t"
                "punpckldq 16(%2), %%xmm2 \n\t"
                "punpckldq 32(%2), %%xmm4 \n\t"
                "punpckldq 48(%2), %%xmm6 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm2, 32(%0) \n\t"
                "movdqa %%xmm4, 64(%0) \n\t"
                "movdqa %%xmm6, 96(%0) \n\t"
                "punpckhdq (%2), %%xmm1 \n\t"
                "punpckhdq 16(%2), %%xmm3 \n\t"
                "punpckhdq 32(%2), %%xmm5 \n\t"
                "punpckhdq 48(%2), %%xmm7 \n\t"
                "movdqa %%xmm1, 16(%0) \n\t"
                "movdqa %%xmm3, 48(%0) \n\t"
                "movdqa %%xmm5, 80(%0) \n\t"
                "movdqa %%xmm7, 112(%0) \n\t"
                :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
                : "memory"
            );
        }
    }
}
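
/* MMX version of the horizontal compose: the same four lifting steps as the
 * SSE2 version above, but processing 4 DWTELEMs (two mm registers) per loop
 * iteration; the temp buffer needs no alignment fix-up since movq has no
 * alignment requirement. */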
void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){
    const int w2= (width+1)>>1;
    DWTELEM temp[width >> 1];
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;

    { // Lift 0
        DWTELEM * const ref = b + w2 - 1;
        i = 1;
        b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
        asm volatile(
            "pcmpeqd %%mm7, %%mm7 \n\t"
            "pslld $31, %%mm7 \n\t"
            "psrld $29, %%mm7 \n\t"
        ::);
        for(; i<w_l-3; i+=4){
            asm volatile(
                "movq (%1), %%mm2 \n\t"
                "movq 8(%1), %%mm6 \n\t"
                "paddd 4(%1), %%mm2 \n\t"
                "paddd 12(%1), %%mm6 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm6, %%mm4 \n\t"
                "paddd %%mm2, %%mm2 \n\t"
                "paddd %%mm6, %%mm6 \n\t"
                "paddd %%mm0, %%mm2 \n\t"
                "paddd %%mm4, %%mm6 \n\t"
                "paddd %%mm7, %%mm2 \n\t"
                "paddd %%mm7, %%mm6 \n\t"
                "psrad $3, %%mm2 \n\t"
                "psrad $3, %%mm6 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "psubd %%mm2, %%mm0 \n\t"
                "psubd %%mm6, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
    }
    { // Lift 1
        DWTELEM * const dst = b+w2;
        i = 0;
        for(; i<w_r-3; i+=4){
            asm volatile(
                "movq (%1), %%mm2 \n\t"
                "movq 8(%1), %%mm6 \n\t"
                "paddd 4(%1), %%mm2 \n\t"
                "paddd 12(%1), %%mm6 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "psubd %%mm2, %%mm0 \n\t"
                "psubd %%mm6, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }
    { // Lift 2
        DWTELEM * const ref = b+w2 - 1;
        i = 1;
        b[0] = b[0] - (((-2 * ref[1] + W_BO) - 4 * b[0]) >> W_BS);
        asm volatile(
            "pslld $1, %%mm7 \n\t" /* mm7 already holds a '4' from 2 lifts ago. */
        ::);
        for(; i<w_l-3; i+=4){
            asm volatile(
                "movq (%1), %%mm0 \n\t"
                "movq 8(%1), %%mm4 \n\t"
                "paddd 4(%1), %%mm0 \n\t"
                "paddd 12(%1), %%mm4 \n\t"
                "movq %%mm7, %%mm1 \n\t"
                "movq %%mm7, %%mm5 \n\t"
                "psubd %%mm0, %%mm1 \n\t"
                "psubd %%mm4, %%mm5 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "pslld $2, %%mm0 \n\t"
                "pslld $2, %%mm4 \n\t"
                "psubd %%mm0, %%mm1 \n\t"
                "psubd %%mm4, %%mm5 \n\t"
                "psrad $4, %%mm1 \n\t"
                "psrad $4, %%mm5 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "psubd %%mm1, %%mm0 \n\t"
                "psubd %%mm5, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
    }
    { // Lift 3
        DWTELEM * const src = b+w2;
        i = 0;
        for(; i<w_r-3; i+=4){
            asm volatile(
                "movq 4(%1), %%mm2 \n\t"
                "movq 12(%1), %%mm6 \n\t"
                "paddd (%1), %%mm2 \n\t"
                "paddd 8(%1), %%mm6 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm6, %%mm4 \n\t"
                "pslld $2, %%mm2 \n\t"
                "pslld $2, %%mm6 \n\t"
                "psubd %%mm2, %%mm0 \n\t"
                "psubd %%mm6, %%mm4 \n\t"
                "psrad $1, %%mm0 \n\t"
                "psrad $1, %%mm4 \n\t"
                "movq (%0), %%mm2 \n\t"
                "movq 8(%0), %%mm6 \n\t"
                "psubd %%mm0, %%mm2 \n\t"
                "psubd %%mm4, %%mm6 \n\t"
                "movq %%mm2, (%2) \n\t"
                "movq %%mm6, 8(%2) \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
    }
    {
        snow_interleave_line_header(&i, width, b, temp);
        for (; (i & 0xE) != 0xE; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=14; i>=0; i-=16){
            asm volatile(
                "movq (%1), %%mm0 \n\t"
                "movq 8(%1), %%mm2 \n\t"
                "movq 16(%1), %%mm4 \n\t"
                "movq 24(%1), %%mm6 \n\t"
                "movq (%1), %%mm1 \n\t"
                "movq 8(%1), %%mm3 \n\t"
                "movq 16(%1), %%mm5 \n\t"
                "movq 24(%1), %%mm7 \n\t"
                "punpckldq (%2), %%mm0 \n\t"
                "punpckldq 8(%2), %%mm2 \n\t"
                "punpckldq 16(%2), %%mm4 \n\t"
                "punpckldq 24(%2), %%mm6 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm2, 16(%0) \n\t"
                "movq %%mm4, 32(%0) \n\t"
                "movq %%mm6, 48(%0) \n\t"
                "punpckhdq (%2), %%mm1 \n\t"
                "punpckhdq 8(%2), %%mm3 \n\t"
                "punpckhdq 16(%2), %%mm5 \n\t"
                "punpckhdq 24(%2), %%mm7 \n\t"
                "movq %%mm1, 8(%0) \n\t"
                "movq %%mm3, 24(%0) \n\t"
                "movq %%mm5, 40(%0) \n\t"
                "movq %%mm7, 56(%0) \n\t"
                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
                : "memory"
            );
        }
    }
}

#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
    ""op" (%%"r",%%"REG_d",4), %%"t0" \n\t"\
    ""op" 16(%%"r",%%"REG_d",4), %%"t1" \n\t"\
    ""op" 32(%%"r",%%"REG_d",4), %%"t2" \n\t"\
    ""op" 48(%%"r",%%"REG_d",4), %%"t3" \n\t"

#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)

#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_load_add("paddd",r,t0,t1,t2,t3)

#define snow_vertical_compose_sse2_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
    "psubd %%"s0", %%"t0" \n\t"\
    "psubd %%"s1", %%"t1" \n\t"\
    "psubd %%"s2", %%"t2" \n\t"\
    "psubd %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
    "movdqa %%"s0", (%%"w",%%"REG_d",4) \n\t"\
    "movdqa %%"s1", 16(%%"w",%%"REG_d",4) \n\t"\
    "movdqa %%"s2", 32(%%"w",%%"REG_d",4) \n\t"\
    "movdqa %%"s3", 48(%%"w",%%"REG_d",4) \n\t"

#define snow_vertical_compose_sse2_sra(n,t0,t1,t2,t3)\
    "psrad $"n", %%"t0" \n\t"\
    "psrad $"n", %%"t1" \n\t"\
    "psrad $"n", %%"t2" \n\t"\
    "psrad $"n", %%"t3" \n\t"

#define snow_vertical_compose_sse2_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
    "paddd %%"s0", %%"t0" \n\t"\
    "paddd %%"s1", %%"t1" \n\t"\
    "paddd %%"s2", %%"t2" \n\t"\
    "paddd %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_sll(n,t0,t1,t2,t3)\
    "pslld $"n", %%"t0" \n\t"\
    "pslld $"n", %%"t1" \n\t"\
    "pslld $"n", %%"t2" \n\t"\
    "pslld $"n", %%"t3" \n\t"

#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
    "movdqa %%"s0", %%"t0" \n\t"\
    "movdqa %%"s1", %%"t1" \n\t"\
    "movdqa %%"s2", %%"t2" \n\t"\
    "movdqa %%"s3", %%"t3" \n\t"
void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
    long i = width;

    while(i & 0xF)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
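    // Everything below i (now a multiple of 16) is handled 16 columns at a time
    // in a single asm block; the macros above expand to the per-step SIMD ops.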
    asm volatile (
        "jmp 2f \n\t"
        "1: \n\t"
        "mov %6, %%"REG_a" \n\t"
        "mov %4, %%"REG_S" \n\t"
        snow_vertical_compose_sse2_load(REG_S,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        "pcmpeqd %%xmm1, %%xmm1 \n\t"
        "pslld $31, %%xmm1 \n\t"
        "psrld $29, %%xmm1 \n\t"
        "mov %5, %%"REG_a" \n\t"
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_sra("3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_load(REG_a,"xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sub("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_store(REG_a,"xmm1","xmm3","xmm5","xmm7")
        "mov %3, %%"REG_c" \n\t"
        snow_vertical_compose_sse2_load(REG_S,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_c,"xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store(REG_S,"xmm0","xmm2","xmm4","xmm6")
        "mov %2, %%"REG_a" \n\t"
        snow_vertical_compose_sse2_load(REG_c,"xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_sll("2","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        "pcmpeqd %%xmm1, %%xmm1 \n\t"
        "pslld $31, %%xmm1 \n\t"
        "psrld $28, %%xmm1 \n\t"
        "mov %1, %%"REG_S" \n\t"
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_sra("4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_S,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store(REG_a,"xmm0","xmm2","xmm4","xmm6")
        "2: \n\t"
        "sub $16, %%"REG_d" \n\t"
        "jge 1b \n\t"
        :"+d"(i)
        :
        "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5):
        "%"REG_a"","%"REG_S"","%"REG_c"");
}

#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
    ""op" (%%"r",%%"REG_d",4), %%"t0" \n\t"\
    ""op" 8(%%"r",%%"REG_d",4), %%"t1" \n\t"\
    ""op" 16(%%"r",%%"REG_d",4), %%"t2" \n\t"\
    ""op" 24(%%"r",%%"REG_d",4), %%"t3" \n\t"

#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
    snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
    snow_vertical_compose_mmx_load_add("paddd",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_sub(s0,s1,s2,s3,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
    "movq %%"s0", (%%"w",%%"REG_d",4) \n\t"\
    "movq %%"s1", 8(%%"w",%%"REG_d",4) \n\t"\
    "movq %%"s2", 16(%%"w",%%"REG_d",4) \n\t"\
    "movq %%"s3", 24(%%"w",%%"REG_d",4) \n\t"

#define snow_vertical_compose_mmx_sra(n,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_sra(n,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_sll(n,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_sll(n,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
    "movq %%"s0", %%"t0" \n\t"\
    "movq %%"s1", %%"t1" \n\t"\
    "movq %%"s2", %%"t2" \n\t"\
    "movq %%"s3", %%"t3" \n\t"
void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
    long i = width;

    while(i & 0x7)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
    asm volatile(
        "jmp 2f \n\t"
        "1: \n\t"
        "mov %6, %%"REG_a" \n\t"
        "mov %4, %%"REG_S" \n\t"
        snow_vertical_compose_mmx_load(REG_S,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        "pcmpeqd %%mm1, %%mm1 \n\t"
        "pslld $31, %%mm1 \n\t"
        "psrld $29, %%mm1 \n\t"
        "mov %5, %%"REG_a" \n\t"
        snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_sra("3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_load(REG_a,"mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sub("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_store(REG_a,"mm1","mm3","mm5","mm7")
        "mov %3, %%"REG_c" \n\t"
        snow_vertical_compose_mmx_load(REG_S,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_c,"mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store(REG_S,"mm0","mm2","mm4","mm6")
        "mov %2, %%"REG_a" \n\t"
        snow_vertical_compose_mmx_load(REG_c,"mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_sll("2","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        "pcmpeqd %%mm1, %%mm1 \n\t"
        "pslld $31, %%mm1 \n\t"
        "psrld $28, %%mm1 \n\t"
        "mov %1, %%"REG_S" \n\t"
        snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_sra("4","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_S,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store(REG_a,"mm0","mm2","mm4","mm6")
        "2: \n\t"
        "sub $8, %%"REG_d" \n\t"
        "jge 1b \n\t"
        :"+d"(i)
        :
        "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5):
        "%"REG_a"","%"REG_S"","%"REG_c"");
}
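
/* IDWT slice + OBMC blending kernels. Each iteration weights four overlapping
 * predictions (the pointers in block[0..3]) by their OBMC windows from obmc,
 * accumulates them, adds the DWT coefficients of the current slice_buffer line,
 * rounds, shifts down by FRAC_BITS and stores the packed 8-bit result to dst8.
 * The variants below specialise on block width (8 or 16) and obmc stride. */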

#define snow_inner_add_yblock_sse2_header \
    DWTELEM * * dst_array = sb->line + src_y;\
    long tmp;\
    asm volatile(\
        "mov %7, %%"REG_c" \n\t"\
        "mov %6, %2 \n\t"\
        "mov %4, %%"REG_S" \n\t"\
        "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
        "pcmpeqd %%xmm3, %%xmm3 \n\t"\
        "pslld $31, %%xmm3 \n\t"\
        "psrld $24, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
        "1: \n\t"\
        "mov %1, %%"REG_D" \n\t"\
        "mov (%%"REG_D"), %%"REG_D" \n\t"\
        "add %3, %%"REG_D" \n\t"

#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
    "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
    "movq (%%"REG_d"), %%"out_reg1" \n\t"\
    "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
    "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
    "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
    "punpcklbw %%xmm7, %%xmm0 \n\t"\
    "punpcklbw %%xmm7, %%xmm4 \n\t"\
    "pmullw %%xmm0, %%"out_reg1" \n\t"\
    "pmullw %%xmm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
    "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
    "movq (%%"REG_d"), %%"out_reg1" \n\t"\
    "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
    "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
    "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
    "punpcklbw %%xmm7, %%xmm0 \n\t"\
    "punpcklbw %%xmm7, %%xmm4 \n\t"\
    "pmullw %%xmm0, %%"out_reg1" \n\t"\
    "pmullw %%xmm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
    snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
    "paddusw %%xmm2, %%xmm1 \n\t"\
    "paddusw %%xmm6, %%xmm5 \n\t"

#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
    snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
    "paddusw %%xmm2, %%xmm1 \n\t"\
    "paddusw %%xmm6, %%xmm5 \n\t"

#define snow_inner_add_yblock_sse2_end_common1\
    "add $32, %%"REG_S" \n\t"\
    "add %%"REG_c", %0 \n\t"\
    "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
    "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
    "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
    "add %%"REG_c", (%%"REG_a") \n\t"

#define snow_inner_add_yblock_sse2_end_common2\
    "jnz 1b \n\t"\
    :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
    :\
    "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\
    "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");

#define snow_inner_add_yblock_sse2_end_8\
    "sal $1, %%"REG_c" \n\t"\
    "add $"PTR_SIZE"*2, %1 \n\t"\
    snow_inner_add_yblock_sse2_end_common1\
    "sar $1, %%"REG_c" \n\t"\
    "sub $2, %2 \n\t"\
    snow_inner_add_yblock_sse2_end_common2

#define snow_inner_add_yblock_sse2_end_16\
    "add $"PTR_SIZE"*1, %1 \n\t"\
    snow_inner_add_yblock_sse2_end_common1\
    "dec %2 \n\t"\
    snow_inner_add_yblock_sse2_end_common2

static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_sse2_header
    snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
    snow_inner_add_yblock_sse2_accum_8("2", "8")
    snow_inner_add_yblock_sse2_accum_8("1", "128")
    snow_inner_add_yblock_sse2_accum_8("0", "136")
    "mov %0, %%"REG_d" \n\t"
    "movdqa (%%"REG_D"), %%xmm0 \n\t"
    "movdqa %%xmm1, %%xmm2 \n\t"
    "punpckhwd %%xmm7, %%xmm1 \n\t"
    "punpcklwd %%xmm7, %%xmm2 \n\t"
    "paddd %%xmm2, %%xmm0 \n\t"
    "movdqa 16(%%"REG_D"), %%xmm2 \n\t"
    "paddd %%xmm1, %%xmm2 \n\t"
    "paddd %%xmm3, %%xmm0 \n\t"
    "paddd %%xmm3, %%xmm2 \n\t"
    "mov %1, %%"REG_D" \n\t"
    "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
    "add %3, %%"REG_D" \n\t"
    "movdqa (%%"REG_D"), %%xmm4 \n\t"
    "movdqa %%xmm5, %%xmm6 \n\t"
    "punpckhwd %%xmm7, %%xmm5 \n\t"
    "punpcklwd %%xmm7, %%xmm6 \n\t"
    "paddd %%xmm6, %%xmm4 \n\t"
    "movdqa 16(%%"REG_D"), %%xmm6 \n\t"
    "paddd %%xmm5, %%xmm6 \n\t"
    "paddd %%xmm3, %%xmm4 \n\t"
    "paddd %%xmm3, %%xmm6 \n\t"
    "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
    "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
    "packssdw %%xmm2, %%xmm0 \n\t"
    "packuswb %%xmm7, %%xmm0 \n\t"
    "movq %%xmm0, (%%"REG_d") \n\t"
    "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
    "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
    "packssdw %%xmm6, %%xmm4 \n\t"
    "packuswb %%xmm7, %%xmm4 \n\t"
    "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
    snow_inner_add_yblock_sse2_end_8
}

static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_sse2_header
    snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
    snow_inner_add_yblock_sse2_accum_16("2", "16")
    snow_inner_add_yblock_sse2_accum_16("1", "512")
    snow_inner_add_yblock_sse2_accum_16("0", "528")
    "mov %0, %%"REG_d" \n\t"
    "movdqa %%xmm1, %%xmm0 \n\t"
    "movdqa %%xmm5, %%xmm4 \n\t"
    "punpcklwd %%xmm7, %%xmm0 \n\t"
    "paddd (%%"REG_D"), %%xmm0 \n\t"
    "punpckhwd %%xmm7, %%xmm1 \n\t"
    "paddd 16(%%"REG_D"), %%xmm1 \n\t"
    "punpcklwd %%xmm7, %%xmm4 \n\t"
    "paddd 32(%%"REG_D"), %%xmm4 \n\t"
    "punpckhwd %%xmm7, %%xmm5 \n\t"
    "paddd 48(%%"REG_D"), %%xmm5 \n\t"
    "paddd %%xmm3, %%xmm0 \n\t"
    "paddd %%xmm3, %%xmm1 \n\t"
    "paddd %%xmm3, %%xmm4 \n\t"
    "paddd %%xmm3, %%xmm5 \n\t"
    "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
    "psrad $8, %%xmm1 \n\t" /* FRAC_BITS. */
    "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
    "psrad $8, %%xmm5 \n\t" /* FRAC_BITS. */
    "packssdw %%xmm1, %%xmm0 \n\t"
    "packssdw %%xmm5, %%xmm4 \n\t"
    "packuswb %%xmm4, %%xmm0 \n\t"
    "movdqu %%xmm0, (%%"REG_d") \n\t"
    snow_inner_add_yblock_sse2_end_16
}

#define snow_inner_add_yblock_mmx_header \
    DWTELEM * * dst_array = sb->line + src_y;\
    long tmp;\
    asm volatile(\
        "mov %7, %%"REG_c" \n\t"\
        "mov %6, %2 \n\t"\
        "mov %4, %%"REG_S" \n\t"\
        "pxor %%mm7, %%mm7 \n\t" /* 0 */\
        "pcmpeqd %%mm3, %%mm3 \n\t"\
        "pslld $31, %%mm3 \n\t"\
        "psrld $24, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
        "1: \n\t"\
        "mov %1, %%"REG_D" \n\t"\
        "mov (%%"REG_D"), %%"REG_D" \n\t"\
        "add %3, %%"REG_D" \n\t"

#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
    "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
    "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
    "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
    "punpcklbw %%mm7, %%"out_reg1" \n\t"\
    "punpcklbw %%mm7, %%"out_reg2" \n\t"\
    "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
    "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpcklbw %%mm7, %%mm4 \n\t"\
    "pmullw %%mm0, %%"out_reg1" \n\t"\
    "pmullw %%mm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
    snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
    "paddusw %%mm2, %%mm1 \n\t"\
    "paddusw %%mm6, %%mm5 \n\t"

#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
    "mov %0, %%"REG_d" \n\t"\
    "movq %%mm1, %%mm0 \n\t"\
    "movq %%mm5, %%mm4 \n\t"\
    "punpcklwd %%mm7, %%mm0 \n\t"\
    "paddd "read_offset"(%%"REG_D"), %%mm0 \n\t"\
    "punpckhwd %%mm7, %%mm1 \n\t"\
    "paddd "read_offset"+8(%%"REG_D"), %%mm1 \n\t"\
    "punpcklwd %%mm7, %%mm4 \n\t"\
    "paddd "read_offset"+16(%%"REG_D"), %%mm4 \n\t"\
    "punpckhwd %%mm7, %%mm5 \n\t"\
    "paddd "read_offset"+24(%%"REG_D"), %%mm5 \n\t"\
    "paddd %%mm3, %%mm0 \n\t"\
    "paddd %%mm3, %%mm1 \n\t"\
    "paddd %%mm3, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "psrad $8, %%mm0 \n\t"\
    "psrad $8, %%mm1 \n\t"\
    "psrad $8, %%mm4 \n\t"\
    "psrad $8, %%mm5 \n\t"\
    \
    "packssdw %%mm1, %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm0 \n\t"\
    "movq %%mm0, "write_offset"(%%"REG_d") \n\t"

#define snow_inner_add_yblock_mmx_end(s_step)\
    "add $"s_step", %%"REG_S" \n\t"\
    "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
    "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
    "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
    "add %%"REG_c", (%%"REG_a") \n\t"\
    "add $"PTR_SIZE"*1, %1 \n\t"\
    "add %%"REG_c", %0 \n\t"\
    "dec %2 \n\t"\
    "jnz 1b \n\t"\
    :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
    :\
    "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\
    "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");

static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_mmx_header
    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
    snow_inner_add_yblock_mmx_accum("2", "8", "0")
    snow_inner_add_yblock_mmx_accum("1", "128", "0")
    snow_inner_add_yblock_mmx_accum("0", "136", "0")
    snow_inner_add_yblock_mmx_mix("0", "0")
    snow_inner_add_yblock_mmx_end("16")
}

static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_mmx_header
    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
    snow_inner_add_yblock_mmx_accum("2", "16", "0")
    snow_inner_add_yblock_mmx_accum("1", "512", "0")
    snow_inner_add_yblock_mmx_accum("0", "528", "0")
    snow_inner_add_yblock_mmx_mix("0", "0")
    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
    snow_inner_add_yblock_mmx_accum("2", "24", "8")
    snow_inner_add_yblock_mmx_accum("1", "520", "8")
    snow_inner_add_yblock_mmx_accum("0", "536", "8")
    snow_inner_add_yblock_mmx_mix("32", "8")
    snow_inner_add_yblock_mmx_end("32")
}
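
/* Entry points: pick the specialised kernel for the common block sizes and fall
 * back to the generic C ff_snow_inner_add_yblock() otherwise. The 8-wide SSE2
 * kernel processes two rows per iteration, so blocks with an odd height are
 * routed to the MMX kernel instead. */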
void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                   int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16) {
        if (!(b_h & 1))
            inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
        else
            inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    } else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}

void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                  int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16)
        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}