/*
 * MMX and SSE2 optimized snow DSP utils
 * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../avcodec.h"
#include "../snow.h"
#include "mmx.h"
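
/*
 * The functions below undo the lifting steps of Snow's integer 9/7 wavelet
 * (the "97i" in their names), horizontally over one line and vertically
 * across six rows. Per element, the four horizontal steps are the scalar
 * updates performed by the lead-out helpers that follow, with the W_*
 * constants coming from snow.h:
 *
 *   Lift 0:  b[i] -= (W_DM*(ref[i] + ref[i+1]) + W_DO) >> W_DS
 *   Lift 1:  b[i] -= (W_CM*(ref[i] + ref[i+1]) + W_CO) >> W_CS
 *   Lift 2:  b[i] -= ((-(ref[i] + ref[i+1]) + W_BO) - 4*b[i]) >> W_BS
 *   Lift 3:  b[i] -= (-W_AM*(ref[i] + ref[i+1]) + W_AO) >> W_AS
 *     (Lift 3 writes its result into a temp buffer rather than in place.)
 */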
static void always_inline snow_interleave_line_header(int * i, int width, DWTELEM * low, DWTELEM * high){
    (*i) = (width) - 2;

    if (width & 1){
        low[(*i)+1] = low[((*i)+1)>>1];
        (*i)--;
    }
}
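
/* Generic scalar tail for one horizontal lifting step: finishes whatever
 * elements the SIMD loop left over and, when (width^lift_high) is odd,
 * computes the final element against a doubled edge sample. */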
static void always_inline snow_horizontal_compose_lift_lead_out(int i, DWTELEM * dst, DWTELEM * src, DWTELEM * ref, int width, int w, int lift_high, int mul, int add, int shift){
    for(; i<w; i++){
        dst[i] = src[i] - ((mul * (ref[i] + ref[i + 1]) + add) >> shift);
    }

    if((width^lift_high)&1){
        dst[w] = src[w] - ((mul * 2 * ref[w] + add) >> shift);
    }
}
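
/* Scalar tail for Lift 2, whose update also folds in -4*src[i] (hence the
 * separate "liftS" form). */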
static void always_inline snow_horizontal_compose_liftS_lead_out(int i, DWTELEM * dst, DWTELEM * src, DWTELEM * ref, int width, int w){
    for(; i<w; i++){
        dst[i] = src[i] - (((-(ref[i] + ref[i+1]) + W_BO) - 4 * src[i]) >> W_BS);
    }

    if(width&1){
        dst[w] = src[w] - (((-2 * ref[w] + W_BO) - 4 * src[w]) >> W_BS);
    }
}
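
/*
 * Inverse horizontal lifting for one line: the low-pass half sits in
 * b[0..w2-1] and the high-pass half in b[w2..width-1]. Lifts 1 and 3 first
 * run a scalar head loop to reach 16-byte store alignment; every lift then
 * processes eight elements per SSE2 iteration and finishes with a scalar
 * lead-out. The final block interleaves the two halves back into b.
 */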
void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width){
    const int w2= (width+1)>>1;
    // The aligned movdqa stores below need temp on a 16-byte boundary, so
    // over-allocate temp_buf and round the pointer up by hand.
    DWTELEM temp_buf[(width>>1) + 4];
    DWTELEM * const temp = temp_buf + 4 - (((long)temp_buf & 0xF) >> 2);
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;
    { // Lift 0
        DWTELEM * const ref = b + w2 - 1;
        DWTELEM b_0 = b[0]; // By allowing the first entry, b[0], to be calculated
        // twice (the first time erroneously by the vector loop), the SSE2 code can
        // run one extra full pass. The savings in code and time are well worth
        // having to store this value and recompute b[0] correctly afterwards.

        i = 0;
        asm volatile(
            "pcmpeqd %%xmm7, %%xmm7 \n\t"
            "pslld $31, %%xmm7 \n\t"
            "psrld $29, %%xmm7 \n\t"
        ::);
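
        /* xmm7 now holds the constant 4 (the W_DO rounding bias) in all four
         * lanes, built in-register rather than loaded from memory. */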
        for(; i<w_l-7; i+=8){
            asm volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 4(%1), %%xmm2 \n\t"
                "movdqu 20(%1), %%xmm6 \n\t"
                "paddd %%xmm1, %%xmm2 \n\t"
                "paddd %%xmm5, %%xmm6 \n\t"
                "movdqa %%xmm2, %%xmm0 \n\t"
                "movdqa %%xmm6, %%xmm4 \n\t"
                "paddd %%xmm2, %%xmm2 \n\t"
                "paddd %%xmm6, %%xmm6 \n\t"
                "paddd %%xmm0, %%xmm2 \n\t"
                "paddd %%xmm4, %%xmm6 \n\t"
                "paddd %%xmm7, %%xmm2 \n\t"
                "paddd %%xmm7, %%xmm6 \n\t"
                "psrad $3, %%xmm2 \n\t"
                "psrad $3, %%xmm6 \n\t"
                "movdqa (%0), %%xmm0 \n\t"
                "movdqa 16(%0), %%xmm4 \n\t"
                "psubd %%xmm2, %%xmm0 \n\t"
                "psubd %%xmm6, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory");
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
    }
    { // Lift 1
        DWTELEM * const dst = b+w2;

        i = 0;
        for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }
        for(; i<w_r-7; i+=8){
            asm volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 4(%1), %%xmm2 \n\t"
                "movdqu 20(%1), %%xmm6 \n\t"
                "paddd %%xmm1, %%xmm2 \n\t"
                "paddd %%xmm5, %%xmm6 \n\t"
                "movdqa (%0), %%xmm0 \n\t"
                "movdqa 16(%0), %%xmm4 \n\t"
                "psubd %%xmm2, %%xmm0 \n\t"
                "psubd %%xmm6, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                : "memory");
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }
    { // Lift 2
        DWTELEM * const ref = b+w2 - 1;
        DWTELEM b_0 = b[0];

        i = 0;
        asm volatile(
            "pslld $1, %%xmm7 \n\t" /* xmm7 still holds the 4 from Lift 0; shift it to 8 (= W_BO). */
        ::);
        for(; i<w_l-7; i+=8){
            asm volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 4(%1), %%xmm0 \n\t"
                "movdqu 20(%1), %%xmm4 \n\t"
                "paddd %%xmm1, %%xmm0 \n\t"
                "paddd %%xmm5, %%xmm4 \n\t"
                "movdqa %%xmm7, %%xmm1 \n\t"
                "movdqa %%xmm7, %%xmm5 \n\t"
                "psubd %%xmm0, %%xmm1 \n\t"
                "psubd %%xmm4, %%xmm5 \n\t"
                "movdqa (%0), %%xmm0 \n\t"
                "movdqa 16(%0), %%xmm4 \n\t"
                "pslld $2, %%xmm0 \n\t"
                "pslld $2, %%xmm4 \n\t"
                "psubd %%xmm0, %%xmm1 \n\t"
                "psubd %%xmm4, %%xmm5 \n\t"
                "psrad $4, %%xmm1 \n\t"
                "psrad $4, %%xmm5 \n\t"
                "movdqa (%0), %%xmm0 \n\t"
                "movdqa 16(%0), %%xmm4 \n\t"
                "psubd %%xmm1, %%xmm0 \n\t"
                "psubd %%xmm5, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory");
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
        b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS);
    }
    { // Lift 3
        DWTELEM * const src = b+w2;

        i = 0;
        for(; (((long)&temp[i]) & 0xF) && i<w_r; i++){
            temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
        }
        for(; i<w_r-7; i+=8){
            asm volatile(
                "movdqu 4(%1), %%xmm2 \n\t"
                "movdqu 20(%1), %%xmm6 \n\t"
                "paddd (%1), %%xmm2 \n\t"
                "paddd 16(%1), %%xmm6 \n\t"
                "movdqa %%xmm2, %%xmm0 \n\t"
                "movdqa %%xmm6, %%xmm4 \n\t"
                "pslld $2, %%xmm2 \n\t"
                "pslld $2, %%xmm6 \n\t"
                "psubd %%xmm2, %%xmm0 \n\t"
                "psubd %%xmm6, %%xmm4 \n\t"
                "psrad $1, %%xmm0 \n\t"
                "psrad $1, %%xmm4 \n\t"
                "movdqu (%0), %%xmm2 \n\t"
                "movdqu 16(%0), %%xmm6 \n\t"
                "psubd %%xmm0, %%xmm2 \n\t"
                "psubd %%xmm4, %%xmm6 \n\t"
                "movdqa %%xmm2, (%2) \n\t"
                "movdqa %%xmm6, 16(%2) \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                : "memory");
        }
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
    }
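
    /* Interleave the low-pass (b) and high-pass (temp) coefficients back
     * into b, working from the end of the line so that no source element
     * is overwritten before it has been read. */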
    {
        snow_interleave_line_header(&i, width, b, temp);

        for (; (i & 0x1E) != 0x1E; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=30; i>=0; i-=32){
            asm volatile(
                "movdqa (%1), %%xmm0 \n\t"
                "movdqa 16(%1), %%xmm2 \n\t"
                "movdqa 32(%1), %%xmm4 \n\t"
                "movdqa 48(%1), %%xmm6 \n\t"
                "movdqa (%1), %%xmm1 \n\t"
                "movdqa 16(%1), %%xmm3 \n\t"
                "movdqa 32(%1), %%xmm5 \n\t"
                "movdqa 48(%1), %%xmm7 \n\t"
                "punpckldq (%2), %%xmm0 \n\t"
                "punpckldq 16(%2), %%xmm2 \n\t"
                "punpckldq 32(%2), %%xmm4 \n\t"
                "punpckldq 48(%2), %%xmm6 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm2, 32(%0) \n\t"
                "movdqa %%xmm4, 64(%0) \n\t"
                "movdqa %%xmm6, 96(%0) \n\t"
                "punpckhdq (%2), %%xmm1 \n\t"
                "punpckhdq 16(%2), %%xmm3 \n\t"
                "punpckhdq 32(%2), %%xmm5 \n\t"
                "punpckhdq 48(%2), %%xmm7 \n\t"
                "movdqa %%xmm1, 16(%0) \n\t"
                "movdqa %%xmm3, 48(%0) \n\t"
                "movdqa %%xmm5, 80(%0) \n\t"
                "movdqa %%xmm7, 112(%0) \n\t"
                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
                : "memory");
        }
    }
}
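
/*
 * MMX version of the horizontal compose: the same four lifts and final
 * interleave, four elements per iteration. movq has no alignment
 * requirement, so there are no head loops and no aligned scratch buffer,
 * and b[0] is simply computed up front instead of being patched afterwards.
 */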
void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){
    const int w2= (width+1)>>1;
    DWTELEM temp[width >> 1];
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;

    { // Lift 0
        DWTELEM * const ref = b + w2 - 1;

        i = 1;
        b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
        asm volatile(
            "pcmpeqd %%mm7, %%mm7 \n\t"
            "pslld $31, %%mm7 \n\t"
            "psrld $29, %%mm7 \n\t"
        ::);
        for(; i<w_l-3; i+=4){
            asm volatile(
                "movq (%1), %%mm2 \n\t"
                "movq 8(%1), %%mm6 \n\t"
                "paddd 4(%1), %%mm2 \n\t"
                "paddd 12(%1), %%mm6 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm6, %%mm4 \n\t"
                "paddd %%mm2, %%mm2 \n\t"
                "paddd %%mm6, %%mm6 \n\t"
                "paddd %%mm0, %%mm2 \n\t"
                "paddd %%mm4, %%mm6 \n\t"
                "paddd %%mm7, %%mm2 \n\t"
                "paddd %%mm7, %%mm6 \n\t"
                "psrad $3, %%mm2 \n\t"
                "psrad $3, %%mm6 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "psubd %%mm2, %%mm0 \n\t"
                "psubd %%mm6, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory");
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
    }
    { // Lift 1
        DWTELEM * const dst = b+w2;

        i = 0;
        for(; i<w_r-3; i+=4){
            asm volatile(
                "movq (%1), %%mm2 \n\t"
                "movq 8(%1), %%mm6 \n\t"
                "paddd 4(%1), %%mm2 \n\t"
                "paddd 12(%1), %%mm6 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "psubd %%mm2, %%mm0 \n\t"
                "psubd %%mm6, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                : "memory");
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }
    { // Lift 2
        DWTELEM * const ref = b+w2 - 1;

        i = 1;
        b[0] = b[0] - (((-2 * ref[1] + W_BO) - 4 * b[0]) >> W_BS);
        asm volatile(
            "pslld $1, %%mm7 \n\t" /* mm7 still holds the 4 from Lift 0; shift it to 8 (= W_BO). */
        ::);
        for(; i<w_l-3; i+=4){
            asm volatile(
                "movq (%1), %%mm0 \n\t"
                "movq 8(%1), %%mm4 \n\t"
                "paddd 4(%1), %%mm0 \n\t"
                "paddd 12(%1), %%mm4 \n\t"
                "movq %%mm7, %%mm1 \n\t"
                "movq %%mm7, %%mm5 \n\t"
                "psubd %%mm0, %%mm1 \n\t"
                "psubd %%mm4, %%mm5 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "pslld $2, %%mm0 \n\t"
                "pslld $2, %%mm4 \n\t"
                "psubd %%mm0, %%mm1 \n\t"
                "psubd %%mm4, %%mm5 \n\t"
                "psrad $4, %%mm1 \n\t"
                "psrad $4, %%mm5 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "psubd %%mm1, %%mm0 \n\t"
                "psubd %%mm5, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory");
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
    }
    { // Lift 3
        DWTELEM * const src = b+w2;

        i = 0;
        for(; i<w_r-3; i+=4){
            asm volatile(
                "movq 4(%1), %%mm2 \n\t"
                "movq 12(%1), %%mm6 \n\t"
                "paddd (%1), %%mm2 \n\t"
                "paddd 8(%1), %%mm6 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm6, %%mm4 \n\t"
                "pslld $2, %%mm2 \n\t"
                "pslld $2, %%mm6 \n\t"
                "psubd %%mm2, %%mm0 \n\t"
                "psubd %%mm6, %%mm4 \n\t"
                "psrad $1, %%mm0 \n\t"
                "psrad $1, %%mm4 \n\t"
                "movq (%0), %%mm2 \n\t"
                "movq 8(%0), %%mm6 \n\t"
                "psubd %%mm0, %%mm2 \n\t"
                "psubd %%mm4, %%mm6 \n\t"
                "movq %%mm2, (%2) \n\t"
                "movq %%mm6, 8(%2) \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                : "memory");
        }
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
    }
    {
        snow_interleave_line_header(&i, width, b, temp);

        for (; (i & 0xE) != 0xE; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=14; i>=0; i-=16){
            asm volatile(
                "movq (%1), %%mm0 \n\t"
                "movq 8(%1), %%mm2 \n\t"
                "movq 16(%1), %%mm4 \n\t"
                "movq 24(%1), %%mm6 \n\t"
                "movq (%1), %%mm1 \n\t"
                "movq 8(%1), %%mm3 \n\t"
                "movq 16(%1), %%mm5 \n\t"
                "movq 24(%1), %%mm7 \n\t"
                "punpckldq (%2), %%mm0 \n\t"
                "punpckldq 8(%2), %%mm2 \n\t"
                "punpckldq 16(%2), %%mm4 \n\t"
                "punpckldq 24(%2), %%mm6 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm2, 16(%0) \n\t"
                "movq %%mm4, 32(%0) \n\t"
                "movq %%mm6, 48(%0) \n\t"
                "punpckhdq (%2), %%mm1 \n\t"
                "punpckhdq 8(%2), %%mm3 \n\t"
                "punpckhdq 16(%2), %%mm5 \n\t"
                "punpckhdq 24(%2), %%mm7 \n\t"
                "movq %%mm1, 8(%0) \n\t"
                "movq %%mm3, 24(%0) \n\t"
                "movq %%mm5, 40(%0) \n\t"
                "movq %%mm7, 56(%0) \n\t"
                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
                : "memory");
        }
    }
}
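
/*
 * Building blocks for the vertical compose: each macro emits one
 * instruction for four SSE2 registers, with memory operands addressed as
 * (pointer + 4*REG_d), i.e. indexed by the element counter kept in REG_d.
 */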
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
        ""op" (%%"r",%%"REG_d",4), %%"t0" \n\t"\
        ""op" 16(%%"r",%%"REG_d",4), %%"t1" \n\t"\
        ""op" 32(%%"r",%%"REG_d",4), %%"t2" \n\t"\
        ""op" 48(%%"r",%%"REG_d",4), %%"t3" \n\t"

#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)

#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_load_add("paddd",r,t0,t1,t2,t3)

#define snow_vertical_compose_sse2_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
        "psubd %%"s0", %%"t0" \n\t"\
        "psubd %%"s1", %%"t1" \n\t"\
        "psubd %%"s2", %%"t2" \n\t"\
        "psubd %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
        "movdqa %%"s0", (%%"w",%%"REG_d",4) \n\t"\
        "movdqa %%"s1", 16(%%"w",%%"REG_d",4) \n\t"\
        "movdqa %%"s2", 32(%%"w",%%"REG_d",4) \n\t"\
        "movdqa %%"s3", 48(%%"w",%%"REG_d",4) \n\t"

#define snow_vertical_compose_sse2_sra(n,t0,t1,t2,t3)\
        "psrad $"n", %%"t0" \n\t"\
        "psrad $"n", %%"t1" \n\t"\
        "psrad $"n", %%"t2" \n\t"\
        "psrad $"n", %%"t3" \n\t"

#define snow_vertical_compose_sse2_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
        "paddd %%"s0", %%"t0" \n\t"\
        "paddd %%"s1", %%"t1" \n\t"\
        "paddd %%"s2", %%"t2" \n\t"\
        "paddd %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_sll(n,t0,t1,t2,t3)\
        "pslld $"n", %%"t0" \n\t"\
        "pslld $"n", %%"t1" \n\t"\
        "pslld $"n", %%"t2" \n\t"\
        "pslld $"n", %%"t3" \n\t"

#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
        "movdqa %%"s0", %%"t0" \n\t"\
        "movdqa %%"s1", %%"t1" \n\t"\
        "movdqa %%"s2", %%"t2" \n\t"\
        "movdqa %%"s3", %%"t3" \n\t"
void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
    long i = width;

    while(i & 0xF)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
    asm volatile (
        "jmp 2f \n\t"
        "1: \n\t"

        "mov %6, %%"REG_a" \n\t"
        "mov %4, %%"REG_b" \n\t"

        snow_vertical_compose_sse2_load(REG_b,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")

        "pcmpeqd %%xmm1, %%xmm1 \n\t"
        "pslld $31, %%xmm1 \n\t"
        "psrld $29, %%xmm1 \n\t"
        "mov %5, %%"REG_a" \n\t"

        snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_sra("3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_load(REG_a,"xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sub("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_store(REG_a,"xmm1","xmm3","xmm5","xmm7")
        "mov %3, %%"REG_c" \n\t"
        snow_vertical_compose_sse2_load(REG_b,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_c,"xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store(REG_b,"xmm0","xmm2","xmm4","xmm6")
        "mov %2, %%"REG_a" \n\t"
        snow_vertical_compose_sse2_load(REG_c,"xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_sll("2","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")

        "pcmpeqd %%xmm1, %%xmm1 \n\t"
        "pslld $31, %%xmm1 \n\t"
        "psrld $28, %%xmm1 \n\t"
        "mov %1, %%"REG_b" \n\t"

        snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_sra("4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_b,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store(REG_a,"xmm0","xmm2","xmm4","xmm6")

        "2: \n\t"
        "sub $16, %%"REG_d" \n\t"
        "jge 1b \n\t"
        : "+d"(i)
        : "m"(b0), "m"(b1), "m"(b2), "m"(b3), "m"(b4), "m"(b5)
        : "%"REG_a, "%"REG_b, "%"REG_c);
}
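
/* MMX counterparts of the macros above; the register-to-register forms are
 * identical, so they simply alias the SSE2 versions. */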
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
        ""op" (%%"r",%%"REG_d",4), %%"t0" \n\t"\
        ""op" 8(%%"r",%%"REG_d",4), %%"t1" \n\t"\
        ""op" 16(%%"r",%%"REG_d",4), %%"t2" \n\t"\
        ""op" 24(%%"r",%%"REG_d",4), %%"t3" \n\t"

#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("paddd",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_sub(s0,s1,s2,s3,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
        "movq %%"s0", (%%"w",%%"REG_d",4) \n\t"\
        "movq %%"s1", 8(%%"w",%%"REG_d",4) \n\t"\
        "movq %%"s2", 16(%%"w",%%"REG_d",4) \n\t"\
        "movq %%"s3", 24(%%"w",%%"REG_d",4) \n\t"

#define snow_vertical_compose_mmx_sra(n,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_sra(n,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_sll(n,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_sll(n,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
        "movq %%"s0", %%"t0" \n\t"\
        "movq %%"s1", %%"t1" \n\t"\
        "movq %%"s2", %%"t2" \n\t"\
        "movq %%"s3", %%"t3" \n\t"
void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
    long i = width;

    while(i & 0x7)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }

    asm volatile(
        "jmp 2f \n\t"
        "1: \n\t"

        "mov %6, %%"REG_a" \n\t"
        "mov %4, %%"REG_b" \n\t"

        snow_vertical_compose_mmx_load(REG_b,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")

        "pcmpeqd %%mm1, %%mm1 \n\t"
        "pslld $31, %%mm1 \n\t"
        "psrld $29, %%mm1 \n\t"
        "mov %5, %%"REG_a" \n\t"

        snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_sra("3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_load(REG_a,"mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sub("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_store(REG_a,"mm1","mm3","mm5","mm7")
        "mov %3, %%"REG_c" \n\t"
        snow_vertical_compose_mmx_load(REG_b,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_c,"mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store(REG_b,"mm0","mm2","mm4","mm6")
        "mov %2, %%"REG_a" \n\t"
        snow_vertical_compose_mmx_load(REG_c,"mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_sll("2","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")

        "pcmpeqd %%mm1, %%mm1 \n\t"
        "pslld $31, %%mm1 \n\t"
        "psrld $28, %%mm1 \n\t"
        "mov %1, %%"REG_b" \n\t"

        snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_sra("4","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_b,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store(REG_a,"mm0","mm2","mm4","mm6")

        "2: \n\t"
        "sub $8, %%"REG_d" \n\t"
        "jge 1b \n\t"
        : "+d"(i)
        : "m"(b0), "m"(b1), "m"(b2), "m"(b3), "m"(b4), "m"(b5)
        : "%"REG_a, "%"REG_b, "%"REG_c);
}