You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

724 lines
23KB

  1. /*
  2. * Blackfin Pixel Operations
  3. * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "config_bfin.h"
  /*
   * put_pixels_clamped: clamp an 8x8 block of 16-bit coefficients to 0..255
   * and store the result as bytes.
   *   R0 = block     (coefficients, read through I0)
   *   R1 = dest      (byte output, written through I1)
   *   R2 = line_size (dest stride in bytes)
   * Each loop pass produces one row of 8 pixels: four 32-bit loads, vector
   * MAX/MIN against R4/R5, and two BYTEPACKed word stores.  The loop is
   * software pipelined: the loads and the first MAX for the next row are
   * issued before the current row has been fully stored.
   */
  22. DEFUN(put_pixels_clamped,mL1,
  23. (DCTELEM *block, uint8_t *dest, int line_size)):
  24. [--SP] = (R7:4);
  25. R4 = 0; // lower clamp bound (both 16-bit halves)
  26. R5.l = 0x00ff; // upper clamp bound, low half
  27. R5.h = 0x00ff; // upper clamp bound, high half
  28. I0 = R0; // block
  29. I1 = R1; // dest
  30. R2 += -4; // line_size - 4: step applied by the second store of a row
  31. M1 = R2;
  32. P0 = 8; // 8 rows
  33. R0 = [I0++]; // prime the pipeline with the first four coefficients
  34. R1 = [I0++];
  35. R2 = MAX(R0, R4) (V);
  36. LSETUP (ppc$0,ppc$1) LC0=P0;
  37. ppc$0: R2 = MIN(R2, R5) (V);
  38. R3 = MAX(R1, R4) (V);
  39. R3 = MIN(R3, R5) (V) || R0 = [I0++];
  40. R6 = BYTEPACK (R2,R3) || R1 = [I0++];
  41. R2 = MAX(R0, R4) (V) || [I1++] = R6; // store pixels 0..3 of the row
  42. R2 = MIN(R2, R5) (V);
  43. R3 = MAX(R1, R4) (V);
  44. R3 = MIN(R3, R5) (V) || R0 = [I0++];
  45. R6 = BYTEPACK (R2,R3) || R1 = [I0++];
  46. ppc$1: R2 = Max(R0, R4) (V) || [I1++M1] = R6; // store pixels 4..7, move to next row
  47. (R7:4) = [SP++];
  48. RTS;
  /*
   * add_pixels_clamped: add an 8x8 block of 16-bit coefficients to the bytes
   * already in dest and write the clipped sums back to dest.
   *   R0 = block     (coefficients, read through I3)
   *   R1 = dest      (read through I1, written through I2)
   *   R2 = line_size (dest stride in bytes)
   * BYTEOP3P is used in its (LO) and (HI) forms and the two partial results
   * are merged with a saturating add before each word store.  The loop runs
   * 8 times and stores two words (8 pixels) per pass.
   * NOTE(review): the 16-bit W[I3] loads re-pair coefficients so that each
   * BYTEOP3P sees the operands it needs; the exact pairing depends on the
   * BYTEOP3P operand layout -- confirm against the ISA reference before
   * touching the load order.
   */
  49. DEFUN(add_pixels_clamped,mL1,
  50. (DCTELEM *block, uint8_t *dest, int line_size)):
  51. [-- SP] = (R7:4);
  52. R4 = 0;
  53. I0 = 0; // presumably forces zero byte-alignment for BYTEOP3P -- TODO confirm
  54. R2 += -4; // line_size - 4: step applied by the second access of a row
  55. M0 = R2;
  56. I1 = R1; // dest (read pointer)
  57. I3 = R0; // block
  58. I2 = R1; // dest (write pointer)
  59. P0 = 8; // 8 rows
  60. M3 = 2;
  61. R0 = [I3++] || R2 = [I1];
  62. R2 = R2 << 8 || R0.H = W[I3--] || R3 = [I1++];
  63. R3 = R3 >> 8 || R1.L = W[I3] || I3 += 4;
  64. R6 = BYTEOP3P(R1:0, R3:2) (LO) || R1.H = W[I3++] || R2 = [I1];
  65. LSETUP(apc$2,apc$3) LC1 = P0;
  66. apc$2: R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++] || R3 = [I1++M0];
  67. R2 = R2 << 8 || R0.H = W[I3--];
  68. R3 = R3 >> 8 || R1.L = W[I3] || I3 += 4;
  69. R6 = R6 + R7 (S) || R1.H = W[I3]; // merge the LO and HI partial results
  70. R6 = BYTEOP3P(R1:0, R3:2) (LO) || I3+=M3 || [I2++]=R6; // store pixels 0..3
  71. R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++] || R2 = [I1];
  72. R2 = R2 << 8 || R0.H = W[I3--] || R3 = [I1++];
  73. R3 = R3 >> 8 || R1.L = W[I3] || I3 += 4;
  74. R6 = R6 + R7 (S) || R1.H = W[I3++];
  75. apc$3: R6 = BYTEOP3P(R1:0, R3:2) (LO) || [I2++M0] = R6 || R2 = [I1]; // store pixels 4..7, next row
  76. (R7:4) = [SP++];
  77. RTS;
  78. /*
  79. motion compensation
  80. primitives
  81. * Halfpel motion compensation with rounding (a+b+1)>>1.
  82. * This is an array[4][4] of motion compensation functions for 4
  83. * horizontal blocksizes (8,16) and the 4 halfpel positions
  84. * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
  85. * @param block destination where the result is stored
  86. * @param pixels source
  87. * @param line_size number of bytes in a horizontal line of block
  88. * @param h height
  89. */
  /*
   * put_pixels8uc: write the byte-wise average of two 8-pixel-wide sources
   * to block, h rows.
   *   R0 = block (dest, I3), R1 = s0 (I0), R2 = s1 (I1)
   *   stack: dest_size (dest stride), line_size (source stride), h
   * BYTEOP1P is used without the (T) option, i.e. the rounding average.
   * DISALGNEXCPT lets the 32-bit loads run on unaligned source pointers; a
   * third word per row is fetched so BYTEOP1P can extract the unaligned bytes.
   */
  90. DEFUN(put_pixels8uc,mL1,
  91. (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
  92. int dest_size, int line_size, int h)):
  93. i3=r0; // dest
  94. i0=r1; // src0
  95. i1=r2; // src1
  96. r0=[sp+12]; // dest_size (stack arguments are read before the register push)
  97. r2=[sp+16]; // line_size
  98. p0=[sp+20]; // h
  99. [--sp] = (r7:6);
  100. r0+=-4; // dest_size - 4: step applied by the second store of a row
  101. m3=r0;
  102. r2+=-8; // line_size - 8: step applied by the third load of a row
  103. m0=r2;
  104. LSETUP(pp8$0,pp8$1) LC0=P0;
  105. DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // prime: first word of row 0
  106. pp8$0: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
  107. R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++M0]|| R2 =[I1++M0];
  108. R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++] || [I3++] = R6 ;
  109. pp8$1: DISALGNEXCPT || R2 = [I1++] || [I3++M3] = R7;
  110. (r7:6) = [sp++];
  111. RTS;
  /*
   * put_pixels16uc: write the byte-wise rounding average (BYTEOP1P) of two
   * 16-pixel-wide sources to block, h rows.
   *   R0 = block (dest, I3), R1 = s0 (I0), R2 = s1 (I1)
   *   stack: dest_size (dest stride), line_size (source stride), h
   * Four words are stored per row; five words are loaded per source row so
   * unaligned sources can be handled under DISALGNEXCPT.
   */
  112. DEFUN(put_pixels16uc,mL1,
  113. (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
  114. int dest_size, int line_size, int h)):
  115. link 0;
  116. [--sp] = (r7:6);
  117. i3=r0; // dest
  118. i0=r1; // src0
  119. i1=r2; // src1
  120. r0=[fp+20]; // dest_size
  121. r2=[fp+24]; // line_size
  122. p0=[fp+28]; // h
  123. r0+=-12;
  124. m3=r0; // dest_size - 12: step applied by the fourth store of a row
  125. r2+=-16; // line_size - 16: step applied by the fifth load of a row
  126. m0=r2;
  127. LSETUP(pp16$0,pp16$1) LC0=P0;
  128. DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // prime: first word of row 0
  129. pp16$0: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
  130. R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++] || R2 =[I1++];
  131. R7 = BYTEOP1P(R1:0,R3:2)(R) || R1 = [I0++] || R3 =[I1++];
  132. [I3++] = R6;
  133. R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++M0] || R2 =[I1++M0];
  134. R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++] || [I3++] = R7 ;
  135. [I3++] = R6;
  136. pp16$1: DISALGNEXCPT || R2 = [I1++] || [I3++M3] = R7;
  137. (r7:6) = [sp++];
  138. unlink;
  139. RTS;
  /*
   * put_pixels8uc_nornd: like put_pixels8uc but BYTEOP1P is used with the
   * (T) option (truncating average, no rounding), and source and destination
   * share one stride.
   *   R0 = block (dest, I3), R1 = s0 (I0), R2 = s1 (I1)
   *   stack: line_size, h
   */
  140. DEFUN(put_pixels8uc_nornd,mL1,
  141. (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
  142. int line_size, int h)):
  143. i3=r0; // dest
  144. i0=r1; // src0
  145. i1=r2; // src1
  146. r2=[sp+12]; // line_size (stack arguments are read before the register push)
  147. p0=[sp+16]; // h
  148. [--sp] = (r7:6);
  149. r2+=-4;
  150. m3=r2; // line_size - 4: dest step after the second store of a row
  151. r2+=-4;
  152. m0=r2; // line_size - 8: source step after the third load of a row
  153. LSETUP(pp8$2,pp8$3) LC0=P0;
  154. DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // prime: first word of row 0
  155. pp8$2: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
  156. R6 = BYTEOP1P(R1:0,R3:2)(T) || R0 = [I0++M0]|| R2 =[I1++M0];
  157. R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++] || [I3++] = R6 ;
  158. pp8$3: DISALGNEXCPT || R2 = [I1++] || [I3++M3] = R7;
  159. (r7:6) = [sp++];
  160. RTS;
  /*
   * put_pixels16uc_nornd: like put_pixels16uc but BYTEOP1P is used with the
   * (T) option (truncating average, no rounding), and source and destination
   * share one stride.
   *   R0 = block (dest, I3), R1 = s0 (I0), R2 = s1 (I1)
   *   stack: line_size, h
   */
  161. DEFUN(put_pixels16uc_nornd,mL1,
  162. (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
  163. int line_size, int h)):
  164. i3=r0; // dest
  165. i0=r1; // src0
  166. i1=r2; // src1
  167. r2=[sp+12]; // line_size (stack arguments are read before the register push)
  168. p0=[sp+16]; // h
  169. [--sp] = (r7:6);
  170. r2+=-12;
  171. m3=r2; // line_size - 12: dest step after the fourth store of a row
  172. r2+=-4;
  173. m0=r2; // line_size - 16: source step after the fifth load of a row
  174. LSETUP(pp16$2,pp16$3) LC0=P0;
  175. DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // prime: first word of row 0
  176. pp16$2:
  177. DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
  178. R6 = BYTEOP1P(R1:0,R3:2)(T) || R0 = [I0++] || R2 =[I1++];
  179. R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R1 = [I0++] || R3 =[I1++];
  180. [I3++] = R6;
  181. R6 = BYTEOP1P(R1:0,R3:2)(T) || R0 = [I0++M0] || R2 =[I1++M0];
  182. R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++] || [I3++] = R7 ;
  183. [I3++] = R6;
  184. pp16$3: DISALGNEXCPT || R2 = [I1++] || [I3++M3] = R7;
  185. (r7:6) = [sp++];
  186. unlink;
  187. RTS;
  /*
   * z_put_pixels16_xy2: 16-pixel-wide half-pel (x and y) interpolation with
   * rounding, h rows.
   *   R0 = block (dest, I3), R1 = s0 (source), R2 = dest_size (dest stride)
   *   stack: line_size (source stride), h
   * I0 walks the current source row and I1 the row below it (s0 + line_size).
   * The work is done in two passes over the whole block:
   *   pass 1 stores the BYTEOP2P (RNDL) results to dest;
   *   pass 2 restarts from the saved pointers (B0/B1/B3) with both sources
   *          advanced by one byte, computes the (RNDH) results and merges
   *          them into the words already in dest with a +|+ add.
   * NOTE(review): RNDL/RNDH presumably place their results in the low/high
   * byte of each half-word so that the two passes interleave -- confirm
   * against the ISA reference.
   */
  187. DEFUN(z_put_pixels16_xy2,mL1,
  188. (uint8_t *block, const uint8_t *s0,
  189. int dest_size, int line_size, int h)):
  190. link 0;
  191. [--sp] = (r7:4);
  192. i3=r0; // dest
  193. i0=r1; // src0--> pixels
  194. i1=r1; // src1--> pixels + line_size
  195. r2+=-12;
  196. m2=r2; // m2 = dest_size - 12: dest step after the fourth store of a row
  197. r2=[fp+20];
  198. m3=r2; // line_size
  199. p0=[fp+24]; // h
  200. r2+=-16;
  201. i1+=m3; /* src1 + line_size */
  202. m0=r2; /* line_size - 16 */
  203. B0 = I0; // save the start pointers for the second pass
  204. B1 = I1;
  205. B3 = I3;
  206. DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
  207. LSETUP(LS$16E,LE$16E) LC0=P0;
  208. LS$16E: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
  209. R4 = BYTEOP2P (R3:2,R1:0) (RNDL) || R0 = [I0++] || R2 =[I1++];
  210. R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R1 = [I0++] || [I3++] = R4 ;
  211. DISALGNEXCPT || R3 = [I1++] || [I3++] = R5;
  212. R4 = BYTEOP2P (R3:2,R1:0) (RNDL) || R0 = [I0++M0]|| R2 = [I1++M0];
  213. R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++] || [I3++] = R4 ;
  214. LE$16E: DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5;
  215. M1 = 1; // second pass reads the sources one byte further on
  216. I3 = B3;
  217. I1 = B1;
  218. I0 = B0;
  219. I0 += M1;
  220. I1 += M1;
  221. DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
  222. LSETUP(LS$16O,LE$16O) LC0=P0;
  223. LS$16O: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
  224. R4 = BYTEOP2P (R3:2,R1:0) (RNDH) || R0 = [I0++] || R2 =[I1++];
  225. R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R1 = [I0++] || R6 =[I3++];
  226. R4 = R4 +|+ R6 || R7 = [I3--]; // merge with pass-1 word; I3 steps back for the store
  227. R5 = R5 +|+ R7 || [I3++] = R4;
  228. DISALGNEXCPT || R3 =[I1++] || [I3++] = R5;
  229. R4 = BYTEOP2P (R3:2,R1:0) (RNDH) || R0 = [I0++M0]|| R2 = [I1++M0];
  230. R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++] || R6 = [I3++];
  231. R4 = R4 +|+ R6 || R7 = [I3--];
  232. R5 = R5 +|+ R7 || [I3++] = R4;
  233. LE$16O: DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5;
  234. (r7:4) = [sp++];
  235. unlink;
  236. rts;
  /*
   * put_pixels16_xy2_nornd: 16-pixel-wide half-pel (x and y) interpolation
   * using the truncating BYTEOP2P options (TL/TH), h rows.  Source and
   * destination share one stride.
   *   R0 = block (dest, I3), R1 = s0 (source), R2 = line_size
   *   stack: h
   * Same two-pass structure as z_put_pixels16_xy2: pass 1 stores the (TL)
   * results, pass 2 re-reads dest, restarts the sources one byte further on
   * and merges the (TH) results with a +|+ add.
   */
  237. DEFUN(put_pixels16_xy2_nornd,mL1,
  238. (uint8_t *block, const uint8_t *s0,
  239. int line_size, int h)):
  240. link 0;
  241. [--sp] = (r7:4);
  242. i3=r0; // dest
  243. i0=r1; // src0--> pixels
  244. i1=r1; // src1--> pixels + line_size
  245. m3=r2; // line_size
  246. r2+=-12;
  247. m2=r2; // line_size - 12: dest step after the fourth store of a row
  248. r2+=-4;
  249. i1+=m3; /* src1 + line_size */
  250. m0=r2; /* line_size - 16 */
  251. p0=[fp+20]; // h
  252. B0=I0; // save the start pointers for the second pass
  253. B1=I1;
  254. B3=I3;
  255. DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
  256. LSETUP(LS$16ET,LE$16ET) LC0=P0;
  257. LS$16ET:DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
  258. R4 = BYTEOP2P (R3:2,R1:0) (TL) || R0 = [I0++] || R2 =[I1++];
  259. R5 = BYTEOP2P (R3:2,R1:0) (TL,R) || R1 = [I0++] || [I3++] = R4 ;
  260. DISALGNEXCPT || R3 = [I1++] || [I3++] = R5;
  261. R4 = BYTEOP2P (R3:2,R1:0) (TL) || R0 = [I0++M0]|| R2 = [I1++M0];
  262. R5 = BYTEOP2P (R3:2,R1:0) (TL,R) || R0 = [I0++] || [I3++] = R4 ;
  263. LE$16ET:DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5;
  264. M1 = 1; // second pass reads the sources one byte further on
  265. I3=B3;
  266. I1=B1;
  267. I0=B0;
  268. I0 += M1;
  269. I1 += M1;
  270. DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
  271. LSETUP(LS$16OT,LE$16OT) LC0=P0;
  272. LS$16OT:DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
  273. R4 = BYTEOP2P (R3:2,R1:0) (TH) || R0 = [I0++] || R2 =[I1++];
  274. R5 = BYTEOP2P (R3:2,R1:0) (TH,R) || R1 = [I0++] || R6 =[I3++];
  275. R4 = R4 +|+ R6 || R7 = [I3--]; // merge with pass-1 word; I3 steps back for the store
  276. R5 = R5 +|+ R7 || [I3++] = R4;
  277. DISALGNEXCPT || R3 =[I1++] || [I3++] = R5;
  278. R4 = BYTEOP2P (R3:2,R1:0) (TH) || R0 = [I0++M0]|| R2 = [I1++M0];
  279. R5 = BYTEOP2P (R3:2,R1:0) (TH,R) || R0 = [I0++] || R6 = [I3++];
  280. R4 = R4 +|+ R6 || R7 = [I3--];
  281. R5 = R5 +|+ R7 || [I3++] = R4;
  282. LE$16OT:DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5;
  283. (r7:4) = [sp++];
  284. unlink;
  285. rts;
  /*
   * z_put_pixels8_xy2: 8-pixel-wide half-pel (x and y) interpolation with
   * rounding, h rows.
   *   R0 = block (dest, I3), R1 = s0 (source), R2 = dest_size (dest stride)
   *   stack: line_size (source stride), h
   * I0 walks the current source row and I1 the row below it.  Pass 1 stores
   * the BYTEOP2P (RNDL) results; pass 2 restarts from the saved pointers with
   * the sources advanced by one byte and merges the (RNDH) results into the
   * words already in dest with a +|+ add.
   */
  286. DEFUN(z_put_pixels8_xy2,mL1,
  287. (uint8_t *block, const uint8_t *s0,
  288. int dest_size, int line_size, int h)):
  289. link 0;
  290. [--sp] = (r7:4);
  291. i3=r0; // dest
  292. i0=r1; // src0--> pixels
  293. i1=r1; // src1--> pixels + line_size
  294. r2+=-4;
  295. m2=r2; // m2 = dest_size - 4: dest step after the second store of a row
  296. r2=[fp+20];
  297. m3=r2; // line_size
  298. p0=[fp+24]; // h
  299. r2+=-8;
  300. i1+=m3; /* src1 + line_size */
  301. m0=r2; /* line_size - 8 */
  302. b0 = I0; // save the start pointers for the second pass
  303. b1 = I1;
  304. b3 = I3;
  305. LSETUP(LS$8E,LE$8E) LC0=P0;
  306. DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
  307. LS$8E: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
  308. R4 = BYTEOP2P (R3:2,R1:0) (RNDL) || R0 = [I0++M0] || R2 =[I1++M0];
  309. R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++] || [I3++] = R4 ;
  310. LE$8E: DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5;
  311. M1 = 1; // second pass reads the sources one byte further on
  312. I3 = b3;
  313. I1 = b1;
  314. I0 = b0;
  315. I0 += M1;
  316. I1 += M1;
  317. LSETUP(LS$8O,LE$8O) LC0=P0;
  318. DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
  319. LS$8O: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
  320. R4 = BYTEOP2P (R3:2,R1:0) (RNDH) || R0 = [I0++M0] || R2 =[I1++M0];
  321. R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++] || R6 =[I3++];
  322. R4 = R4 +|+ R6 || R7 = [I3--]; // merge with pass-1 word; I3 steps back for the store
  323. R5 = R5 +|+ R7 || [I3++] = R4;
  324. LE$8O: DISALGNEXCPT || R2 =[I1++] || [I3++M2] = R5;
  325. (r7:4) = [sp++];
  326. unlink;
  327. rts;
  /*
   * put_pixels8_xy2_nornd: 8-pixel-wide half-pel (x and y) interpolation
   * using the truncating BYTEOP2P options (TL/TH), h rows.  Source and
   * destination share one stride.
   *   R0 = block (dest, I3), R1 = s0 (source), R2 = line_size
   *   stack: h
   * Same two-pass structure as z_put_pixels8_xy2: pass 1 stores the (TL)
   * results, pass 2 restarts the sources one byte further on and merges the
   * (TH) results into dest with a +|+ add.
   */
  328. DEFUN(put_pixels8_xy2_nornd,mL1,
  329. (uint8_t *block, const uint8_t *s0, int line_size, int h)):
  330. link 0;
  331. [--sp] = (r7:4);
  332. i3=r0; // dest
  333. i0=r1; // src0--> pixels
  334. i1=r1; // src1--> pixels + line_size
  335. m3=r2; // line_size
  336. r2+=-4;
  337. m2=r2; // line_size - 4: dest step after the second store of a row
  338. r2+=-4;
  339. i1+=m3; /* src1 + line_size */
  340. m0=r2; /* line_size - 8 */
  341. p0=[fp+20]; // h
  342. b0 = I0; // save the start pointers for the second pass
  343. b1 = I1;
  344. b3 = I3;
  345. LSETUP(LS$8ET,LE$8ET) LC0=P0;
  346. DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
  347. LS$8ET: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
  348. R4 = BYTEOP2P (R3:2,R1:0) (TL) || R0 = [I0++M0] || R2 = [I1++M0];
  349. R5 = BYTEOP2P (R3:2,R1:0) (TL,R) || R0 = [I0++] || [I3++] = R4 ;
  350. LE$8ET: DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5;
  351. M1 = 1; // second pass reads the sources one byte further on
  352. I3 = b3;
  353. I1 = b1;
  354. I0 = b0;
  355. I0 += M1;
  356. I1 += M1;
  357. LSETUP(LS$8OT,LE$8OT) LC0=P0;
  358. DISALGNEXCPT || R0 = [I0++] || R2 = [I1++];
  359. LS$8OT: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
  360. R4 = BYTEOP2P (R3:2,R1:0) (TH) || R0 = [I0++M0] || R2 = [I1++M0];
  361. R5 = BYTEOP2P (R3:2,R1:0) (TH,R) || R0 = [I0++] || R6 = [I3++];
  362. R4 = R4 +|+ R6 || R7 = [I3--]; // merge with pass-1 word; I3 steps back for the store
  363. R5 = R5 +|+ R7 || [I3++] = R4;
  364. LE$8OT: DISALGNEXCPT || R2 =[I1++] || [I3++M2] = R5;
  365. (r7:4) = [sp++];
  366. unlink;
  367. rts;
  /*
   * diff_pixels: store the per-pixel difference of two 8x8 byte blocks as
   * 16-bit values in block.
   *   R0 = block (output, I3), R1 = s1 (I0), R2 = s2 (I1)
   *   stack: stride (shared by s1 and s2)
   * BYTEOP16M produces four 16-bit differences per issue; each of the 8 loop
   * passes handles one row and stores four words (8 coefficients)
   * contiguously to block.
   */
  368. DEFUN(diff_pixels,mL1,
  369. (DCTELEM *block, uint8_t *s1, uint8_t *s2, int stride)):
  370. link 0;
  371. [--sp] = (r7:4);
  372. p0=8; // 8 rows
  373. i3=r0; // block
  374. i0=r1; // s1
  375. i1=r2; // s2
  376. r2=[fp+20]; // stride
  377. r2+=-8; // stride - 8: step applied by the third load of a row
  378. m0=r2;
  379. LSETUP(.LS0,.LE0) LC0=P0;
  380. DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // prime: first word of row 0
  381. .LS0: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
  382. (R5,R4) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M0];
  383. (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++] || [I3++] = R4;
  384. DISALGNEXCPT || R2 = [I1++] || [I3++] = R5;
  385. [i3++]=r6;
  386. .LE0: [i3++]=r7;
  387. (r7:4) = [sp++];
  388. unlink;
  389. rts;
  390. /*
  391. for (i = 0; i < 16; i++) {
  392. for (j = 0; j < 16; j++) {
  393. sum += pix[j];
  394. }
  395. pix += line_size;
  396. }
  397. */
  /*
   * pix_sum: sum of all pixel values in a 16x16 block.
   *   R0 = p (first pixel), R1 = stride
   *   returns the sum in R0 (16-bit result, upper half cleared)
   * Two rows are processed per loop pass: I0 walks the even rows and I1 the
   * odd rows, so the loop runs 8 times.  BYTEOP16P adds the bytes of both
   * rows pairwise into 16-bit lanes, which are accumulated in R6 with +|+
   * and folded into one value at the end.
   */
  398. DEFUN(pix_sum,mL1,
  399. (uint8_t *p, int stride)):
  400. link 0;
  401. [--sp] = (r7:4);
  402. p0=8; // 8 passes of two rows each
  403. i0=r0; // even rows
  404. i1=r0; // odd rows (advanced by one stride below)
  405. m1=r1;
  406. r1=r1+r1;
  407. r1+=-16; // 2*stride - 16: step applied by the fifth load of a row pair
  408. m0=r1;
  409. i1+=m1;
  410. r6=0; // running sums, one per 16-bit half
  411. LSETUP(LS$PS,LE$PS) LC0=P0;
  412. DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
  413. LS$PS: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
  414. (R5,R4) = BYTEOP16P (R3:2,R1:0) || R0 = [I0++] || R2 = [I1++];
  415. r6=r6+|+r5;
  416. r6=r6+|+r4;
  417. (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R1 = [I0++] || R3 = [I1++];
  418. r6=r6+|+r5;
  419. r6=r6+|+r4;
  420. (R5,R4) = BYTEOP16P (R3:2,R1:0) || R0 = [I0++m0] || R2 = [I1++m0];
  421. r6=r6+|+r5;
  422. r6=r6+|+r4;
  423. (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R0 = [I0++] || R2 = [I1++];
  424. r6=r6+|+r5;
  425. LE$PS: r6=r6+|+r4;
  426. r0.l=r6.l+r6.h; // fold the two partial sums
  427. r0.h=0;
  428. (r7:4) = [sp++];
  429. unlink;
  430. rts;
  /*
   * get_pixels: expand an 8x8 block of bytes into 16-bit values.
   *   R0 = block     (output, written contiguously through I3)
   *   R1 = pixels    (source, read through I0)
   *   R2 = line_size (source stride in bytes)
   * BYTEUNPACK turns four bytes into four 16-bit values (two registers);
   * each of the 8 loop passes handles one row and stores four words.
   * DISALGNEXCPT lets the 32-bit loads run on an unaligned source; a third
   * word per row is fetched so the reversed BYTEUNPACK can extract bytes 4..7.
   */
  431. DEFUN(get_pixels,mL1,
  432. (DCTELEM *restrict block, const uint8_t *pixels, int line_size)):
  433. [--sp] = (r7:4);
  434. i3=r0; // dest
  435. i0=r1; // src0
  436. p0=8; // 8 rows
  437. r2+=-8; // line_size - 8: step applied by the third load of a row
  438. m0=r2;
  439. LSETUP(gp8$0,gp8$1) LC0=P0;
  440. DISALGNEXCPT || R0 = [I0++]; // prime: first two words of row 0
  441. DISALGNEXCPT || R1 = [I0++];
  442. gp8$0: (R7,R6) = byteunpack R1:0 || R0 = [I0++M0];
  443. (R5,R4) = byteunpack R1:0 (R) || R0 = [I0++] || [I3++]=R6;
  444. DISALGNEXCPT || R1 = [I0++] || [I3++]=R7;
  445. [I3++]=R4;
  446. gp8$1: [I3++]=R5; // terminator added: every other instruction in the file ends with ';'
  447. (r7:4) = [sp++];
  448. RTS;
  449. /* sad = sad16x16 (ubyte *mb, ubyte *refwin, srcwidth, refwinwidth, h) */
  450. /* 91 cycles */
  /*
   * z_sad16x16: sum of absolute differences between two 16-pixel-wide blocks
   * over h rows.
   *   R0 = blk1 (I0), R1 = blk2 (I1), R2 = dsz (stride of blk1)
   *   stack: line_size (stride of blk2), h
   *   returns the SAD in R0
   * SAA accumulates absolute byte differences into the halves of A0/A1; the
   * four partial sums are added together after the loop.
   */
  451. DEFUN(z_sad16x16,mL1,
  452. (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):
  453. link 0;
  454. I0 = R0;
  455. I1 = R1;
  456. A1 = A0 = 0;
  457. R0 = [sp+20]; // rwidth (stride of blk2)
  458. P2 = [sp+24]; // height
  459. R3 = 16;
  460. R0 = R0 - R3; // blk2 stride - 16
  461. R3 = R2 - R3; // blk1 stride - 16
  462. M1 = R0;
  463. M0 = R3;
  464. DISALGNEXCPT || R0 = [I0++] || R2 = [I1++]; // prime: first word of row 0
  465. LSETUP (s$16, e$16) LC0=P2;
  466. s$16: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
  467. SAA (R1:0,R3:2) || R0 = [I0++] || R2 = [I1++];
  468. SAA (R1:0,R3:2) (R) || R1 = [I0++] || R3 = [I1++];
  469. SAA (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M1];
  470. e$16: SAA (R1:0,R3:2) (R) || R0 = [I0++] || R2 = [I1++];
  471. R3=A1.L+A1.H, R2=A0.L+A0.H ; // fold the four 16-bit partial sums
  472. R0 = R2 + R3 ;
  473. unlink;
  474. RTS;
  475. /* sad = sad8x8 (ubyte *mb, ubyte *refwin, int srcwidth, int refwinwidth, int h) */
  476. /* 36 cycles */
  /*
   * z_sad8x8: sum of absolute differences between two 8-pixel-wide blocks
   * over h rows.
   *   R0 = blk1 (I0), R1 = blk2 (I1), R2 = dsz (stride of blk1)
   *   stack: line_size (stride of blk2), h
   *   returns the SAD in R0
   * The first two words of each source row are loaded ahead of the SAA
   * instructions that consume them; the loads for the next row are issued at
   * the bottom of the loop.
   */
  477. DEFUN(z_sad8x8,mL1,
  478. (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):
  479. I0 = R0;
  480. I1 = R1;
  481. A1 = A0 = 0;
  482. r0 = [sp+12]; // rwidth (stride of blk2)
  483. P2 = [sp+16]; //height
  484. R3 = 8;
  485. R0 = R0 - R3; // blk2 stride - 8
  486. R3 = R2 - R3; // blk1 stride - 8
  487. M0 = R3;
  488. M1 = R0;
  489. LSETUP (s$8, e$8) LC0=P2;
  490. DISALGNEXCPT || R0 = [I0++] || R2 = [I1++]; // prime: first two words of row 0
  491. DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
  492. s$8: SAA (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M1];
  493. SAA (R1:0,R3:2) (R) || R0 = [I0++] || R2 = [I1++];
  494. e$8: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
  495. R3=A1.L+A1.H, R2=A0.L+A0.H ; // fold the four 16-bit partial sums
  496. R0 = R2 + R3 ;
  497. RTS;
  /*
   * pix_norm1: sum of squared pixel values over a 16x16 block.
   *   R0 = pix (first pixel), R1 = line_size
   *   returns the sum in R0 (A0 + A1, saturating add)
   * Each of the 16 loop passes handles one row: BYTEUNPACK expands four bytes
   * to 16-bit values, which are squared and accumulated in A0/A1.
   */
  498. DEFUN(pix_norm1,mL1,
  499. (uint8_t * pix, int line_size)):
  500. [--SP]=(R7:4,P5:3);
  501. // Fetch the input arguments.
  502. P1 = R0; // pix
  503. P0 = R1; // line_size
  504. P5 = 16; // loop ctr.
  505. P0 -= P5;
  506. M0 = P0; // M0 = line_size-16;
  507. // Now for the real work.
  508. A1 = A0 = 0;
  509. lsetup(_pix_norm1_blkfn_loopStart, _pix_norm1_blkfn_loopEnd) LC1 = P5;
  510. I0 = P1;
  511. DISALGNEXCPT || r0 = [i0++]; // prime: first word of row 0
  512. _pix_norm1_blkfn_loopStart:
  513. // unpack the 16 bytes of the current row, four at a time, and accumulate their squares
  514. DISALGNEXCPT || r1 = [i0++];
  515. (r5, r4) = byteunpack r1:0 || r0 = [i0++];
  516. a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
  517. a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
  518. (r5, r4) = byteunpack r1:0(r) || r1 = [i0++];
  519. a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
  520. a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
  521. (r5, r4) = byteunpack r1:0 || r0 = [i0++M0]; // fifth word of the row, then step to the next row
  522. a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
  523. a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
  524. (r5, r4) = byteunpack r1:0(r) || r0 = [i0++];
  525. a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
  526. _pix_norm1_blkfn_loopEnd:
  527. a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
  528. // Clean up at the end:
  529. R2 = A0, R3 = A1;
  530. R0 = R2 + R3 (S);
  531. (R7:4,P5:3)=[SP++];
  532. RTS;
  /*
   * sse4: sum of squared differences between two 4-pixel-wide blocks over
   * h rows.
   *   R0 = v (unused), R1 = pix1 (I0), R2 = pix2 (I1)
   *   stack: line_size (shared stride), h
   *   returns the sum in R0 (A0 + A1)
   * Per row: the first word is already in R0/R2 (prefetched), the second
   * word is loaded into R1/R3 for unaligned extraction, BYTEOP16M forms the
   * four 16-bit differences, and they are squared into A0/A1.
   *
   * Fix: the first word of the NEXT row is now prefetched into R0/R2 in
   * parallel with BYTEOP16M, as sse8 and sse16 do.  Previously R0/R2 were
   * loaded once before the loop, so every row after the first reused row 0's
   * first word, and the pointers advanced by only line_size - 4 per pass.
   * With the prefetch the per-pass advance is (line_size - 4) + 4 = line_size.
   * NOTE(review): as in the original, BYTEOP16M executes while I0/I1 point
   * into the next row, so line_size is assumed to be a multiple of 4.
   */
  533. DEFUN(sse4,mL1,
  534. (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
  535. link 0;
  536. [--sp] = (r7:6);
  537. p0=[fp+24]; // h
  538. i0=r1; // pix1
  539. i1=r2; // pix2
  540. r2=[fp+20]; // line_size
  541. r2+=-4; // line_size - 4: step applied by the second load of a row
  542. m0=r2;
  543. a0=a1=0;
  544. LSETUP(.S40,.E40) LC0=P0;
  545. DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // prime: first word of row 0
  546. .S40: DISALGNEXCPT || R1 = [I0++M0] || R3 = [I1++M0];
  547. (R7,R6) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++] || R2 = [I1++];
  548. a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
  549. .E40: a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
  550. a0 += a1;
  551. r0 = a0;
  552. (r7:6) = [sp++];
  553. unlink;
  554. rts;
  /*
   * sse8: sum of squared differences between two 8-pixel-wide blocks over
   * h rows.
   *   R0 = v (unused), R1 = pix1 (I0), R2 = pix2 (I1)
   *   stack: line_size (shared stride), h
   *   returns the sum in R0 (A0 + A1)
   * BYTEOP16M forms four 16-bit differences per issue; they are squared and
   * accumulated in A0/A1.  The first word of the next row is prefetched at
   * the end of each pass.
   */
  555. DEFUN(sse8,mL1,
  556. (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
  557. link 0;
  558. [--sp] = (r7:6);
  559. p0=[fp+24]; // h
  560. i0=r1; // pix1
  561. i1=r2; // pix2
  562. r2=[fp+20]; // line_size
  563. r2+=-8; // line_size - 8: step applied by the third load of a row
  564. m0=r2;
  565. a0=a1=0;
  566. LSETUP(.S80,.E80) LC0=P0;
  567. DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // prime: first word of row 0
  568. .S80: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
  569. (R7,R6) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M0];
  570. a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
  571. a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
  572. (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++] || R2 = [I1++];
  573. a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
  574. .E80: a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
  575. a0 += a1;
  576. r0 = a0;
  577. (r7:6) = [sp++];
  578. unlink;
  579. rts;
  /*
   * sse16: sum of squared differences between two 16-pixel-wide blocks over
   * h rows.
   *   R0 = v (unused), R1 = pix1 (I0), R2 = pix2 (I1)
   *   stack: line_size (shared stride), h
   *   returns the sum in R0 (A0 + A1)
   * Four BYTEOP16M issues per row each yield four 16-bit differences, which
   * are squared and accumulated in A0/A1.  The first word of the next row is
   * prefetched at the end of each pass.
   */
  580. DEFUN(sse16,mL1,
  581. (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
  582. link 0;
  583. [--sp] = (r7:6);
  584. p0=[fp+24]; // h
  585. i0=r1; // pix1
  586. i1=r2; // pix2
  587. r2=[fp+20]; // line_size
  588. r2+=-16; // line_size - 16: step applied by the fifth load of a row
  589. m0=r2;
  590. a0=a1=0;
  591. DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // prime: first word of row 0
  592. LSETUP(.S160,.E160) LC0=P0;
  593. .S160: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
  594. (R7,R6) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++] || R2 = [I1++];
  595. a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
  596. a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
  597. (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R1 = [I0++] || R3 = [I1++];
  598. a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
  599. a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
  600. (R7,R6) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M0];
  601. a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
  602. a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
  603. (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++] || R2 = [I1++];
  604. a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
  605. .E160: a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
  606. a0 += a1;
  607. r0 = a0;
  608. (r7:6) = [sp++];
  609. unlink;
  610. rts;