You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

441 lines
15KB

  1. /*
  2. * aligned/packed access motion
  3. *
  4. * Copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
  5. *
  6. * This file is part of FFmpeg.
  7. *
  8. * FFmpeg is free software; you can redistribute it and/or
  9. * modify it under the terms of the GNU Lesser General Public
  10. * License as published by the Free Software Foundation; either
  11. * version 2.1 of the License, or (at your option) any later version.
  12. *
  13. * FFmpeg is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. * Lesser General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with FFmpeg; if not, write to the Free Software
  20. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. */
  22. #include "libavcodec/avcodec.h"
  23. #include "libavcodec/dsputil.h"
  24. #include "dsputil_sh4.h"
  /* Access the 32-bit word at byte address p: LP for writing, LPC for reading. */
  #define LP(p)  (*(uint32_t *)(p))
  #define LPC(p) (*(const uint32_t *)(p))

  /*
   * Split the packed words tt0 and tt1 into two partial sums:
   *   ph = (tt0 with the 0x03 mask cleared, >> 2) + (same for tt1)
   *   pl = (tt0 & the 0x03 mask)                  + (same for tt1)
   * Each input expression is evaluated exactly once.
   * NOTE(review): BYTE_VEC32 is defined outside this file; presumably it
   * replicates its byte argument into every byte of a 32-bit word - confirm.
   */
  #define UNPACK(ph,pl,tt0,tt1) do { \
      uint32_t unpack_a_ = (tt0), unpack_b_ = (tt1); \
      (ph) = ((unpack_a_ & ~BYTE_VEC32(0x03)) >> 2) + ((unpack_b_ & ~BYTE_VEC32(0x03)) >> 2); \
      (pl) = (unpack_a_ & BYTE_VEC32(0x03)) + (unpack_b_ & BYTE_VEC32(0x03)); } while (0)

  /*
   * Recombine two UNPACK results (ph,pl) and (nph,npl): the high parts are
   * added, and the sum of the low parts plus a bias (0x02 for rnd, 0x01 for
   * no_rnd) is shifted right by 2, masked with 0x03 and added on top.
   */
  #define rnd_PACK(ph,pl,nph,npl) \
      ((ph) + (nph) + ((((pl) + (npl) + BYTE_VEC32(0x02)) >> 2) & BYTE_VEC32(0x03)))
  #define no_rnd_PACK(ph,pl,nph,npl) \
      ((ph) + (nph) + ((((pl) + (npl) + BYTE_VEC32(0x01)) >> 2) & BYTE_VEC32(0x03)))

  /*
   * little endian
   * MERGE1 builds a 32-bit word from a shifted right by ofs bytes and b shifted
   * left by the remaining 4-ofs bytes; MERGE2 does the same with ofs+1 bytes.
   * ofs==0 (MERGE1) and ofs==3 (MERGE2) are special-cased so that no shift by
   * the full 32 bits is ever evaluated.
   */
  #define MERGE1(a,b,ofs) (((ofs) == 0) ? (a) : (((a) >> (8 * (ofs))) | ((b) << (32 - 8 * (ofs)))))
  #define MERGE2(a,b,ofs) (((ofs) == 3) ? (b) : (((a) >> (8 * ((ofs) + 1))) | ((b) << (32 - 8 * ((ofs) + 1)))))
  /* big endian
  #define MERGE1(a,b,ofs) (ofs==0)?a:( ((a)<<(8*ofs))|((b)>>(32-8*ofs)) )
  #define MERGE2(a,b,ofs) (ofs==3)?b:( ((a)<<(8+8*ofs))|((b)>>(32-8-8*ofs)) )
  */

  /* Store operations selected through OP: plain store, or rnd_avg32 with the destination. */
  #define put(d,s) ((d) = (s))
  #define avg(d,s) ((d) = rnd_avg32((s), (d)))
  /*
   * Row loops for a 4-byte-wide block. Both macros expect `dest`, `ref`,
   * `stride` and `height` in the enclosing scope, plus an OP macro (put or avg),
   * and both modify dest, ref and height.
   * The loops are do/while(--height), so height must be at least 1 on entry.
   *
   * OP_C4(ofs): ref is moved back by ofs bytes, then each output word is
   * merged from the two words at ref and ref+4.
   * Wrapped in do { } while (0) so the expansion is a single statement.
   */
  #define OP_C4(ofs) \
      do { \
          ref -= (ofs); \
          do { \
              OP(LP(dest), MERGE1(LPC(ref), LPC(ref + 4), ofs)); \
              ref  += stride; \
              dest += stride; \
          } while (--height); \
      } while (0)

  /* OP_C40(): same loop without the merge; one word is read from ref per row. */
  #define OP_C40() \
      do { \
          OP(LP(dest), LPC(ref)); \
          ref  += stride; \
          dest += stride; \
      } while (--height)
  #define OP put
  /**
   * Store a 4-byte-wide block of `height` rows from ref into dest.
   *
   * The low two bits of the ref address select the row loop: OP_C40 when they
   * are zero, otherwise OP_C4 with that byte offset.
   *
   * @param dest   destination; written one 32-bit word per row
   * @param ref    source; read as 32-bit words
   * @param stride byte step applied to both dest and ref after each row
   * @param height number of rows; the loops are do/while(--height), so it
   *               must be at least 1
   */
  static void put_pixels4_c(uint8_t *dest,const uint8_t *ref, const int stride,int height)
  {
      /* uintptr_t rather than int: an int cast truncates the pointer where
       * pointers are wider than int. Only the low two bits are needed. */
      switch ((uintptr_t)ref & 3) {
      case 0: OP_C40(); return;
      case 1: OP_C4(1); return;
      case 2: OP_C4(2); return;
      case 3: OP_C4(3); return;
      }
  }
  #undef OP
  #define OP avg
  /**
   * Average a 4-byte-wide block of `height` rows from ref into dest
   * (each stored word is rnd_avg32 of the source word and the old dest word).
   *
   * The low two bits of the ref address select the row loop: OP_C40 when they
   * are zero, otherwise OP_C4 with that byte offset.
   *
   * @param dest   destination; read and written one 32-bit word per row
   * @param ref    source; read as 32-bit words
   * @param stride byte step applied to both dest and ref after each row
   * @param height number of rows; the loops are do/while(--height), so it
   *               must be at least 1
   */
  static void avg_pixels4_c(uint8_t *dest,const uint8_t *ref, const int stride,int height)
  {
      /* uintptr_t rather than int: an int cast truncates the pointer where
       * pointers are wider than int. Only the low two bits are needed. */
      switch ((uintptr_t)ref & 3) {
      case 0: OP_C40(); return;
      case 1: OP_C4(1); return;
      case 2: OP_C4(2); return;
      case 3: OP_C4(3); return;
      }
  }
  #undef OP
  /*
   * Plain copy/average of an 8- or 16-byte-wide block when ref is `ofs` bytes
   * past a word boundary. ref is moved back by ofs, then every output word is
   * merged from two neighbouring source words, so sz/4 + 1 words are read per
   * row. Uses dest, ref, stride, height and OP from the expansion site.
   * The avg2 argument is accepted for signature parity with the other OP_*
   * macros and is not used here.
   */
  #define OP_C(ofs,sz,avg2) \
  { \
      ref -= ofs; \
      do { \
          uint32_t wa = LPC(ref); \
          uint32_t wb = LPC(ref + 4); \
          OP(LP(dest), MERGE1(wa, wb, ofs)); \
          wa = LPC(ref + 8); \
          OP(LP(dest + 4), MERGE1(wb, wa, ofs)); \
          if (sz == 16) { \
              wb = LPC(ref + 12); \
              OP(LP(dest + 8), MERGE1(wa, wb, ofs)); \
              wa = LPC(ref + 16); \
              OP(LP(dest + 12), MERGE1(wb, wa, ofs)); \
          } \
          ref  += stride; \
          dest += stride; \
      } while (--height); \
  }
  /*
   * Aligned variant of OP_C: no merge is needed, so each output word is taken
   * directly from the source word at the same offset. avg2 is unused.
   */
  #define OP_C0(sz,avg2) \
  { \
      do { \
          OP(LP(dest),     LPC(ref)); \
          OP(LP(dest + 4), LPC(ref + 4)); \
          if (sz == 16) { \
              OP(LP(dest + 8),  LPC(ref + 8)); \
              OP(LP(dest + 12), LPC(ref + 12)); \
          } \
          ref  += stride; \
          dest += stride; \
      } while (--height); \
  }
  /*
   * Horizontal interpolation: every output word is avg2() of the source word
   * starting at byte offset ofs (MERGE1) and the one starting at ofs+1
   * (MERGE2). ref is first moved back by ofs so that all loads use the same
   * word grid. Uses dest, ref, stride, height and OP from the expansion site.
   */
  #define OP_X(ofs,sz,avg2) \
  { \
      ref -= ofs; \
      do { \
          uint32_t wa = LPC(ref); \
          uint32_t wb = LPC(ref + 4); \
          OP(LP(dest), avg2(MERGE1(wa, wb, ofs), MERGE2(wa, wb, ofs))); \
          wa = LPC(ref + 8); \
          OP(LP(dest + 4), avg2(MERGE1(wb, wa, ofs), MERGE2(wb, wa, ofs))); \
          if (sz == 16) { \
              wb = LPC(ref + 12); \
              OP(LP(dest + 8), avg2(MERGE1(wa, wb, ofs), MERGE2(wa, wb, ofs))); \
              wa = LPC(ref + 16); \
              OP(LP(dest + 12), avg2(MERGE1(wb, wa, ofs), MERGE2(wb, wa, ofs))); \
          } \
          ref  += stride; \
          dest += stride; \
      } while (--height); \
  }
  /*
   * Vertical interpolation, aligned source: every output word is avg2() of the
   * word from the previous row and the word from the current row. The first
   * row is loaded before the loop; after each store the current word becomes
   * the "previous" one, so each source row is loaded only once and height + 1
   * rows are read in total. above2/above3 are only set and used when sz is 16.
   */
  #define OP_Y0(sz,avg2) \
  { \
      uint32_t above0, above1, above2, above3, cur; \
      above0 = LPC(ref); \
      above1 = LPC(ref + 4); \
      if (sz == 16) { \
          above2 = LPC(ref + 8); \
          above3 = LPC(ref + 12); \
      } \
      do { \
          ref += stride; \
          cur = LPC(ref); \
          OP(LP(dest), avg2(above0, cur)); \
          above0 = cur; \
          cur = LPC(ref + 4); \
          OP(LP(dest + 4), avg2(above1, cur)); \
          above1 = cur; \
          if (sz == 16) { \
              cur = LPC(ref + 8); \
              OP(LP(dest + 8), avg2(above2, cur)); \
              above2 = cur; \
              cur = LPC(ref + 12); \
              OP(LP(dest + 12), avg2(above3, cur)); \
              above3 = cur; \
          } \
          dest += stride; \
      } while (--height); \
  }
  /*
   * Vertical interpolation when ref is `ofs` bytes past a word boundary.
   * Same scheme as OP_Y0, except that each row word is first assembled with
   * MERGE1 from two neighbouring source words (ref is moved back by ofs up
   * front). The merged words of the previous row are carried in above0..3.
   */
  #define OP_Y(ofs,sz,avg2) \
  { \
      uint32_t above0, above1, above2, above3, cur, wa, wb; \
      ref -= ofs; \
      wa = LPC(ref); \
      wb = LPC(ref + 4); \
      above0 = MERGE1(wa, wb, ofs); \
      wa = LPC(ref + 8); \
      above1 = MERGE1(wb, wa, ofs); \
      if (sz == 16) { \
          wb = LPC(ref + 12); \
          above2 = MERGE1(wa, wb, ofs); \
          wa = LPC(ref + 16); \
          above3 = MERGE1(wb, wa, ofs); \
      } \
      do { \
          ref += stride; \
          wa = LPC(ref); \
          wb = LPC(ref + 4); \
          cur = MERGE1(wa, wb, ofs); \
          OP(LP(dest), avg2(above0, cur)); \
          above0 = cur; \
          wa = LPC(ref + 8); \
          cur = MERGE1(wb, wa, ofs); \
          OP(LP(dest + 4), avg2(above1, cur)); \
          above1 = cur; \
          if (sz == 16) { \
              wb = LPC(ref + 12); \
              cur = MERGE1(wa, wb, ofs); \
              OP(LP(dest + 8), avg2(above2, cur)); \
              above2 = cur; \
              wa = LPC(ref + 16); \
              cur = MERGE1(wb, wa, ofs); \
              OP(LP(dest + 12), avg2(above3, cur)); \
              above3 = cur; \
          } \
          dest += stride; \
      } while (--height); \
  }
  /* The aligned x and xy cases reuse the general macros with an offset of 0. */
  #define OP_X0(sz,avg2) OP_X(0,sz,avg2)
  #define OP_XY0(sz,PACK) OP_XY(0,sz,PACK)

  /*
   * Two-dimensional interpolation. For every row, the word at byte offset ofs
   * (MERGE1) and the word at ofs+1 (MERGE2) are fed to UNPACK, giving a
   * high/low partial-sum pair per output word. PACK then combines the pair
   * kept from the previous row with the pair of the current row, and the
   * current pair is carried over to the next iteration.
   * hiN/loN hold the carried pairs for output word N; nhi/nlo hold the pair
   * just computed. Pairs 2 and 3 are only set and used when sz is 16.
   */
  #define OP_XY(ofs,sz,PACK) \
  { \
      uint32_t nhi, nlo, wa, wb; \
      uint32_t hi0, lo0, hi1, lo1, hi2, lo2, hi3, lo3; \
      ref -= ofs; \
      wa = LPC(ref); \
      wb = LPC(ref + 4); \
      UNPACK(hi0, lo0, MERGE1(wa, wb, ofs), MERGE2(wa, wb, ofs)); \
      wa = LPC(ref + 8); \
      UNPACK(hi1, lo1, MERGE1(wb, wa, ofs), MERGE2(wb, wa, ofs)); \
      if (sz == 16) { \
          wb = LPC(ref + 12); \
          UNPACK(hi2, lo2, MERGE1(wa, wb, ofs), MERGE2(wa, wb, ofs)); \
          wa = LPC(ref + 16); \
          UNPACK(hi3, lo3, MERGE1(wb, wa, ofs), MERGE2(wb, wa, ofs)); \
      } \
      do { \
          ref += stride; \
          wa = LPC(ref); \
          wb = LPC(ref + 4); \
          UNPACK(nhi, nlo, MERGE1(wa, wb, ofs), MERGE2(wa, wb, ofs)); \
          OP(LP(dest), PACK(hi0, lo0, nhi, nlo)); \
          hi0 = nhi; \
          lo0 = nlo; \
          wa = LPC(ref + 8); \
          UNPACK(nhi, nlo, MERGE1(wb, wa, ofs), MERGE2(wb, wa, ofs)); \
          OP(LP(dest + 4), PACK(hi1, lo1, nhi, nlo)); \
          hi1 = nhi; \
          lo1 = nlo; \
          if (sz == 16) { \
              wb = LPC(ref + 12); \
              UNPACK(nhi, nlo, MERGE1(wa, wb, ofs), MERGE2(wa, wb, ofs)); \
              OP(LP(dest + 8), PACK(hi2, lo2, nhi, nlo)); \
              hi2 = nhi; \
              lo2 = nlo; \
              wa = LPC(ref + 16); \
              UNPACK(nhi, nlo, MERGE1(wb, wa, ofs), MERGE2(wb, wa, ofs)); \
              OP(LP(dest + 12), PACK(hi3, lo3, nhi, nlo)); \
              hi3 = nhi; \
              lo3 = nlo; \
          } \
          dest += stride; \
      } while (--height); \
  }
  /*
   * Define static void <op>_<rnd>_pixels<sz>_<xy>(dest, ref, stride, height).
   *
   * The generated function switches on the low two bits of the ref address:
   * 0 expands OP_N##0 (the aligned variant), 1..3 expand OP_N with that byte
   * offset. rnd##_##avgfunc is passed through as the averaging/packing
   * operation. The OP macro must be defined (put or avg) at the point where
   * DEFFUNC is instantiated.
   *
   * The address is converted through uintptr_t rather than int, so the cast
   * does not truncate the pointer where pointers are wider than int.
   */
  #define DEFFUNC(op,rnd,xy,sz,OP_N,avgfunc) \
  static void op##_##rnd##_pixels##sz##_##xy (uint8_t * dest, const uint8_t * ref, \
                                              const int stride, int height) \
  { \
      switch ((uintptr_t)ref & 3) { \
      case 0: OP_N##0(sz, rnd##_##avgfunc); return; \
      case 1: OP_N(1, sz, rnd##_##avgfunc); return; \
      case 2: OP_N(2, sz, rnd##_##avgfunc); return; \
      case 3: OP_N(3, sz, rnd##_##avgfunc); return; \
      } \
  }
  249. #define OP put
  250. DEFFUNC(put, rnd,o,8,OP_C,avg32)
  251. DEFFUNC(put, rnd,x,8,OP_X,avg32)
  252. DEFFUNC(put,no_rnd,x,8,OP_X,avg32)
  253. DEFFUNC(put, rnd,y,8,OP_Y,avg32)
  254. DEFFUNC(put,no_rnd,y,8,OP_Y,avg32)
  255. DEFFUNC(put, rnd,xy,8,OP_XY,PACK)
  256. DEFFUNC(put,no_rnd,xy,8,OP_XY,PACK)
  257. DEFFUNC(put, rnd,o,16,OP_C,avg32)
  258. DEFFUNC(put, rnd,x,16,OP_X,avg32)
  259. DEFFUNC(put,no_rnd,x,16,OP_X,avg32)
  260. DEFFUNC(put, rnd,y,16,OP_Y,avg32)
  261. DEFFUNC(put,no_rnd,y,16,OP_Y,avg32)
  262. DEFFUNC(put, rnd,xy,16,OP_XY,PACK)
  263. DEFFUNC(put,no_rnd,xy,16,OP_XY,PACK)
  264. #undef OP
  265. #define OP avg
  266. DEFFUNC(avg, rnd,o,8,OP_C,avg32)
  267. DEFFUNC(avg, rnd,x,8,OP_X,avg32)
  268. DEFFUNC(avg,no_rnd,x,8,OP_X,avg32)
  269. DEFFUNC(avg, rnd,y,8,OP_Y,avg32)
  270. DEFFUNC(avg,no_rnd,y,8,OP_Y,avg32)
  271. DEFFUNC(avg, rnd,xy,8,OP_XY,PACK)
  272. DEFFUNC(avg,no_rnd,xy,8,OP_XY,PACK)
  273. DEFFUNC(avg, rnd,o,16,OP_C,avg32)
  274. DEFFUNC(avg, rnd,x,16,OP_X,avg32)
  275. DEFFUNC(avg,no_rnd,x,16,OP_X,avg32)
  276. DEFFUNC(avg, rnd,y,16,OP_Y,avg32)
  277. DEFFUNC(avg,no_rnd,y,16,OP_Y,avg32)
  278. DEFFUNC(avg, rnd,xy,16,OP_XY,PACK)
  279. DEFFUNC(avg,no_rnd,xy,16,OP_XY,PACK)
  280. #undef OP
  /*
   * Name aliases. Only the rnd flavour of the plain copy ("o") functions is
   * instantiated, so the no_rnd "o" names and the *_c names all map onto it.
   */

  /* no_rnd plain copy -> rnd plain copy */
  #define put_no_rnd_pixels8_o   put_rnd_pixels8_o
  #define put_no_rnd_pixels16_o  put_rnd_pixels16_o
  #define avg_no_rnd_pixels8_o   avg_rnd_pixels8_o
  #define avg_no_rnd_pixels16_o  avg_rnd_pixels16_o

  /* *_pixelsN_c names -> rnd plain copy */
  #define put_pixels8_c          put_rnd_pixels8_o
  #define put_pixels16_c         put_rnd_pixels16_o
  #define avg_pixels8_c          avg_rnd_pixels8_o
  #define avg_pixels16_c         avg_rnd_pixels16_o

  /* *_no_rnd_pixelsN_c names -> rnd plain copy */
  #define put_no_rnd_pixels8_c   put_rnd_pixels8_o
  #define put_no_rnd_pixels16_c  put_rnd_pixels16_o
  #define avg_no_rnd_pixels8_c   avg_rnd_pixels8_o
  #define avg_no_rnd_pixels16_c  avg_rnd_pixels16_o
  293. #define QPEL
  294. #ifdef QPEL
  295. #include "qpel.c"
  296. #endif
  297. void dsputil_init_align(DSPContext* c, AVCodecContext *avctx)
  298. {
  299. const int high_bit_depth = avctx->bits_per_raw_sample > 8;
  300. if (!high_bit_depth) {
  301. c->put_pixels_tab[0][0] = put_rnd_pixels16_o;
  302. c->put_pixels_tab[0][1] = put_rnd_pixels16_x;
  303. c->put_pixels_tab[0][2] = put_rnd_pixels16_y;
  304. c->put_pixels_tab[0][3] = put_rnd_pixels16_xy;
  305. c->put_pixels_tab[1][0] = put_rnd_pixels8_o;
  306. c->put_pixels_tab[1][1] = put_rnd_pixels8_x;
  307. c->put_pixels_tab[1][2] = put_rnd_pixels8_y;
  308. c->put_pixels_tab[1][3] = put_rnd_pixels8_xy;
  309. c->put_no_rnd_pixels_tab[0][0] = put_no_rnd_pixels16_o;
  310. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x;
  311. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y;
  312. c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy;
  313. c->put_no_rnd_pixels_tab[1][0] = put_no_rnd_pixels8_o;
  314. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x;
  315. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y;
  316. c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy;
  317. c->avg_pixels_tab[0][0] = avg_rnd_pixels16_o;
  318. c->avg_pixels_tab[0][1] = avg_rnd_pixels16_x;
  319. c->avg_pixels_tab[0][2] = avg_rnd_pixels16_y;
  320. c->avg_pixels_tab[0][3] = avg_rnd_pixels16_xy;
  321. c->avg_pixels_tab[1][0] = avg_rnd_pixels8_o;
  322. c->avg_pixels_tab[1][1] = avg_rnd_pixels8_x;
  323. c->avg_pixels_tab[1][2] = avg_rnd_pixels8_y;
  324. c->avg_pixels_tab[1][3] = avg_rnd_pixels8_xy;
  325. c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_o;
  326. c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x;
  327. c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y;
  328. c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy;
  329. c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_o;
  330. c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x;
  331. c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y;
  332. c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy;
  333. }
  334. #ifdef QPEL
  335. #define dspfunc(PFX, IDX, NUM) \
  336. c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_sh4; \
  337. c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_sh4; \
  338. c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_sh4; \
  339. c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_sh4; \
  340. c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_sh4; \
  341. c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_sh4; \
  342. c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_sh4; \
  343. c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_sh4; \
  344. c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_sh4; \
  345. c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_sh4; \
  346. c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_sh4; \
  347. c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_sh4; \
  348. c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_sh4; \
  349. c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_sh4; \
  350. c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_sh4; \
  351. c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_sh4
  352. dspfunc(put_qpel, 0, 16);
  353. dspfunc(put_no_rnd_qpel, 0, 16);
  354. dspfunc(avg_qpel, 0, 16);
  355. /* dspfunc(avg_no_rnd_qpel, 0, 16); */
  356. dspfunc(put_qpel, 1, 8);
  357. dspfunc(put_no_rnd_qpel, 1, 8);
  358. dspfunc(avg_qpel, 1, 8);
  359. /* dspfunc(avg_no_rnd_qpel, 1, 8); */
  360. if (!high_bit_depth) {
  361. dspfunc(put_h264_qpel, 0, 16);
  362. dspfunc(put_h264_qpel, 1, 8);
  363. dspfunc(put_h264_qpel, 2, 4);
  364. dspfunc(avg_h264_qpel, 0, 16);
  365. dspfunc(avg_h264_qpel, 1, 8);
  366. dspfunc(avg_h264_qpel, 2, 4);
  367. }
  368. #undef dspfunc
  369. if (!high_bit_depth) {
  370. c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_sh4;
  371. c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_sh4;
  372. c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_sh4;
  373. c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_sh4;
  374. c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_sh4;
  375. c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_sh4;
  376. }
  377. c->put_mspel_pixels_tab[0]= put_mspel8_mc00_sh4;
  378. c->put_mspel_pixels_tab[1]= put_mspel8_mc10_sh4;
  379. c->put_mspel_pixels_tab[2]= put_mspel8_mc20_sh4;
  380. c->put_mspel_pixels_tab[3]= put_mspel8_mc30_sh4;
  381. c->put_mspel_pixels_tab[4]= put_mspel8_mc02_sh4;
  382. c->put_mspel_pixels_tab[5]= put_mspel8_mc12_sh4;
  383. c->put_mspel_pixels_tab[6]= put_mspel8_mc22_sh4;
  384. c->put_mspel_pixels_tab[7]= put_mspel8_mc32_sh4;
  385. c->gmc1 = gmc1_c;
  386. c->gmc = gmc_c;
  387. #endif
  388. }