You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

285 lines
8.8KB

  1. // Software scaling and colorspace conversion routines for MPlayer
  2. // Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
  3. // current version mostly by Michael Niedermayer (michaelni@gmx.at)
  4. // the parts written by michael are under GNU GPL
  5. #include <inttypes.h>
  6. #include <string.h>
  7. //#include <stdio.h> //FOR DEBUG ONLY
  8. #include "../config.h"
  9. #include "swscale.h"
  10. #include "../cpudetect.h"
  11. #undef MOVNTQ
  12. #undef PAVGB
  13. //#undef HAVE_MMX2
  14. //#undef HAVE_MMX
  15. //#undef ARCH_X86
  16. #define DITHER1XBPP
  17. int fullUVIpol=0;
  18. //disables the unscaled height version
  19. int allwaysIpol=0;
  20. #define RET 0xC3 //near return opcode
  21. /*
  22. NOTES
  23. known BUGS with known cause (no bugreports please!, but patches are welcome :) )
  24. horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11)
  25. Supported output formats BGR15 BGR16 BGR24 BGR32
  26. BGR15 & BGR16 MMX versions support dithering
  27. Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  28. TODO
  29. more intelligent misalignment avoidance for the horizontal scaler
  30. bicubic scaler
  31. dither in C
  32. change the distance of the u & v buffer
  33. how to differentiate between x86 and C at runtime ?! (using C for now)
  34. */
  35. #define ABS(a) ((a) > 0 ? (a) : (-(a)))
  36. #define MIN(a,b) ((a) > (b) ? (b) : (a))
  37. #define MAX(a,b) ((a) < (b) ? (b) : (a))
  38. #ifdef ARCH_X86
  39. #define CAN_COMPILE_X86_ASM
  40. #endif
  41. #ifdef CAN_COMPILE_X86_ASM
  42. static uint64_t __attribute__((aligned(8))) yCoeff= 0x2568256825682568LL;
  43. static uint64_t __attribute__((aligned(8))) vrCoeff= 0x3343334333433343LL;
  44. static uint64_t __attribute__((aligned(8))) ubCoeff= 0x40cf40cf40cf40cfLL;
  45. static uint64_t __attribute__((aligned(8))) vgCoeff= 0xE5E2E5E2E5E2E5E2LL;
  46. static uint64_t __attribute__((aligned(8))) ugCoeff= 0xF36EF36EF36EF36ELL;
  47. static uint64_t __attribute__((aligned(8))) bF8= 0xF8F8F8F8F8F8F8F8LL;
  48. static uint64_t __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL;
  49. static uint64_t __attribute__((aligned(8))) w400= 0x0400040004000400LL;
  50. static uint64_t __attribute__((aligned(8))) w80= 0x0080008000800080LL;
  51. static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL;
  52. static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
  53. static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
  54. static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
  55. static volatile uint64_t __attribute__((aligned(8))) b5Dither;
  56. static volatile uint64_t __attribute__((aligned(8))) g5Dither;
  57. static volatile uint64_t __attribute__((aligned(8))) g6Dither;
  58. static volatile uint64_t __attribute__((aligned(8))) r5Dither;
  59. static uint64_t __attribute__((aligned(8))) dither4[2]={
  60. 0x0103010301030103LL,
  61. 0x0200020002000200LL,};
  62. static uint64_t __attribute__((aligned(8))) dither8[2]={
  63. 0x0602060206020602LL,
  64. 0x0004000400040004LL,};
  65. static uint64_t __attribute__((aligned(8))) b16Mask= 0x001F001F001F001FLL;
  66. static uint64_t __attribute__((aligned(8))) g16Mask= 0x07E007E007E007E0LL;
  67. static uint64_t __attribute__((aligned(8))) r16Mask= 0xF800F800F800F800LL;
  68. static uint64_t __attribute__((aligned(8))) b15Mask= 0x001F001F001F001FLL;
  69. static uint64_t __attribute__((aligned(8))) g15Mask= 0x03E003E003E003E0LL;
  70. static uint64_t __attribute__((aligned(8))) r15Mask= 0x7C007C007C007C00LL;
  71. static uint64_t __attribute__((aligned(8))) M24A= 0x00FF0000FF0000FFLL;
  72. static uint64_t __attribute__((aligned(8))) M24B= 0xFF0000FF0000FF00LL;
  73. static uint64_t __attribute__((aligned(8))) M24C= 0x0000FF0000FF0000LL;
  74. static uint64_t __attribute__((aligned(8))) temp0;
  75. static uint64_t __attribute__((aligned(8))) asm_yalpha1;
  76. static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
  77. // temporary storage for 4 yuv lines:
  78. // 16bit for now (mmx likes it more compact)
  79. static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048];
  80. static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2];
  81. #else
  82. static uint16_t pix_buf_y[4][2048];
  83. static uint16_t pix_buf_uv[2][2048*2];
  84. #endif
  85. // clipping helper table for C implementations:
  86. static unsigned char clip_table[768];
  87. static unsigned short clip_table16b[768];
  88. static unsigned short clip_table16g[768];
  89. static unsigned short clip_table16r[768];
  90. static unsigned short clip_table15b[768];
  91. static unsigned short clip_table15g[768];
  92. static unsigned short clip_table15r[768];
  93. // yuv->rgb conversion tables:
  94. static int yuvtab_2568[256];
  95. static int yuvtab_3343[256];
  96. static int yuvtab_0c92[256];
  97. static int yuvtab_1a1e[256];
  98. static int yuvtab_40cf[256];
  99. #ifdef CAN_COMPILE_X86_ASM
  100. static uint8_t funnyYCode[10000];
  101. static uint8_t funnyUVCode[10000];
  102. #endif
  103. static int canMMX2BeUsed=0;
  104. #ifdef CAN_COMPILE_X86_ASM
  105. void in_asm_used_var_warning_killer()
  106. {
  107. int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
  108. bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+temp0+asm_yalpha1+ asm_uvalpha1+
  109. M24A+M24B+M24C;
  110. if(i) i=0;
  111. }
  112. #endif
  113. //Note: we have C, X86, MMX, MMX2, 3DNOW versions; there's no 3DNOW+MMX2 one
  114. //Plain C versions
  115. #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
  116. #define COMPILE_C
  117. #endif
  118. #ifdef CAN_COMPILE_X86_ASM
  119. #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
  120. #define COMPILE_MMX
  121. #endif
  122. #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
  123. #define COMPILE_MMX2
  124. #endif
  125. #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
  126. #define COMPILE_3DNOW
  127. #endif
  128. #endif //CAN_COMPILE_X86_ASM
  129. #undef HAVE_MMX
  130. #undef HAVE_MMX2
  131. #undef HAVE_3DNOW
  132. #undef ARCH_X86
  133. #ifdef COMPILE_C
  134. #undef HAVE_MMX
  135. #undef HAVE_MMX2
  136. #undef HAVE_3DNOW
  137. #undef ARCH_X86
  138. #define RENAME(a) a ## _C
  139. #include "swscale_template.c"
  140. #endif
  141. #ifdef CAN_COMPILE_X86_ASM
  142. //X86 versions
  143. /*
  144. #undef RENAME
  145. #undef HAVE_MMX
  146. #undef HAVE_MMX2
  147. #undef HAVE_3DNOW
  148. #define ARCH_X86
  149. #define RENAME(a) a ## _X86
  150. #include "swscale_template.c"
  151. */
  152. //MMX versions
  153. #ifdef COMPILE_MMX
  154. #undef RENAME
  155. #define HAVE_MMX
  156. #undef HAVE_MMX2
  157. #undef HAVE_3DNOW
  158. #define ARCH_X86
  159. #define RENAME(a) a ## _MMX
  160. #include "swscale_template.c"
  161. #endif
  162. //MMX2 versions
  163. #ifdef COMPILE_MMX2
  164. #undef RENAME
  165. #define HAVE_MMX
  166. #define HAVE_MMX2
  167. #undef HAVE_3DNOW
  168. #define ARCH_X86
  169. #define RENAME(a) a ## _MMX2
  170. #include "swscale_template.c"
  171. #endif
  172. //3DNOW versions
  173. #ifdef COMPILE_3DNOW
  174. #undef RENAME
  175. #define HAVE_MMX
  176. #undef HAVE_MMX2
  177. #define HAVE_3DNOW
  178. #define ARCH_X86
  179. #define RENAME(a) a ## _3DNow
  180. #include "swscale_template.c"
  181. #endif
  182. #endif //CAN_COMPILE_X86_ASM
  183. // minor note: the HAVE_xyz is messed up after that line so don't use it
  184. // *** bilinear scaling and yuv->rgb or yuv->yuv conversion of yv12 slices:
  185. // *** Note: it's called multiple times while decoding a frame, first time y==0
  186. // *** Designed to upscale, but may work for downscale too.
  187. // switching the cpu type during a sliced drawing can have bad effects, like sig11
  188. void SwScale_YV12slice(unsigned char* srcptr[],int stride[], int srcSliceY ,
  189. int srcSliceH, uint8_t* dstptr[], int dststride, int dstbpp,
  190. int srcW, int srcH, int dstW, int dstH){
  191. #ifdef RUNTIME_CPUDETECT
  192. #ifdef CAN_COMPILE_X86_ASM
  193. // ordered per speed fasterst first
  194. if(gCpuCaps.hasMMX2)
  195. SwScale_YV12slice_MMX2(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
  196. else if(gCpuCaps.has3DNow)
  197. SwScale_YV12slice_3DNow(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
  198. else if(gCpuCaps.hasMMX)
  199. SwScale_YV12slice_MMX(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
  200. else
  201. SwScale_YV12slice_C(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
  202. #else
  203. SwScale_YV12slice_C(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
  204. #endif
  205. #else //RUNTIME_CPUDETECT
  206. #ifdef HAVE_MMX2
  207. SwScale_YV12slice_MMX2(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
  208. #elif defined (HAVE_3DNOW)
  209. SwScale_YV12slice_3DNow(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
  210. #elif defined (HAVE_MMX)
  211. SwScale_YV12slice_MMX(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
  212. #else
  213. SwScale_YV12slice_C(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
  214. #endif
  215. #endif //!RUNTIME_CPUDETECT
  216. }
  217. void SwScale_Init(){
  218. // generating tables:
  219. int i;
  220. for(i=0;i<256;i++){
  221. clip_table[i]=0;
  222. clip_table[i+256]=i;
  223. clip_table[i+512]=255;
  224. yuvtab_2568[i]=(0x2568*(i-16))+(256<<13);
  225. yuvtab_3343[i]=0x3343*(i-128);
  226. yuvtab_0c92[i]=-0x0c92*(i-128);
  227. yuvtab_1a1e[i]=-0x1a1e*(i-128);
  228. yuvtab_40cf[i]=0x40cf*(i-128);
  229. }
  230. for(i=0; i<768; i++)
  231. {
  232. int v= clip_table[i];
  233. clip_table16b[i]= v>>3;
  234. clip_table16g[i]= (v<<3)&0x07E0;
  235. clip_table16r[i]= (v<<8)&0xF800;
  236. clip_table15b[i]= v>>3;
  237. clip_table15g[i]= (v<<2)&0x03E0;
  238. clip_table15r[i]= (v<<7)&0x7C00;
  239. }
  240. }