| 
																	
																	
																		
																	
																	
																 | 
																@@ -22,6 +22,30 @@ | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																#include "cuda/vector_helpers.cuh" | 
																 | 
																 | 
																#include "cuda/vector_helpers.cuh" | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																typedef float4 (*coeffs_function_t)(float); | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																__device__ inline float4 lanczos_coeffs(float x) | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																{ | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																    const float pi = 3.141592654f; | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																    float4 res = make_float4( | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																        pi * (x + 1), | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																        pi * x, | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																        pi * (x - 1), | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																        pi * (x - 2)); | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																    res.x = res.x == 0.0f ? 1.0f : | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																        __sinf(res.x) * __sinf(res.x / 2.0f) / (res.x * res.x / 2.0f); | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																    res.y = res.y == 0.0f ? 1.0f : | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																        __sinf(res.y) * __sinf(res.y / 2.0f) / (res.y * res.y / 2.0f); | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																    res.z = res.z == 0.0f ? 1.0f : | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																        __sinf(res.z) * __sinf(res.z / 2.0f) / (res.z * res.z / 2.0f); | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																    res.w = res.w == 0.0f ? 1.0f : | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																        __sinf(res.w) * __sinf(res.w / 2.0f) / (res.w * res.w / 2.0f); | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																    return res / (res.x + res.y + res.z + res.w); | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																} | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																__device__ inline float4 bicubic_coeffs(float x) | 
																 | 
																 | 
																__device__ inline float4 bicubic_coeffs(float x) | 
															
														
														
													
														
															
																 | 
																 | 
																{ | 
																 | 
																 | 
																{ | 
															
														
														
													
														
															
																 | 
																 | 
																    const float A = -0.75f; | 
																 | 
																 | 
																    const float A = -0.75f; | 
															
														
														
													
												
													
														
															
																| 
																	
																	
																	
																		
																	
																 | 
																@@ -35,10 +59,8 @@ __device__ inline float4 bicubic_coeffs(float x) | 
															
														
														
													
														
															
																 | 
																 | 
																    return res; | 
																 | 
																 | 
																    return res; | 
															
														
														
													
														
															
																 | 
																 | 
																} | 
																 | 
																 | 
																} | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																__device__ inline void bicubic_fast_coeffs(float x, float *h0, float *h1, float *s) | 
																 | 
																 | 
																 | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																__device__ inline void derived_fast_coeffs(float4 coeffs, float x, float *h0, float *h1, float *s) | 
															
														
														
													
														
															
																 | 
																 | 
																{ | 
																 | 
																 | 
																{ | 
															
														
														
													
														
															
																 | 
																 | 
																    float4 coeffs = bicubic_coeffs(x); | 
																 | 
																 | 
																 | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																 | 
															
														
														
													
														
															
																 | 
																 | 
																    float g0 = coeffs.x + coeffs.y; | 
																 | 
																 | 
																    float g0 = coeffs.x + coeffs.y; | 
															
														
														
													
														
															
																 | 
																 | 
																    float g1 = coeffs.z + coeffs.w; | 
																 | 
																 | 
																    float g1 = coeffs.z + coeffs.w; | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
												
													
														
															
																| 
																	
																	
																	
																		
																	
																 | 
																@@ -48,7 +70,7 @@ __device__ inline void bicubic_fast_coeffs(float x, float *h0, float *h1, float | 
															
														
														
													
														
															
																 | 
																 | 
																} | 
																 | 
																 | 
																} | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																template<typename V> | 
																 | 
																 | 
																template<typename V> | 
															
														
														
													
														
															
																 | 
																 | 
																__device__ inline V bicubic_filter(float4 coeffs, V c0, V c1, V c2, V c3) | 
																 | 
																 | 
																 | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																__device__ inline V apply_coeffs(float4 coeffs, V c0, V c1, V c2, V c3) | 
															
														
														
													
														
															
																 | 
																 | 
																{ | 
																 | 
																 | 
																{ | 
															
														
														
													
														
															
																 | 
																 | 
																    V res = c0 * coeffs.x; | 
																 | 
																 | 
																    V res = c0 * coeffs.x; | 
															
														
														
													
														
															
																 | 
																 | 
																    res  += c1 * coeffs.y; | 
																 | 
																 | 
																    res  += c1 * coeffs.y; | 
															
														
														
													
												
													
														
															
																| 
																	
																	
																	
																		
																	
																 | 
																@@ -59,7 +81,8 @@ __device__ inline V bicubic_filter(float4 coeffs, V c0, V c1, V c2, V c3) | 
															
														
														
													
														
															
																 | 
																 | 
																} | 
																 | 
																 | 
																} | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																template<typename T> | 
																 | 
																 | 
																template<typename T> | 
															
														
														
													
														
															
																 | 
																 | 
																__device__ inline void Subsample_Bicubic(cudaTextureObject_t src_tex, | 
																 | 
																 | 
																 | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																__device__ inline void Subsample_Bicubic(coeffs_function_t coeffs_function, | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																                                         cudaTextureObject_t src_tex, | 
															
														
														
													
														
															
																 | 
																 | 
																                                         T *dst, | 
																 | 
																 | 
																                                         T *dst, | 
															
														
														
													
														
															
																 | 
																 | 
																                                         int dst_width, int dst_height, int dst_pitch, | 
																 | 
																 | 
																                                         int dst_width, int dst_height, int dst_pitch, | 
															
														
														
													
														
															
																 | 
																 | 
																                                         int src_width, int src_height, | 
																 | 
																 | 
																                                         int src_width, int src_height, | 
															
														
														
													
												
													
														
															
																| 
																	
																	
																	
																		
																	
																 | 
																@@ -81,17 +104,17 @@ __device__ inline void Subsample_Bicubic(cudaTextureObject_t src_tex, | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																        float factor = bit_depth > 8 ? 0xFFFF : 0xFF; | 
																 | 
																 | 
																        float factor = bit_depth > 8 ? 0xFFFF : 0xFF; | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																        float4 coeffsX = bicubic_coeffs(fx); | 
																 | 
																 | 
																 | 
															
														
														
													
														
															
																 | 
																 | 
																        float4 coeffsY = bicubic_coeffs(fy); | 
																 | 
																 | 
																 | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																        float4 coeffsX = coeffs_function(fx); | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																        float4 coeffsY = coeffs_function(fy); | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																#define PIX(x, y) tex2D<floatT>(src_tex, (x), (y)) | 
																 | 
																 | 
																#define PIX(x, y) tex2D<floatT>(src_tex, (x), (y)) | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																        dst[yo * dst_pitch + xo] = from_floatN<T, floatT>( | 
																 | 
																 | 
																        dst[yo * dst_pitch + xo] = from_floatN<T, floatT>( | 
															
														
														
													
														
															
																 | 
																 | 
																            bicubic_filter<floatT>(coeffsY, | 
																 | 
																 | 
																 | 
															
														
														
													
														
															
																 | 
																 | 
																                bicubic_filter<floatT>(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)), | 
																 | 
																 | 
																 | 
															
														
														
													
														
															
																 | 
																 | 
																                bicubic_filter<floatT>(coeffsX, PIX(px - 1, py    ), PIX(px, py    ), PIX(px + 1, py    ), PIX(px + 2, py    )), | 
																 | 
																 | 
																 | 
															
														
														
													
														
															
																 | 
																 | 
																                bicubic_filter<floatT>(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)), | 
																 | 
																 | 
																 | 
															
														
														
													
														
															
																 | 
																 | 
																                bicubic_filter<floatT>(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2)) | 
																 | 
																 | 
																 | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																            apply_coeffs<floatT>(coeffsY, | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																                apply_coeffs<floatT>(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)), | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																                apply_coeffs<floatT>(coeffsX, PIX(px - 1, py    ), PIX(px, py    ), PIX(px + 1, py    ), PIX(px + 2, py    )), | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																                apply_coeffs<floatT>(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)), | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																                apply_coeffs<floatT>(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2)) | 
															
														
														
													
														
															
																 | 
																 | 
																            ) * factor | 
																 | 
																 | 
																            ) * factor | 
															
														
														
													
														
															
																 | 
																 | 
																        ); | 
																 | 
																 | 
																        ); | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
												
													
														
															
																| 
																	
																	
																	
																		
																	
																 | 
																@@ -101,7 +124,8 @@ __device__ inline void Subsample_Bicubic(cudaTextureObject_t src_tex, | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																/* This does not yield correct results. Most likely because of low internal precision in tex2D linear interpolation */ | 
																 | 
																 | 
																/* This does not yield correct results. Most likely because of low internal precision in tex2D linear interpolation */ | 
															
														
														
													
														
															
																 | 
																 | 
																template<typename T> | 
																 | 
																 | 
																template<typename T> | 
															
														
														
													
														
															
																 | 
																 | 
																__device__ inline void Subsample_FastBicubic(cudaTextureObject_t src_tex, | 
																 | 
																 | 
																 | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																__device__ inline void Subsample_FastBicubic(coeffs_function_t coeffs_function, | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																                                             cudaTextureObject_t src_tex, | 
															
														
														
													
														
															
																 | 
																 | 
																                                             T *dst, | 
																 | 
																 | 
																                                             T *dst, | 
															
														
														
													
														
															
																 | 
																 | 
																                                             int dst_width, int dst_height, int dst_pitch, | 
																 | 
																 | 
																                                             int dst_width, int dst_height, int dst_pitch, | 
															
														
														
													
														
															
																 | 
																 | 
																                                             int src_width, int src_height, | 
																 | 
																 | 
																                                             int src_width, int src_height, | 
															
														
														
													
												
													
														
															
																| 
																	
																	
																	
																		
																	
																 | 
																@@ -123,10 +147,13 @@ __device__ inline void Subsample_FastBicubic(cudaTextureObject_t src_tex, | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																        float factor = bit_depth > 8 ? 0xFFFF : 0xFF; | 
																 | 
																 | 
																        float factor = bit_depth > 8 ? 0xFFFF : 0xFF; | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																        float4 coeffsX = coeffs_function(fx); | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																        float4 coeffsY = coeffs_function(fy); | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																        float h0x, h1x, sx; | 
																 | 
																 | 
																        float h0x, h1x, sx; | 
															
														
														
													
														
															
																 | 
																 | 
																        float h0y, h1y, sy; | 
																 | 
																 | 
																        float h0y, h1y, sy; | 
															
														
														
													
														
															
																 | 
																 | 
																        bicubic_fast_coeffs(fx, &h0x, &h1x, &sx); | 
																 | 
																 | 
																 | 
															
														
														
													
														
															
																 | 
																 | 
																        bicubic_fast_coeffs(fy, &h0y, &h1y, &sy); | 
																 | 
																 | 
																 | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																        derived_fast_coeffs(coeffsX, fx, &h0x, &h1x, &sx); | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																        derived_fast_coeffs(coeffsY, fy, &h0y, &h1y, &sy); | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																#define PIX(x, y) tex2D<floatT>(src_tex, (x), (y)) | 
																 | 
																 | 
																#define PIX(x, y) tex2D<floatT>(src_tex, (x), (y)) | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
												
													
														
															
																| 
																	
																		
																	
																	
																		
																	
																	
																 | 
																@@ -157,7 +184,7 @@ extern "C" { | 
															
														
														
													
														
															
																 | 
																 | 
																                                            int src_width, int src_height,                \ | 
																 | 
																 | 
																                                            int src_width, int src_height,                \ | 
															
														
														
													
														
															
																 | 
																 | 
																                                            int bit_depth)                                \ | 
																 | 
																 | 
																                                            int bit_depth)                                \ | 
															
														
														
													
														
															
																 | 
																 | 
																    {                                                                                     \ | 
																 | 
																 | 
																    {                                                                                     \ | 
															
														
														
													
														
															
																 | 
																 | 
																        Subsample_Bicubic<T>(src_tex, dst,                                                \ | 
																 | 
																 | 
																 | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																        Subsample_Bicubic<T>(&bicubic_coeffs, src_tex, dst,                               \ | 
															
														
														
													
														
															
																 | 
																 | 
																                             dst_width, dst_height, dst_pitch,                            \ | 
																 | 
																 | 
																                             dst_width, dst_height, dst_pitch,                            \ | 
															
														
														
													
														
															
																 | 
																 | 
																                             src_width, src_height,                                       \ | 
																 | 
																 | 
																                             src_width, src_height,                                       \ | 
															
														
														
													
														
															
																 | 
																 | 
																                             bit_depth);                                                  \ | 
																 | 
																 | 
																                             bit_depth);                                                  \ | 
															
														
														
													
												
													
														
															
																| 
																	
																	
																	
																		
																	
																 | 
																@@ -171,4 +198,26 @@ BICUBIC_KERNEL(ushort) | 
															
														
														
													
														
															
																 | 
																 | 
																BICUBIC_KERNEL(ushort2) | 
																 | 
																 | 
																BICUBIC_KERNEL(ushort2) | 
															
														
														
													
														
															
																 | 
																 | 
																BICUBIC_KERNEL(ushort4) | 
																 | 
																 | 
																BICUBIC_KERNEL(ushort4) | 
															
														
														
													
														
															
																 | 
																 | 
																
  | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																#define LANCZOS_KERNEL(T) \ | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																    __global__ void Subsample_Lanczos_ ## T(cudaTextureObject_t src_tex,                  \ | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																                                            T *dst,                                       \ | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																                                            int dst_width, int dst_height, int dst_pitch, \ | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																                                            int src_width, int src_height,                \ | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																                                            int bit_depth)                                \ | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																    {                                                                                     \ | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																        Subsample_Bicubic<T>(&lanczos_coeffs, src_tex, dst,                               \ | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																                             dst_width, dst_height, dst_pitch,                            \ | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																                             src_width, src_height,                                       \ | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																                             bit_depth);                                                  \ | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																    } | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																LANCZOS_KERNEL(uchar) | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																LANCZOS_KERNEL(uchar2) | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																LANCZOS_KERNEL(uchar4) | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																LANCZOS_KERNEL(ushort) | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																LANCZOS_KERNEL(ushort2) | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																LANCZOS_KERNEL(ushort4) | 
															
														
														
													
														
															
																 | 
																 | 
																 | 
																 | 
																 | 
																
  | 
															
														
														
													
														
															
																 | 
																 | 
																} | 
																 | 
																 | 
																} |