You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

115 lines
3.9KB

  1. /*
  2. * Optimization of some functions from mpegvideo.c for armv5te
  3. * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "config.h"
  22. #include "asm.S"
  /*
   * Special optimized version of dct_unquantize_h263_helper_c. It
   * requires the block to be at least 8-byte aligned, and may process
   * more elements than requested. It is, however, guaranteed never to
   * process more than 64 elements provided that the count argument is
   * <= 64, so it is safe. This function is optimized for a common
   * distribution of values for nCoeffs (mostly a multiple of 8 plus one
   * or two extra elements): it processes 8 elements per loop iteration
   * and has an optional step at the end that processes 2 more elements.
   *
   * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
   */
  /*
   * dequant_t: dequantize the signed 16-bit value held in the TOP
   * halfword of \src.
   *
   *   \dst  non-zero input: top16(\src) * bottom16(\mul) +/- \add, where
   *         \add is added for a positive input and subtracted for a
   *         negative one. Zero input: \dst is not written.
   *   \tmp  scratch; holds the signed addend, or the raw rsbs result
   *         when the input compares equal to ip.
   *
   * The sign test compares against ip, which the function in this file
   * sets to 0 before using the macro. Condition flags are modified.
   */
  .macro  dequant_t       dst, src, mul, add, tmp
          rsbs            \tmp, ip, \src, asr #16     /* tmp = top16(src) - ip, flags = sign/zero */
          it              gt
          movgt           \tmp, \add                  /* positive: addend is +add */
          it              lt
          rsblt           \tmp, \add, #0              /* negative: addend is -add */
          it              ne
          smlatbne        \dst, \src, \mul, \tmp      /* dst = top16(src) * bottom16(mul) + tmp */
  .endm
  /*
   * dequant_b: dequantize the signed 16-bit value held in the BOTTOM
   * halfword of \src.
   *
   *   \dst  non-zero input: bottom16(\src) * bottom16(\mul) +/- \add,
   *         where \add is added for a positive input and subtracted for
   *         a negative one. Zero input: \dst is not written.
   *   \tmp  scratch; holds the signed addend, or the raw rsbs result
   *         when the shifted input compares equal to ip.
   *
   * The bottom halfword is shifted into the top 16 bits so that the N and
   * Z flags describe it alone. The comparison is against ip, which the
   * function in this file sets to 0 before using the macro. Condition
   * flags are modified.
   */
  .macro  dequant_b       dst, src, mul, add, tmp
          rsbs            \tmp, ip, \src, lsl #16     /* tmp = (src << 16) - ip, flags = sign/zero */
          it              gt
          movgt           \tmp, \add                  /* positive: addend is +add */
          it              lt
          rsblt           \tmp, \add, #0              /* negative: addend is -add */
          it              ne
          smlabbne        \dst, \src, \mul, \tmp      /* dst = bottom16(src) * bottom16(mul) + tmp */
  .endm
  /*
   * ff_dct_unquantize_h263_armv5te
   *
   * Register usage, as read from the code below:
   *   r0      pointer to the 16-bit coefficients; they are rewritten in
   *           place and r0 is post-incremented past each stored value
   *   r1      multiplier (only its bottom halfword is used)
   *   r2      addend: added to positive products, subtracted from
   *           negative ones
   *   r3      element count
   *   r4-r7   coefficient pairs fetched with ldrd
   *   r8, r9, lr  scratch / results
   *   ip      constant 0 consumed by the dequant_* macros
   * r4-r9 and lr are saved on entry and restored on return.
   * NOTE(review): presumably called from C as
   * (int16_t *block, int qmul, int qadd, int count) -- confirm against
   * the caller.
   *
   * Per element: a zero coefficient is stored back as zero; otherwise the
   * stored halfword is the low 16 bits of coeff * r1 +/- r2.
   *
   * Flow: the main loop handles 8 elements per pass while more than two
   * elements remain beyond what has been processed; the tail at 2:
   * always handles exactly two elements. A count that is not of the form
   * 8*k or 8*k + 1..2 therefore touches more elements than requested.
   */
  function ff_dct_unquantize_h263_armv5te, export=1
          push            {r4-r9,lr}
          mov             ip, #0                      /* zero reference for dequant_t/dequant_b */
          subs            r3, r3, #2                  /* r3 = count - 2 */
          ble             2f                          /* count <= 2: only the two-element tail */
          /*
           * ldrd is written with both destination registers; the
           * one-register shorthand is not accepted by every assembler.
           */
          ldrd            r4, r5, [r0, #0]
  1:
          ldrd            r6, r7, [r0, #8]            /* second half of this group of 8 */

          /*
           * Top halfwords first: dequant_b is invoked with dst == src and
           * so overwrites the packed source word.
           */
          dequant_t       r9, r4, r1, r2, r9
          dequant_t       lr, r5, r1, r2, lr
          dequant_b       r4, r4, r1, r2, r8
          dequant_b       r5, r5, r1, r2, r8

          /* bottom-halfword result goes to the lower address of each pair */
          strh            r4, [r0], #2
          strh            r9, [r0], #2
          strh            r5, [r0], #2
          strh            lr, [r0], #2

          dequant_t       r9, r6, r1, r2, r9
          dequant_t       lr, r7, r1, r2, lr
          dequant_b       r6, r6, r1, r2, r8
          dequant_b       r7, r7, r1, r2, r8

          strh            r6, [r0], #2
          strh            r9, [r0], #2
          strh            r7, [r0], #2
          strh            lr, [r0], #2

          subs            r3, r3, #8                  /* 8 elements done */
          it              gt
          ldrdgt          r4, r5, [r0, #0]            /* load data early to avoid load/use pipeline stall */
          bgt             1b

          adds            r3, r3, #2                  /* undo the initial bias */
          it              le
          pople           {r4-r9,pc}                  /* nothing left: return */
  2:
          /* tail: two elements, handled one halfword at a time */
          ldrsh           r9, [r0, #0]
          ldrsh           lr, [r0, #2]

          mov             r8, r2                      /* addend = +r2 ... */
          cmp             r9, #0
          it              lt
          rsblt           r8, r2, #0                  /* ... or -r2 for a negative coefficient */
          it              ne
          smlabbne        r9, r9, r1, r8              /* zero coefficients are left as zero */

          mov             r8, r2
          cmp             lr, #0
          it              lt
          rsblt           r8, r2, #0
          it              ne
          smlabbne        lr, lr, r1, r8

          strh            r9, [r0], #2
          strh            lr, [r0], #2
          pop             {r4-r9,pc}
  endfunc