You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

118 lines
4.0KB

  1. /*
  2. * Optimization of some functions from mpegvideo.c for armv5te
  3. * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "config.h"
  22. #include "asm.S"
  23. /*
  24. * Special optimized version of dct_unquantize_h263_helper_c, it
  25. * requires the block to be at least 8 bytes aligned, and may process
  26. * more elements than requested. But it is guaranteed to never
  27. * process more than 64 elements provided that count argument is <= 64,
  28. * so it is safe. This function is optimized for a common distribution
  29. * of values for nCoeffs (they are mostly a multiple of 8 plus one or
  30. * two extra elements). So this function processes data as 8 elements
  31. * per loop iteration and contains optional 2 elements processing in
  32. * the end.
  33. *
  34. * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
  35. */
  /*
   * Register use, as read from the code below:
   *   r0  = pointer to the 16-bit coefficients; they are rewritten in place
   *         and r0 is post-incremented by 2 after every store
   *   r1  = multiplier; only its bottom 16 bits are used (smlabb/smlatb)
   *   r2  = offset: +r2 is added to the product of a positive coefficient,
   *         -r2 to the product of a negative one
   *         (r1/r2 are presumably qmul/qadd of the C reference -- TODO
   *         confirm against the caller)
   *   r3  = number of coefficients requested
   *   ip  = constant 0, so that "rsbs rd, ip, x" yields x and sets the flags
   *   r4-r7      = four words holding eight packed 16-bit coefficients
   *   r8, r9, lr = temporaries for the signed offset / the result
   * A coefficient equal to zero is written back as zero.
   * The conditional instructions after each rsbs/cmp do not set flags, so
   * all of them test the sign of the same coefficient; do not reorder them.
   */
  36. function ff_dct_unquantize_h263_armv5te, export=1
  37. push {r4-r9,lr} /* save r4-r9 and the return address; lr is reused as a temporary */
  38. mov ip, #0 /* ip = 0, constant operand of the rsbs sign tests below */
  39. subs r3, r3, #2 /* r3 = count - 2, flags set for the test below */
  40. ble 2f /* count <= 2: skip the 8-wide loop, handle one pair at 2: */
  41. ldrd r4, [r0, #0] /* r4,r5 = first four coefficients of the group */
  42. 1: /* main loop: eight coefficients per iteration */
  43. ldrd r6, [r0, #8] /* r6,r7 = last four coefficients of the group */
  44. rsbs r9, ip, r4, asr #16 /* r9 = top half of r4, sign-extended; flags = its sign */
  45. addgt r9, r2, #0 /* positive: r9 = +r2 */
  46. rsblt r9, r2, #0 /* negative: r9 = -r2 */
  47. smlatbne r9, r4, r1, r9 /* non-zero: r9 = top(r4) * r1 + r9; zero leaves r9 = 0 */
  48. rsbs lr, ip, r5, asr #16 /* same for the top half of r5, result in lr */
  49. addgt lr, r2, #0 /* positive: lr = +r2 */
  50. rsblt lr, r2, #0 /* negative: lr = -r2 */
  51. smlatbne lr, r5, r1, lr /* non-zero: lr = top(r5) * r1 + lr */
  52. rsbs r8, ip, r4, asl #16 /* flags = sign of the bottom half of r4 (shifted to the top) */
  53. addgt r8, r2, #0 /* positive: r8 = +r2 */
  54. rsblt r8, r2, #0 /* negative: r8 = -r2 */
  55. smlabbne r4, r4, r1, r8 /* non-zero: r4 = bottom(r4) * r1 + r8; if zero, r4's low half is already 0 */
  56. rsbs r8, ip, r5, asl #16 /* same for the bottom half of r5 */
  57. addgt r8, r2, #0 /* positive: r8 = +r2 */
  58. rsblt r8, r2, #0 /* negative: r8 = -r2 */
  59. smlabbne r5, r5, r1, r8 /* non-zero: r5 = bottom(r5) * r1 + r8 */
  60. strh r4, [r0], #2 /* store low 16 bits of each result, r0 += 2: bottom of r4 */
  61. strh r9, [r0], #2 /* top of r4 */
  62. strh r5, [r0], #2 /* bottom of r5 */
  63. strh lr, [r0], #2 /* top of r5 */
  64. rsbs r9, ip, r6, asr #16 /* second half of the group: top half of r6 */
  65. addgt r9, r2, #0 /* positive: r9 = +r2 */
  66. rsblt r9, r2, #0 /* negative: r9 = -r2 */
  67. smlatbne r9, r6, r1, r9 /* non-zero: r9 = top(r6) * r1 + r9 */
  68. rsbs lr, ip, r7, asr #16 /* top half of r7, result in lr */
  69. addgt lr, r2, #0 /* positive: lr = +r2 */
  70. rsblt lr, r2, #0 /* negative: lr = -r2 */
  71. smlatbne lr, r7, r1, lr /* non-zero: lr = top(r7) * r1 + lr */
  72. rsbs r8, ip, r6, asl #16 /* flags = sign of the bottom half of r6 */
  73. addgt r8, r2, #0 /* positive: r8 = +r2 */
  74. rsblt r8, r2, #0 /* negative: r8 = -r2 */
  75. smlabbne r6, r6, r1, r8 /* non-zero: r6 = bottom(r6) * r1 + r8 */
  76. rsbs r8, ip, r7, asl #16 /* flags = sign of the bottom half of r7 */
  77. addgt r8, r2, #0 /* positive: r8 = +r2 */
  78. rsblt r8, r2, #0 /* negative: r8 = -r2 */
  79. smlabbne r7, r7, r1, r8 /* non-zero: r7 = bottom(r7) * r1 + r8 */
  80. strh r6, [r0], #2 /* bottom of r6 */
  81. strh r9, [r0], #2 /* top of r6 */
  82. strh r7, [r0], #2 /* bottom of r7 */
  83. strh lr, [r0], #2 /* top of r7; r0 now points at the next group */
  84. subs r3, r3, #8 /* eight coefficients done; flags drive the next three instructions */
  85. ldrgtd r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
  86. bgt 1b /* r3 > 0: process another group of eight */
  87. adds r3, r3, #2 /* undo the initial -2: r3 = coefficients still requested */
  88. pople {r4-r9,pc} /* nothing left: restore registers and return */
  89. 2: /* tail: always processes exactly two coefficients */
  90. ldrsh r9, [r0, #0] /* r9 = first coefficient, sign-extended */
  91. ldrsh lr, [r0, #2] /* lr = second coefficient, sign-extended */
  92. mov r8, r2 /* assume positive: offset = +r2 */
  93. cmp r9, #0 /* flags = sign of the first coefficient */
  94. rsblt r8, r2, #0 /* negative: offset = -r2 */
  95. smlabbne r9, r9, r1, r8 /* non-zero: r9 = r9 * r1 + offset; zero stays 0 */
  96. mov r8, r2 /* same sequence for the second coefficient */
  97. cmp lr, #0 /* flags = sign of the second coefficient */
  98. rsblt r8, r2, #0 /* negative: offset = -r2 */
  99. smlabbne lr, lr, r1, r8 /* non-zero: lr = lr * r1 + offset */
  100. strh r9, [r0], #2 /* write back the first result, r0 += 2 */
  101. strh lr, [r0], #2 /* write back the second result, r0 += 2 */
  102. pop {r4-r9,pc} /* restore registers and return */
  103. .endfunc