You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

275 lines
8.5KB

  1. /*
  2. * Copyright (c) 2012 Stefano Sabatini
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. /**
  21. * @file
  22. * flite voice synth source
  23. */
  24. #include <flite/flite.h>
  25. #include "libavutil/audioconvert.h"
  26. #include "libavutil/file.h"
  27. #include "libavutil/opt.h"
  28. #include "avfilter.h"
  29. #include "audio.h"
  30. #include "formats.h"
  31. #include "internal.h"
  32. typedef struct {
  33. const AVClass *class;
  34. char *voice_str;
  35. char *textfile;
  36. char *text;
  37. cst_wave *wave;
  38. int16_t *wave_samples;
  39. int wave_nb_samples;
  40. int list_voices;
  41. cst_voice *voice;
  42. int64_t pts;
  43. int frame_nb_samples; ///< number of samples per frame
  44. } FliteContext;
  45. #define OFFSET(x) offsetof(FliteContext, x)
  46. static const AVOption flite_options[] = {
  47. { "list_voices", "list voices and exit", OFFSET(list_voices), AV_OPT_TYPE_INT, {.dbl=0}, 0, 1 },
  48. { "nb_samples", "set number of samples per frame", OFFSET(frame_nb_samples), AV_OPT_TYPE_INT, {.dbl=512}, 0, INT_MAX },
  49. { "n", "set number of samples per frame", OFFSET(frame_nb_samples), AV_OPT_TYPE_INT, {.dbl=512}, 0, INT_MAX },
  50. { "text", "set text to speak", OFFSET(text), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX },
  51. { "textfile", "set filename of the text to speak", OFFSET(textfile), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX },
  52. { "v", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, CHAR_MIN, CHAR_MAX },
  53. { "voice", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, CHAR_MIN, CHAR_MAX },
  54. { NULL }
  55. };
  56. AVFILTER_DEFINE_CLASS(flite);
  57. static volatile int flite_inited = 0;
  58. /* declare functions for all the supported voices */
  59. #define DECLARE_REGISTER_VOICE_FN(name) cst_voice *register_cmu_us_## name(const char *)
  60. DECLARE_REGISTER_VOICE_FN(awb);
  61. DECLARE_REGISTER_VOICE_FN(kal);
  62. DECLARE_REGISTER_VOICE_FN(kal16);
  63. DECLARE_REGISTER_VOICE_FN(rms);
  64. DECLARE_REGISTER_VOICE_FN(slt);
  65. struct voice_entry {
  66. const char *name;
  67. cst_voice * (*register_fn)(const char *);
  68. } voice_entry;
  69. static struct voice_entry voice_entries[] = {
  70. { "awb", register_cmu_us_awb },
  71. { "kal", register_cmu_us_kal },
  72. { "kal16", register_cmu_us_kal16 },
  73. { "rms", register_cmu_us_rms },
  74. { "slt", register_cmu_us_slt },
  75. };
  76. static void list_voices(void *log_ctx, const char *sep)
  77. {
  78. int i, n = FF_ARRAY_ELEMS(voice_entries);
  79. for (i = 0; i < n; i++)
  80. av_log(log_ctx, AV_LOG_INFO, "%s%s",
  81. voice_entries[i].name, i < (n-1) ? sep : "\n");
  82. }
  83. static int select_voice(cst_voice **voice, const char *voice_name, void *log_ctx)
  84. {
  85. int i;
  86. for (i = 0; i < FF_ARRAY_ELEMS(voice_entries); i++) {
  87. struct voice_entry *entry = &voice_entries[i];
  88. if (!strcmp(entry->name, voice_name)) {
  89. *voice = entry->register_fn(NULL);
  90. if (!*voice) {
  91. av_log(log_ctx, AV_LOG_ERROR,
  92. "Could not register voice '%s'\n", voice_name);
  93. return AVERROR_UNKNOWN;
  94. }
  95. return 0;
  96. }
  97. }
  98. av_log(log_ctx, AV_LOG_ERROR, "Could not find voice '%s'\n", voice_name);
  99. av_log(log_ctx, AV_LOG_INFO, "Choose between the voices: ");
  100. list_voices(log_ctx, ", ");
  101. return AVERROR(EINVAL);
  102. }
  103. static av_cold int init(AVFilterContext *ctx, const char *args)
  104. {
  105. FliteContext *flite = ctx->priv;
  106. int ret = 0;
  107. flite->class = &flite_class;
  108. av_opt_set_defaults(flite);
  109. if ((ret = av_set_options_string(flite, args, "=", ":")) < 0) {
  110. av_log(ctx, AV_LOG_ERROR, "Error parsing options string: '%s'\n", args);
  111. return ret;
  112. }
  113. if (flite->list_voices) {
  114. list_voices(ctx, "\n");
  115. return AVERROR_EXIT;
  116. }
  117. if (!flite_inited) {
  118. if (flite_init() < 0) {
  119. av_log(ctx, AV_LOG_ERROR, "flite initialization failed\n");
  120. return AVERROR_UNKNOWN;
  121. }
  122. flite_inited++;
  123. }
  124. if ((ret = select_voice(&flite->voice, flite->voice_str, ctx)) < 0)
  125. return ret;
  126. if (flite->textfile && flite->text) {
  127. av_log(ctx, AV_LOG_ERROR,
  128. "Both text and textfile options set: only one must be specified\n");
  129. return AVERROR(EINVAL);
  130. }
  131. if (flite->textfile) {
  132. uint8_t *textbuf;
  133. size_t textbuf_size;
  134. if ((ret = av_file_map(flite->textfile, &textbuf, &textbuf_size, 0, ctx)) < 0) {
  135. av_log(ctx, AV_LOG_ERROR,
  136. "The text file '%s' could not be read: %s\n",
  137. flite->textfile, av_err2str(ret));
  138. return ret;
  139. }
  140. if (!(flite->text = av_malloc(textbuf_size+1)))
  141. return AVERROR(ENOMEM);
  142. memcpy(flite->text, textbuf, textbuf_size);
  143. flite->text[textbuf_size] = 0;
  144. av_file_unmap(textbuf, textbuf_size);
  145. }
  146. if (!flite->text) {
  147. av_log(ctx, AV_LOG_ERROR,
  148. "No speech text specified, specify the 'text' or 'textfile' option\n");
  149. return AVERROR(EINVAL);
  150. }
  151. /* synth all the file data in block */
  152. flite->wave = flite_text_to_wave(flite->text, flite->voice);
  153. flite->wave_samples = flite->wave->samples;
  154. flite->wave_nb_samples = flite->wave->num_samples;
  155. return 0;
  156. }
  157. static av_cold void uninit(AVFilterContext *ctx)
  158. {
  159. FliteContext *flite = ctx->priv;
  160. av_opt_free(flite);
  161. delete_voice(flite->voice);
  162. flite->voice = NULL;
  163. delete_wave(flite->wave);
  164. flite->wave = NULL;
  165. }
  166. static int query_formats(AVFilterContext *ctx)
  167. {
  168. FliteContext *flite = ctx->priv;
  169. AVFilterChannelLayouts *chlayouts = NULL;
  170. int64_t chlayout = av_get_default_channel_layout(flite->wave->num_channels);
  171. AVFilterFormats *sample_formats = NULL;
  172. AVFilterFormats *sample_rates = NULL;
  173. ff_add_channel_layout(&chlayouts, chlayout);
  174. ff_set_common_channel_layouts(ctx, chlayouts);
  175. ff_add_format(&sample_formats, AV_SAMPLE_FMT_S16);
  176. ff_set_common_formats(ctx, sample_formats);
  177. ff_add_format(&sample_rates, flite->wave->sample_rate);
  178. ff_set_common_samplerates (ctx, sample_rates);
  179. return 0;
  180. }
  181. static int config_props(AVFilterLink *outlink)
  182. {
  183. AVFilterContext *ctx = outlink->src;
  184. FliteContext *flite = ctx->priv;
  185. outlink->sample_rate = flite->wave->sample_rate;
  186. outlink->time_base = (AVRational){1, flite->wave->sample_rate};
  187. av_log(ctx, AV_LOG_VERBOSE, "voice:%s fmt:%s sample_rate:%d\n",
  188. flite->voice_str,
  189. av_get_sample_fmt_name(outlink->format), outlink->sample_rate);
  190. return 0;
  191. }
  192. static int request_frame(AVFilterLink *outlink)
  193. {
  194. AVFilterBufferRef *samplesref;
  195. FliteContext *flite = outlink->src->priv;
  196. int nb_samples = FFMIN(flite->wave_nb_samples, flite->frame_nb_samples);
  197. if (!nb_samples)
  198. return AVERROR_EOF;
  199. samplesref = ff_get_audio_buffer(outlink, AV_PERM_WRITE, nb_samples);
  200. if (!samplesref)
  201. return AVERROR(ENOMEM);
  202. memcpy(samplesref->data[0], flite->wave_samples,
  203. nb_samples * flite->wave->num_channels * 2);
  204. samplesref->pts = flite->pts;
  205. samplesref->pos = -1;
  206. samplesref->audio->sample_rate = flite->wave->sample_rate;
  207. flite->pts += nb_samples;
  208. flite->wave_samples += nb_samples * flite->wave->num_channels;
  209. flite->wave_nb_samples -= nb_samples;
  210. return ff_filter_samples(outlink, samplesref);
  211. }
  212. AVFilter avfilter_asrc_flite = {
  213. .name = "flite",
  214. .description = NULL_IF_CONFIG_SMALL("Synthesize voice from text using libflite."),
  215. .query_formats = query_formats,
  216. .init = init,
  217. .uninit = uninit,
  218. .priv_size = sizeof(FliteContext),
  219. .inputs = (const AVFilterPad[]) {{ .name = NULL}},
  220. .outputs = (const AVFilterPad[]) {
  221. {
  222. .name = "default",
  223. .type = AVMEDIA_TYPE_AUDIO,
  224. .config_props = config_props,
  225. .request_frame = request_frame,
  226. },
  227. { .name = NULL }
  228. },
  229. };