You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

290 lines
9.1KB

  1. /*
  2. * Copyright (c) 2012 Stefano Sabatini
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. /**
  21. * @file
  22. * flite voice synth source
  23. */
  24. #include <flite/flite.h>
  25. #include "libavutil/audioconvert.h"
  26. #include "libavutil/file.h"
  27. #include "libavutil/opt.h"
  28. #include "avfilter.h"
  29. #include "audio.h"
  30. #include "formats.h"
  31. #include "internal.h"
  32. typedef struct {
  33. const AVClass *class;
  34. char *voice_str;
  35. char *textfile;
  36. char *text;
  37. cst_wave *wave;
  38. int16_t *wave_samples;
  39. int wave_nb_samples;
  40. int list_voices;
  41. cst_voice *voice;
  42. struct voice_entry *voice_entry;
  43. int64_t pts;
  44. int frame_nb_samples; ///< number of samples per frame
  45. } FliteContext;
  46. #define OFFSET(x) offsetof(FliteContext, x)
  47. static const AVOption flite_options[] = {
  48. { "list_voices", "list voices and exit", OFFSET(list_voices), AV_OPT_TYPE_INT, {.dbl=0}, 0, 1 },
  49. { "nb_samples", "set number of samples per frame", OFFSET(frame_nb_samples), AV_OPT_TYPE_INT, {.dbl=512}, 0, INT_MAX },
  50. { "n", "set number of samples per frame", OFFSET(frame_nb_samples), AV_OPT_TYPE_INT, {.dbl=512}, 0, INT_MAX },
  51. { "text", "set text to speak", OFFSET(text), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX },
  52. { "textfile", "set filename of the text to speak", OFFSET(textfile), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX },
  53. { "v", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, CHAR_MIN, CHAR_MAX },
  54. { "voice", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, CHAR_MIN, CHAR_MAX },
  55. { NULL }
  56. };
  57. AVFILTER_DEFINE_CLASS(flite);
  /* Non-zero once flite_init() has succeeded, so that init() calls it only once. */
  static volatile int flite_inited;
  59. /* declare functions for all the supported voices */
  60. #define DECLARE_REGISTER_VOICE_FN(name) \
  61. cst_voice *register_cmu_us_## name(const char *); \
  62. void unregister_cmu_us_## name(cst_voice *);
  63. DECLARE_REGISTER_VOICE_FN(awb);
  64. DECLARE_REGISTER_VOICE_FN(kal);
  65. DECLARE_REGISTER_VOICE_FN(kal16);
  66. DECLARE_REGISTER_VOICE_FN(rms);
  67. DECLARE_REGISTER_VOICE_FN(slt);
  68. struct voice_entry {
  69. const char *name;
  70. cst_voice * (*register_fn)(const char *);
  71. void (*unregister_fn)(cst_voice *);
  72. cst_voice *voice;
  73. unsigned usage_count;
  74. } voice_entry;
  75. #define MAKE_VOICE_STRUCTURE(voice_name) { \
  76. .name = #voice_name, \
  77. .register_fn = register_cmu_us_ ## voice_name, \
  78. .unregister_fn = unregister_cmu_us_ ## voice_name, \
  79. }
  80. static struct voice_entry voice_entries[] = {
  81. MAKE_VOICE_STRUCTURE(awb),
  82. MAKE_VOICE_STRUCTURE(kal),
  83. MAKE_VOICE_STRUCTURE(kal16),
  84. MAKE_VOICE_STRUCTURE(rms),
  85. MAKE_VOICE_STRUCTURE(slt),
  86. };
  87. static void list_voices(void *log_ctx, const char *sep)
  88. {
  89. int i, n = FF_ARRAY_ELEMS(voice_entries);
  90. for (i = 0; i < n; i++)
  91. av_log(log_ctx, AV_LOG_INFO, "%s%s",
  92. voice_entries[i].name, i < (n-1) ? sep : "\n");
  93. }
  94. static int select_voice(struct voice_entry **entry_ret, const char *voice_name, void *log_ctx)
  95. {
  96. int i;
  97. for (i = 0; i < FF_ARRAY_ELEMS(voice_entries); i++) {
  98. struct voice_entry *entry = &voice_entries[i];
  99. if (!strcmp(entry->name, voice_name)) {
  100. if (!entry->voice)
  101. entry->voice = entry->register_fn(NULL);
  102. if (!entry->voice) {
  103. av_log(log_ctx, AV_LOG_ERROR,
  104. "Could not register voice '%s'\n", voice_name);
  105. return AVERROR_UNKNOWN;
  106. }
  107. entry->usage_count++;
  108. *entry_ret = entry;
  109. return 0;
  110. }
  111. }
  112. av_log(log_ctx, AV_LOG_ERROR, "Could not find voice '%s'\n", voice_name);
  113. av_log(log_ctx, AV_LOG_INFO, "Choose between the voices: ");
  114. list_voices(log_ctx, ", ");
  115. return AVERROR(EINVAL);
  116. }
  117. static av_cold int init(AVFilterContext *ctx, const char *args)
  118. {
  119. FliteContext *flite = ctx->priv;
  120. int ret = 0;
  121. flite->class = &flite_class;
  122. av_opt_set_defaults(flite);
  123. if ((ret = av_set_options_string(flite, args, "=", ":")) < 0)
  124. return ret;
  125. if (flite->list_voices) {
  126. list_voices(ctx, "\n");
  127. return AVERROR_EXIT;
  128. }
  129. if (!flite_inited) {
  130. if (flite_init() < 0) {
  131. av_log(ctx, AV_LOG_ERROR, "flite initialization failed\n");
  132. return AVERROR_UNKNOWN;
  133. }
  134. flite_inited++;
  135. }
  136. if ((ret = select_voice(&flite->voice_entry, flite->voice_str, ctx)) < 0)
  137. return ret;
  138. flite->voice = flite->voice_entry->voice;
  139. if (flite->textfile && flite->text) {
  140. av_log(ctx, AV_LOG_ERROR,
  141. "Both text and textfile options set: only one must be specified\n");
  142. return AVERROR(EINVAL);
  143. }
  144. if (flite->textfile) {
  145. uint8_t *textbuf;
  146. size_t textbuf_size;
  147. if ((ret = av_file_map(flite->textfile, &textbuf, &textbuf_size, 0, ctx)) < 0) {
  148. av_log(ctx, AV_LOG_ERROR,
  149. "The text file '%s' could not be read: %s\n",
  150. flite->textfile, av_err2str(ret));
  151. return ret;
  152. }
  153. if (!(flite->text = av_malloc(textbuf_size+1)))
  154. return AVERROR(ENOMEM);
  155. memcpy(flite->text, textbuf, textbuf_size);
  156. flite->text[textbuf_size] = 0;
  157. av_file_unmap(textbuf, textbuf_size);
  158. }
  159. if (!flite->text) {
  160. av_log(ctx, AV_LOG_ERROR,
  161. "No speech text specified, specify the 'text' or 'textfile' option\n");
  162. return AVERROR(EINVAL);
  163. }
  164. /* synth all the file data in block */
  165. flite->wave = flite_text_to_wave(flite->text, flite->voice);
  166. flite->wave_samples = flite->wave->samples;
  167. flite->wave_nb_samples = flite->wave->num_samples;
  168. return 0;
  169. }
  170. static av_cold void uninit(AVFilterContext *ctx)
  171. {
  172. FliteContext *flite = ctx->priv;
  173. av_opt_free(flite);
  174. if (!--flite->voice_entry->usage_count)
  175. flite->voice_entry->unregister_fn(flite->voice);
  176. flite->voice = NULL;
  177. flite->voice_entry = NULL;
  178. delete_wave(flite->wave);
  179. flite->wave = NULL;
  180. }
  181. static int query_formats(AVFilterContext *ctx)
  182. {
  183. FliteContext *flite = ctx->priv;
  184. AVFilterChannelLayouts *chlayouts = NULL;
  185. int64_t chlayout = av_get_default_channel_layout(flite->wave->num_channels);
  186. AVFilterFormats *sample_formats = NULL;
  187. AVFilterFormats *sample_rates = NULL;
  188. ff_add_channel_layout(&chlayouts, chlayout);
  189. ff_set_common_channel_layouts(ctx, chlayouts);
  190. ff_add_format(&sample_formats, AV_SAMPLE_FMT_S16);
  191. ff_set_common_formats(ctx, sample_formats);
  192. ff_add_format(&sample_rates, flite->wave->sample_rate);
  193. ff_set_common_samplerates (ctx, sample_rates);
  194. return 0;
  195. }
  196. static int config_props(AVFilterLink *outlink)
  197. {
  198. AVFilterContext *ctx = outlink->src;
  199. FliteContext *flite = ctx->priv;
  200. outlink->sample_rate = flite->wave->sample_rate;
  201. outlink->time_base = (AVRational){1, flite->wave->sample_rate};
  202. av_log(ctx, AV_LOG_VERBOSE, "voice:%s fmt:%s sample_rate:%d\n",
  203. flite->voice_str,
  204. av_get_sample_fmt_name(outlink->format), outlink->sample_rate);
  205. return 0;
  206. }
  207. static int request_frame(AVFilterLink *outlink)
  208. {
  209. AVFilterBufferRef *samplesref;
  210. FliteContext *flite = outlink->src->priv;
  211. int nb_samples = FFMIN(flite->wave_nb_samples, flite->frame_nb_samples);
  212. if (!nb_samples)
  213. return AVERROR_EOF;
  214. samplesref = ff_get_audio_buffer(outlink, AV_PERM_WRITE, nb_samples);
  215. if (!samplesref)
  216. return AVERROR(ENOMEM);
  217. memcpy(samplesref->data[0], flite->wave_samples,
  218. nb_samples * flite->wave->num_channels * 2);
  219. samplesref->pts = flite->pts;
  220. samplesref->pos = -1;
  221. samplesref->audio->sample_rate = flite->wave->sample_rate;
  222. flite->pts += nb_samples;
  223. flite->wave_samples += nb_samples * flite->wave->num_channels;
  224. flite->wave_nb_samples -= nb_samples;
  225. return ff_filter_samples(outlink, samplesref);
  226. }
  227. AVFilter avfilter_asrc_flite = {
  228. .name = "flite",
  229. .description = NULL_IF_CONFIG_SMALL("Synthesize voice from text using libflite."),
  230. .query_formats = query_formats,
  231. .init = init,
  232. .uninit = uninit,
  233. .priv_size = sizeof(FliteContext),
  234. .inputs = (const AVFilterPad[]) {{ .name = NULL}},
  235. .outputs = (const AVFilterPad[]) {
  236. {
  237. .name = "default",
  238. .type = AVMEDIA_TYPE_AUDIO,
  239. .config_props = config_props,
  240. .request_frame = request_frame,
  241. },
  242. { .name = NULL }
  243. },
  244. };