From 7b96f573063eb156a5baec747776c39c391273f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Dzi=C4=99giel?= Date: Mon, 28 Jun 2021 18:21:09 +0200 Subject: [PATCH] subparse: Autodetect subtitle text encoding Use "uchardet" to guess the subtitle text encoding if it is not in UTF-8 or manually specified instead of blindly guessing its "ISO-8859-15". The "uchardet" dependency is optional and when is not available at compile time, then old behaviour will be used. --- .../gst/subparse/gstsubparse.c | 60 +++++++++++++++++-- .../gst-plugins-base/gst/subparse/meson.build | 12 +++- .../gst-plugins-base/meson_options.txt | 3 + 3 files changed, 67 insertions(+), 8 deletions(-) diff --git a/subprojects/gst-plugins-base/gst/subparse/gstsubparse.c b/subprojects/gst-plugins-base/gst/subparse/gstsubparse.c index eaae3cf856..520de88003 100644 --- a/subprojects/gst-plugins-base/gst/subparse/gstsubparse.c +++ b/subprojects/gst-plugins-base/gst/subparse/gstsubparse.c @@ -31,6 +31,10 @@ #include #include +#if defined(HAVE_UCHARDET) +#include +#endif + #include "gstsubparse.h" #include "gstssaparse.h" @@ -151,8 +155,9 @@ gst_sub_parse_class_init (GstSubParseClass * klass) "Encoding to assume if input subtitles are not in UTF-8 or any other " "Unicode encoding. If not set, the GST_SUBTITLE_ENCODING environment " "variable will be checked for an encoding to use. If that is not set " - "either, ISO-8859-15 will be assumed.", DEFAULT_ENCODING, - G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)); + "either, then if plugin was build with uchardet support it will be " + "used to guess the encoding, otherwise ISO-8859-15 will be assumed.", + DEFAULT_ENCODING, G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)); g_object_class_install_property (object_class, PROP_VIDEOFPS, gst_param_spec_fraction ("video-fps", "Video framerate", @@ -395,9 +400,34 @@ gst_sub_parse_get_format_description (GstSubParseFormat format) return NULL; } +#if defined(HAVE_UCHARDET) +static const gchar * +gst_sub_parse_uchardet_detect_encoding (GstSubParse * self, uchardet_t handle, + const gchar * str, gsize len) +{ + const gchar *charset = NULL; + gint retval; + + GST_DEBUG_OBJECT (self, + "detecting encoding with uchardet using %li characters", len); + retval = uchardet_handle_data (handle, str, len); + if (retval != 0) { + GST_WARNING_OBJECT (self, "could not handle data with uchardet, code: %i", + retval); + } else { + uchardet_data_end (handle); + charset = uchardet_get_charset (handle); + if (charset == NULL || *charset == '\0') + GST_WARNING_OBJECT (self, "uchardet could not detect encoding"); + else + GST_DEBUG_OBJECT (self, "uchardet detected encoding: %s", charset); + } + return charset; +} +#endif static gchar * convert_encoding (GstSubParse * self, const gchar * str, gsize len, @@ -407,6 +437,10 @@ convert_encoding (GstSubParse * self, const gchar * str, gsize len, GError *err = NULL; gchar *ret = NULL; +#if defined(HAVE_UCHARDET) + uchardet_t handle = NULL; +#endif + *consumed = 0; /* First try any detected encoding */ @@ -441,12 +475,18 @@ convert_encoding (GstSubParse * self, const gchar * str, gsize len, if (encoding == NULL || *encoding == '\0') { encoding = g_getenv ("GST_SUBTITLE_ENCODING"); } +#if defined(HAVE_UCHARDET) + if (encoding == NULL || *encoding == '\0') { + /* no encoding specified via the environment variable either, + * so try to autodetect with uchardet */ + handle = uchardet_new (); + encoding = gst_sub_parse_uchardet_detect_encoding (self, handle, str, len); + } +#endif + /* if uchardet failed and local encoding is UTF-8, assume ISO-8859-15 */ if (encoding == NULL || *encoding == '\0') { - /* if local encoding is UTF-8 and no encoding specified - * via the environment variable, assume ISO-8859-15 */ - if (g_get_charset (&encoding)) { + if (g_get_charset (&encoding)) encoding = "ISO-8859-15"; - } } ret = gst_sub_parse_gst_convert_to_utf8 (str, len, encoding, consumed, &err); @@ -460,12 +500,20 @@ convert_encoding (GstSubParse * self, const gchar * str, gsize len, ret = gst_sub_parse_gst_convert_to_utf8 (str, len, "ISO-8859-15", consumed, NULL); + } else { + /* reuse the detected encoding from now on */ + self->detected_encoding = g_strdup (encoding); } GST_LOG_OBJECT (self, "successfully converted %" G_GSIZE_FORMAT " characters from %s to UTF-8" "%s", len, encoding, (err) ? " , using ISO-8859-15 as fallback" : ""); +#if defined(HAVE_UCHARDET) + if (handle) + uchardet_delete (handle); +#endif + return ret; } diff --git a/subprojects/gst-plugins-base/gst/subparse/meson.build b/subprojects/gst-plugins-base/gst/subparse/meson.build index 7be6c2becc..68d02fcbb1 100644 --- a/subprojects/gst-plugins-base/gst/subparse/meson.build +++ b/subprojects/gst-plugins-base/gst/subparse/meson.build @@ -8,12 +8,20 @@ subparse_sources = [ 'mpl2parse.c', 'qttextparse.c', ] +subparse_defines = [] +subparse_optional_deps = [] + +subparse_uchardet_dep = dependency('uchardet', required : get_option('subparse-uchardet')) +if subparse_uchardet_dep.found() + subparse_defines += '-DHAVE_UCHARDET' + subparse_optional_deps += subparse_uchardet_dep +endif gstsubparse = library('gstsubparse', subparse_sources, - c_args : gst_plugins_base_args, + c_args : gst_plugins_base_args + subparse_defines, include_directories: [configinc, libsinc], - dependencies : [gst_base_dep], + dependencies : [gst_base_dep] + subparse_optional_deps, install : true, install_dir : plugins_install_dir, ) diff --git a/subprojects/gst-plugins-base/meson_options.txt b/subprojects/gst-plugins-base/meson_options.txt index 7010b91626..1b2771425b 100644 --- a/subprojects/gst-plugins-base/meson_options.txt +++ b/subprojects/gst-plugins-base/meson_options.txt @@ -66,6 +66,9 @@ option('x11', type : 'feature', value : 'auto', description : 'X11 ximagesink pl option('xshm', type : 'feature', value : 'auto', description : 'X11 shared memory support for X11 plugins') option('xvideo', type : 'feature', value : 'auto', description : 'X11 XVideo xvimagesink plugin') +# Subparse plugin options +option('subparse-uchardet', type : 'feature', value : 'auto', description: 'Autodetect subtitles text encoding using uchardet') + # Common feature options option('examples', type : 'feature', value : 'auto', yield : true) option('tests', type : 'feature', value : 'auto', yield : true) -- GitLab