diff options
author | Tim-Philipp Müller <tim@centricular.net> | 2006-06-22 16:27:03 +0000 |
---|---|---|
committer | Tim-Philipp Müller <tim@centricular.net> | 2006-06-22 16:27:03 +0000 |
commit | 45c10ca9de093be4a2a7023b3ce121515a300ce3 (patch) | |
tree | 321633fe89255d5e3ca2e65545367f0ba384a9fe /gst/matroska | |
parent | a6af52cc25a1f1407f30f04e3119ff3ef15dde3f (diff) |
gst/matroska/: Try to fix up broken matroska files containing subtitle streams with non-UTF8 character encodings (courtesy of mkvmerge).
Original commit message from CVS:
* gst/matroska/matroska-demux.c:
(gst_matroska_demux_check_subtitle_buffer),
(gst_matroska_demux_parse_blockgroup_or_simpleblock),
(gst_matroska_demux_subtitle_caps):
* gst/matroska/matroska-ids.c:
(gst_matroska_track_init_subtitle_context):
* gst/matroska/matroska-ids.h:
Try to fix up broken matroska files containing subtitle
streams with non-UTF8 character encodings (courtesy of
mkvmerge) using either the encoding specified in the
GST_SUBTITLE_ENCODING environment variable or the
current locale's character set if it is non-UTF8.
Fixes #337076.
Diffstat (limited to 'gst/matroska')
-rw-r--r-- | gst/matroska/matroska-demux.c | 82 | ||||
-rw-r--r-- | gst/matroska/matroska-ids.c | 1 | ||||
-rw-r--r-- | gst/matroska/matroska-ids.h | 3 |
3 files changed, 84 insertions, 2 deletions
diff --git a/gst/matroska/matroska-demux.c b/gst/matroska/matroska-demux.c index 00eae247..15df1997 100644 --- a/gst/matroska/matroska-demux.c +++ b/gst/matroska/matroska-demux.c @@ -36,7 +36,7 @@ #include "matroska-demux.h" #include "matroska-ids.h" -GST_DEBUG_CATEGORY (matroskademux_debug); +GST_DEBUG_CATEGORY_STATIC (matroskademux_debug); #define GST_CAT_DEFAULT matroskademux_debug enum @@ -2135,6 +2135,75 @@ gst_matroska_demux_add_wvpk_header (GstMatroskaTrackContext * stream, return TRUE; } +static GstBuffer * +gst_matroska_demux_check_subtitle_buffer (GstMatroskaDemux * demux, + GstMatroskaTrackContext * stream, GstBuffer * buf) +{ + GstMatroskaTrackSubtitleContext *sub_stream; + const gchar *encoding, *data; + GError *err = NULL; + GstBuffer *newbuf; + gchar *utf8; + guint size; + + sub_stream = (GstMatroskaTrackSubtitleContext *) stream; + + if (!sub_stream->check_utf8) + return buf; + + data = (const gchar *) GST_BUFFER_DATA (buf); + size = GST_BUFFER_SIZE (buf); + + if (!sub_stream->invalid_utf8) { + if (g_utf8_validate (data, size, NULL)) { + return buf; + } + GST_WARNING_OBJECT (demux, "subtitle stream %d is not valid UTF-8, this " + "is broken according to the matroska specification", stream->num); + sub_stream->invalid_utf8 = TRUE; + } + + /* file with broken non-UTF8 subtitle, do the best we can do to fix it */ + encoding = g_getenv ("GST_SUBTITLE_ENCODING"); + if (encoding == NULL || *encoding == '\0') { + /* if local encoding is UTF-8 and no encoding specified + * via the environment variable, assume ISO-8859-15 */ + if (g_get_charset (&encoding)) { + encoding = "ISO-8859-15"; + } + } + + utf8 = g_convert_with_fallback (data, size, "UTF-8", encoding, "*", + NULL, NULL, &err); + + if (err) { + GST_LOG_OBJECT (demux, "could not convert string from '%s' to UTF-8: %s", + encoding, err->message); + g_error_free (err); + g_free (utf8); + + /* invalid input encoding, fall back to ISO-8859-15 (always succeeds) */ + encoding = "ISO-8859-15"; + utf8 = 
g_convert_with_fallback (data, size, "UTF-8", encoding, "*", + NULL, NULL, NULL); + } + + GST_LOG_OBJECT (demux, "converted subtitle text from %s to UTF-8 %s", + encoding, (err) ? "(using ISO-8859-15 as fallback)" : ""); + + if (utf8 == NULL) + utf8 = g_strdup ("invalid subtitle"); + + newbuf = gst_buffer_new (); + GST_BUFFER_MALLOCDATA (newbuf) = (guint8 *) utf8; + GST_BUFFER_DATA (newbuf) = (guint8 *) utf8; + GST_BUFFER_SIZE (newbuf) = strlen (utf8); + gst_buffer_stamp (newbuf, buf); + + gst_buffer_unref (buf); + return newbuf; +} + static gboolean gst_matroska_demux_parse_blockgroup_or_simpleblock (GstMatroskaDemux * demux, guint64 cluster_time, gboolean is_simpleblock) @@ -2415,6 +2484,12 @@ gst_matroska_demux_parse_blockgroup_or_simpleblock (GstMatroskaDemux * demux, GST_TIME_ARGS (GST_BUFFER_DURATION (sub))); gst_buffer_set_caps (sub, GST_PAD_CAPS (stream->pad)); + + /* Fix up broken files with subtitles that are not UTF8 */ + if (stream->type == GST_MATROSKA_TRACK_TYPE_SUBTITLE) { + sub = gst_matroska_demux_check_subtitle_buffer (demux, stream, sub); + } + ret = gst_pad_push (stream->pad, sub); if (ret != GST_FLOW_OK && ret != GST_FLOW_NOT_LINKED) got_error = TRUE; @@ -3448,15 +3523,20 @@ gst_matroska_demux_subtitle_caps (GstMatroskaTrackSubtitleContext * if (!strcmp (codec_id, GST_MATROSKA_CODEC_ID_SUBTITLE_UTF8)) { caps = gst_caps_new_simple ("text/plain", NULL); + subtitlecontext->check_utf8 = TRUE; } else if (!strcmp (codec_id, GST_MATROSKA_CODEC_ID_SUBTITLE_SSA)) { caps = gst_caps_new_simple ("application/x-ssa", NULL); + subtitlecontext->check_utf8 = TRUE; } else if (!strcmp (codec_id, GST_MATROSKA_CODEC_ID_SUBTITLE_ASS)) { caps = gst_caps_new_simple ("application/x-ass", NULL); + subtitlecontext->check_utf8 = TRUE; } else if (!strcmp (codec_id, GST_MATROSKA_CODEC_ID_SUBTITLE_USF)) { caps = gst_caps_new_simple ("application/x-usf", NULL); + subtitlecontext->check_utf8 = TRUE; } else { GST_DEBUG ("Unknown subtitle stream: codec_id='%s'", codec_id); caps 
= gst_caps_new_simple ("application/x-subtitle-unknown", NULL); + subtitlecontext->check_utf8 = FALSE; } if (data != NULL && size > 0) { diff --git a/gst/matroska/matroska-ids.c b/gst/matroska/matroska-ids.c index dde46769..db8261ab 100644 --- a/gst/matroska/matroska-ids.c +++ b/gst/matroska/matroska-ids.c @@ -105,6 +105,7 @@ gst_matroska_track_init_subtitle_context (GstMatroskaTrackContext ** p_context) *p_context = (GstMatroskaTrackContext *) subtitle_context; (*p_context)->type = GST_MATROSKA_TRACK_TYPE_SUBTITLE; + subtitle_context->invalid_utf8 = FALSE; return TRUE; } diff --git a/gst/matroska/matroska-ids.h b/gst/matroska/matroska-ids.h index 0016b654..b9ba8ccd 100644 --- a/gst/matroska/matroska-ids.h +++ b/gst/matroska/matroska-ids.h @@ -284,7 +284,8 @@ typedef struct _GstMatroskaTrackComplexContext { typedef struct _GstMatroskaTrackSubtitleContext { GstMatroskaTrackContext parent; - /* or here... */ + gboolean check_utf8; /* buffers should be valid UTF-8 */ + gboolean invalid_utf8; /* work around broken files */ } GstMatroskaTrackSubtitleContext; typedef struct _GstMatroskaIndex { |