From a7727bf8becfe796813fb2a99e3344c3b2daafb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim Rühsen?= Date: Mon, 6 Oct 2014 16:32:37 +0200 Subject: [PATCH] fixed IRI misbehaviour(s) --- src/ChangeLog | 8 ++++ src/html-url.c | 5 +- src/iri.c | 102 ++++++++++++++++++++-------------------- src/url.c | 19 ++++---- tests/ChangeLog | 7 +++ tests/Test-idn-meta.px | 3 +- tests/Test-iri-forced-remote.px | 31 ++---------- tests/Test-iri-percent.px | 6 ++- tests/Test-iri.px | 28 +++-------- 9 files changed, 96 insertions(+), 113 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 1c4e2d5..2c20679 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,11 @@ +2014-10-06 Tim Ruehsen + + * url.c (url_parse): little code cleanup + * html-url.c (get_urls_html): HTTP header Content-Type charset takes precedence over http-equiv + * iri.c (do_conversion): moved iconv code completely into the function + * iri.c (do_conversion): call url_unescape to fix charset conversion + * iri.c (remote_to_utf8): use strcasecmp to compare encoding + 2014-05-03 Tim Ruehsen * retr.c (retrieve_url): fixed memory leak diff --git a/src/html-url.c b/src/html-url.c index 3c6c9b9..903864e 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -748,8 +748,9 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, NULL, interesting_attributes); - /* If meta charset isn't null, override content encoding */ - if (iri && meta_charset) + /* Meta charset is only valid if there was no HTTP header Content-Type charset. */ + /* This is true for HTTP 1.0 and 1.1. */ + if (iri && !iri->content_encoding && meta_charset) set_content_encoding (iri, meta_charset); DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); diff --git a/src/iri.c b/src/iri.c index 4db426e..e38edc9 100644 --- a/src/iri.c +++ b/src/iri.c @@ -38,15 +38,13 @@ as that of the covered work. 
*/ #include #include "utils.h" +#include "url.h" /* RFC3987 section 3.1 mandates STD3 ASCII RULES */ #define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES /* Note: locale encoding is kept in options struct (opt.locale) */ -static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out); - - /* Given a string containing "charset=XXX", return the encoding if found, or NULL otherwise */ char * @@ -110,52 +108,34 @@ check_encoding_name (char *encoding) return true; } -/* Try converting string str from locale to UTF-8. Return a new string - on success, or str on error or if conversion isn't needed. */ -const char * -locale_to_utf8 (const char *str) -{ - iconv_t l2u; - char *new; - - /* That shouldn't happen, just in case */ - if (!opt.locale) - { - logprintf (LOG_VERBOSE, _("locale_to_utf8: locale is unset\n")); - opt.locale = find_locale (); - } - - if (!opt.locale || !strcasecmp (opt.locale, "utf-8")) - return str; - - l2u = iconv_open ("UTF-8", opt.locale); - if (l2u == (iconv_t)(-1)) - { - logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"), - quote (opt.locale), quote ("UTF-8")); - return str; - } - - if (do_conversion (l2u, (char *) str, strlen ((char *) str), &new)) - return (const char *) new; - - return str; -} - /* Do the conversion according to the passed conversion descriptor cd. *out will contain the transcoded string on success. *out content is unspecified otherwise. */ static bool -do_conversion (iconv_t cd, char *in, size_t inlen, char **out) +do_conversion (const char *tocode, const char *fromcode, char *in, size_t inlen, char **out) { + iconv_t cd; /* sXXXav : hummm hard to guess... 
*/ - size_t len, done, outlen = inlen * 2; + size_t len, done, outlen; int invalid = 0, tooshort = 0; - char *s; + char *s, *in_org, *in_save; - s = xmalloc (outlen + 1); - *out = s; - len = outlen; + cd = iconv_open (tocode, fromcode); + if (cd == (iconv_t)(-1)) + { + logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"), + quote (opt.locale), quote ("UTF-8")); + return false; + } + + /* iconv() has to work on an unescaped string */ + in_org = in; + in_save = in = strndup(in, inlen); + url_unescape(in); + inlen = strlen(in); + + len = outlen = inlen * 2; + *out = s = xmalloc (outlen + 1); done = 0; for (;;) @@ -164,6 +144,9 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) { *out = s; *(s + len - outlen - done) = '\0'; + xfree(in_save); + iconv_close(cd); + logprintf (LOG_VERBOSE, _("converted '%s' (%s) -> '%s' (%s)\n"), in_org, fromcode, *out, tocode); return true; } @@ -202,9 +185,35 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) } } + xfree(in_save); + iconv_close(cd); + logprintf (LOG_VERBOSE, _("converted '%s' (%s) -> '%s' (%s)\n"), in_org, fromcode, *out, tocode); return false; } +/* Try converting string str from locale to UTF-8. Return a new string + on success, or str on error or if conversion isn't needed. */ +const char * +locale_to_utf8 (const char *str) +{ + char *new; + + /* That shouldn't happen, just in case */ + if (!opt.locale) + { + logprintf (LOG_VERBOSE, _("locale_to_utf8: locale is unset\n")); + opt.locale = find_locale (); + } + + if (!opt.locale || !strcasecmp (opt.locale, "utf-8")) + return str; + + if (do_conversion ("UTF-8", opt.locale, (char *) str, strlen ((char *) str), &new)) + return (const char *) new; + + return str; +} + /* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL on error. 
*/ char * @@ -258,7 +267,6 @@ idn_decode (char *host) bool remote_to_utf8 (struct iri *iri, const char *str, const char **new) { - iconv_t cd; bool ret = false; if (!iri->uri_encoding) @@ -267,7 +275,7 @@ remote_to_utf8 (struct iri *iri, const char *str, const char **new) /* When `i->uri_encoding' == "UTF-8" there is nothing to convert. But we must test for non-ASCII symbols for correct hostname processing in `idn_encode' function. */ - if (!strcmp (iri->uri_encoding, "UTF-8")) + if (!strcasecmp (iri->uri_encoding, "UTF-8")) { const char *p = str; for (p = str; *p; p++) @@ -279,15 +287,9 @@ remote_to_utf8 (struct iri *iri, const char *str, const char **new) return false; } - cd = iconv_open ("UTF-8", iri->uri_encoding); - if (cd == (iconv_t)(-1)) - return false; - - if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new)) + if (do_conversion ("UTF-8", iri->uri_encoding, (char *) str, strlen (str), (char **) new)) ret = true; - iconv_close (cd); - /* Test if something was converted */ if (!strcmp (str, *new)) { diff --git a/src/url.c b/src/url.c index ab76f3b..0551c58 100644 --- a/src/url.c +++ b/src/url.c @@ -681,7 +681,6 @@ url_parse (const char *url, int *error, struct iri *iri, bool percent_encode) char *user = NULL, *passwd = NULL; const char *url_encoded = NULL; - char *new_url = NULL; int error_code; @@ -695,29 +694,29 @@ url_parse (const char *url, int *error, struct iri *iri, bool percent_encode) goto error; } + url_encoded = url; + if (iri && iri->utf8_encode) { + char *new_url = NULL; + iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url); if (!iri->utf8_encode) new_url = NULL; else { iri->orig_url = xstrdup (url); - percent_encode = true; + url_encoded = reencode_escapes (new_url); + if (url_encoded != new_url) + xfree (new_url); + percent_encode = false; } } - /* XXX XXX Could that change introduce (security) bugs ??? 
XXX XXX*/ if (percent_encode) - url_encoded = reencode_escapes (new_url ? new_url : url); - else - url_encoded = new_url ? new_url : url; + url_encoded = reencode_escapes (url); p = url_encoded; - - if (new_url && url_encoded != new_url) - xfree (new_url); - p += strlen (supported_schemes[scheme].leading_string); uname_b = p; p = url_skip_credentials (p); diff --git a/tests/ChangeLog b/tests/ChangeLog index b4ea8cb..c1f417d 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,10 @@ +2014-10-06 Tim Ruehsen + + * Test-iri.px: fixed encodings + * Test-iri-forced-remote.px: fixed encodings + * Test-iri-percent.px: fixed encodings + * Test-idn-meta.px: fixed encodings + 2014-10-02 Tim Ruehsen * tests/WgetTests.pm: use filename as default test name diff --git a/tests/Test-idn-meta.px b/tests/Test-idn-meta.px index 1370a4a..fd7dcfa 100755 --- a/tests/Test-idn-meta.px +++ b/tests/Test-idn-meta.px @@ -27,7 +27,8 @@ my %urls = ( code => "200", msg => "You want fries with that?", headers => { - 'Content-Type' => 'text/html; charset=UTF-8', + # HTTP header takes precedence over http-equiv, simply just omit it here + #'Content-Type' => 'text/html; charset=UTF-8', }, content => $starter_file, }, diff --git a/tests/Test-iri-forced-remote.px b/tests/Test-iri-forced-remote.px index 192833d..5dee3bb 100755 --- a/tests/Test-iri-forced-remote.px +++ b/tests/Test-iri-forced-remote.px @@ -48,7 +48,7 @@ my $pagefrancais = < La seule page en français - +

@@ -117,17 +117,10 @@ my %urls = ( content => "", }, '/p1_fran%C3%A7ais.html' => { # UTF-8 encoded - code => "404", - msg => "File not found", - headers => { - "Content-type" => "text/html; charset=UTF-8", - }, - content => $page404, - }, - '/p1_fran%E7ais.html' => { code => "200", msg => "Ok", headers => { + # wrong charset here, overridden by --remote-encoding=iso-8859-1 "Content-type" => "text/html; charset=UTF-8", }, content => $pagefrancais, @@ -140,14 +133,6 @@ my %urls = ( }, content => $pageeen, }, - '/p2_%E9%E9n.html' => { - code => "200", - msg => "Ok", - headers => { - "Content-type" => "text/html; charset=ISO-8859-1", - }, - content => $pageeen, - }, '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => { # UTF-8 encoded code => "200", msg => "Ok", @@ -156,14 +141,6 @@ my %urls = ( }, content => $pageeuro, }, - '/p3_%A4%A4%A4.html' => { - code => "200", - msg => "Ok", - headers => { - "Content-type" => "text/plain", - }, - content => $pageeuro, - }, '/p3_%C2%A4%C2%A4%C2%A4.html' => { # UTF-8 encoded code => "200", msg => "Ok", @@ -174,7 +151,7 @@ my %urls = ( }, ); -my $cmdline = $WgetTest::WGETPATH . " --iri --trust-server-names --remote-encoding=iso-8859-1 -nH -r http://localhost:{{port}}/"; +my $cmdline = $WgetTest::WGETPATH . 
" --iri -e robots=on --trust-server-names --remote-encoding=iso-8859-1 -nH -r http://localhost:{{port}}/"; my $expected_error_code = 0; @@ -185,7 +162,7 @@ my %expected_downloaded_files = ( 'robots.txt' => { content => "", }, - "p1_fran${ccedilla_l15}ais.html" => { + "p1_fran${ccedilla_u8}ais.html" => { content => $pagefrancais, }, "p2_${eacute_u8}${eacute_u8}n.html" => { diff --git a/tests/Test-iri-percent.px b/tests/Test-iri-percent.px index ae86b18..7c4f4c8 100755 --- a/tests/Test-iri-percent.px +++ b/tests/Test-iri-percent.px @@ -11,6 +11,8 @@ use HTTPTest; my $ccedilla_l15 = "\xE7"; my $ccedilla_l15_pct = "%E7"; +my $ccedilla_u8 = "\xC3\xA7"; +my $ccedilla_u8_pct = "%C3%A7"; my $eacute_l1 = "\xE9"; my $eacute_u8 = "\xC3\xA9"; my $eacute_u8_pct = "%C3%A9"; @@ -52,7 +54,7 @@ my %urls = ( }, content => $pageindex, }, - "/hello_${ccedilla_l15_pct}${eacute_u8_pct}.html" => { + "/hello_${ccedilla_u8_pct}${eacute_u8_pct}.html" => { code => "200", msg => "Ok", headers => { @@ -70,7 +72,7 @@ my %expected_downloaded_files = ( 'index.html' => { content => $pageindex, }, - "hello_${ccedilla_l15}${eacute_u8}.html" => { + "hello_${ccedilla_u8}${eacute_u8}.html" => { content => $pagefrancais, }, ); diff --git a/tests/Test-iri.px b/tests/Test-iri.px index 4d6b39a..eb23b63 100755 --- a/tests/Test-iri.px +++ b/tests/Test-iri.px @@ -42,11 +42,12 @@ my $pageindex = < EOF +# specifying a wrong charset in http-equiv - it will be overridden by Content-Type HTTP header my $pagefrancais = < La seule page en français - +

@@ -131,18 +132,11 @@ my %urls = ( content => "", }, '/p1_fran%C3%A7ais.html' => { # UTF-8 encoded - code => "404", - msg => "File not found", - headers => { - "Content-type" => "text/html; charset=UTF-8", - }, - content => $page404, - }, - '/p1_fran%E7ais.html' => { code => "200", msg => "Ok", headers => { - "Content-type" => "text/html; charset=UTF-8", + # Content-Type header overrides http-equiv Content-Type + "Content-type" => "text/html; charset=ISO-8859-15", }, content => $pagefrancais, }, @@ -150,10 +144,10 @@ my %urls = ( code => "200", msg => "Ok", request_headers => { - "Referer" => qr|http://localhost:[0-9]+/p1_fran%E7ais.html|, + "Referer" => qr|http://localhost:[0-9]+/p1_fran%C3%A7ais.html|, }, headers => { - "Content-type" => "text/html; charset=ISO-8859-1", + "Content-type" => "text/html; charset=UTF-8", }, content => $pageeen, }, @@ -165,14 +159,6 @@ my %urls = ( }, content => $pageeuro, }, - '/p3_%A4%A4%A4.html' => { - code => "200", - msg => "Ok", - headers => { - "Content-type" => "text/plain; charset=ISO-8859-1", - }, - content => $pageeuro, - }, '/p4_m%C3%A9%C3%A9r.html' => { code => "200", msg => "Ok", @@ -197,7 +183,7 @@ my %expected_downloaded_files = ( 'robots.txt' => { content => "", }, - "p1_fran${ccedilla_l15}ais.html" => { + "p1_fran${ccedilla_u8}ais.html" => { content => $pagefrancais, }, "p2_${eacute_u8}${eacute_u8}n.html" => { -- 2.1.1