From a7727bf8becfe796813fb2a99e3344c3b2daafb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim Rühsen?= <address@hidden>
Date: Mon, 6 Oct 2014 16:32:37 +0200
Subject: [PATCH] fixed IRI misbehaviour(s)

---
 src/ChangeLog                   |   8 ++++
 src/html-url.c                  |   5 +-
 src/iri.c                       | 102 ++++++++++++++++++++--------------------
 src/url.c                       |  19 ++++----
 tests/ChangeLog                 |   7 +++
 tests/Test-idn-meta.px          |   3 +-
 tests/Test-iri-forced-remote.px |  31 ++----------
 tests/Test-iri-percent.px       |   6 ++-
 tests/Test-iri.px               |  28 +++--------
 9 files changed, 96 insertions(+), 113 deletions(-)

diff --git a/src/ChangeLog b/src/ChangeLog
index 1c4e2d5..2c20679 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,11 @@
+2014-10-06  Tim Ruehsen  <address@hidden>
+
+	* url.c (url_parse): little code cleanup
+	* html-url.c (get_urls_html): HTTP header Content-Type charset takes precedence over http-equiv
+	* iri.c (do_conversion): moved iconv code completely into the function
+	* iri.c (do_conversion): call url_unescape to fix charset conversion
+	* iri.c (remote_to_utf8): use strcasecmp to compare encoding
+
 2014-05-03  Tim Ruehsen  <address@hidden>

 	* retr.c (retrieve_url): fixed memory leak
diff --git a/src/html-url.c b/src/html-url.c
index 3c6c9b9..903864e 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -748,8 +748,9 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
                  NULL, interesting_attributes);

-  /* If meta charset isn't null, override content encoding */
-  if (iri && meta_charset)
+  /* Meta charset is only valid if there was no HTTP header Content-Type charset. */
+  /* This is true for HTTP 1.0 and 1.1. */
+  if (iri && !iri->content_encoding && meta_charset)
     set_content_encoding (iri, meta_charset);

   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
diff --git a/src/iri.c b/src/iri.c
index 4db426e..e38edc9 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -38,15 +38,13 @@ as that of the covered work.  */
 #include <errno.h>

 #include "utils.h"
+#include "url.h"

 /* RFC3987 section 3.1 mandates STD3 ASCII RULES */
 #define IDNA_FLAGS  IDNA_USE_STD3_ASCII_RULES

 /* Note: locale encoding is kept in options struct (opt.locale) */

-static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
-
-
 /* Given a string containing "charset=XXX", return the encoding if found,
    or NULL otherwise */
 char *
@@ -110,52 +108,34 @@ check_encoding_name (char *encoding)
   return true;
 }

-/* Try converting string str from locale to UTF-8. Return a new string
-   on success, or str on error or if conversion isn't needed. */
-const char *
-locale_to_utf8 (const char *str)
-{
-  iconv_t l2u;
-  char *new;
-
-  /* That shouldn't happen, just in case */
-  if (!opt.locale)
-    {
-      logprintf (LOG_VERBOSE, _("locale_to_utf8: locale is unset\n"));
-      opt.locale = find_locale ();
-    }
-
-  if (!opt.locale || !strcasecmp (opt.locale, "utf-8"))
-    return str;
-
-  l2u = iconv_open ("UTF-8", opt.locale);
-  if (l2u == (iconv_t)(-1))
-    {
-      logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"),
-                 quote (opt.locale), quote ("UTF-8"));
-      return str;
-    }
-
-  if (do_conversion (l2u, (char *) str, strlen ((char *) str), &new))
-    return (const char *) new;
-
-  return str;
-}
-
 /* Do the conversion according to the passed conversion descriptor cd. *out
    will contain the transcoded string on success. *out content is
    unspecified otherwise. */
 static bool
-do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
+do_conversion (const char *tocode, const char *fromcode, char *in, size_t inlen, char **out)
 {
+  iconv_t cd;
   /* sXXXav : hummm hard to guess... */
-  size_t len, done, outlen = inlen * 2;
+  size_t len, done, outlen;
   int invalid = 0, tooshort = 0;
-  char *s;
+  char *s, *in_org, *in_save;

-  s = xmalloc (outlen + 1);
-  *out = s;
-  len = outlen;
+  cd = iconv_open (tocode, fromcode);
+  if (cd == (iconv_t)(-1))
+    {
+      logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"),
+                 quote (opt.locale), quote ("UTF-8"));
+      return false;
+    }
+
+  /* iconv() has to work on an unescaped string */
+  in_org = in;
+  in_save = in = strndup(in, inlen);
+  url_unescape(in);
+  inlen = strlen(in);
+
+  len = outlen = inlen * 2;
+  *out = s = xmalloc (outlen + 1);
   done = 0;

   for (;;)
@@ -164,6 +144,9 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
         {
           *out = s;
           *(s + len - outlen - done) = '\0';
+          xfree(in_save);
+          iconv_close(cd);
+          logprintf (LOG_VERBOSE, _("converted '%s' (%s) -> '%s' (%s)\n"), in_org, fromcode, *out, tocode);
           return true;
         }

@@ -202,9 +185,35 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
         }
     }

+    xfree(in_save);
+    iconv_close(cd);
+    logprintf (LOG_VERBOSE, _("converted '%s' (%s) -> '%s' (%s)\n"), in_org, fromcode, *out, tocode);
     return false;
 }

+/* Convert STR from the locale encoding (opt.locale) to UTF-8.  Return a newly
+   allocated string on success, or STR itself on error or if no conversion is needed. */
+const char *
+locale_to_utf8 (const char *str)
+{
+  char *new;
+
+  /* opt.locale should already be set; if not, log it and fall back to find_locale () */
+  if (!opt.locale)
+    {
+      logprintf (LOG_VERBOSE, _("locale_to_utf8: locale is unset\n"));
+      opt.locale = find_locale ();
+    }
+
+  if (!opt.locale || !strcasecmp (opt.locale, "utf-8"))
+    return str; /* no locale known, or it is already UTF-8 (compared case-insensitively) */
+
+  if (do_conversion ("UTF-8", opt.locale, (char *) str, strlen ((char *) str), &new))
+    return (const char *) new; /* buffer filled in by do_conversion */
+
+  return str; /* conversion failed: hand back the caller's string unchanged */
+}
+
 /* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL
    on error. */
 char *
@@ -258,7 +267,6 @@ idn_decode (char *host)
 bool
 remote_to_utf8 (struct iri *iri, const char *str, const char **new)
 {
-  iconv_t cd;
   bool ret = false;

   if (!iri->uri_encoding)
@@ -267,7 +275,7 @@ remote_to_utf8 (struct iri *iri, const char *str, const char **new)
   /* When `i->uri_encoding' == "UTF-8" there is nothing to convert.  But we must
      test for non-ASCII symbols for correct hostname processing in `idn_encode'
      function. */
-  if (!strcmp (iri->uri_encoding, "UTF-8"))
+  if (!strcasecmp (iri->uri_encoding, "UTF-8"))
     {
       const char *p = str;
       for (p = str; *p; p++)
@@ -279,15 +287,9 @@ remote_to_utf8 (struct iri *iri, const char *str, const char **new)
       return false;
     }

-  cd = iconv_open ("UTF-8", iri->uri_encoding);
-  if (cd == (iconv_t)(-1))
-    return false;
-
-  if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new))
+  if (do_conversion ("UTF-8", iri->uri_encoding, (char *) str, strlen (str), (char **) new))
     ret = true;

-  iconv_close (cd);
-
   /* Test if something was converted */
   if (!strcmp (str, *new))
     {
diff --git a/src/url.c b/src/url.c
index ab76f3b..0551c58 100644
--- a/src/url.c
+++ b/src/url.c
@@ -681,7 +681,6 @@ url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
   char *user = NULL, *passwd = NULL;

   const char *url_encoded = NULL;
-  char *new_url = NULL;

   int error_code;

@@ -695,29 +694,29 @@ url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
       goto error;
     }

+  url_encoded = url;
+
   if (iri && iri->utf8_encode)
     {
+      char *new_url = NULL;
+
       iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url);
       if (!iri->utf8_encode)
         new_url = NULL;
       else
         {
           iri->orig_url = xstrdup (url);
-          percent_encode = true;
+          url_encoded = reencode_escapes (new_url);
+          if (url_encoded != new_url)
+            xfree (new_url);
+          percent_encode = false;
         }
     }

-  /* XXX XXX Could that change introduce (security) bugs ???  XXX XXX*/
   if (percent_encode)
-    url_encoded = reencode_escapes (new_url ? new_url : url);
-  else
-    url_encoded = new_url ? new_url : url;
+    url_encoded = reencode_escapes (url);

   p = url_encoded;
-
-  if (new_url && url_encoded != new_url)
-    xfree (new_url);
-
   p += strlen (supported_schemes[scheme].leading_string);
   uname_b = p;
   p = url_skip_credentials (p);
diff --git a/tests/ChangeLog b/tests/ChangeLog
index b4ea8cb..c1f417d 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,10 @@
+2014-10-06  Tim Ruehsen <address@hidden>
+
+	* Test-iri.px: fixed encodings
+	* Test-iri-forced-remote.px: fixed encodings
+	* Test-iri-percent.px: fixed encodings
+	* Test-idn-meta.px: fixed encodings
+
 2014-10-02  Tim Ruehsen <address@hidden>

 	* tests/WgetTests.pm: use filename as default test name
diff --git a/tests/Test-idn-meta.px b/tests/Test-idn-meta.px
index 1370a4a..fd7dcfa 100755
--- a/tests/Test-idn-meta.px
+++ b/tests/Test-idn-meta.px
@@ -27,7 +27,8 @@ my %urls = (
         code => "200",
         msg => "You want fries with that?",
         headers => {
-            'Content-Type' => 'text/html; charset=UTF-8',
+            # The HTTP header takes precedence over http-equiv, so omit it here
+            #'Content-Type' => 'text/html; charset=UTF-8',
         },
         content => $starter_file,
     },
diff --git a/tests/Test-iri-forced-remote.px b/tests/Test-iri-forced-remote.px
index 192833d..5dee3bb 100755
--- a/tests/Test-iri-forced-remote.px
+++ b/tests/Test-iri-forced-remote.px
@@ -48,7 +48,7 @@ my $pagefrancais = <<EOF;
 <html>
 <head>
   <title>La seule page en français</title>
-  <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
+  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
 </head>
 <body>
   <p>
@@ -117,17 +117,10 @@ my %urls = (
         content => "",
     },
     '/p1_fran%C3%A7ais.html' => {	# UTF-8 encoded
-        code => "404",
-        msg => "File not found",
-        headers => {
-            "Content-type" => "text/html; charset=UTF-8",
-        },
-        content => $page404,
-    },
-    '/p1_fran%E7ais.html' => {
         code => "200",
         msg => "Ok",
         headers => {
+            # wrong charset here, overridden by --remote-encoding=iso-8859-1
             "Content-type" => "text/html; charset=UTF-8",
         },
         content => $pagefrancais,
@@ -140,14 +133,6 @@ my %urls = (
         },
         content => $pageeen,
     },
-    '/p2_%E9%E9n.html' => {
-        code => "200",
-        msg => "Ok",
-        headers => {
-            "Content-type" => "text/html; charset=ISO-8859-1",
-        },
-        content => $pageeen,
-    },
     '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => {	# UTF-8 encoded
         code => "200",
         msg => "Ok",
@@ -156,14 +141,6 @@ my %urls = (
         },
         content => $pageeuro,
     },
-    '/p3_%A4%A4%A4.html' => {
-        code => "200",
-        msg => "Ok",
-        headers => {
-            "Content-type" => "text/plain",
-        },
-        content => $pageeuro,
-    },
     '/p3_%C2%A4%C2%A4%C2%A4.html' => {	# UTF-8 encoded
         code => "200",
         msg => "Ok",
@@ -174,7 +151,7 @@ my %urls = (
     },
 );

-my $cmdline = $WgetTest::WGETPATH . " --iri --trust-server-names --remote-encoding=iso-8859-1 -nH -r http://localhost:{{port}}/";
+my $cmdline = $WgetTest::WGETPATH . " --iri -e robots=on --trust-server-names --remote-encoding=iso-8859-1 -nH -r http://localhost:{{port}}/";

 my $expected_error_code = 0;

@@ -185,7 +162,7 @@ my %expected_downloaded_files = (
     'robots.txt' => {
         content => "",
     },
-    "p1_fran${ccedilla_l15}ais.html" => {
+    "p1_fran${ccedilla_u8}ais.html" => {
         content => $pagefrancais,
     },
     "p2_${eacute_u8}${eacute_u8}n.html" => {
diff --git a/tests/Test-iri-percent.px b/tests/Test-iri-percent.px
index ae86b18..7c4f4c8 100755
--- a/tests/Test-iri-percent.px
+++ b/tests/Test-iri-percent.px
@@ -11,6 +11,8 @@ use HTTPTest;

 my $ccedilla_l15 = "\xE7";
 my $ccedilla_l15_pct = "%E7";
+my $ccedilla_u8 = "\xC3\xA7";
+my $ccedilla_u8_pct = "%C3%A7";
 my $eacute_l1 = "\xE9";
 my $eacute_u8 = "\xC3\xA9";
 my $eacute_u8_pct = "%C3%A9";
@@ -52,7 +54,7 @@ my %urls = (
         },
         content => $pageindex,
     },
-    "/hello_${ccedilla_l15_pct}${eacute_u8_pct}.html" => {
+    "/hello_${ccedilla_u8_pct}${eacute_u8_pct}.html" => {
         code => "200",
         msg => "Ok",
         headers => {
@@ -70,7 +72,7 @@ my %expected_downloaded_files = (
     'index.html' => {
         content => $pageindex,
     },
-    "hello_${ccedilla_l15}${eacute_u8}.html" => {
+    "hello_${ccedilla_u8}${eacute_u8}.html" => {
         content => $pagefrancais,
     },
 );
diff --git a/tests/Test-iri.px b/tests/Test-iri.px
index 4d6b39a..eb23b63 100755
--- a/tests/Test-iri.px
+++ b/tests/Test-iri.px
@@ -42,11 +42,12 @@ my $pageindex = <<EOF;
 </html>
 EOF

+# specifying a wrong charset in http-equiv - it will be overridden by Content-Type HTTP header
 my $pagefrancais = <<EOF;
 <html>
 <head>
   <title>La seule page en français</title>
-  <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
+  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
 </head>
 <body>
   <p>
@@ -131,18 +132,11 @@ my %urls = (
         content => "",
     },
     '/p1_fran%C3%A7ais.html' => {	# UTF-8 encoded
-        code => "404",
-        msg => "File not found",
-        headers => {
-            "Content-type" => "text/html; charset=UTF-8",
-        },
-        content => $page404,
-    },
-    '/p1_fran%E7ais.html' => {
         code => "200",
         msg => "Ok",
         headers => {
-            "Content-type" => "text/html; charset=UTF-8",
+            # Content-Type header overrides http-equiv Content-Type
+            "Content-type" => "text/html; charset=ISO-8859-15",
         },
         content => $pagefrancais,
     },
@@ -150,10 +144,10 @@ my %urls = (
         code => "200",
         msg => "Ok",
         request_headers => {
-            "Referer" => qr|http://localhost:[0-9]+/p1_fran%E7ais.html|,
+            "Referer" => qr|http://localhost:[0-9]+/p1_fran%C3%A7ais.html|,
         },
         headers => {
-            "Content-type" => "text/html; charset=ISO-8859-1",
+            "Content-type" => "text/html; charset=UTF-8",
         },
         content => $pageeen,
     },
@@ -165,14 +159,6 @@ my %urls = (
         },
         content => $pageeuro,
     },
-    '/p3_%A4%A4%A4.html' => {
-        code => "200",
-        msg => "Ok",
-        headers => {
-            "Content-type" => "text/plain; charset=ISO-8859-1",
-        },
-        content => $pageeuro,
-    },
     '/p4_m%C3%A9%C3%A9r.html' => {
         code => "200",
         msg => "Ok",
@@ -197,7 +183,7 @@ my %expected_downloaded_files = (
     'robots.txt' => {
         content => "",
     },
-    "p1_fran${ccedilla_l15}ais.html" => {
+    "p1_fran${ccedilla_u8}ais.html" => {
         content => $pagefrancais,
     },
     "p2_${eacute_u8}${eacute_u8}n.html" => {
--
2.1.1