From a7727bf8becfe796813fb2a99e3344c3b2daafb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim Rühsen?=
Date: Mon, 6 Oct 2014 16:32:37 +0200 Subject: [PATCH] fixed IRI misbehaviour(s) --- src/ChangeLog | 8 ++++ src/html-url.c | 5 +- src/iri.c | 102 ++++++++++++++++++++-------------------- src/url.c | 19 ++++---- tests/ChangeLog | 7 +++ tests/Test-idn-meta.px | 3 +- tests/Test-iri-forced-remote.px | 31 ++---------- tests/Test-iri-percent.px | 6 ++- tests/Test-iri.px | 28 +++-------- 9 files changed, 96 insertions(+), 113 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 1c4e2d5..2c20679 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,11 @@ +2014-10-06 Tim Ruehsen + + * url.c (url_parse): little code cleanup + * html-url.c (get_urls_html): HTTP header Content-Type charset takes precedence over http-equiv + * iri.c (do_conversion): moved iconv code completely into the function + * iri.c (do_conversion): call url_unescape to fix charset conversion + * iri.c (remote_to_utf8): use strcasecmp to compare encoding + 2014-05-03 Tim Ruehsen * retr.c (retrieve_url): fixed memory leak diff --git a/src/html-url.c b/src/html-url.c index 3c6c9b9..903864e 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -748,8 +748,9 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, NULL, interesting_attributes); - /* If meta charset isn't null, override content encoding */ - if (iri && meta_charset) + /* Meta charset is only valid if there was no HTTP header Content-Type charset. */ + /* This is true for HTTP 1.0 and 1.1. */ + if (iri && !iri->content_encoding && meta_charset) set_content_encoding (iri, meta_charset); DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); diff --git a/src/iri.c b/src/iri.c index 4db426e..e38edc9 100644 --- a/src/iri.c +++ b/src/iri.c @@ -38,15 +38,13 @@ as that of the covered work. */ #include
@@ -117,17 +117,10 @@ my %urls = (
content => "",
},
'/p1_fran%C3%A7ais.html' => { # UTF-8 encoded
- code => "404",
- msg => "File not found",
- headers => {
- "Content-type" => "text/html; charset=UTF-8",
- },
- content => $page404,
- },
- '/p1_fran%E7ais.html' => {
code => "200",
msg => "Ok",
headers => {
+ # wrong charset here, overridden by --remote-encoding=iso-8859-1
"Content-type" => "text/html; charset=UTF-8",
},
content => $pagefrancais,
@@ -140,14 +133,6 @@ my %urls = (
},
content => $pageeen,
},
- '/p2_%E9%E9n.html' => {
- code => "200",
- msg => "Ok",
- headers => {
- "Content-type" => "text/html; charset=ISO-8859-1",
- },
- content => $pageeen,
- },
'/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => { # UTF-8 encoded
code => "200",
msg => "Ok",
@@ -156,14 +141,6 @@ my %urls = (
},
content => $pageeuro,
},
- '/p3_%A4%A4%A4.html' => {
- code => "200",
- msg => "Ok",
- headers => {
- "Content-type" => "text/plain",
- },
- content => $pageeuro,
- },
'/p3_%C2%A4%C2%A4%C2%A4.html' => { # UTF-8 encoded
code => "200",
msg => "Ok",
@@ -174,7 +151,7 @@ my %urls = (
},
);
-my $cmdline = $WgetTest::WGETPATH . " --iri --trust-server-names --remote-encoding=iso-8859-1 -nH -r http://localhost:{{port}}/";
+my $cmdline = $WgetTest::WGETPATH . " --iri -e robots=on --trust-server-names --remote-encoding=iso-8859-1 -nH -r http://localhost:{{port}}/";
my $expected_error_code = 0;
@@ -185,7 +162,7 @@ my %expected_downloaded_files = (
'robots.txt' => {
content => "",
},
- "p1_fran${ccedilla_l15}ais.html" => {
+ "p1_fran${ccedilla_u8}ais.html" => {
content => $pagefrancais,
},
"p2_${eacute_u8}${eacute_u8}n.html" => {
diff --git a/tests/Test-iri-percent.px b/tests/Test-iri-percent.px
index ae86b18..7c4f4c8 100755
--- a/tests/Test-iri-percent.px
+++ b/tests/Test-iri-percent.px
@@ -11,6 +11,8 @@ use HTTPTest;
my $ccedilla_l15 = "\xE7";
my $ccedilla_l15_pct = "%E7";
+my $ccedilla_u8 = "\xC3\xA7";
+my $ccedilla_u8_pct = "%C3%A7";
my $eacute_l1 = "\xE9";
my $eacute_u8 = "\xC3\xA9";
my $eacute_u8_pct = "%C3%A9";
@@ -52,7 +54,7 @@ my %urls = (
},
content => $pageindex,
},
- "/hello_${ccedilla_l15_pct}${eacute_u8_pct}.html" => {
+ "/hello_${ccedilla_u8_pct}${eacute_u8_pct}.html" => {
code => "200",
msg => "Ok",
headers => {
@@ -70,7 +72,7 @@ my %expected_downloaded_files = (
'index.html' => {
content => $pageindex,
},
- "hello_${ccedilla_l15}${eacute_u8}.html" => {
+ "hello_${ccedilla_u8}${eacute_u8}.html" => {
content => $pagefrancais,
},
);
diff --git a/tests/Test-iri.px b/tests/Test-iri.px
index 4d6b39a..eb23b63 100755
--- a/tests/Test-iri.px
+++ b/tests/Test-iri.px
@@ -42,11 +42,12 @@ my $pageindex = <
@@ -131,18 +132,11 @@ my %urls = (
content => "",
},
'/p1_fran%C3%A7ais.html' => { # UTF-8 encoded
- code => "404",
- msg => "File not found",
- headers => {
- "Content-type" => "text/html; charset=UTF-8",
- },
- content => $page404,
- },
- '/p1_fran%E7ais.html' => {
code => "200",
msg => "Ok",
headers => {
- "Content-type" => "text/html; charset=UTF-8",
+ # Content-Type header overrides http-equiv Content-Type
+ "Content-type" => "text/html; charset=ISO-8859-15",
},
content => $pagefrancais,
},
@@ -150,10 +144,10 @@ my %urls = (
code => "200",
msg => "Ok",
request_headers => {
- "Referer" => qr|http://localhost:[0-9]+/p1_fran%E7ais.html|,
+ "Referer" => qr|http://localhost:[0-9]+/p1_fran%C3%A7ais.html|,
},
headers => {
- "Content-type" => "text/html; charset=ISO-8859-1",
+ "Content-type" => "text/html; charset=UTF-8",
},
content => $pageeen,
},
@@ -165,14 +159,6 @@ my %urls = (
},
content => $pageeuro,
},
- '/p3_%A4%A4%A4.html' => {
- code => "200",
- msg => "Ok",
- headers => {
- "Content-type" => "text/plain; charset=ISO-8859-1",
- },
- content => $pageeuro,
- },
'/p4_m%C3%A9%C3%A9r.html' => {
code => "200",
msg => "Ok",
@@ -197,7 +183,7 @@ my %expected_downloaded_files = (
'robots.txt' => {
content => "",
},
- "p1_fran${ccedilla_l15}ais.html" => {
+ "p1_fran${ccedilla_u8}ais.html" => {
content => $pagefrancais,
},
"p2_${eacute_u8}${eacute_u8}n.html" => {
--
2.1.1