[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Bug-wget] Website with broken img tags; browsers can handle it but wget can not.
From: |
Giuseppe Scrivano |
Subject: |
Re: [Bug-wget] Website with broken img tags; browsers can handle it but wget can not. |
Date: |
Sun, 30 May 2010 14:03:23 +0200 |
User-agent: |
Gnus/5.13 (Gnus v5.13) Emacs/24.0.50 (gnu/linux) |
Alexander Lane <address@hidden> writes:
> I've encountered a website that does not put the ">" at the end of
> some of its img tags. Wget skips downloading those images as a result,
> but I checked several web browsers & they were all able to cope with
> it.
>
> I don't know whether this was done in an attempt to break automated
> downloading or if it's just bad HTML.
>
> Here's what they look like:
>
> <p><img src="something/something1.jpg" border="1" width="1060"
> height="1592"</p>
>
> Is there any way I can make wget recognize & follow these malformed img tags?
Thanks for your report.
The patch below, which I am going to commit, should make wget more robust
against unclosed HTML tags.
Would you like to help me test it?
Cheers,
Giuseppe
=== modified file 'src/html-parse.c'
--- src/html-parse.c 2010-05-08 19:56:15 +0000
+++ src/html-parse.c 2010-05-30 11:38:35 +0000
@@ -528,13 +528,14 @@
* whitespace
* 8-bit and control chars
* characters that clearly cannot be part of name:
- '=', '>', '/'.
+ '=', '<', '>', '/'.
This only affects attribute and tag names; attribute values allow
an even greater variety of characters. */
#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \
- && (x) != '=' && (x) != '>' && (x) != '/')
+ && (x) != '=' && (x) != '<' && (x) != '>' \
+ && (x) != '/')
#ifdef STANDALONE
static int comment_backout_count;
@@ -619,6 +620,7 @@
case '\n':
ch = *p++;
break;
+ case '<':
case '>':
state = AC_S_DONE;
break;
@@ -926,7 +928,7 @@
}
}
- if (end_tag && *p != '>')
+ if (end_tag && *p != '>' && *p != '<')
goto backout_tag;
if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
@@ -958,12 +960,12 @@
/* ^ */
ADVANCE (p);
SKIP_WS (p);
- if (*p != '>')
+ if (*p != '<' && *p != '>')
goto backout_tag;
}
/* Check for end of tag definition. */
- if (*p == '>')
+ if (*p == '<' || *p == '>')
break;
/* Establish bounds of attribute name. */
/* Establish bounds of attribute name. */
@@ -978,7 +980,8 @@
/* Establish bounds of attribute value. */
SKIP_WS (p);
- if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
+
+ if (NAME_CHAR_P (*p) || *p == '/' || *p == '<' || *p == '>')
{
/* Minimized attribute syntax allows `=' to be omitted.
For example, <UL COMPACT> is a valid shorthand for <UL
@@ -1015,7 +1018,7 @@
newline_seen = true;
continue;
}
- else if (newline_seen && *p == '>')
+ else if (newline_seen && (*p == '<' || *p == '>'))
break;
ADVANCE (p);
}
@@ -1040,7 +1043,7 @@
violated by, for instance, `%' in `width=75%'.
We'll be liberal and allow just about anything as
an attribute value. */
- while (!c_isspace (*p) && *p != '>')
+ while (!c_isspace (*p) && *p != '<' && *p != '>')
ADVANCE (p);
attr_value_end = p; /* <foo bar=baz qux=quix> */
/* ^ */
@@ -1138,7 +1141,8 @@
}
mapfun (&taginfo, maparg);
- ADVANCE (p);
+ if (*p != '<')
+ ADVANCE (p);
}
goto look_for_tag;