From 14fe0982e02ee4c10b241f9e7a29fb3e5164c6d5 Mon Sep 17 00:00:00 2001 From: "Dale R. Worley" Date: Sun, 16 Oct 2016 14:44:15 -0400 Subject: [PATCH] Amend redirection behavior * doc/wget.texi: Update documentation. Fix errors and omissions. * src/convert.h (struct urlpos): Add link_redirect_p flag to struct urlpos to indicate the URL resulted from a redirection. * src/recur.c (download_child): Suppress --no-parent check for redirection URLs. * src/recur.c (download_child): Suppress directory checks for redirection URLs and page requisites (if -p). * src/recur.c (descend_redirect): Set link_redirect_p flag on struct urlpos for redirection URLs. Remove old test for suppressing directory checks for redirection URLs. --- doc/wget.texi | 41 +++++++++++++++++++++++++++++++++++++---- src/convert.h | 1 + src/recur.c | 53 ++++++++++++++++++++++++++++------------------------- 3 files changed, 66 insertions(+), 29 deletions(-) diff --git a/doc/wget.texi b/doc/wget.texi index f42773e..91219e5 100644 --- a/doc/wget.texi +++ b/doc/wget.texi @@ -2357,6 +2357,11 @@ your shell from expanding it, like in @samp{-A "*.mp3"} or @samp{-A '*.mp3'}. @itemx --reject-regex @var{urlregex} Specify a regular expression to accept or reject the complete URL. address@hidden that the effect of @samp{--accept-regex} and +@samp{--reject-regex} is suppressed for +fetching redirection URLs and for fetching page requisite URLs if +@samp{-p} is specified. + @item --regex-type @var{regextype} Specify the regular expression type. Possible types are @samp{posix} or @samp{pcre}. Note that to be able to use @samp{pcre} type, wget has to be @@ -2431,18 +2436,32 @@ Specify a comma-separated list of directories you wish to follow when downloading (@pxref{Directory-Based Limits}). Elements of @var{list} may contain wildcards. 
address@hidden that the effect of @samp{--include-directories} and +@samp{--exclude-directories} is suppressed for +fetching redirection URLs and for fetching page requisite URLs if +@samp{-p} is specified. + @item -X @var{list} @itemx --exclude-directories=@var{list} Specify a comma-separated list of directories you wish to exclude from download (@pxref{Directory-Based Limits}). Elements of @var{list} may contain wildcards. address@hidden that the effect of @samp{--include-directories} and +@samp{--exclude-directories} is suppressed for +fetching redirection URLs and for fetching page requisite URLs if +@samp{-p} is specified. + @item -np @item --no-parent -Do not ever ascend to the parent directory when retrieving recursively. +Do not ascend to the parent directory when retrieving recursively. This is a useful option, since it guarantees that only the files @emph{below} a certain hierarchy will be downloaded. @xref{Directory-Based Limits}, for more details. + address@hidden that the effect of @samp{--no-parent} is suppressed for +fetching redirection URLs and for fetching page requisite URLs if +@samp{-p} is specified. @end table @c man end @@ -2689,6 +2708,11 @@ comma-separated list, and given as an argument to @samp{-A}. The argument to @samp{--accept-regex} option is a regular expression which is matched against the complete URL. address@hidden that the effect of @samp{--accept-regex} and +@samp{--reject-regex} is suppressed for +fetching redirection URLs and for fetching page requisite URLs if +@samp{-p} is specified. + @cindex reject wildcards @cindex reject suffixes @cindex wildcards, reject @@ -2709,9 +2733,14 @@ Analogously, to download all files except the ones beginning with expansion by the shell. @end table -The argument to @samp{--accept-regex} option is a regular expression which +The argument to @samp{--reject-regex} option is a regular expression which is matched against the complete URL. 
address@hidden that the effect of @samp{--accept-regex} and +@samp{--reject-regex} is suppressed for +fetching redirection URLs and for fetching page requisite URLs if +@samp{-p} is specified. + @noindent The @samp{-A} and @samp{-R} options may be combined to achieve even better fine-tuning of which files to retrieve. E.g. @samp{wget -A @@ -2778,12 +2807,16 @@ Wget offers three different options to deal with this requirement. Each option description lists a short name, a long name, and the equivalent command in @file{.wgetrc}. address@hidden that the effect of all of these options is suppressed +for fetching redirection URLs and for fetching page requisite URLs if +@samp{-p} is specified. + @cindex directories, include @cindex include directories @cindex accept directories @table @samp @item -I @var{list} -@itemx --include @var{list} +@itemx --include-directories @var{list} @itemx include_directories = @var{list} @samp{-I} option accepts a comma-separated list of directories included in the retrieval. Any other directories will simply be ignored. The @@ -2801,7 +2834,7 @@ wget -I /people,/cgi-bin http://host/people/bozo/ @cindex exclude directories @cindex reject directories @item -X @var{list} -@itemx --exclude @var{list} +@itemx --exclude-directories @var{list} @itemx exclude_directories = @var{list} @samp{-X} option is exactly the reverse of @samp{-I}---this is a list of directories @emph{excluded} from the download. E.g. 
if you do not want diff --git a/src/convert.h b/src/convert.h index e3ff6f0..af0ab79 100644 --- a/src/convert.h +++ b/src/convert.h @@ -72,6 +72,7 @@ struct urlpos { unsigned int link_noquote_html_p :1; /* from HTML, but doesn't need " */ unsigned int link_expect_html :1; /* expected to contain HTML */ unsigned int link_expect_css :1; /* expected to contain CSS */ + unsigned int link_redirect_p :1; /* the url comes from a redirection */ unsigned int link_refresh_p :1; /* link was received from */ diff --git a/src/recur.c b/src/recur.c index 1469e31..36aee22 100644 --- a/src/recur.c +++ b/src/recur.c @@ -650,14 +650,15 @@ download_child (const struct urlpos *upos, struct url *parent, int depth, /* 4. Check for parent directory. If we descended to a different host or changed the scheme, ignore - opt.no_parent. Also ignore it for documents needed to display - the parent page when in -p mode. */ + opt.no_parent. Also ignore it for redirections and documents + needed to display the parent page when in -p mode. */ if (opt.no_parent && schemes_are_similar_p (u->scheme, start_url_parsed->scheme) && 0 == strcasecmp (u->host, start_url_parsed->host) && (u->scheme != start_url_parsed->scheme || u->port == start_url_parsed->port) - && !(opt.page_requisites && upos->link_inline_p)) + && !(opt.page_requisites && upos->link_inline_p) + && !upos->link_redirect_p) { if (!subdir_p (start_url_parsed->dir, u->dir)) { @@ -668,23 +669,30 @@ download_child (const struct urlpos *upos, struct url *parent, int depth, } } - /* 5. If the file does not match the acceptance list, or is on the - rejection list, chuck it out. The same goes for the directory - exclusion and inclusion lists. */ - if (opt.includes || opt.excludes) - { - if (!accdir (u->dir)) - { - DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir)); - reason = WG_RR_LIST; - goto out; - } - } - if (!accept_url (url)) + /* 5. 
If the file does not match the acceptance regexp list, or is on the + rejection regexp list, chuck it out. The same goes for the directory + exclusion and inclusion lists. + + Ignore this test for redirections and documents needed to display + the parent page when in -p mode. */ + if (!(opt.page_requisites && upos->link_inline_p) + && !upos->link_redirect_p) { - DEBUGP (("%s is excluded/not-included through regex.\n", url)); - reason = WG_RR_REGEX; - goto out; + if (opt.includes || opt.excludes) + { + if (!accdir (u->dir)) + { + DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir)); + reason = WG_RR_LIST; + goto out; + } + } + if (!accept_url (url)) + { + DEBUGP (("%s is excluded/not-included through regex.\n", url)); + reason = WG_RR_REGEX; + goto out; + } } /* 6. Check for acceptance/rejection rules. We ignore these rules @@ -800,18 +808,13 @@ descend_redirect (const char *redirected, struct url *orig_parsed, int depth, upos = xnew0 (struct urlpos); upos->url = new_parsed; + upos->link_redirect_p = 1; reason = download_child (upos, orig_parsed, depth, start_url_parsed, blacklist, iri); if (reason == WG_RR_SUCCESS) blacklist_add (blacklist, upos->url->url); - else if (reason == WG_RR_LIST || reason == WG_RR_REGEX) - { - DEBUGP (("Ignoring decision for redirects, decided to load it.\n")); - blacklist_add (blacklist, upos->url->url); - reason = WG_RR_SUCCESS; - } else DEBUGP (("Redirection \"%s\" failed the test.\n", redirected)); -- 1.8.3.1