=== modified file 'bootstrap.conf'
--- bootstrap.conf 2012-03-20 19:41:14 +0000
+++ bootstrap.conf 2012-04-04 15:09:08 +0000
@@ -58,6 +58,7 @@
 quote
 quotearg
 recv
+regex
 select
 send
 setsockopt

=== modified file 'src/init.c'
--- src/init.c 2012-03-08 09:00:51 +0000
+++ src/init.c 2012-04-04 17:46:59 +0000
@@ -80,6 +80,7 @@
 CMD_DECLARE (cmd_directory_vector);
 CMD_DECLARE (cmd_number);
 CMD_DECLARE (cmd_number_inf);
+CMD_DECLARE (cmd_regex);
 CMD_DECLARE (cmd_string);
 CMD_DECLARE (cmd_file);
 CMD_DECLARE (cmd_directory);
@@ -116,6 +117,7 @@
 } commands[] = {
   /* KEEP THIS LIST ALPHABETICALLY SORTED */
   { "accept",           &opt.accepts,           cmd_vector },
+  { "acceptregex",      &opt.acceptregex,       cmd_regex },
   { "addhostdir",       &opt.add_hostdir,       cmd_boolean },
   { "adjustextension",  &opt.adjust_extension,  cmd_boolean },
   { "alwaysrest",       &opt.always_rest,       cmd_boolean }, /* deprecated */
@@ -237,6 +239,7 @@
   { "recursive",        NULL,                   cmd_spec_recursive },
   { "referer",          &opt.referer,           cmd_string },
   { "reject",           &opt.rejects,           cmd_vector },
+  { "rejectregex",      &opt.rejectregex,       cmd_regex },
   { "relativeonly",     &opt.relative_only,     cmd_boolean },
   { "remoteencoding",   &opt.encoding_remote,   cmd_string },
   { "removelisting",    &opt.remove_listing,    cmd_boolean },
@@ -943,6 +946,51 @@
 
   return true;
 }
+
+/* Compile VAL as a POSIX extended regular expression and store a
+   pointer to the newly allocated regex_t in *PLACE.  COM is the
+   command name, used only in diagnostics.  Returns true on success.
+   On failure a diagnostic is printed to stderr, *PLACE is left
+   unchanged and false is returned.  */
+static bool
+cmd_regex (const char *com, const char *val, void *place)
+{
+  regex_t **regex = (regex_t **) place;
+  regex_t *compiled = malloc (sizeof *compiled);
+  int errcode;
+
+  if (!compiled)
+    {
+      fprintf (stderr, _("%s: %s: Memory exhausted\n"), exec_name, com);
+      return false;
+    }
+
+  errcode = regcomp (compiled, val, REG_EXTENDED | REG_NOSUB);
+  if (errcode != 0)
+    {
+      /* regerror truncates to the buffer size and always
+         NUL-terminates, so a fixed buffer needs no allocation.  */
+      char errbuf[256];
+
+      regerror (errcode, compiled, errbuf, sizeof errbuf);
+      fprintf (stderr, _("%s: %s: Invalid regular expression %s, %s\n"),
+               exec_name, com, quote (val), errbuf);
+      /* Do not publish a regex_t whose contents are undefined.  */
+      free (compiled);
+      return false;
+    }
+
+  /* Release the pattern from an earlier occurrence of the same
+     command; a non-NULL *PLACE is assumed to come from this
+     function.  */
+  if (*regex)
+    {
+      regfree (*regex);
+      free (*regex);
+    }
+  *regex = compiled;
+  return true;
+}
 
 /* Like the above, but handles tilde-expansion when reading a user's
    `.wgetrc'.  In that case, and if VAL begins with `~', the tilde

=== modified file 'src/main.c'
--- src/main.c 2012-03-05 21:23:06 +0000
+++ src/main.c 2012-04-04 15:15:50 +0000
@@ -158,6 +158,7 @@
 static struct cmdline_option option_data[] =
   {
     { "accept", 'A', OPT_VALUE, "accept", -1 },
+    { "acceptregex", 0, OPT_VALUE, "acceptregex", -1 },
     { "adjust-extension", 'E', OPT_BOOLEAN, "adjustextension", -1 },
     { "append-output", 'a', OPT__APPEND_OUTPUT, NULL, required_argument },
     { "ask-password", 0, OPT_BOOLEAN, "askpassword", -1 },
@@ -263,6 +264,7 @@
     { "recursive", 'r', OPT_BOOLEAN, "recursive", -1 },
     { "referer", 0, OPT_VALUE, "referer", -1 },
     { "reject", 'R', OPT_VALUE, "reject", -1 },
+    { "rejectregex", 0, OPT_VALUE, "rejectregex", -1 },
     { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
     { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 },
     { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
@@ -723,6 +725,10 @@
     N_("\
   -R,  --reject=LIST               comma-separated list of rejected extensions.\n"),
     N_("\
+       --acceptregex=REGEX         extended regex matching accepted URLs.\n"),
+    N_("\
+       --rejectregex=REGEX         extended regex matching rejected URLs.\n"),
+    N_("\
   -D,  --domains=LIST              comma-separated list of accepted domains.\n"),
     N_("\
        --exclude-domains=LIST      comma-separated list of rejected domains.\n"),

=== modified file 'src/options.h'
--- src/options.h 2012-03-05 21:23:06 +0000
+++ src/options.h 2012-04-04 17:43:42 +0000
@@ -29,6 +29,8 @@
 shall include the source code for the parts of OpenSSL used as well
 as that of the covered work.  */
 
+#include <regex.h>
+
 struct options
 {
   int verbose;                  /* Are we verbose?  (First set to -1,
@@ -74,6 +76,9 @@
   bool ignore_case;             /* Whether to ignore case when
                                    matching dirs and files */
 
+  regex_t *acceptregex;         /* Compiled pattern URLs must match.  */
+  regex_t *rejectregex;         /* Compiled pattern URLs must not match.  */
+
   char **domains;               /* See host.c */
   char **exclude_domains;
   bool dns_cache;               /* whether we cache DNS lookups.  */

=== modified file 'src/recur.c'
--- src/recur.c 2011-03-30 23:37:12 +0000
+++ src/recur.c 2012-04-04 17:48:34 +0000
@@ -586,6 +586,11 @@
           goto out;
         }
     }
+  if (!accept_url (url))
+    {
+      DEBUGP (("%s is excluded/not-included through regex.\n", url));
+      goto out;
+    }
 
   /* 6. Check for acceptance/rejection rules.  We ignore these rules
      for directories (no file name to match) and for non-leaf HTMLs,

=== modified file 'src/utils.c'
--- src/utils.c 2012-03-29 18:13:27 +0000
+++ src/utils.c 2012-04-04 17:47:46 +0000
@@ -917,6 +917,45 @@
   return true;
 }
 
+/* Match S against the compiled pattern RE.  Returns true if S
+   matches and false otherwise.  A regexec failure other than
+   REG_NOMATCH is logged with its regerror text and treated as "no
+   match".  */
+static bool
+url_matches_regex (const regex_t *re, const char *s)
+{
+  int rc = regexec (re, s, 0, NULL, 0);
+
+  if (rc == 0)
+    return true;
+
+  if (rc != REG_NOMATCH)
+    {
+      /* regerror truncates to the buffer size and always
+         NUL-terminates, so a fixed buffer needs no allocation.  */
+      char errbuf[256];
+
+      regerror (rc, re, errbuf, sizeof errbuf);
+      logprintf (LOG_VERBOSE, _("Error while matching %s: %s\n"),
+                 quote (s), errbuf);
+    }
+  return false;
+}
+
+/* Determine whether the URL S is acceptable to be followed, according
+   to the regex patterns to accept/reject.  S is rejected if an accept
+   pattern is set and S does not match it, or if a reject pattern is
+   set and S matches it.  Returns true when S may be followed.  */
+bool
+accept_url (const char *s)
+{
+  if (opt.acceptregex && !url_matches_regex (opt.acceptregex, s))
+    return false;
+  if (opt.rejectregex && url_matches_regex (opt.rejectregex, s))
+    return false;
+  return true;
+}
+
 /* Check if D2 is a subdirectory of D1.  E.g. if D1 is `/something', subdir_p()
    will return true if and only if D2 begins with `/something/' or
    is exactly '/something'.  */

=== modified file 'src/utils.h'
--- src/utils.h 2011-01-01 12:19:37 +0000
+++ src/utils.h 2012-04-04 15:13:48 +0000
@@ -90,6 +90,7 @@
 int fnmatch_nocase (const char *, const char *, int);
 
 bool acceptable (const char *);
+bool accept_url (const char *);
 bool accdir (const char *s);
 char *suffix (const char *s);
 bool match_tail (const char *, const char *, bool);