=== modified file 'ChangeLog'
--- ChangeLog 2012-03-25 11:47:53 +0000
+++ ChangeLog 2012-04-10 22:28:11 +0000
@@ -1,3 +1,8 @@
+2012-04-11 Gijs van Tulder
+
+ * bootstrap.conf (gnulib_modules): Include module `regex'.
+ * configure.ac: Check for PCRE library.
+
2012-03-25 Ray Satiro
* configure.ac: Fix build under mingw when OpenSSL is used.
=== modified file 'bootstrap.conf'
--- bootstrap.conf 2012-03-20 19:41:14 +0000
+++ bootstrap.conf 2012-04-04 15:09:08 +0000
@@ -58,6 +58,7 @@
quote
quotearg
recv
+regex
select
send
setsockopt
=== modified file 'configure.ac'
--- configure.ac 2012-03-25 11:47:53 +0000
+++ configure.ac 2012-04-10 21:59:48 +0000
@@ -532,6 +532,18 @@
])
)
+dnl
+dnl Check for PCRE.  If pcre.h and pcre_compile in -lpcre are both found,
+dnl append -lpcre to LIBS and define HAVE_LIBPCRE.
+
+AC_CHECK_HEADER(pcre.h,
+ AC_CHECK_LIB(pcre, pcre_compile,
+ [LIBS="${LIBS} -lpcre"
+ AC_DEFINE([HAVE_LIBPCRE], 1,
+ [Define if libpcre is available.])
+ ])
+)
+
dnl Needed by src/Makefile.am
AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
=== modified file 'src/ChangeLog'
--- src/ChangeLog 2012-04-01 14:30:59 +0000
+++ src/ChangeLog 2012-04-10 22:30:28 +0000
@@ -1,3 +1,12 @@
+2012-04-11 Gijs van Tulder
+
+ * init.c: Add --accept-regex, --reject-regex and --regex-type.
+ * main.c: Likewise.
+ * options.h: Likewise.
+ * recur.c: Likewise.
+ * utils.c: Add regex-related functions.
+ * utils.h: Add regex-related functions.
+
2012-04-01 Giuseppe Scrivano
* gnutls.c (wgnutls_read_timeout): Ensure timer is freed.
=== modified file 'src/init.c'
--- src/init.c 2012-03-08 09:00:51 +0000
+++ src/init.c 2012-04-10 22:10:10 +0000
@@ -46,6 +46,10 @@
# endif
#endif
+#include <regex.h>
+#ifdef HAVE_LIBPCRE
+# include <pcre.h>
+#endif
#ifdef HAVE_PWD_H
# include <pwd.h>
@@ -94,6 +98,7 @@
CMD_DECLARE (cmd_spec_prefer_family);
CMD_DECLARE (cmd_spec_progress);
CMD_DECLARE (cmd_spec_recursive);
+CMD_DECLARE (cmd_spec_regex_type);
CMD_DECLARE (cmd_spec_restrict_file_names);
#ifdef HAVE_SSL
CMD_DECLARE (cmd_spec_secure_protocol);
@@ -116,6 +121,7 @@
} commands[] = {
/* KEEP THIS LIST ALPHABETICALLY SORTED */
{ "accept", &opt.accepts, cmd_vector },
+ { "acceptregex", &opt.acceptregex_s, cmd_string },
{ "addhostdir", &opt.add_hostdir, cmd_boolean },
{ "adjustextension", &opt.adjust_extension, cmd_boolean },
{ "alwaysrest", &opt.always_rest, cmd_boolean }, /* deprecated */
@@ -236,7 +242,9 @@
{ "reclevel", &opt.reclevel, cmd_number_inf },
{ "recursive", NULL, cmd_spec_recursive },
{ "referer", &opt.referer, cmd_string },
+ { "regextype", &opt.regex_type, cmd_spec_regex_type },
{ "reject", &opt.rejects, cmd_vector },
+ { "rejectregex", &opt.rejectregex_s, cmd_string },
{ "relativeonly", &opt.relative_only, cmd_boolean },
{ "remoteencoding", &opt.encoding_remote, cmd_string },
{ "removelisting", &opt.remove_listing, cmd_boolean },
@@ -361,6 +369,8 @@
opt.restrict_files_nonascii = false;
opt.restrict_files_case = restrict_no_case_restriction;
+ opt.regex_type = regex_type_posix;
+
opt.max_redirect = 20;
opt.waitretry = 10;
@@ -1368,6 +1378,25 @@
return true;
}
+/* Validate --regex-type and set the choice.  VAL must be "posix" or,
+   when built with libpcre, "pcre"; otherwise an error is printed.  */
+static bool
+cmd_spec_regex_type (const char *com, const char *val, void *place_ignored)
+{
+  static const struct decode_item choices[] = {
+    { "posix", regex_type_posix },
+#ifdef HAVE_LIBPCRE
+    { "pcre",  regex_type_pcre },
+#endif
+  };
+  int regex_type = regex_type_posix;
+  int ok = decode_string (val, choices, countof (choices), &regex_type);
+  if (!ok)
+    fprintf (stderr, _("%s: %s: Invalid value %s.\n"), exec_name, com, quote (val));
+  opt.regex_type = regex_type;
+  return ok;
+}
+
static bool
cmd_spec_restrict_file_names (const char *com, const char *val, void *place_ignored)
{
=== modified file 'src/main.c'
--- src/main.c 2012-03-05 21:23:06 +0000
+++ src/main.c 2012-04-10 22:25:56 +0000
@@ -158,6 +158,7 @@
static struct cmdline_option option_data[] =
{
{ "accept", 'A', OPT_VALUE, "accept", -1 },
+ { "accept-regex", 0, OPT_VALUE, "acceptregex", -1 },
{ "adjust-extension", 'E', OPT_BOOLEAN, "adjustextension", -1 },
{ "append-output", 'a', OPT__APPEND_OUTPUT, NULL, required_argument },
{ "ask-password", 0, OPT_BOOLEAN, "askpassword", -1 },
@@ -262,7 +263,9 @@
{ "read-timeout", 0, OPT_VALUE, "readtimeout", -1 },
{ "recursive", 'r', OPT_BOOLEAN, "recursive", -1 },
{ "referer", 0, OPT_VALUE, "referer", -1 },
+ { "regex-type", 0, OPT_VALUE, "regextype", -1 },
{ "reject", 'R', OPT_VALUE, "reject", -1 },
+ { "reject-regex", 0, OPT_VALUE, "rejectregex", -1 },
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
{ "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 },
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
@@ -723,6 +726,17 @@
N_("\
-R, --reject=LIST comma-separated list of rejected extensions.\n"),
N_("\
+ --accept-regex=REGEX regex matching accepted URLs.\n"),
+ N_("\
+ --reject-regex=REGEX regex matching rejected URLs.\n"),
+#ifdef HAVE_LIBPCRE
+ N_("\
+ --regex-type=TYPE regex type (posix|pcre).\n"),
+#else
+ N_("\
+ --regex-type=TYPE regex type (posix).\n"),
+#endif
+ N_("\
-D, --domains=LIST comma-separated list of accepted domains.\n"),
N_("\
--exclude-domains=LIST comma-separated list of rejected domains.\n"),
@@ -1323,6 +1337,35 @@
exit (1);
}
+ /* Compile the regular expressions. */
+ switch (opt.regex_type)
+ {
+#ifdef HAVE_LIBPCRE
+ case regex_type_pcre:
+ opt.regex_compile_fun = compile_pcre_regex;
+ opt.regex_match_fun = match_pcre_regex;
+ break;
+#endif
+
+ case regex_type_posix:
+ default:
+ opt.regex_compile_fun = compile_posix_regex;
+ opt.regex_match_fun = match_posix_regex;
+ break;
+ }
+ if (opt.acceptregex_s)
+ {
+ opt.acceptregex = opt.regex_compile_fun (opt.acceptregex_s);
+ if (!opt.acceptregex)
+ exit (1);
+ }
+ if (opt.rejectregex_s)
+ {
+ opt.rejectregex = opt.regex_compile_fun (opt.rejectregex_s);
+ if (!opt.rejectregex)
+ exit (1);
+ }
+
#ifdef ENABLE_IRI
if (opt.enable_iri)
{
=== modified file 'src/options.h'
--- src/options.h 2012-03-05 21:23:06 +0000
+++ src/options.h 2012-04-10 22:20:26 +0000
@@ -74,6 +74,19 @@
bool ignore_case; /* Whether to ignore case when
matching dirs and files */
+ char *acceptregex_s; /* Patterns to accept (a regex string). */
+ char *rejectregex_s; /* Patterns to reject (a regex string). */
+ void *acceptregex; /* Patterns to accept (a regex struct). */
+ void *rejectregex; /* Patterns to reject (a regex struct). */
+ enum {
+#ifdef HAVE_LIBPCRE
+ regex_type_pcre,
+#endif
+ regex_type_posix
+ } regex_type; /* The regex library. */
+ void *(*regex_compile_fun)(const char *); /* Function to compile a regex. */
+ bool (*regex_match_fun)(const void *, const char *); /* Function to match a string to a regex. */
+
char **domains; /* See host.c */
char **exclude_domains;
bool dns_cache; /* whether we cache DNS lookups. */
=== modified file 'src/recur.c'
--- src/recur.c 2011-03-30 23:37:12 +0000
+++ src/recur.c 2012-04-04 17:48:34 +0000
@@ -586,6 +586,11 @@
goto out;
}
}
+ if (!accept_url (url))
+ {
+ DEBUGP (("%s is excluded/not-included through regex.\n", url));
+ goto out;
+ }
/* 6. Check for acceptance/rejection rules. We ignore these rules
for directories (no file name to match) and for non-leaf HTMLs,
=== modified file 'src/utils.c'
--- src/utils.c 2012-03-29 18:13:27 +0000
+++ src/utils.c 2012-04-10 22:22:10 +0000
@@ -73,6 +73,11 @@
#include
#include
+#include <regex.h>
+#ifdef HAVE_LIBPCRE
+# include <pcre.h>
+#endif
+
#ifndef HAVE_SIGSETJMP
/* If sigsetjmp is a macro, configure won't pick it up. */
# ifdef sigsetjmp
@@ -917,6 +922,19 @@
return true;
}
+/* Return true if the URL S passes the regex filters: it must match
+   opt.acceptregex when one is set and must not match opt.rejectregex.  */
+bool
+accept_url (const char *s)
+{
+  bool accepted = !opt.acceptregex
+                  || opt.regex_match_fun (opt.acceptregex, s);
+  bool rejected = accepted && opt.rejectregex
+                  && opt.regex_match_fun (opt.rejectregex, s);
+  return accepted && !rejected;
+}
+}
+
/* Check if D2 is a subdirectory of D1. E.g. if D1 is `/something', subdir_p()
will return true if and only if D2 begins with `/something/' or is exactly
'/something'. */
@@ -2309,6 +2327,92 @@
return q - (char *) dest;
}
+#ifdef HAVE_LIBPCRE
+/* Compile STR as a PCRE regex.  Return the compiled pattern, or NULL
+   after printing a diagnostic to stderr if STR does not compile.  */
+void *
+compile_pcre_regex (const char *str)
+{
+  const char *errbuf;
+  int erroffset;
+  pcre *regex = pcre_compile (str, 0, &errbuf, &erroffset, 0);
+  if (!regex)
+    {
+      fprintf (stderr, _("Invalid regular expression %s, %s\n"),
+               quote (str), errbuf);
+      return NULL;
+    }
+  return regex;
+}
+#endif
+
+/* Compile STR as a POSIX extended regex (REG_EXTENDED | REG_NOSUB).
+   Return a heap-allocated regex_t, or NULL after printing a
+   diagnostic to stderr if STR does not compile.  */
+void *
+compile_posix_regex (const char *str)
+{
+  regex_t *regex = xmalloc (sizeof *regex);
+  int errcode = regcomp (regex, str, REG_EXTENDED | REG_NOSUB);
+  if (errcode != 0)
+    {
+      size_t errbuf_size = regerror (errcode, regex, NULL, 0);
+      char *errbuf = xmalloc (errbuf_size);
+      regerror (errcode, regex, errbuf, errbuf_size);
+      fprintf (stderr, _("Invalid regular expression %s, %s\n"),
+               quote (str), errbuf);
+      xfree (errbuf);
+      xfree (regex);   /* Not handed to the caller on failure.  */
+      return NULL;
+    }
+  return regex;
+}
+
+#ifdef HAVE_LIBPCRE
+#define OVECCOUNT 30
+/* Return true if STR matches the compiled PCRE pattern REGEX.  A
+   failed match yields false; any other pcre_exec error is logged
+   and also yields false.  */
+bool
+match_pcre_regex (const void *regex, const char *str)
+{
+  int captures[OVECCOUNT];
+  int subject_len = strlen (str);
+  int status = pcre_exec ((pcre *) regex, 0, str, subject_len, 0, 0,
+                          captures, OVECCOUNT);
+
+  if (status >= 0)
+    return true;
+  /* A plain non-match is expected; only other failures are reported.  */
+  if (status != PCRE_ERROR_NOMATCH)
+    logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
+               quote (str), status);
+  return false;
+}
+#undef OVECCOUNT
+#endif
+
+/* Return true if STR matches the compiled POSIX regex REGEX.  Errors
+   other than REG_NOMATCH are logged and treated as no match.  */
+bool
+match_posix_regex (const void *regex, const char *str)
+{
+  int rc = regexec ((const regex_t *) regex, str, 0, NULL, 0);
+  if (rc == 0 || rc == REG_NOMATCH)
+    return rc == 0;
+  else
+    {
+      /* Ask REGEX itself for the message; it may be the reject pattern.  */
+      size_t errbuf_size = regerror (rc, (const regex_t *) regex, NULL, 0);
+      char *errbuf = xmalloc (errbuf_size);
+      regerror (rc, (const regex_t *) regex, errbuf, errbuf_size);
+      logprintf (LOG_VERBOSE, _("Error while matching %s: %s\n"),
+                 quote (str), errbuf);
+      xfree (errbuf);
+      return false;
+    }
+}
+
#undef IS_ASCII
#undef NEXT_CHAR
=== modified file 'src/utils.h'
--- src/utils.h 2011-01-01 12:19:37 +0000
+++ src/utils.h 2012-04-10 22:10:39 +0000
@@ -90,6 +90,7 @@
int fnmatch_nocase (const char *, const char *, int);
bool acceptable (const char *);
+bool accept_url (const char *);
bool accdir (const char *s);
char *suffix (const char *s);
bool match_tail (const char *, const char *, bool);
@@ -141,6 +142,14 @@
int base64_encode (const void *, int, char *);
int base64_decode (const char *, void *);
+#ifdef HAVE_LIBPCRE
+void *compile_pcre_regex (const char *);
+bool match_pcre_regex (const void *, const char *);
+#endif
+
+void *compile_posix_regex (const char *);
+bool match_posix_regex (const void *, const char *);
+
void stable_sort (void *, size_t, size_t, int (*) (const void *, const void *));
const char *print_decimal (double);