bug-wget
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Bug-wget] [PATCH] New option: --rename-output: modify output filename with perl


From: Andrew Cady
Subject: [Bug-wget] [PATCH] New option: --rename-output: modify output filename with perl
Date: Thu, 25 Jul 2013 10:31:11 -0400

This patch adds an option that allows the user to modify wget's output
filenames using a perl expression.  It works similarly to perl's
"rename" script, in terms of how perl is used to modify the filename
string.  That is, the original filename is stored in the perl variable
$_, which the user-supplied code can modify; the value left in $_ is
used instead of the original.

Perl treats $_ as the default variable for regular expressions (among
other operations), so the user can specify a regular expression without
having to know any perl (other than perl-compatible regexes), and that
will work fine.  But arbitrary perl code can be used, e.g., to perform
character escapes.

Example usage:

    $ wget -x --rename 's?/?%2f?g' http://www.gnu.org/software/wget/manual/html_node/index.html

    --2013-07-25 06:36:20--  http://www.gnu.org/software/wget/manual/html_node/index.html
    Resolving www.gnu.org (www.gnu.org)... 208.118.235.148, 2001:4830:134:3::a
    Connecting to www.gnu.org (www.gnu.org)|208.118.235.148|:80... connected.
    HTTP request sent, awaiting response... 200 OK
    Length: 8579 (8.4K) [text/html]
    Saving to: ‘www.gnu.org%2fsoftware%2fwget%2fmanual%2fhtml_node%2findex.html’

    100%[====================================================================================================>] 8,579       --.-K/s   in 0.05s

    2013-07-25 06:36:20 (167 KB/s) - ‘www.gnu.org%2fsoftware%2fwget%2fmanual%2fhtml_node%2findex.html’ saved [8579/8579]

This also works exactly how one would want it to work:

    $ wget -q --rename 's?/?%2f?g' -r --no-parent -k http://www.gnu.org/software/wget/manual/html_node/index.html

I.e., you get the site saved without any of the directory structure, and
all the internal links still work.

This patch is not 100% "done":

  * It includes "/usr/include/unistd.h" instead of <unistd.h>.
  Otherwise I get compiler warnings about implicitly declared functions.
  I guess the -I flags need to be changed when make calls gcc on
  perlfilter.c?  Please advise.

  * It should be conditionally disabled with autoconf in some
  environments.

  * It should either work as usual, or else produce a usage error when
  both --rename and -O are specified (instead, -O silently disables
  --rename).

  * It should be possible to override the program used to filter the
  names (arguably, the default should be sed rather than perl, although
  I don't think so).

  * The info documentation should be updated.

I will fix these issues if the patch is accepted into wget.
---
 src/Makefile.am  |    4 +--
 src/http.c       |   12 +++++++
 src/init.c       |    1 +
 src/main.c       |    3 ++
 src/options.h    |    2 ++
 src/perlfilter.c |  100 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/perlfilter.h |   10 ++++++
 7 files changed, 130 insertions(+), 2 deletions(-)
 create mode 100644 src/perlfilter.c
 create mode 100644 src/perlfilter.h

diff --git a/src/Makefile.am b/src/Makefile.am
index 8ef931a..16474ed 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -47,13 +47,13 @@ wget_SOURCES = cmpt.c connect.c convert.c cookies.c ftp.c   
                  \
               ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
               http.c init.c log.c main.c netrc.c progress.c ptimer.c     \
               recur.c res.c retr.c spider.c url.c warc.c                       
  \
-              utils.c exits.c build_info.c $(IRI_OBJ)                    \
+              utils.c exits.c build_info.c perlfilter.c $(IRI_OBJ)       \
               css-url.h css-tokens.h connect.h convert.h cookies.h       \
               ftp.h hash.h host.h html-parse.h html-url.h      \
               http.h http-ntlm.h init.h log.h mswindows.h netrc.h        \
               options.h progress.h ptimer.h recur.h res.h retr.h         \
               spider.h ssl.h sysdep.h url.h warc.h utils.h wget.h iri.h        
  \
-              exits.h gettext.h
+              exits.h gettext.h perlfilter.h
 nodist_wget_SOURCES = version.c
 EXTRA_wget_SOURCES = iri.c
 LDADD = $(LIBOBJS) ../lib/libgnu.a
diff --git a/src/http.c b/src/http.c
index 6b042a7..971374e 100644
--- a/src/http.c
+++ b/src/http.c
@@ -67,6 +67,7 @@ as that of the covered work.  */
 #ifdef __VMS
 # include "vms.h"
 #endif /* def __VMS */
+#include "perlfilter.h"
 
 extern char *version_string;
 
@@ -3019,6 +3020,17 @@ http_loop (struct url *u, struct url *original_url, char 
**newloc,
       got_name = true;
     }
 
+  if (got_name && opt.rename_output && !opt.output_document)
+    {
+      static pipe2_t *filter = 0;
+      if (!filter)
+        {
+          filter = malloc(sizeof(*filter));
+          *filter = init_perl_filter(opt.rename_output);
+        }
+      hstat.local_file = apply_perl_filter(*filter, hstat.local_file);
+    }
+
   if (got_name && file_exists_p (hstat.local_file) && opt.noclobber && 
!opt.output_document)
     {
       /* If opt.noclobber is turned on and file already exists, do not
diff --git a/src/init.c b/src/init.c
index 1c4432b..fb3be9b 100644
--- a/src/init.c
+++ b/src/init.c
@@ -254,6 +254,7 @@ static const struct {
   { "relativeonly",     &opt.relative_only,     cmd_boolean },
   { "remoteencoding",   &opt.encoding_remote,   cmd_string },
   { "removelisting",    &opt.remove_listing,    cmd_boolean },
+  { "renameoutput",     &opt.rename_output,     cmd_string },
   { "reportspeed",             &opt.report_bps, cmd_spec_report_speed},
   { "restrictfilenames", NULL,                  cmd_spec_restrict_file_names },
   { "retrsymlinks",     &opt.retr_symlinks,     cmd_boolean },
diff --git a/src/main.c b/src/main.c
index 8ce0eb3..3f8eed2 100644
--- a/src/main.c
+++ b/src/main.c
@@ -270,6 +270,7 @@ static struct cmdline_option option_data[] =
     { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
     { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 },
     { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
+    { "rename-output", 0, OPT_VALUE, "renameoutput", -1 },
     { "report-speed", 0, OPT_BOOLEAN, "reportspeed", -1 },
     { "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
     { "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
@@ -627,6 +628,8 @@ HTTP options:\n"),
        --auth-no-challenge     send Basic HTTP authentication information\n\
                                without first waiting for the server's\n\
                                challenge.\n"),
+    N_("\
+       --rename-output=CODE    rename output file(s) with perl.\n"),
     "\n",
 
 #ifdef HAVE_SSL
diff --git a/src/options.h b/src/options.h
index 0a10c9b..f810263 100644
--- a/src/options.h
+++ b/src/options.h
@@ -277,6 +277,8 @@ struct options
   int ftp_stmlf;                /* Force Stream_LF format for binary FTP. */
 #endif /* def __VMS */
 
+  char *rename_output;          /* Rename output file(s) using this perl code. 
*/
+
   bool useservertimestamps;    /* Update downloaded files' timestamps to
                                   match those on server? */
 
diff --git a/src/perlfilter.c b/src/perlfilter.c
new file mode 100644
index 0000000..805bc2e
--- /dev/null
+++ b/src/perlfilter.c
@@ -0,0 +1,100 @@
+#include <errno.h>
+#include <error.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/wait.h>
+#include "/usr/include/unistd.h"
+#include "perlfilter.h"
+
+/* Spawn a program with its standard input and output connected to the
+   caller through a pair of pipes.
+
+   ARGV[0] is the path of the program to execute.  ARGV[1] onward, up
+   to a terminating null pointer, is the argument vector handed to the
+   program, so ARGV[1] is what the child sees as its own argv[0].
+
+   On success, return 0 and store:
+     *RESULT_IN     descriptor from which the child's stdout is read;
+     *RESULT_OUT    descriptor whose writes arrive on the child's stdin;
+     *RESULT_CHILD  the child's process id (skipped when RESULT_CHILD
+                    is a null pointer).
+   On failure, return -1 with errno describing the failed pipe() or
+   fork(); every descriptor opened here has been closed again.  */
+static int
+open2(char *argv[], int *result_in, int *result_out, pid_t *result_child)
+{
+  int pipe_out[2]; /* out of parent */
+  int pipe_in[2];  /* into parent */
+  pid_t cpid;
+
+  if (pipe(pipe_out) < 0)
+    return -1;
+  if (pipe(pipe_in) < 0) {
+    int saved_errno = errno; /* close() may overwrite errno */
+    close(pipe_out[0]);
+    close(pipe_out[1]);
+    errno = saved_errno;
+    return -1;
+  }
+
+  cpid = fork();
+  if (cpid < 0) {
+    int saved_errno = errno;
+    close(pipe_out[0]);
+    close(pipe_out[1]);
+    close(pipe_in[0]);
+    close(pipe_in[1]);
+    errno = saved_errno;
+    return -1;
+  }
+
+  if (cpid == 0) { /* child */
+    int *input = pipe_out;  /* the child reads what the parent writes */
+    int *output = pipe_in;  /* the child writes what the parent reads */
+    close(output[0]);
+    close(input[1]);
+
+    /* If either descriptor already sits in the slot the other one is
+       about to be dup2'ed onto, move it out of the way first so the
+       dup2 calls below cannot clobber it.
+       See http://unixwiz.net/techtips/remap-pipe-fds.c.txt */
+    if (output[1] == 0) {
+      if ((output[1] = dup(output[1])) < 0)
+        error(1, errno, "in child process: dup");
+    }
+    if (input[0] == 1) {
+      if ((input[0] = dup(input[0])) < 0)
+        error(1, errno, "in child process: dup");
+    }
+
+    if (dup2(output[1], 1) < 0)
+      error(1, errno, "in child process: dup2");
+    if (dup2(input[0], 0) < 0)
+      error(1, errno, "in child process: dup2");
+
+    /* The pipe ends now live on 0 and 1; drop the extra copies.  */
+    if (output[1] != 1)
+      close(output[1]);
+    if (input[0] != 0)
+      close(input[0]);
+
+    execv(argv[0], &argv[1]);
+    error(1, errno, "in child process: exec");
+    return -1; /* not reached: error() exits when its status is nonzero */
+
+  } else { /* parent */
+    close(pipe_out[0]);
+    close(pipe_in[1]);
+
+    *result_in = pipe_in[0];
+    *result_out = pipe_out[1];
+    if (result_child)
+      *result_child = cpid;
+    return 0;
+  }
+}
+
+/* Start the perl process that rewrites output file names.
+
+   SRC is the user-supplied perl code.  It is passed to perl's -e after
+   the prefix BEGIN{$|++}, which makes perl's output unbuffered so each
+   result is written back as soon as it is produced.  The -0, -l and -p
+   switches make perl run the code once per NUL-terminated record, with
+   the record in $_, and print $_ followed by a NUL afterwards.
+
+   Returns the streams attached to the child (OUT is written to reach
+   its standard input, IN is read to obtain its standard output) and
+   its process id.  Every failure is fatal: the process exits through
+   error().  */
+pipe2_t init_perl_filter(char *src)
+{
+  pipe2_t res;
+  const char prefix[] = "BEGIN{$|++}";
+  char *perlcode;
+  int in = -1, out = -1; /* initialize to silence warning */
+
+  res.in = NULL;
+  res.out = NULL;
+  res.pid = -1; /* defined (and non-zero) before its address is passed on */
+
+  /* sizeof(prefix) already counts the terminating NUL.  */
+  perlcode = malloc(sizeof(prefix) + strlen(src));
+  if (!perlcode)
+    error(1, errno, "init_perl_filter: malloc");
+  strcpy(perlcode, prefix);
+  strcat(perlcode, src);
+
+  /* cmd[0] is the path to execute; the remaining entries are the
+     argument vector the child receives.  */
+  char *cmd[] = { "/usr/bin/perl", "perl", "-0lpe", perlcode, 0 };
+
+  if (open2(cmd, &in, &out, &res.pid) < 0)
+    error(1, errno, "init_perl_filter: open2");
+
+  /* The child was created with fork() and so holds its own copy of the
+     string; the parent's copy is no longer needed.  */
+  free(perlcode);
+
+  /* fdopen reports failure with a null pointer, not a negative value.  */
+  if ((res.out = fdopen(out, "w")) == NULL)
+    error(1, errno, "init_perl_filter: fdopen");
+  if ((res.in = fdopen(in, "r")) == NULL)
+    error(1, errno, "init_perl_filter: fdopen");
+
+  return res;
+}
+
+/* Rewrite the file name S using the perl process behind FILTER.
+
+   S is written to the filter together with its terminating NUL, and
+   one NUL-delimited record is read back as the new name.
+
+   S must point to storage obtained from malloc: it is handed to
+   getdelim as the result buffer, and getdelim may enlarge, and thereby
+   move, it.  The caller must therefore use the returned pointer in
+   place of S.
+
+   Every I/O failure is fatal: the process exits through error().  */
+char *apply_perl_filter(pipe2_t filter, char *s)
+{
+  /* Bytes known to be usable at S.  getdelim takes this as the current
+     buffer size, which is why it has to be a size_t.  */
+  size_t len = strlen(s) + 1;
+  ssize_t got;
+
+  /* fwrite returns the number of items written, never a negative value,
+     so a short count is the failure indication.  */
+  if (fwrite(s, 1, len, filter.out) != len)
+    error(1, errno, "apply_perl_filter: fwrite");
+
+  /* Deliver the record now: perl cannot answer until it has seen the
+     terminating NUL.  No fsync is attempted, because the stream is a
+     pipe and pipes do not support it.  */
+  if (fflush(filter.out) != 0)
+    error(1, errno, "apply_perl_filter: fflush");
+
+  got = getdelim(&s, &len, '\0', filter.in);
+  if (got < 0)
+    error(1, errno, "apply_perl_filter: getdelim");
+
+  return s;
+}
diff --git a/src/perlfilter.h b/src/perlfilter.h
new file mode 100644
index 0000000..68b5b69
--- /dev/null
+++ b/src/perlfilter.h
@@ -0,0 +1,10 @@
+/* Interface to the external perl process used to rewrite output file
+   names.  */
+#ifndef PERLFILTER_H
+#define PERLFILTER_H
+#include <stdio.h>
+#include <sys/types.h> /* pid_t */
+
+/* A child process and the stdio streams connected to it.  */
+typedef struct {
+  FILE *in, *out; /* IN reads the child's stdout; OUT writes to its stdin */
+  pid_t pid;      /* process id of the child */
+} pipe2_t;
+
+/* Start perl with SRC as the code to apply to every name; exits the
+   process on failure.  */
+pipe2_t init_perl_filter(char *src);
+
+/* Send the name S through FILTER and return the rewritten name.  S is
+   used as the result buffer and may be reallocated, so the returned
+   pointer replaces S.  Exits the process on failure.  */
+char *apply_perl_filter(pipe2_t filter, char *s);
+#endif
-- 
1.7.10.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]