From b75be369910b96e4c3b37a8d8c9d1542480eedde Mon Sep 17 00:00:00 2001 From: Tim Schlueter Date: Mon, 24 Jul 2017 23:24:05 -0700 Subject: [PATCH 3/3] Add gzip Content-Encoding decompression * src/http.c (struct http_stat): Add remote_encoding field. (read_response_body): Enable gzip decompression. (initialize_request): Send gzip Accept-Encoding header. (gethttp): Decompress files with gzip Content-Encoding. * src/retr.c: include zlib.h. (zalloc): New function. (zfree): New function. (fd_read_body): Decompress gzip data. * src/retr.h (fd_read_body enum): Add rb_compressed_gzip flag. --- src/http.c | 39 ++++++++++++++++- src/retr.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- src/retr.h | 4 +- 3 files changed, 180 insertions(+), 6 deletions(-) diff --git a/src/http.c b/src/http.c index a8c6e18..08b2ed6 100644 --- a/src/http.c +++ b/src/http.c @@ -1581,6 +1581,7 @@ struct http_stat #endif encoding_t local_encoding; /* the encoding of the local file */ + encoding_t remote_encoding; /* the encoding of the remote file */ bool temporary; /* downloading a temporary file */ }; @@ -1693,6 +1694,9 @@ read_response_body (struct http_stat *hs, int sock, FILE *fp, wgint contlen, if (chunked_transfer_encoding) flags |= rb_chunked_transfer_encoding; + if (hs->remote_encoding == ENC_GZIP) + flags |= rb_compressed_gzip; + hs->len = hs->restval; hs->rd_size = 0; /* Download the response body and write it to fp. 
@@ -1886,7 +1890,12 @@ initialize_request (const struct url *u, struct http_stat *hs, int *dt, struct u rel_value); SET_USER_AGENT (req); request_set_header (req, "Accept", "*/*", rel_none); - request_set_header (req, "Accept-Encoding", "identity", rel_none); +#ifdef HAVE_LIBZ + if (opt.compression != compression_none) + request_set_header (req, "Accept-Encoding", "gzip", rel_none); + else +#endif + request_set_header (req, "Accept-Encoding", "identity", rel_none); /* Find the username with priority */ if (u->user) @@ -3203,6 +3212,7 @@ gethttp (const struct url *u, struct url *original_url, struct http_stat *hs, hs->error = NULL; hs->message = NULL; hs->local_encoding = ENC_NONE; + hs->remote_encoding = ENC_NONE; conn = u; @@ -3694,6 +3704,30 @@ gethttp (const struct url *u, struct url *original_url, struct http_stat *hs, DEBUGP (("Unrecognized Content-Encoding: %s\n", hdrval)); hs->local_encoding = ENC_NONE; } +#ifdef HAVE_LIBZ + else if (hs->local_encoding == ENC_GZIP + && opt.compression != compression_none) + { + /* Make sure the Content-Type is not gzip before decompressing */ + const char * p = strchr (type, '/'); + if (p == NULL) + { + hs->remote_encoding = ENC_GZIP; + hs->local_encoding = ENC_NONE; + } + else + { + p++; + if (c_tolower(p[0]) == 'x' && p[1] == '-') + p += 2; + if (0 != c_strcasecmp (p, "gzip")) + { + hs->remote_encoding = ENC_GZIP; + hs->local_encoding = ENC_NONE; + } + } + } +#endif } /* 20x responses are counted among successful by default. */ @@ -3930,6 +3964,9 @@ gethttp (const struct url *u, struct url *original_url, struct http_stat *hs, } if (contlen == -1) hs->contlen = -1; + /* If the response is gzipped, the uncompressed size is unknown. */ + else if (hs->remote_encoding == ENC_GZIP) + hs->contlen = -1; else hs->contlen = contlen + contrange; diff --git a/src/retr.c b/src/retr.c index 0cf438e..a27d58a 100644 --- a/src/retr.c +++ b/src/retr.c @@ -41,6 +41,10 @@ as that of the covered work. */ # include /* For delete(). 
*/ #endif +#ifdef HAVE_LIBZ +# include <zlib.h> +#endif + #include "exits.h" #include "utils.h" #include "retr.h" @@ -84,6 +88,22 @@ limit_bandwidth_reset (void) xzero (limit_data); } +#ifdef HAVE_LIBZ +static voidpf +zalloc (voidpf opaque, unsigned int items, unsigned int size) +{ + (void) opaque; + return (voidpf) xcalloc (items, size); +} + +static void +zfree (voidpf opaque, voidpf address) +{ + (void) opaque; + xfree (address); +} +#endif + /* Limit the bandwidth by pausing the download for an amount of time. BYTES is the number of bytes received from the network, and TIMER is the timer that started at the beginning of download. */ @@ -257,6 +277,44 @@ fd_read_body (const char *downloaded_filename, int fd, FILE *out, wgint toread, wgint sum_written = 0; wgint remaining_chunk_size = 0; +#ifdef HAVE_LIBZ + /* try to minimize the number of calls to inflate() and write_data() per + call to fd_read() */ + unsigned int gzbufsize = dlbufsize * 4; + char *gzbuf = NULL; + z_stream gzstream; + + if (flags & rb_compressed_gzip) + { + gzbuf = xmalloc (gzbufsize); + if (gzbuf != NULL) + { + gzstream.zalloc = zalloc; + gzstream.zfree = zfree; + gzstream.opaque = Z_NULL; + gzstream.next_in = Z_NULL; + gzstream.avail_in = 0; + + #define GZIP_DETECT 32 /* gzip format detection */ + #define GZIP_WINDOW 15 /* logarithmic window size (default: 15) */ + ret = inflateInit2 (&gzstream, GZIP_DETECT | GZIP_WINDOW); + if (ret != Z_OK) + { + xfree (gzbuf); + errno = (ret == Z_MEM_ERROR) ? ENOMEM : EINVAL; + ret = -1; + goto out; + } + } + else + { + errno = ENOMEM; + ret = -1; + goto out; + } + } +#endif + if (flags & rb_skip_startpos) skip = startpos; @@ -383,12 +441,64 @@ fd_read_body (const char *downloaded_filename, int fd, FILE *out, wgint toread, int write_res; sum_read += ret; - write_res = write_data (out, out2, dlbuf, ret, &skip, &sum_written); - if (write_res < 0) + +#ifdef HAVE_LIBZ + if (gzbuf != NULL) { - ret = (write_res == -3) ? 
-3 : -2; - goto out; + int err; + int towrite; + gzstream.avail_in = ret; + gzstream.next_in = (unsigned char *) dlbuf; + + do + { + gzstream.avail_out = gzbufsize; + gzstream.next_out = (unsigned char *) gzbuf; + + err = inflate (&gzstream, Z_NO_FLUSH); + + switch (err) + { + case Z_MEM_ERROR: + errno = ENOMEM; + ret = -1; + goto out; + case Z_NEED_DICT: + case Z_DATA_ERROR: + errno = EINVAL; + ret = -1; + goto out; + case Z_STREAM_END: + if (exact && sum_read != toread) + { + DEBUGP(("zlib stream ended unexpectedly after " + "%ld/%ld bytes\n", sum_read, toread)); + } + } + + towrite = gzbufsize - gzstream.avail_out; + write_res = write_data (out, out2, gzbuf, towrite, &skip, + &sum_written); + if (write_res < 0) + { + ret = (write_res == -3) ? -3 : -2; + goto out; + } + } + while (gzstream.avail_out == 0); + } + else +#endif + { + write_res = write_data (out, out2, dlbuf, ret, &skip, + &sum_written); + if (write_res < 0) + { + ret = (write_res == -3) ? -3 : -2; + goto out; + } } + if (chunked) { remaining_chunk_size -= ret; @@ -433,6 +543,31 @@ fd_read_body (const char *downloaded_filename, int fd, FILE *out, wgint toread, if (timer) ptimer_destroy (timer); +#ifdef HAVE_LIBZ + if (gzbuf != NULL) + { + int err = inflateEnd (&gzstream); + if (ret >= 0) + { + /* with compression enabled, ret must be 0 if successful */ + if (err == Z_OK) + ret = 0; + else + { + errno = EINVAL; + ret = -1; + } + } + xfree (gzbuf); + + if (gzstream.total_in != sum_read) + { + DEBUGP(("zlib read size differs from raw read size (%lu/%lu)\n", + gzstream.total_in, sum_read)); + } + } +#endif + if (qtyread) *qtyread += sum_read; if (qtywritten) diff --git a/src/retr.h b/src/retr.h index 5fbbacb..f133c83 100644 --- a/src/retr.h +++ b/src/retr.h @@ -49,7 +49,9 @@ enum { rb_skip_startpos = 2, /* Used by HTTP/HTTPS*/ - rb_chunked_transfer_encoding = 4 + rb_chunked_transfer_encoding = 4, + + rb_compressed_gzip = 8 }; int fd_read_body (const char *, int, FILE *, wgint, wgint, wgint *, wgint *, 
double *, int, FILE *); -- 2.7.4