[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Bug-wget] bad filenames (again)
From: |
Andries E. Brouwer |
Subject: |
Re: [Bug-wget] bad filenames (again) |
Date: |
Sun, 9 Aug 2015 22:08:34 +0200 |
User-agent: |
Mutt/1.5.21 (2010-09-15) |
On Fri, Aug 07, 2015 at 05:13:19PM +0200, Tim Ruehsen wrote:
> The solution would be something like
>
> if locale is UTF-8
> do not escape valid UTF-8 sequences
> else
> keep wget's current behavior
> If you provide a patch for this, we would appreciate it.
OK - a first version of such a patch.
This splits the control-character restriction (restrict_files_ctrl) into two halves: low (bytes 0-31 and DEL) and high (bytes 128-159).
The low control is as before.
The high control is permitted by default on a Unix system
with something that looks like a UTF-8 locale.
For Windows the behavior is unchanged.
Andries
Test: fetch http://he.wikipedia.org/wiki/הרפש_.ש
diff -ru wget-1.16.3/src/init.c wget-1.16.3a/src/init.c
--- wget-1.16.3/src/init.c 2015-01-31 00:25:57.000000000 +0100
+++ wget-1.16.3a/src/init.c 2015-08-09 21:44:54.260215105 +0200
@@ -333,6 +333,27 @@
return -1;
}
+
+/* Return nonzero if LC_ALL, LC_CTYPE or LANG (first non-empty one wins) names
+   a UTF-8 locale; used to decide whether bytes 128-159 are OK in a filename.  */
+static int
+have_utf8_locale (void)
+{
+#if !(defined(WINDOWS) || defined(MSDOS) || defined(__CYGWIN__))
+  const char *p = getenv ("LC_ALL");
+  if (p == NULL || *p == '\0')    /* POSIX: an empty value counts as unset */
+    p = getenv ("LC_CTYPE");
+  if (p == NULL || *p == '\0')
+    p = getenv ("LANG");
+  /* All three may be unset; strstr (NULL, ...) would be undefined behavior.  */
+  if (p != NULL
+      && (strstr (p, "UTF-8") != NULL || strstr (p, "UTF8") != NULL
+          || strstr (p, "utf-8") != NULL || strstr (p, "utf8") != NULL))
+    return true;
+#endif
+  return false;   /* not UTF-8, or a Windows-like host (no test written yet) */
+}
+
/* Reset the variables to default values. */
void
defaults (void)
@@ -401,6 +422,7 @@
opt.restrict_files_os = restrict_unix;
#endif
opt.restrict_files_ctrl = true;
+ opt.restrict_files_highctrl = (have_utf8_locale() ? false : true);
opt.restrict_files_nonascii = false;
opt.restrict_files_case = restrict_no_case_restriction;
@@ -1466,6 +1488,7 @@
{
int restrict_os = opt.restrict_files_os;
int restrict_ctrl = opt.restrict_files_ctrl;
+ int restrict_highctrl = opt.restrict_files_highctrl;
int restrict_case = opt.restrict_files_case;
int restrict_nonascii = opt.restrict_files_nonascii;
@@ -1488,7 +1511,7 @@
else if (VAL_IS ("uppercase"))
restrict_case = restrict_uppercase;
else if (VAL_IS ("nocontrol"))
- restrict_ctrl = false;
+ restrict_ctrl = restrict_highctrl = false;
else if (VAL_IS ("ascii"))
restrict_nonascii = true;
else
@@ -1509,6 +1532,7 @@
opt.restrict_files_os = restrict_os;
opt.restrict_files_ctrl = restrict_ctrl;
+ opt.restrict_files_highctrl = restrict_highctrl;
opt.restrict_files_case = restrict_case;
opt.restrict_files_nonascii = restrict_nonascii;
diff -ru wget-1.16.3/src/options.h wget-1.16.3a/src/options.h
--- wget-1.16.3/src/options.h 2015-01-31 00:25:57.000000000 +0100
+++ wget-1.16.3a/src/options.h 2015-08-09 21:22:35.984186065 +0200
@@ -244,6 +244,7 @@
bool restrict_files_ctrl; /* non-zero if control chars in URLs
are restricted from appearing in
generated file names. */
+ bool restrict_files_highctrl; /* idem for bytes 128-159 */
bool restrict_files_nonascii; /* non-zero if bytes with values greater
than 127 are restricted. */
enum {
diff -ru wget-1.16.3/src/url.c wget-1.16.3a/src/url.c
--- wget-1.16.3/src/url.c 2015-02-23 16:10:22.000000000 +0100
+++ wget-1.16.3a/src/url.c 2015-08-09 21:14:34.876175626 +0200
@@ -1329,7 +1329,8 @@
enum {
filechr_not_unix = 1, /* unusable on Unix, / and \0 */
filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */
- filechr_control = 4 /* a control character, e.g. 0-31 */
+ filechr_control = 4, /* a control character, e.g. 0-31 */
+ filechr_highcontrol = 8 /* a high control character, in 128-159 */
};
#define FILE_CHAR_TEST(c, mask) \
@@ -1340,6 +1341,7 @@
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control
+#define Z filechr_highcontrol
#define UW U|W
#define UWC U|W|C
@@ -1370,8 +1372,8 @@
0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
0, 0, 0, 0, W, 0, 0, C, /* x y z { | } ~ DEL */
- C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */
- C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */
+ Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, /* 128-143 */
+ Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, /* 144-159 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -1383,6 +1385,7 @@
#undef U
#undef W
#undef C
+#undef Z
#undef UW
#undef UWC
@@ -1417,8 +1420,11 @@
mask = filechr_not_unix;
else
mask = filechr_not_windows;
+
if (opt.restrict_files_ctrl)
mask |= filechr_control;
+ if (opt.restrict_files_highctrl)
+ mask |= filechr_highcontrol;
/* Copy [b, e) to PATHEL and URL-unescape it. */
if (escaped)
- Re: [Bug-wget] bad filenames (again), Andries E. Brouwer, 2015/08/06
- Re: [Bug-wget] bad filenames (again), Tim Ruehsen, 2015/08/07
- Re: [Bug-wget] bad filenames (again), Andries E. Brouwer, 2015/08/07
- Re: [Bug-wget] bad filenames (again), Tim Ruehsen, 2015/08/07
- Re: [Bug-wget] bad filenames (again), Andries E. Brouwer, 2015/08/07
- Re: [Bug-wget] bad filenames (again),
Andries E. Brouwer <=
- Re: [Bug-wget] bad filenames (again), Tim Ruehsen, 2015/08/12
- Re: [Bug-wget] bad filenames (again), Andries E. Brouwer, 2015/08/12
- Re: [Bug-wget] bad filenames (again), Tim Ruehsen, 2015/08/12
- Re: [Bug-wget] bad filenames (again), Andries E. Brouwer, 2015/08/12
- Re: [Bug-wget] bad filenames (again), Tim Ruehsen, 2015/08/13
- Re: [Bug-wget] bad filenames (again), Andries E. Brouwer, 2015/08/13
- Re: [Bug-wget] bad filenames (again), Eli Zaretskii, 2015/08/16
- Re: [Bug-wget] bad filenames (again), Andries E. Brouwer, 2015/08/16
- Re: [Bug-wget] bad filenames (again), Eli Zaretskii, 2015/08/16
- Re: [Bug-wget] bad filenames (again), Andries E. Brouwer, 2015/08/17