>From 0123e79976aa2ec35f9610fbf3094457b689d091 Mon Sep 17 00:00:00 2001 From: Ander Juaristi Date: Mon, 13 Apr 2015 16:28:36 +0200 Subject: [PATCH 1/2] Fixed incorrect handling of reserved chars. * src/url.c (url_unescape_1): New static function. (url_unescape): Calls url_unescape_1 with mask zero. Preserves same behavior as before. Only code changes. (url_unescape_except_reserved): New function. * src/url.h: Added prototype for url_unescape_except_reserved(). When the locale is US-ASCII, URIs that contain special characters in them are converted to IRIs according to RFC 3987, section 3.2 "Converting URIs to IRIs". --- src/iri.c | 2 +- src/url.c | 40 +++++++++++++++++++++++++++++----------- src/url.h | 1 + 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/src/iri.c b/src/iri.c index 5278dee..10ae994 100644 --- a/src/iri.c +++ b/src/iri.c @@ -136,7 +136,7 @@ do_conversion (const char *tocode, const char *fromcode, char const *in_org, siz /* iconv() has to work on an unescaped string */ in_save = in = xstrndup (in_org, inlen); - url_unescape(in); + url_unescape_except_reserved (in); inlen = strlen(in); len = outlen = inlen * 2; diff --git a/src/url.c b/src/url.c index e78dbc6..73c8dd0 100644 --- a/src/url.c +++ b/src/url.c @@ -161,17 +161,8 @@ static const unsigned char urlchr_table[256] = #undef U #undef RU -/* URL-unescape the string S. - - This is done by transforming the sequences "%HH" to the character - represented by the hexadecimal digits HH. If % is not followed by - two hexadecimal digits, it is inserted literally. - - The transformation is done in place. If you need the original - string intact, make a copy before calling this function. */ - -void -url_unescape (char *s) +static void +url_unescape_1 (char *s, unsigned char mask) { char *t = s; /* t - tortoise */ char *h = s; /* h - hare */ @@ -190,6 +181,8 @@ url_unescape (char *s) if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2]))) goto copychar; c = X2DIGITS_TO_NUM (h[1], h[2]); + if (urlchr_test(c, mask)) + goto copychar; /* Don't unescape %00 because there is no way to insert it into a C string without effectively truncating it. */ if (c == '\0') @@ -201,6 +194,31 @@ url_unescape (char *s) *t = '\0'; } +/* URL-unescape the string S. + + This is done by transforming the sequences "%HH" to the character + represented by the hexadecimal digits HH. If % is not followed by + two hexadecimal digits, it is inserted literally. + + The transformation is done in place. If you need the original + string intact, make a copy before calling this function. */ +void +url_unescape (char *s) +{ + url_unescape_1 (s, 0); +} + +/* URL-unescape the string S. + + This functions behaves identically as url_unescape(), but does not + convert characters from "reserved". In other words, it only converts + "unsafe" characters. */ +void +url_unescape_except_reserved (char *s) +{ + url_unescape_1 (s, urlchr_reserved); +} + /* The core of url_escape_* functions. Escapes the characters that match the provided mask in urlchr_table. diff --git a/src/url.h b/src/url.h index b1c46c1..a543f3d 100644 --- a/src/url.h +++ b/src/url.h @@ -106,6 +106,7 @@ struct url char *url_escape (const char *); char *url_escape_unsafe_and_reserved (const char *); void url_unescape (char *); +void url_unescape_except_reserved (char *); struct url *url_parse (const char *, int *, struct iri *iri, bool percent_encode); char *url_error (const char *, int); -- 1.9.1