diff --git a/tccpp.c b/tccpp.c index 4d5169e..2fed4d0 100644 --- a/tccpp.c +++ b/tccpp.c @@ -2105,13 +2105,66 @@ static void parse_escape_string(CString *outstr, const uint8_t *buf, int is_long tcc_warning("unknown escape sequence: \'\\x%x\'", c); break; } + } else if (is_long && c >= 0x80) { + /* assume we are processing UTF-8 sequence */ + + int cont; /* count of continuation bytes */ + int i; + + /* decode leading byte */ + if ((c >> 5) == 0x6) { + cont = 1; n = c & 0x1f; + } else if ((c >> 4) == 0xe) { + cont = 2; n = c & 0xf; + } else if ((c >> 3) == 0x1e) { + cont = 3; n = c & 0x7; + } else { + goto invalid_utf8_sequence; + } + + /* decode continuation bytes */ + for (i = 1; i <= cont; i++) { + if ((p[i] >> 6) != 2) + goto invalid_utf8_sequence; + n = (n << 6) | (p[i] & 0x3f); + } + + /* check for overlong encoding */ + if ((cont == 1 && n < 0x80) || + (cont == 2 && n < 0x800) || + (cont == 3 && n < 0x10000)) + goto invalid_utf8_sequence; + + /* check for invalid code point */ + if (n > 0x10FFFF) + goto invalid_utf8_sequence; + + /* advance pointer */ + p += 1 + cont; + c = n; + goto add_char_nonext; + + invalid_utf8_sequence: + tcc_warning("invalid UTF-8 sequence"); } p++; add_char_nonext: if (!is_long) cstr_ccat(outstr, c); - else + else { +#ifdef TCC_TARGET_PE + /* store as UTF-16 */ + if (c < 0x10000) { + cstr_wccat(outstr, c); + } else { + c -= 0x10000; + cstr_wccat(outstr, (c >> 10) + 0xD800); + cstr_wccat(outstr, (c & 0x3FF) + 0xDC00); + } +#else cstr_wccat(outstr, c); +#endif + } } /* add a trailing '\0' */ if (!is_long)