[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
bug#24603: [RFC 11/18] Implement casing rules for Lithuanian
From: |
Michal Nazarewicz |
Subject: |
bug#24603: [RFC 11/18] Implement casing rules for Lithuanian |
Date: |
Tue, 4 Oct 2016 03:10:34 +0200 |
In Lithuanian, the tittles above lower-case i and j are retained even if
other diacritics are present above them. For that to work, an explicit
combining dot above must be added after i and j; otherwise the
rendering engine will remove the tittle.
* src/casefiddle.c (struct casing_context, prepare_casing_context): Add
lithuanian_tittle member to hold state of Lithuanian rules handling.
(case_lithuanian): New function which implements Lithuanian rules.
(case_characters): Make use of case_lithuanian.
* test/src/casefiddle-tests.el (casefiddle-tests-casing): Add test cases
for Lithuanian rules.
---
src/casefiddle.c | 149 +++++++++++++++++++++++++++++++++++++++++--
test/src/casefiddle-tests.el | 27 +++++++-
2 files changed, 170 insertions(+), 6 deletions(-)
diff --git a/src/casefiddle.c b/src/casefiddle.c
index 2a7aa64..0377fe6 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -56,6 +56,16 @@ struct casing_context {
bool inword;
/* Whether to apply Azeri/Turkish rules for dotted and dotless i. */
bool treat_turkic_i;
+
+ /* Whether to use Lithuanian rules for i’s and j’s tittle. */
+ unsigned char lithuanian_tittle;
+#define LT_OFF 0 /* No */
+#define LT_ON 1 /* Yes */
+#define LT_DEL_DOT_ABOVE 2 /* Yes and look out for combining dot above to
+ delete. */
+#define LT_INS_DOT_ABOVE 3 /* Yes and look out for diacritics combining above
+ because we may need to inject dot above before
+ them. */
};
/* Initialise CTX structure and prepares related global data for casing
@@ -64,7 +74,7 @@ static void
prepare_casing_context (struct casing_context *ctx,
enum case_action flag, bool inbuffer)
{
- Lisp_Object lang, l, tr, az;
+ Lisp_Object lang, l, tr, az, lt;
ctx->flag = flag;
ctx->inbuffer = inbuffer;
@@ -74,6 +84,7 @@ prepare_casing_context (struct casing_context *ctx,
: Qnil;
ctx->treat_turkic_i = false;
+ ctx->lithuanian_tittle = LT_OFF;
/* If the case table is flagged as modified, rescan it. */
if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))
@@ -86,6 +97,7 @@ prepare_casing_context (struct casing_context *ctx,
lang = Vcurrent_iso639_language;
tr = intern_c_string ("tr");
az = intern_c_string ("az");
+ lt = intern_c_string ("lt");
if (SYMBOLP (lang))
{
l = lang;
@@ -97,10 +109,9 @@ prepare_casing_context (struct casing_context *ctx,
lang = XCDR (lang);
check_language:
if (EQ (l, tr) || EQ (l, az))
- {
- ctx->treat_turkic_i = true;
- break;
- }
+ ctx->treat_turkic_i = true;
+ else if (EQ (l, lt))
+ ctx->lithuanian_tittle = LT_ON;
}
}
@@ -199,6 +210,131 @@ case_character_impl (struct casing_str_buf *buf,
#define CAPITAL_DOTTED_I 0x130
#define SMALL_DOTLESS_I 0x131
#define COMBINING_DOT_ABOVE 0x307
+
+/* Lithuanian retains tittle in lower case i and j when there are more
+ accents above those letters. */
+
+#define CAPITAL_I_WITH_GRAVE 0x0CC
+#define CAPITAL_I_WITH_ACUTE 0x0CD
+#define CAPITAL_I_WITH_TILDE 0x128
+#define CAPITAL_I_WITH_OGONEK 0x12E
+#define SMALL_I_WITH_OGONEK 0x12F
+#define COMBINING_GRAVE_ABOVE 0x300
+#define COMBINING_ACUTE_ABOVE 0x301
+#define COMBINING_TILDE_ABOVE 0x303
+#define COMBINING_OGONEK 0x328
+
+/* Attempt to case CH per FLAG using Lithuanian rules for i and j, updating
+ CTX->lithuanian_tittle. Return true if CH has been handled (the result,
+ possibly empty, is in BUF), false otherwise (always so when LT_OFF). */
+static bool
+case_lithuanian (struct casing_str_buf *buf, struct casing_context *ctx,
+ enum case_action flag, int ch)
+{
+ switch (__builtin_expect(ctx->lithuanian_tittle, LT_OFF)) {
+ case LT_OFF:
+ return false;
+
+ case LT_DEL_DOT_ABOVE:
+ /* When upper-casing i or j, a combining dot above that follows it must be
+ removed. This is true even if there’s a combining ogonek in between.
+ But, if there’s another character combining above in between, combining
+ dot needs to stay (since the dot will be rendered above the other
+ diacritic). */
+ switch (ch) {
+ case COMBINING_DOT_ABOVE:
+ buf->len_chars = buf->len_bytes = 0; /* Emit nothing: the dot is dropped. */
+ ctx->lithuanian_tittle = LT_ON;
+ return true;
+ case COMBINING_GRAVE_ABOVE:
+ case COMBINING_ACUTE_ABOVE:
+ case COMBINING_TILDE_ABOVE:
+ ctx->lithuanian_tittle = LT_ON; /* Stop looking for a dot to delete. */
+ return false;
+ case COMBINING_OGONEK:
+ return false; /* State is left unchanged; keep looking past the ogonek. */
+ default:
+ ctx->lithuanian_tittle = LT_ON; /* Then case CH by the rules below. */
+ }
+ break;
+
+ case LT_INS_DOT_ABOVE:
+ /* When lower-casing I or J, if the letter has any accents above,
+ a combining dot above must be added before them. If we are here, it
+ means that we have lower-cased I or J and we’re now on the lookout for
+ accents combining above. */
+ switch (ch) {
+ case COMBINING_GRAVE_ABOVE:
+ case COMBINING_ACUTE_ABOVE:
+ case COMBINING_TILDE_ABOVE:
+ buf->len_chars = 2; /* Combining dot above followed by CH itself. */
+ buf->len_bytes = CHAR_STRING (COMBINING_DOT_ABOVE, buf->data);
+ buf->len_bytes += CHAR_STRING (ch, buf->data + buf->len_bytes);
+ ctx->lithuanian_tittle = LT_ON;
+ return true;
+ case COMBINING_OGONEK:
+ return false; /* State is left unchanged; keep looking past the ogonek. */
+ default:
+ ctx->lithuanian_tittle = LT_ON; /* Then case CH by the rules below. */
+ }
+ break;
+ }
+
+ switch (flag) {
+ case CASE_UP:
+ case CASE_CAPITALIZE:
+ if (ch == 'i' || ch == 'j')
+ {
+ buf->data[0] = ch ^ ('i' ^ 'I'); /* Flip the bit in which i and I differ. */
+ buf->len_bytes = 1;
+ }
+ else if (ch == SMALL_I_WITH_OGONEK)
+ buf->len_bytes = CHAR_STRING (CAPITAL_I_WITH_OGONEK, buf->data);
+ else
+ break;
+ buf->len_chars = 1;
+ /* Change the state so we’re on the lookout for combining dot above. */
+ ctx->lithuanian_tittle = LT_DEL_DOT_ABOVE;
+ return true;
+
+ case CASE_DOWN:
+ /* Turning I or J to lower case requires combining dot above to be included
+ IF there are any other characters combining above present. This is so
+ that the tittle is preserved. */
+ switch (ch) {
+ case CAPITAL_I_WITH_GRAVE:
+ ch = 0x80; /* U+300, "\xCC\x80", combining grave accent */
+ goto has_accent;
+ case CAPITAL_I_WITH_ACUTE:
+ ch = 0x81; /* U+301, "\xCC\x81", combining acute accent */
+ goto has_accent;
+ case CAPITAL_I_WITH_TILDE:
+ ch = 0x83; /* U+303, "\xCC\x83", combining tilde */
+ has_accent:
+ memcpy (buf->data, "i\xCC\x87\xCC", 4); /* i, U+307, accent’s first byte. */
+ buf->data[4] = ch; /* Second byte of the accent, as selected above. */
+ buf->len_chars = 3;
+ buf->len_bytes = 5;
+ return true;
+
+ case 'I':
+ case 'J':
+ buf->data[0] = ch ^ ('i' ^ 'I'); /* Flip the bit in which i and I differ. */
+ buf->len_bytes = 1;
+ if (false) /* Skipped for I and J; entered only via the case label below. */
+ case CAPITAL_I_WITH_OGONEK:
+ buf->len_bytes = CHAR_STRING (SMALL_I_WITH_OGONEK, buf->data);
+ buf->len_chars = 1;
+ /* Change the state so we’re on the lookout for diacritics combining
+ above. If one is found, we need to add combining dot above. */
+ ctx->lithuanian_tittle = LT_INS_DOT_ABOVE;
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
/* Based on CTX, case character CH accordingly. Update CTX as necessary.
Return cased character.
@@ -234,6 +370,9 @@ case_characters (struct casing_str_buf *buf, struct
casing_context *ctx,
{
enum case_action flag = normalise_flag (ctx);
+ if (case_lithuanian (buf, ctx, flag, ch))
+ return 0;
+
if (flag != CASE_NO_ACTION && __builtin_expect(ctx->treat_turkic_i, false))
{
bool dot_above = false;
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index 9f5e43f..bae4242 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -185,7 +185,32 @@ casefiddle-tests--characters
("I\u0307si\u0307s" "I\u0307Sİ\u0307S" "isi\u0307s"
"I\u0307si\u0307s" "I\u0307si\u0307s" 'tr)
("I\u0307sI\u0307s" "I\u0307SI\u0307S" "isis"
- "I\u0307sis" "I\u0307sI\u0307s" 'tr))
+ "I\u0307sis" "I\u0307sI\u0307s" 'tr)
+
+ ;; Test that combining dot above is inserted when needed when
+ ;; lower-casing I or J.
+ ("I\u0328\u0300" ; I + ogonek + grave
+ "I\u0328\u0300" "i\u0328\u0307\u0300"
+ "I\u0328\u0300" "I\u0328\u0300" 'lt)
+
+ ("J\u0328\u0300" ; J + ogonek + grave
+ "J\u0328\u0300" "j\u0328\u0307\u0300"
+ "J\u0328\u0300" "J\u0328\u0300" 'lt)
+
+ ("Į\u0300" ; I-ogonek + grave
+ "Į\u0300" "į\u0307\u0300" "Į\u0300" "Į\u0300" 'lt)
+
+ ("Ì Í Ĩ"
+ "Ì Í Ĩ" "i\u0307\u0300 i\u0307\u0301 i\u0307\u0303"
+ "Ì Í Ĩ" "Ì Í Ĩ" 'lt)
+
+ ;; Test that combining dot above is removed when upper-casing i or j.
+ ("i\u0328\u0307" ; i + ogonek + dot above
+ "I\u0328" "i\u0328\u0307" "I\u0328" "I\u0328" 'lt)
+ ("j\u0328\u0307" ; j + ogonek + dot above
+ "J\u0328" "j\u0328\u0307" "J\u0328" "J\u0328" 'lt)
+ ("į\u0307" ; i-ogonek + dot above
+ "Į" "į\u0307" "Į" "Į" 'lt))
(nreverse errors))
(let* ((input (string-to-multibyte (car test)))
(expected (cdr test))
--
2.8.0.rc3.226.g39d4020
- bug#24603: [RFC 00/18] Improvement to casing, Michal Nazarewicz, 2016/10/03
- bug#24603: [RFC 01/18] Add tests for casefiddle.c, Michal Nazarewicz, 2016/10/03
- bug#24603: [RFC 05/18] Introduce case_character function, Michal Nazarewicz, 2016/10/03
- bug#24603: [RFC 06/18] Add support for title-casing letters, Michal Nazarewicz, 2016/10/03
- bug#24603: [RFC 13/18] Add some tricky Unicode characters to regex test, Michal Nazarewicz, 2016/10/03
- bug#24603: [RFC 15/18] Base lower- and upper-case tests on Unicode properties, Michal Nazarewicz, 2016/10/03
- bug#24603: [RFC 04/18] Split casify_object into multiple functions, Michal Nazarewicz, 2016/10/03
- bug#24603: [RFC 03/18] Don’t assume character can be either upper- or lower-case when casing, Michal Nazarewicz, 2016/10/03
- bug#24603: [RFC 12/18] Implement rules for title-casing Dutch ij ‘letter’, Michal Nazarewicz, 2016/10/03
- bug#24603: [RFC 11/18] Implement casing rules for Lithuanian,
Michal Nazarewicz <=
- bug#24603: [RFC 16/18] Refactor character class checking; optimise ASCII case, Michal Nazarewicz, 2016/10/03
- bug#24603: [RFC 09/18] Implement special sigma casing rule, Michal Nazarewicz, 2016/10/03
- bug#24603: [RFC 14/18] Factor out character category lookup to separate function, Michal Nazarewicz, 2016/10/03
- bug#24603: [RFC 07/18] Split up casify_region function., Michal Nazarewicz, 2016/10/03
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Michal Nazarewicz, 2016/10/03