[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
bug#24603: [PATCHv5 10/11] Implement casing rules for Lithuanian (bug#24603)
From: Michal Nazarewicz
Subject: bug#24603: [PATCHv5 10/11] Implement casing rules for Lithuanian (bug#24603)
Date: Thu, 9 Mar 2017 22:51:49 +0100
In Lithuanian, the tittle above lower case i and j is retained even if
other diacritics above are present. For that to work, an explicit
combining dot above must be added after i and j, as otherwise the
rendering engine will remove the tittle.
* src/casefiddle.c (struct casing_context, prepare_casing_context): Add
SPECIAL_LT, SPECIAL_LT_DEL_DOT_ABOVE and SPECIAL_LT_INS_DOT_ABOVE
special flag values for handling Lithuanian. Set the flag to SPECIAL_LT
if buffer is in Lithuanian.
(maybe_case_lithuanian): New function which implements Lithuanian rules.
(case_characters): Make use of maybe_case_lithuanian.
* test/src/casefiddle-tests.el (casefiddle-tests-casing): Add test cases
for Lithuanian rules.
---
src/casefiddle.c | 178 ++++++++++++++++++++++++++++++++++++++++---
test/src/casefiddle-tests.el | 27 ++++++-
2 files changed, 195 insertions(+), 10 deletions(-)
diff --git a/src/casefiddle.c b/src/casefiddle.c
index 4785ebaddc4..a33bac7d21e 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -77,7 +77,15 @@ struct casing_context {
SPECIAL_NL_UPCASE_J,
/* Handle Azerbaijani and Turkish dotted and dotless i. */
- SPECIAL_TR
+ SPECIAL_TR,
+
+ /* Apply Lithuanian rules for i’s and j’s tittle. */
+ SPECIAL_LT,
+ /* As above plus look out for combining dot above to delete. */
+ SPECIAL_LT_DEL_DOT_ABOVE,
+ /* As above plus look out for diacritics combining above because
+ we may need to inject dot above before them. */
+ SPECIAL_LT_INS_DOT_ABOVE,
} special;
};
@@ -116,6 +124,9 @@ prepare_casing_context (struct casing_context *ctx,
case ('t' << 8) | 'r': /* Turkish */
case ('a' << 8) | 'z': /* Azerbaijani */
ctx->special = SPECIAL_TR;
+ break;
+ case ('l' << 8) | 't': /* Lithuanian */
+ ctx->special = SPECIAL_LT;
}
}
@@ -362,6 +373,142 @@ maybe_case_turkic (struct casing_str_buf *buf, struct casing_context *ctx,
return ch == cased ? RES_NO_CHANGE : RES_CHANGED;
}
+/* Lithuanian retains tittle in lower case i and j when there are more
+ accents above those letters. */
+
+#define CAPITAL_I_WITH_GRAVE 0x0CC
+#define CAPITAL_I_WITH_ACUTE 0x0CD
+#define CAPITAL_I_WITH_TILDE 0x128
+#define CAPITAL_I_WITH_OGONEK 0x12E
+#define SMALL_I_WITH_OGONEK 0x12F
+#define COMBINING_GRAVE_ABOVE 0x300
+#define COMBINING_ACUTE_ABOVE 0x301
+#define COMBINING_TILDE_ABOVE 0x303
+#define COMBINING_OGONEK 0x328
+
+/* Save in BUF result of casing character CH if Lithuanian casing rules apply.
+
+ CTX->special carries state between calls: after upper-casing i, j or i with
+ ogonek it is set to watch for a combining dot above to delete, and after
+ lower-casing I, J or I with ogonek to watch for accents above to prefix.
+
+ FLAG is a normalised flag (as returned by normalise_flag function).
+
+ Return -2 (RES_NOT_TOUCHED) if Lithuanian rules did not apply, no changes
+ were made and other casing rules should be tried. Otherwise, meaning of
+ return values is the same as in case_characters function. */
+static int
+maybe_case_lithuanian (struct casing_str_buf *buf, struct casing_context *ctx,
+ enum case_action flag, int ch)
+{
+ switch (ctx->special) {
+ case SPECIAL_LT:
+ break;  /* No pending state; go on to the per-character rules below. */
+
+ case SPECIAL_LT_DEL_DOT_ABOVE:
+ /* When upper-casing i or j, a combining dot above that follows it must be
+ removed. This is true even if there’s a combining ogonek in between.
+ But, if there’s another character combining above in between, combining
+ dot needs to stay (since the dot will be rendered above the other
+ diacritic). */
+ switch (ch) {
+ case COMBINING_DOT_ABOVE:
+ buf->len_chars = buf->len_bytes = 0;  /* Emit nothing: the dot is dropped. */
+ ctx->special = SPECIAL_LT;
+ return RES_CHANGED;
+ case COMBINING_GRAVE_ABOVE:
+ case COMBINING_ACUTE_ABOVE:
+ case COMBINING_TILDE_ABOVE:
+ ctx->special = SPECIAL_LT;  /* Stop looking; a later dot above stays. */
+ return RES_NOT_TOUCHED;
+ case COMBINING_OGONEK:
+ return RES_NOT_TOUCHED;  /* State unchanged: keep looking past the ogonek. */
+ default:
+ ctx->special = SPECIAL_LT;  /* Sequence ended; CH gets the rules below. */
+ }
+ break;
+
+ case SPECIAL_LT_INS_DOT_ABOVE:
+ /* When lower-casing I or J, if the letter has any accents above,
+ a combining dot above must be added before them. If we are here, it
+ means that we have lower cased I or J and we’re now on the lookout for
+ accents combining above. */
+ switch (ch) {
+ case COMBINING_GRAVE_ABOVE:
+ case COMBINING_ACUTE_ABOVE:
+ case COMBINING_TILDE_ABOVE:
+ buf->len_chars = 2;  /* Output is combining dot above followed by CH. */
+ buf->len_bytes = CHAR_STRING (COMBINING_DOT_ABOVE, buf->data);
+ buf->len_bytes += CHAR_STRING (ch, buf->data + buf->len_bytes);
+ ctx->special = SPECIAL_LT;
+ return RES_CHANGED;
+ case COMBINING_OGONEK:
+ return RES_NOT_TOUCHED;  /* State unchanged: keep looking past the ogonek. */
+ default:
+ ctx->special = SPECIAL_LT;  /* Sequence ended; CH gets the rules below. */
+ }
+ break;
+
+ default:
+ return RES_NOT_TOUCHED;  /* Any other CTX->special value: not handled here. */
+ }
+
+ switch (flag) {
+ case CASE_UP:
+ case CASE_CAPITALIZE:
+ if (ch == 'i' || ch == 'j')
+ {
+ buf->data[0] = ch ^ ('i' ^ 'I');  /* Toggle the ASCII case bit. */
+ buf->len_bytes = 1;
+ }
+ else if (ch == SMALL_I_WITH_OGONEK)
+ buf->len_bytes = CHAR_STRING (CAPITAL_I_WITH_OGONEK, buf->data);
+ else
+ break;
+ buf->len_chars = 1;
+ /* Change the state so we’re on the lookout for combining dot above. */
+ ctx->special = SPECIAL_LT_DEL_DOT_ABOVE;
+ return RES_CHANGED;
+
+ case CASE_DOWN:
+ /* Turning I or J to lower case requires combining dot above to be included
+ IF there are any other characters combining above present. This is so
+ that the tittle is preserved. */
+ switch (ch) {
+ case CAPITAL_I_WITH_GRAVE:
+ ch = 0x80; /* Last byte of U+300, "\xCC\x80", combining grave accent */
+ goto has_accent;
+ case CAPITAL_I_WITH_ACUTE:
+ ch = 0x81; /* Last byte of U+301, "\xCC\x81", combining acute accent */
+ goto has_accent;
+ case CAPITAL_I_WITH_TILDE:
+ ch = 0x83; /* Last byte of U+303, "\xCC\x83", combining tilde */
+ has_accent:
+ memcpy (buf->data, "i\xCC\x87\xCC", 4);  /* 'i', "\xCC\x87", then the accent's first byte. */
+ buf->data[4] = ch;  /* Accent's last byte, chosen above. */
+ buf->len_chars = 3;
+ buf->len_bytes = 5;
+ return RES_CHANGED;
+
+ case 'I':
+ case 'J':
+ buf->data[0] = ch ^ ('i' ^ 'I');  /* Toggle the ASCII case bit. */
+ buf->len_bytes = 1;
+ if (false)  /* 'I'/'J' skip the next statement; only the case label jumps into it. */
+ case CAPITAL_I_WITH_OGONEK:
+ buf->len_bytes = CHAR_STRING (SMALL_I_WITH_OGONEK, buf->data);
+ buf->len_chars = 1;
+ /* Change the state so we’re on the lookout for diacritics combining
+ above. If one is found, we need to add combining dot above. */
+ ctx->special = SPECIAL_LT_INS_DOT_ABOVE;
+ return RES_CHANGED;
+ }
+ break;
+ }
+
+ return RES_NOT_TOUCHED;
+}
+
/* Save in BUF result of casing character CH.
If not-NULL, NEXT points to the next character in the cased string. If NULL,
@@ -381,17 +528,30 @@ case_characters (struct casing_str_buf *buf, struct casing_context *ctx,
int ch, const unsigned char *next)
{
enum case_action flag = normalise_flag (ctx);
- int ret;
+ int ret = RES_NOT_TOUCHED;
+
+ switch (ctx->special) {
+ case SPECIAL_NONE:
+ break;
+
+ case SPECIAL_TR:
+ ret = maybe_case_turkic (buf, ctx, flag, ch, next);
+ break;
+
+ default:
+ /* case SPECIAL_LT: */
+ /* case SPECIAL_LT_DEL_DOT_ABOVE: */
+ /* case SPECIAL_LT_INS_DOT_ABOVE: */
+ ret = maybe_case_lithuanian (buf, ctx, flag, ch);
+ }
- ret = maybe_case_turkic (buf, ctx, flag, ch, next);
- if (ret != RES_NOT_TOUCHED)
- return ret;
+ if (ret == RES_NOT_TOUCHED)
+ ret = maybe_case_greek (buf, ctx, flag, ch, next);
- ret = maybe_case_greek (buf, ctx, flag, ch, next);
- if (ret != RES_NOT_TOUCHED)
- return ret;
+ if (ret == RES_NOT_TOUCHED)
+ ret = case_character_impl (buf, ctx, flag, ch);
- return case_character_impl (buf, ctx, flag, ch);
+ return ret;
}
static Lisp_Object
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index ce1bb18dd40..f7b0da41029 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -241,7 +241,32 @@ casefiddle-tests--test-casing
("I\u0307si\u0307s" "I\u0307Sİ\u0307S" "isi\u0307s"
"I\u0307si\u0307s" "I\u0307si\u0307s" "tr")
("I\u0307sI\u0307s" "I\u0307SI\u0307S" "isis"
- "I\u0307sis" "I\u0307sI\u0307s" "tr"))))))
+ "I\u0307sis" "I\u0307sI\u0307s" "tr")
+
+ ;; Test combining dot above is inserted when needed when lower
+ ;; casing I or J.
+ ("I\u0328\u0300" ; I + ogonek + grave
+ "I\u0328\u0300" "i\u0328\u0307\u0300"
+ "I\u0328\u0300" "I\u0328\u0300" "lt")
+
+ ("J\u0328\u0300" ; J + ogonek + grave
+ "J\u0328\u0300" "j\u0328\u0307\u0300"
+ "J\u0328\u0300" "J\u0328\u0300" "lt")
+
+ ("Į\u0300" ; I-ogonek + grave
+ "Į\u0300" "į\u0307\u0300" "Į\u0300" "Į\u0300" "lt")
+
+ ("Ì Í Ĩ"
+ "Ì Í Ĩ" "i\u0307\u0300 i\u0307\u0301 i\u0307\u0303"
+ "Ì Í Ĩ" "Ì Í Ĩ" "lt")
+
+ ;; Test combining dot above is removed when upper casing i or j.
+ ("i\u0328\u0307" ; i + ogonek + dot above
+ "I\u0328" "i\u0328\u0307" "I\u0328" "I\u0328" "lt")
+ ("j\u0328\u0307" ; j + ogonek + dot above
+ "J\u0328" "j\u0328\u0307" "J\u0328" "J\u0328" "lt")
+ ("į\u0307" ; i-ogonek + dot above
+ "Į" "į\u0307" "Į" "Į" "lt"))))))
(ert-deftest casefiddle-tests-casing-byte8 ()
(should-not
--
2.12.0.246.ga2ecc84866-goog
- bug#24603: [PATCHv5 00/11] Casing improvements, Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 03/11] Add support for title-casing letters (bug#24603), Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 06/11] Implement special sigma casing rule (bug#24603), Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 04/11] Split up casify_region function (bug#24603), Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 07/11] Introduce ‘buffer-language’ buffer-local variable, Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 02/11] Introduce case_character function, Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 01/11] Split casify_object into multiple functions, Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 10/11] Implement casing rules for Lithuanian (bug#24603),
Michal Nazarewicz <=
- bug#24603: [PATCHv5 08/11] Implement rules for title-casing Dutch ij ‘letter’ (bug#24603), Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 09/11] Implement Turkic dotless and dotted i casing rules (bug#24603), Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 11/11] Implement Irish casing rules (bug#24603), Michal Nazarewicz, 2017/03/09
bug#24603: [PATCHv5 05/11] Support casing characters which map into multiple code points (bug#24603), Michal Nazarewicz, 2017/03/09