bug-gnu-emacs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

bug#24603: [PATCHv5 11/11] Implement Irish casing rules (bug#24603)


From: Michal Nazarewicz
Subject: bug#24603: [PATCHv5 11/11] Implement Irish casing rules (bug#24603)
Date: Thu, 9 Mar 2017 22:51:50 +0100

Add casing rules for Irish.  The rules are quite complicated: some
letters must remain lower case when upper-casing, dashes have to be
inserted or removed, and various other transformations apply.

* src/casefiddle.c (struct casing_context): Add flags for denoting
Irish casing rules are in effect.
(prepare_casing_context): Detect Irish language and set ctx->special
accordingly.
(irish_upcase, irish_downcase): New functions for upcasing and
downcasing Irish letters.
(is_irish_vowel, is_irish_uc_vowel, is_irish_lc_vowel): New functions
detecting whether a letter is an Irish vowel.
(maybe_case_irish): New function implementing Irish casing rules.
(case_characters): Make use of maybe_case_irish.

* test/src/casefiddle-resources/irish-lowercase-1.txt,
test/src/casefiddle-resources/irish-lowercase-1-ref.txt,
test/src/casefiddle-resources/irish-uppercase-1.txt,
test/src/casefiddle-resources/irish-uppercase-1-ref.txt: New files
with test cases for Irish capitalisation.  The files are copied from
Mozilla’s test suite.

* test/src/casefiddle-tests.el (casefiddle-tests--resources-dir): New
variable to point to aforementioned test case files.
(casefiddle-tests--test-casing): Support missing expected strings.
(casefiddle--read-lines): New helper function for reading lines from
a file.
(casefiddle-test-irish): Apply test cases read from the Irish test
case files.
---
 src/casefiddle.c                                   | 313 +++++++++++++++++++++
 .../casefiddle-resources/irish-lowercase-1-ref.txt | 211 ++++++++++++++
 .../src/casefiddle-resources/irish-lowercase-1.txt | 211 ++++++++++++++
 .../casefiddle-resources/irish-uppercase-1-ref.txt | 105 +++++++
 .../src/casefiddle-resources/irish-uppercase-1.txt | 105 +++++++
 test/src/casefiddle-tests.el                       |  58 +++-
 6 files changed, 992 insertions(+), 11 deletions(-)
 create mode 100644 test/src/casefiddle-resources/irish-lowercase-1-ref.txt
 create mode 100644 test/src/casefiddle-resources/irish-lowercase-1.txt
 create mode 100644 test/src/casefiddle-resources/irish-uppercase-1-ref.txt
 create mode 100644 test/src/casefiddle-resources/irish-uppercase-1.txt

diff --git a/src/casefiddle.c b/src/casefiddle.c
index a33bac7d21e..3352fb6795a 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -86,6 +86,24 @@ struct casing_context {
     /* As above plus look out for diacritics combining above because
        we may need to inject dot above before them. */
     SPECIAL_LT_INS_DOT_ABOVE,
+
+    /* Flags for Irish word capitalising rules.  Those are insane, see
+       https://bugzilla.mozilla.org/show_bug.cgi?id=1018805 and
+       https://bugzilla.mozilla.org/show_bug.cgi?id=1014639 for reference. */
+    /* Irish handling enabled; we are lower-casing words. */
+    SPECIAL_GA_LC,
+    /* Irish handling enabled; we are upper-casing words or capitalising. */
+    SPECIAL_GA_UC,
+    /* Upper-case next character. */
+    SPECIAL_GA_UC_NEXT,
+    /* We’re in the middle of a potential ‘bhf’ or ‘bhF’ triplet. */
+    SPECIAL_GA_TRIPLET_B = 'b',
+    /* We’re in the middle of a potential ‘n-{v}’ triplet. */
+    SPECIAL_GA_TRIPLET_N = 'n',
+    /* We’re in the middle of a potential triplet starting with ‘ts’ or ‘tS’. */
+    SPECIAL_GA_TRIPLET_T = 't'
+    /* Yes, it matters that the last three flags equal the characters for b,
+       n or t. */
   } special;
 };
 
@@ -127,6 +145,9 @@ prepare_casing_context (struct casing_context *ctx,
       break;
     case ('l' << 8) | 't':  /* Lithuania */
       ctx->special = SPECIAL_LT;
+      break;
+    case ('g' << 8) | 'a':  /* Irish */
+      ctx->special = flag == CASE_DOWN ? SPECIAL_GA_LC : SPECIAL_GA_UC;
     }
 }
 
@@ -509,6 +530,289 @@ maybe_case_lithuanian (struct casing_str_buf *buf, struct casing_context *ctx,
   return RES_NOT_TOUCHED;
 }
 
+/* Flip the case bit: right for ASCII, Irish vowels and their 2nd UTF-8 byte. */
+enum { GA_CASE_BIT = 'a' ^ 'A' };
+static int irish_upcase(int ch) { return ch & ~GA_CASE_BIT; }
+static int irish_downcase(int ch) { return ch | GA_CASE_BIT; }
+
+/* Lead byte of the two-byte UTF-8 encoding of accented Irish vowels. */
+#define GA_UTF8_FIRST_BYTE 0xC3
+
+/* Tell whether the UTF-8 encoded character at CH is an Irish vowel and, if
+   so, of which case.  Plain ‘aeiou’ and acute-accented ‘áéíóú’ count as
+   vowels.  Byte CH[1] is read only if CH[0] is GA_UTF8_FIRST_BYTE. */
+static enum {
+  GA_NOT_VOWEL = 0,
+  GA_UC_VOWEL = 1,
+  GA_LC_VOWEL = 2
+} is_irish_vowel(const unsigned char *ch) {
+  /* In both encodings the two cases differ only in this bit. */
+  const unsigned case_bit = 'a' ^ 'A';
+  unsigned byte = ch[0];
+  if (byte == GA_UTF8_FIRST_BYTE) {
+    byte = ch[1];
+    switch (byte & ~case_bit) {
+    case 0x81: case 0x89: case 0x8D: case 0x93: case 0x9A:  /* Á É Í Ó Ú */
+      break;
+    default:
+      return GA_NOT_VOWEL;
+    }
+  } else {
+    switch (byte & ~case_bit) {
+    case 'A': case 'E': case 'I': case 'O': case 'U':
+      break;
+    default:
+      return GA_NOT_VOWEL;
+    }
+  }
+  return byte & case_bit ? GA_LC_VOWEL : GA_UC_VOWEL;
+}
+
+/* Return whether the UTF-8 bytes at CH encode an upper-case Irish vowel. */
+static bool
+is_irish_uc_vowel(const unsigned char *ch) {
+  return is_irish_vowel(ch) == GA_UC_VOWEL;  /* CH is passed on unchanged. */
+}
+
+/* Return whether the UTF-8 bytes at CH encode a lower-case Irish vowel. */
+static bool
+is_irish_lc_vowel(const unsigned char *ch) {
+  return is_irish_vowel(ch) == GA_LC_VOWEL;  /* CH is passed on unchanged. */
+}
+
+/* Save in BUF result of casing character CH if Irish casing rules apply.
+
+   If not-NULL, NEXT points to the character following CH in the cased text;
+   if NULL, CH is assumed to be the last character being cased.  Several
+   rules need this look-ahead because they depend on what follows CH.
+
+   FLAG is a normalised flag (as returned by normalise_flag function).
+
+   Return -2 (RES_NOT_TOUCHED) if Irish rules did not apply, no changes
+   were made and other casing rules should be tried.  Otherwise, meaning of
+   return values is the same as in case_characters function. */
+static int
+maybe_case_irish (struct casing_str_buf *buf, struct casing_context *ctx,
+                 enum case_action flag, int ch, const unsigned char *next) {
+  unsigned n;
+
+  switch (ctx->special) {
+  case SPECIAL_GA_LC:
+    /* We are lower-casing words and Irish rules are enabled.  See
+       https://bugzilla.mozilla.org/show_bug.cgi?id=1018805 for reference.  The
+       rules are:
+
+         t{V} → t-{v}
+         n{V} → n-{v}
+
+       {V} denotes upper-case Irish vowel and {v} denotes a lower-case one. */
+
+    if (!next ||
+       ctx->inword ||
+       (ch != 't' && ch != 'n') ||
+       !is_irish_uc_vowel(next))
+      return RES_NOT_TOUCHED;
+
+    ctx->inword = true;
+    buf->len_chars = 3;
+    buf->data[0] = ch;
+    buf->data[1] = '-';
+    if (*next < 0x80) {
+      buf->data[2] = irish_downcase(next[0]);
+      buf->len_bytes = 3;
+      return 1;
+    } else {
+      buf->data[2] = next[0];
+      buf->data[3] = irish_downcase(next[1]);
+      buf->len_bytes = 4;
+      return 2;
+    }
+
+  case SPECIAL_GA_UC:
+    /* We are upper-casing or capitalising words and Irish rules are enabled.
+       See https://bugzilla.mozilla.org/show_bug.cgi?id=1014639 for
+       reference. The rules are:
+
+        h{V}  → h{V}
+        n{V}  → n{V}
+        t{V}  → t{V}
+
+        bp    → bP
+        bP    → bP
+        dt    → dT
+        dT    → dT
+        gc    → gC
+        gC    → gC
+        mb    → mB
+        mB    → mB
+        nd    → nD
+        nD    → nD
+        ng    → nG
+        nG    → nG
+
+        bhf   → bhF
+        bhF   → bhF
+
+        n-{v} → n{V}
+        t-{v} → t{V}
+
+        tSL   → tSL
+        tSl   → tSL
+        tsl   → tSL
+        tSN   → tSN
+        tsn   → tSN
+        tSn   → tSN
+        tSR   → tSR
+        tSr   → tSR
+        tsr   → tSR
+        tS{V} → tS{V}
+        tS{v} → tS{V}
+        ts{v} → tS{V}
+
+       {V} denotes upper-case Irish vowel and {v} denotes a lower-case one. */
+
+    if (!next || ctx->inword || ch < 'a' || ch > 'z')
+      return RES_NOT_TOUCHED;
+
+    n = irish_upcase(*next);
+    if (((ch == 'h' || ch == 'n' || ch == 't') && is_irish_uc_vowel(next)) ||
+       (ch == 'b' && n == 'P') ||
+       (ch == 'd' && n == 'T') ||
+       (ch == 'g' && n == 'C') ||
+       (ch == 'm' && n == 'B') ||
+       (ch == 'n' && n == 'D') ||
+       (ch == 'n' && n == 'G'))
+      {
+       ctx->inword = true;
+       ctx->special = SPECIAL_GA_UC_NEXT;
+       buf->data[0] = ch;
+       buf->len_chars = 1;
+       buf->len_bytes = 1;
+       return RES_NO_CHANGE;
+      }
+
+    if ((ch == 'b' && *next == 'h') || (ch == 't' && n == 'S') ||
+       (ch == 't' && *next == '-') || (ch == 'n' && *next == '-'))
+      {
+       /* We can only look at two characters at a time but here we need to make
+          a decision based on a 3-character sequence.
+
+          Let’s return empty string for now, remember the current character and
+          when we’ll be dealing with the next character we’ll be able to see
+          three characters.
+
+          Downside of this approach is that we cannot always correctly mark
+          buffer as changed.  Namely, ‘bhF’ triplet does not need to be changed
+          but sadly we'll mark this as modified. */
+       ctx->inword = true;
+       ctx->special = ch;
+       buf->len_chars = 0;
+       buf->len_bytes = 0;
+       return RES_CHANGED;
+      }
+
+    return RES_NOT_TOUCHED;
+
+  case SPECIAL_GA_UC_NEXT:
+    ctx->special = SPECIAL_GA_UC;
+    n = irish_upcase(ch);
+    buf->len_bytes = CHAR_STRING (n, buf->data);
+    buf->len_chars = 1;
+    return n == ch ? RES_NO_CHANGE : RES_CHANGED;
+
+  case SPECIAL_GA_TRIPLET_B:
+  case SPECIAL_GA_TRIPLET_N:
+  case SPECIAL_GA_TRIPLET_T:
+    /* We’re here after encountering a possible beginning of a three-character
+       sequence that needs to be handled.  Those are:
+
+        bhf   → bhF
+        bhF   → bhF
+
+        n-{v} → n{V}
+        t-{v} → t{V}
+
+        tSL   → tSL
+        tSl   → tSL
+        tsl   → tSL
+        tSN   → tSN
+        tsn   → tSN
+        tSn   → tSN
+        tSR   → tSR
+        tSr   → tSR
+        tsr   → tSR
+        tS{V} → tS{V}
+        tS{v} → tS{V}
+        ts{v} → tS{V} */
+
+    if (next)  /* NULL when CH is the last character being cased. */
+      switch (ch) {
+      case '-':  /* ‘n-’ or ‘t-’ prefix. */
+       if (is_irish_lc_vowel(next))
+         {
+           buf->data[0] = ctx->special;
+           buf->len_chars = 1;
+           buf->len_bytes = 1;
+           ctx->special = SPECIAL_GA_UC_NEXT;
+           return RES_CHANGED;
+         }
+       break;
+
+      case 'h':  /* ‘bh’ prefix */
+       if (irish_upcase(*next) == 'F') {
+         ctx->special = SPECIAL_GA_UC;
+         buf->data[0] = 'b';
+         buf->data[1] = 'h';
+         buf->data[2] = 'F';
+         buf->len_chars = 3;
+         buf->len_bytes = 3;
+         return 1;
+       }
+       break;
+
+      case 's':  /* ‘ts’ prefix. */
+       if (*next == 'l' || *next == 'n' || *next == 'r' ||
+           is_irish_lc_vowel(next))
+         goto tSU;
+       break;
+
+      case 'S':  /* ‘tS’ prefix. */
+       if (*next == 'l' || *next == 'n' || *next == 'r' ||
+           *next == 'L' || *next == 'N' || *next == 'R' ||
+           is_irish_vowel(next))
+         {
+         tSU:
+           /* t{s}{x} → tS{X} */
+           ctx->special = SPECIAL_GA_UC_NEXT;
+           buf->data[0] = 't';
+           buf->data[1] = 'S';
+           buf->len_chars = 2;
+           buf->len_bytes = 2;
+           return RES_CHANGED;
+         }
+       break;
+      }
+
+    /* Recover from an incorrect guess that it was a triplet. */
+    if (ch == '-') {
+      ctx->inword = false;
+    } else if (ctx->flag == CASE_UP) {
+      ch = irish_upcase(ch);
+    } else if (ctx->flag == CASE_CAPITALIZE) {
+      ch = irish_downcase(ch);
+    }
+
+    buf->data[0] = irish_upcase(ctx->special);
+    buf->data[1] = ch;
+    buf->len_chars = 2;
+    buf->len_bytes = 2;
+    ctx->special = SPECIAL_GA_UC;
+    return RES_CHANGED;
+  }
+
+  return RES_NOT_TOUCHED;
+}
+
 /* Save in BUF result of casing character CH.
 
    If not-NULL, NEXT points to the next character in the cased string.  If NULL,
@@ -543,6 +847,15 @@ case_characters (struct casing_str_buf *buf, struct casing_context *ctx,
   /* case SPECIAL_LT_DEL_DOT_ABOVE: */
   /* case SPECIAL_LT_INS_DOT_ABOVE: */
     ret = maybe_case_lithuanian (buf, ctx, flag, ch);
+    break;
+
+  case SPECIAL_GA_LC:
+  case SPECIAL_GA_UC:
+  case SPECIAL_GA_UC_NEXT:
+  case SPECIAL_GA_TRIPLET_B:
+  case SPECIAL_GA_TRIPLET_N:
+  case SPECIAL_GA_TRIPLET_T:
+    ret = maybe_case_irish (buf, ctx, flag, ch, next);
   }
 
   if (ret == RES_NOT_TOUCHED)
diff --git a/test/src/casefiddle-resources/irish-lowercase-1-ref.txt b/test/src/casefiddle-resources/irish-lowercase-1-ref.txt
new file mode 100644
index 00000000000..cbe9f601e9c
--- /dev/null
+++ b/test/src/casefiddle-resources/irish-lowercase-1-ref.txt
@@ -0,0 +1,211 @@
+ár n-acmhainní uisce
+ár n-acmhainní uisce
+ár n-acmhainní uisce
+ár n-acmhainní uisce
+ár n-acmhainní uisce
+ár nathair
+ár nathair
+ár nathair
+n-a shaighdiúir
+gan dul as aca ach le n-a chabhair
+eolaíocht na n-ábhar
+eolaíocht na n-ábhar
+eolaíocht na n-ábhar
+eolaíocht na n-ábhar
+eolaíocht na n-ábhar
+amhrán náisiúnta
+amhrán náisiúnta
+amhrán náisiúnta
+lucht na n-ealaíon
+lucht na n-ealaíon
+lucht na n-ealaíon
+lucht na n-ealaíon
+lucht na n-ealaíon
+neart daoine
+neart daoine
+neart daoine
+ceol na n-éan
+ceol na n-éan
+ceol na n-éan
+ceol na n-éan
+ceol na n-éan
+sa néal
+sa néal
+sa néal
+ord na n-imeachtaí
+ord na n-imeachtaí
+ord na n-imeachtaí
+ord na n-imeachtaí
+ord na n-imeachtaí
+nathair nimhe
+nathair nimhe
+nathair nimhe
+lucht adhartha na n-íomhánna
+lucht adhartha na n-íomhánna
+lucht adhartha na n-íomhánna
+lucht adhartha na n-íomhánna
+lucht adhartha na n-íomhánna
+níos measa
+níos measa
+níos measa
+gnéithe dár n-oidhreacht
+gnéithe dár n-oidhreacht
+gnéithe dár n-oidhreacht
+gnéithe dár n-oidhreacht
+gnéithe dár n-oidhreacht
+duine nochta
+duine nochta
+duine nochta
+cultúr na n-óg
+cultúr na n-óg
+cultúr na n-óg
+cultúr na n-óg
+cultúr na n-óg
+dhá nóiméad
+dhá nóiméad
+dhá nóiméad
+ocht n-uaire sa lá
+ocht n-uaire sa lá
+ocht n-uaire sa lá
+ocht n-uaire sa lá
+ocht n-uaire sa lá
+gúna nua
+gúna nua
+gúna nua
+formhór na n-údarás
+formhór na n-údarás
+formhór na n-údarás
+formhór na n-údarás
+formhór na n-údarás
+imoibreoir núicléach
+imoibreoir núicléach
+imoibreoir núicléach
+sean-airteagal
+seanairteagal
+bunioncaim
+bun-ioncaim
+buanorduithe
+buan-orduithe
+ár n-athair
+ár n-athair
+clár na n-ábhar
+clár na n-ábhar
+ceol na ndaoine
+ceol na ndaoine
+táim i ngrá leat
+táim i ngrá leat
+cén t-am é?
+cén t-am é?
+cén t-am é?
+cén t-am é?
+cén t-am é?
+tar ar ais!
+tar ar ais!
+tá an t-ádh orm inniu!
+tá an t-ádh orm inniu!
+tá an t-ádh orm inniu!
+tá an t-ádh orm inniu!
+tá an t-ádh orm inniu!
+rud tábhachtach
+rud tábhachtach
+rud tábhachtach
+den obair an t-eolas
+den obair an t-eolas
+den obair an t-eolas
+den obair an t-eolas
+den obair an t-eolas
+an t-éileamh a íoc
+an t-éileamh a íoc
+an t-éileamh a íoc
+an t-éileamh a íoc
+an t-éileamh a íoc
+an t-inneall cuardaigh is fearr
+an t-inneall cuardaigh is fearr
+an t-inneall cuardaigh is fearr
+an t-inneall cuardaigh is fearr
+an t-inneall cuardaigh is fearr
+an t-íochtar a chur in uachtar
+an t-íochtar a chur in uachtar
+an t-íochtar a chur in uachtar
+an t-íochtar a chur in uachtar
+an t-íochtar a chur in uachtar
+tabhair an t-ordú seo dó!
+tabhair an t-ordú seo dó!
+tabhair an t-ordú seo dó!
+tabhair an t-ordú seo dó!
+tabhair an t-ordú seo dó!
+tá an t-ór buí aige.
+tá an t-ór buí aige.
+tá an t-ór buí aige.
+tá an t-ór buí aige.
+tá an t-ór buí aige.
+an t-uisce beatha ar an tábla.
+an t-uisce beatha ar an tábla.
+an t-uisce beatha ar an tábla.
+an t-uisce beatha ar an tábla.
+an t-uisce beatha ar an tábla.
+an t-úrscéal is deireanaí
+an t-úrscéal is deireanaí
+an t-úrscéal is deireanaí
+an t-úrscéal is deireanaí
+an t-úrscéal is deireanaí
+dréacht-acht
+dréachtphlean
+dréacht-phlean
+dréacht-íocaíocht
+áitainmneacha
+áit-ainmneacha
+státurraithe
+stát-urraithe
+ar aon tslí
+ar aon tslí
+amach ón tsnáthaid
+amach ón tsnáthaid
+ar an tsráid
+ar an tsráid
+caint an tsráidbhaile
+caint an tsráidbhaile
+cora crua an tsaoil
+cora crua an tsaoil
+bholadh an tsáile
+bholadh an tsáile
+uair sa tseachtain
+uair sa tseachtain
+deireadh an tséasúir
+deireadh an tséasúir
+fear an tsiopa
+fear an tsiopa
+an tsíocháin a choimeád
+an tsíocháin a choimeád
+an tsochaí faisnéise
+an tsochaí faisnéise
+gaoth an tsóláis
+gaoth an tsóláis
+is beag an tsuim iad
+is beag an tsuim iad
+infheicthe ag an tsúil
+infheicthe ag an tsúil
+scríobhfaidh
+scríobhfaidh
+preabphas
+preabphas
+úsáidtear
+úsáidtear
+snagcheol
+snagcheol
+in-athnuaite agatsa
+in-athnuaite agatsa
+teanga dhomhanda
+teanga dhomhanda
+réaltsruth
+réaltsruth
+na hataí
+na hataí
+t-léine
+t-léine
+t-léine
+t-léine
+torc allta
+torc allta
+tsk tsk tsk a chara
+tsk tsk tsk a chara
diff --git a/test/src/casefiddle-resources/irish-lowercase-1.txt b/test/src/casefiddle-resources/irish-lowercase-1.txt
new file mode 100644
index 00000000000..dcb3454b96d
--- /dev/null
+++ b/test/src/casefiddle-resources/irish-lowercase-1.txt
@@ -0,0 +1,211 @@
+ÁR nACMHAINNÍ UISCE
+ÁR N-ACMHAINNÍ UISCE
+Ár nAcmhainní Uisce
+Ár n-Acmhainní Uisce
+ár n-acmhainní uisce
+Ár nathair
+ÁR NATHAIR
+Ár Nathair
+N-a shaighdiúir
+gan dul as aca ach le nA chabhair
+EOLAÍOCHT NA nÁBHAR
+EOLAÍOCHT NA n-ÁBHAR
+Eolaíocht na nÁbhar
+Eolaíocht na n-Ábhar
+eolaíocht na n-ábhar
+Amhrán náisiúnta
+Amhrán Náisiúnta
+AMHRÁN NÁISIÚNTA
+LUCHT NA nEALAÍON
+LUCHT NA n-EALAÍON
+Lucht na nEalaíon
+Lucht na n-Ealaíon
+lucht na n-ealaíon
+Neart Daoine
+neart daoine
+NEART DAOINE
+CEOL NA nÉAN
+CEOL NA n-ÉAN
+Ceol na nÉan
+Ceol na n-Éan
+ceol na n-éan
+Sa Néal
+Sa néal
+SA NÉAL
+ORD NA nIMEACHTAÍ
+ORD NA n-IMEACHTAÍ
+Ord na nImeachtaí
+Ord na n-Imeachtaí
+ord na n-imeachtaí
+Nathair Nimhe
+Nathair nimhe
+NATHAIR NIMHE
+LUCHT ADHARTHA NA nÍOMHÁNNA
+LUCHT ADHARTHA NA n-ÍOMHÁNNA
+Lucht Adhartha na nÍomhánna
+Lucht Adhartha na n-Íomhánna
+lucht adhartha na n-íomhánna
+Níos Measa
+níos measa
+NÍOS MEASA
+GNÉITHE DÁR nOIDHREACHT
+GNÉITHE DÁR n-OIDHREACHT
+Gnéithe Dár nOidhreacht
+Gnéithe Dár n-Oidhreacht
+gnéithe dár n-oidhreacht
+Duine Nochta
+Duine nochta
+DUINE NOCHTA
+CULTÚR NA nÓG
+CULTÚR NA n-ÓG
+Cultúr na nÓg
+Cultúr na n-Óg
+cultúr na n-óg
+Dhá Nóiméad
+Dhá nóiméad
+DHÁ NÓIMÉAD
+OCHT nUAIRE SA LÁ
+OCHT n-UAIRE SA LÁ
+Ocht nUaire Sa Lá
+Ocht n-Uaire Sa Lá
+ocht n-uaire sa lá
+Gúna Nua
+gúna nua
+GÚNA NUA
+FORMHÓR NA nÚDARÁS
+FORMHÓR NA n-ÚDARÁS
+Formhór na nÚdarás
+Formhór na n-Údarás
+formhór na n-údarás
+Imoibreoir Núicléach
+Imoibreoir núicléach
+IMOIBREOIR NÚICLÉACH
+sean-Airteagal
+SeanAirteagal
+BunIoncaim
+Bun-Ioncaim
+BuanOrduithe
+Buan-Orduithe
+ÁR nATHAIR
+Ár nAthair
+CLÁR NA nÁBHAR
+Clár na nÁbhar
+CEOL NA nDAOINE
+Ceol na nDaoine
+TÁIM I nGRÁ LEAT
+Táim i nGrá Leat
+CÉN tAM É?
+CÉN t-AM É?
+Cén tAm É?
+Cén t-Am É?
+cén t-am é?
+Tar Ar Ais!
+tar ar ais!
+TÁ AN tÁDH ORM INNIU!
+TÁ AN t-ÁDH ORM INNIU!
+Tá An tÁdh Orm Inniu!
+Tá An t-Ádh Orm Inniu!
+tá an t-ádh orm inniu!
+Rud Tábhachtach
+Rud tábhachtach
+rud tábhachtach
+DEN OBAIR AN tEOLAS
+DEN OBAIR AN t-EOLAS
+Den Obair an tEolas
+Den Obair an t-Eolas
+den obair an t-eolas
+AN tÉILEAMH A ÍOC
+AN t-ÉILEAMH A ÍOC
+An tÉileamh a Íoc
+An t-Éileamh a Íoc
+an t-éileamh a íoc
+AN tINNEALL CUARDAIGH IS FEARR
+AN t-INNEALL CUARDAIGH IS FEARR
+An tInneall Cuardaigh Is Fearr
+An t-Inneall Cuardaigh Is Fearr
+an t-inneall cuardaigh is fearr
+AN tÍOCHTAR A CHUR IN UACHTAR
+AN t-ÍOCHTAR A CHUR IN UACHTAR
+An tÍochtar a Chur In Uachtar
+An t-Íochtar a Chur In Uachtar
+an t-íochtar a chur in uachtar
+TABHAIR AN tORDÚ SEO DÓ!
+TABHAIR AN t-ORDÚ SEO DÓ!
+Tabhair An tOrdú Seo Dó!
+Tabhair An t-Ordú Seo Dó!
+tabhair an t-ordú seo dó!
+TÁ AN tÓR BUÍ AIGE.
+TÁ AN t-ÓR BUÍ AIGE.
+Tá An tÓr Buí Aige.
+Tá An t-Ór Buí Aige.
+tá an t-ór buí aige.
+AN tUISCE BEATHA AR AN TÁBLA.
+AN t-UISCE BEATHA AR AN TÁBLA.
+An tUisce Beatha Ar An Tábla.
+An t-Uisce Beatha Ar An Tábla.
+an t-uisce beatha ar an tábla.
+AN tÚRSCÉAL IS DEIREANAÍ
+AN t-ÚRSCÉAL IS DEIREANAÍ
+An tÚrscéal Is Deireanaí
+An t-Úrscéal Is Deireanaí
+an t-úrscéal is deireanaí
+Dréacht-Acht
+DréachtPhlean
+Dréacht-Phlean
+Dréacht-Íocaíocht
+ÁitAinmneacha
+Áit-Ainmneacha
+StátUrraithe
+Stát-Urraithe
+AR AON tSLÍ
+Ar Aon tSlí
+AMACH ÓN tSNÁTHAID
+Amach Ón tSnáthaid
+AR AN tSRÁID
+Ar An tSráid
+CAINT AN tSRÁIDBHAILE
+Caint An tSráidbhaile
+CORA CRUA AN tSAOIL
+Cora Crua An tSaoil
+BHOLADH AN tSÁILE
+Bholadh An tSáile
+UAIR SA tSEACHTAIN
+Uair Sa tSeachtain
+DEIREADH AN tSÉASÚIR
+Deireadh An tSéasúir
+FEAR AN tSIOPA
+Fear an tSiopa
+AN tSÍOCHÁIN A CHOIMEÁD
+An tSíocháin a Choimeád
+AN tSOCHAÍ FAISNÉISE
+An tSochaí Faisnéise
+GAOTH AN tSÓLÁIS
+Gaoth aN tSóláis
+IS BEAG AN tSUIM IAD
+Is Beag An tSuim Iad
+INFHEICTHE AG AN tSÚIL
+Infheicthe Ag An tSúil
+SCRÍOBHFAIDH
+Scríobhfaidh
+PREABPHAS
+Preabphas
+ÚSÁIDTEAR
+Úsáidtear
+SNAGCHEOL
+Snagcheol
+IN-ATHNUAITE AGATSA
+In-Athnuaite AGATSA
+TEANGA DHOMHANDA
+Teanga Dhomhanda
+RÉALTSRUTH
+Réaltsruth
+NA HATAÍ
+Na Hataí
+T-LÉINE
+T-Léine
+t-Léine
+t-léine
+TORC ALLTA
+Torc Allta
+TSK TSK TSK A CHARA
+Tsk Tsk Tsk a Chara
diff --git a/test/src/casefiddle-resources/irish-uppercase-1-ref.txt b/test/src/casefiddle-resources/irish-uppercase-1-ref.txt
new file mode 100644
index 00000000000..21d3e4a6126
--- /dev/null
+++ b/test/src/casefiddle-resources/irish-uppercase-1-ref.txt
@@ -0,0 +1,105 @@
+ORD NA bhFOCAL
+COSÁN NA bhFILÍ
+ÁR bPOBAL
+NÓRA NA bPORTACH
+I dTOSACH BÁIRE
+AN GHAEILGE I dTUAISCEART NA hÉIREANN
+AS AN gCEANTAR SIN
+I gCONTAE NA MÍ AGUS I gCONAMARA
+DÉ hAOINE
+OIRTHEAR NA hÁISE
+PARLAIMINT NA hEORPA
+POBLACHT NA hÉIREANN
+EALAÍN NA hIODÁILE
+NA hÍOSÁNAIGH
+ACADAMH NA hOLLSCOLAÍOCHTA
+TÍR NA hÓIGE
+TOGHCHÁN NA hUACHTARÁNACHTA
+NA hÚDARÁIS CHÁNACH
+I mBUN MO MHACHNAMH
+I mBÉAL FEIRSTE AGUS I mBAILE ÁTHA CLIATH
+ÁR nACMHAINNÍ UISCE
+EOLAÍOCHT NA nÁBHAR
+LUCHT NA nEALAÍON
+CEOL NA nÉAN
+ORD NA nIMEACHTAÍ
+LUCHT ADHARTHA NA nÍOMHÁNNA
+GNÉITHE DÁR nOIDHREACHT
+CULTÚR NA nÓG
+OCHT nUAIRE SA LÁ
+FORMHÓR NA nÚDARÁS
+ÁR nATHAIR
+CLÁR NA nÁBHAR
+LOCH nEATHACH
+CUMANN NA nÉIREANNACH AONTAITHE
+GRÉASÁN NA nIONTAS
+NÓIBHÍSEACHT NA nÍOSÁNACH
+I gCEANTAR NA nOILEÁN
+TÍR NA nÓG
+BAILE NA nULTACH
+GORT NA nÚLL
+CEOL NA nDAOINE
+I nDÚN NA nGALL
+TÁIM I nGRÁ LEAT
+LABHAIR SÉ I nGAEILGE!
+CÉN tAM É?
+TÁ AN tÁDH ORM INNIU!
+DEN OBAIR AN tEOLAS
+AN tÉILEAMH A ÍOC
+AN tINNEALL CUARDAIGH IS FEARR
+AN tÍOCHTAR A CHUR IN UACHTAR
+TABHAIR AN tORDÚ SEO DÓ!
+TÁ AN tÓR BUÍ AIGE.
+AN tUISCE BEATHA AR AN TÁBLA.
+AN tÚRSCÉAL IS DEIREANAÍ
+AN tACHT OIDEACHAIS
+AN tÁIVÉ MÁIRIA
+AN tEARRACH ARABACH
+AN tÉIRÍ AMACH
+AN tIMEALL
+AN tÍOSÁNACH PEADAR CANISIUS
+AN tOILEÁNACH
+AN tÓR MUIRE
+AN tUASAL ÉAMON Ó CUÍV
+AN tÚDARÁS UM BÓITHRE NÁISIÚNTA
+AR AON tSLÍ
+BÉAL ÁTHA AN tSLÉIBHE
+AMACH ÓN tSNÁTHAID
+BANRÍON AN tSNEACHTA
+AR AN tSRÁID
+CAINT AN tSRÁIDBHAILE
+CORA CRUA AN tSAOIL
+BHOLADH AN tSÁILE
+UAIR SA tSEACHTAIN
+DEIREADH AN tSÉASÚIR
+FEAR AN tSIOPA
+AN tSÍOCHÁIN A CHOIMEÁD
+AN tSOCHAÍ FAISNÉISE
+GAOTH AN tSÓLÁIS
+IS BEAG AN tSUIM IAD
+INFHEICTHE AG AN tSÚIL
+CNOC AN tSAMHRAIDH
+CIONN tSÁILE
+AN tSEIRBHÍS PHOIBLÍ
+BAILE AN tSÉIPÉIL
+AN tSIRIA
+AN tSÍN
+OIFIG AN tSOLÁTHAIR
+POLL AN tSÓMAIS
+EOLAIRE AN tSUÍMH
+CASADH AN tSÚGÁIN
+SCRÍOBHFAIDH
+PREABPHAS
+ÚSÁIDTEAR
+SNAGCHEOL
+STÁITSE IMBOLC
+IN-ATHNUAITE AGATSA
+TEANGA DHOMHANDA
+RÉALTSRUTH
+NA HATAÍ
+NA HATAÍ
+ÁR NATHAIR
+ÁR NATHAIR
+T-LÉINE
+TORC ALLTA
+TSK TSK TSK A CHARA
diff --git a/test/src/casefiddle-resources/irish-uppercase-1.txt b/test/src/casefiddle-resources/irish-uppercase-1.txt
new file mode 100644
index 00000000000..b95e0aa04df
--- /dev/null
+++ b/test/src/casefiddle-resources/irish-uppercase-1.txt
@@ -0,0 +1,105 @@
+ord na bhfocal
+Cosán na bhFilí
+ár bpobal
+Nóra na bPortach
+i dtosach báire
+An Ghaeilge i dTuaisceart na hÉireann
+as an gceantar sin
+I gContae na Mí agus i gConamara
+Dé hAoine
+Oirthear na hÁise
+Parlaimint na hEorpa
+Poblacht na hÉireann
+Ealaín na hIodáile
+na hÍosánaigh
+Acadamh na hOllscolaíochta
+Tír na hÓige
+toghchán na hUachtaránachta
+na hÚdaráis Chánach
+I mbun mo mhachnamh
+I mBéal Feirste agus i mBaile Átha Cliath
+ár n-acmhainní uisce
+eolaíocht na n-ábhar
+lucht na n-ealaíon
+ceol na n-éan
+ord na n-imeachtaí
+lucht adhartha na n-íomhánna
+gnéithe dár n-oidhreacht
+cultúr na n-óg
+ocht n-uaire sa lá
+formhór na n-údarás
+Ár nAthair
+Clár na nÁbhar
+Loch nEathach
+Cumann na nÉireannach Aontaithe
+Gréasán na nIontas
+nóibhíseacht na nÍosánach
+i gCeantar na nOileán
+Tír na nÓg
+Baile na nUltach
+Gort na nÚll
+ceol na ndaoine
+i nDún na nGall
+táim i ngrá leat
+labhair sé i nGaeilge!
+cén t-am é?
+tá an t-ádh orm inniu!
+Den obair an t-eolas
+An t-éileamh a íoc
+an t-inneall cuardaigh is fearr
+an t-íochtar a chur in uachtar
+Tabhair an t-ordú seo dó!
+Tá an t-ór buí aige.
+an t-uisce beatha ar an tábla.
+an t-úrscéal is deireanaí
+An tAcht Oideachais
+an tÁivé Máiria
+An tEarrach Arabach
+An tÉirí Amach
+An tImeall
+An tÍosánach Peadar Canisius
+An tOileánach
+An tÓr Muire
+an tUasal Éamon Ó Cuív
+An tÚdarás um Bóithre Náisiúnta
+ar aon tslí
+Béal Átha an tSléibhe
+Amach ón tsnáthaid
+Banríon an tSneachta
+ar an tsráid
+Caint an tSráidbhaile
+cora crua an tsaoil
+bholadh an tsáile
+uair sa tseachtain
+deireadh an tséasúir
+fear an tsiopa
+an tsíocháin a choimeád
+an tsochaí faisnéise
+gaoth an tsóláis
+Is beag an tsuim iad
+infheicthe ag an tsúil
+Cnoc an tSamhraidh
+Cionn tSáile
+an tSeirbhís Phoiblí
+Baile an tSéipéil
+An tSiria
+An tSín
+Oifig an tSoláthair
+Poll an tSómais
+Eolaire an tSuímh
+Casadh an tSúgáin
+scríobhfaidh
+preabphas
+úsáidtear
+snagcheol
+Stáitse Imbolc
+in-athnuaite agatsa
+Teanga Dhomhanda
+Réaltsruth
+na hataí
+Na Hataí
+ár nathair
+Ár Nathair
+t-léine
+torc allta
+tsk tsk tsk a chara
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index f7b0da41029..e5309066c9c 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -22,6 +22,11 @@
 (require 'case-table)
 (require 'ert)
 
+(defvar casefiddle-tests--resources-dir
+  (concat (file-name-directory (or load-file-name buffer-file-name))
+          "casefiddle-resources/")
+  "Directory with casing test data, next to the \"casefiddle-tests.el\" file.")
+
 (ert-deftest casefiddle-tests-char-properties ()
   "Sanity check of character Unicode properties."
   (should-not
@@ -161,17 +166,18 @@ casefiddle-tests--test-casing
         (while (and func-pairs expected)
           (setq funcs (car func-pairs)
                 getters (list get-string get-region))
-          (while (and funcs getters)
-            (let ((got (funcall (car getters) (car funcs))))
-              (unless (string-equal got (car expected))
-                (let ((fmt (length (symbol-name (car funcs)))))
-                  (setq fmt (format "\n%%%ds: %%s" (max fmt 8)))
-                  (push (format (concat fmt fmt fmt)
-                                (car funcs) (funcall fmt-str input)
-                                "expected" (funcall fmt-str (car expected))
-                                "but got" (funcall fmt-str got))
-                        errors))))
-            (setq funcs (cdr funcs) getters (cdr getters)))
+          (when (car expected)
+            (while (and funcs getters)
+              (let ((got (funcall (car getters) (car funcs))))
+                (unless (string-equal got (car expected))
+                  (let ((fmt (length (symbol-name (car funcs)))))
+                    (setq fmt (format "\n%%%ds: %%s" (max fmt 8)))
+                    (push (format (concat fmt fmt fmt)
+                                  (car funcs) (funcall fmt-str input)
+                                  "expected" (funcall fmt-str (car expected))
+                                  "but got" (funcall fmt-str got))
+                          errors))))
+              (setq funcs (cdr funcs) getters (cdr getters))))
           (setq func-pairs (cdr func-pairs) expected (cdr expected))))
       errors)
     (cons () tests))))
@@ -268,6 +274,36 @@ casefiddle-tests--test-casing
         ("į\u0307"                ; i-ogonek + dot above
          "Į" "į\u0307" "Į" "Į" "lt"))))))
 
+
+(defun casefiddle--read-lines (test-file)
+  "Return list of lines of TEST-FILE from resources dir, with spaces trimmed."
+  (with-temp-buffer
+    (insert-file-contents (concat casefiddle-tests--resources-dir test-file))
+    (split-string (buffer-string) "\n" nil " +")))
+
+(ert-deftest casefiddle-test-irish ()
+  "Check Irish (\"ga\") casing against the Mozilla-derived resource files."
+  (let (tests)
+    ;; Upcase cases: line N of the input file pairs with line N of the -ref file.
+    (let ((input    (casefiddle--read-lines "irish-uppercase-1.txt"))
+          (expected (casefiddle--read-lines "irish-uppercase-1-ref.txt")))
+      (while (and input expected)
+        (push (list (pop input) (pop expected) nil nil nil "ga") tests))
+      (should-not (or input expected)))
+
+    ;; Downcase cases: reuse the entry when the same input was seen above.
+    (let ((input    (casefiddle--read-lines "irish-lowercase-1.txt"))
+          (expected (casefiddle--read-lines "irish-lowercase-1-ref.txt")))
+      (while (and input expected)
+        (let* ((line (pop input)) (want (pop expected))
+               (test (assoc line tests)))
+          (if test (setcar (cddr test) want)
+            (push (list line nil want nil nil "ga") tests))))
+      (should-not (or input expected)))
+
+    (should-not (with-temp-buffer (casefiddle-tests--test-casing tests)))))
+
+
 (ert-deftest casefiddle-tests-casing-byte8 ()
   (should-not
    (with-temp-buffer
-- 
2.12.0.246.ga2ecc84866-goog






reply via email to

[Prev in Thread] Current Thread [Next in Thread]