emacs-elpa-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[elpa] externals/pyim 777f4c6b65 1/2: Improve pyim-cstring--adjust-duoyinzi


From: ELPA Syncer
Subject: [elpa] externals/pyim 777f4c6b65 1/2: Improve pyim-cstring--adjust-duoyinzi
Date: Sat, 14 Jan 2023 20:58:03 -0500 (EST)

branch: externals/pyim
commit 777f4c6b652063cac877013a0fe81b231b95e8fe
Author: Feng Shu <tumashu@163.com>
Commit: Feng Shu <tumashu@163.com>

    Improve pyim-cstring--adjust-duoyinzi
---
 pyim-cstring.el     | 91 ++++++++++++++++++++++++++++++++++++-----------------
 pyim-pymap.el       | 11 ++++---
 tests/pyim-tests.el | 40 +++++++++++++++++++++--
 3 files changed, 107 insertions(+), 35 deletions(-)

diff --git a/pyim-cstring.el b/pyim-cstring.el
index 7ddd406e98..3eb36fd954 100644
--- a/pyim-cstring.el
+++ b/pyim-cstring.el
@@ -112,18 +112,19 @@ BUG: 当 STRING 中包含其它标点符号,并且设置 SEPERATER 时,结
       (if return-list
           (list string)
         string)
-    (let* ((pinyins-list
+    (let* ((string-parts (pyim-cstring--partition string t))
+           (pinyins-list
             ;; ("Hello" "银" "行") -> (("Hello") ("yin") ("hang" "xing"))
             (mapcar (lambda (str)
                       (if (pyim-string-match-p "\\cc" str)
                           (pyim-pymap-cchar2py-get str)
                         (list str)))
-                    (pyim-cstring--partition string t)))
+                    string-parts))
            ;; 通过排列组合的方式, 重排 pinyins-list。
            ;; 比如:(("Hello") ("yin") ("hang")) -> (("Hello" "yin" "hang"))
            (pinyins-list (pyim-permutate-list
                           (pyim-cstring--adjust-duoyinzi
-                           string pinyins-list)))
+                           string-parts pinyins-list)))
            (list (mapcar (lambda (x)
                            (mapconcat (lambda (str)
                                         (if shou-zi-mu
@@ -138,39 +139,71 @@ BUG: 当 STRING 中包含其它标点符号,并且设置 SEPERATER 时,结
           list
         (string-join list " ")))))
 
-(defun pyim-cstring--adjust-duoyinzi (word pinyins-list)
-  "根据 WORD 对 PINYINS-LIST 进行校正。
+(defun pyim-cstring--adjust-duoyinzi (string-parts pinyins-list)
+  "根据 STRING-PARTS 对 PINYINS-LIST 进行校正。
 
 比如:
 
-1. WORD:         人民银行
+1. STRING-PARTS: (\"人\" \"民\" \"银\" \"行\")
 2. PINYINS-LIST: ((\"ren\") (\"min\") (\"yin\") (\"hang\" \"xing\"))
 3. 输出结果为:  ((\"ren\") (\"min\") (\"yin\") (\"hang\"))
 
 这个函数依赖 `pyim-pymap-duoyinzi' 提供的多音字数据。"
-  (mapcar (lambda (pinyins)
-            (if (= (length pinyins) 1)
-                pinyins
-              (let ((py-adjusted
-                     ;; NOTE: 多音字校正规则:
-                     ;; 1. 首先通过在 WORD 中搜索多音字组成的词条来校正。
-                     ;; 2. 如果多音字组成的词条无法搜索到,就使用这个多音字最常用的读音,
-                     ;;    这样处理有可能校正错误,但大多数情况还是适用的。
-                     (or (cl-find-if
-                          (lambda (pinyin)
-                            (when-let* ((x (pyim-pymap-py2duoyinzi-get pinyin)))
-                              (string-match-p (string-join x "\\|") word)))
-                          pinyins)
-                         (cl-find-if
-                          (lambda (pinyin)
-                            (when-let* ((x (pyim-pymap-py2duoyinzi-get pinyin t)))
-                              (string-match-p (string-join x "\\|") word)))
-                          pinyins))))
-                ;; 如果多音字校正没有任何结果,就用校正前的信息。
-                (if py-adjusted
-                    (list py-adjusted)
-                  pinyins))))
-          pinyins-list))
+  (let ((n (length pinyins-list))
+        output)
+    (dotimes (i n)
+      (let ((pinyins (nth i pinyins-list))
+            ;; 当前位置对应的汉字和位置前后汉字组成的两字词语。
+            (words (list (ignore-errors
+                           (concat (nth (- i 2) string-parts)
+                                   (nth (- i 1) string-parts)
+                                   (nth i string-parts)))
+                         (ignore-errors
+                           (concat (nth (- i 1) string-parts)
+                                   (nth i string-parts)))
+                         (ignore-errors
+                           (concat (nth i string-parts)
+                                   (nth (+ i 1) string-parts)))))
+            ;; 当前位置汉字
+            (char (list (nth i string-parts))))
+        (if (= (length pinyins) 1)
+            (push pinyins output)
+          (let ((py-adjusted
+                 (or
+                  ;; NOTE: 多音字校正规则:
+                  ;; 1. 首先通过 pyim 自带的多音字词语来校正,具体见:
+                  ;; `pyim-pymap-duoyinzi-words'
+                  (pyim-cstring--find-duoyinzi-pinyin pinyins words)
+                  ;; 2. 然后通过 pyim 自带的多音字常用读音进行校正, 具体见:
+                  ;; `pyim-pymap-duoyinzi-chars',
+                  ;;
+                  ;; NOTE: 如果用户想要使用某个汉字的偏僻读音,这样处理是有问题
+                  ;; 的,但大多数情况我们还是使用汉字的常用读音,让偏僻的读音进
+                  ;; 入用户个人词库似乎也没有什么好处。
+                  (pyim-cstring--find-duoyinzi-pinyin pinyins char t))))
+            ;; 3. 如果多音字校正没有结果,就使用未校正的信息。
+            (push (if py-adjusted
+                      (list py-adjusted)
+                    pinyins)
+                  output)))))
+    (reverse output)))
+
+(defun pyim-cstring--find-duoyinzi-pinyin (pinyins words &optional search-char)
+  "寻找一个汉字当前最可能的读音。
+
+以 (行) 作为例子:
+1. PINYINS:     此汉字所有的读音组成的列表,比如: (xing hang)
+2. WORDS:       此汉字本身或者和前后汉字组成的词语,比如: (银行 行业)
+3. SEARCH-CHAR: 如果仅仅搜索汉字本身,就设置为 t, 此处设置为 nil.
+4. 返回结果:    hang"
+  (cl-find-if
+   (lambda (pinyin)
+     (when-let* ((x (string-join (pyim-pymap-py2duoyinzi-get pinyin search-char) "--")))
+       (cl-some
+        (lambda (reg)
+          (and reg (string-match-p reg x)))
+        words)))
+   pinyins))
 
 ;;;###autoload
 (defun pyim-cstring-to-pinyin-simple (string &optional shou-zi-mu separator return-list)
diff --git a/pyim-pymap.el b/pyim-pymap.el
index c3a3f2367d..aaf7af3e2d 100644
--- a/pyim-pymap.el
+++ b/pyim-pymap.el
@@ -715,7 +715,7 @@
     ("ha" "蛤蟆" "癞蛤" "虾蟆")
     ("hai" "还是" "还有")
     ("hao" "貉子" "貉绒")
-    ("hang" "总行" "分行" "支行" "行业" "排行" "行情" "央行" "商行" "外行" "银行" "商行" "酒行" "麻行" "琴行" "巷道")
+    ("hang" "总行" "分行" "支行" "行业" "排行" "行情" "央行" "农行" "建行" "商行" "外行" "银行" "商行" "酒行" "麻行" "琴行" "巷道")
     ("he" "嘉和" "和睦" "亲和" "龙和" "之貉" "威吓" "恫吓" "恐吓" "鼎和" "锦和" "麒和苑" "合资" "鸿合")
     ("heng" "道行")
     ("hu" "鹄望" "鸿鹄" "鹄立")
@@ -837,7 +837,7 @@
     ("xie" "解数" "出血" "采血" "换血" "血糊" "尿血" "淤血" "放血" "血晕" "血淋" "便血" "吐血" "咯血" "叶韵" "蝎子")
     ("xiu" "铜臭" "乳臭" "成宿" "星宿")
     ("xin" "鸿信")
-    ("xing" "深省" "省视" "内省" "不省人事" "省悟" "省察" "旅行" "例行" "行程" "行乐" "龙行" "人行" "流行" "先行" "行星" "品行" "发行" "行政" "风行" "龙行" "龍行" "麟行")
+    ("xing" "深省" "省视" "内省" "不省人事" "省悟" "省察" "旅行" "例行" "行程" "行乐" "龙行" "不行" "也行" "很行" "人行" "流行" "先行" "行星" "品行" "发行" "行政" "风行" "龙行" "龍行" "麟行")
     ("xu" "牧畜" "畜产" "畜牧" "畜养" "气吁" "喘吁" "吁吁" "麦埂圩")
     ("xue" "削减" "削弱" "削瘦" "削球" "削平" "削价" "瘦削" "剥削" "削职" "删削" "削肩" "吸血")
     ("xun" "荨麻" "荨麻疹")
@@ -870,8 +870,11 @@
     ("zi" "吱声" "来兹" "今兹" "仔细" "仔猪")
     ("zu" "沐足" "足道")
     ("zuo" "撮毛" "小撮" "柞绸" "柞蚕" "柞树" "柞木")
-    ("zui" "咀唇" "尖沙咀" "黄达咀" "黄土咀" "鹰咀"))
-  "多音字对应的词组。")
+    ("zui" "咀唇" "沙咀" "达咀" "土咀" "鹰咀"))
+  "多音字对应的词组。
+
+注意:由于目前 pyim 实现的限制,词语中的多音字需要在词语的第一二
+三位,否则没有效果。")
 
 (defvar pyim-pymap--py2cchar-cache1 nil
   "拼音查汉字功能需要的变量.
diff --git a/tests/pyim-tests.el b/tests/pyim-tests.el
index 5718bb80b9..a0560c552b 100644
--- a/tests/pyim-tests.el
+++ b/tests/pyim-tests.el
@@ -795,11 +795,47 @@
                    "我爱-北京-天安-门"))))
 
 (ert-deftest pyim-tests-pyim-cstring-to-pinyin ()
+
+  (should (equal (pyim-cstring--find-duoyinzi-pinyin
+                  '("xing" "hang") '("银行"))
+                 "hang"))
+  (should (equal (pyim-cstring--find-duoyinzi-pinyin
+                  '("xing" "hang") '("不行" "行为"))
+                 "xing"))
+
+  (should (equal (pyim-cstring--find-duoyinzi-pinyin
+                  '("bu" "pi") '("不") t)
+                 "bu"))
+
   (should (equal (pyim-cstring--adjust-duoyinzi
-                  "银行传说" '(("yin") ("xing" "heng" "hang")
-                               ("zhuan" "chuan") ("yue" "shuo" "shui")))
+                  '("银" "行" "传" "说")
+                  '(("yin") ("xing" "heng" "hang")
+                    ("zhuan" "chuan") ("yue" "shuo" "shui")))
                  '(("yin") ("hang") ("chuan") ("shuo"))))
 
+  (should (equal (pyim-cstring--adjust-duoyinzi
+                  '("银" "行" "很" "行")
+                  '(("yin") ("xing" "heng" "hang")
+                    ("hen") ("xing" "heng" "hang")))
+                 '(("yin") ("hang") ("hen") ("xing"))))
+
+  (should (equal (pyim-cstring--adjust-duoyinzi
+                  '("银" "行" "行" "业" "很" "行"
+                    "不" "行" "也" "行"
+                    "行" "也" "行")
+                  '(("yin") ("xing" "heng" "hang")
+                    ("xing" "heng" "hang") ("ye")
+                    ("hen") ("xing" "heng" "hang")
+                    ("dun" "bu") ("xing" "heng" "hang")
+                    ("ye") ("xing" "heng" "hang")
+                    ("xing" "heng" "hang") ("ye")
+                    ("xing" "heng" "hang")))
+                 '(("yin") ("hang") ("hang")
+                   ("ye") ("hen") ("xing")
+                   ("bu") ("xing") ("ye")
+                   ("xing") ("xing" "heng" "hang")
+                   ("ye") ("xing"))))
+
   ;; pyim-cstring-split-to-list
   (should (equal (pyim-cstring-to-pinyin "银行传说") "yinhangchuanshuo"))
   (should (equal (pyim-cstring-to-pinyin "银行传说" t) "yhcs"))



reply via email to

[Prev in Thread] Current Thread [Next in Thread]