[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[elpa] externals/pyim 777f4c6b65 1/2: Improve pyim-cstring--adjust-duoyinzi
From: ELPA Syncer
Subject: [elpa] externals/pyim 777f4c6b65 1/2: Improve pyim-cstring--adjust-duoyinzi
Date: Sat, 14 Jan 2023 20:58:03 -0500 (EST)
branch: externals/pyim
commit 777f4c6b652063cac877013a0fe81b231b95e8fe
Author: Feng Shu <tumashu@163.com>
Commit: Feng Shu <tumashu@163.com>
Improve pyim-cstring--adjust-duoyinzi
---
pyim-cstring.el | 91 ++++++++++++++++++++++++++++++++++++-----------------
pyim-pymap.el | 11 ++++---
tests/pyim-tests.el | 40 +++++++++++++++++++++--
3 files changed, 107 insertions(+), 35 deletions(-)
diff --git a/pyim-cstring.el b/pyim-cstring.el
index 7ddd406e98..3eb36fd954 100644
--- a/pyim-cstring.el
+++ b/pyim-cstring.el
@@ -112,18 +112,19 @@ BUG: 当 STRING 中包含其它标点符号,并且设置 SEPERATER 时,结
(if return-list
(list string)
string)
- (let* ((pinyins-list
+ (let* ((string-parts (pyim-cstring--partition string t))
+ (pinyins-list
;; ("Hello" "银" "行") -> (("Hello") ("yin") ("hang" "xing"))
(mapcar (lambda (str)
(if (pyim-string-match-p "\\cc" str)
(pyim-pymap-cchar2py-get str)
(list str)))
- (pyim-cstring--partition string t)))
+ string-parts))
;; 通过排列组合的方式, 重排 pinyins-list。
;; 比如:(("Hello") ("yin") ("hang")) -> (("Hello" "yin" "hang"))
(pinyins-list (pyim-permutate-list
(pyim-cstring--adjust-duoyinzi
- string pinyins-list)))
+ string-parts pinyins-list)))
(list (mapcar (lambda (x)
(mapconcat (lambda (str)
(if shou-zi-mu
@@ -138,39 +139,71 @@ BUG: 当 STRING 中包含其它标点符号,并且设置 SEPERATER 时,结
list
(string-join list " ")))))
-(defun pyim-cstring--adjust-duoyinzi (word pinyins-list)
- "根据 WORD 对 PINYINS-LIST 进行校正。
+(defun pyim-cstring--adjust-duoyinzi (string-parts pinyins-list)
+ "根据 STRING-PARTS 对 PINYINS-LIST 进行校正。
比如:
-1. WORD: 人民银行
+1. STRING-PARTS: (\"人\" \"民\" \"银\" \"行\")
2. PINYINS-LIST: ((\"ren\") (\"min\") (\"yin\") (\"hang\" \"xing\"))
3. 输出结果为: ((\"ren\") (\"min\") (\"yin\") (\"hang\"))
这个函数依赖 `pyim-pymap-duoyinzi' 提供的多音字数据。"
- (mapcar (lambda (pinyins)
- (if (= (length pinyins) 1)
- pinyins
- (let ((py-adjusted
- ;; NOTE: 多音字校正规则:
- ;; 1. 首先通过在 WORD 中搜索多音字组成的词条来校正。
- ;; 2. 如果多音字组成的词条无法搜索到,就使用这个多音字最常用的读音,
- ;; 这样处理有可能校正错误,但大多数情况还是适用的。
- (or (cl-find-if
- (lambda (pinyin)
- (when-let* ((x (pyim-pymap-py2duoyinzi-get pinyin)))
- (string-match-p (string-join x "\\|") word)))
- pinyins)
- (cl-find-if
- (lambda (pinyin)
- (when-let* ((x (pyim-pymap-py2duoyinzi-get pinyin t)))
- (string-match-p (string-join x "\\|") word)))
- pinyins))))
- ;; 如果多音字校正没有任何结果,就用校正前的信息。
- (if py-adjusted
- (list py-adjusted)
- pinyins))))
- pinyins-list))
+ (let ((n (length pinyins-list))
+ output)
+ (dotimes (i n)
+ (let ((pinyins (nth i pinyins-list))
+ ;; 当前位置对应的汉字和位置前后汉字组成的两字词语。
+ (words (list (ignore-errors
+ (concat (nth (- i 2) string-parts)
+ (nth (- i 1) string-parts)
+ (nth i string-parts)))
+ (ignore-errors
+ (concat (nth (- i 1) string-parts)
+ (nth i string-parts)))
+ (ignore-errors
+ (concat (nth i string-parts)
+ (nth (+ i 1) string-parts)))))
+ ;; 当前位置汉字
+ (char (list (nth i string-parts))))
+ (if (= (length pinyins) 1)
+ (push pinyins output)
+ (let ((py-adjusted
+ (or
+ ;; NOTE: 多音字校正规则:
+ ;; 1. 首先通过 pyim 自带的多音字词语来校正,具体见:
+ ;; `pyim-pymap-duoyinzi-words'
+ (pyim-cstring--find-duoyinzi-pinyin pinyins words)
+ ;; 2. 然后通过 pyim 自带的多音字常用读音进行校正, 具体见:
+ ;; `pyim-pymap-duoyinzi-chars',
+ ;;
+ ;; NOTE: 如果用户想要使用某个汉字的偏僻读音,这样处理是有问题
+ ;; 的,但大多数情况我们还是使用汉字的常用读音,让偏僻的读音进
+ ;; 入用户个人词库似乎也没有什么好处。
+ (pyim-cstring--find-duoyinzi-pinyin pinyins char t))))
+ ;; 3. 如果多音字校正没有结果,就使用未校正的信息。
+ (push (if py-adjusted
+ (list py-adjusted)
+ pinyins)
+ output)))))
+ (reverse output)))
+
+(defun pyim-cstring--find-duoyinzi-pinyin (pinyins words &optional search-char)
+ "寻找一个汉字当前最可能的读音。
+
+以 (行) 作为例子:
+1. PINYINS: 此汉字所有的读音组成的列表,比如: (xing hang)
+2. WORDS: 此汉字本身或者和前后汉字组成的词语,比如: (银行 行业)
+3. SEARCH-CHAR: 如果仅仅搜索汉字本身,就设置为 t, 此处设置为 nil.
+4. 返回结果: hang"
+ (cl-find-if
+ (lambda (pinyin)
+ (when-let* ((x (string-join (pyim-pymap-py2duoyinzi-get pinyin search-char) "--")))
+ (cl-some
+ (lambda (reg)
+ (and reg (string-match-p reg x)))
+ words)))
+ pinyins))
;;;###autoload
(defun pyim-cstring-to-pinyin-simple (string &optional shou-zi-mu separator return-list)
diff --git a/pyim-pymap.el b/pyim-pymap.el
index c3a3f2367d..aaf7af3e2d 100644
--- a/pyim-pymap.el
+++ b/pyim-pymap.el
@@ -715,7 +715,7 @@
("ha" "蛤蟆" "癞蛤" "虾蟆")
("hai" "还是" "还有")
("hao" "貉子" "貉绒")
- ("hang" "总行" "分行" "支行" "行业" "排行" "行情" "央行" "商行" "外行" "银行" "商行" "酒行" "麻行" "琴行" "巷道")
+ ("hang" "总行" "分行" "支行" "行业" "排行" "行情" "央行" "农行" "建行" "商行" "外行" "银行" "商行" "酒行" "麻行" "琴行" "巷道")
("he" "嘉和" "和睦" "亲和" "龙和" "之貉" "威吓" "恫吓" "恐吓" "鼎和" "锦和" "麒和苑" "合资" "鸿合")
("heng" "道行")
("hu" "鹄望" "鸿鹄" "鹄立")
@@ -837,7 +837,7 @@
("xie" "解数" "出血" "采血" "换血" "血糊" "尿血" "淤血" "放血" "血晕" "血淋" "便血" "吐血" "咯血" "叶韵" "蝎子")
("xiu" "铜臭" "乳臭" "成宿" "星宿")
("xin" "鸿信")
- ("xing" "深省" "省视" "内省" "不省人事" "省悟" "省察" "旅行" "例行" "行程" "行乐" "龙行" "人行" "流行" "先行" "行星" "品行" "发行" "行政" "风行" "龙行" "龍行" "麟行")
+ ("xing" "深省" "省视" "内省" "不省人事" "省悟" "省察" "旅行" "例行" "行程" "行乐" "龙行" "不行" "也行" "很行" "人行" "流行" "先行" "行星" "品行" "发行" "行政" "风行" "龙行" "龍行" "麟行")
("xu" "牧畜" "畜产" "畜牧" "畜养" "气吁" "喘吁" "吁吁" "麦埂圩")
("xue" "削减" "削弱" "削瘦" "削球" "削平" "削价" "瘦削" "剥削" "削职" "删削" "削肩" "吸血")
("xun" "荨麻" "荨麻疹")
@@ -870,8 +870,11 @@
("zi" "吱声" "来兹" "今兹" "仔细" "仔猪")
("zu" "沐足" "足道")
("zuo" "撮毛" "小撮" "柞绸" "柞蚕" "柞树" "柞木")
- ("zui" "咀唇" "尖沙咀" "黄达咀" "黄土咀" "鹰咀"))
- "多音字对应的词组。")
+ ("zui" "咀唇" "沙咀" "达咀" "土咀" "鹰咀"))
+ "多音字对应的词组。
+
+注意:由于目前 pyim 实现的限制,词语中的多音字需要在词语的第一二
+三位,否则没有效果。")
(defvar pyim-pymap--py2cchar-cache1 nil
"拼音查汉字功能需要的变量.
diff --git a/tests/pyim-tests.el b/tests/pyim-tests.el
index 5718bb80b9..a0560c552b 100644
--- a/tests/pyim-tests.el
+++ b/tests/pyim-tests.el
@@ -795,11 +795,47 @@
"我爱-北京-天安-门"))))
(ert-deftest pyim-tests-pyim-cstring-to-pinyin ()
+
+ (should (equal (pyim-cstring--find-duoyinzi-pinyin
+ '("xing" "hang") '("银行"))
+ "hang"))
+ (should (equal (pyim-cstring--find-duoyinzi-pinyin
+ '("xing" "hang") '("不行" "行为"))
+ "xing"))
+
+ (should (equal (pyim-cstring--find-duoyinzi-pinyin
+ '("bu" "pi") '("不") t)
+ "bu"))
+
(should (equal (pyim-cstring--adjust-duoyinzi
- "银行传说" '(("yin") ("xing" "heng" "hang")
- ("zhuan" "chuan") ("yue" "shuo" "shui")))
+ '("银" "行" "传" "说")
+ '(("yin") ("xing" "heng" "hang")
+ ("zhuan" "chuan") ("yue" "shuo" "shui")))
'(("yin") ("hang") ("chuan") ("shuo"))))
+ (should (equal (pyim-cstring--adjust-duoyinzi
+ '("银" "行" "很" "行")
+ '(("yin") ("xing" "heng" "hang")
+ ("hen") ("xing" "heng" "hang")))
+ '(("yin") ("hang") ("hen") ("xing"))))
+
+ (should (equal (pyim-cstring--adjust-duoyinzi
+ '("银" "行" "行" "业" "很" "行"
+ "不" "行" "也" "行"
+ "行" "也" "行")
+ '(("yin") ("xing" "heng" "hang")
+ ("xing" "heng" "hang") ("ye")
+ ("hen") ("xing" "heng" "hang")
+ ("dun" "bu") ("xing" "heng" "hang")
+ ("ye") ("xing" "heng" "hang")
+ ("xing" "heng" "hang") ("ye")
+ ("xing" "heng" "hang")))
+ '(("yin") ("hang") ("hang")
+ ("ye") ("hen") ("xing")
+ ("bu") ("xing") ("ye")
+ ("xing") ("xing" "heng" "hang")
+ ("ye") ("xing"))))
+
;; pyim-cstring-split-to-list
(should (equal (pyim-cstring-to-pinyin "银行传说") "yinhangchuanshuo"))
(should (equal (pyim-cstring-to-pinyin "银行传说" t) "yhcs"))