emacs-elpa-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[elpa] externals/pyim af1bec4 14/36: update pyim-cstring.el


From: ELPA Syncer
Subject: [elpa] externals/pyim af1bec4 14/36: update pyim-cstring.el
Date: Thu, 22 Apr 2021 22:57:17 -0400 (EDT)

branch: externals/pyim
commit af1bec4ac2d2d43f627bab2abc9b97c079b4f110
Author: Feng Shu <tumashu@163.com>
Commit: Feng Shu <tumashu@163.com>

    update pyim-cstring.el
---
 pyim-cstring.el | 207 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 pyim.el         | 218 +++-----------------------------------------------------
 2 files changed, 215 insertions(+), 210 deletions(-)

diff --git a/pyim-cstring.el b/pyim-cstring.el
index db8c421..da89b06 100644
--- a/pyim-cstring.el
+++ b/pyim-cstring.el
@@ -199,6 +199,213 @@
   (goto-char (point-min))
   (message "分词完成!"))
 
+;; ** 中文字符串到拼音的转换工具
+;;;###autoload
+(defalias 'pyim-hanzi2pinyin 'pyim-cstring-to-pinyin)
+(defun pyim-cstring-to-pinyin (string &optional shou-zi-mu separator
+                                      return-list ignore-duo-yin-zi adjust-duo-yin-zi)
+  "将汉字字符串转换为对应的拼音字符串的工具.
+
+如果 SHOU-ZI-MU 设置为 t, 转换仅得到拼音首字母字符串。当
+RETURN-LIST 设置为 t 时,返回一个拼音列表,这个列表包含词条的一个
+或者多个拼音(词条包含多音字时);如果 IGNORE-DUO-YIN-ZI 设置为
+t, 遇到多音字时,只使用第一个拼音,其它拼音忽略;当
+ADJUST-DUO-YIN-Zi 设置为 t 时, `pyim-cstring-to-pinyin' 会使用 pyim 已
+安装的词库来校正多音字,但这个功能有一定的限制:
+
+1. pyim 普通词库中不存在的词条不能较正
+2. 多音字校正速度比较慢,实时转换会产生卡顿。
+
+BUG: 当 STRING 中包含其它标点符号,并且设置 SEPERATER 时,结果会
+包含多余的连接符:比如: '你=好' --> 'ni-=-hao'"
+  (if (not (pyim-string-match-p "\\cc" string))
+      (if return-list
+          (list string)
+        string)
+    (let (string-list pinyins-list pinyins-list-permutated pinyins-list-adjusted)
+
+      ;; 将汉字字符串转换为字符list,英文原样输出。
+      ;; 比如: “Hello银行” -> ("Hello" "银" "行")
+      (setq string-list
+            (if (pyim-string-match-p "\\CC" string)
+                ;; 处理中英文混合的情况
+                (split-string
+                 (replace-regexp-in-string
+                  "\\(\\cc\\)" "@@@@\\1@@@@" string)
+                 "@@@@")
+              ;; 如果词条只包含中文,使用`string-to-vector'
+              ;; 这样处理速度比较快。
+              (string-to-vector string)))
+
+      ;; 将上述汉字字符串里面的所有汉字转换为与之对应的拼音list。
+      ;; 比如: ("Hello" "银" "行") -> (("Hello") ("yin") ("hang" "xing"))
+      (mapc
+       #'(lambda (str)
+           ;; `string-to-vector' 得到的是 char vector, 需要将其转换为 string。
+           (when (numberp str)
+             (setq str (char-to-string str)))
+           (cond
+            ((> (length str) 1)
+             (push (list str) pinyins-list))
+            ((and (> (length str) 0)
+                  (pyim-string-match-p "\\cc" str))
+             (push (pyim-cchar2pinyin-get (string-to-char str))
+                   pinyins-list))
+            ((> (length str) 0)
+             (push (list str) pinyins-list))))
+       string-list)
+      (setq pinyins-list (nreverse pinyins-list))
+
+      ;; 通过排列组合的方式, 重排 pinyins-list。
+      ;; 比如:(("Hello") ("yin") ("hang" "xing")) -> (("Hello" "yin" "hang") ("Hello" "yin" "xing"))
+      (setq pinyins-list-permutated (pyim-permutate-list2 pinyins-list))
+
+      ;; 使用 pyim 的安装的词库来校正多音字。
+      (when adjust-duo-yin-zi
+        ;; 确保 pyim 词库加载
+        (pyim-dcache-init-variables)
+        (dolist (pinyin-list pinyins-list-permutated)
+          (let* ((py-str (mapconcat #'identity pinyin-list "-"))
+                 (words-from-dicts
+                  ;; pyim-buffer-list 中第一个 buffer 对应的是个人词库文件
+                  ;; 个人词库文件中的词条,极有可能存在 *多音字污染*。
+                  ;; 这是由 pyim 保存词条的机制决定的。
+                  (pyim-dcache-get py-str '(code2word))))
+            (when (member string words-from-dicts)
+              (push pinyin-list pinyins-list-adjusted))))
+        (setq pinyins-list-adjusted
+              (nreverse pinyins-list-adjusted)))
+
+      ;; 返回拼音字符串或者拼音列表
+      (let* ((pinyins-list
+              (or pinyins-list-adjusted
+                  pinyins-list-permutated))
+             (list (mapcar
+                    #'(lambda (x)
+                        (mapconcat
+                         #'(lambda (str)
+                             (if shou-zi-mu
+                                 (substring str 0 1)
+                               str))
+                         x separator))
+                    (if ignore-duo-yin-zi
+                        (list (car pinyins-list))
+                      pinyins-list))))
+        (if return-list
+            list
+          (mapconcat #'identity list " "))))))
+
+;;;###autoload
+(defalias 'pyim-hanzi2pinyin-simple 'pyim-cstring-to-pinyin-simple)
+(defun pyim-cstring-to-pinyin-simple (string &optional shou-zi-mu separator return-list)
+  "简化版的 `pyim-cstring-to-pinyin', 不处理多音字。"
+  (pyim-cstring-to-pinyin string shou-zi-mu separator return-list t))
+
+(defalias 'pyim-cwords-at-point 'pyim-cstring-words-at-point)
+(defun pyim-cstring-words-at-point (&optional end-of-point)
+  "获取光标当前的词条列表,当 END-OF-POINT 设置为 t 时,获取光标后的词条列表。
+词条列表的每一个元素都是列表,这些列表的第一个元素为词条,第二个元素为光标处到词条
+头部的距离,第三个元素为光标处到词条尾部的距离。
+
+其工作原理是:
+
+1. 使用 `thing-at-point' 获取当前光标处的一个字符串,一般而言:英文会得到
+   一个单词,中文会得到一个句子。
+2. 英文单词直接返回这个单词的列表。
+3. 中文句子首先用 `pyim-cstring-split-to-list' 分词,然后根据光标在中文句子
+   中的位置,筛选出符合要求的中文词条。得到并返回 *一个* 或者 *多个* 词条
+   的列表。"
+  ;;
+  ;;                                光标到词 光标到词
+  ;;                                首的距离 尾的距离
+  ;;                                       | |
+  ;; 获取光标当前的词<I>条列表 -> (("的词" 2 0) ("词条" 1 1))
+  ;;
+  (let* ((case-fold-search t)
+         (current-pos (point))
+         (current-char
+          (if end-of-point
+              (string (following-char))
+            (string (preceding-char))))
+         (str (thing-at-point 'word t))
+         (str-length (length str))
+         (str-boundary (bounds-of-thing-at-point 'word))
+         (str-beginning-pos (when str-boundary
+                              (car str-boundary)))
+         (str-end-pos (when str-boundary
+                        (cdr str-boundary)))
+         (str-offset
+          (when (and str-beginning-pos str-end-pos)
+            (if (= current-pos str-end-pos)
+                (1+ (- str-end-pos str-beginning-pos))
+              (1+ (- current-pos str-beginning-pos)))))
+         str-offset-adjusted words-alist results)
+
+    ;; 当字符串长度太长时, `pyim-cstring-split-to-list'
+    ;; 的速度比较慢,这里确保待分词的字符串长度不超过10.
+    (when (and str (not (pyim-string-match-p "\\CC" str)))
+      (if (> str-offset 5)
+          (progn (setq str-offset-adjusted 5)
+                 (setq str (substring str
+                                      (- str-offset 5)
+                                      (min (+ str-offset 5) str-length))))
+        (setq str-offset-adjusted str-offset)
+        (setq str (substring str 0 (min 9 str-length)))))
+
+    (cond
+     ((and str (not (pyim-string-match-p "\\CC" str)))
+      (setq words-alist
+            (pyim-cstring-split-to-list str))
+      (dolist (word-list words-alist)
+        (let ((word-begin (nth 1 word-list))
+              (word-end (nth 2 word-list)))
+          (if (if end-of-point
+                  (and (< str-offset-adjusted word-end)
+                       (>= str-offset-adjusted word-begin))
+                (and (<= str-offset-adjusted word-end)
+                     (> str-offset-adjusted word-begin)))
+              (push (list (car word-list)
+                          (- str-offset-adjusted word-begin) ;; 例如: ("你好" 1 1)
+                          (- word-end str-offset-adjusted))
+                    results))))
+      (or results
+          (list (if end-of-point
+                    (list current-char 0 1)
+                  (list current-char 1 0)))))
+     (str (list (list str
+                      (- current-pos str-beginning-pos)
+                      (- str-end-pos current-pos)))))))
+
+;; ** 让 forward/backward 支持中文
+(defalias 'pyim-forward-word 'pyim-cstring-forward-word)
+(defun pyim-cstring-forward-word (&optional arg)
+  "向前移动 ARG 英文或者中文词,向前移动时基于 *最长* 的词移动。"
+  (interactive "P")
+  (or arg (setq arg 1))
+  (dotimes (_ arg)
+    (let* ((words (pyim-cstring-words-at-point t))
+           (max-length
+            (cl-reduce #'max
+                       (cons 0 (mapcar #'(lambda (word)
+                                           (nth 2 word))
+                                       words))))
+           (max-length (max (or max-length 1) 1)))
+      (forward-char max-length))))
+
+(defalias 'pyim-backward-word 'pyim-cstring-backward-word)
+(defun pyim-cstring-backward-word (&optional arg)
+  "向后移动 ARG 个英文或者中文词,向后移动时基于 *最长* 的词移动。"
+  (interactive "P")
+  (or arg (setq arg 1))
+  (dotimes (_ arg)
+    (let* ((words (pyim-cstring-words-at-point))
+           (max-length
+            (cl-reduce #'max
+                       (cons 0 (mapcar #'(lambda (word)
+                                           (nth 1 word))
+                                       words))))
+           (max-length (max (or max-length 1) 1)))
+      (backward-char max-length))))
 
 ;; * Footer
 (provide 'pyim-cstring)
diff --git a/pyim.el b/pyim.el
index 6bbb1d3..4bd073a 100644
--- a/pyim.el
+++ b/pyim.el
@@ -489,8 +489,8 @@
 ;; 下面两个函数可以将中文字符串转换的拼音字符串或者列表,用于 emacs-lisp
 ;; 编程。
 
-;; 1. `pyim-hanzi2pinyin' (考虑多音字)
-;; 2. `pyim-hanzi2pinyin-simple'  (不考虑多音字)
+;; 1. `pyim-cstring-to-pinyin' (考虑多音字)
+;; 2. `pyim-cstring-to-pinyin-simple'  (不考虑多音字)
 
 ;; *** 中文分词
 ;; pyim 包含了一个简单的分词函数:`pyim-cstring-split-to-list', 可以
@@ -513,7 +513,7 @@
 ;; 词库中不存在的中文词条。
 
 ;; *** 获取光标处的中文词条
-;; pyim 包含了一个简单的命令:`pyim-cwords-at-point', 这个命令
+;; pyim 包含了一个简单的命令:`pyim-cstring-words-at-point', 这个命令
 ;; 可以得到光标处的 *英文* 或者 *中文* 词条的 *列表*,这个命令依赖分词函数:
 ;; `pyim-cstring-split-to-list'。
 
@@ -1228,7 +1228,7 @@ MERGE-METHOD 是一个函数,这个函数需要两个数字参数,代表
 词条 WORD 默认会追加到已有词条的后面,如果 PREPEND 设置为 t,
 词条就会放到已有词条的最前面。
 
-根据当前输入法,决定是调用 `pyim-hanzi2pinyin' 还是
+根据当前输入法,决定是调用 `pyim-cstring-to-pinyin' 还是
 `pyim-hanzi2xingma' 来获取中文词条的编码。
 
 WORDCOUNT-HANDLER 可以是一个数字,代表将此数字设置为 WORD 的新词频,
@@ -1248,7 +1248,7 @@ BUG:拼音无法有效地处理多音字。"
            (codes (cond ((eq class 'xingma)
                          (pyim-hanzi2xingma word scheme-name t))
                         ;;拼音使用了多音字校正
-                        (t (pyim-hanzi2pinyin word nil "-" t nil t)))))
+                        (t (pyim-cstring-to-pinyin word nil "-" t nil t)))))
       ;; 保存对应词条的词频
       (when (> (length word) 0)
         (pyim-dcache-call-api
@@ -2954,7 +2954,7 @@ alist 列表。"
                 (region-beginning) (region-end)))
              (when (and (not mark-active) (> length 0))
                (delete-char (- 0 length)))
-             (setq code (pyim-hanzi2pinyin
+             (setq code (pyim-cstring-to-pinyin
                          (replace-regexp-in-string " " "" string)
                          nil "-" nil t))
              (when (and (> code 0)
@@ -2989,210 +2989,8 @@ alist 列表。"
   (setq pyim-input-ascii
         (not pyim-input-ascii)))
 
-;; ** 让 forward/backward 支持中文
-(defun pyim-forward-word (&optional arg)
-  "向前移动 ARG 英文或者中文词,向前移动时基于 *最长* 的词移动。"
-  (interactive "P")
-  (or arg (setq arg 1))
-  (dotimes (_ arg)
-    (let* ((words (pyim-cwords-at-point t))
-           (max-length
-            (cl-reduce #'max
-                       (cons 0 (mapcar #'(lambda (word)
-                                           (nth 2 word))
-                                       words))))
-           (max-length (max (or max-length 1) 1)))
-      (forward-char max-length))))
-
-(defun pyim-backward-word (&optional arg)
-  "向后移动 ARG 个英文或者中文词,向后移动时基于 *最长* 的词移动。"
-  (interactive "P")
-  (or arg (setq arg 1))
-  (dotimes (_ arg)
-    (let* ((words (pyim-cwords-at-point))
-           (max-length
-            (cl-reduce #'max
-                       (cons 0 (mapcar #'(lambda (word)
-                                           (nth 1 word))
-                                       words))))
-           (max-length (max (or max-length 1) 1)))
-      (backward-char max-length))))
-
-(defun pyim-cwords-at-point (&optional end-of-point)
-  "获取光标当前的词条列表,当 END-OF-POINT 设置为 t 时,获取光标后的词条列表。
-词条列表的每一个元素都是列表,这些列表的第一个元素为词条,第二个元素为光标处到词条
-头部的距离,第三个元素为光标处到词条尾部的距离。
-
-其工作原理是:
-
-1. 使用 `thing-at-point' 获取当前光标处的一个字符串,一般而言:英文会得到
-   一个单词,中文会得到一个句子。
-2. 英文单词直接返回这个单词的列表。
-3. 中文句子首先用 `pyim-cstring-split-to-list' 分词,然后根据光标在中文句子
-   中的位置,筛选出符合要求的中文词条。得到并返回 *一个* 或者 *多个* 词条
-   的列表。"
-  ;;
-  ;;                                光标到词 光标到词
-  ;;                                首的距离 尾的距离
-  ;;                                       | |
-  ;; 获取光标当前的词<I>条列表 -> (("的词" 2 0) ("词条" 1 1))
-  ;;
-  (let* ((case-fold-search t)
-         (current-pos (point))
-         (current-char
-          (if end-of-point
-              (string (following-char))
-            (string (preceding-char))))
-         (str (thing-at-point 'word t))
-         (str-length (length str))
-         (str-boundary (bounds-of-thing-at-point 'word))
-         (str-beginning-pos (when str-boundary
-                              (car str-boundary)))
-         (str-end-pos (when str-boundary
-                        (cdr str-boundary)))
-         (str-offset
-          (when (and str-beginning-pos str-end-pos)
-            (if (= current-pos str-end-pos)
-                (1+ (- str-end-pos str-beginning-pos))
-              (1+ (- current-pos str-beginning-pos)))))
-         str-offset-adjusted words-alist results)
-
-    ;; 当字符串长度太长时, `pyim-cstring-split-to-list'
-    ;; 的速度比较慢,这里确保待分词的字符串长度不超过10.
-    (when (and str (not (pyim-string-match-p "\\CC" str)))
-      (if (> str-offset 5)
-          (progn (setq str-offset-adjusted 5)
-                 (setq str (substring str
-                                      (- str-offset 5)
-                                      (min (+ str-offset 5) str-length))))
-        (setq str-offset-adjusted str-offset)
-        (setq str (substring str 0 (min 9 str-length)))))
-
-    (cond
-     ((and str (not (pyim-string-match-p "\\CC" str)))
-      (setq words-alist
-            (pyim-cstring-split-to-list str))
-      (dolist (word-list words-alist)
-        (let ((word-begin (nth 1 word-list))
-              (word-end (nth 2 word-list)))
-          (if (if end-of-point
-                  (and (< str-offset-adjusted word-end)
-                       (>= str-offset-adjusted word-begin))
-                (and (<= str-offset-adjusted word-end)
-                     (> str-offset-adjusted word-begin)))
-              (push (list (car word-list)
-                          (- str-offset-adjusted word-begin) ;; 例如: ("你好" 1 1)
-                          (- word-end str-offset-adjusted))
-                    results))))
-      (or results
-          (list (if end-of-point
-                    (list current-char 0 1)
-                  (list current-char 1 0)))))
-     (str (list (list str
-                      (- current-pos str-beginning-pos)
-                      (- str-end-pos current-pos)))))))
-
-
-;; ** 汉字到拼音的转换工具
-;;;###autoload
-(defun pyim-hanzi2pinyin (string &optional shou-zi-mu separator
-                                 return-list ignore-duo-yin-zi adjust-duo-yin-zi)
-  "将汉字字符串转换为对应的拼音字符串的工具.
-
-如果 SHOU-ZI-MU 设置为 t, 转换仅得到拼音首字母字符串。当
-RETURN-LIST 设置为 t 时,返回一个拼音列表,这个列表包含词条的一个
-或者多个拼音(词条包含多音字时);如果 IGNORE-DUO-YIN-ZI 设置为
-t, 遇到多音字时,只使用第一个拼音,其它拼音忽略;当
-ADJUST-DUO-YIN-Zi 设置为 t 时, `pyim-hanzi2pinyin' 会使用 pyim 已
-安装的词库来校正多音字,但这个功能有一定的限制:
-
-1. pyim 普通词库中不存在的词条不能较正
-2. 多音字校正速度比较慢,实时转换会产生卡顿。
-
-BUG: 当 STRING 中包含其它标点符号,并且设置 SEPERATER 时,结果会
-包含多余的连接符:比如: '你=好' --> 'ni-=-hao'"
-  (if (not (pyim-string-match-p "\\cc" string))
-      (if return-list
-          (list string)
-        string)
-    (let (string-list pinyins-list pinyins-list-permutated pinyins-list-adjusted)
-
-      ;; 将汉字字符串转换为字符list,英文原样输出。
-      ;; 比如: “Hello银行” -> ("Hello" "银" "行")
-      (setq string-list
-            (if (pyim-string-match-p "\\CC" string)
-                ;; 处理中英文混合的情况
-                (split-string
-                 (replace-regexp-in-string
-                  "\\(\\cc\\)" "@@@@\\1@@@@" string)
-                 "@@@@")
-              ;; 如果词条只包含中文,使用`string-to-vector'
-              ;; 这样处理速度比较快。
-              (string-to-vector string)))
-
-      ;; 将上述汉字字符串里面的所有汉字转换为与之对应的拼音list。
-      ;; 比如: ("Hello" "银" "行") -> (("Hello") ("yin") ("hang" "xing"))
-      (mapc
-       #'(lambda (str)
-           ;; `string-to-vector' 得到的是 char vector, 需要将其转换为 string。
-           (when (numberp str)
-             (setq str (char-to-string str)))
-           (cond
-            ((> (length str) 1)
-             (push (list str) pinyins-list))
-            ((and (> (length str) 0)
-                  (pyim-string-match-p "\\cc" str))
-             (push (pyim-cchar2pinyin-get (string-to-char str))
-                   pinyins-list))
-            ((> (length str) 0)
-             (push (list str) pinyins-list))))
-       string-list)
-      (setq pinyins-list (nreverse pinyins-list))
-
-      ;; 通过排列组合的方式, 重排 pinyins-list。
-      ;; 比如:(("Hello") ("yin") ("hang" "xing")) -> (("Hello" "yin" "hang") ("Hello" "yin" "xing"))
-      (setq pinyins-list-permutated (pyim-permutate-list2 pinyins-list))
-
-      ;; 使用 pyim 的安装的词库来校正多音字。
-      (when adjust-duo-yin-zi
-        ;; 确保 pyim 词库加载
-        (pyim-dcache-init-variables)
-        (dolist (pinyin-list pinyins-list-permutated)
-          (let* ((py-str (mapconcat #'identity pinyin-list "-"))
-                 (words-from-dicts
-                  ;; pyim-buffer-list 中第一个 buffer 对应的是个人词库文件
-                  ;; 个人词库文件中的词条,极有可能存在 *多音字污染*。
-                  ;; 这是由 pyim 保存词条的机制决定的。
-                  (pyim-dcache-get py-str '(code2word))))
-            (when (member string words-from-dicts)
-              (push pinyin-list pinyins-list-adjusted))))
-        (setq pinyins-list-adjusted
-              (nreverse pinyins-list-adjusted)))
-
-      ;; 返回拼音字符串或者拼音列表
-      (let* ((pinyins-list
-              (or pinyins-list-adjusted
-                  pinyins-list-permutated))
-             (list (mapcar
-                    #'(lambda (x)
-                        (mapconcat
-                         #'(lambda (str)
-                             (if shou-zi-mu
-                                 (substring str 0 1)
-                               str))
-                         x separator))
-                    (if ignore-duo-yin-zi
-                        (list (car pinyins-list))
-                      pinyins-list))))
-        (if return-list
-            list
-          (mapconcat #'identity list " "))))))
-
-;;;###autoload
-(defun pyim-hanzi2pinyin-simple (string &optional shou-zi-mu separator return-list)
-  "简化版的 `pyim-hanzi2pinyin', 不处理多音字。"
-  (pyim-hanzi2pinyin string shou-zi-mu separator return-list t))
-
+;; ** pyim 中文字符串工具
+(require 'pyim-cstring)
 
 ;; ** pyim 探针程序
 (require 'pyim-probe)



reply via email to

[Prev in Thread] Current Thread [Next in Thread]