[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[elpa] externals/pyim e0f80e5d36 11/12: Merge branch 'dcache'
From: |
ELPA Syncer |
Subject: |
[elpa] externals/pyim e0f80e5d36 11/12: Merge branch 'dcache' |
Date: |
Thu, 9 Jun 2022 10:57:53 -0400 (EDT) |
branch: externals/pyim
commit e0f80e5d36926f269174cc16385754ec6141f791
Merge: 1e0834c456 7288242a6d
Author: Feng Shu <tumashu@163.com>
Commit: Feng Shu <tumashu@163.com>
Merge branch 'dcache'
---
README.org | 2 +-
pyim-candidates.el | 2 +-
pyim-cstring.el | 2 +-
pyim-dcache.el | 206 ++++++------
pyim-dhashcache.el | 922 ++++++++++++++++++++++++++++------------------------
pyim-dregcache.el | 575 +++++++++++++++++---------------
pyim-process.el | 8 +-
pyim.el | 27 +-
tests/pyim-tests.el | 10 +-
9 files changed, 927 insertions(+), 827 deletions(-)
diff --git a/README.org b/README.org
index d278dc124e..81e423139b 100644
--- a/README.org
+++ b/README.org
@@ -33,7 +33,7 @@
1. 五笔用户
1. 需要 (require 'pyim-wbdict), 加载五笔 scheme 设置。
2. 需要将自己的五笔词库文件中的 code-prefix "." 替换为 "wubi/".
- 3. 运行 `pyim-dcache-upgrade' 命令,升级 icode2word 词库缓存。
+ 3. 运行 `pyim-upgrade' 命令,升级 icode2word 词库缓存。
2. 仓颉用户
1. 需要 (require 'pyim-cangjie5dict), 加载仓颉 scheme 设置。
2. 需要将自己的五笔词库文件中的 code-prefix "@" 替换为 "cangjie/".
diff --git a/pyim-candidates.el b/pyim-candidates.el
index 3f035fddbe..38675707b6 100644
--- a/pyim-candidates.el
+++ b/pyim-candidates.el
@@ -46,7 +46,7 @@
;; ** 获取备选词列表
(defun pyim-candidates-sort (candidates)
"对 CANDIDATES 进行排序。"
- (pyim-dcache-call-api 'sort-words candidates))
+ (pyim-dcache-sort-words candidates))
(cl-defgeneric pyim-candidates-get-chief (scheme &optional personal-words
common-words)
"PYIM 输入法第一位候选词的获取策略。")
diff --git a/pyim-cstring.el b/pyim-cstring.el
index c90b78afec..7e9dbff6ba 100644
--- a/pyim-cstring.el
+++ b/pyim-cstring.el
@@ -182,7 +182,7 @@ BUG: 当 STRING 中包含其它标点符号,并且设置 SEPERATER 时,结
(mapcar (lambda (x)
(when (string-prefix-p prefix x)
(string-remove-prefix prefix x)))
- (sort (cl-copy-list (pyim-dcache-call-api
'search-word-code string))
+ (sort (cl-copy-list (pyim-dcache-search-word-code string))
(lambda (a b)
(> (length a) (length b))))))
(codes (remove nil dcache-codes)))
diff --git a/pyim-dcache.el b/pyim-dcache.el
index 79e0236bab..7dff11ebb8 100644
--- a/pyim-dcache.el
+++ b/pyim-dcache.el
@@ -65,26 +65,7 @@ pyim 对资源的消耗。
2. 自动更新功能无法正常工作,用户通过手工从其他机器上拷贝
dcache 文件的方法让 pyim 正常工作。")
-;; ** Dcache API 调用功能
-(defun pyim-dcache-call-api (api-name &rest api-args)
- "Get backend API named API-NAME then call it with arguments API-ARGS."
- ;; make sure the backend is load
- (unless (featurep pyim-dcache-backend)
- (require pyim-dcache-backend))
- (let ((func (intern (concat (symbol-name pyim-dcache-backend)
- "-" (symbol-name api-name)))))
- (if (functionp func)
- (apply func api-args)
- (when pyim-debug
- (message "%S 不是一个有效的 dcache api 函数." (symbol-name func))
- ;; Need to return nil
- nil))))
-
-;; ** Dcache 变量处理相关功能
-(defun pyim-dcache-init-variables ()
- "初始化 dcache 缓存相关变量."
- (pyim-dcache-call-api 'init-variables))
-
+;; ** Dcache 变量初始化相关函数
(defmacro pyim-dcache-init-variable (variable &optional fallback-value)
"初始化 VARIABLE.
@@ -96,18 +77,22 @@ dcache 文件的方法让 pyim 正常工作。")
,fallback-value
(make-hash-table :test #'equal)))))
-(defmacro pyim-dcache-reload-variable (variable)
- "从 `pyim-dcache-directory' 重新读取并设置 VARIABLE 的值."
- `(when (symbolp ',variable)
- (setq ,variable (or (pyim-dcache-get-value ',variable)
- (make-hash-table :test #'equal)))))
-
(defun pyim-dcache-get-value (variable)
"从 `pyim-dcache-directory' 中读取与 VARIABLE 对应的文件中保存的值."
(let ((file (expand-file-name (url-hexify-string (symbol-name variable))
pyim-dcache-directory)))
(pyim-dcache-get-value-from-file file)))
+(defun pyim-dcache-get-value-from-file (file)
+ "读取保存到 FILE 里面的 value."
+ (when (and (> (length file) 0)
+ (file-exists-p file))
+ (with-temp-buffer
+ (insert-file-contents file)
+ (ignore-errors
+ (read (current-buffer))))))
+
+;; ** Dcache 保存变量相关函数
(defun pyim-dcache-save-variable (variable value &optional
auto-backup-threshold)
"将 VARIABLE 变量的取值保存到 `pyim-dcache-directory' 中对应文件中.
@@ -117,14 +102,6 @@ dcache 文件的方法让 pyim 正常工作。")
pyim-dcache-directory)))
(pyim-dcache-save-value-to-file value file auto-backup-threshold)))
-(defun pyim-dcache-value-length (value)
- "获取 VALUE 的某个可以作为长度的值."
- (or (ignore-errors
- (if (hash-table-p value)
- (hash-table-count value)
- (length value)))
- 0))
-
(defun pyim-dcache-save-value-to-file (value file &optional
auto-backup-threshold)
"将 VALUE 保存到 FILE 文件中.
@@ -155,16 +132,14 @@ AUTO-BACKUP-THRESHOLD 倍, 那么原值将自动备份到 FILE 对应的备份
(insert ";; End:")
(pyim-dcache-write-file file)))))
-(defun pyim-dcache-get-value-from-file (file)
- "读取保存到 FILE 里面的 value."
- (when (and (> (length file) 0)
- (file-exists-p file))
- (with-temp-buffer
- (insert-file-contents file)
- (ignore-errors
- (read (current-buffer))))))
+(defun pyim-dcache-value-length (value)
+ "获取 VALUE 的某个可以作为长度的值."
+ (or (ignore-errors
+ (if (hash-table-p value)
+ (hash-table-count value)
+ (length value)))
+ 0))
-;; ** Dcache 文件处理功能
(defun pyim-dcache-write-file (filename &optional confirm)
"A helper function to write dcache files."
(let ((coding-system-for-write 'utf-8-unix)
@@ -181,98 +156,99 @@ AUTO-BACKUP-THRESHOLD 倍, 那么原值将自动备份到 FILE 对应的备份
(write-region (point-min) (point-max) filename nil :silent)
(message "Saving file %s..." filename)))
-(defun pyim-dcache-save-caches ()
- "保存 dcache.
+(defun pyim-dcache-create-files-md5 (files)
+ "为 FILES 生成 md5 字符串。"
+ ;; 当需要强制更新 dict 缓存时,更改这个字符串。
+ (let ((version "v1"))
+ (md5 (prin1-to-string
+ (mapcar (lambda (file)
+ (list version file (nth 5 (file-attributes file 'string))))
+ files)))))
- 将用户选择过的词生成的缓存和词频缓存的取值
- 保存到它们对应的文件中.
+;; ** Dcache 重新加载变量相关函数
+(defmacro pyim-dcache-reload-variable (variable)
+ "从 `pyim-dcache-directory' 重新读取并设置 VARIABLE 的值."
+ `(when (symbolp ',variable)
+ (setq ,variable (or (pyim-dcache-get-value ',variable)
+ (make-hash-table :test #'equal)))))
- 这个函数默认作为 `kill-emacs-hook' 使用。"
- (interactive)
- (pyim-dcache-call-api 'save-personal-dcache-to-file)
- t)
+;; ** Dcache 初始化功能接口
+(cl-defgeneric pyim-dcache-init-variables ()
+ "初始化 dcache 缓存相关变量."
+ nil)
-;; ** Dcache 导出功能
-(defun pyim-dcache-export-words-and-counts (file &optional confirm
ignore-counts)
- "将个人词条以及词条对应的词频信息导出到文件 FILE.
+(cl-defmethod pyim-dcache-init-variables :before ()
+ (unless (featurep pyim-dcache-backend)
+ (require pyim-dcache-backend)))
-如果 FILE 为 nil, 提示用户指定导出文件位置, 如果 CONFIRM 为
-non-nil,文件存在时将会提示用户是否覆盖,默认为覆盖模式"
- (interactive "F将词条和词频信息导出到文件: ")
- (pyim-dcache-init-variables)
- (pyim-dcache-call-api 'export-words-and-counts file confirm ignore-counts)
- (message "PYIM: 词条和词频信息导出完成。"))
+;; ** Dcache 检索词条功能接口
+(cl-defgeneric pyim-dcache-get (_code &optional _from)
+ "从 FROM 对应的 dcache 中搜索 CODE, 得到对应的词条.
-(defun pyim-dcache-export-personal-words (file &optional confirm)
- "将用户的个人词条导出为 pyim 词库文件.
+当词库文件加载完成后,pyim 就可以用这个函数从词库缓存中搜索某个
+code 对应的中文词条了."
+ nil)
-如果 FILE 为 nil, 提示用户指定导出文件位置, 如果 CONFIRM 为 non-nil,
-文件存在时将会提示用户是否覆盖,默认为覆盖模式。"
- (interactive "F将个人词条导出到文件:")
- (pyim-dcache-init-variables)
- (pyim-dcache-call-api 'export-personal-words file confirm)
- (message "PYIM: 个人词条导出完成。"))
+(cl-defmethod pyim-dcache-get :before (_code &optional _from)
+ (unless (featurep pyim-dcache-backend)
+ (require pyim-dcache-backend)))
-;; ** Dcache 更新功能
-(defun pyim-dcache-update (&optional force)
- "读取并加载所有相关词库 dcache.
+;; ** Dcache 代码反查功能接口
+(cl-defgeneric pyim-dcache-search-word-code (word)
+ "从 dcache 中搜索 WORD 对应的 code.")
-如果 FORCE 为真,强制加载。"
- (pyim-dcache-call-api 'update force))
+;; ** Dcache 加词功能接口
+(cl-defgeneric pyim-dcache-insert-word (word code prepend)
+ "将词条 WORD 插入到 dcache 中。
-(defun pyim-dcache-create-files-md5 (files)
- "为 FILES 生成 md5 字符串。"
- ;; 当需要强制更新 dict 缓存时,更改这个字符串。
- (let ((version "v1"))
- (md5 (prin1-to-string
- (mapcar (lambda (file)
- (list version file (nth 5 (file-attributes file 'string))))
- files)))))
+如果 PREPEND 为 non-nil, 词条将放到已有词条的最前面。
+内部函数会根据 CODE 来确定插入对应的 hash key.")
-(defun pyim-dcache-update-wordcount (word &optional wordcount-handler)
+;; ** Dcache 删词功能
+(cl-defgeneric pyim-dcache-delete-word (word)
+ "将中文词条 WORD 从个人词库中删除")
+
+;; ** Dcache 更新功能接口
+(cl-defgeneric pyim-dcache-update (&optional force)
+ "读取并加载所有相关词库 dcache, 如果 FORCE 为真,强制加载。")
+
+;; ** Dcache 更新词条统计量功能接口
+(cl-defgeneric pyim-dcache-update-wordcount (word &optional wordcount-handler)
"保存 WORD 词频.
1. 如果 WORDCOUNT-HANDLER 是一个函数:那么其返回值将作为词频保存,
参数为原有词频。
2. 如果 WORDCOUNT-HANDLER 是一个数值:那么这个数值直接作为词频保存。
-3. 如果 WORDCOUNT-HANDLER 为其他值:词频不变."
- (pyim-dcache-call-api 'update-iword2count word wordcount-handler))
+3. 如果 WORDCOUNT-HANDLER 为其他值:词频不变.")
-;; ** Dcache 加词功能
-(defun pyim-dcache-insert-word (word code prepend)
- "将词条 WORD 插入到 dcache 中。
+;; ** Dcache 升级功能接口
+(cl-defgeneric pyim-dcache-upgrade ()
+ "升级词库缓存.")
-如果 PREPEND 为 non-nil, 词条将放到已有词条的最前面。
-内部函数会根据 CODE 来确定插入对应的 hash key."
- (pyim-dcache-call-api 'insert-word-into-icode2word word code prepend)
- ;; NOTE: 保存词条到 icode2word 词库缓存的同时,也在 ishortcode2word 词库缓存中
- ;; 临时写入一份,供当前 Emacs session 使用,但退出时 pyim 不会保存
- ;; ishortcode2word 词库缓存到文件,因为下次启动 Emacs 的时候,ishortcode2word
- ;; 词库缓存会从 icode2word 再次重建。
- (pyim-dcache-call-api 'insert-word-into-ishortcode2word word code prepend))
-
-;; ** Dcache 升级功能
-(defun pyim-dcache-upgrade ()
- "升级词库缓存.
-
-当前已有的功能:
-1. 基于 :code-prefix-history 信息,升级为新的 code-prefix。"
- (interactive)
- (pyim-dcache-call-api 'upgrade-icode2word))
+;; ** Dcache 排序功能接口
+(cl-defgeneric pyim-dcache-sort-words (words)
+ "对 WORDS 进行排序。"
+ words)
-;; ** Dcache 删词功能
-(defun pyim-dcache-delete-word (word)
- "将中文词条 WORD 从个人词库中删除"
- (pyim-dcache-call-api 'delete-word word))
+;; ** Dcache 保存功能接口
+(cl-defgeneric pyim-dcache-save-caches ()
+ "保存 dcache.
-;; ** Dcache 检索功能
-(defun pyim-dcache-get (code &optional from)
- "从 FROM 对应的 dcache 中搜索 CODE, 得到对应的词条.
+将用户选择过的词生成的缓存和词频缓存的取值
+保存到它们对应的文件中.")
-当词库文件加载完成后,pyim 就可以用这个函数从词库缓存中搜索某个
-code 对应的中文词条了."
- (when code
- (pyim-dcache-call-api 'get code from)))
+;; ** Dcache 导出功能接口
+(cl-defgeneric pyim-dcache-export-words-and-counts (file &optional confirm
ignore-counts)
+ "将个人词条以及词条对应的词频信息导出到文件 FILE.
+
+如果 FILE 为 nil, 提示用户指定导出文件位置, 如果 CONFIRM 为
+non-nil,文件存在时将会提示用户是否覆盖,默认为覆盖模式")
+
+(cl-defgeneric pyim-dcache-export-personal-words (file &optional confirm)
+ "将用户的个人词条导出为 pyim 词库文件.
+
+如果 FILE 为 nil, 提示用户指定导出文件位置, 如果 CONFIRM 为 non-nil,
+文件存在时将会提示用户是否覆盖,默认为覆盖模式。")
;; * Footer
(provide 'pyim-dcache)
diff --git a/pyim-dhashcache.el b/pyim-dhashcache.el
index db2b0a8ad6..f07cdf7811 100644
--- a/pyim-dhashcache.el
+++ b/pyim-dhashcache.el
@@ -79,11 +79,167 @@
(defvar pyim-dhashcache-update-iword2priority-p nil)
(defvar pyim-dhashcache-update-code2word-running-p nil)
-(defun pyim-dhashcache-update (&optional force)
+;; ** 初始化 dhashcache 相关函数
+(cl-defmethod pyim-dcache-init-variables
+ (&context (pyim-dcache-backend (eql pyim-dhashcache)))
+ "初始化 dcache 缓存相关变量."
+ (when (and (not pyim-dhashcache-icode2word)
+ pyim-dcache-directory
+ (file-directory-p pyim-dcache-directory)
+ (directory-files pyim-dcache-directory nil "-backup-"))
+ (message "PYIM: 在 %S 目录中发现备份文件的存在,可能是词库缓存文件损坏导致,请抓紧检查处理!!!"
+ pyim-dcache-directory))
+ (pyim-dhashcache-init-count-and-priority-variables)
+ (pyim-dcache-init-variable pyim-dhashcache-code2word)
+ (pyim-dcache-init-variable pyim-dhashcache-word2code)
+ (pyim-dcache-init-variable pyim-dhashcache-shortcode2word)
+ (pyim-dcache-init-variable pyim-dhashcache-icode2word)
+ (pyim-dcache-init-variable pyim-dhashcache-ishortcode2word))
+
+(defun pyim-dhashcache-init-count-and-priority-variables ()
+ "初始化 count 相关的变量。"
+ (pyim-dcache-init-variable pyim-dhashcache-iword2count)
+ (pyim-dcache-init-variable pyim-dhashcache-iword2count-log)
+ (pyim-dcache-init-variable pyim-dhashcache-iword2count-recent-10-words)
+ (pyim-dcache-init-variable pyim-dhashcache-iword2count-recent-50-words)
+ (pyim-dcache-init-variable pyim-dhashcache-iword2priority))
+
+;; ** 从 dhashcache 搜索词条相关函数
+(cl-defmethod pyim-dcache-get
+ (code &context (pyim-dcache-backend (eql pyim-dhashcache))
+ &optional from)
+ "从 FROM 对应的 dcaches 中搜索 CODE, 得到对应的词条.
+
+当词库文件加载完成后,pyim 就可以用这个函数从词库缓存中搜索某个
+code 对应的中文词条了。
+
+如果 FROM 为 nil, 则默认搜索 `pyim-dhashcache-icode2word' 和
+`pyim-dhashcache-code2word' 两个 dcache."
+ (when code
+ (let* ((caches (mapcar (lambda (x)
+ (intern (concat "pyim-dhashcache-" (symbol-name
x))))
+ (or (and from
+ (if (listp from)
+ from
+ (list from)))
+ '(icode2word code2word))))
+ result)
+ (dolist (cache caches)
+ (let* ((cache (ignore-errors (symbol-value cache)))
+ (value (and cache (gethash code cache))))
+ ;; 处理 iword2count.
+ (unless (listp value)
+ (setq value (list value)))
+ (when value
+ (setq result (append result value)))))
+ result)))
+
+;; ** 从 dhashcache 搜索代码相关函数
+(cl-defmethod pyim-dcache-search-word-code
+ (string &context (pyim-dcache-backend (eql pyim-dhashcache)))
+ (gethash string pyim-dhashcache-word2code))
+
+;; ** 给 dhashcache 添加词条相关函数
+(cl-defmethod pyim-dcache-insert-word
+ (word code prepend
+ &context (pyim-dcache-backend (eql pyim-dhashcache)))
+ "将词条 WORD 插入到下面两个词库缓存中。
+
+1. `pyim-dhashcache-icode2word'
+2. `pyim-dhashcache-ishortcode2word'.
+ (pyim-dhashcache-insert-word-into-icode2word word code prepend)
+ ;; NOTE: 保存词条到 icode2word 词库缓存的同时,也在 ishortcode2word 词库缓存中
+ ;; 临时写入一份,供当前 Emacs session 使用,但退出时 pyim 不会保存
+ ;; ishortcode2word 词库缓存到文件,因为下次启动 Emacs 的时候,ishortcode2word
+ ;; 词库缓存会从 icode2word 再次重建。
+ (pyim-dhashcache-insert-word-into-ishortcode2word word code prepend))
+
+(defmacro pyim-dhashcache-put (cache code &rest body)
+ "将 BODY 的返回值保存到 CACHE 对应的 CODE 中。
+
+注意事项:这个宏是一个指代宏,其中 orig-value 在这个宏中有特殊含
+义,代表原来 code 对应的取值。"
+ (declare (indent 0))
+ (let ((key (make-symbol "key"))
+ (table (make-symbol "table"))
+ (new-value (make-symbol "new-value")))
+ `(let* ((,key ,code)
+ (,table ,cache)
+ (orig-value (gethash ,key ,table))
+ ,new-value)
+ (setq ,new-value (progn ,@body))
+ (puthash ,key ,new-value ,table))))
+
+(defun pyim-dhashcache-insert-word-into-icode2word (word code prepend)
+ "将词条 WORD 插入到 icode2word 词库缓存 CODE 键对应的位置.
+
+默认 WORD 放到已有词条的最后,如果 PREPEND 为 non-nil, WORD 将放
+到已有词条的最前面。"
+ (pyim-dhashcache-put
+ pyim-dhashcache-icode2word code
+ (if prepend
+ `(,word ,@(remove word orig-value))
+ `(,@(remove word orig-value) ,word))))
+
+(defun pyim-dhashcache-insert-word-into-ishortcode2word (word code prepend)
+ "将词条 WORD 插入到 ishortcode2word 词库缓存 CODE 首字母字符串对应的位置.
+
+默认 WORD 放到已有词条的最后,如果 PREPEND 为 non-nil, WORD 将放
+到已有词条的最前面。"
+ (dolist (newcode (pyim-dhashcache-get-ishortcodes code))
+ (pyim-dhashcache-put
+ pyim-dhashcache-ishortcode2word
+ newcode
+ (if prepend
+ `(,word ,@(remove word orig-value))
+ `(,@(remove word orig-value) ,word)))))
+
+(defun pyim-dhashcache-get-ishortcodes (code)
+ "获取 CODE 所有的简写 ishortcodes.
+
+比如: ni-hao -> (n-h)
+
+注意事项:这个函数用于全拼输入法。"
+ (when (and (> (length code) 0)
+ (not (pyim-string-match-p "/" code))
+ (not (pyim-string-match-p "[^a-z-]" code)))
+ (list (mapconcat
+ (lambda (x)
+ (substring x 0 1))
+ (split-string code "-") "-"))))
+
+;; ** 从 dhashcache 删除词条相关函数
+(cl-defmethod pyim-dcache-delete-word
+ (word &context (pyim-dcache-backend (eql pyim-dhashcache)))
+ "将中文词条 WORD 从个人词库中删除"
+ (maphash
+ (lambda (key value)
+ (when (member word value)
+ (let ((new-value (remove word value)))
+ (if new-value
+ (puthash key new-value pyim-dhashcache-icode2word)
+ (remhash key pyim-dhashcache-icode2word)))))
+ pyim-dhashcache-icode2word)
+ (maphash
+ (lambda (key value)
+ (when (member word value)
+ (print value)
+ (let ((new-value (remove word value)))
+ (if new-value
+ (puthash key new-value pyim-dhashcache-ishortcode2word)
+ (remhash key pyim-dhashcache-ishortcode2word)))))
+ pyim-dhashcache-ishortcode2word)
+ (remhash word pyim-dhashcache-iword2count)
+ (remhash word pyim-dhashcache-iword2count-log)
+ (remhash word pyim-dhashcache-iword2priority))
+
+;; ** 更新 dhashcache 相关函数
+(cl-defmethod pyim-dcache-update
+ (&context (pyim-dcache-backend (eql pyim-dhashcache)) &optional force)
"读取并加载所有相关词库 dcache.
如果 FORCE 为真,强制加载。"
- (pyim-dhashcache-init-variables)
+ (pyim-dcache-init-variables)
(when pyim-dcache-auto-update
(pyim-dhashcache-update-iword2priority force)
(pyim-dhashcache-update-personal-words force)
@@ -91,22 +247,57 @@
(dicts-md5 (pyim-dcache-create-files-md5 dict-files)))
(pyim-dhashcache-update-code2word dict-files dicts-md5 force))))
-(defun pyim-dhashcache-sort-words (words-list)
- "对 WORDS-LIST 排序"
- (let ((iword2count pyim-dhashcache-iword2count)
- (iword2priority pyim-dhashcache-iword2priority))
- (sort words-list
- (lambda (a b)
- (let ((p1 (gethash a iword2priority))
- (p2 (gethash b iword2priority)))
- (cond
- ((and (listp p1)
- (listp p2)
- (not (equal p1 p2)))
- (pyim-numbers> p1 p2))
- (t (let ((n1 (or (gethash a iword2count) 0))
- (n2 (or (gethash b iword2count) 0)))
- (> n1 n2)))))))))
+(defun pyim-dhashcache-update-iword2priority (&optional force)
+ "更新词条优先级表,如果 FORCE 为真,强制更新。"
+ (interactive)
+ (when (or force (not pyim-dhashcache-update-iword2priority-p))
+ ;; NOTE: 这个变量按理说应该在回调函数里面设置,但 async 在某些情况下会卡死,
+ ;; 这个变量无法设置为 t, 导致后续产生大量的 emacs 进程,极其影响性能。
+ (setq pyim-dhashcache-update-iword2priority-p t)
+ (async-start
+ `(lambda ()
+ ,@(pyim-dhashcache-async-inject-variables)
+ (require 'pyim-dhashcache)
+ (pyim-dhashcache-init-count-and-priority-variables)
+ (maphash
+ (lambda (key value)
+ (puthash key
+ (pyim-dhashcache-calculate-priority
+ (pyim-dhashcache-get-counts-from-log
+ value))
+ pyim-dhashcache-iword2priority))
+ pyim-dhashcache-iword2count-log)
+ (pyim-dcache-save-variable
+ 'pyim-dhashcache-iword2priority
+ pyim-dhashcache-iword2priority)
+ nil)
+ (lambda (_)
+ (pyim-dcache-reload-variable pyim-dhashcache-iword2priority)))))
+
+(defun pyim-dhashcache-async-inject-variables ()
+ "pyim's async-inject-variables."
+ (list (async-inject-variables "^load-path$")
+ (async-inject-variables "^exec-path$")
+ (async-inject-variables "^pyim-.+?directory$")))
+
+(defun pyim-dhashcache-calculate-priority (counts-info)
+ "根据 COUNTS-INFO 计算优先级(优先级是多个数字组成的一个列表),
+用于对词条进行排序。COUNTS-INFO 是一个 alist, 其结构类似:
+
+ ((day n1 n2 n3 ...))
+
+其中 (n1 n2 n3 ...) 代表从当前日期逐日倒推,每日 count 所组成的列表。"
+ (mapcar (lambda (x)
+ (let* ((label (car x))
+ (plist (cdr x))
+ (weights (plist-get plist :weights))
+ (factor (plist-get plist :factor)))
+ (round (* (apply #'+ (cl-mapcar (lambda (a b)
+ (* (or a 0) b))
+ (cdr (assoc label counts-info))
+ weights))
+ factor))))
+ pyim-dhashcache-count-types))
(defun pyim-dhashcache-get-counts-from-log (log-info &optional time)
"从 LOG-INFO 中获取所有的 count 值。
@@ -131,190 +322,105 @@
`(,label ,@(reverse output))))
pyim-dhashcache-count-types))
-(defun pyim-dhashcache-calculate-priority (counts-info)
- "根据 COUNTS-INFO 计算优先级(优先级是多个数字组成的一个列表),
-用于对词条进行排序。COUNTS-INFO 是一个 alist, 其结构类似:
-
- ((day n1 n2 n3 ...))
-
-其中 (n1 n2 n3 ...) 代表从当前日期逐日倒推,每日 count 所组成的列表。"
- (mapcar (lambda (x)
- (let* ((label (car x))
- (plist (cdr x))
- (weights (plist-get plist :weights))
- (factor (plist-get plist :factor)))
- (round (* (apply #'+ (cl-mapcar (lambda (a b)
- (* (or a 0) b))
- (cdr (assoc label counts-info))
- weights))
- factor))))
- pyim-dhashcache-count-types))
-
-(defun pyim-dhashcache-get-shortcodes (code)
- "获取 CODE 所有的 shortcodes.
-
-比如:wubi/aaaa -> (wubi/aaa wubi/aa)
-
-注意事项:这个函数目前只用于五笔等型码输入法,不用于拼音输入法,
-因为拼音输入法词库太大,这样处理之后,会生成一个特别大的哈希表,
-占用太多内存资源,拼音输入法使用 ishortcode 机制。"
- (when (and (pyim-string-match-p "/" code)
- (not (pyim-string-match-p "-" code)))
- (let* ((x (split-string code "/"))
- (prefix (concat (nth 0 x) "/"))
- (code1 (nth 1 x))
- (n (length code1))
- results)
- (dotimes (i n)
- (when (> i 1)
- (push (concat prefix (substring code1 0 i)) results)))
- results)))
-
-(defun pyim-dhashcache-get-ishortcodes (code)
- "获取CODE 所有的简写 ishortcodes.
-
-比如: ni-hao -> (n-h)
-
-注意事项:这个函数用于全拼输入法。"
- (when (and (> (length code) 0)
- (not (pyim-string-match-p "/" code))
- (not (pyim-string-match-p "[^a-z-]" code)))
- (list (mapconcat
- (lambda (x)
- (substring x 0 1))
- (split-string code "-") "-"))))
-
-(defun pyim-dhashcache-async-inject-variables ()
- "pyim's async-inject-variables."
- (list (async-inject-variables "^load-path$")
- (async-inject-variables "^exec-path$")
- (async-inject-variables "^pyim-.+?directory$")))
+(defun pyim-dhashcache-update-personal-words (&optional force)
+ (pyim-dhashcache-update-icode2word force))
-(defun pyim-dhashcache-update-ishortcode2word (&optional force)
- "读取 `pyim-dhashcache-icode2word' 中的词库,创建 *简拼* 缓存,然后加载这个缓存.
+(defun pyim-dhashcache-update-icode2word (&optional force)
+ "对 personal 缓存中的词条进行排序,加载排序后的结果.
-如果 FORCE 为真,强制加载缓存。"
+在这个过程中使用了 `pyim-dhashcache-iword2count' 中记录的词频信息。
+如果 FORCE 为真,强制排序。"
(interactive)
- (when (or force (not pyim-dhashcache-update-ishortcode2word-p))
+ (when (or force (not pyim-dhashcache-update-icode2word-p))
;; NOTE: 这个变量按理说应该在回调函数里面设置,但 async 在某些情况下会卡死,
;; 这个变量无法设置为 t, 导致后续产生大量的 emacs 进程,极其影响性能。
- (setq pyim-dhashcache-update-ishortcode2word-p t)
+ (setq pyim-dhashcache-update-icode2word-p t)
(async-start
`(lambda ()
,@(pyim-dhashcache-async-inject-variables)
(require 'pyim-dhashcache)
(pyim-dcache-init-variable pyim-dhashcache-icode2word)
(pyim-dhashcache-init-count-and-priority-variables)
+ (maphash
+ (lambda (key value)
+ (puthash key (pyim-dcache-sort-words value)
+ pyim-dhashcache-icode2word))
+ pyim-dhashcache-icode2word)
(pyim-dcache-save-variable
- 'pyim-dhashcache-ishortcode2word
- (pyim-dhashcache-update-ishortcode2word-1
- pyim-dhashcache-icode2word)))
+ 'pyim-dhashcache-icode2word
+ pyim-dhashcache-icode2word)
+ nil)
(lambda (_)
- (pyim-dcache-reload-variable pyim-dhashcache-ishortcode2word)))))
-
-(defun pyim-dhashcache-update-ishortcode2word-1 (icode2word)
- "`pyim-dhashcache-update-ishortcode2word' 内部函数."
- (let ((ishortcode2word (make-hash-table :test #'equal)))
- (maphash
- (lambda (key value)
- (dolist (newkey (pyim-dhashcache-get-ishortcodes key))
- (puthash newkey
- (delete-dups
- `(,@(gethash newkey ishortcode2word)
- ,@value))
- ishortcode2word)))
- icode2word)
- (maphash
- (lambda (key value)
- (puthash key (pyim-dhashcache-sort-words value)
- ishortcode2word))
- ishortcode2word)
- ishortcode2word))
+ (pyim-dcache-reload-variable pyim-dhashcache-icode2word)
+ (pyim-dhashcache-update-ishortcode2word force)))))
-(defun pyim-dhashcache-update-shortcode2word (&optional force)
- "使用 `pyim-dhashcache-code2word' 中的词条,创建简写 code 词库缓存并加载.
+(defun pyim-dhashcache-update-ishortcode2word (&optional force)
+ "读取 `pyim-dhashcache-icode2word' 中的词库,创建 *简拼* 缓存,然后加载这个缓存.
-如果 FORCE 为真,强制运行。"
+如果 FORCE 为真,强制加载缓存。"
(interactive)
- (when (or force (not pyim-dhashcache-update-shortcode2word-p))
+ (when (or force (not pyim-dhashcache-update-ishortcode2word-p))
;; NOTE: 这个变量按理说应该在回调函数里面设置,但 async 在某些情况下会卡死,
;; 这个变量无法设置为 t, 导致后续产生大量的 emacs 进程,极其影响性能。
- (setq pyim-dhashcache-update-shortcode2word-p t)
+ (setq pyim-dhashcache-update-ishortcode2word-p t)
(async-start
`(lambda ()
,@(pyim-dhashcache-async-inject-variables)
(require 'pyim-dhashcache)
- (pyim-dcache-init-variable pyim-dhashcache-code2word)
+ (pyim-dcache-init-variable pyim-dhashcache-icode2word)
(pyim-dhashcache-init-count-and-priority-variables)
(pyim-dcache-save-variable
- 'pyim-dhashcache-shortcode2word
- (pyim-dhashcache-update-shortcode2word-1
- pyim-dhashcache-code2word)))
+ 'pyim-dhashcache-ishortcode2word
+ (pyim-dhashcache-update-ishortcode2word-1
+ pyim-dhashcache-icode2word)))
(lambda (_)
- (pyim-dcache-reload-variable pyim-dhashcache-shortcode2word)))))
-
-(defun pyim-dhashcache-update-shortcode2word-1 (code2word)
- "`pyim-dhashcache-update-shortcode2word' 的内部函数"
- (let ((shortcode2word (make-hash-table :test #'equal)))
+ (pyim-dcache-reload-variable pyim-dhashcache-ishortcode2word)))))
+
+(defun pyim-dhashcache-update-ishortcode2word-1 (icode2word)
+ "`pyim-dhashcache-update-ishortcode2word' 内部函数."
+ (let ((ishortcode2word (make-hash-table :test #'equal)))
(maphash
(lambda (key value)
- (dolist (x (pyim-dhashcache-get-shortcodes key))
- (puthash x
- (mapcar
- (lambda (word)
- ;; 这个地方的代码用于实现五笔 code 自动提示功能,
- ;; 比如输入 'aa' 后得到选词框:
- ;; ----------------------
- ;; | 1. 莁aa 2.匶wv ... |
- ;; ----------------------
- (if (get-text-property 0 :comment word)
- word
- (propertize word :comment (substring key (length x)))))
- (delete-dups `(,@(gethash x shortcode2word) ,@value)))
- shortcode2word)))
- code2word)
+ (dolist (newkey (pyim-dhashcache-get-ishortcodes key))
+ (puthash newkey
+ (delete-dups
+ `(,@(gethash newkey ishortcode2word)
+ ,@value))
+ ishortcode2word)))
+ icode2word)
(maphash
(lambda (key value)
- (puthash key (pyim-dhashcache-sort-words value)
- shortcode2word))
- shortcode2word)
- shortcode2word))
-
-(defun pyim-dhashcache-get-path (variable)
- "获取保存 VARIABLE 取值的文件的路径."
- (when (symbolp variable)
- (concat (file-name-as-directory pyim-dcache-directory)
- (symbol-name variable))))
-
-(defun pyim-dhashcache-generate-dcache-file (dict-files dcache-file)
- "读取词库文件列表:DICT-FILES, 生成一个词库缓冲文件 DCACHE-FILE.
+ (puthash key (pyim-dcache-sort-words value)
+ ishortcode2word))
+ ishortcode2word)
+ ishortcode2word))
-pyim 使用的词库文件是简单的文本文件,编码 *强制* 为 \\='utf-8-unix,
-其结构类似:
+(defun pyim-dhashcache-update-code2word (dict-files dicts-md5 &optional force)
+ "读取并加载词库.
- ni-bu-hao 你不好
- ni-hao 你好 妮好 你豪
+读取词库文件 DICT-FILES,生成对应的词库缓冲文件,然后加载词库缓存。
-第一个空白字符之前的内容为 code,空白字符之后为中文词条列表。词库
-*不处理* 中文标点符号。"
- (let ((hashtable (make-hash-table :size 1000000 :test #'equal)))
- (dolist (file dict-files)
- (with-temp-buffer
- (let ((coding-system-for-read 'utf-8-unix))
- (insert-file-contents file))
- (goto-char (point-min))
- (forward-line 1)
- (while (not (eobp))
- (let* ((content (pyim-dline-parse))
- (code (car content))
- (words (cdr content)))
- (when (and code words)
- (puthash code
- (delete-dups `(,@(gethash code hashtable) ,@words))
- hashtable)))
- (forward-line 1))))
- (pyim-dcache-save-value-to-file hashtable dcache-file)
- hashtable))
+如果 FORCE 为真,强制加载。"
+ (interactive)
+ (let* ((code2word-file (pyim-dhashcache-get-path 'pyim-dhashcache-code2word))
+ (word2code-file (pyim-dhashcache-get-path 'pyim-dhashcache-word2code))
+ (code2word-md5-file (pyim-dhashcache-get-path
'pyim-dhashcache-code2word-md5)))
+ (when (or force (and (not (equal dicts-md5
(pyim-dcache-get-value-from-file code2word-md5-file)))
+ (not pyim-dhashcache-update-code2word-running-p)))
+ (setq pyim-dhashcache-update-code2word-running-p t)
+ ;; use hashtable
+ (async-start
+ `(lambda ()
+ ,@(pyim-dhashcache-async-inject-variables)
+ (require 'pyim-dhashcache)
+ (let ((dcache (pyim-dhashcache-generate-dcache-file ',dict-files
,code2word-file)))
+ (pyim-dhashcache-generate-word2code-dcache-file dcache
,word2code-file))
+ (pyim-dcache-save-value-to-file ',dicts-md5 ,code2word-md5-file))
+ (lambda (_)
+ (pyim-dcache-reload-variable pyim-dhashcache-code2word)
+ (pyim-dcache-reload-variable pyim-dhashcache-word2code)
+ (pyim-dhashcache-update-shortcode2word force)
+ (setq pyim-dhashcache-update-code2word-running-p nil))))))
(defun pyim-dhashcache-generate-word2code-dcache-file (dcache file)
"从 DCACHE 生成一个 word -> code 的反向查询表.
@@ -346,217 +452,116 @@ DCACHE 是一个 code -> words 的 hashtable.
dcache)
(pyim-dcache-save-value-to-file hashtable file))))
-(defun pyim-dhashcache-update-code2word (dict-files dicts-md5 &optional force)
- "读取并加载词库.
-
-读取词库文件 DICT-FILES,生成对应的词库缓冲文件,然后加载词库缓存。
-
-如果 FORCE 为真,强制加载。"
- (interactive)
- (let* ((code2word-file (pyim-dhashcache-get-path 'pyim-dhashcache-code2word))
- (word2code-file (pyim-dhashcache-get-path 'pyim-dhashcache-word2code))
- (code2word-md5-file (pyim-dhashcache-get-path 'pyim-dhashcache-code2word-md5)))
- (when (or force (and (not (equal dicts-md5 (pyim-dcache-get-value-from-file code2word-md5-file)))
- (not pyim-dhashcache-update-code2word-running-p)))
- (setq pyim-dhashcache-update-code2word-running-p t)
- ;; use hashtable
- (async-start
- `(lambda ()
- ,@(pyim-dhashcache-async-inject-variables)
- (require 'pyim-dhashcache)
- (let ((dcache (pyim-dhashcache-generate-dcache-file ',dict-files ,code2word-file)))
- (pyim-dhashcache-generate-word2code-dcache-file dcache ,word2code-file))
- (pyim-dcache-save-value-to-file ',dicts-md5 ,code2word-md5-file))
- (lambda (_)
- (pyim-dcache-reload-variable pyim-dhashcache-code2word)
- (pyim-dcache-reload-variable pyim-dhashcache-word2code)
- (pyim-dhashcache-update-shortcode2word force)
- (setq pyim-dhashcache-update-code2word-running-p nil))))))
-
-(defun pyim-dhashcache-export (dcache file &optional confirm)
- "将一个 pyim DCACHE 导出为文件 FILE.
+(defun pyim-dhashcache-get-path (variable)
+ "获取保存 VARIABLE 取值的文件的路径."
+ (when (symbolp variable)
+ (concat (file-name-as-directory pyim-dcache-directory)
+ (symbol-name variable))))
-如果 CONFIRM 为 non-nil,文件存在时将会提示用户是否覆盖,
-默认为覆盖模式"
- (with-temp-buffer
- (insert ";;; -*- coding: utf-8-unix -*-\n")
- (maphash
- (lambda (key value)
- (let ((value (cl-remove-if
- (lambda (x)
- ;; 如果某个词条的 text 属性 :noexport 设置为 t, 在导出的
- ;; 时候自动忽略这个词条。
- (and (stringp x)
- (get-text-property 0 :noexport x)))
- (if (listp value)
- value
- (list value)))))
- (when value
- (insert (format "%s %s\n" key (mapconcat #'identity value " "))))))
- dcache)
- (pyim-dcache-write-file file confirm)))
+(defun pyim-dhashcache-generate-dcache-file (dict-files dcache-file)
+ "读取词库文件列表:DICT-FILES, 生成一个词库缓冲文件 DCACHE-FILE.
-(defun pyim-dhashcache-get (code &optional from)
- "从 FROM 对应的 dcaches 中搜索 CODE, 得到对应的词条.
+pyim 使用的词库文件是简单的文本文件,编码 *强制* 为 \\='utf-8-unix,
+其结构类似:
-当词库文件加载完成后,pyim 就可以用这个函数从词库缓存中搜索某个
-code 对应的中文词条了。
+ ni-bu-hao 你不好
+ ni-hao 你好 妮好 你豪
-如果 FROM 为 nil, 则默认搜索 `pyim-dhashcache-icode2word' 和
-`pyim-dhashcache-code2word' 两个 dcache."
- (let* ((caches (mapcar (lambda (x)
- (intern (concat "pyim-dhashcache-" (symbol-name x))))
- (or (and from
- (if (listp from)
- from
- (list from)))
- '(icode2word code2word))))
- result)
- (dolist (cache caches)
- (let* ((cache (ignore-errors (symbol-value cache)))
- (value (and cache (gethash code cache))))
- ;; 处理 iword2count.
- (unless (listp value)
- (setq value (list value)))
- (when value
- (setq result (append result value)))))
- result))
+第一个空白字符之前的内容为 code,空白字符之后为中文词条列表。词库
+*不处理* 中文标点符号。"
+ (let ((hashtable (make-hash-table :size 1000000 :test #'equal)))
+ (dolist (file dict-files)
+ (with-temp-buffer
+ (let ((coding-system-for-read 'utf-8-unix))
+ (insert-file-contents file))
+ (goto-char (point-min))
+ (forward-line 1)
+ (while (not (eobp))
+ (let* ((content (pyim-dline-parse))
+ (code (car content))
+ (words (cdr content)))
+ (when (and code words)
+ (puthash code
+ (delete-dups `(,@(gethash code hashtable) ,@words))
+ hashtable)))
+ (forward-line 1))))
+ (pyim-dcache-save-value-to-file hashtable dcache-file)
+ hashtable))
-(defun pyim-dhashcache-update-icode2word (&optional force)
- "对 personal 缓存中的词条进行排序,加载排序后的结果.
+(defun pyim-dhashcache-update-shortcode2word (&optional force)
+ "使用 `pyim-dhashcache-code2word' 中的词条,创建简写 code 词库缓存并加载.
-在这个过程中使用了 `pyim-dhashcache-iword2count' 中记录的词频信息。
-如果 FORCE 为真,强制排序。"
+如果 FORCE 为真,强制运行。"
(interactive)
- (when (or force (not pyim-dhashcache-update-icode2word-p))
+ (when (or force (not pyim-dhashcache-update-shortcode2word-p))
;; NOTE: 这个变量按理说应该在回调函数里面设置,但 async 在某些情况下会卡死,
;; 这个变量无法设置为 t, 导致后续产生大量的 emacs 进程,极其影响性能。
- (setq pyim-dhashcache-update-icode2word-p t)
+ (setq pyim-dhashcache-update-shortcode2word-p t)
(async-start
`(lambda ()
,@(pyim-dhashcache-async-inject-variables)
(require 'pyim-dhashcache)
- (pyim-dcache-init-variable pyim-dhashcache-icode2word)
+ (pyim-dcache-init-variable pyim-dhashcache-code2word)
(pyim-dhashcache-init-count-and-priority-variables)
- (maphash
- (lambda (key value)
- (puthash key (pyim-dhashcache-sort-words value)
- pyim-dhashcache-icode2word))
- pyim-dhashcache-icode2word)
(pyim-dcache-save-variable
- 'pyim-dhashcache-icode2word
- pyim-dhashcache-icode2word)
- nil)
+ 'pyim-dhashcache-shortcode2word
+ (pyim-dhashcache-update-shortcode2word-1
+ pyim-dhashcache-code2word)))
(lambda (_)
- (pyim-dcache-reload-variable pyim-dhashcache-icode2word)
- (pyim-dhashcache-update-ishortcode2word force)))))
-
-(defun pyim-dhashcache-upgrade-icode2word ()
- "升级 icode2word 缓存。"
- (let ((delete-old-key-p (yes-or-no-p "Delete old key after upgrade? "))
- (ruler-list (delete-dups
- (remove nil
- (mapcar
- (lambda (scheme)
- (let ((code-prefix (plist-get (cdr scheme) :code-prefix))
- (code-prefix-history (plist-get (cdr scheme) :code-prefix-history)))
- (when code-prefix-history
- (cons code-prefix-history code-prefix))))
- pyim-schemes)))))
- (dolist (ruler ruler-list)
- (let ((old-prefix-list (car ruler))
- (new-prefix (cdr ruler)))
- (dolist (old-prefix old-prefix-list)
- (maphash
- (lambda (key _value)
- (when (string-prefix-p old-prefix key)
- (let* ((key-words (gethash key pyim-dhashcache-icode2word))
- (new-key (concat new-prefix (string-remove-prefix old-prefix key)))
- (new-key-words (gethash new-key pyim-dhashcache-icode2word))
- (merged-value (delete-dups `(,@new-key-words ,@key-words))))
- (puthash new-key merged-value pyim-dhashcache-icode2word)
- (message "PYIM icode2word upgrade: %S %S -> %S %S" key key-words new-key merged-value)
- (when delete-old-key-p
- (remhash key pyim-dhashcache-icode2word)
- (message "PYIM icode2word upgrade: %S has been deleted." key)))))
- pyim-dhashcache-icode2word))))))
-
-(defun pyim-dhashcache-update-personal-words (&optional force)
- (pyim-dhashcache-update-icode2word force))
-
-(defun pyim-dhashcache-init-variables ()
- "初始化 dcache 缓存相关变量."
- (when (and (not pyim-dhashcache-icode2word)
- pyim-dcache-directory
- (file-directory-p pyim-dcache-directory)
- (directory-files pyim-dcache-directory nil "-backup-"))
- (message "PYIM: 在 %S 目录中发现备份文件的存在,可能是词库缓存文件损坏导致,请抓紧检查处理!!!"
- pyim-dcache-directory))
- (pyim-dhashcache-init-count-and-priority-variables)
- (pyim-dcache-init-variable pyim-dhashcache-code2word)
- (pyim-dcache-init-variable pyim-dhashcache-word2code)
- (pyim-dcache-init-variable pyim-dhashcache-shortcode2word)
- (pyim-dcache-init-variable pyim-dhashcache-icode2word)
- (pyim-dcache-init-variable pyim-dhashcache-ishortcode2word))
+ (pyim-dcache-reload-variable pyim-dhashcache-shortcode2word)))))
-(defun pyim-dhashcache-init-count-and-priority-variables ()
- "初始化 count 相关的变量。"
- (pyim-dcache-init-variable pyim-dhashcache-iword2count)
- (pyim-dcache-init-variable pyim-dhashcache-iword2count-log)
- (pyim-dcache-init-variable pyim-dhashcache-iword2count-recent-10-words)
- (pyim-dcache-init-variable pyim-dhashcache-iword2count-recent-50-words)
- (pyim-dcache-init-variable pyim-dhashcache-iword2priority))
+(defun pyim-dhashcache-update-shortcode2word-1 (code2word)
+ "`pyim-dhashcache-update-shortcode2word' 的内部函数"
+ (let ((shortcode2word (make-hash-table :test #'equal)))
+ (maphash
+ (lambda (key value)
+ (dolist (x (pyim-dhashcache-get-shortcodes key))
+ (puthash x
+ (mapcar
+ (lambda (word)
+ ;; 这个地方的代码用于实现五笔 code 自动提示功能,
+ ;; 比如输入 'aa' 后得到选词框:
+ ;; ----------------------
+ ;; | 1. 莁aa 2.匶wv ... |
+ ;; ----------------------
+ (if (get-text-property 0 :comment word)
+ word
+ (propertize word :comment (substring key (length x)))))
+ (delete-dups `(,@(gethash x shortcode2word) ,@value)))
+ shortcode2word)))
+ code2word)
+ (maphash
+ (lambda (key value)
+ (puthash key (pyim-dcache-sort-words value)
+ shortcode2word))
+ shortcode2word)
+ shortcode2word))
-(defun pyim-dhashcache-save-personal-dcache-to-file ()
- ;; 用户选择过的词
- (pyim-dcache-save-variable
- 'pyim-dhashcache-icode2word
- pyim-dhashcache-icode2word 0.8)
- ;; 词条总 count
- (pyim-dcache-save-variable
- 'pyim-dhashcache-iword2count
- pyim-dhashcache-iword2count 0.8)
- ;; 词条 count 日志
- (pyim-dcache-save-variable
- 'pyim-dhashcache-iword2count-log
- pyim-dhashcache-iword2count-log 0.8)
- ;; 词条优先级
- (pyim-dcache-save-variable
- 'pyim-dhashcache-iword2priority
- pyim-dhashcache-iword2priority 0.8))
+(defun pyim-dhashcache-get-shortcodes (code)
+ "获取 CODE 所有的 shortcodes.
-(defmacro pyim-dhashcache-put (cache code &rest body)
- "将 BODY 的返回值保存到 CACHE 对应的 CODE 中。
+比如:wubi/aaaa -> (wubi/aaa wubi/aa)
-注意事项:这个宏是一个指代宏,其中 orig-value 在这个宏中有特殊含
-义,代表原来 code 对应的取值。"
- (declare (indent 0))
- (let ((key (make-symbol "key"))
- (table (make-symbol "table"))
- (new-value (make-symbol "new-value")))
- `(let* ((,key ,code)
- (,table ,cache)
- (orig-value (gethash ,key ,table))
- ,new-value)
- (setq ,new-value (progn ,@body))
- (puthash ,key ,new-value ,table))))
+注意事项:这个函数目前只用于五笔等型码输入法,不用于拼音输入法,
+因为拼音输入法词库太大,这样处理之后,会生成一个特别大的哈希表,
+占用太多内存资源,拼音输入法使用 ishortcode 机制。"
+ (when (and (pyim-string-match-p "/" code)
+ (not (pyim-string-match-p "-" code)))
+ (let* ((x (split-string code "/"))
+ (prefix (concat (nth 0 x) "/"))
+ (code1 (nth 1 x))
+ (n (length code1))
+ results)
+ (dotimes (i n)
+ (when (> i 1)
+ (push (concat prefix (substring code1 0 i)) results)))
+ results)))
-(defun pyim-dhashcache-update-iword2count-recent (word n hash-table)
- (let (words-need-remove)
- (pyim-dhashcache-put
- hash-table :all-words
- (setq orig-value (remove word orig-value))
- (push word orig-value)
- (if (<= (length orig-value) n)
- orig-value
- (setq words-need-remove (nthcdr n orig-value))
- (cl-subseq orig-value 0 n)))
- (dolist (w words-need-remove)
- (remhash w hash-table))
- (pyim-dhashcache-put
- hash-table word
- (+ (or orig-value 0) 1))
- hash-table))
+;; ** 更新 dhashcache 词条计数
+(cl-defmethod pyim-dcache-update-wordcount
+ (word &context (pyim-dcache-backend (eql pyim-dhashcache))
+ &optional wordcount-handler)
+ (pyim-dhashcache-update-iword2count word wordcount-handler))
(defun pyim-dhashcache-update-iword2count (word &optional wordcount-handler)
"保存词频到缓存."
@@ -603,88 +608,139 @@ code 对应的中文词条了。
(pyim-dhashcache-get-counts-from-log
(gethash word pyim-dhashcache-iword2count-log)))))
-(defun pyim-dhashcache-update-iword2priority (&optional force)
- "更新词条优先级表,如果 FORCE 为真,强制更新。"
- (interactive)
- (when (or force (not pyim-dhashcache-update-iword2priority-p))
- ;; NOTE: 这个变量按理说应该在回调函数里面设置,但 async 在某些情况下会卡死,
- ;; 这个变量无法设置为 t, 导致后续产生大量的 emacs 进程,极其影响性能。
- (setq pyim-dhashcache-update-iword2priority-p t)
- (async-start
- `(lambda ()
- ,@(pyim-dhashcache-async-inject-variables)
- (require 'pyim-dhashcache)
- (pyim-dhashcache-init-count-and-priority-variables)
- (maphash
- (lambda (key value)
- (puthash key
- (pyim-dhashcache-calculate-priority
- (pyim-dhashcache-get-counts-from-log
- value))
- pyim-dhashcache-iword2priority))
- pyim-dhashcache-iword2count-log)
- (pyim-dcache-save-variable
- 'pyim-dhashcache-iword2priority
- pyim-dhashcache-iword2priority)
- nil)
- (lambda (_)
- (pyim-dcache-reload-variable pyim-dhashcache-iword2priority)))))
+(defun pyim-dhashcache-update-iword2count-recent (word n hash-table)
+ (let (words-need-remove)
+ (pyim-dhashcache-put
+ hash-table :all-words
+ (setq orig-value (remove word orig-value))
+ (push word orig-value)
+ (if (<= (length orig-value) n)
+ orig-value
+ (setq words-need-remove (nthcdr n orig-value))
+ (cl-subseq orig-value 0 n)))
+ (dolist (w words-need-remove)
+ (remhash w hash-table))
+ (pyim-dhashcache-put
+ hash-table word
+ (+ (or orig-value 0) 1))
+ hash-table))
-(defun pyim-dhashcache-delete-word (word)
- "将中文词条 WORD 从个人词库中删除"
- (maphash
- (lambda (key value)
- (when (member word value)
- (let ((new-value (remove word value)))
- (if new-value
- (puthash key new-value pyim-dhashcache-icode2word)
- (remhash key pyim-dhashcache-icode2word)))))
- pyim-dhashcache-icode2word)
- (maphash
- (lambda (key value)
- (when (member word value)
- (print value)
- (let ((new-value (remove word value)))
- (if new-value
- (puthash key new-value pyim-dhashcache-ishortcode2word)
- (remhash key pyim-dhashcache-ishortcode2word)))))
- pyim-dhashcache-ishortcode2word)
- (remhash word pyim-dhashcache-iword2count)
- (remhash word pyim-dhashcache-iword2count-log)
- (remhash word pyim-dhashcache-iword2priority))
+;; ** 根据 dhashcache 信息对词条进行排序
+(cl-defmethod pyim-dcache-sort-words
+ (words-list &context (pyim-dcache-backend (eql pyim-dhashcache)))
+ "对 WORDS-LIST 排序"
+ (let ((iword2count pyim-dhashcache-iword2count)
+ (iword2priority pyim-dhashcache-iword2priority))
+ (sort words-list
+ (lambda (a b)
+ (let ((p1 (gethash a iword2priority))
+ (p2 (gethash b iword2priority)))
+ (cond
+ ((and (listp p1)
+ (listp p2)
+ (not (equal p1 p2)))
+ (pyim-numbers> p1 p2))
+ (t (let ((n1 (or (gethash a iword2count) 0))
+ (n2 (or (gethash b iword2count) 0)))
+ (> n1 n2)))))))))
-(defun pyim-dhashcache-insert-word-into-icode2word (word code prepend)
- "将词条 WORD 插入到 icode2word 词库缓存 CODE 键对应的位置.
+;; ** 升级 dhashcache 相关函数
+(cl-defmethod pyim-dcache-upgrade
+ (&context (pyim-dcache-backend (eql pyim-dhashcache)))
+ "升级词库缓存.
-默认 WORD 放到已有词条的最后,如果 PREPEND 为 non-nil, WORD 将放
-到已有词条的最前面。"
- (pyim-dhashcache-put
- pyim-dhashcache-icode2word code
- (if prepend
- `(,word ,@(remove word orig-value))
- `(,@(remove word orig-value) ,word))))
+当前已有的功能:
+1. 基于 :code-prefix-history 信息,升级为新的 code-prefix。"
+ (pyim-dhashcache-upgrade-icode2word))
-(defun pyim-dhashcache-insert-word-into-ishortcode2word (word code prepend)
- "将词条 WORD 插入到 ishortcode2word 词库缓存 CODE 首字母字符串对应的位置.
+(defun pyim-dhashcache-upgrade-icode2word ()
+ "升级 icode2word 缓存。"
+ (let ((delete-old-key-p (yes-or-no-p "Delete old key after upgrade? "))
+ (ruler-list (delete-dups
+ (remove nil
+ (mapcar
+ (lambda (scheme)
+ (let ((code-prefix (plist-get (cdr scheme) :code-prefix))
+ (code-prefix-history (plist-get (cdr scheme) :code-prefix-history)))
+ (when code-prefix-history
+ (cons code-prefix-history code-prefix))))
+ pyim-schemes)))))
+ (dolist (ruler ruler-list)
+ (let ((old-prefix-list (car ruler))
+ (new-prefix (cdr ruler)))
+ (dolist (old-prefix old-prefix-list)
+ (maphash
+ (lambda (key _value)
+ (when (string-prefix-p old-prefix key)
+ (let* ((key-words (gethash key pyim-dhashcache-icode2word))
+ (new-key (concat new-prefix (string-remove-prefix old-prefix key)))
+ (new-key-words (gethash new-key pyim-dhashcache-icode2word))
+ (merged-value (delete-dups `(,@new-key-words ,@key-words))))
+ (puthash new-key merged-value pyim-dhashcache-icode2word)
+ (message "PYIM icode2word upgrade: %S %S -> %S %S" key key-words new-key merged-value)
+ (when delete-old-key-p
+ (remhash key pyim-dhashcache-icode2word)
+ (message "PYIM icode2word upgrade: %S has been deleted." key)))))
+ pyim-dhashcache-icode2word))))))
-默认 WORD 放到已有词条的最后,如果 PREPEND 为 non-nil, WORD 将放
-到已有词条的最前面。"
- (dolist (newcode (pyim-dhashcache-get-ishortcodes code))
- (pyim-dhashcache-put
- pyim-dhashcache-ishortcode2word
- newcode
- (if prepend
- `(,word ,@(remove word orig-value))
- `(,@(remove word orig-value) ,word)))))
+;; ** 保存 dhashcache 相关函数
+(cl-defmethod pyim-dcache-save-caches
+ (&context (pyim-dcache-backend (eql pyim-dhashcache)))
+ (pyim-dhashcache-save-personal-dcache-to-file))
-(defun pyim-dhashcache-search-word-code (string)
- (gethash string pyim-dhashcache-word2code))
+(defun pyim-dhashcache-save-personal-dcache-to-file ()
+ ;; 用户选择过的词
+ (pyim-dcache-save-variable
+ 'pyim-dhashcache-icode2word
+ pyim-dhashcache-icode2word 0.8)
+ ;; 词条总 count
+ (pyim-dcache-save-variable
+ 'pyim-dhashcache-iword2count
+ pyim-dhashcache-iword2count 0.8)
+ ;; 词条 count 日志
+ (pyim-dcache-save-variable
+ 'pyim-dhashcache-iword2count-log
+ pyim-dhashcache-iword2count-log 0.8)
+ ;; 词条优先级
+ (pyim-dcache-save-variable
+ 'pyim-dhashcache-iword2priority
+ pyim-dhashcache-iword2priority 0.8))
-(defun pyim-dhashcache-export-personal-words (file &optional confirm)
+;; ** 导出相关函数
+(cl-defmethod pyim-dcache-export-personal-words
+ (file &context (pyim-dcache-backend (eql pyim-dhashcache))
+ &optional confirm)
"导出个人词库到 FILE."
+ (pyim-dcache-init-variables)
(pyim-dhashcache-export pyim-dhashcache-icode2word file confirm))
-(defun pyim-dhashcache-export-words-and-counts (file &optional confirm ignore-counts)
+(defun pyim-dhashcache-export (dcache file &optional confirm)
+ "将一个 pyim DCACHE 导出为文件 FILE.
+
+如果 CONFIRM 为 non-nil,文件存在时将会提示用户是否覆盖,
+默认为覆盖模式"
+ (with-temp-buffer
+ (insert ";;; -*- coding: utf-8-unix -*-\n")
+ (maphash
+ (lambda (key value)
+ (let ((value (cl-remove-if
+ (lambda (x)
+ ;; 如果某个词条的 text 属性 :noexport 设置为 t, 在导出的
+ ;; 时候自动忽略这个词条。
+ (and (stringp x)
+ (get-text-property 0 :noexport x)))
+ (if (listp value)
+ value
+ (list value)))))
+ (when value
+ (insert (format "%s %s\n" key (mapconcat #'identity value " "))))))
+ dcache)
+ (pyim-dcache-write-file file confirm)))
+
+(cl-defmethod pyim-dcache-export-words-and-counts
+ (file &context (pyim-dcache-backend (eql pyim-dhashcache))
+ &optional confirm ignore-counts)
+ (pyim-dcache-init-variables)
(with-temp-buffer
(insert ";;; -*- coding: utf-8-unix -*-\n")
(maphash
@@ -710,6 +766,6 @@ code 对应的中文词条了。
(pyim-dcache-write-file file confirm)))
;; * Footer
-
(provide 'pyim-dhashcache)
+
;;; pyim-dhashcache.el ends here
diff --git a/pyim-dregcache.el b/pyim-dregcache.el
index 4686b074ed..ef4d5d6e8c 100644
--- a/pyim-dregcache.el
+++ b/pyim-dregcache.el
@@ -44,133 +44,87 @@
(defvar pyim-dregcache-iword2count nil)
(defvar pyim-dregcache-dicts-md5 nil)
-(defun pyim-dregcache-update (&optional force)
- "读取并加载所有相关词库 dcache.
-
-如果 FORCE 为真,强制加载。"
- (pyim-dregcache-init-variables)
- (when pyim-dcache-auto-update
- (pyim-dregcache-update-personal-words force)
- (let* ((dict-files (pyim-dict-get-enabled-dict-files))
- (dicts-md5 (pyim-dcache-create-files-md5 dict-files)))
- (when pyim-debug
- (message "pyim-dregcache-update: pyim-dicts=%s pyim-extra-dicts=%s dict-files=%s"
- pyim-dicts
- pyim-extra-dicts
- dict-files))
- (pyim-dregcache-update-code2word dict-files dicts-md5 force))))
-
-(defun pyim-dregcache-variable-file (variable)
- "Get VARIABLE dcache file path."
- (concat (file-name-as-directory pyim-dcache-directory)
- (symbol-name variable)))
-
-(defun pyim-dregcache-save-variable (variable value)
- "Save VARIABLE with its VALUE."
- (let* ((file (pyim-dregcache-variable-file variable))
- (save-silently t))
- (make-directory (file-name-directory file) t)
- (with-temp-buffer
- (insert value)
- (pyim-dcache-write-file file))))
-
-(defun pyim-dregcache-load-variable (variable)
- "载入 VARIABLE 对应的文件内容."
- (let* ((file (pyim-dregcache-variable-file variable)))
- (when (and file (file-exists-p file))
- (with-temp-buffer
- (insert-file-contents file)
- (buffer-string)))))
-
-(defun pyim-dregcache-sort-words (words-list)
- "对 WORDS-LIST 排序,词频大的排在前面."
- (let ((iword2count pyim-dregcache-iword2count))
- (sort words-list
- (lambda (a b)
- (let ((a (car (split-string a ":")))
- (b (car (split-string b ":"))))
- (> (or (gethash a iword2count) 0)
- (or (gethash b iword2count) 0)))))))
-
-(defun pyim-dregcache-sort-icode2word ()
- "对个人词库排序."
- ;; https://github.com/redguardtoo/zhfreq
- (with-temp-buffer
- (dolist (l (split-string pyim-dregcache-icode2word "\n"))
- (cond
- ((string-match "^\\([a-z-]+ \\)\\(.*\\)" l)
- ;; 3字以上词很少,如果只处理单字,2字词,3字词
- ;; ((string-match "^\\([a-z]+ \\|[a-z]+-[a-z]+ \\|[a-z]+-[a-z]+-[a-z]+ \\)\\(.*\\)" l)
- (let* ((pinyin (match-string 1 l))
- (words (pyim-dregcache-sort-words (split-string (match-string 2 l) " "))))
- (insert (format "%s\n" (concat pinyin (string-join words " "))))))
- ;; 其他词
- ((string= l "")
- ;; skip empty line
- )
- (t
- (insert (format "%s\n" l)))))
- (setq pyim-dregcache-icode2word (buffer-string))))
-
-(defun pyim-dregcache-create-cache-content (raw-content)
- "将 RAW-CONTENT 划分成可以更高效搜索的缓冲区."
- (let ((chars "bcdefghjklmnopqrstwxyz")
- (i 0)
- content-segments
- (start (string-match "^a" raw-content))
- chunk
- end)
- ;; 将字典缓存划分成多个"子搜索区域"
- (while (< i (length chars))
- (when (setq end (string-match (string ?^ (elt chars i))
- raw-content
- start))
- (setq chunk (substring-no-properties raw-content start end))
- (push chunk content-segments)
- (setq start end))
- (setq i (1+ i)))
-
- ;; last chunk
- (setq chunk (substring-no-properties raw-content end (length raw-content)))
- (push chunk content-segments)
- (list :content (nreverse content-segments))))
-
-(defun pyim-dregcache-load-dictionary-file (dict-file)
- "READ from DICT-FILE."
- (let* ((raw-content (with-temp-buffer
- (insert-file-contents dict-file)
- (buffer-string))))
- (setq pyim-dregcache-cache
- ;; use string type as key, so have to use `lax-plist-put'
- ;; @see https://www.gnu.org/software/emacs/manual/html_node/elisp/Plist-Access.html#Plist-Access
- (lax-plist-put pyim-dregcache-cache
- (file-truename dict-file)
- (pyim-dregcache-create-cache-content raw-content)))))
-
-(defun pyim-dregcache-update-code2word (dict-files dicts-md5 &optional force)
- "读取并加载词库.
+;; ** 初始化 dregcache 相关函数
+(cl-defmethod pyim-dcache-init-variables
+ (&context (pyim-dcache-backend (eql pyim-dregcache)))
+ "初始化 cache 缓存相关变量."
+ (pyim-dcache-init-variable
+ pyim-dregcache-iword2count
+ ;; dregcache 引擎也需要词频信息,第一次使用 dregcache 引擎的时候,
+ ;; 自动导入 dhashcache 引擎的词频信息,以后两个引擎的词频信息就
+ ;; 完全分开了。
+ (pyim-dcache-get-value 'pyim-dhashcache-iword2count))
+ (unless pyim-dregcache-icode2word
+ (pyim-dregcache-update-personal-words t)))
-读取词库文件 DICT-FILES,生成对应的词库缓冲文件,然后加载词库缓存。
+;; ** 从 dregcache 搜索词条相关函数
+(cl-defmethod pyim-dcache-get
+ (code &context (pyim-dcache-backend (eql pyim-dregcache))
+ &optional from)
+ "从 `pyim-dregcache-cache' 搜索 CODE, 得到对应的词条."
+ (when code
+ (cond ((or (memq 'icode2word from)
+ (memq 'ishortcode2word from))
+ (pyim-dregcache-get-icode2word-ishortcode2word code))
+ ;; FIXME: pyim-dregcache 暂时不支持 iword2count-recent-10-words 和
+ ;; iword2count-recent-50-words.
+ ((or (memq 'iword2count-recent-10-words from)
+ (memq 'iword2count-recent-50-words from))
+ nil)
+ (t (let ((dict-files (pyim-dregcache-all-dict-files))
+ result)
+
+ (when pyim-debug (message "pyim-dregcache-get is called. code=%s" code))
+ (when dict-files
+ (dolist (file dict-files)
+ (let* ((file-info (lax-plist-get pyim-dregcache-cache file))
+ (content (pyim-dregcache-get-content code file-info)))
+ (setq result (append (pyim-dregcache-get-1 content code) result)))))
+ ;; `push' plus `nreverse' is more efficient than `add-to-list'
+ ;; Many examples exist in Emacs' own code
+ (nreverse result))))))
-DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码.
+(defun pyim-dregcache-get-icode2word-ishortcode2word (code)
+ "以 CODE 搜索个人词和个人联想词. 正则表达式搜索词库,不需要为联想词开单独缓存."
+ (when pyim-debug (message "pyim-dregcache-get-icode2word-ishortcode2word called => %s" code))
+ (when pyim-dregcache-icode2word
+ (nreverse (pyim-dregcache-get-1 pyim-dregcache-icode2word code))))
-如果 FORCE 为真,强制加载。"
- (interactive)
- (when (or force (not (equal dicts-md5 pyim-dregcache-dicts-md5)))
- ;; no hashtable i file mapping algorithm
- (dolist (file dict-files)
- (pyim-dregcache-load-dictionary-file file))
- (setq pyim-dregcache-dicts-md5 dicts-md5)))
+(defmacro pyim-dregcache-match-line (code)
+ `(concat "^" (pyim-dregcache-code2regexp ,code) " \\(.+\\)"))
-(defmacro pyim-dregcache-shenmu2regexp (char)
- "将声母 CHAR 转换为通用正则表达式匹配所有以该声母开头的汉字."
- `(concat ,char "[a-z]*"))
+(defun pyim-dregcache-get-1 (content code)
+ (let ((case-fold-search t)
+ (start 0)
+ (pattern (pyim-dregcache-match-line code))
+ (content-length (length content))
+ word
+ output)
+ (while (and (< start content-length)
+ (setq start (string-match pattern content start)))
+ ;; 提取词
+ (setq word (match-string-no-properties 1 content))
+ (when word
+ (cond
+ ((string-match "^[^ ]+$" word)
+ ;; 单个词
+ (push word output))
+ (t
+ ;; 多个字
+ (setq output (append (nreverse (split-string word " +")) output)))))
+ ;; 继续搜索
+ (setq start (+ start 2 (length code) (length word))))
+ output))
(defmacro pyim-dregcache-is-shenmu (code)
"判断CODE 是否是一个声母."
`(and (eq (length ,code) 1)
(not (string-match ,code "aeo"))))
+(defmacro pyim-dregcache-shenmu2regexp (char)
+ "将声母 CHAR 转换为通用正则表达式匹配所有以该声母开头的汉字."
+ `(concat ,char "[a-z]*"))
+
(defun pyim-dregcache-code2regexp (code)
"将 CODE 转换成正则表达式用来搜索辞典缓存中的匹配项目.
单个声母会匹配所有以此生母开头的单个汉字."
@@ -209,9 +163,6 @@ DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码.
;; tian-an-men => tian-an-men[a-z-]*
(concat s "[a-z-]*"))))))))
-(defmacro pyim-dregcache-match-line (code)
- `(concat "^" (pyim-dregcache-code2regexp ,code) " \\(.+\\)"))
-
(defun pyim-dregcache-all-dict-files ()
"所有词典文件."
(let* (rlt)
@@ -238,57 +189,109 @@ DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码.
;; fetch segment using the first character of pinyin code
(nth idx rlt)))
-(defun pyim-dregcache-get-1 (content code)
- (let ((case-fold-search t)
- (start 0)
- (pattern (pyim-dregcache-match-line code))
- (content-length (length content))
- word
- output)
- (while (and (< start content-length)
- (setq start (string-match pattern content start)))
- ;; 提取词
- (setq word (match-string-no-properties 1 content))
- (when word
- (cond
- ((string-match "^[^ ]+$" word)
- ;; 单个词
- (push word output))
- (t
- ;; 多个字
- (setq output (append (nreverse (split-string word " +")) output)))))
- ;; 继续搜索
- (setq start (+ start 2 (length code) (length word))))
- output))
+;; ** 从 dregcache 搜索代码相关函数
+(cl-defmethod pyim-dcache-search-word-code
+ (word &context (pyim-dcache-backend (eql pyim-dregcache)))
+ "从 `pyim-dregcache-cache' 和 `pyim-dregcache-icode2word' 搜索 word, 得到对应的code."
+ (when pyim-debug (message "pyim-dregcache-search-word-code word=%s" word))
+ (when pyim-dregcache-cache
+ (catch 'result
+ (let ((dict-files (pyim-dregcache-all-dict-files))
+ code)
+ (when pyim-dregcache-icode2word
+ (setq code (pyim-dregcache-search-word-code-1 word pyim-dregcache-icode2word))
+ (when code (throw 'result (list code))))
+ (dolist (file dict-files)
+ (let* ((file-info (lax-plist-get pyim-dregcache-cache file))
+ (contents (lax-plist-get file-info :content)))
+ (dolist (content contents)
+ (setq code (pyim-dregcache-search-word-code-1 word content))
+ (when code (throw 'result (list code))))))))))
-(defun pyim-dregcache-get (code &optional from)
- "从 `pyim-dregcache-cache' 搜索 CODE, 得到对应的词条."
- (cond ((or (memq 'icode2word from)
- (memq 'ishortcode2word from))
- (pyim-dregcache-get-icode2word-ishortcode2word code))
- ;; FIXME: pyim-dregcache 暂时不支持 iword2count-recent-10-words 和
- ;; iword2count-recent-50-words.
- ((or (memq 'iword2count-recent-10-words from)
- (memq 'iword2count-recent-50-words from))
- nil)
- (t (let ((dict-files (pyim-dregcache-all-dict-files))
- result)
-
- (when pyim-debug (message "pyim-dregcache-get is called. code=%s" code))
- (when dict-files
- (dolist (file dict-files)
- (let* ((file-info (lax-plist-get pyim-dregcache-cache file))
- (content (pyim-dregcache-get-content code file-info)))
- (setq result (append (pyim-dregcache-get-1 content code) result)))))
- ;; `push' plus `nreverse' is more efficient than `add-to-list'
- ;; Many examples exist in Emacs' own code
- (nreverse result)))))
+(defun pyim-dregcache-search-word-code-1 (word content)
+ (let* ((case-fold-search t)
+ (regexp (concat "^\\([a-z-]+\\)\\(.*\\) " "\\(" word " \\|" word "$\\)")))
+ (when (string-match regexp content)
+ (match-string-no-properties 1 content))))
-(defun pyim-dregcache-get-icode2word-ishortcode2word (code)
- "以 CODE 搜索个人词和个人联想词. 正则表达式搜索词库,不需要为联想词开单独缓存."
- (when pyim-debug (message "pyim-dregcache-get-icode2word-ishortcode2word called => %s" code))
- (when pyim-dregcache-icode2word
- (nreverse (pyim-dregcache-get-1 pyim-dregcache-icode2word code))))
+;; ** 给 dregcache 添加词条相关函数
+(cl-defmethod pyim-dcache-insert-word
+ (word code prepend
+ &context (pyim-dcache-backend (eql pyim-dregcache)))
+ "将词条 WORD 插入到 `pyim-dregcache-icode2word'."
+ (pyim-dregcache-insert-word-into-icode2word word code prepend))
+
+(defun pyim-dregcache-insert-word-into-icode2word (word code prepend)
+ "保存个人词到缓存,和其他词库格式一样以共享正则搜索算法."
+ (when pyim-debug
+ (message "pyim-dregcache-insert-word-into-icode2word called => %s %s %s"
+ word
+ code
+ prepend))
+ (with-temp-buffer
+ (when pyim-dregcache-icode2word
+ (insert pyim-dregcache-icode2word))
+ (goto-char (point-min))
+ (let* ((case-fold-search t)
+ substring replace-string beg end old-word-list)
+ (if (re-search-forward (concat "^" code " \\(.*\\)") nil t)
+ (progn
+ (setq beg (match-beginning 0))
+ (setq end (match-end 0))
+ (setq substring (match-string-no-properties 1))
+ (delete-region beg end)
+ ;; 这里不进行排序,在pyim-dregcache-update-personal-words排序
+ (setq old-word-list (pyim-dregcache-sort-words (split-string substring " ")))
+ (setq replace-string (concat code " " (string-join (delete-dups `(,@old-word-list ,word)) " "))))
+ (setq replace-string (concat code " " (or replace-string word) "\n")))
+ (goto-char (or beg (point-max)))
+ (insert replace-string))
+ (setq pyim-dregcache-icode2word
+ (buffer-string))))
+
+;; ** 从 dregcache 删除词条相关函数
+(cl-defmethod pyim-dcache-delete-word
+ (word &context (pyim-dcache-backend (eql pyim-dregcache)))
+ "将中文词条 WORD 从个人词库中删除."
+ (with-temp-buffer
+ (insert pyim-dregcache-icode2word)
+ (goto-char (point-min))
+ (let* ((case-fold-search t)
+ substring beg end)
+ (while (re-search-forward (concat "^\\([a-z-]+\\) \\(.*\\)" word "\\(.*\\)$") nil t)
+ (setq beg (match-beginning 0))
+ (setq end (match-end 0))
+ (setq substring (concat (match-string-no-properties 1)
+ (match-string-no-properties 2)
+ (match-string-no-properties 3)))
+
+ ;; delete string and the newline char
+ (delete-region beg (+ 1 end))
+ (when (> (length (split-string substring " ")) 1)
+ (goto-char beg)
+ (insert substring)))
+ (setq pyim-dregcache-icode2word
+ (buffer-string))))
+ ;; 删除对应词条的词频
+ (remhash word pyim-dregcache-iword2count))
+
+;; ** 更新 dregcache 相关函数
+(cl-defmethod pyim-dcache-update
+ (&context (pyim-dcache-backend (eql pyim-dregcache)) &optional force)
+ "读取并加载所有相关词库 dcache.
+
+如果 FORCE 为真,强制加载。"
+ (pyim-dcache-init-variables)
+ (when pyim-dcache-auto-update
+ (pyim-dregcache-update-personal-words force)
+ (let* ((dict-files (pyim-dict-get-enabled-dict-files))
+ (dicts-md5 (pyim-dcache-create-files-md5 dict-files)))
+ (when pyim-debug
+ (message "pyim-dregcache-update: pyim-dicts=%s pyim-extra-dicts=%s dict-files=%s"
+ pyim-dicts
+ pyim-extra-dicts
+ dict-files))
+ (pyim-dregcache-update-code2word dict-files dicts-md5 force))))
(defun pyim-dregcache-update-personal-words (&optional force)
"合并 `pyim-dregcache-icode2word' 磁盘文件. 加载排序后的结果.
@@ -327,33 +330,74 @@ DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码.
(when (and force pyim-dregcache-icode2word)
(pyim-dregcache-sort-icode2word)))
-(defun pyim-dregcache-init-variables ()
- "初始化 cache 缓存相关变量."
- (pyim-dcache-init-variable
- pyim-dregcache-iword2count
- ;; dregcache 引擎也需要词频信息,第一次使用 dregcache 引擎的时候,
- ;; 自动导入 dhashcache 引擎的词频信息,以后两个引擎的词频信息就
- ;; 完全分开了。
- (pyim-dcache-get-value 'pyim-dhashcache-iword2count))
- (unless pyim-dregcache-icode2word
- (pyim-dregcache-update-personal-words t)))
+(defun pyim-dregcache-load-variable (variable)
+ "载入 VARIABLE 对应的文件内容."
+ (let* ((file (pyim-dregcache-variable-file variable)))
+ (when (and file (file-exists-p file))
+ (with-temp-buffer
+ (insert-file-contents file)
+ (buffer-string)))))
-(defun pyim-dregcache-save-personal-dcache-to-file ()
- "保存缓存内容到默认目录."
- (when pyim-debug (message "pyim-dregcache-save-personal-dcache-to-file
called"))
- ;; 用户选择过的词存为标准辞典格式保存
- (when pyim-dregcache-icode2word
- (pyim-dregcache-save-variable
- 'pyim-dregcache-icode2word
- pyim-dregcache-icode2word))
- ;; 词频
- (pyim-dcache-save-variable
- 'pyim-dregcache-iword2count
- pyim-dregcache-iword2count))
+(defun pyim-dregcache-variable-file (variable)
+ "Get VARIABLE dcache file path."
+ (concat (file-name-as-directory pyim-dcache-directory)
+ (symbol-name variable)))
-(defun pyim-dregcache-export-words-and-counts ()
- "TODO"
- )
+(defun pyim-dregcache-update-code2word (dict-files dicts-md5 &optional force)
+ "读取并加载词库.
+
+读取词库文件 DICT-FILES,生成对应的词库缓冲文件,然后加载词库缓存。
+
+DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码.
+
+如果 FORCE 为真,强制加载。"
+ (interactive)
+ (when (or force (not (equal dicts-md5 pyim-dregcache-dicts-md5)))
+ ;; no hashtable i file mapping algorithm
+ (dolist (file dict-files)
+ (pyim-dregcache-load-dictionary-file file))
+ (setq pyim-dregcache-dicts-md5 dicts-md5)))
+
+(defun pyim-dregcache-load-dictionary-file (dict-file)
+ "READ from DICT-FILE."
+ (let* ((raw-content (with-temp-buffer
+ (insert-file-contents dict-file)
+ (buffer-string))))
+ (setq pyim-dregcache-cache
+ ;; use string type as key, so have to use `lax-plist-put'
+ ;; @see
https://www.gnu.org/software/emacs/manual/html_node/elisp/Plist-Access.html#Plist-Access
+ (lax-plist-put pyim-dregcache-cache
+ (file-truename dict-file)
+ (pyim-dregcache-create-cache-content raw-content)))))
+
+(defun pyim-dregcache-create-cache-content (raw-content)
+ "将 RAW-CONTENT 划分成可以更高效搜索的缓冲区."
+ (let ((chars "bcdefghjklmnopqrstwxyz")
+ (i 0)
+ content-segments
+ (start (string-match "^a" raw-content))
+ chunk
+ end)
+ ;; 将字典缓存划分成多个"子搜索区域"
+ (while (< i (length chars))
+ (when (setq end (string-match (string ?^ (elt chars i))
+ raw-content
+ start))
+ (setq chunk (substring-no-properties raw-content start end))
+ (push chunk content-segments)
+ (setq start end))
+ (setq i (1+ i)))
+
+ ;; last chunk
+ (setq chunk (substring-no-properties raw-content end (length raw-content)))
+ (push chunk content-segments)
+ (list :content (nreverse content-segments))))
+
+;; ** 更新 dregcache 词条计数。
+(cl-defmethod pyim-dcache-update-wordcount
+ (word &context (pyim-dcache-backend (eql pyim-dregcache))
+ &optional wordcount-handler)
+ (pyim-dregcache-update-iword2count word wordcount-handler))
(defun pyim-dregcache-update-iword2count (word &optional wordcount-handler)
"保存词频到缓存."
@@ -369,57 +413,13 @@ DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码.
(unless (equal orig-value new-value)
(puthash word new-value pyim-dregcache-iword2count))))
-(defun pyim-dregcache-delete-word (word)
- "将中文词条 WORD 从个人词库中删除."
- (with-temp-buffer
- (insert pyim-dregcache-icode2word)
- (goto-char (point-min))
- (let* ((case-fold-search t)
- substring beg end)
- (while (re-search-forward (concat "^\\([a-z-]+\\) \\(.*\\)" word
"\\(.*\\)$") nil t)
- (setq beg (match-beginning 0))
- (setq end (match-end 0))
- (setq substring (concat (match-string-no-properties 1)
- (match-string-no-properties 2)
- (match-string-no-properties 3)))
-
- ;; delete string and the newline char
- (delete-region beg (+ 1 end))
- (when (> (length (split-string substring " ")) 1)
- (goto-char beg)
- (insert substring)))
- (setq pyim-dregcache-icode2word
- (buffer-string))))
- ;; 删除对应词条的词频
- (remhash word pyim-dregcache-iword2count))
+;; ** 升级 dregcache 相关函数
+(cl-defmethod pyim-dcache-upgrade (&context (pyim-dcache-backend (eql
pyim-dregcache)))
+ "升级词库缓存.
-(defun pyim-dregcache-insert-word-into-icode2word (word code prepend)
- "保存个人词到缓存,和其他词库格式一样以共享正则搜索算法."
- (when pyim-debug
- (message "pyim-dregcache-insert-word-into-icode2word called => %s %s %s"
- word
- code
- prepend))
- (with-temp-buffer
- (when pyim-dregcache-icode2word
- (insert pyim-dregcache-icode2word))
- (goto-char (point-min))
- (let* ((case-fold-search t)
- substring replace-string beg end old-word-list)
- (if (re-search-forward (concat "^" code " \\(.*\\)") nil t)
- (progn
- (setq beg (match-beginning 0))
- (setq end (match-end 0))
- (setq substring (match-string-no-properties 1))
- (delete-region beg end)
- ;; 这里不进行排序,在pyim-dregcache-update-personal-words排序
- (setq old-word-list (pyim-dregcache-sort-words (split-string
substring " ")))
- (setq replace-string (concat code " " (string-join (delete-dups
`(,@old-word-list ,word)) " "))))
- (setq replace-string (concat code " " (or replace-string word) "\n")))
- (goto-char (or beg (point-max)))
- (insert replace-string))
- (setq pyim-dregcache-icode2word
- (buffer-string))))
+当前已有的功能:
+1. 基于 :code-prefix-history 信息,升级为新的 code-prefix。"
+ (pyim-dregcache-upgrade-icode2word))
(defun pyim-dregcache-upgrade-icode2word ()
"升级 icode2word 缓存。
@@ -428,31 +428,50 @@ dregcache 只支持全拼和双拼,不能用于五笔之类的型码输入法
update-icode2word 目前只要是用于更新型码输入法的 code-prefix, 所
以不需要具体实现细节。")
-(defun pyim-dregcache-search-word-code-1 (word content)
- (let* ((case-fold-search t)
- (regexp (concat "^\\([a-z-]+\\)\\(.*\\) " "\\(" word " \\|" word
"$\\)")))
- (when (string-match regexp content)
- (match-string-no-properties 1 content))))
+;; ** 根据 dregcache 信息对词条进行排序
+(defun pyim-dregcache-sort-words (words-list)
+ "对 WORDS-LIST 排序,词频大的排在前面."
+ (let ((iword2count pyim-dregcache-iword2count))
+ (sort words-list
+ (lambda (a b)
+ (let ((a (car (split-string a ":")))
+ (b (car (split-string b ":"))))
+ (> (or (gethash a iword2count) 0)
+ (or (gethash b iword2count) 0)))))))
-(defun pyim-dregcache-search-word-code (word)
- "从 `pyim-dregcache-cache' 和 `pyim-dregcache-icode2word' 搜索 word, 得到对应的code."
- (when pyim-debug (message "pyim-dregcache-search-word-code word=%s" word))
- (when pyim-dregcache-cache
- (catch 'result
- (let ((dict-files (pyim-dregcache-all-dict-files))
- code)
- (when pyim-dregcache-icode2word
- (setq code (pyim-dregcache-search-word-code-1 word
pyim-dregcache-icode2word))
- (when code (throw 'result (list code))))
- (dolist (file dict-files)
- (let* ((file-info (lax-plist-get pyim-dregcache-cache file))
- (contents (lax-plist-get file-info :content)))
- (dolist (content contents)
- (setq code (pyim-dregcache-search-word-code-1 word content))
- (when code (throw 'result (list code))))))))))
+;; ** 保存 dregcache 相关函数
+(cl-defmethod pyim-dcache-save-caches
+ (&context (pyim-dcache-backend (eql pyim-dregcache)))
+ (pyim-dregcache-save-personal-dcache-to-file))
+
+(defun pyim-dregcache-save-personal-dcache-to-file ()
+ "保存缓存内容到默认目录."
+ (when pyim-debug (message "pyim-dregcache-save-personal-dcache-to-file
called"))
+ ;; 用户选择过的词存为标准辞典格式保存
+ (when pyim-dregcache-icode2word
+ (pyim-dregcache-save-variable
+ 'pyim-dregcache-icode2word
+ pyim-dregcache-icode2word))
+ ;; 词频
+ (pyim-dcache-save-variable
+ 'pyim-dregcache-iword2count
+ pyim-dregcache-iword2count))
+
+(defun pyim-dregcache-save-variable (variable value)
+ "Save VARIABLE with its VALUE."
+ (let* ((file (pyim-dregcache-variable-file variable))
+ (save-silently t))
+ (make-directory (file-name-directory file) t)
+ (with-temp-buffer
+ (insert value)
+ (pyim-dcache-write-file file))))
-(defun pyim-dregcache-export-personal-words (file &optional confirm)
+;; ** 导出 dregcache 相关函数
+(cl-defmethod pyim-dcache-export-personal-words
+ (file &context (pyim-dcache-backend (eql pyim-dregcache))
+ &optional confirm)
"将个人词库存入 FILE."
+ (pyim-dcache-init-variables)
(when pyim-dregcache-icode2word
;; 按词频排序,把词频信息保存到用户词典
(pyim-dregcache-sort-icode2word)
@@ -466,6 +485,30 @@ update-icode2word 目前只要是用于更新型码输入法的 code-prefix, 所
(sort-lines nil (point-min) (point-max))
(pyim-dcache-write-file file confirm))))
+(defun pyim-dregcache-sort-icode2word ()
+ "对个人词库排序."
+ ;; https://github.com/redguardtoo/zhfreq
+ (with-temp-buffer
+ (dolist (l (split-string pyim-dregcache-icode2word "\n"))
+ (cond
+ ((string-match "^\\([a-z-]+ \\)\\(.*\\)" l)
+ ;; 3字以上词很少,如果只处理单字,2字词,3字词
+ ;; ((string-match "^\\([a-z]+ \\|[a-z]+-[a-z]+ \\|[a-z]+-[a-z]+-[a-z]+
\\)\\(.*\\)" l)
+ (let* ((pinyin (match-string 1 l))
+ (words (pyim-dregcache-sort-words (split-string (match-string 2
l) " "))))
+ (insert (format "%s\n" (concat pinyin (string-join words " "))))))
+ ;; 其他词
+ ((string= l "")
+ ;; skip empty line
+ )
+ (t
+ (insert (format "%s\n" l)))))
+ (setq pyim-dregcache-icode2word (buffer-string))))
+
+(defun pyim-dregcache-export-words-and-counts ()
+ "TODO"
+ )
+
;; * Footer
(provide 'pyim-dregcache)
diff --git a/pyim-process.el b/pyim-process.el
index cafea2b7f7..23345dba4d 100644
--- a/pyim-process.el
+++ b/pyim-process.el
@@ -205,15 +205,17 @@ imobj 组合构成在一起,构成了 imobjs 这个概念。比如:
"PYIM 流程,词库相关的初始化工作。"
(pyim-recreate-local-variables)
(pyim-pymap-cache-create)
+ (pyim-dcache-init-variables)
(pyim-dcache-update force))
(defun pyim-process-save-dcaches (&optional force)
"PYIM 流程,保存 dcache."
(when force
- (pyim-dcache-save-caches)))
+ (pyim-dcache-save-caches))
+ t)
-(defun pyim-process-update-personal-words ()
- (pyim-dcache-call-api 'update-personal-words t))
+(defun pyim-process-update (&optional force)
+ (pyim-dcache-update force))
(defun pyim-process-start-daemon ()
"启动 pyim 流程需要的 daemon."
diff --git a/pyim.el b/pyim.el
index f6c69017d1..bff910eaf2 100644
--- a/pyim.el
+++ b/pyim.el
@@ -311,6 +311,12 @@ REFRESH-COMMON-DCACHE 已经废弃,不要再使用了。"
(pyim-process-save-dcaches save-personal-dcache)
(pyim-process-init-dcaches :force))
+;; ** 升级功能
+(defun pyim-upgrade ()
+ "升级 pyim 功能。"
+ (interactive)
+ (pyim-dcache-upgrade))
+
;; ** 键盘输入处理功能
(defun pyim-self-insert-command ()
"Pyim 默认的 self-insert-command."
@@ -424,13 +430,28 @@ MERGE-METHOD 是一个函数,这个函数需要两个数字参数,代表词
;; 有这一步骤,导入的词条就会被覆盖。
(pyim-process-save-dcaches t)
;; 更新相关的 dcache
- (pyim-process-update-personal-words)
+ (pyim-process-update t)
(message "PYIM: 词条和词频信息导入完成!")))
;; ** 导出功能
-(defalias 'pyim-export-words-and-counts 'pyim-dcache-export-words-and-counts)
-(defalias 'pyim-export-personal-words 'pyim-dcache-export-personal-words)
+(defun pyim-export-words-and-counts (file &optional confirm ignore-counts)
+ "将个人词条以及词条对应的词频信息导出到文件 FILE.
+
+如果 FILE 为 nil, 提示用户指定导出文件位置, 如果 CONFIRM 为
+non-nil,文件存在时将会提示用户是否覆盖,默认为覆盖模式"
+ (interactive "F将词条和词频信息导出到文件: ")
+ (pyim-dcache-export-words-and-counts file confirm ignore-counts)
+ (message "PYIM: 词条和词频信息导出完成。"))
+
+(defun pyim-export-personal-words (file &optional confirm)
+ "将用户的个人词条导出为 pyim 词库文件.
+
+如果 FILE 为 nil, 提示用户指定导出文件位置, 如果 CONFIRM 为 non-nil,
+文件存在时将会提示用户是否覆盖,默认为覆盖模式。"
+ (interactive "F将个人词条导出到文件:")
+ (pyim-dcache-export-personal-words file confirm)
+ (message "PYIM: 个人词条导出完成。"))
;; ** 删词功能
(defun pyim-delete-words-in-file (file)
diff --git a/tests/pyim-tests.el b/tests/pyim-tests.el
index 3b366d296b..6eb2a246d9 100644
--- a/tests/pyim-tests.el
+++ b/tests/pyim-tests.el
@@ -982,7 +982,8 @@
(should (equal my/test:1 "hello"))))
(ert-deftest pyim-tests-pyim-dcache-export ()
- (let ((pyim-dhashcache-iword2count (make-hash-table :test #'equal))
+ (let ((pyim-dcache-backend 'pyim-dhashcache)
+ (pyim-dhashcache-iword2count (make-hash-table :test #'equal))
(pyim-dhashcache-icode2word (make-hash-table :test #'equal))
(file (pyim-tests-make-temp-file)))
(puthash "你好" 10 pyim-dhashcache-iword2count)
@@ -1196,15 +1197,16 @@ yin-xing 因行
(should (equal (gethash "n-h" pyim-dhashcache-ishortcode2word)
'("你慌" "你好" "你坏")))))
-(ert-deftest pyim-tests-pyim-dhashcache-sort-words ()
- (let ((pyim-dhashcache-iword2count (make-hash-table :test #'equal))
+(ert-deftest pyim-tests-pyim-dcache-sort-words ()
+ (let ((pyim-dcache-backend 'pyim-dhashcache)
+ (pyim-dhashcache-iword2count (make-hash-table :test #'equal))
words)
(puthash "你好" 3 pyim-dhashcache-iword2count)
(puthash "呢耗" 2 pyim-dhashcache-iword2count)
(puthash "你豪" 1 pyim-dhashcache-iword2count)
(setq words (list "呢耗" "你豪" "你好"))
- (should (equal (pyim-dhashcache-sort-words words)
+ (should (equal (pyim-dcache-sort-words words)
'("你好" "呢耗" "你豪")))))
(ert-deftest pyim-tests-pyim-dhashcache-get-counts-from-log ()
- [elpa] externals/pyim updated (1e0834c456 -> 6c05a5fc03), ELPA Syncer, 2022/06/09
- [elpa] externals/pyim b33d2f2a75 04/12: cl-defgeneric pyim-dcache-update, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim 156d282073 06/12: cl-defgeneric pyim-dcache-save-caches, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim 7288242a6d 10/12: Sort dcache.el, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim 371d2ee095 05/12: cl-defgeneric pyim-dcache-export-*, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim fb74c9fc93 01/12: cl-defgeneric pyim-dcache-upgrade, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim 12435095d3 08/12: Sort dcache, dhashcache, dregcache, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim 504f20fd49 02/12: cl-defgeneric pyim-dcache-insert-word, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim e0f80e5d36 11/12: Merge branch 'dcache',
ELPA Syncer <=
- [elpa] externals/pyim f6a45f6506 03/12: cl-defgeneric pyim-dcache-update-wordcount, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim 41564f3d74 09/12: Sort dregcache, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim 6c05a5fc03 12/12: Fix pyim-tests.el, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim 05d95422b9 07/12: cl-defgeneric rest dcache interfaces., ELPA Syncer, 2022/06/09