emacs-elpa-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[elpa] externals/elisa 594dd9efdb 5/6: Merge pull request #31 from s-kos


From: ELPA Syncer
Subject: [elpa] externals/elisa 594dd9efdb 5/6: Merge pull request #31 from s-kostyaev/add-embeddings-recalculation
Date: Thu, 2 Jan 2025 12:59:32 -0500 (EST)

branch: externals/elisa
commit 594dd9efdb57ce76a1d82a731db979c941e5b39f
Merge: 322d7eb839 f43953b4d7
Author: Sergey Kostyaev <s-kostyaev@users.noreply.github.com>
Commit: GitHub <noreply@github.com>

    Merge pull request #31 from s-kostyaev/add-embeddings-recalculation
    
    Add embeddings recalculation
---
 README.org | 23 ++++++++++++++++++-----
 elisa.el   | 44 ++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/README.org b/README.org
index 3fc9d4003a..29af50a377 100644
--- a/README.org
+++ b/README.org
@@ -88,7 +88,7 @@ I prefer this models:
 #+begin_src shell
   ollama pull gemma2:9b-instruct-q6_K
   ollama pull qwen2.5:3b
-  ollama pull chatfire/bge-m3:q8_0
+  ollama pull snowflake-arctic-embed2
 #+end_src
 
 **** Complex documents
@@ -203,6 +203,12 @@ Disable all collections.
 
 Removes collection and all its data from index.
 
+*** elisa-async-recalculate-embeddings
+
+Recalculate embeddings asynchronously. Use it after changing the
+~elisa-embeddings-provider~ variable. This can take some time; it runs
+faster when ~elisa-batch-embeddings-enabled~ is set.
+
 ** Configuration
 
 Example configuration.
@@ -222,7 +228,7 @@ Example configuration.
     ;; (setopt elisa-chat-provider
     ;;           (make-llm-ollama
     ;;            :chat-model "gemma2:9b-instruct-q6_K"
-    ;;            :embedding-model "chatfire/bge-m3:q8_0"
+    ;;            :embedding-model "snowflake-arctic-embed2"
     ;;            ;; set context window to 8k
     ;;            :default-chat-non-standard-params '(("num_ctx" . 8192))))
     ;;
@@ -230,11 +236,13 @@ Example configuration.
     (setopt elisa-chat-provider
          (make-llm-ollama
           :chat-model "qwen2.5:3b"
-          :embedding-model "chatfire/bge-m3:q8_0"
+          :embedding-model "snowflake-arctic-embed2"
           :default-chat-temperature 0.1
           :default-chat-non-standard-params '(("num_ctx" . 32768))))
     ;; this embedding model has stong multilingual capabilities
-    (setopt elisa-embeddings-provider (make-llm-ollama :embedding-model 
"chatfire/bge-m3:q8_0"))
+    (setopt elisa-embeddings-provider (make-llm-ollama :embedding-model 
"snowflake-arctic-embed2"))
+    ;; enable batch embeddings for faster processing
+    (setopt elisa-batch-embeddings-enabled t)
     :config
     ;; searxng works better than duckduckgo in my tests
     (setopt elisa-web-search-function 'elisa-search-searxng))
@@ -293,7 +301,7 @@ Example configuration.
 
 + ~elisa-semantic-split-function~:
     * Type: Function
-    * Description:  Function used to split text into semantically meaningful 
chunks.
+    * Description: Function used to split text into semantically meaningful 
chunks.
     * Default: ~elisa-split-by-paragraph~
 
 + ~elisa-prompt-rewriting-enabled~:
@@ -313,6 +321,11 @@ Example configuration.
     * Type: Boolean
     * Description: Enable batch embeddings if supported.
 
++ ~elisa-batch-size~:
+    * Type: Integer
+    * Description: Batch size to send to provider during batch embeddings 
calculation.
+    * Default: 300
+
 **** Web Search and Integration
 
 + ~elisa-searxng-url~:
diff --git a/elisa.el b/elisa.el
index 1dd57a9a96..415f7455fa 100644
--- a/elisa.el
+++ b/elisa.el
@@ -1,6 +1,6 @@
 ;;; elisa.el --- Emacs Lisp Information System Assistant -*- lexical-binding: 
t -*-
 
-;; Copyright (C) 2024  Free Software Foundation, Inc.
+;; Copyright (C) 2024, 2025 Free Software Foundation, Inc.
 
 ;; Author: Sergey Kostyaev <sskostyaev@gmail.com>
 ;; URL: http://github.com/s-kostyaev/elisa
@@ -107,6 +107,7 @@
 (require 'shr)
 (require 'plz)
 (require 'json)
+(require 'sqlite)
 
 (defgroup elisa nil
   "RAG implementation for `ellama'."
@@ -279,6 +280,10 @@ If set, all quotes with similarity less than threshold 
will be filtered out."
   "Enable batch embeddings if supported."
   :type 'boolean)
 
+(defcustom elisa-batch-size 300
+  "Maximum number of chunks sent to the provider per batch embeddings request."
+  :type 'integer)
+
 (defun elisa-supported-complex-document-p (path)
   "Check if PATH contain supported complex document."
   (cl-find (file-name-extension path)
@@ -350,6 +355,10 @@ database."
   (format "CREATE VIRTUAL TABLE IF NOT EXISTS data_embeddings USING 
vss0(embedding(%d));"
          (elisa-get-embedding-size)))
 
+(defun elisa-data-embeddings-drop-table-sql ()
+  "Return the SQL statement that drops the data embeddings table."
+  "DROP TABLE IF EXISTS data_embeddings;")
+
 (defun elisa-data-fts-create-table-sql ()
   "Generate sql for create full text search table."
   "CREATE VIRTUAL TABLE IF NOT EXISTS data_fts USING FTS5(data);")
@@ -473,7 +482,8 @@ Return list of vectors."
   (let ((provider elisa-embeddings-provider))
     (if (and elisa-batch-embeddings-enabled
             (member 'embeddings-batch (llm-capabilities provider)))
-       (llm-batch-embeddings provider chunks)
+       (let ((batches (seq-partition chunks elisa-batch-size)))
+         (flatten-list (mapcar (lambda (batch) (llm-batch-embeddings provider 
batch)) batches)))
       (mapcar (lambda (chunk) (llm-embedding provider chunk)) chunks))))
 
 (defun elisa-parse-info-manual (name collection-name)
@@ -1265,6 +1275,7 @@ Call ON-DONE callback with result as an argument after 
FUNC evaluation done."
                    ,(async-inject-variables "elisa-tar-executable")
                    ,(async-inject-variables "elisa-prompt-rewriting-enabled")
                    ,(async-inject-variables "elisa-batch-embeddings-enabled")
+                   ,(async-inject-variables "elisa-batch-size")
                    ,(async-inject-variables "elisa-rewrite-prompt-template")
                    ,(async-inject-variables "elisa-semantic-split-function")
                    ,(async-inject-variables 
"elisa-webpage-extraction-function")
@@ -1492,5 +1503,34 @@ Find similar quotes in COLLECTIONS and add it to 
context."
   (let ((cols (or collections elisa-enabled-collections)))
     (elisa--rewrite-prompt prompt (elisa--gen-chat cols))))
 
+(defun elisa-recalculate-embeddings ()
+  "Recalculate and save new embeddings after embedding provider change.
+Signal an error and keep the old embeddings table when the provider
+returns a different number of embeddings than there are data rows."
+  (sqlite-execute elisa-db "DELETE FROM data WHERE data = '';") ;; remove rows without data
+  (let* ((data-rows (sqlite-select elisa-db "SELECT rowid, data FROM data;"))
+         (rowids (mapcar #'car data-rows))
+         (embeddings (elisa-embeddings (mapcar #'cadr data-rows))))
+    ;; Validate before dropping anything so a short result cannot wipe the index
+    (unless (= (length rowids) (length embeddings))
+      (error "Expected %d embeddings, got %d"
+             (length rowids) (length embeddings)))
+    ;; Recreate embeddings table
+    (sqlite-execute elisa-db (elisa-data-embeddings-drop-table-sql))
+    (sqlite-execute elisa-db (elisa-data-embeddings-create-table-sql))
+    ;; Save new embeddings, walking both lists in step instead of using `nth'
+    (with-sqlite-transaction elisa-db
+      (while rowids
+        (sqlite-execute
+         elisa-db
+         (format "INSERT INTO data_embeddings(rowid, embedding) VALUES (%s, %s);"
+                 (pop rowids) (elisa-vector-to-sqlite (pop embeddings))))))))
+
+;;;###autoload
+(defun elisa-async-recalculate-embeddings ()
+  "Run `elisa-recalculate-embeddings' asynchronously via `elisa--async-do'."
+  (interactive)
+  (elisa--async-do 'elisa-recalculate-embeddings))
+
 (provide 'elisa)
 ;;; elisa.el ends here.



reply via email to

[Prev in Thread] Current Thread [Next in Thread]