[nongnu] elpa/subed f5f6c5ab07 1/2: subed-word-data: Parse WhisperX JSON

emacs-elpa-diffs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[nongnu] elpa/subed f5f6c5ab07 1/2: subed-word-data: Parse WhisperX JSON

From:	ELPA Syncer
Subject:	[nongnu] elpa/subed f5f6c5ab07 1/2: subed-word-data: Parse WhisperX JSON
Date:	Thu, 17 Oct 2024 13:01:11 -0400 (EDT)

branch: elpa/subed
commit f5f6c5ab0735b0bebfd6665d22bef7b4a59e5e0f
Author: Sacha Chua <sacha@sachachua.com>
Commit: Sacha Chua <sacha@sachachua.com>

    subed-word-data: Parse WhisperX JSON
    
    * subed/subed-word-data.el (subed-word-data--extract-words-from-whisperx-json): New.
    (subed-word-data-load-from-file): Add JSON file support.
    (subed-word-data--look-up-word): Handle entries
    without start or end times.
    (subed-word-data-subtitle-entries): Handle entries
    without start or end times.
---
 subed/subed-word-data.el | 46 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/subed/subed-word-data.el b/subed/subed-word-data.el
index a3c9471964..f99c312a6e 100644
--- a/subed/subed-word-data.el
+++ b/subed/subed-word-data.el
@@ -23,7 +23,7 @@
 ;;; Commentary:
 
 ;; This file parses timing data such as the ones you get from YouTube
-;; .srv2 and tries to match the timing data with the remaining text in
+;; .srv2 or WhisperX JSON and tries to match the timing data with the remaining text in
 ;; the current subtitle in order to determine the word timestamp for
 ;; splitting the subtitle.
 
@@ -72,6 +72,35 @@ Return a list of ((start . ?), (end . ?) (text . ?))."
                    rec))
                text-elements))))
 
+(defun subed-word-data--extract-words-from-whisperx-json (file)
+  "Extract word timing data from FILE in WhisperX's JSON format.
+Return a list of ((start . ?) (end . ?) (text . ?)), with times in msecs."
+  (let* ((json-object-type 'alist)
+         (json-array-type 'list)
+         (data (json-read-file file))
+         (base (seq-mapcat
+                (lambda (segment)
+                  (seq-map (lambda (info)
+                             (let-alist info
+                               `((start . ,(and .start (* 1000 .start)))
+                                 (end . ,(and .end (* 1000 .end)))
+                                 (text . ,(and .word)))))
+                           (alist-get 'words segment)))
+                (alist-get 'segments data)))
+         (current base)
+         (last-end 0))
+    ;; Numbers at the end of a sentence sometimes don't end up with times.
+    ;; Start them after the previous word; end before the next timed word.
+    (while current
+      (let* ((word (pop current))
+             (start (or (alist-get 'start word) (1+ last-end)))
+             (next (seq-some (lambda (o) (alist-get 'start o)) current)))
+        (setq last-end (or (alist-get 'end word)
+                           (if next (max start (1- next)) start)))
+        (setcdr (assq 'start word) start)
+        (setcdr (assq 'end word) last-end)))
+    base))
+
 (defun subed-word-data--load (data)
   "Load word-level timing from DATA.
 For now, only SRV2 files are supported."
@@ -82,9 +111,12 @@ For now, only SRV2 files are supported."
 ;;;###autoload
 (defun subed-word-data-load-from-file (file)
   "Load word-level timing from FILE.
-For now, only SRV2 files are supported."
+Files ending in .json are read as WhisperX JSON; all others as SRV2 XML."
   (interactive "fFile: ")
-  (subed-word-data--load (subed-word-data--extract-words-from-srv2 
(xml-parse-file file))))
+  (subed-word-data--load
+   (if (string-match "\\.json\\'" file)
+       (subed-word-data--extract-words-from-whisperx-json file)
+     (subed-word-data--extract-words-from-srv2 (xml-parse-file file)))))
 
 (defun subed-word-data-load-from-string (string)
   "Load word-level timing from STRING.
@@ -148,8 +180,8 @@ Return non-nil if they are the same after normalization."
            (words (if remaining-words
                       (reverse (seq-filter
                                 (lambda (o)
-                                  (and (<= (alist-get 'end o) end)
-                                       (>= (alist-get 'start o) start)
+                                  (and (or (not (alist-get 'end o)) (<= 
(alist-get 'end o) end))
+                                       (or (not (alist-get 'start o)) (>= 
(alist-get 'start o) start))
                                        (not (string-match "^\n*$" (alist-get 
'text o)))))
                                 subed-word-data--cache))))
            (offset 0)
@@ -182,8 +214,8 @@ Return non-nil if they are the same after normalization."
         (stop (+ (subed-subtitle-msecs-stop) subed-subtitle-spacing)))
     (seq-filter
      (lambda (o)
-       (and (<= (alist-get 'end o) stop)
-            (>= (alist-get 'start o) start)
+       (and (<= (or (alist-get 'end o) most-positive-fixnum) stop)
+            (>= (or (alist-get 'start o) 0) start)
             (not (string-match "^\n*$" (alist-get 'text o)))))
      subed-word-data--cache)))

[Prev in Thread]

Current Thread

[Next in Thread]

[nongnu] elpa/subed updated (4558c30f39 -> 87222b0f56), ELPA Syncer, 2024/10/17
- [nongnu] elpa/subed 87222b0f56 2/2: v1.2.17: Merge without including the current subtitle, ELPA Syncer, 2024/10/17
- [nongnu] elpa/subed f5f6c5ab07 1/2: subed-word-data: Parse WhisperX JSON, ELPA Syncer <=

Prev by Date: [nongnu] elpa/subed 87222b0f56 2/2: v1.2.17: Merge without including the current subtitle
Next by Date: [nongnu] elpa/subed updated (4558c30f39 -> 87222b0f56)
Previous by thread: [nongnu] elpa/subed 87222b0f56 2/2: v1.2.17: Merge without including the current subtitle
Next by thread: [nongnu] elpa/meow e2ff708b95: Add meow-pop-to-mark and meow-unpop-to-mark
Index(es):
- Date
- Thread