[nongnu] elpa/subed 15840c389f 1/2: Add subed-word-data.el with subed-word-data-load-from-file for SRV2

emacs-elpa-diffs
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[nongnu] elpa/subed 15840c389f 1/2: Add subed-word-data.el with subed-word-data-load-from-file for SRV2

From:	ELPA Syncer
Subject:	[nongnu] elpa/subed 15840c389f 1/2: Add subed-word-data.el with subed-word-data-load-from-file for SRV2
Date:	Wed, 26 Oct 2022 09:59:53 -0400 (EDT)
branch: elpa/subed
commit 15840c389f203b05fd11bf1545f396bc2a9af378
Author: Sacha Chua <sacha@sachachua.com>
Commit: Sacha Chua <sacha@sachachua.com>

    Add subed-word-data.el with subed-word-data-load-from-file for SRV2
    
    * subed/subed-word-data.el: You can use subed-word-data-load-from-file
    to load word-level timing data from SRV2 files or add
    subed-word-data-load-maybe to the subed-mode-hook.
    * subed/subed-word-data.el.license: Add license marker
    * subed/subed-common.el (subed-split-subtitle-timestamp-functions):
    New hook.
    * subed/subed-common.el (split-subtitle): Use
    subed-split-subtitle-timestamp-functions for more customizability.
    * subed/subed-srt.el (subed--append-subtitle): Don't get confused by spaces.
    * subed/subed-vtt.el (subed--jump-to-subtitle-id): Save the point and
    match more carefully.
    (subed--append-subtitle): Don't get confused by spaces.
    * tests/test-subed-vtt.el ("VTT"): Explicitly move to the first subtitle.
---
 NEWS.org                         |  12 +++
 subed/subed-common.el            |  67 +++++++++++-----
 subed/subed-srt.el               |   2 +
 subed/subed-vtt.el               |  52 +++++++------
 subed/subed-word-data.el         | 161 +++++++++++++++++++++++++++++++++++++++
 subed/subed-word-data.el.license |   3 +
 subed/subed.el                   |   2 +-
 tests/test-subed-vtt.el          |  14 +++-
 8 files changed, 268 insertions(+), 45 deletions(-)

diff --git a/NEWS.org b/NEWS.org
index e28bbd0faa..b1183af1cc 100644
--- a/NEWS.org
+++ b/NEWS.org
@@ -1,4 +1,16 @@
 * subed news
+** Version 1.0.15 - 2022-10-26 - Sacha Chua
+
+Added support for SRV2 files in subed-word-data.el. You can use
+subed-word-data-load-from-file to load word-level timing data from
+SRV2 files or add subed-word-data-load-maybe to the subed-mode-hook.
+
+VTT no longer assumes that the start of the file is part of
+the first subtitle.
+
+VTT and SRT are now less confused by spaces at the end of a subtitle
+when splitting.
+
 ** Version 1.0.14 - 2022-10-25 - Sacha Chua
 
 Delete the CPS overlay when disabling it
diff --git a/subed/subed-common.el b/subed/subed-common.el
index 6bdd8c4b32..d60fc82eeb 100644
--- a/subed/subed-common.el
+++ b/subed/subed-common.el
@@ -1114,6 +1114,40 @@ following manner:
           (subed-backward-subtitle-text)))))
   (point))
 
+(defvar subed-split-subtitle-timestamp-functions
+  '(subed-split-subtitle-based-on-mpv-playback-position
+    subed-split-subtitle-based-on-point-ratio)
+  "Functions to call to get the timestamp to split at.
+Functions will be called with no arguments.
+They should return a timestamp in milliseconds.
+The first non-nil value will be used for the split.
+Functions should preserve the point.")
+
+(defun subed-split-subtitle-based-on-mpv-playback-position ()
+  "Return a timestamp based on the current MPV position.
+Do this only if the position is within the start and end of the
+current subtitle."
+  (when (and
+         subed-mpv-playback-position
+         (>= subed-mpv-playback-position (subed-subtitle-msecs-start))
+         (<= subed-mpv-playback-position (subed-subtitle-msecs-stop)))
+    subed-mpv-playback-position))
+
+(defun subed-split-subtitle-based-on-point-ratio ()
+  "Return a timestamp based on the position and number of characters in the 
subtitle text."
+  (let* ((pos (point))
+         (text-beg (or (save-excursion (subed-jump-to-subtitle-text)) pos))
+         (text-end (or (save-excursion (subed-jump-to-subtitle-end)) pos)))
+    ;; Ensure point is on subtitle text
+    (when (and (>= pos text-beg)
+               (<= pos text-end))
+      (let* ((orig-start (subed-subtitle-msecs-start))
+             (orig-end (subed-subtitle-msecs-stop))
+             (text-fraction (if (= text-beg text-end) 1
+                              (/ (* 1.0 (- (point) text-beg)) (- text-end 
text-beg))))
+             (time-fraction (floor (* text-fraction (- orig-end orig-start)))))
+        (+ orig-start time-fraction)))))
+
 (subed-define-generic-function split-subtitle (&optional offset)
   "Split current subtitle at point.
 
@@ -1151,27 +1185,24 @@ position of the point."
     ;; Ensure point is on subtitle text
     (unless (and text-beg (>= (point) text-beg))
       (subed-jump-to-subtitle-text))
-    (let* ((orig-start (subed-subtitle-msecs-start))
-           (orig-end (subed-subtitle-msecs-stop))
-           (text-fraction (if (= text-beg text-end) 1 (/ (* 1.0 (- (point) 
text-beg)) (- text-end text-beg))))
-           (time-fraction (floor (* text-fraction (- orig-end orig-start))))
+    (let* ((orig-end (subed-subtitle-msecs-stop))
            (split-timestamp
             (cond
-             ((and (numberp offset) (> offset 0)) (+ orig-start offset))
-             ((and (numberp offset) (< offset 0)) (+ orig-end offset))
-             ((or (equal offset t)
-                  (null subed-mpv-playback-position)
-                  (> subed-mpv-playback-position orig-end)
-                  (< subed-mpv-playback-position orig-start))
-              (+ orig-start time-fraction))
-             (subed-mpv-playback-position subed-mpv-playback-position)))
+             ((and (numberp offset) (> offset 0))
+              (+ (subed-subtitle-msecs-start) offset))
+             ((and (numberp offset) (< offset 0))
+              (+ orig-end offset))
+             (t (run-hook-with-args-until-success 
'subed-split-subtitle-timestamp-functions))))
            (new-text (string-trim (buffer-substring (point) text-end)))
-           (new-start-timestamp (+ split-timestamp subed-subtitle-spacing)))
-      (subed-set-subtitle-time-stop split-timestamp)
-      (skip-chars-backward "\n")
-      (delete-region (point) (progn (subed-jump-to-subtitle-end) 
(skip-chars-forward " \t") (point)))
-      (when (looking-at "[ \t]+") (replace-match ""))
-      (subed-append-subtitle nil new-start-timestamp orig-end (string-trim 
new-text)))
+           (new-start-timestamp (and split-timestamp (+ split-timestamp 
subed-subtitle-spacing))))
+      (if split-timestamp
+          (progn
+            (subed-set-subtitle-time-stop split-timestamp)
+            (skip-chars-backward "\r\n")
+            (delete-region (point) (progn (subed-jump-to-subtitle-end) 
(skip-chars-forward " \t") (point)))
+            (when (looking-at "[ \t]+") (replace-match ""))
+            (subed-append-subtitle nil new-start-timestamp orig-end 
(string-trim new-text)))
+        (error "Could not determine timestamp for splitting.")))
     (point)))
 
 ;;; Merging
diff --git a/subed/subed-srt.el b/subed/subed-srt.el
index 9eda9bba9e..33cf0aab8a 100644
--- a/subed/subed-srt.el
+++ b/subed/subed-srt.el
@@ -241,6 +241,8 @@ Return new point.  Use the format-specific function for 
MAJOR-MODE."
   (unless (subed-forward-subtitle-id)
     ;; Point is on last subtitle or buffer is empty
     (subed-jump-to-subtitle-end)
+    (when (looking-at "[[:space:]]+")
+      (replace-match ""))
     ;; Moved point to end of last subtitle; ensure separator exists
     (while (not (looking-at "\\(\\`\\|[[:blank:]]*\n[[:blank:]]*\n\\)"))
       (save-excursion (insert ?\n)))
diff --git a/subed/subed-vtt.el b/subed/subed-vtt.el
index 9a9a44a46d..463ecb342b 100644
--- a/subed/subed-vtt.el
+++ b/subed/subed-vtt.el
@@ -111,29 +111,33 @@ If SUB-ID is not given, focus the current subtitle's ID.
 Return point or nil if no subtitle ID could be found.
 WebVTT doesn't use IDs, so we use the starting timestamp instead.
 Use the format-specific function for MAJOR-MODE."
-  (if (stringp sub-id)
-      ;; Look for a line that contains the timestamp, preceded by one or more
-      ;; blank lines or the beginning of the buffer.
-      (let* ((orig-point (point))
-             (regex (concat "\\(" subed--regexp-separator "\\|\\`\\)\\(" 
(regexp-quote sub-id) "\\)"))
-             (match-found (progn (goto-char (point-min))
-                                 (re-search-forward regex nil t))))
-        (if match-found
-            (goto-char (match-beginning 2))
-          (goto-char orig-point)))
-    ;; Find one or more blank lines.
-    (re-search-forward "\\([[:blank:]]*\n\\)+" nil t)
-    ;; Find two or more blank lines or the beginning of the buffer, followed
-    ;; by line starting with a timestamp.
-    (let* ((regex (concat  "\\(" subed--regexp-separator "\\|\\`\\)"
-                           "\\(?:" subed-vtt--regexp-identifier "\\)?"
-                           "\\(" subed--regexp-timestamp "\\)"))
-           (match-found (re-search-backward regex nil t)))
-      (when match-found
-        (goto-char (match-beginning 2)))))
-  ;; Make extra sure we're on a timestamp, return nil if we're not
-  (when (looking-at "^\\(\\([0-9]+:\\)?[0-9]+:[0-9]+\\.[0-9]+\\)")
-    (point)))
+  (let ((orig-point (point)) found)
+    (if (stringp sub-id)
+        ;; Look for a line that contains the timestamp, preceded by one or more
+        ;; blank lines or the beginning of the buffer.
+        (let* ((regex (concat "\\(" subed--regexp-separator "\\|\\`\\)\\("
+                              (regexp-quote sub-id) "\\)")))
+          (goto-char (point-min))
+          (setq found (re-search-forward regex nil t))
+          (if found
+              (goto-char (match-beginning 2))
+            (goto-char orig-point)))
+      ;; Find one or more blank lines.
+      (or (re-search-forward "\\(^[[:blank:]]*\n\\)+" nil t)
+          (goto-char (point-max)))
+      ;; Find two or more blank lines or the beginning of the buffer, followed
+      ;; by line starting with a timestamp.
+      (let* ((regex (concat  "\\(" subed--regexp-separator "\\|\\`\\)"
+                             "\\(?:" subed-vtt--regexp-identifier "\\)?"
+                             "\\(" subed--regexp-timestamp "\\)")))
+        (setq found (re-search-backward regex nil t))
+        (when found
+          (goto-char (match-beginning 2)))))
+    ;; Make extra sure we're on a timestamp, return nil if we're not
+    (if (and found (looking-at "^\\(\\([0-9]+:\\)?[0-9]+:[0-9]+\\.[0-9]+\\)"))
+        (point)
+      (goto-char orig-point)
+      nil)))
 
 (cl-defmethod subed--jump-to-subtitle-time-start (&context (major-mode 
subed-vtt-mode) &optional sub-id)
   "Move point to subtitle's start time.
@@ -251,6 +255,8 @@ point.  Use the format-specific function for MAJOR-MODE."
   (unless (subed-forward-subtitle-id)
     ;; Point is on last subtitle or buffer is empty
     (subed-jump-to-subtitle-end)
+    (when (looking-at "[[:space:]]+")
+      (replace-match ""))
     ;; Moved point to end of last subtitle; ensure separator exists
     (while (not (looking-at "\\(\\`\\|[[:blank:]]*\n[[:blank:]]*\n\\)"))
       (save-excursion (insert ?\n)))
diff --git a/subed/subed-word-data.el b/subed/subed-word-data.el
new file mode 100644
index 0000000000..b474611f5b
--- /dev/null
+++ b/subed/subed-word-data.el
@@ -0,0 +1,161 @@
+;;; subed-word-data.el --- Use word-level timing data when splitting subtitles 
 -*- lexical-binding: t; -*-
+
+;;; License:
+;;
+;; Copyright (C) 2022  Sacha Chua
+
+;; Author: Sacha Chua <sacha@sachachua.com>
+;; Keywords: multimedia
+
+;; This program is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+
+;; This program is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+;;; Commentary:
+
+;; This file parses timing data such as the ones you get from YouTube
+;; .srv2 and tries to match the timing data with the remaining text in
+;; the current subtitle in order to determine the word timestamp for
+;; splitting the subtitle.
+
+;; To try to automatically load word data from a similarly-named file
+;; in the buffer, add this to your configuration:
+
+;; (with-eval-after-load 'subed
+;;   (add-hook 'subed-mode-hook 'subed-word-data-load-maybe))
+
+;;; Code:
+
+(require 'xml)
+(require 'dom)
+
+(defvar-local subed-word-data--cache nil
+  "Word-level timing as a list of ((start . MS) (end . MS) (text . STRING)).")
+
+(defun subed-word-data--extract-words-from-srv2 (data)
+  "Extract the timing from DATA in SRV2 format.
+Return a list of ((start . ?), (end . ?) (text . ?))."
+  (when (stringp data)
+    (with-temp-buffer (insert data) (setq data (xml-parse-region))))
+  (let* ((text-elements (reverse (dom-by-tag data 'text)))
+         (last-start
+          (and text-elements
+               (+ (string-to-number
+                   (alist-get 't (xml-node-attributes (car text-elements))))
+                  (string-to-number (alist-get 'd (xml-node-attributes (car 
text-elements))))))))
+    (reverse
+     (mapcar #'(lambda (element)
+                 (let ((rec (list (cons 'start (string-to-number (alist-get 't 
(xml-node-attributes element))))
+                                  (cons 'end last-start)
+                                  (cons 'text
+                                         (replace-regexp-in-string "&#39;" "'"
+                                                                   (car 
(xml-node-children element)))
+                                         ))))
+                   (setq last-start (alist-get 'start rec))
+                   rec))
+               text-elements))))
+
+(defun subed-word-data--load (data)
+  "Load word-level timing from DATA.
+For now, only SRV2 files are supported."
+  (setq-local subed-word-data--cache data)
+  (add-hook 'subed-split-subtitle-timestamp-functions 
#'subed-word-data-split-at-word-timestamp -5 t))
+
+(defun subed-word-data-load-from-file (file)
+  "Load word-level timing from FILE.
+For now, only SRV2 files are supported."
+  (interactive "fFile: ")
+  (subed-word-data--load (subed-word-data--extract-words-from-srv2 
(xml-parse-file file))))
+
+(defun subed-word-data-load-from-string (string)
+  "Load word-level timing from STRING.
+For now, only SRV2 files are supported."
+  (subed-word-data--load (subed-word-data--extract-words-from-srv2 string)))
+
+(defvar subed-word-data-extensions '(".en.srv2" ".srv2") "Extensions to search 
for word data.")
+
+(defun subed-word-data-load-maybe ()
+  "Load word data if available.  Suitable for adding to `subed-mode-hook'."
+  (when (buffer-file-name)
+    (let ((base (file-name-sans-extension (buffer-file-name)))
+          file)
+      ;; `throw' takes a tag and a value; stop at the first extension found.
+      (catch 'found
+        (dolist (ext subed-word-data-extensions)
+          (when (file-exists-p (concat base ext))
+            (throw 'found (setq file (concat base ext))))))
+      (when file
+        (subed-word-data-load-from-file file)
+        (message "Word data loaded.")))))
+
+(defvar subed-word-data-normalizing-functions 
'(subed-word-data-normalize-word-default)
+  "Functions to run to normalize words before comparison.")
+
+(defun subed-word-data-normalize-word-default (s)
+  "Downcase S and remove non-alphanumeric characters for comparison."
+  (replace-regexp-in-string "[^[:alnum:]]" "" (downcase s)))
+
+(defun subed-word-data-normalize-word (word)
+  "Normalize WORD to make it easier to compare."
+  (mapc (lambda (func)
+          (setq word (funcall func word)))
+        subed-word-data-normalizing-functions)
+  word)
+
+(defun subed-word-data-compare-normalized-string= (word1 word2)
+  "Compare two words and return t if they are the same after normalization."
+  (string= (subed-word-data-normalize-word word1)
+           (subed-word-data-normalize-word word2)))
+
+(defvar subed-word-data-compare 'subed-word-data-compare-normalized-string=
+  "Function to use to compare.")
+
+(defun subed-word-data--look-up-word ()
+  "Find the word timing that matches the one at point (approximately)."
+  (save-excursion
+    (skip-syntax-backward "w")
+    (let* ((end (subed-subtitle-msecs-stop))
+           (start (subed-subtitle-msecs-start))
+           (remaining-words (split-string
+                             (buffer-substring
+                              (point)
+                              (or (subed-jump-to-subtitle-end) (point)))))
+           (words (if remaining-words
+                      (reverse (seq-filter
+                                (lambda (o)
+                                  (and (<= (alist-get 'end o) end)
+                                       (>= (alist-get 'start o) start)
+                                       (not (string-match "^\n*$" (alist-get 
'text o)))))
+                                subed-word-data--cache))))
+           (offset 0)
+           (done (null remaining-words))
+           candidate)
+      (while (not done)
+        (setq candidate (elt words (+ (1- (length remaining-words)) offset)))
+        (cond
+         ((and candidate (funcall subed-word-data-compare
+                                  (car remaining-words)
+                                  (alist-get 'text candidate)))
+          (setq done t))
+         ((> offset (length words)) (setq done t))
+         ((> offset 0) (setq offset (- offset)))
+         (t (setq offset (1+ (- offset))))))
+      candidate)))
+
+(defun subed-word-data-split-at-word-timestamp ()
+  "Return the starting timestamp if the word is found."
+  (when subed-word-data--cache
+    (let ((time (assoc-default 'start (subed-word-data--look-up-word))))
+      (when time (- time subed-subtitle-spacing)))))
+
+(provide 'subed-word-data)
+;;; subed-word-data.el ends here
diff --git a/subed/subed-word-data.el.license b/subed/subed-word-data.el.license
new file mode 100644
index 0000000000..1c67cfabee
--- /dev/null
+++ b/subed/subed-word-data.el.license
@@ -0,0 +1,3 @@
+;;;; SPDX-FileCopyrightText: 2022 Sacha Chua
+;;;;
+;;;; SPDX-License-Identifier: GPL-3.0-or-later
diff --git a/subed/subed.el b/subed/subed.el
index 2191eba02b..58a568b6ef 100644
--- a/subed/subed.el
+++ b/subed/subed.el
@@ -1,6 +1,6 @@
 ;;; subed.el --- A major mode for editing subtitles  -*- lexical-binding: t; 
-*-
 
-;; Version: 1.0.14
+;; Version: 1.0.15
 ;; Maintainer: Sacha Chua <sacha@sachachua.com>
 ;; Author: Random User
 ;; Keywords: convenience, files, hypermedia, multimedia
diff --git a/tests/test-subed-vtt.el b/tests/test-subed-vtt.el
index 49d529200e..ed360d7ee5 100644
--- a/tests/test-subed-vtt.el
+++ b/tests/test-subed-vtt.el
@@ -55,6 +55,7 @@ Baz.
          (insert mock-vtt-data)
          (let ((msecs (- (save-excursion
                            (goto-char (point-min))
+                           (subed-forward-subtitle-id)
                            (subed-subtitle-msecs-start))
                          1)))
            (expect (subed-subtitle-id-at-msecs msecs) :to-equal nil))))
@@ -237,6 +238,7 @@ Baz.
         (with-temp-vtt-buffer
          (insert mock-vtt-data)
          (goto-char (point-min))
+         (subed-forward-subtitle-id)
          (expect (subed-jump-to-subtitle-time-start) :to-equal 9)
          (expect (looking-at subed--regexp-timestamp) :to-be t)
          (expect (match-string 0) :to-equal "00:01:01.000")
@@ -257,6 +259,7 @@ Baz.
         (with-temp-vtt-buffer
          (insert mock-vtt-data)
          (goto-char (point-min))
+         (subed-forward-subtitle-id)
          (expect (subed-jump-to-subtitle-time-stop) :to-equal 26)
          (expect (looking-at subed--regexp-timestamp) :to-be t)
          (expect (match-string 0) :to-equal "00:01:05.123")
@@ -277,6 +280,7 @@ Baz.
         (with-temp-vtt-buffer
          (insert mock-vtt-data)
          (goto-char (point-min))
+         (subed-forward-subtitle-id)
          (expect (subed-jump-to-subtitle-text) :to-equal 39)
          (expect (point) :to-equal (save-excursion (goto-char (point-max)) 
(search-backward "Foo.")))
          (re-search-forward "\n\n")
@@ -294,6 +298,7 @@ Baz.
         (with-temp-vtt-buffer
          (insert mock-vtt-data)
          (goto-char (point-min))
+         (subed-forward-subtitle-id)
          (expect (subed-jump-to-subtitle-end) :to-be 43)
          (expect (looking-back "^Foo.$") :to-be t)
          (forward-char 2)
@@ -313,6 +318,7 @@ Baz.
          (re-search-forward "Foo\\.\n")
          (replace-match "Foo.\n ")
          (goto-char (point-min))
+         (subed-forward-subtitle-id)
          (expect (subed-jump-to-subtitle-end) :to-be 43)
          (expect (looking-back "^Foo.$") :to-be t)))
       (it "returns nil if subtitle end cannot be found."
@@ -350,6 +356,7 @@ Baz.
          (expect (subed-jump-to-subtitle-end) :to-be nil)
          (expect (looking-at "^$") :to-be t)
          (goto-char (point-min))
+         (subed-forward-subtitle-id)
          (expect (subed-jump-to-subtitle-end) :to-be 39)
          (expect (looking-at "^$") :to-be t)
          (subed-jump-to-subtitle-text "00:02:02.234")
@@ -622,11 +629,12 @@ Baz.
        (subed-jump-to-subtitle-id "00:03:03.45")
        (subed-set-subtitle-time-start (+ (* 1 60 60 1000) (* 2 60 1000) (* 3 
1000) 4) 3)
        (expect (save-excursion (subed-jump-to-subtitle-time-start)
-                               (thing-at-point 'line)) :to-equal "01:02:03.004 
--> 00:03:15.5\n")
+                               (thing-at-point 'line))
+               :to-equal "01:02:03.004 --> 00:03:15.5\n")
        (subed-set-subtitle-time-stop (+ (* 2 60 60 1000) (* 3 60 1000) (* 4 
1000) 60) 3)
        (expect (save-excursion (subed-jump-to-subtitle-time-start)
-                               (thing-at-point 'line)) :to-equal "01:02:03.004 
--> 02:03:04.060\n")))
-    )
+                               (thing-at-point 'line))
+               :to-equal "01:02:03.004 --> 02:03:04.060\n"))))
 
   (describe "Inserting a subtitle"
     (describe "in an empty buffer"
[Prev in Thread]
Current Thread
[Next in Thread]
[nongnu] elpa/subed updated (74793f0653 -> a63a472039), ELPA Syncer, 2022/10/26
- [nongnu] elpa/subed a63a472039 2/2: Add autoload cookies, ELPA Syncer, 2022/10/26
- [nongnu] elpa/subed 15840c389f 1/2: Add subed-word-data.el with subed-word-data-load-from-file for SRV2, ELPA Syncer <=
Prev by Date: [nongnu] elpa/subed a63a472039 2/2: Add autoload cookies
Next by Date: [elpa] main b54540b503: elpa-packages (hiddenquote): Enable auto-sync
Previous by thread: [nongnu] elpa/subed a63a472039 2/2: Add autoload cookies
Next by thread: [elpa] main b54540b503: elpa-packages (hiddenquote): Enable auto-sync
Index(es):
- Date
- Thread