emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Emacs-diffs] trunk r118264: Add a new, somewhat experimental "readabili


From: Lars Ingebrigtsen
Subject: [Emacs-diffs] trunk r118264: Add a new, somewhat experimental "readability" command to eww
Date: Mon, 03 Nov 2014 00:01:25 +0000
User-agent: Bazaar (2.6b2)

------------------------------------------------------------
revno: 118264
revision-id: address@hidden
parent: address@hidden
committer: Lars Magne Ingebrigtsen <address@hidden>
branch nick: trunk
timestamp: Mon 2014-11-03 01:01:20 +0100
message:
  Add a new, somewhat experimental "readability" command to eww
  
  * net/eww.el (eww-readable): New command and keystroke.
  
  * net/shr.el (shr-retransform-dom): New function.
modified:
  etc/NEWS                       news-20141002041645-34n5fasbwydbo8t6-1
  lisp/ChangeLog                 changelog-20091113204419-o5vbwnq5f7feedwu-1432
  lisp/net/eww.el                eww.el-20130610114603-80ap3gwnw4x4m5ix-1
  lisp/net/shr.el                shr.el-20101002102929-yfzewk55rsg0mn93-1
=== modified file 'etc/NEWS'
--- a/etc/NEWS  2014-10-31 21:35:35 +0000
+++ b/etc/NEWS  2014-11-03 00:01:20 +0000
@@ -133,6 +133,12 @@
 *** New minor mode global-eldoc-mode
 *** eldoc-documentation-function now defaults to nil
 
+** eww
+
+*** A new command `R' (`eww-readable') will try do identify the main
+textual parts of a web page and display only that, leaving menus and
+the like off the page.
+
 ** Message mode
 
 *** text/html messages that contain inline image parts will be

=== modified file 'lisp/ChangeLog'
--- a/lisp/ChangeLog    2014-11-02 23:02:01 +0000
+++ b/lisp/ChangeLog    2014-11-03 00:01:20 +0000
@@ -1,5 +1,9 @@
 2014-11-02  Lars Magne Ingebrigtsen  <address@hidden>
 
+       * net/eww.el (eww-readable): New command and keystroke.
+
+       * net/shr.el (shr-retransform-dom): New function.
+
        * net/eww.el (eww-display-html): Set `eww-current-source' in the
        correct buffer.
        (eww-view-source): Use it.

=== modified file 'lisp/net/eww.el'
--- a/lisp/net/eww.el   2014-11-02 23:02:01 +0000
+++ b/lisp/net/eww.el   2014-11-03 00:01:20 +0000
@@ -402,6 +402,7 @@
   (setq-local eww-contents-url nil))
 
 (defun eww-view-source ()
+  "View the HTML source code of the current page."
   (interactive)
   (let ((buf (get-buffer-create "*eww-source*"))
         (source eww-current-source))
@@ -413,6 +414,60 @@
         (html-mode)))
     (view-buffer buf)))
 
+(defun eww-readable ()
+  "View the main \"readable\" parts of the current web page.
+This command uses heuristics to find the parts of the web page that
+contains the main textual portion, leaving out navigation menus and
+the like."
+  (interactive)
+  (let* ((source eww-current-source)
+        (dom (shr-transform-dom
+              (with-temp-buffer
+                (insert source)
+                (libxml-parse-html-region (point-min) (point-max))))))
+    (eww-score-readability dom)
+    (eww-display-html 'utf-8 nil (shr-retransform-dom
+                                 (eww-highest-readability dom)))
+    (setq eww-current-source source)))
+
+(defun eww-score-readability (node)
+  (let ((score -1))
+    (cond
+     ((memq (car node) '(script head))
+      (setq score -2))
+     ((eq (car node) 'meta)
+      (setq score -1))
+     ((eq (car node) 'a)
+      (setq score (- (length (split-string
+                             (or (cdr (assoc 'text (cdr node))) ""))))))
+     (t
+      (dolist (elem (cdr node))
+       (cond
+        ((eq (car elem) 'text)
+         (setq score (+ score (length (split-string (cdr elem))))))
+        ((consp (cdr elem))
+         (setq score (+ score
+                        (or (cdr (assoc :eww-readability-score (cdr elem)))
+                            (eww-score-readability elem)))))))))
+    ;; Cache the score of the node to avoid recomputing all the time.
+    (setcdr node (cons (cons :eww-readability-score score) (cdr node)))
+    score))
+
+(defun eww-highest-readability (node)
+  (let ((result node)
+       highest)
+    (dolist (elem (cdr node))
+      (when (and (consp (cdr elem))
+                (> (or (cdr (assoc
+                             :eww-readability-score
+                             (setq highest
+                                   (eww-highest-readability elem))))
+                       most-negative-fixnum)
+                   (or (cdr (assoc :eww-readability-score (cdr result)))
+                       most-negative-fixnum)))
+       (setq result highest)))
+    result))
+
 (defvar eww-mode-map
   (let ((map (make-sparse-keymap)))
     (suppress-keymap map)
@@ -435,6 +490,7 @@
     (define-key map "w" 'eww-copy-page-url)
     (define-key map "C" 'url-cookie-list)
     (define-key map "v" 'eww-view-source)
+    (define-key map "R" 'eww-readable)
     (define-key map "H" 'eww-list-histories)
 
     (define-key map "b" 'eww-add-bookmark)

=== modified file 'lisp/net/shr.el'
--- a/lisp/net/shr.el   2014-09-18 19:18:34 +0000
+++ b/lisp/net/shr.el   2014-11-03 00:01:20 +0000
@@ -370,6 +370,26 @@
        (push (shr-transform-dom sub) result)))
     (nreverse result)))
 
+(defun shr-retransform-dom (dom)
+  "Transform the shr DOM back into the libxml DOM."
+  (let ((tag (car dom))
+       (attributes nil)
+       (text nil)
+       (sub-nodes nil))
+    (dolist (elem (cdr dom))
+      (cond
+       ((eq (car elem) 'text)
+       (setq text (cdr elem)))
+       ((not (consp (cdr elem)))
+       (push (cons (intern (substring (symbol-name (car elem)) 1) obarray)
+                   (cdr elem))
+             attributes))
+       (t
+       (push (shr-retransform-dom elem) sub-nodes))))
+    (append (list tag (nreverse attributes))
+           (nreverse sub-nodes)
+           (and text (list text)))))
+
 (defsubst shr-generic (cont)
   (dolist (sub cont)
     (cond


reply via email to

[Prev in Thread] Current Thread [Next in Thread]