gnu-emacs-sources
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

cleanup.el (Was Replacing non-ascii characters)


From: Tom Breton
Subject: cleanup.el (Was Replacing non-ascii characters)
Date: 12 Dec 2005 17:41:37 -0500
User-agent: Gnus/5.0802 (Gnus v5.8.2) Emacs/20.4

Mark Elston asked on comp.emacs for something to ASCIIify text he
cut-and-pasted from the web.  Here's a little thing I use for just
that job:

;;; cleanup.el --- Clean up quoted material of a few common types

;; Copyright (C) 2005 by Tom Breton <address@hidden>

;; Author: Tom Breton <address@hidden>
;; Keywords: convenience

;; This file is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.

;; This file is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs; see the file COPYING.  If not, write to
;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA.

;;; Commentary:

;; Rationale: Do you sometimes cut and paste from web pages into
;; Emacs?  Sometimes the result isn't ASCII text, but an ugly
;; near-ASCII text that can't be saved with plain encoding.

;; Solution: Call `tehom-cleanup-quoted' for HTML that used entities
;; for characters (e.g., "&ldquo;").  Or call `tehom-cleanup-a-hat'
;; for the encoding where each escape sequence starts with the
;; circumflexed a (looks like "a" with a hat on it).

;; There's also `tehom-cleanup-google' that cleans up text that came
;; from Google groups.

;; If none of those apply, it's easy to define another translation:
;; Define a list where each element is two strings, (OLD-TEXT
;; NEW-TEXT).  If OLD-TEXT is not plain text, call
;; `tehom-escape-unprintables' on it to insert its escaped,
;; plain-text form.  Then define a command that calls
;; `tehom-cleanup-quoted-aux' with that list as its argument - and
;; there's your translator.

;; These translation lists are just compilations of the common escape
;; sequences that I found I needed to fix, so they can certainly be
;; extended.

;;; Code:

(defconst tehom-cleanup-html-equivs
   ;; Every entry has the shape (OLD-TEXT NEW-TEXT).
   (list
      ;; The only entry whose OLD-TEXT is computed from a character code.
      ;; NOTE(review): 2217 is presumably the pre-Unicode Emacs code for
      ;; the copyright sign (the replacement is "(c)"); on a Unicode-based
      ;; Emacs this number may denote a different character - confirm.
      (list (char-to-string 2217) "(c)")
      ;; Typographic quotes.
      (list "&ldquo;" "``")
      (list "&rdquo;" "''")
      (list "&lsquo;" "`")
      (list "&rsquo;" "'")
      ;; Ellipsis and dashes.
      (list "&hellip;" "...")
      (list "&ndash;" "-")
      (list "&mdash;" " -- "))
   "Table of (OLD-TEXT NEW-TEXT) pairs giving ASCII equivalents of HTML entities.")


(defconst tehom-cleanup-google-removals
   ;; Each piece of page boilerplate below is paired with the empty
   ;; string, giving entries of the shape (OLD-TEXT "").  Line breaks
   ;; inside the boilerplate are written as \n so that the spaces
   ;; around them stay visible.
   ;; NOTE(review): the line breaks inside the two "Reply | ..." strings
   ;; may be mail-wrapping artifacts rather than part of the real page
   ;; text - confirm against an actual capture.
   (mapcar
      (lambda (boilerplate) (list boilerplate ""))
      '("- Hide quoted text -\n- Show quoted text -\n\n"
        "Reply | Reply to Author | Forward | Print | Individual Message | \nShow original | Report Abuse"
        "Reply | Reply to Author | Forward | Print | View Thread | Show \noriginal | Report Abuse "
        " - Find messages by this author"))
   "Table of (OLD-TEXT \"\") pairs: Google groups text you probably want removed.")


(defvar tehom-cleanup-ahat-equivs
   ;; Every entry has the shape (OLD-TEXT NEW-TEXT); each OLD-TEXT is a
   ;; three-character sequence whose first character is \x8e2.
   ;; NOTE(review): \x8e2 is presumably the pre-Unicode Emacs code for
   ;; the circumflexed a; on a Unicode-based Emacs it may read as a
   ;; different character - confirm.
   (list
      (list "\x8e2??"        "'")
      (list "\x8e2?~"        "`")
      (list "\x8e2?\x8a6"    "...")
      (list "\x8e2\x3f\x8a2" "*>"))
   "Table of (OLD-TEXT NEW-TEXT) pairs giving ASCII equivalents of a-hat sequences.")

;;Maintenance helper.
(defun tehom-escape-unprintables (str)
   "Insert at point an escaped spelling of every character of STR.
Each character is written as a backslash, the letter x, and the
character's code in lowercase hexadecimal.  Useful when adding new
sequences to a translation table.  Returns nil."
   (let ((pos 0)
         (end (length str)))
      ;; One `insert' per character, in order.
      (while (< pos end)
         (insert (format "\\x%x" (aref str pos)))
         (setq pos (1+ pos)))))


;;;###autoload
(defun tehom-cleanup-quoted ()
   "Replace HTML entities in the current buffer with ASCII equivalents.
The substitutions come from `tehom-cleanup-html-equivs' and are
applied by `tehom-cleanup-quoted-aux'."
   (interactive)
   (let ((table tehom-cleanup-html-equivs))
      (tehom-cleanup-quoted-aux table)))


;;;###autoload
(defun tehom-cleanup-google ()
   "Strip Google groups boilerplate from the current buffer.
The text to remove is listed in `tehom-cleanup-google-removals' and
is removed by `tehom-cleanup-quoted-aux'."
   (interactive)
   (let ((table tehom-cleanup-google-removals))
      (tehom-cleanup-quoted-aux table)))

;;;###autoload
(defun tehom-cleanup-a-hat ()
   "Replace a-hat escape sequences in the current buffer with ASCII.
These are the sequences that each start with a circumflexed a.  The
substitutions come from `tehom-cleanup-ahat-equivs' and are applied
by `tehom-cleanup-quoted-aux'."
   (interactive)
   (tehom-cleanup-quoted-aux tehom-cleanup-ahat-equivs))


(defun tehom-cleanup-quoted-aux (replacements)
   "Apply every substitution in REPLACEMENTS to the current buffer.

REPLACEMENTS is a list whose elements have the form (OLD-TEXT
NEW-TEXT), both strings.  The entries are processed in order; for
each one, every occurrence of OLD-TEXT between the start and the end
of the accessible portion of the buffer is replaced by NEW-TEXT.
OLD-TEXT is matched as a literal string (the search honors
`case-fold-search'), and NEW-TEXT is inserted verbatim, with no case
conversion.  Entries whose OLD-TEXT is empty are skipped.

Afterwards point is moved to the start of the buffer and
`delete-blank-lines' is run twice there.

This is a helper for Lisp code, not a command: it needs an argument
that cannot be supplied interactively."
   (dolist (pair replacements)
      (let ((old-text (car pair))
            (new-text (car (cdr pair))))
         ;; An empty search string matches everywhere without moving
         ;; point, so it would never terminate; ignore such entries.
         (unless (equal old-text "")
            (goto-char (point-min))
            ;; Plain search/replace loop instead of `replace-string',
            ;; which is meant for interactive use.
            (while (search-forward old-text nil t)
               (replace-match new-text t t)))))

   ;; Tidy up blank lines at the top of the buffer.
   (goto-char (point-min))
   (delete-blank-lines)
   (delete-blank-lines))

;;; cleanup.el ends here

-- 
Tom Breton, the calm-eyed visionary


reply via email to

[Prev in Thread] Current Thread [Next in Thread]