Index: mule.el =================================================================== RCS file: /cvsroot/emacs/emacs/lisp/international/mule.el,v retrieving revision 1.227 diff -u -c -r1.227 mule.el cvs server: conflicting specifications of output style *** mule.el 23 Oct 2005 18:24:00 -0000 1.227 --- mule.el 27 Oct 2005 23:56:29 -0000 *************** *** 1588,1594 **** (symbol :tag "Coding system")))) ;; See the bottom of this file for built-in auto coding functions. ! (defcustom auto-coding-functions '(sgml-xml-auto-coding-function sgml-html-meta-auto-coding-function) "A list of functions which attempt to determine a coding system. --- 1588,1595 ---- (symbol :tag "Coding system")))) ;; See the bottom of this file for built-in auto coding functions. ! (defcustom auto-coding-functions '(project-gutenberg-auto-coding-function ! sgml-xml-auto-coding-function sgml-html-meta-auto-coding-function) "A list of functions which attempt to determine a coding system. *************** *** 2204,2209 **** --- 2205,2315 ---- ;;; Built-in auto-coding-functions: + + (defun project-gutenberg-auto-coding-function (size) + "Determine character encoding of a Project Gutenberg EBook/Etext. + This function is designed for use in `auto-coding-functions'. + + A Project Gutenberg text has \"Project Gutenberg\" in the first line, and a + subsequent \"Character set encoding:\" line. The latter gives the coding + system. + + Some early non-ASCII texts don't have a \"Character set encoding:\", for + those you have to use other Emacs mechanisms (eg. \\[universal-coding-system-argument]). + + See http://www.gutenberg.org for more about Project Gutenberg." + + ;; This regexp identifies a Project Gutenberg file; it is kept fairly tight to + ;; avoid false matches. + ;; + ;; Many early Project Gutenberg files have different first lines, but the + ;; alternatives here are enough for the non-ASCII files existing in 2005. + ;; + ;; Some (but not all) UTF-8 files begin with a marker sequence EF BB BF (presumably what the optional leading three-character group in the regexp allows for -- confirm). 
+ + (and (looking-at "\\(...\\)?\\(Project Gutenberg\\('s\\)?\\|The Project Gutenberg\\|\\**This is a COPYRIGHTED Project Gutenberg\\) ") + + ;; The regexp here is "^Cha[rt]acter set encoding: *\\(.*\\)", except + ;; tweaked to avoid trailing spaces and \r in the match-string. + ;; + ;; Project Gutenberg files (usually) have CRLF line endings so \r is + ;; normal; and trailing spaces have been seen in a few files. + ;; + ;; "Chatacter" is a typo seen in about 220 files as of 2005 (though + ;; only 38 are non-ASCII). + ;; + (re-search-forward + "^Cha[rt]acter set encoding:[ \t\r]*\\(\\([ \t\r]*[^ \t\r\n]+\\)*\\)" + ;; only search first 200 lines (NOTE(review): the SIZE argument is never consulted, so this bound is independent of it -- confirm that is intended) + (save-excursion (forward-line 200) (point)) + t) + + ;; The character set names are slightly free form. They're perfectly + ;; understandable to a human, but need some massaging to get + ;; something `locale-charset-to-coding-system' can handle. The stuff + ;; below was tested on the full set of files in 2005. + ;; + ;; Some readme.txt files have "MP3" or the like given as the + ;; character set, which is bogus, it refers to the existence of .mp3 + ;; files, the .txt is plain ASCII. We let such cases get the warning + ;; message. 
+ + (let* ((orig-charset (match-string 1)) + (charset (downcase orig-charset))) + + ;; "ascii" -> "us-ascii" + ;; "iso-646-us (us-ascii)" -> "us-ascii" + (if (member charset '("ascii" "iso-646-us (us-ascii)")) + (setq charset "us-ascii")) + + ;; "ascii, with a few iso-8859-1 characters" etc -> "iso-8859-1" + ;; "acii, with some iso-8859-1 characters" -> "iso-8859-1" + ;; the "acii" is a typo in dvptn10.txt, easy enough to allow it + (setq charset (replace-regexp-in-string + "^as?cii[ (,]*with.* \\(iso-8859-[0-9]+\\).*" + "\\1" charset t)) + + ;; "cp-1250" -> "windows-1250" + ;; "cp1251" -> "windows-1251" + ;; "codepage 1250" -> "windows-1250" + ;; "windows codepage 1252" -> "windows-1252" + ;; "windows code page 1252" -> "windows-1252" + (setq charset (replace-regexp-in-string + "^\\(cp\\|codepage\\|windows \\(code ?page\\)?\\)[ -]*" + "windows-" charset t t)) + + ;; "unicode" alone -> "utf-8", found in 10752-8.txt + (setq charset (replace-regexp-in-string "^unicode\r?$" "utf-8" + charset t)) + + ;; "unicode utf-8" -> "utf-8" + (setq charset (replace-regexp-in-string "^unicode utf" "utf" + charset t t)) + + ;; "unicode (utf-8)" -> "utf-8" + (setq charset (replace-regexp-in-string "^unicode (\\(.*\\))$" "\\1" + charset t)) + + ;; "iso-8858-1" -> "iso-8859-1", typo in 10439-8.txt + (setq charset (replace-regexp-in-string "8858" "8859" charset t t)) + + ;; "ido-8859-1" -> "iso-8859-1", typo in 10549-8.txt + (setq charset (replace-regexp-in-string "^ido-" "iso-" charset t t)) + + ;; "iso 8859-1 (latin-1)" -> "latin-1" + (setq charset (replace-regexp-in-string + "^iso 8859-\\([0-9]+\\) (\\(latin-\\1\\))$" + "\\2" charset t)) + + ;; "iso=8859-1" -> "iso-8859-1" + ;; "big 5" -> "big-5" + (setq charset (replace-regexp-in-string "[= ]" "-" charset t t)) + + (or (locale-charset-to-coding-system charset) + (progn + (message "Warning: unknown coding system \"%s\"" + orig-charset) + nil))))) (defun sgml-xml-auto-coding-function (size) "Determine whether the buffer is XML, and 
if so, its encoding.