groff-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[groff] 02/02: Use uchardet library in `preconv' to detect input file encoding


From: Bertrand Garrigues
Subject: [groff] 02/02: Use uchardet library in `preconv' to detect input file encoding
Date: Fri, 21 Jul 2017 20:25:27 -0400 (EDT)

bgarrigues pushed a commit to branch master
in repository groff.

commit 75c0873c3f0d9f353b967e316c20ee1bb06d1e86
Author: Bertrand Garrigues <address@hidden>
Date:   Tue Jul 4 23:40:46 2017 +0200

    Use uchardet library in `preconv' to detect input file encoding
    
    * m4/groff.m4 (GROFF_UCHARDET): new macro that uses pkg-config to
    check if uchardet library is available and define new option
    `--with-uchardet':
    - If `yes' is passed and uchardet is not found, configure fails.
    - If `no' is passed we don't use uchardet and don't display any
    warning.
    - If `auto' (or any other option) is passed or if --with-uchardet
    is not passed, we use uchardet if available, otherwise the build
    goes on but a warning is displayed at the end of the configuration
    phase.
    
    * configure.ac: use pkg-config and `GROFF_UCHARDET' to detect the
    presence of uchardet library.
    
    * src/preproc/preconv/preconv.am: link against uchardet library if
    available.
    
    * src/preproc/preconv/preconv.cpp (detect_file_encoding): new
    function that uses uchardet library to detect input file encoding.
    (do_file): call `detect_file_encoding'.
    
    * src/preproc/preconv/preconv.1.man: update `preconv'
    documentation.
    
    See https://savannah.gnu.org/bugs/?51330
---
 ChangeLog                         | 30 +++++++++++++
 configure.ac                      | 20 +++++----
 m4/groff.m4                       | 36 ++++++++++++++++
 src/preproc/preconv/preconv.1.man | 11 ++++-
 src/preproc/preconv/preconv.am    |  3 +-
 src/preproc/preconv/preconv.cpp   | 88 +++++++++++++++++++++++++++++++++++++--
 6 files changed, 173 insertions(+), 15 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index b4d328f..44645b9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,35 @@
 2017-07-12  Bertrand Garrigues <address@hidden>
 
+       Use uchardet library in `preconv' to detect input file encoding
+
+       * m4/groff.m4 (GROFF_UCHARDET): new macro that uses pkg-config to
+       check if uchardet library is available and define new option
+       `--with-uchardet':
+       - If `yes' is passed and uchardet is not found, configure fails.
+       - If `no' is passed we don't use uchardet and don't display any
+       warning.
+       - If `auto' (or any other option) is passed or if --with-uchardet
+       is not passed, we use uchardet if available, otherwise the build
+       goes on but a warning is displayed at the end of the configuration
+       phase.
+
+       * configure.ac: use pkg-config and `GROFF_UCHARDET' to detect the
+       presence of uchardet library.
+
+       * src/preproc/preconv/preconv.am: link against uchardet library if
+       available.
+
+       * src/preproc/preconv/preconv.cpp (detect_file_encoding): new
+       function that uses uchardet library to detect input file encoding.
+       (do_file): call `detect_file_encoding'.
+
+       * src/preproc/preconv/preconv.1.man: update `preconv'
+       documentation.
+
+       See https://savannah.gnu.org/bugs/?51330
+
+2017-07-12  Bertrand Garrigues <address@hidden>
+
        Add gnulib module fprintf-posix
 
        * bootstrap.conf: do it.
diff --git a/configure.ac b/configure.ac
index 0558515..97b6cad 100644
--- a/configure.ac
+++ b/configure.ac
@@ -87,6 +87,8 @@ AC_PROG_INSTALL
 AC_PROG_LN_S
 AC_PROG_MKDIR_P
 GROFF_PROG_XPMTOPPM
+PKG_PROG_PKG_CONFIG
+GROFF_UCHARDET
 
 # use a dummy substitution if no csh hack is necessary to avoid errors
 # with non-GNU sed programs
@@ -219,27 +221,29 @@ AC_OUTPUT
 echo "
 ${PACKAGE_NAME} version ${PACKAGE_VERSION}
 ----------------------------------------------------------------------
- Prefix            : ${prefix}
- Compiler          : ${CC} ${CFLAGS} ${CPPFLAGS}"
+ Prefix                          : ${prefix}
+ Compiler                        : ${CC} ${CFLAGS} ${CPPFLAGS}"
 if test "x$groff_no_x" = "xyes"; then
 echo "\
- X11 support       : no"
+ X11 support                     : no"
 else
 echo "\
- X11 support       : yes
- X11 resources dir : $appresdir"
+ X11 support                     : yes
+ X11 resources dir               : $appresdir"
 fi
 if test "x$doc" = x; then
 echo "\
- Doc build         : no"
+ Doc build                       : no"
 else
 echo "\
- Doc build         : ${doc} "
+ Doc build                       : ${doc} "
 fi
 echo "\
- URW fonts for pdf : $groff_have_urw_fonts"
+ URW fonts for pdf               : $groff_have_urw_fonts
+ Use uchardet library for preconv: $groff_have_uchardet"
 echo "\
 ----------------------------------------------------------------------"
 
 GROFF_APPRESDIR_CHECK
 GROFF_URW_FONTS_CHECK
+GROFF_UCHARDET_CHECK
diff --git a/m4/groff.m4 b/m4/groff.m4
index d1727cd..1f41649 100644
--- a/m4/groff.m4
+++ b/m4/groff.m4
@@ -1621,3 +1621,39 @@ AC_DEFUN([GROFF_BASH],
      BASH_PROG=/bin/sh
   fi
   AC_SUBST([BASH_PROG])])
+
+# Looking for uchardet library, used by preconv.
+AC_DEFUN([GROFF_UCHARDET],
+  [AC_ARG_WITH([uchardet],
+               AS_HELP_STRING([--with-uchardet],
+                              [Build `preconv' with uchardet library for file \
+                               encoding automatic detection [=auto|no|yes]]))
+   AS_IF([test "x$with_uchardet" != "xno"],
+         [PKG_CHECK_MODULES([UCHARDET],
+                            [uchardet >= 0.0.1],
+                            [AC_DEFINE([HAVE_UCHARDET],
+                                       [1],
+                                       [uchardet library availability])
+                             groff_have_uchardet=yes],
+                            [if test "x$with_uchardet" = "xyes"; then
+                               AC_MSG_FAILURE([Could not found uchardet library])
+                             else
+                               AC_MSG_WARN([uchardet library not found, preconv \
+                                            might not work properly])
+                             fi
+                             groff_have_uchardet=no])],
+          [groff_have_uchardet=no]
+          )])
+
+# Warning if uchardet library was not found
+AC_DEFUN([GROFF_UCHARDET_CHECK],
+  [if test "x$groff_have_uchardet" = "xno" -a "x$with_uchardet" != "xno"; then
+  AC_MSG_WARN([
+  uchardet library was not found, preprocessor `preconv', which uses it to detect
+  the input file encoding, might not work properly (to check how and in which
+  order `preconv' tries to determine the file encoding see its man page).
+  ])
+  fi
+  ])
+
+
diff --git a/src/preproc/preconv/preconv.1.man b/src/preproc/preconv/preconv.1.man
index ce93d93..70ae3c3 100644
--- a/src/preproc/preconv/preconv.1.man
+++ b/src/preproc/preconv/preconv.1.man
@@ -142,13 +142,20 @@ Otherwise, check whether the input starts with a
 If found, use it.
 .
 .IP 3.
-Finally, check whether there is a known
+Otherwise, check whether there is a known
 .I coding tag
 (see below) in either the first or second input line.
 .
 If found, use it.
 .
-.IP 4.
+.IP 4
+Finally, if
+.BR uchardet
+library
+(an encoding detector library available on most major distributions)
+is available on the system, use it to try to detect the encoding of the file.
+.
+.IP 5.
 If everything fails, use a default encoding as given with option
 .BR \-D ,
 by the current locale, or \[oq]latin1\[cq] if the locale is set to
diff --git a/src/preproc/preconv/preconv.am b/src/preproc/preconv/preconv.am
index 942eb0a..c873592 100644
--- a/src/preproc/preconv/preconv.am
+++ b/src/preproc/preconv/preconv.am
@@ -19,8 +19,9 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 bin_PROGRAMS += preconv
-preconv_LDADD = libgroff.a $(LIBM) $(LIBICONV)
+preconv_LDADD = libgroff.a $(LIBM) $(LIBICONV) $(UCHARDET_LIBS)
 preconv_SOURCES = src/preproc/preconv/preconv.cpp
+preconv_CPPFLAGS = $(AM_CPPFLAGS) $(UCHARDET_CFLAGS)
 man1_MANS += src/preproc/preconv/preconv.1
 EXTRA_DIST += src/preproc/preconv/preconv.1.man
 
diff --git a/src/preproc/preconv/preconv.cpp b/src/preproc/preconv/preconv.cpp
index 2819b5e..b7241b0 100644
--- a/src/preproc/preconv/preconv.cpp
+++ b/src/preproc/preconv/preconv.cpp
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-/* Copyright (C) 2005-2014  Free Software Foundation, Inc.
+/* Copyright (C) 2005-2017  Free Software Foundation, Inc.
      Written by Werner Lemberg (address@hidden)
 
 This file is part of groff.
@@ -22,6 +22,10 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */
 #include <assert.h>
 #include <stdlib.h>
 #include <errno.h>
+#include <sys/stat.h>
+#ifdef HAVE_UCHARDET
+#include <uchardet/uchardet.h>
+#endif
 #include "errarg.h"
 #include "error.h"
 #include "localcharset.h"
@@ -997,6 +1001,67 @@ check_coding_tag(FILE *fp, string &data)
   return NULL;
 }
 
+char *
+detect_file_encoding(FILE *fp)
+{
+#ifdef HAVE_UCHARDET
+  uchardet_t ud;
+  struct stat stat_buf;
+  size_t len, read_bytes;
+  char *data;
+  int res, current_position;
+  const char *charset;
+  char *ret;
+
+  current_position = ftell(fp);
+  /* due to BOM and tag detection we are not at the begining of the file */
+  rewind(fp);
+  if (fstat(fileno(fp), &stat_buf) != 0) {
+    fprintf(stderr, "fstat: %s\n", strerror(errno));
+    return NULL;
+  }
+  len = stat_buf.st_size;
+  if (debug_flag)
+    fprintf(stderr, "  len: %zu\n", len);  
+  if (len == 0)
+    return NULL;
+  data = (char *)calloc(len, 1);
+  read_bytes = fread(data, 1, len, fp);
+  if (read_bytes == 0) {
+    fprintf(stderr, "fread: %s\n", strerror(errno));
+    return NULL;
+  }
+  /* We rewind back to the original position */
+  if (fseek(fp, current_position, SEEK_SET) != 0) {
+    fprintf(stderr, "Fatal error: fseek: %s\n", strerror(errno));
+    return NULL;
+  }
+  ud = uchardet_new();
+  res = uchardet_handle_data(ud, data, len);
+  if (res != 0) {
+    fprintf(stderr, "uchardet_handle_data: %d\n", res);
+    uchardet_delete(ud);
+    return NULL;
+  }
+  if (debug_flag)
+    fprintf(stderr, "  uchardet read: %zu bytes\n", read_bytes);
+  uchardet_data_end(ud);
+  charset = uchardet_get_charset(ud);
+  if (debug_flag)
+    fprintf(stderr, "  charset: %s\n", charset);
+  if (charset) {
+    ret = (char *)calloc(strlen(charset) + 1, 1);
+    strcpy(ret, charset);
+  }
+  uchardet_delete(ud);
+  free(data);
+
+  return ret;
+#else /* not HAVE_UCHARDET */
+  return NULL;
+#endif /* not HAVE_UCHARDET */
+}
+
 // ---------------------------------------------------------
 // Handle an input file.  If filename is `-' handle stdin.
 //
@@ -1025,6 +1090,7 @@ do_file(const char *filename)
   const char *BOM_encoding = get_BOM(fp, BOM, data);
   // Determine the encoding.
   char *encoding;
+  int must_free_encoding = 0;
   if (user_encoding[0]) {
     if (debug_flag) {
       fprintf(stderr, "  user-specified encoding `%s', "
@@ -1046,8 +1112,15 @@ do_file(const char *filename)
     char *file_encoding = check_coding_tag(fp, data);
     if (!file_encoding) {
       if (debug_flag)
-       fprintf(stderr, "  no file encoding\n");
-      file_encoding = default_encoding;
+       fprintf(stderr, "  no encoding tag\n");
+      file_encoding = detect_file_encoding(fp);
+      if (!file_encoding) {
+        if (debug_flag)
+          fprintf(stderr, "  could not detect encoding with uchardet\n");
+        file_encoding = default_encoding;
+      }
+      else
+        must_free_encoding = 1;
     }
     else
       if (debug_flag)
@@ -1056,6 +1129,8 @@ do_file(const char *filename)
   }
   strncpy(encoding_string, encoding, MAX_VAR_LEN - 1);
   encoding_string[MAX_VAR_LEN - 1] = 0;
+  if (must_free_encoding)
+    free(encoding);
   encoding = encoding_string;
   // Translate from MIME & Emacs encoding names to locale encoding names.
   encoding = emacs2mime(encoding_string);
@@ -1143,13 +1218,18 @@ main(int argc, char **argv)
                            "dD:e:hrv", long_options, NULL)) != EOF)
     switch (opt) {
     case 'v':
-      printf("GNU preconv (groff) version %s %s iconv support\n",
+      printf("GNU preconv (groff) version %s %s iconv support and %s uchardet support\n",
             Version_string,
 #ifdef HAVE_ICONV
             "with"
 #else
             "without"
 #endif /* HAVE_ICONV */
+#ifdef HAVE_UCHARDET
+             "with"
+#else
+             "without"
+#endif /* HAVE_UCHARDET */
            );
       exit(0);
       break;



reply via email to

[Prev in Thread] Current Thread [Next in Thread]