[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[groff] 02/02: Use uchardet library in `preconv' to detect input file encoding
From: |
Bertrand Garrigues |
Subject: |
[groff] 02/02: Use uchardet library in `preconv' to detect input file encoding |
Date: |
Fri, 21 Jul 2017 20:25:27 -0400 (EDT) |
bgarrigues pushed a commit to branch master
in repository groff.
commit 75c0873c3f0d9f353b967e316c20ee1bb06d1e86
Author: Bertrand Garrigues <address@hidden>
Date: Tue Jul 4 23:40:46 2017 +0200
Use uchardet library in `preconv' to detect input file encoding
* m4/groff.m4 (GROFF_UCHARDET): new macro that uses pkg-config to
check if uchardet library is available and define new option
`--with-uchardet':
- If `yes' is passed and uchardet is not found, configure fails.
- If `no' is passed we don't use uchardet and don't display any
warning.
- If `auto' (or any other option) is passed or if --with-uchardet
is not passed, we use uchardet if available, otherwise the build
goes on but a warning is displayed at the end of the configuration
phase.
* configure.ac: use pkg-config and `GROFF_UCHARDET' to detect the
presence of uchardet library.
* src/preproc/preconv/preconv.am: link against uchardet library if
available.
* src/preproc/preconv/preconv.cpp (detect_file_encoding): new
function that uses uchardet library to detect input file encoding.
(do_file): call `detect_file_encoding'.
* src/preproc/preconv/preconv.1.man: update `preconv'
documentation.
See https://savannah.gnu.org/bugs/?51330
---
ChangeLog | 30 +++++++++++++
configure.ac | 20 +++++----
m4/groff.m4 | 36 ++++++++++++++++
src/preproc/preconv/preconv.1.man | 11 ++++-
src/preproc/preconv/preconv.am | 3 +-
src/preproc/preconv/preconv.cpp | 88 +++++++++++++++++++++++++++++++++++++--
6 files changed, 173 insertions(+), 15 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index b4d328f..44645b9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,35 @@
2017-07-12 Bertrand Garrigues <address@hidden>
+ Use uchardet library in `preconv' to detect input file encoding
+
+ * m4/groff.m4 (GROFF_UCHARDET): new macro that uses pkg-config to
+ check if uchardet library is available and define new option
+ `--with-uchardet':
+ - If `yes' is passed and uchardet is not found, configure fails.
+ - If `no' is passed we don't use uchardet and don't display any
+ warning.
+ - If `auto' (or any other option) is passed or if --with-uchardet
+ is not passed, we use uchardet if available, otherwise the build
+ goes on but a warning is displayed at the end of the configuration
+ phase.
+
+ * configure.ac: use pkg-config and `GROFF_UCHARDET' to detect the
+ presence of uchardet library.
+
+ * src/preproc/preconv/preconv.am: link against uchardet library if
+ available.
+
+ * src/preproc/preconv/preconv.cpp (detect_file_encoding): new
+ function that uses uchardet library to detect input file encoding.
+ (do_file): call `detect_file_encoding'.
+
+ * src/preproc/preconv/preconv.1.man: update `preconv'
+ documentation.
+
+ See https://savannah.gnu.org/bugs/?51330
+
+2017-07-12 Bertrand Garrigues <address@hidden>
+
Add gnulib module fprintf-posix
* bootstrap.conf: do it.
diff --git a/configure.ac b/configure.ac
index 0558515..97b6cad 100644
--- a/configure.ac
+++ b/configure.ac
@@ -87,6 +87,8 @@ AC_PROG_INSTALL
AC_PROG_LN_S
AC_PROG_MKDIR_P
GROFF_PROG_XPMTOPPM
+PKG_PROG_PKG_CONFIG
+GROFF_UCHARDET
# use a dummy substitution if no csh hack is necessary to avoid errors
# with non-GNU sed programs
@@ -219,27 +221,29 @@ AC_OUTPUT
echo "
${PACKAGE_NAME} version ${PACKAGE_VERSION}
----------------------------------------------------------------------
- Prefix : ${prefix}
- Compiler : ${CC} ${CFLAGS} ${CPPFLAGS}"
+ Prefix : ${prefix}
+ Compiler : ${CC} ${CFLAGS} ${CPPFLAGS}"
if test "x$groff_no_x" = "xyes"; then
echo "\
- X11 support : no"
+ X11 support : no"
else
echo "\
- X11 support : yes
- X11 resources dir : $appresdir"
+ X11 support : yes
+ X11 resources dir : $appresdir"
fi
if test "x$doc" = x; then
echo "\
- Doc build : no"
+ Doc build : no"
else
echo "\
- Doc build : ${doc} "
+ Doc build : ${doc} "
fi
echo "\
- URW fonts for pdf : $groff_have_urw_fonts"
+ URW fonts for pdf : $groff_have_urw_fonts
+ Use uchardet library for preconv: $groff_have_uchardet"
echo "\
----------------------------------------------------------------------"
GROFF_APPRESDIR_CHECK
GROFF_URW_FONTS_CHECK
+GROFF_UCHARDET_CHECK
diff --git a/m4/groff.m4 b/m4/groff.m4
index d1727cd..1f41649 100644
--- a/m4/groff.m4
+++ b/m4/groff.m4
@@ -1621,3 +1621,39 @@ AC_DEFUN([GROFF_BASH],
BASH_PROG=/bin/sh
fi
AC_SUBST([BASH_PROG])])
+
+# Look for the uchardet library, which preconv can use to guess the
+# encoding of its input.  Adds the configure option `--with-uchardet':
+#   yes   - configure fails if uchardet is not found
+#   no    - uchardet is not looked for at all
+#   auto  - (or any other value, or option not given) uchardet is used
+#           if pkg-config finds it, otherwise a warning is printed
+# On success HAVE_UCHARDET is defined and PKG_CHECK_MODULES supplies
+# UCHARDET_CFLAGS and UCHARDET_LIBS.  The shell variable
+# `groff_have_uchardet' is set to `yes' or `no' in every case.
+AC_DEFUN([GROFF_UCHARDET],
+  [AC_ARG_WITH([uchardet],
+    AS_HELP_STRING([--with-uchardet],
+      [Build `preconv' with uchardet library for file \
+       encoding automatic detection [=auto|no|yes]]))
+  AS_IF([test "x$with_uchardet" != "xno"],
+    [PKG_CHECK_MODULES([UCHARDET],
+       [uchardet >= 0.0.1],
+       [AC_DEFINE([HAVE_UCHARDET],
+          [1],
+          [uchardet library availability])
+        groff_have_uchardet=yes],
+       [if test "x$with_uchardet" = "xyes"; then
+          AC_MSG_FAILURE([Could not find uchardet library])
+        else
+          AC_MSG_WARN([uchardet library not found, preconv \
+            might not work properly])
+        fi
+        groff_have_uchardet=no])],
+    [groff_have_uchardet=no]
+  )])
+
+# Remind the user that uchardet was wanted (it was not disabled with
+# `--with-uchardet=no') but could not be found.  configure.ac calls
+# this after the final configuration summary.
+AC_DEFUN([GROFF_UCHARDET_CHECK],
+  [if test "x$groff_have_uchardet" = "xno" \
+      && test "x$with_uchardet" != "xno"; then
+    AC_MSG_WARN([
+  uchardet library was not found, preprocessor `preconv', which uses it to detect
+  the input file encoding, might not work properly (to check how and in which
+  order `preconv' tries to determine the file encoding see its man page).
+    ])
+  fi
+  ])
+
+
diff --git a/src/preproc/preconv/preconv.1.man
b/src/preproc/preconv/preconv.1.man
index ce93d93..70ae3c3 100644
--- a/src/preproc/preconv/preconv.1.man
+++ b/src/preproc/preconv/preconv.1.man
@@ -142,13 +142,20 @@ Otherwise, check whether the input starts with a
If found, use it.
.
.IP 3.
-Finally, check whether there is a known
+Otherwise, check whether there is a known
.I coding tag
(see below) in either the first or second input line.
.
If found, use it.
.
-.IP 4.
+.IP 4.
+Finally, if the
+.BR uchardet
+library
+(an encoding detector library available on most major distributions)
+is available on the system, use it to try to detect the encoding of the file.
+.
+.IP 5.
If everything fails, use a default encoding as given with option
.BR \-D ,
by the current locale, or \[oq]latin1\[cq] if the locale is set to
diff --git a/src/preproc/preconv/preconv.am b/src/preproc/preconv/preconv.am
index 942eb0a..c873592 100644
--- a/src/preproc/preconv/preconv.am
+++ b/src/preproc/preconv/preconv.am
@@ -19,8 +19,9 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
bin_PROGRAMS += preconv
-preconv_LDADD = libgroff.a $(LIBM) $(LIBICONV)
+preconv_LDADD = libgroff.a $(LIBM) $(LIBICONV) $(UCHARDET_LIBS)
preconv_SOURCES = src/preproc/preconv/preconv.cpp
+preconv_CPPFLAGS = $(AM_CPPFLAGS) $(UCHARDET_CFLAGS)
man1_MANS += src/preproc/preconv/preconv.1
EXTRA_DIST += src/preproc/preconv/preconv.1.man
diff --git a/src/preproc/preconv/preconv.cpp b/src/preproc/preconv/preconv.cpp
index 2819b5e..b7241b0 100644
--- a/src/preproc/preconv/preconv.cpp
+++ b/src/preproc/preconv/preconv.cpp
@@ -1,5 +1,5 @@
// -*- C++ -*-
-/* Copyright (C) 2005-2014 Free Software Foundation, Inc.
+/* Copyright (C) 2005-2017 Free Software Foundation, Inc.
Written by Werner Lemberg (address@hidden)
This file is part of groff.
@@ -22,6 +22,10 @@ along with this program. If not, see
<http://www.gnu.org/licenses/>. */
#include <assert.h>
#include <stdlib.h>
#include <errno.h>
+#include <sys/stat.h>
+#ifdef HAVE_UCHARDET
+#include <uchardet/uchardet.h>
+#endif
#include "errarg.h"
#include "error.h"
#include "localcharset.h"
@@ -997,6 +1001,67 @@ check_coding_tag(FILE *fp, string &data)
return NULL;
}
+// ---------------------------------------------------------
+// Guess the encoding of the file behind `fp' with uchardet.
+//
+// The whole file is read from its start and handed to the
+// detector; the stream is then put back at the offset it had
+// on entry (BOM and coding tag detection have already moved
+// it away from the beginning of the file).
+//
+// Return a copy of the detected charset name, which the caller
+// must release with free(), or NULL if the stream position
+// cannot be determined, the file is empty, an error occurred,
+// or uchardet could not settle on a charset.  Without
+// HAVE_UCHARDET the function always returns NULL.
+// ---------------------------------------------------------
+char *
+detect_file_encoding(FILE *fp)
+{
+#ifdef HAVE_UCHARDET
+  uchardet_t ud;
+  struct stat stat_buf;
+  size_t len, read_bytes;
+  char *data;
+  int res;
+  long current_position;
+  const char *charset;
+  char *ret = NULL;
+
+  // ftell() returns a long and yields -1 on failure (e.g. for a pipe);
+  // without a valid offset the stream could not be restored afterwards.
+  current_position = ftell(fp);
+  if (current_position < 0) {
+    if (debug_flag)
+      fprintf(stderr, " ftell: %s\n", strerror(errno));
+    return NULL;
+  }
+  // Query the size before moving the stream, so that the early returns
+  // below leave the position untouched.
+  if (fstat(fileno(fp), &stat_buf) != 0) {
+    fprintf(stderr, "fstat: %s\n", strerror(errno));
+    return NULL;
+  }
+  len = stat_buf.st_size > 0 ? (size_t)stat_buf.st_size : 0;
+  if (debug_flag)
+    fprintf(stderr, " len: %zu\n", len);
+  if (len == 0)
+    return NULL;
+  data = (char *)calloc(len, 1);
+  if (!data) {
+    fprintf(stderr, "calloc: %s\n", strerror(errno));
+    return NULL;
+  }
+  /* due to BOM and tag detection we are not at the beginning of the file */
+  rewind(fp);
+  read_bytes = fread(data, 1, len, fp);
+  if (read_bytes == 0)
+    fprintf(stderr, "fread: %s\n", strerror(errno));
+  /* We rewind back to the original position, even if the read failed */
+  if (fseek(fp, current_position, SEEK_SET) != 0) {
+    fprintf(stderr, "Fatal error: fseek: %s\n", strerror(errno));
+    free(data);
+    return NULL;
+  }
+  if (read_bytes == 0) {
+    free(data);
+    return NULL;
+  }
+  ud = uchardet_new();
+  if (!ud) {
+    fprintf(stderr, "uchardet_new: could not create detector\n");
+    free(data);
+    return NULL;
+  }
+  // Feed only the bytes actually read; after a short read the tail of
+  // `data' is just the zero fill from calloc().
+  res = uchardet_handle_data(ud, data, read_bytes);
+  if (res != 0)
+    fprintf(stderr, "uchardet_handle_data: %d\n", res);
+  else {
+    if (debug_flag)
+      fprintf(stderr, " uchardet read: %zu bytes\n", read_bytes);
+    uchardet_data_end(ud);
+    charset = uchardet_get_charset(ud);
+    if (debug_flag)
+      fprintf(stderr, " charset: %s\n", charset ? charset : "");
+    // uchardet signals "no result" with an empty string, which must not
+    // be handed back as an encoding name.
+    if (charset && charset[0] != '\0') {
+      ret = (char *)calloc(strlen(charset) + 1, 1);
+      if (ret)
+        strcpy(ret, charset);
+    }
+  }
+  // NOTE(review): `charset' is presumably owned by `ud', hence the copy
+  // is made before the detector is deleted.
+  uchardet_delete(ud);
+  free(data);
+
+  return ret;
+#else /* not HAVE_UCHARDET */
+  return NULL;
+#endif /* not HAVE_UCHARDET */
+}
+
// ---------------------------------------------------------
// Handle an input file. If filename is `-' handle stdin.
//
@@ -1025,6 +1090,7 @@ do_file(const char *filename)
const char *BOM_encoding = get_BOM(fp, BOM, data);
// Determine the encoding.
char *encoding;
+ int must_free_encoding = 0;
if (user_encoding[0]) {
if (debug_flag) {
fprintf(stderr, " user-specified encoding `%s', "
@@ -1046,8 +1112,15 @@ do_file(const char *filename)
char *file_encoding = check_coding_tag(fp, data);
if (!file_encoding) {
if (debug_flag)
- fprintf(stderr, " no file encoding\n");
- file_encoding = default_encoding;
+ fprintf(stderr, " no encoding tag\n");
+ file_encoding = detect_file_encoding(fp);
+ if (!file_encoding) {
+ if (debug_flag)
+ fprintf(stderr, " could not detect encoding with uchardet\n");
+ file_encoding = default_encoding;
+ }
+ else
+ must_free_encoding = 1;
}
else
if (debug_flag)
@@ -1056,6 +1129,8 @@ do_file(const char *filename)
}
strncpy(encoding_string, encoding, MAX_VAR_LEN - 1);
encoding_string[MAX_VAR_LEN - 1] = 0;
+ if (must_free_encoding)
+ free(encoding);
encoding = encoding_string;
// Translate from MIME & Emacs encoding names to locale encoding names.
encoding = emacs2mime(encoding_string);
@@ -1143,13 +1218,18 @@ main(int argc, char **argv)
"dD:e:hrv", long_options, NULL)) != EOF)
switch (opt) {
case 'v':
-	printf("GNU preconv (groff) version %s %s iconv support\n",
+	// Three %s conversions, three arguments: each feature string needs
+	// its own trailing comma, otherwise adjacent literals are merged
+	// into a single argument.
+	printf("GNU preconv (groff) version %s %s iconv support and %s"
+	       " uchardet support\n",
 	       Version_string,
 #ifdef HAVE_ICONV
-	       "with"
+	       "with",
 #else
-	       "without"
+	       "without",
 #endif /* HAVE_ICONV */
+#ifdef HAVE_UCHARDET
+	       "with"
+#else
+	       "without"
+#endif /* HAVE_UCHARDET */
 	       );
exit(0);
break;
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [groff] 02/02: Use uchardet library in `preconv' to detect input file encoding,
Bertrand Garrigues <=