[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[GNUnet-SVN] r9777 - in Extractor: . src/include src/main src/plugins
From: |
gnunet |
Subject: |
[GNUnet-SVN] r9777 - in Extractor: . src/include src/main src/plugins |
Date: |
Wed, 16 Dec 2009 15:14:01 +0100 |
Author: grothoff
Date: 2009-12-16 15:14:01 +0100 (Wed, 16 Dec 2009)
New Revision: 9777
Added:
Extractor/src/plugins/ole2_extractor.c
Removed:
Extractor/src/plugins/ole2/
Modified:
Extractor/configure.ac
Extractor/src/include/extractor.h
Extractor/src/main/extractor_metatypes.c
Extractor/src/plugins/Makefile.am
Extractor/src/plugins/man_extractor.c
Extractor/src/plugins/pdf_extractor.cc
Extractor/src/plugins/rpm_extractor.c
Log:
ole2
Modified: Extractor/configure.ac
===================================================================
--- Extractor/configure.ac 2009-12-16 13:25:07 UTC (rev 9776)
+++ Extractor/configure.ac 2009-12-16 14:14:01 UTC (rev 9777)
@@ -556,7 +556,6 @@
src/common/Makefile
src/main/Makefile
src/plugins/Makefile
-src/plugins/ole2/Makefile
src/plugins/oo/Makefile
src/plugins/printable/Makefile
src/plugins/hash/Makefile
Modified: Extractor/src/include/extractor.h
===================================================================
--- Extractor/src/include/extractor.h 2009-12-16 13:25:07 UTC (rev 9776)
+++ Extractor/src/include/extractor.h 2009-12-16 14:14:01 UTC (rev 9777)
@@ -237,13 +237,23 @@
/* image specifics */
EXTRACTOR_METATYPE_IMAGE_DIMENSIONS = 112,
-
-
EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE = 113,
EXTRACTOR_METATYPE_THUMBNAIL = 114,
EXTRACTOR_METATYPE_IMAGE_RESOLUTION = 115,
EXTRACTOR_METATYPE_SOURCE = 116,
+ /* (text) document processing specifics */
+ EXTRACTOR_METATYPE_CHARACTER_SET = 117,
+ EXTRACTOR_METATYPE_LINE_COUNT = 118,
+ EXTRACTOR_METATYPE_PARAGRAPH_COUNT = 119,
+ EXTRACTOR_METATYPE_WORD_COUNT = 120,
+ EXTRACTOR_METATYPE_CHARACTER_COUNT = 121,
+ EXTRACTOR_METATYPE_PAGE_ORIENTATION = 122,
+ EXTRACTOR_METATYPE_PAPER_SIZE = 123,
+ EXTRACTOR_METATYPE_TEMPLATE = 124,
+ EXTRACTOR_METATYPE_COMPANY = 125,
+ EXTRACTOR_METATYPE_MANAGER = 126,
+ EXTRACTOR_METATYPE_REVISION_NUMBER = 127,
/* fixme: used up to here! */
EXTRACTOR_METATYPE_SCALE = 108,
@@ -251,14 +261,6 @@
/* FIXME: transcribe & renumber those below */
- /* (text) document processing specifics */
- EXTRACTOR_METATYPE_CHARACTER_SET = 104,
- EXTRACTOR_METATYPE_LINE_COUNT = 105,
- EXTRACTOR_METATYPE_PARAGRAPH_COUNT = 106,
- EXTRACTOR_METATYPE_WORD_COUNT = 93,
- EXTRACTOR_METATYPE_CHARACTER_COUNT = 94,
- EXTRACTOR_METATYPE_PAGE_ORIENTATION = 35,
- EXTRACTOR_METATYPE_PAPER_SIZE = 36,
EXTRACTOR_METATYPE_USED_FONTS = 37,
EXTRACTOR_METATYPE_PAGE_ORDER = 38,
@@ -312,10 +314,7 @@
EXTRACTOR_METATYPE_OWNER = 66,
EXTRACTOR_METATYPE_MEDIA_TYPE = 68,
EXTRACTOR_METATYPE_CONTACT = 69,
- EXTRACTOR_METATYPE_TEMPLATE = 88,
EXTRACTOR_METATYPE_SECURITY = 97,
- EXTRACTOR_METATYPE_COMPANY = 102,
- EXTRACTOR_METATYPE_MANAGER = 109,
EXTRACTOR_METATYPE_INFORMATION = 112,
EXTRACTOR_METATYPE_FULL_NAME = 113,
EXTRACTOR_METATYPE_LINK = 116,
Modified: Extractor/src/main/extractor_metatypes.c
===================================================================
--- Extractor/src/main/extractor_metatypes.c 2009-12-16 13:25:07 UTC (rev
9776)
+++ Extractor/src/main/extractor_metatypes.c 2009-12-16 14:14:01 UTC (rev
9777)
@@ -294,12 +294,36 @@
gettext_noop ("resolution in dots per inch") },
{ gettext_noop ("source"),
gettext_noop ("Originating entity") },
+ { gettext_noop ("character set"),
+ gettext_noop ("character encoding used") },
+ { gettext_noop ("line count"),
+ gettext_noop ("number of lines") },
+ { gettext_noop ("paragraph count"),
+ gettext_noop ("number o paragraphs") },
+ { gettext_noop ("word count"),
+ gettext_noop ("number of words") },
+ { gettext_noop ("page orientation"),
+ gettext_noop ("") },
+ { gettext_noop ("paper size"),
+ gettext_noop ("") },
+ { gettext_noop ("template"),
+ gettext_noop ("template the document uses or is based on") },
+ { gettext_noop ("company"),
+ gettext_noop ("") },
+ { gettext_noop ("manager"),
+ gettext_noop ("") },
+ { gettext_noop ("revision number"),
+ gettext_noop ("") },
{ gettext_noop (""),
gettext_noop ("") },
{ gettext_noop (""),
gettext_noop ("") },
{ gettext_noop (""),
gettext_noop ("") },
+ { gettext_noop (""),
+ gettext_noop ("") },
+ { gettext_noop (""),
+ gettext_noop ("") },
#if 0
gettext_noop("author"),
Modified: Extractor/src/plugins/Makefile.am
===================================================================
--- Extractor/src/plugins/Makefile.am 2009-12-16 13:25:07 UTC (rev 9776)
+++ Extractor/src/plugins/Makefile.am 2009-12-16 14:14:01 UTC (rev 9777)
@@ -13,7 +13,7 @@
if HAVE_GLIB
if WITH_GSF
- oledir=ole2
+ ole2=libextractor_ole2.la
endif
if HAVE_GTK
thumbgtk=libextractor_thumbnailgtk.la
@@ -58,7 +58,7 @@
# toggle for development
SUBDIRS = .
-# SUBDIRS = . $(thumbgtk) $(thumbffmpeg) $(oodir) $(printdir) hash $(oledir)
+# SUBDIRS = . $(thumbgtk) $(thumbffmpeg) $(oodir) $(printdir) hash
if HAVE_VORBISFILE
@@ -95,6 +95,7 @@
libextractor_jpeg.la \
libextractor_man.la \
libextractor_mime.la \
+ $(ole2) \
$(pdf) \
$(rpm) \
$(thumbgtk)
@@ -176,6 +177,16 @@
libextractor_mime_la_LDFLAGS = \
$(PLUGINFLAGS)
+libextractor_ole2_la_SOURCES = \
+ ole2_extractor.c
+libextractor_ole2_la_CFLAGS = \
+ $(GSF_CFLAGS)
+libextractor_ole2_la_LIBADD = \
+ $(LIBADD) $(GSF_LIBS) \
+ $(top_builddir)/src/common/libextractor_common.la
+libextractor_ole2_la_LDFLAGS = \
+ $(PLUGINFLAGS)
+
libextractor_pdf_la_SOURCES = \
pdf_extractor.cc
libextractor_pdf_la_LDFLAGS = \
Modified: Extractor/src/plugins/man_extractor.c
===================================================================
--- Extractor/src/plugins/man_extractor.c 2009-12-16 13:25:07 UTC (rev
9776)
+++ Extractor/src/plugins/man_extractor.c 2009-12-16 14:14:01 UTC (rev
9777)
@@ -1,6 +1,6 @@
/*
This file is part of libextractor.
- (C) 2002, 2003, 2004 Vidyut Samanta and Christian Grothoff
+ (C) 2002, 2003, 2004, 2009 Vidyut Samanta and Christian Grothoff
libextractor is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
Copied: Extractor/src/plugins/ole2_extractor.c (from rev 9774,
Extractor/src/plugins/ole2/ole2extractor.c)
===================================================================
--- Extractor/src/plugins/ole2_extractor.c (rev 0)
+++ Extractor/src/plugins/ole2_extractor.c 2009-12-16 14:14:01 UTC (rev
9777)
@@ -0,0 +1,599 @@
+/*
+ This file is part of libextractor.
+ (C) 2004, 2005, 2006, 2007, 2009 Vidyut Samanta and Christian Grothoff
+
+ libextractor is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2, or (at your
+ option) any later version.
+
+ libextractor is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with libextractor; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+
+ This code makes extensive use of libgsf
+ -- the Gnome Structured File Library
+ Copyright (C) 2002-2004 Jody Goldberg (address@hidden)
+
+ Part of this code was borrowed from wordleaker.cpp. See also
+ the README file in this directory.
+*/
+
+#include "platform.h"
+#include "extractor.h"
+#include "convert.h"
+
+#include <glib-object.h>
+#include <string.h>
+#include <stdio.h>
+#include <ctype.h>
+
+#include <gsf/gsf-utils.h>
+#include <gsf/gsf-input-memory.h>
+#include <gsf/gsf-infile.h>
+#include <gsf/gsf-infile-msole.h>
+#include <gsf/gsf-msole-utils.h>
+
+#define DEBUG_OLE2 0
+
+/* ******************** main extraction code ******************** */
+
+/**
+ * Hand one metadata value to the processor unless it is a placeholder.
+ *
+ * The empty string, a single space, and the quoted forms "" and " "
+ * are dropped without calling the processor.  Everything else is
+ * reported with plugin name "ole2" as UTF-8 "text/plain" data,
+ * including the terminating 0 byte in the reported length.
+ *
+ * @param proc callback receiving the metadata
+ * @param proc_cls closure argument passed to proc
+ * @param phrase 0-terminated value to report
+ * @param type metadata type to report the value under
+ * @return 0 if the value was skipped, otherwise the result of proc
+ */
+static int
+addKeyword(EXTRACTOR_MetaDataProcessor proc,
+           void *proc_cls,
+           const char *phrase,
+           enum EXTRACTOR_MetaType type) {
+  static const char *const placeholders[] = { "", "\"\"", "\" \"", " " };
+  size_t idx;
+
+  for (idx = 0; idx < sizeof (placeholders) / sizeof (placeholders[0]); idx++)
+    {
+      if (0 == strcmp (phrase, placeholders[idx]))
+        return 0;
+    }
+  return proc (proc_cls,
+               "ole2",
+               type,
+               EXTRACTOR_METAFORMAT_UTF8,
+               "text/plain",
+               phrase,
+               strlen (phrase) + 1);
+}
+
+/**
+ * One entry of the property-name to metadata-type mapping.
+ */
+typedef struct {
+  /* property name as reported by libgsf; NULL terminates the table */
+  const char *text;
+  /* libextractor type the property is reported as */
+  enum EXTRACTOR_MetaType type;
+} Matches;
+
+/**
+ * Read-only mapping from document property names to libextractor
+ * metadata types.  Searched linearly; the entry with a NULL name
+ * marks the end.
+ */
+static const Matches tmap[] = {
+  { "Title", EXTRACTOR_METATYPE_TITLE },
+  { "PresentationFormat", EXTRACTOR_METATYPE_FORMAT },
+  { "Category", EXTRACTOR_METATYPE_SECTION },
+  { "Manager", EXTRACTOR_METATYPE_MANAGER },
+  { "Company", EXTRACTOR_METATYPE_COMPANY },
+  { "Subject", EXTRACTOR_METATYPE_SUBJECT },
+  { "Author", EXTRACTOR_METATYPE_AUTHOR_NAME },
+  { "Keywords", EXTRACTOR_METATYPE_KEYWORDS },
+  { "Comments", EXTRACTOR_METATYPE_COMMENT },
+  { "Template", EXTRACTOR_METATYPE_TEMPLATE },
+  { "NumPages", EXTRACTOR_METATYPE_PAGE_COUNT },
+  { "AppName", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE },
+  { "RevisionNumber", EXTRACTOR_METATYPE_REVISION_NUMBER },
+  { "NumBytes", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE },
+  { "CreatedTime", EXTRACTOR_METATYPE_CREATION_DATE },
+  { "LastSavedTime", EXTRACTOR_METATYPE_MODIFICATION_DATE },
+  { "gsf:company", EXTRACTOR_METATYPE_COMPANY },
+  { "gsf:character-count", EXTRACTOR_METATYPE_CHARACTER_COUNT },
+  { "gsf:page-count", EXTRACTOR_METATYPE_PAGE_COUNT },
+  { "gsf:line-count", EXTRACTOR_METATYPE_LINE_COUNT },
+  { "gsf:word-count", EXTRACTOR_METATYPE_WORD_COUNT },
+  { "gsf:paragraph-count", EXTRACTOR_METATYPE_PARAGRAPH_COUNT },
+  { "gsf:last-saved-by", EXTRACTOR_METATYPE_LAST_SAVED_BY },
+  { "gsf:manager", EXTRACTOR_METATYPE_MANAGER },
+  { "dc:title", EXTRACTOR_METATYPE_TITLE },
+  { "dc:creator", EXTRACTOR_METATYPE_CREATOR },
+  { "dc:date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
+  { "dc:subject", EXTRACTOR_METATYPE_SUBJECT },
+  { "dc:keywords", EXTRACTOR_METATYPE_KEYWORDS },
+  { "dc:last-printed", EXTRACTOR_METATYPE_LAST_PRINTED },
+  { "dc:description", EXTRACTOR_METATYPE_DESCRIPTION },
+  { "meta:creation-date", EXTRACTOR_METATYPE_CREATION_DATE },
+  { "meta:generator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
+  { "meta:template", EXTRACTOR_METATYPE_TEMPLATE },
+  { "meta:editing-cycles", EXTRACTOR_METATYPE_EDITING_CYCLES },
+  /* Not mapped yet: */
+  /* { "Dictionary", EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE }, */
+  /* { "gsf:security", EXTRACTOR_SECURITY }, */
+  /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */
+  /* { "meta:editing-duration", EXTRACTOR_METATYPE_TOTAL_EDITING_TIME }, // encoding? */
+  /* { "msole:codepage", EXTRACTOR_CHARACTER_SET }, */
+  { NULL, 0 }
+};
+
+
+struct ProcContext /* state handed to processMetadata() through its user_data argument */
+{
+ EXTRACTOR_MetaDataProcessor proc; /* callback that receives each extracted item */
+ void *proc_cls; /* closure argument passed along to proc */
+ int ret; /* 0 while extraction should continue; set to 1 once addKeyword() returned non-zero */
+};
+
+
+/**
+ * Callback for gsf_doc_meta_data_foreach(): report one document
+ * property to the metadata processor.
+ *
+ * String properties are reported verbatim, other value types via
+ * g_strdup_value_contents().  A single trailing newline is removed.
+ * For the "meta:generator" property a MIME type derived from the
+ * generator name is reported in addition.  Properties whose name is
+ * not listed in tmap are ignored.
+ *
+ * @param key property name (0-terminated string)
+ * @param value the GsfDocProp holding the property value
+ * @param user_data the struct ProcContext; its ret field is set to 1
+ *        as soon as the processor asks to stop, after which further
+ *        invocations return immediately
+ */
+static void processMetadata(gpointer key,
+                            gpointer value,
+                            gpointer user_data) {
+  /* generator name prefix -> MIME type; first match wins, so the more
+     specific "Microsoft Office <app>" entries precede "Microsoft Office" */
+  static const struct {
+    const char *prefix;
+    const char *mimetype;
+  } generators[] = {
+    { "Microsoft Word", "application/msword" },
+    { "Microsoft Office Word", "application/msword" },
+    { "Microsoft Excel", "application/vnd.ms-excel" },
+    { "Microsoft Office Excel", "application/vnd.ms-excel" },
+    { "Microsoft PowerPoint", "application/vnd.ms-powerpoint" },
+    { "Microsoft Office PowerPoint", "application/vnd.ms-powerpoint" },
+    { "Microsoft Project", "application/vnd.ms-project" },
+    { "Microsoft Visio", "application/vnd.visio" },
+    { "Microsoft Office", "application/vnd.ms-office" }
+  };
+  struct ProcContext *pc = user_data;
+  const char *type = key;
+  const GsfDocProp *prop = value;
+  const GValue *gval;
+  char *contents;
+  size_t len;
+  size_t g;
+  int pos;
+
+  if ( (key == NULL) ||
+       (value == NULL) )
+    return;
+  if (pc->ret != 0)
+    return;
+  gval = gsf_doc_prop_get_val (prop);
+  if (gval == NULL)
+    return;
+
+  /* Both branches allocate with the GLib allocator so that a single
+     g_free() is correct below; g_strdup() also tolerates a NULL string. */
+  if (G_VALUE_TYPE (gval) == G_TYPE_STRING)
+    contents = g_strdup (g_value_get_string (gval));
+  else
+    contents = g_strdup_value_contents (gval); /* convert other formats? */
+  if (contents == NULL)
+    return;
+  len = strlen (contents);
+  if ( (len > 0) &&
+       (contents[len - 1] == '\n') )
+    contents[len - 1] = '\0';
+
+  if (0 == strcmp (type, "meta:generator"))
+    {
+      const char *mimetype = "application/vnd.ms-files";
+
+      /* match against the property's text, not the GsfDocProp object */
+      for (g = 0; g < sizeof (generators) / sizeof (generators[0]); g++)
+        {
+          if (0 == strncmp (contents,
+                            generators[g].prefix,
+                            strlen (generators[g].prefix)))
+            {
+              mimetype = generators[g].mimetype;
+              break;
+            }
+        }
+      if (0 != addKeyword (pc->proc,
+                           pc->proc_cls,
+                           mimetype,
+                           EXTRACTOR_METATYPE_MIMETYPE))
+        {
+          g_free (contents);
+          pc->ret = 1;
+          return;
+        }
+    }
+
+  pos = 0;
+  while ( (tmap[pos].text != NULL) &&
+          (0 != strcmp (tmap[pos].text, type)) )
+    pos++;
+  if (tmap[pos].text != NULL)
+    {
+      if (0 != addKeyword (pc->proc,
+                           pc->proc_cls,
+                           contents,
+                           tmap[pos].type))
+        pc->ret = 1;
+    }
+#if DEBUG_OLE2
+  else
+    printf("No match for type `%s'\n",
+           type);
+#endif
+  g_free (contents);
+}
+
+
+/**
+ * Read the OLE2 property set from the given stream and report every
+ * property through processMetadata().
+ *
+ * @param in stream holding a (Document)SummaryInformation property set
+ * @param proc callback receiving the metadata
+ * @param proc_cls closure argument passed to proc
+ * @return 0 to continue extracting, 1 if the processor asked to stop;
+ *         a stream that cannot be parsed yields 0
+ */
+static int
+process(GsfInput * in,
+        EXTRACTOR_MetaDataProcessor proc,
+        void *proc_cls)
+{
+  struct ProcContext pc;
+  GsfDocMetaData *sections;
+  GError *error;
+
+  pc.proc = proc;
+  pc.proc_cls = proc_cls;
+  pc.ret = 0;
+  sections = gsf_doc_meta_data_new ();
+  error = gsf_msole_metadata_read (in, sections);
+  if (error == NULL)
+    gsf_doc_meta_data_foreach (sections,
+                               &processMetadata,
+                               &pc);
+  else
+    g_error_free (error); /* the returned error is ours to release */
+  g_object_unref (G_OBJECT (sections));
+  return pc.ret;
+}
+
+/**
+ * Extract title, subject, comment and keywords from an
+ * "SfxDocumentInfo" stream.
+ *
+ * Only the first 0x374 bytes of the stream are inspected; all fields
+ * read here live below that offset.  Streams shorter than 0x374 bytes
+ * or larger than 4 MiB, streams that cannot be read, and streams
+ * without the expected header are ignored.
+ *
+ * @param src the "SfxDocumentInfo" stream
+ * @param proc callback receiving the metadata
+ * @param proc_cls closure argument passed to proc
+ * @return 0 to continue extracting, 1 if the processor asked to stop
+ */
+static int
+processSO(GsfInput * src,
+          EXTRACTOR_MetaDataProcessor proc,
+          void *proc_cls) {
+  /* Fixed-position text fields.  Each is preceded by two bytes that
+     are both zero when the field is unused (presumably a 16-bit
+     length -- TODO confirm) and is cut off at 'end'. */
+  static const struct {
+    unsigned int len;   /* offset of the two marker bytes */
+    unsigned int text;  /* offset of the first character */
+    unsigned int end;   /* offset where the 0-terminator is forced */
+    enum EXTRACTOR_MetaType type;
+  } fields[] = {
+    { 0x093, 0x095, 0x0d3, EXTRACTOR_METATYPE_TITLE },
+    { 0x0d4, 0x0d6, 0x114, EXTRACTOR_METATYPE_SUBJECT },
+    { 0x115, 0x117, 0x215, EXTRACTOR_METATYPE_COMMENT },
+    { 0x216, 0x218, 0x296, EXTRACTOR_METATYPE_KEYWORDS }
+  };
+  unsigned char buf[0x374];
+  off_t size = gsf_input_size (src);
+  size_t f;
+
+  if ( (size < 0x374) || (size > 4*1024*1024) ) /* == 0x375?? */
+    return 0;
+  if (NULL == gsf_input_read (src, sizeof (buf), buf))
+    return 0;
+  if ( (buf[0] != 0x0F) ||
+       (buf[1] != 0x0) ||
+       (0 != strncmp ((const char *) &buf[2],
+                      "SfxDocumentInfo",
+                      strlen ("SfxDocumentInfo"))) ||
+       (buf[0x11] != 0x0B) ||
+       (buf[0x13] != 0x00) || /* pw protected! */
+       (buf[0x12] != 0x00) )
+    return 0;
+  for (f = 0; f < sizeof (fields) / sizeof (fields[0]); f++)
+    {
+      buf[fields[f].end] = '\0';
+      /* bytes are unsigned here, so values >= 0x80 count as non-zero */
+      if (0 == (buf[fields[f].len] | buf[fields[f].len + 1]))
+        continue;
+      if (0 != addKeyword (proc, proc_cls,
+                           (const char *) &buf[fields[f].text],
+                           fields[f].type))
+        return 1;
+    }
+  /* fixme: do timestamps,
+     mime-type, user-defined info's */
+  return 0;
+}
+
+/* *************** wordleaker stuff *************** */
+
+#define __(a) dgettext("iso-639", a)
+
+/**
+ * Map a Windows language identifier (LCID) to a language name.
+ *
+ * Names wrapped in __() are looked up in the "iso-639" text domain,
+ * names wrapped in _() in the default text domain.
+ *
+ * @param lid language identifier
+ * @return translated language name, NULL if the identifier is unknown
+ */
+static const char * lidToLanguage( unsigned int lid ) {
+  switch ( lid ) {
+  case 0x0400: return _("No Proofing");
+  case 0x0401: return __("Arabic");
+  case 0x0402: return __("Bulgarian");
+  case 0x0403: return __("Catalan");
+  case 0x0404: return _("Traditional Chinese");
+  case 0x0804: return _("Simplified Chinese");
+  case 0x0405: return __("Czech");  /* 0x0405 is cs-CZ, not Chechen */
+  case 0x0406: return __("Danish");
+  case 0x0407: return __("German");
+  case 0x0807: return _("Swiss German");
+  case 0x0408: return __("Greek");
+  case 0x0409: return _("U.S. English");
+  case 0x0809: return _("U.K. English");
+  case 0x0c09: return _("Australian English");
+  case 0x040a: return _("Castilian Spanish");
+  case 0x080a: return _("Mexican Spanish");
+  case 0x040b: return __("Finnish");
+  case 0x040c: return __("French");
+  case 0x080c: return _("Belgian French");
+  case 0x0c0c: return _("Canadian French");
+  case 0x100c: return _("Swiss French");
+  case 0x040d: return __("Hebrew");
+  case 0x040e: return __("Hungarian");
+  case 0x040f: return __("Icelandic");
+  case 0x0410: return __("Italian");
+  case 0x0810: return _("Swiss Italian");
+  case 0x0411: return __("Japanese");
+  case 0x0412: return __("Korean");
+  case 0x0413: return __("Dutch");
+  case 0x0813: return _("Belgian Dutch");
+  case 0x0414: return _("Norwegian Bokmal");
+  case 0x0814: return __("Norwegian Nynorsk");
+  case 0x0415: return __("Polish");
+  case 0x0416: return __("Brazilian Portuguese");
+  case 0x0816: return __("Portuguese");
+  case 0x0417: return _("Rhaeto-Romanic");
+  case 0x0418: return __("Romanian");
+  case 0x0419: return __("Russian");
+  case 0x041a: return _("Croato-Serbian (Latin)");
+  case 0x081a: return _("Serbo-Croatian (Cyrillic)");
+  case 0x041b: return __("Slovak");
+  case 0x041c: return __("Albanian");
+  case 0x041d: return __("Swedish");
+  case 0x041e: return __("Thai");
+  case 0x041f: return __("Turkish");
+  case 0x0420: return __("Urdu");
+  case 0x0421: return __("Bahasa");
+  case 0x0422: return __("Ukrainian");
+  case 0x0423: return __("Byelorussian");
+  case 0x0424: return __("Slovenian");
+  case 0x0425: return __("Estonian");
+  case 0x0426: return __("Latvian");
+  case 0x0427: return __("Lithuanian");
+  case 0x0429: return _("Farsi");
+  case 0x042D: return __("Basque");
+  case 0x042F: return __("Macedonian");
+  case 0x0436: return __("Afrikaans");
+  case 0x043E: return __("Malay");  /* 0x043E is ms-MY; Malayalam is 0x044C */
+  default:
+    return NULL;
+  }
+}
+
+
+/**
+ * Report the revision history (who saved the document under which
+ * file name) stored in a table stream.
+ *
+ * The table starts with a 6 byte header whose bytes 2-3 hold the
+ * number of strings (little endian); the strings come in
+ * author/filename pairs.  Parsing stops silently at the first entry
+ * that does not fit into the table, cannot be converted to UTF-8 or
+ * cannot be allocated.
+ *
+ * @param stream the table stream to read from
+ * @param lcbSttbSavedBy size of the revision table in bytes
+ * @param fcSttbSavedBy offset of the revision table in the stream
+ * @param proc callback receiving the metadata
+ * @param proc_cls closure argument passed to proc
+ * @return 0 to continue extracting, otherwise the non-zero value
+ *         returned by the processor
+ */
+static int
+history_extract(GsfInput * stream,
+                unsigned int lcbSttbSavedBy,
+                unsigned int fcSttbSavedBy,
+                EXTRACTOR_MetaDataProcessor proc,
+                void *proc_cls)
+{
+  unsigned int where;
+  unsigned char *lbuffer;
+  unsigned int i;
+  unsigned int length;
+  char *author;
+  char *filename;
+  char *rbuf;
+  size_t rlen;
+  unsigned int nRev;
+  int ret;
+
+  /* the header alone needs 6 bytes */
+  if (lcbSttbSavedBy < 6)
+    return 0;
+  /* goto offset of revision; gsf_input_seek() returns TRUE on error */
+  if (gsf_input_seek (stream, fcSttbSavedBy, G_SEEK_SET))
+    return 0;
+  if (gsf_input_remaining (stream) < lcbSttbSavedBy)
+    return 0;
+  lbuffer = malloc (lcbSttbSavedBy);
+  if (lbuffer == NULL)
+    return 0;
+  /* read all the revision history */
+  if (NULL == gsf_input_read (stream, lcbSttbSavedBy, lbuffer))
+    {
+      free (lbuffer);
+      return 0;
+    }
+  /* there are n strings, so n/2 revisions (author & file) */
+  nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2;
+  where = 6;
+  ret = 0;
+  for (i = 0; i < nRev; i++)
+    {
+      if (where >= lcbSttbSavedBy)
+        break;
+      /* NOTE(review): only the low byte of the length is used and the
+         text is decoded as UTF-16BE starting one byte early; this looks
+         like it only works for code points below 256 -- confirm against
+         the file format before changing. */
+      length = lbuffer[where++];
+      if ( (where + 2 * length + 2 >= lcbSttbSavedBy) ||
+           (where + 2 * length + 2 <= where) )
+        break;
+      author = EXTRACTOR_common_convert_to_utf8 ((const char *) &lbuffer[where],
+                                                 length * 2,
+                                                 "UTF-16BE");
+      if (author == NULL)
+        break;
+      where += length * 2 + 1;
+      length = lbuffer[where++];
+      if ( (where + 2 * length >= lcbSttbSavedBy) ||
+           (where + 2 * length + 1 <= where) )
+        {
+          free (author);
+          break;
+        }
+      filename = EXTRACTOR_common_convert_to_utf8 ((const char *) &lbuffer[where],
+                                                   length * 2,
+                                                   "UTF-16BE");
+      if (filename == NULL)
+        {
+          free (author);
+          break;
+        }
+      where += length * 2 + 1;
+      rlen = strlen (author) + strlen (filename) + 512;
+      rbuf = malloc (rlen);
+      if (rbuf == NULL)
+        {
+          free (author);
+          free (filename);
+          break;
+        }
+      snprintf (rbuf, rlen,
+                _("Revision #%u: Author '%s' worked on '%s'"),
+                i, author, filename);
+      free (author);
+      free (filename);
+      ret = addKeyword (proc, proc_cls,
+                        rbuf,
+                        EXTRACTOR_METATYPE_REVISION_HISTORY);
+      free (rbuf);
+      if (0 != ret)
+        break;
+    }
+  free (lbuffer);
+  return ret;
+}
+
+
+/**
+ * Decode a little-endian 32-bit value without shifting into the sign
+ * bit of a (promoted) int.
+ */
+static unsigned int
+ole2_read_le32 (const unsigned char *p)
+{
+  return ((unsigned int) p[0])
+    | (((unsigned int) p[1]) << 8)
+    | (((unsigned int) p[2]) << 16)
+    | (((unsigned int) p[3]) << 24);
+}
+
+
+/**
+ * Main entry point of the OLE2 plugin.
+ *
+ * Reports the properties of the "\005SummaryInformation",
+ * "\005DocumentSummaryInformation" and "SfxDocumentInfo" streams, the
+ * document language and, if a "1Table"/"0Table" stream exists, the
+ * revision history.
+ *
+ * @param data the file contents
+ * @param size number of bytes in data
+ * @param proc callback receiving the metadata
+ * @param proc_cls closure argument passed to proc
+ * @param options unused
+ * @return 0 to continue extracting, non-zero if the processor asked
+ *         to stop
+ */
+int
+EXTRACTOR_ole2_extract (const char *data,
+                        size_t size,
+                        EXTRACTOR_MetaDataProcessor proc,
+                        void *proc_cls,
+                        const char *options)
+{
+  GsfInput *input;
+  GsfInfile *infile;
+  GsfInput *src;
+  const char *name;
+  int i;
+  int children;
+  unsigned int lcb;
+  unsigned int fcb;
+  const unsigned char *data512;
+  unsigned int lid;
+  const char *lang;
+  int ret;
+
+  ret = 0;
+  /* all fixed offsets read below (up to 512 + 729) are covered by this */
+  if (size < 512 + 898)
+    return 0; /* can hardly be OLE2 */
+  input = gsf_input_memory_new ((const guint8 *) data,
+                                (gsf_off_t) size,
+                                FALSE);
+  if (input == NULL)
+    return 0;
+
+  infile = gsf_infile_msole_new (input, NULL);
+  if (infile == NULL)
+    {
+      g_object_unref (G_OBJECT (input));
+      return 0;
+    }
+  children = gsf_infile_num_children (infile);
+  for (i = 0; (i < children) && (0 == ret); i++)
+    {
+      name = gsf_infile_name_by_index (infile, i);
+      if (name == NULL)
+        continue;
+      if ( (0 == strcmp (name, "\005SummaryInformation")) ||
+           (0 == strcmp (name, "\005DocumentSummaryInformation")) )
+        {
+          src = gsf_infile_child_by_index (infile, i);
+          if (src != NULL)
+            {
+              ret = process (src, proc, proc_cls);
+              g_object_unref (G_OBJECT (src));
+            }
+        }
+      else if (0 == strcmp (name, "SfxDocumentInfo"))
+        {
+          src = gsf_infile_child_by_index (infile, i);
+          if (src != NULL)
+            {
+              ret = processSO (src, proc, proc_cls);
+              g_object_unref (G_OBJECT (src));
+            }
+        }
+    }
+
+  /* Fixed-offset fields of the block starting at byte 512 (presumably
+     the Word file information block -- TODO confirm): language id at 6,
+     revision table offset at 722 and revision table size at 726. */
+  data512 = (const unsigned char *) &data[512];
+  lid = data512[6] + (data512[7] << 8);
+  lcb = ole2_read_le32 (&data512[726]);
+  fcb = ole2_read_le32 (&data512[722]);
+  lang = lidToLanguage (lid);
+  if ( (lang != NULL) && (ret == 0) )
+    ret = addKeyword (proc, proc_cls,
+                      lang,
+                      EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE);
+  if (lcb >= 6)
+    {
+      for (i = 0; (i < children) && (0 == ret); i++)
+        {
+          name = gsf_infile_name_by_index (infile, i);
+          if (name == NULL)
+            continue;
+          if ( (0 != strcmp (name, "1Table")) &&
+               (0 != strcmp (name, "0Table")) )
+            continue;
+          src = gsf_infile_child_by_index (infile, i);
+          if (src == NULL)
+            continue;
+          ret = history_extract (src,
+                                 lcb,
+                                 fcb,
+                                 proc, proc_cls);
+          g_object_unref (G_OBJECT (src));
+        }
+    }
+  g_object_unref (G_OBJECT (infile));
+  g_object_unref (G_OBJECT (input));
+  return ret;
+}
+
+
+static void /* log handler that discards every message; installed by ole2_ltdl_init() */
+nolog (const gchar *log_domain,
+ GLogLevelFlags log_level,
+ const gchar *message,
+ gpointer user_data) { /* all four arguments are intentionally ignored */
+}
+
+
+/**
+ * Constructor run when the plugin is loaded: initializes the GObject
+ * type system (and libgsf if HAVE_GSF_INIT is defined) and installs
+ * nolog() for critical and warning messages of the "libgsf:msole"
+ * log domain.
+ */
+void __attribute__ ((constructor)) ole2_ltdl_init() {
+  const GLogLevelFlags muted = G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING;
+
+  g_type_init ();
+#ifdef HAVE_GSF_INIT
+  gsf_init ();
+#endif
+  /* disable logging -- thanks, Jody! */
+  g_log_set_handler ("libgsf:msole", muted, &nolog, NULL);
+}
+
+
+void __attribute__ ((destructor)) ole2_ltdl_fini() { /* destructor counterpart of ole2_ltdl_init() */
+#ifdef HAVE_GSF_INIT
+ gsf_shutdown(); /* only when gsf_init() was available and called by the constructor */
+#endif
+}
+
+/* end of ole2_extractor.c */
+
Modified: Extractor/src/plugins/pdf_extractor.cc
===================================================================
--- Extractor/src/plugins/pdf_extractor.cc 2009-12-16 13:25:07 UTC (rev
9776)
+++ Extractor/src/plugins/pdf_extractor.cc 2009-12-16 14:14:01 UTC (rev
9777)
@@ -37,6 +37,7 @@
#include <poppler/Page.h>
#include <poppler/PDFDoc.h>
#include <poppler/Error.h>
+#include <poppler/GlobalParams.h>
#include <poppler/goo/GooString.h>
#define ADD(s, type) do { if (0!=proc(proc_cls, "pdf", type,
EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1)) { err = 1; goto EXIT;
}} while (0)
@@ -167,7 +168,11 @@
BaseStream * stream;
int err;
- /* errorInit(); -- keep commented out, otherwise errors are printed to
stderr for non-pdf files! */
+ if (globalParams == NULL)
+ {
+ globalParams = new GlobalParams();
+ globalParams->setErrQuiet (gTrue);
+ }
obj.initNull();
err = 0;
stream = new MemStream( (char*) data, 0, size, &obj);
Modified: Extractor/src/plugins/rpm_extractor.c
===================================================================
--- Extractor/src/plugins/rpm_extractor.c 2009-12-16 13:25:07 UTC (rev
9776)
+++ Extractor/src/plugins/rpm_extractor.c 2009-12-16 14:14:01 UTC (rev
9777)
@@ -1,6 +1,6 @@
/*
This file is part of libextractor.
- (C) 2002, 2003, 2008 Vidyut Samanta and Christian Grothoff
+ (C) 2002, 2003, 2008, 2009 Vidyut Samanta and Christian Grothoff
libextractor is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [GNUnet-SVN] r9777 - in Extractor: . src/include src/main src/plugins,
gnunet <=