[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [sdx-developers] Intégration à SDX2.2 de la recherche des chiffres/nombres dans le texte ?
From: |
Pierre Dittgen |
Subject: |
Re: [sdx-developers] Intégration à SDX2.2 de la recherche des chiffres/nombres dans le texte ? |
Date: |
Thu, 22 Apr 2004 11:37:31 +0200 |
User-agent: |
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.5) Gecko/20031007 |
Pas de mon côté. Tu peux envoyer un patch ?
Le voilà.
Pierre
--
Pierre Dittgen, address@hidden
PASS Technologie http://www.pass-tech.fr
diff -urN analysis/Analyzer_br.java analysis.new/Analyzer_br.java
--- analysis/Analyzer_br.java 2004-04-22 11:35:06.251500000 +0200
+++ analysis.new/Analyzer_br.java 2004-04-13 17:51:53.546875000 +0200
@@ -30,16 +30,16 @@
package fr.gouv.culture.sdx.search.lucene.analysis;
import fr.gouv.culture.sdx.search.lucene.analysis.filter.BrazilianStemFilter;
+import fr.gouv.culture.sdx.search.lucene.analysis.tokenizer.LetterOrDigitTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.de.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.File;
-import java.io.IOException;
import java.io.Reader;
+import java.io.IOException;
import java.util.Hashtable;
/*
@@ -103,7 +103,7 @@
* Builds an analyzer with the given stop words.
*/
public Analyzer_br(File stopwords) throws IOException {
- super.stopTable = WordlistLoader.getWordtable(stopwords);
+ super.stopTable = WordlistLoader.getWordtable(stopwords);
}
/**
@@ -134,7 +134,7 @@
* StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter.
*/
public final TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new StandardTokenizer(reader);
+ TokenStream result = new LetterOrDigitTokenizer(reader);
result = new StandardFilter(result);
result = new StopFilter(result, super.stopTable);
result = new BrazilianStemFilter(result, super.excludeTable);
diff -urN analysis/Analyzer_cz.java analysis.new/Analyzer_cz.java
--- analysis/Analyzer_cz.java 2004-04-22 11:35:06.407750000 +0200
+++ analysis.new/Analyzer_cz.java 2004-04-13 18:01:20.484375000 +0200
@@ -54,12 +54,12 @@
* <http://www.apache.org/>.
*/
+import fr.gouv.culture.sdx.search.lucene.analysis.tokenizer.LetterOrDigitTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.de.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.*;
import java.util.Hashtable;
@@ -166,7 +166,7 @@
* StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter
*/
public final TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new StandardTokenizer(reader);
+ TokenStream result = new LetterOrDigitTokenizer(reader);
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
result = new StopFilter(result, stopTable);
diff -urN analysis/Analyzer_fr.java analysis.new/Analyzer_fr.java
--- analysis/Analyzer_fr.java 2004-04-22 11:35:06.673375000 +0200
+++ analysis.new/Analyzer_fr.java 2004-04-13 17:19:06.390625000 +0200
@@ -31,12 +31,12 @@
import fr.gouv.culture.sdx.search.lucene.analysis.filter.FrenchStandardFilter;
import fr.gouv.culture.sdx.search.lucene.analysis.filter.ISOLatin1AccentFilter;
+import fr.gouv.culture.sdx.search.lucene.analysis.tokenizer.LetterOrDigitTokenizer;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.Reader;
@@ -94,7 +94,7 @@
TokenStream result;
// Builds the chain...
- result = new StandardTokenizer(reader);
+ result = new LetterOrDigitTokenizer(reader);
FrenchStandardFilter fsf = new FrenchStandardFilter();
fsf.enableLogging(logger);
diff -urN analysis/CVS/Entries analysis.new/CVS/Entries
--- analysis/CVS/Entries 2004-04-22 11:35:07.126500000 +0200
+++ analysis.new/CVS/Entries 2004-04-13 16:44:54.296875000 +0200
@@ -1,18 +1,18 @@
+/AbstractAnalyzer.java/1.11/Wed Mar 24 18:26:17 2004//
+/Analyzer.java/1.7/Thu Feb 6 14:10:08 2003//
+/AnalyzerManager.java/1.20/Fri Mar 26 15:26:37 2004//
+/Analyzer_ar.java/1.5/Wed Mar 24 18:26:17 2004//
+/Analyzer_br.java/1.2/Tue Apr 6 19:01:15 2004//
+/Analyzer_cn.java/1.4/Thu Feb 6 14:10:08 2003//
+/Analyzer_cz.java/1.4/Tue Apr 6 19:01:15 2004//
+/Analyzer_de.java/1.3/Mon Jan 12 15:07:40 2004//
+/Analyzer_en.java/1.1/Sun May 26 21:30:10 2002//
+/Analyzer_fr.java/1.13/Thu Feb 6 14:10:08 2003//
+/Analyzer_ru.java/1.3/Mon Jan 19 11:56:20 2004//
+/DefaultAnalyzer.java/1.13/Mon Jan 12 15:07:40 2004//
+/Glosser_ar_en.java/1.5/Wed Mar 24 18:26:17 2004//
+/MetaAnalyzer.java/1.10/Wed Mar 24 18:26:17 2004//
+/package.html/1.2/Tue Aug 27 16:50:19 2002//
D/filter////
D/stemmer////
D/tokenizer////
-/AbstractAnalyzer.java/1.11/Thu Apr 22 09:35:05 2004//
-/Analyzer.java/1.7/Thu Apr 22 09:35:05 2004//
-/AnalyzerManager.java/1.20/Thu Apr 22 09:35:05 2004//
-/Analyzer_ar.java/1.5/Thu Apr 22 09:35:05 2004//
-/Analyzer_br.java/1.3/Thu Apr 22 09:35:06 2004//
-/Analyzer_cn.java/1.4/Thu Apr 22 09:35:06 2004//
-/Analyzer_cz.java/1.4/Thu Apr 22 09:35:06 2004//
-/Analyzer_de.java/1.3/Thu Apr 22 09:35:06 2004//
-/Analyzer_en.java/1.1/Thu Apr 22 09:35:06 2004//
-/Analyzer_fr.java/1.13/Thu Apr 22 09:35:06 2004//
-/Analyzer_ru.java/1.3/Thu Apr 22 09:35:06 2004//
-/DefaultAnalyzer.java/1.13/Thu Apr 22 09:35:06 2004//
-/Glosser_ar_en.java/1.5/Thu Apr 22 09:35:07 2004//
-/MetaAnalyzer.java/1.11/Thu Apr 22 09:35:07 2004//
-/package.html/1.2/Thu Apr 22 09:35:07 2004//
diff -urN analysis/CVS/Entries.Extra analysis.new/CVS/Entries.Extra
--- analysis/CVS/Entries.Extra 2004-04-22 11:35:07.126500000 +0200
+++ analysis.new/CVS/Entries.Extra 2004-04-13 16:44:54.296875000 +0200
@@ -1,6 +1,3 @@
-D/filter///
-D/stemmer///
-D/tokenizer///
/AbstractAnalyzer.java///
/Analyzer.java///
/AnalyzerManager.java///
@@ -16,3 +13,6 @@
/Glosser_ar_en.java///
/MetaAnalyzer.java///
/package.html///
+D/filter///
+D/stemmer///
+D/tokenizer///
diff -urN analysis/CVS/Entries.Log analysis.new/CVS/Entries.Log
--- analysis/CVS/Entries.Log 2004-04-22 11:35:09.735875000 +0200
+++ analysis.new/CVS/Entries.Log 1970-01-01 01:00:00.000000000 +0100
@@ -1,3 +0,0 @@
-A D/filter////
-A D/stemmer////
-A D/tokenizer////
diff -urN analysis/DefaultAnalyzer.java analysis.new/DefaultAnalyzer.java
--- analysis/DefaultAnalyzer.java 2004-04-22 11:35:06.985875000 +0200
+++ analysis.new/DefaultAnalyzer.java 2004-04-13 17:19:41.468750000 +0200
@@ -30,9 +30,9 @@
package fr.gouv.culture.sdx.search.lucene.analysis;
import fr.gouv.culture.sdx.exception.SDXException;
+import fr.gouv.culture.sdx.search.lucene.analysis.tokenizer.LaxistLowerCaseTokenizer;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
-import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -129,9 +129,9 @@
/** Filters LowerCaseTokenizer with StopFilter. */
public TokenStream tokenStream(String fieldName, Reader reader) {
if (stopTable != null)
- return new StopFilter(new LowerCaseTokenizer(reader), stopTable);
+ return new StopFilter(new LaxistLowerCaseTokenizer(reader), stopTable);
else
- return new LowerCaseTokenizer(reader);
+ return new LaxistLowerCaseTokenizer(reader);
}
/**
diff -urN analysis/MetaAnalyzer.java analysis.new/MetaAnalyzer.java
--- analysis/MetaAnalyzer.java 2004-04-22 11:35:07.017125000 +0200
+++ analysis.new/MetaAnalyzer.java 2004-03-24 19:26:17.000000000 +0100
@@ -32,7 +32,7 @@
import fr.gouv.culture.sdx.exception.SDXException;
import fr.gouv.culture.sdx.exception.SDXExceptionCode;
import fr.gouv.culture.sdx.search.lucene.Field;
-import fr.gouv.culture.sdx.search.lucene.FieldList;
+import fr.gouv.culture.sdx.search.lucene.FieldsDefinition;
import org.apache.lucene.analysis.TokenStream;
import java.io.Reader;
@@ -49,7 +49,7 @@
public class MetaAnalyzer extends AbstractAnalyzer {
/** The fields definition object. */
- private FieldList fields;
+ private FieldsDefinition fields;
/** The default analzyer to use. */
private Analyzer defaultAnalyzer;
@@ -70,7 +70,7 @@
*
* @param fields The fields and their definitions (cannot be null).
*/
- public void setUp(FieldList fields) throws SDXException {
+ public void setUp(FieldsDefinition fields) throws SDXException {
if (fields == null) throw new SDXException(logger, SDXExceptionCode.ERROR_FIELDS_DEF_NULL, null, null);
this.fields = fields;
@@ -106,9 +106,9 @@
return theAnalyzer.tokenStream(fieldName, reader);
}
- /** Returns a the FieldList for this MetaAnalyzer (basically a Hashtable of all the Fields)*/
+ /** Returns a the FieldsDefinition for this MetaAnalyzer (basically a Hashtable of all the Fields)*/
//TODO?:is this still necessary, as it exists both in LuceneIndex and MetaAnalyzer?-rbp
- public FieldList getFieldList() {
+ public FieldsDefinition getFieldsDefinition() {
return this.fields;
}
diff -urN analysis/tokenizer/LaxistLowerCaseTokenizer.java analysis.new/tokenizer/LaxistLowerCaseTokenizer.java
--- analysis/tokenizer/LaxistLowerCaseTokenizer.java 1970-01-01 01:00:00.000000000 +0100
+++ analysis.new/tokenizer/LaxistLowerCaseTokenizer.java 2004-04-02 14:45:48.000000000 +0200
@@ -0,0 +1,62 @@
+/*
+SDX: Documentary System in XML.
+Copyright (C) 2000, 2001, 2002 Ministere de la culture et de la communication (France), AJLSM
+
+Ministere de la culture et de la communication,
+Mission de la recherche et de la technologie
+3 rue de Valois, 75042 Paris Cedex 01 (France)
+address@hidden, address@hidden
+
+AJLSM, 17, rue Vital Carles, 33000 Bordeaux (France)
+address@hidden
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the
+Free Software Foundation, Inc.
+59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+or connect to:
+http://www.fsf.org/copyleft/gpl.html
+*/
+/*
+ * Created by Vim :-)
+ * User: Pierre Dittgen
+ * Date: 2 apr. 2004
+ */
+package fr.gouv.culture.sdx.search.lucene.analysis.tokenizer;
+
+// Jdk import
+import java.io.Reader;
+
+/**
+ * Title: LaxistLowerCaseTokenizer
+ * Description: Like org.apache.lucene.analysis.LowerCaseTokenizer but
+ * inherits from LetterOrDigitTokenizer, not from LetterTokenizer
+ * Copyright: Copyright (c) 2004
+ * Company:
+ * @author Pierre Dittgen
+ * @version 1.0
+ *
+ */
+public final class LaxistLowerCaseTokenizer extends LetterOrDigitTokenizer
+{
+ public LaxistLowerCaseTokenizer(Reader in)
+ {
+ super(in);
+ }
+
+ protected char normalize(char c)
+ {
+ return Character.toLowerCase(c);
+ }
+}
+
diff -urN analysis/tokenizer/LetterOrDigitTokenizer.java analysis.new/tokenizer/LetterOrDigitTokenizer.java
--- analysis/tokenizer/LetterOrDigitTokenizer.java 1970-01-01 01:00:00.000000000 +0100
+++ analysis.new/tokenizer/LetterOrDigitTokenizer.java 2004-04-02 14:52:42.000000000 +0200
@@ -0,0 +1,68 @@
+/*
+SDX: Documentary System in XML.
+Copyright (C) 2000, 2001, 2002 Ministere de la culture et de la communication (France), AJLSM
+
+Ministere de la culture et de la communication,
+Mission de la recherche et de la technologie
+3 rue de Valois, 75042 Paris Cedex 01 (France)
+address@hidden, address@hidden
+
+AJLSM, 17, rue Vital Carles, 33000 Bordeaux (France)
+address@hidden
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the
+Free Software Foundation, Inc.
+59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+or connect to:
+http://www.fsf.org/copyleft/gpl.html
+*/
+/*
+ * Created by Vim :-)
+ * User: Pierre Dittgen
+ * Date: 2 apr. 2004
+ */
+package fr.gouv.culture.sdx.search.lucene.analysis.tokenizer;
+
+// Lucene import
+import org.apache.lucene.analysis.CharTokenizer;
+
+// Jdk import
+import java.io.Reader;
+
+
+/**
+ * Title: LetterOrDigitTokenizer
+ * Description: Like org.apache.lucene.analysis.LetterTokenizer but also
+ * accept digits
+ * Copyright: Copyright (c) 2004
+ * Company:
+ * @author Pierre Dittgen
+ * @version 1.0
+ *
+ */
+public class LetterOrDigitTokenizer extends CharTokenizer {
+
+
+ public LetterOrDigitTokenizer(Reader in)
+ {
+ super(in);
+ }
+
+ protected boolean isTokenChar(char c)
+ {
+ return Character.isLetterOrDigit(c);
+ }
+
+}
+
- [sdx-developers] Intégration à SDX2.2 de la recherche des chiffres/nombres dans le texte ?, Pierre Dittgen, 2004/04/22
- Re: [sdx-developers] Intégration à SDX2.2 de la recherche des chiffres/nombres dans le texte ?, Pierrick Brihaye, 2004/04/22
- Re: [sdx-developers] Intégration à SDX2.2 de la recherche des chiffres/nombres dans le texte ?,
Pierre Dittgen <=
- Re: [sdx-developers] Intégration à SDX2.2 de la recherche des chiffres/nombres dans le texte ?, Pierrick Brihaye, 2004/04/22
- RE : [sdx-developers] Intégration à SDX2.2 de la recherche des chiffres/nombres dans le texte ?, Rasik Pandey, 2004/04/22
- Re: RE : [sdx-developers] Intégration à SDX2.2 de la recherche des chiffres/nombres dans le texte ?, Pierrick Brihaye, 2004/04/22
- RE : RE : [sdx-developers] Intégration à SDX2.2 de la recherche des chiffres/nombres dans le texte ?, Rasik Pandey, 2004/04/22
- Re: RE : RE : [sdx-developers] Intégration à SDX2.2 de la recherche des chiffres/nombres dans le texte ?, Pierrick Brihaye, 2004/04/22
- Re: RE : [sdx-developers] Intégration à SDX2.2 de la recherche des chiffres/nombres dans le texte ?, Pierre Dittgen, 2004/04/23
- Re: RE : [sdx-developers] Intégration à SDX2.2 de la recherche des chiffres/nombres dans le texte ?, Pierrick Brihaye, 2004/04/23
- Re: RE : [sdx-developers] Intégration à SDX2.2 de la recherche des chiffres/nombres dans le texte ?, Pierre Dittgen, 2004/04/23