1 package org.apache.lucene.benchmark.byTask.feeds;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 import java.io.IOException;
22 import java.util.HashMap;
23 import java.util.Locale;
/**
 * Parser for trec doc content, invoked on doc text excluding {@code <DOC>} and {@code <DOCNO>},
 * which are handled in TrecContentSource. Required to be stateless and hence thread safe.
 */
30 public abstract class TrecDocParser {
/** Types of trec parse paths. */
public enum ParsePathType {
  GOV2,
  FBIS,
  FT,
  FR94,
  LATIMES
}
/** trec parser type used for unknown extensions */
public static final ParsePathType DEFAULT_PATH_TYPE = ParsePathType.GOV2;

/** Parser instance registered for each known parse path type. */
static final Map<ParsePathType,TrecDocParser> pathType2parser = new HashMap<ParsePathType,TrecDocParser>();
static {
  // Statements must live in a static initializer; they are not legal directly in the class body.
  pathType2parser.put(ParsePathType.GOV2, new TrecGov2Parser());
  pathType2parser.put(ParsePathType.FBIS, new TrecFBISParser());
  pathType2parser.put(ParsePathType.FR94, new TrecFR94Parser());
  pathType2parser.put(ParsePathType.FT, new TrecFTParser());
  pathType2parser.put(ParsePathType.LATIMES, new TrecLATimesParser());
}

/** Parse path types keyed by their upper-cased enum constant name. */
static final Map<String,ParsePathType> pathName2Type = new HashMap<String,ParsePathType>();
static {
  for (ParsePathType ppt : ParsePathType.values()) {
    // Upper-case with a fixed locale so keys do not depend on the default locale.
    pathName2Type.put(ppt.name().toUpperCase(Locale.ENGLISH), ppt);
  }
}

/** max length of walk up from file to its ancestors when looking for a known path type */
private static final int MAX_PATH_LENGTH = 10;
58 * Compute the path type of a file by inspecting name of file and its parents
60 public static ParsePathType pathType(File f) {
62 while (f != null && ++pathLength < MAX_PATH_LENGTH) {
63 ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase(Locale.ENGLISH));
67 f = f.getParentFile();
69 return DEFAULT_PATH_TYPE;
73 * parse the text prepared in docBuf into a result DocData,
74 * no synchronization is required.
75 * @param docData reusable result
76 * @param name name that should be set to the result
77 * @param trecSrc calling trec content source
78 * @param docBuf text to parse
79 * @param pathType type of parsed file, or null if unknown - may be used by
80 * parsers to alter their behavior according to the file path type.
82 public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc,
83 StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException;
86 * strip tags from <code>buf</code>: each tag is replaced by a single blank.
87 * @return text obtained when stripping all tags from <code>buf</code> (Input StringBuilder is unmodified).
89 public static String stripTags(StringBuilder buf, int start) {
90 return stripTags(buf.substring(start),0);
94 * strip tags from input.
95 * @see #stripTags(StringBuilder, int)
97 public static String stripTags(String buf, int start) {
99 buf = buf.substring(0);
101 return buf.replaceAll("<[^>]*>", " ");
105 * Extract from <code>buf</code> the text of interest within specified tags
106 * @param buf entire input text
107 * @param startTag tag marking start of text of interest
108 * @param endTag tag marking end of text of interest
109 * @param maxPos if ≥ 0 sets a limit on start of text of interest
110 * @return text of interest or null if not found
112 public static String extract(StringBuilder buf, String startTag, String endTag, int maxPos, String noisePrefixes[]) {
113 int k1 = buf.indexOf(startTag);
114 if (k1>=0 && (maxPos<0 || k1<maxPos)) {
115 k1 += startTag.length();
116 int k2 = buf.indexOf(endTag,k1);
117 if (k2>=0 && (maxPos<0 || k2<maxPos)) { // found end tag with allowed range
118 if (noisePrefixes != null) {
119 for (String noise : noisePrefixes) {
120 int k1a = buf.indexOf(noise,k1);
121 if (k1a>=0 && k1a<k2) {
122 k1 = k1a + noise.length();
126 return buf.substring(k1,k2).trim();
132 //public static void main(String[] args) {
133 // System.out.println(stripTags("is it true that<space>2<<second space>><almost last space>1<one more space>?",0));