lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java

   1 package org.apache.lucene.benchmark.byTask.feeds;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.File;
  21 import java.io.IOException;
  22 import java.util.HashMap;
  23 import java.util.Locale;
  24 import java.util.Map;
  25
  26 /**
  27  * Parser for trec doc content, invoked on doc text excluding <DOC> and <DOCNO>
  28  * which are handled in TrecContentSource. Required to be stateless and hence thread safe.
  29  */
  30 public abstract class TrecDocParser {
  31
  32   /** Types of trec parse paths, */
  33   public enum ParsePathType { GOV2, FBIS, FT, FR94, LATIMES }
  34
  35   /** trec parser type used for unknown extensions */
  36   public static final ParsePathType DEFAULT_PATH_TYPE  = ParsePathType.GOV2;
  37
  38   static final Map<ParsePathType,TrecDocParser> pathType2parser = new HashMap<ParsePathType,TrecDocParser>();
  39   static {
  40     pathType2parser.put(ParsePathType.GOV2, new TrecGov2Parser());
  41     pathType2parser.put(ParsePathType.FBIS, new TrecFBISParser());
  42     pathType2parser.put(ParsePathType.FR94, new TrecFR94Parser());
  43     pathType2parser.put(ParsePathType.FT, new TrecFTParser());
  44     pathType2parser.put(ParsePathType.LATIMES, new TrecLATimesParser());
  45   }
  46
  47   static final Map<String,ParsePathType> pathName2Type = new HashMap<String,ParsePathType>();
  48   static {
  49     for (ParsePathType ppt : ParsePathType.values()) {
  50       pathName2Type.put(ppt.name().toUpperCase(Locale.ENGLISH),ppt);
  51     }
  52   }
  53
  54   /** max length of walk up from file to its ancestors when looking for a known path type */
  55   private static final int MAX_PATH_LENGTH = 10;
  56
  57   /**
  58    * Compute the path type of a file by inspecting name of file and its parents
  59    */
  60   public static ParsePathType pathType(File f) {
  61     int pathLength = 0;
  62     while (f != null && ++pathLength < MAX_PATH_LENGTH) {
  63       ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase(Locale.ENGLISH));
  64       if (ppt!=null) {
  65         return ppt;
  66       }
  67       f = f.getParentFile();
  68     }
  69     return DEFAULT_PATH_TYPE;
  70   }
  71
  72   /**
  73    * parse the text prepared in docBuf into a result DocData,
  74    * no synchronization is required.
  75    * @param docData reusable result
  76    * @param name name that should be set to the result
  77    * @param trecSrc calling trec content source
  78    * @param docBuf text to parse
  79    * @param pathType type of parsed file, or null if unknown - may be used by
  80    * parsers to alter their behavior according to the file path type.
  81    */
  82   public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc,
  83       StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException;
  84
  85   /**
  86    * strip tags from <code>buf</code>: each tag is replaced by a single blank.
  87    * @return text obtained when stripping all tags from <code>buf</code> (Input StringBuilder is unmodified).
  88    */
  89   public static String stripTags(StringBuilder buf, int start) {
  90     return stripTags(buf.substring(start),0);
  91   }
  92
  93   /**
  94    * strip tags from input.
  95    * @see #stripTags(StringBuilder, int)
  96    */
  97   public static String stripTags(String buf, int start) {
  98     if (start>0) {
  99       buf = buf.substring(0);
 100     }
 101     return buf.replaceAll("<[^>]*>", " ");
 102   }
 103
 104   /**
 105    * Extract from <code>buf</code> the text of interest within specified tags
 106    * @param buf entire input text
 107    * @param startTag tag marking start of text of interest
 108    * @param endTag tag marking end of text of interest
 109    * @param maxPos if &ge; 0 sets a limit on start of text of interest
 110    * @return text of interest or null if not found
 111    */
 112   public static String extract(StringBuilder buf, String startTag, String endTag, int maxPos, String noisePrefixes[]) {
 113     int k1 = buf.indexOf(startTag);
 114     if (k1>=0 && (maxPos<0 || k1<maxPos)) {
 115       k1 += startTag.length();
 116       int k2 = buf.indexOf(endTag,k1);
 117       if (k2>=0 && (maxPos<0 || k2<maxPos)) { // found end tag with allowed range
 118         if (noisePrefixes != null) {
 119           for (String noise : noisePrefixes) {
 120             int k1a = buf.indexOf(noise,k1);
 121             if (k1a>=0 && k1a<k2) {
 122               k1 = k1a + noise.length();
 123             }
 124           }
 125         }
 126         return buf.substring(k1,k2).trim();
 127       }
 128     }
 129     return null;
 130   }
 131
 132   //public static void main(String[] args) {
 133   //  System.out.println(stripTags("is it true that<space>2<<second space>><almost last space>1<one more space>?",0));
 134   //}
 135
 136 }