lucene-java-3.4.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java

   1 package org.apache.lucene.benchmark.byTask.feeds;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.Date;
  22
  23 /**
  24  * Parser for the FT docs in trec disks 4+5 collection format
  25  */
  26 public class TrecLATimesParser extends TrecDocParser {
  27
  28   private static final String DATE = "<DATE>";
  29   private static final String DATE_END = "</DATE>";
  30   private static final String DATE_NOISE = "day,"; // anything aftre the ','
  31
  32   private static final String SUBJECT = "<SUBJECT>";
  33   private static final String SUBJECT_END = "</SUBJECT>";
  34   private static final String HEADLINE = "<HEADLINE>";
  35   private static final String HEADLINE_END = "</HEADLINE>";
  36
  37   @Override
  38   public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
  39       StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
  40     int mark = 0; // that much is skipped
  41
  42     // date...
  43     Date date = null;
  44     String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
  45     if (dateStr != null) {
  46       int d2a = dateStr.indexOf(DATE_NOISE);
  47       if (d2a > 0) {
  48         dateStr = dateStr.substring(0,d2a+3); // we need the "day" part
  49       }
  50       dateStr = stripTags(dateStr,0).toString();
  51       date = trecSrc.parseDate(dateStr.trim());
  52     }
  53
  54     // title... first try with SUBJECT, them with HEADLINE
  55     String title = extract(docBuf, SUBJECT, SUBJECT_END, -1, null);
  56     if (title==null) {
  57       title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
  58     }
  59     if (title!=null) {
  60       title = stripTags(title,0).toString().trim();
  61     }
  62
  63     docData.clear();
  64     docData.setName(name);
  65     docData.setDate(date);
  66     docData.setTitle(title);
  67     docData.setBody(stripTags(docBuf, mark).toString());
  68     return docData;
  69   }
  70
  71 }