pylucene 3.5.0-3
[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / benchmark / src / java / org / apache / lucene / benchmark / byTask / feeds / demohtml / HTMLParser.jj
diff --git a/lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj b/lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj
new file mode 100644 (file)
index 0000000..7aff98e
--- /dev/null
@@ -0,0 +1,386 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// HTMLParser.jj
+
+options {
+  STATIC = false;
+  //DEBUG_LOOKAHEAD = true;
+  //DEBUG_TOKEN_MANAGER = true;
+  UNICODE_INPUT = true;
+}
+
+PARSER_BEGIN(HTMLParser)
+
+package org.apache.lucene.benchmark.byTask.feeds.demohtml;
+
+import java.io.*;
+import java.util.Locale;
+import java.util.Properties;
+
+public class HTMLParser {
+  public static int SUMMARY_LENGTH = 200;
+
+  StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
+  StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
+  Properties metaTags=new Properties();
+  String currentMetaTag=null;
+  String currentMetaContent=null;
+  int length = 0;
+  boolean titleComplete = false;
+  boolean inTitle = false;
+  boolean inMetaTag = false;
+  boolean inStyle = false;
+  boolean afterTag = false;
+  boolean afterSpace = false;
+  String eol = System.getProperty("line.separator");
+  Reader pipeIn = null;
+  Writer pipeOut;
+  private MyPipedInputStream pipeInStream = null;
+  private PipedOutputStream pipeOutStream = null;
+  
+  private class MyPipedInputStream extends PipedInputStream{
+    
+    public MyPipedInputStream(){
+      super();
+    }
+    
+    public MyPipedInputStream(PipedOutputStream src) throws IOException{
+      super(src);
+    }
+    
+    public boolean full() throws IOException{
+      return this.available() >= PipedInputStream.PIPE_SIZE;
+    }
+  }
+
+  public String getTitle() throws IOException, InterruptedException {
+    if (pipeIn == null)
+      getReader();                               // spawn parsing thread
+    while (true) {
+      synchronized(this) {
+       if (titleComplete || pipeInStream.full())
+         break;
+       wait(10);
+      }
+    }
+    return title.toString().trim();
+  }
+
+  public Properties getMetaTags() throws IOException,
+InterruptedException {
+    if (pipeIn == null)
+      getReader();                               // spawn parsing thread
+    while (true) {
+      synchronized(this) {
+       if (titleComplete || pipeInStream.full())
+         break;
+       wait(10);
+      }
+    }
+    return metaTags;
+  }
+
+
+  public String getSummary() throws IOException, InterruptedException {
+    if (pipeIn == null)
+      getReader();                               // spawn parsing thread
+    while (true) {
+      synchronized(this) {
+       if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
+         break;
+       wait(10);
+      }
+    }
+    if (summary.length() > SUMMARY_LENGTH)
+      summary.setLength(SUMMARY_LENGTH);
+
+    String sum = summary.toString().trim();
+    String tit = getTitle();
+    if (sum.equals(""))
+      return tit;
+    else
+      return sum;
+  }
+
+  public Reader getReader() throws IOException {
+    if (pipeIn == null) {
+      pipeInStream = new MyPipedInputStream();
+      pipeOutStream = new PipedOutputStream(pipeInStream);
+      pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
+      pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
+
+      Thread thread = new ParserThread(this);
+      thread.start();                            // start parsing
+    }
+
+    return pipeIn;
+  }
+
+  void addToSummary(String text) {
+    if (summary.length() < SUMMARY_LENGTH) {
+      summary.append(text);
+      if (summary.length() >= SUMMARY_LENGTH) {
+       synchronized(this) {
+         notifyAll();
+       }
+      }
+    }
+  }
+
+  void addText(String text) throws IOException {
+    if (inStyle)
+      return;
+    if (inTitle)
+      title.append(text);
+    else {
+      addToSummary(text);
+      if (!titleComplete && !(title.length() == 0)) {  // finished title
+       synchronized(this) {
+         titleComplete = true;                   // tell waiting threads
+         notifyAll();
+       }
+      }
+    }
+
+    length += text.length();
+    pipeOut.write(text);
+
+    afterSpace = false;
+  }
+  
+  void addMetaTag() {
+      metaTags.setProperty(currentMetaTag, currentMetaContent);
+      currentMetaTag = null;
+      currentMetaContent = null;
+      return;
+  }
+
+  void addSpace() throws IOException {
+    if (!afterSpace) {
+      if (inTitle)
+       title.append(" ");
+      else
+       addToSummary(" ");
+
+      String space = afterTag ? eol : " ";
+      length += space.length();
+      pipeOut.write(space);
+      afterSpace = true;
+    }
+  }
+
+//    void handleException(Exception e) {
+//      System.out.println(e.toString());  // print the error message
+//      System.out.println("Skipping...");
+//      Token t;
+//      do {
+//        t = getNextToken();
+//      } while (t.kind != TagEnd);
+//    }
+}
+
+PARSER_END(HTMLParser)
+
+
+void HTMLDocument() throws IOException :
+{
+  Token t;
+}
+{
+//  try {
+    ( Tag()         { afterTag = true; }
+    | t=Decl()      { afterTag = true; }
+    | CommentTag()  { afterTag = true; }
+    | ScriptTag()  { afterTag = true; }
+    | t=<Word>      { addText(t.image); afterTag = false; }
+    | t=<Entity>    { addText(Entities.decode(t.image)); afterTag = false; }
+    | t=<Punct>     { addText(t.image); afterTag = false; }
+    | <Space>       { addSpace(); afterTag = false; }
+    )* <EOF>
+//  } catch (ParseException e) {
+//    handleException(e);
+//  }
+}
+
+void Tag() throws IOException :
+{
+  Token t1, t2;
+  boolean inImg = false;
+}
+{
+  t1=<TagName> {
+   String tagName = t1.image.toLowerCase(Locale.ENGLISH);
+   if(Tags.WS_ELEMS.contains(tagName) ) {
+      addSpace();
+    }
+    inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
+    inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
+    inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
+    inImg = tagName.equalsIgnoreCase("<img");    // keep track if in <IMG>
+  }
+  (t1=<ArgName>
+   (<ArgEquals>
+    (t2=ArgValue()                               // save ALT text in IMG tag
+     {
+       if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
+         addText("[" + t2.image + "]");
+
+       if(inMetaTag &&
+                       (  t1.image.equalsIgnoreCase("name") ||
+                          t1.image.equalsIgnoreCase("HTTP-EQUIV")
+                       )
+          && t2 != null)
+       {
+               currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH);
+               if(currentMetaTag != null && currentMetaContent != null) {
+               addMetaTag();
+               }
+       }
+       if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
+null)
+       {
+               currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH);
+               if(currentMetaTag != null && currentMetaContent != null) {
+               addMetaTag();
+               }
+       }
+     }
+    )?
+   )?
+  )*
+  <TagEnd>
+}
+
+Token ArgValue() :
+{
+  Token t = null;
+}
+{
+  t=<ArgValue>                              { return t; }
+| LOOKAHEAD(2)
+  <ArgQuote1> <CloseQuote1>                 { return t; }
+| <ArgQuote1> t=<Quote1Text> <CloseQuote1>  { return t; }
+| LOOKAHEAD(2)
+  <ArgQuote2> <CloseQuote2>                 { return t; }
+| <ArgQuote2> t=<Quote2Text> <CloseQuote2>  { return t; }
+}
+
+
+Token Decl() :
+{
+  Token t;
+}
+{
+  t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>
+  { return t; }
+}
+
+
+void CommentTag() :
+{}
+{
+  (<Comment1> ( <CommentText1> )* <CommentEnd1>)
+ |
+  (<Comment2> ( <CommentText2> )* <CommentEnd2>)
+}
+
+void ScriptTag() :
+{}
+{
+  <ScriptStart> ( <ScriptText> )* <ScriptEnd>
+}
+
+
+TOKEN :
+{
+  < ScriptStart: "<script" > : WithinScript
+| < TagName:  "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
+| < DeclName: "<"  "!"   ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
+
+| < Comment1:  "<!--" > : WithinComment1
+| < Comment2:  "<!" >   : WithinComment2
+
+| < Word:     ( <LET> | <LET> (["+","/"])+ | <NUM> ["\""] |
+                <LET> ["-","'"] <LET> | ("$")? <NUM> [",","."] <NUM> )+ >
+| < #LET:     ["A"-"Z","a"-"z","0"-"9"] >
+| < #NUM:     ["0"-"9"] >
+| < #HEX:     ["0"-"9","A"-"F","a"-"f"] >
+
+| < Entity:   ( "&" (["A"-"Z","a"-"z"])+ (";")? | "&" "#" (<NUM>)+ (";")? | "&" "#" ["X","x"] (<HEX>)+ (";")? ) >
+
+| < Space:    (<SP>)+ >
+| < #SP:      [" ","\t","\r","\n"] >
+
+| < Punct:    ~[] > // Keep this last.  It is a catch-all.
+}
+
+<WithinScript> TOKEN:
+{
+  < ScriptText:  (~["<",">"])+ | "<" | ">" >
+| < ScriptEnd: "</script" (~["<",">"])* ">" > : DEFAULT
+}
+
+<WithinTag> TOKEN:
+{
+  < ArgName:   (~[" ","\t","\r","\n","=",">","'","\""])
+               (~[" ","\t","\r","\n","=",">"])* >
+| < ArgEquals: "=" >  : AfterEquals
+| < TagEnd:    ">" | "=>" >  : DEFAULT
+}
+
+<AfterEquals> TOKEN:
+{
+  < ArgValue:  (~[" ","\t","\r","\n","=",">","'","\""])
+              (~[" ","\t","\r","\n",">"])* > : WithinTag
+}
+
+<WithinTag, AfterEquals> TOKEN:
+{
+  < ArgQuote1: "'"  > : WithinQuote1
+| < ArgQuote2: "\"" > : WithinQuote2
+}
+
+<WithinTag, AfterEquals> SKIP:
+{
+  < <Space> >
+}
+
+<WithinQuote1> TOKEN:
+{
+  < Quote1Text:  (~["'"])+ >
+| < CloseQuote1: <ArgQuote1> > : WithinTag
+}
+
+<WithinQuote2> TOKEN:
+{
+  < Quote2Text:  (~["\""])+ >
+| < CloseQuote2: <ArgQuote2> > : WithinTag
+}
+
+
+<WithinComment1> TOKEN :
+{
+  < CommentText1:  (~["-"])+ | "-" >
+| < CommentEnd1:   "-->" > : DEFAULT
+}
+
+<WithinComment2> TOKEN :
+{
+  < CommentText2:  (~[">"])+ >
+| < CommentEnd2:   ">" > : DEFAULT
+}