X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj?ds=sidebyside diff --git a/lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj b/lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj new file mode 100644 index 0000000..7aff98e --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj @@ -0,0 +1,386 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// HTMLParser.jj + +options { + STATIC = false; + //DEBUG_LOOKAHEAD = true; + //DEBUG_TOKEN_MANAGER = true; + UNICODE_INPUT = true; +} + +PARSER_BEGIN(HTMLParser) + +package org.apache.lucene.benchmark.byTask.feeds.demohtml; + +import java.io.*; +import java.util.Locale; +import java.util.Properties; + +public class HTMLParser { + public static int SUMMARY_LENGTH = 200; + + StringBuffer title = new StringBuffer(SUMMARY_LENGTH); + StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2); + Properties metaTags=new Properties(); + String currentMetaTag=null; + String currentMetaContent=null; + int length = 0; + boolean titleComplete = false; + boolean inTitle = false; + boolean inMetaTag = false; + boolean inStyle = false; + boolean afterTag = false; + boolean afterSpace = false; + String eol = System.getProperty("line.separator"); + Reader pipeIn = null; + Writer pipeOut; + private MyPipedInputStream pipeInStream = null; + private PipedOutputStream pipeOutStream = null; + + private class MyPipedInputStream extends PipedInputStream{ + + public MyPipedInputStream(){ + super(); + } + + public MyPipedInputStream(PipedOutputStream src) throws IOException{ + super(src); + } + + public boolean full() throws IOException{ + return this.available() >= PipedInputStream.PIPE_SIZE; + } + } + + public String getTitle() throws IOException, InterruptedException { + if (pipeIn == null) + getReader(); // spawn parsing thread + while (true) { + synchronized(this) { + if (titleComplete || pipeInStream.full()) + break; + wait(10); + } + } + return title.toString().trim(); + } + + public Properties getMetaTags() throws IOException, +InterruptedException { + if (pipeIn == null) + getReader(); // spawn parsing thread + while (true) { + synchronized(this) { + if (titleComplete || pipeInStream.full()) + break; + wait(10); + } + } + return metaTags; + } + + + public String getSummary() throws IOException, InterruptedException { + if (pipeIn == null) + getReader(); // spawn parsing thread + while (true) { + synchronized(this) { + if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full()) + break; + wait(10); + } + } + if (summary.length() > SUMMARY_LENGTH) + summary.setLength(SUMMARY_LENGTH); + + String sum = summary.toString().trim(); + String tit = getTitle(); + if (sum.equals("")) + return tit; + else + return sum; + } + + public Reader getReader() throws IOException { + if (pipeIn == null) { + pipeInStream = new MyPipedInputStream(); + pipeOutStream = new PipedOutputStream(pipeInStream); + pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE"); + pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE"); + + Thread thread = new ParserThread(this); + thread.start(); // start parsing + } + + return pipeIn; + } + + void addToSummary(String text) { + if (summary.length() < SUMMARY_LENGTH) { + summary.append(text); + if (summary.length() >= SUMMARY_LENGTH) { + synchronized(this) { + notifyAll(); + } + } + } + } + + void addText(String text) throws IOException { + if (inStyle) + return; + if (inTitle) + title.append(text); + else { + addToSummary(text); + if (!titleComplete && !(title.length() == 0)) { // finished title + synchronized(this) { + titleComplete = true; // tell waiting threads + notifyAll(); + } + } + } + + length += text.length(); + pipeOut.write(text); + + afterSpace = false; + } + + void addMetaTag() { + metaTags.setProperty(currentMetaTag, currentMetaContent); + currentMetaTag = null; + currentMetaContent = null; + return; + } + + void addSpace() throws IOException { + if (!afterSpace) { + if (inTitle) + title.append(" "); + else + addToSummary(" "); + + String space = afterTag ? eol : " "; + length += space.length(); + pipeOut.write(space); + afterSpace = true; + } + } + +// void handleException(Exception e) { +// System.out.println(e.toString()); // print the error message +// System.out.println("Skipping..."); +// Token t; +// do { +// t = getNextToken(); +// } while (t.kind != TagEnd); +// } +} + +PARSER_END(HTMLParser) + + +void HTMLDocument() throws IOException : +{ + Token t; +} +{ +// try { + ( Tag() { afterTag = true; } + | t=Decl() { afterTag = true; } + | CommentTag() { afterTag = true; } + | ScriptTag() { afterTag = true; } + | t= { addText(t.image); afterTag = false; } + | t= { addText(Entities.decode(t.image)); afterTag = false; } + | t= { addText(t.image); afterTag = false; } + | { addSpace(); afterTag = false; } + )* +// } catch (ParseException e) { +// handleException(e); +// } +} + +void Tag() throws IOException : +{ + Token t1, t2; + boolean inImg = false; +} +{ + t1= { + String tagName = t1.image.toLowerCase(Locale.ENGLISH); + if(Tags.WS_ELEMS.contains(tagName) ) { + addSpace(); + } + inTitle = tagName.equalsIgnoreCase(" + inMetaTag = tagName.equalsIgnoreCase(" + inStyle = tagName.equalsIgnoreCase(" + inImg = tagName.equalsIgnoreCase(" + } + (t1= + ( + (t2=ArgValue() // save ALT text in IMG tag + { + if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null) + addText("[" + t2.image + "]"); + + if(inMetaTag && + ( t1.image.equalsIgnoreCase("name") || + t1.image.equalsIgnoreCase("HTTP-EQUIV") + ) + && t2 != null) + { + currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH); + if(currentMetaTag != null && currentMetaContent != null) { + addMetaTag(); + } + } + if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 != +null) + { + currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH); + if(currentMetaTag != null && currentMetaContent != null) { + addMetaTag(); + } + } + } + )? + )? + )* + +} + +Token ArgValue() : +{ + Token t = null; +} +{ + t= { return t; } +| LOOKAHEAD(2) + { return t; } +| t= { return t; } +| LOOKAHEAD(2) + { return t; } +| t= { return t; } +} + + +Token Decl() : +{ + Token t; +} +{ + t= ( | ArgValue() | )* + { return t; } +} + + +void CommentTag() : +{} +{ + ( ( )* ) + | + ( ( )* ) +} + +void ScriptTag() : +{} +{ + ( )* +} + + +TOKEN : +{ + < ScriptStart: " : WithinScript +| < TagName: "<" ("/")? ["A"-"Z","a"-"z"] ()? > : WithinTag +| < DeclName: "<" "!" ["A"-"Z","a"-"z"] ()? > : WithinTag + +| < Comment1: "" > : DEFAULT +} + + TOKEN : +{ + < CommentText2: (~[">"])+ > +| < CommentEnd2: ">" > : DEFAULT +}