X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/package.html diff --git a/lucene-java-3.4.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/package.html b/lucene-java-3.4.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/package.html deleted file mode 100755 index b890f25..0000000 --- a/lucene-java-3.4.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/package.html +++ /dev/null @@ -1,99 +0,0 @@ - - - -
- -The highlight package contains classes to provide "keyword in context" features -typically used to highlight search terms in the text of results pages. -The Highlighter class is the central component and can be used to extract the -most interesting sections of a piece of text and highlight them, with the help of -Fragmenter, fragment Scorer, and Formatter classes. - -- //... Above, create documents with two fields, one with term vectors (tv) and one without (notv) - IndexSearcher searcher = new IndexSearcher(directory); - QueryParser parser = new QueryParser("notv", analyzer); - Query query = parser.parse("million"); - - TopDocs hits = searcher.search(query, 10); - - SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); - Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query)); - for (int i = 0; i < 10; i++) { - int id = hits.scoreDocs[i].doc; - Document doc = searcher.doc(id); - String text = doc.get("notv"); - TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "notv", analyzer); - TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);//highlighter.getBestFragments(tokenStream, text, 3, "..."); - for (int j = 0; j < frag.length; j++) { - if ((frag[j] != null) && (frag[j].getScore() > 0)) { - System.out.println((frag[j].toString())); - } - } - //Term vector - text = doc.get("tv"); - tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), hits.scoreDocs[i].doc, "tv", analyzer); - frag = highlighter.getBestTextFragments(tokenStream, text, false, 10); - for (int j = 0; j < frag.length; j++) { - if ((frag[j] != null) && (frag[j].getScore() > 0)) { - System.out.println((frag[j].toString())); - } - } - System.out.println("-------------"); - } -- -
-The highlighter takes a TokenStream as input. Until now these streams have typically been produced -using an Analyzer but the new class TokenSources provides helper methods for obtaining TokenStreams from -the new TermVector position support (see latest CVS version).
- -The new class GradientFormatter can use a scale of colors to highlight terms according to their score. -A subtle use of color can help emphasise the reasons for matching (useful when doing "MoreLikeThis" queries and -you want to see what the basis of the similarities is).
- -The QueryScorer class has a new constructor which can use an IndexReader to derive the IDF (inverse document frequency) -for each term in order to influence the score. This is useful for helping to extract the most significant sections -of a document and in supplying scores used by the new GradientFormatter to color significant words more strongly. -The QueryScorer.getMaxWeight method is useful when passed to the GradientFormatter constructor to define the top score -which is associated with the top color.
- - - - - - \ No newline at end of file