Merge branch 'pretty' of github.com:fnp/wolnelektury into pretty
author     Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
           Mon, 30 Jan 2012 16:44:44 +0000 (17:44 +0100)
committer  Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
           Mon, 30 Jan 2012 16:44:44 +0000 (17:44 +0100)
Conflicts:
wolnelektury/templates/catalogue/book_searched.html

apps/search/index.py

diff --combined apps/search/index.py
@@@ -27,6 -27,7 +27,7 @@@ from librarian import dcparse
  from librarian.parser import WLDocument
  from lxml import etree
  import catalogue.models
+ from pdcounter.models import Author as PDCounterAuthor
  from multiprocessing.pool import ThreadPool
  from threading import current_thread
  import atexit
@@@ -219,6 -220,15 +220,15 @@@ class Index(BaseIndex)
              doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
              self.index.addDocument(doc)
  
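+         # Public-domain counter authors are indexed as pseudo-tags in the
+         # 'pdcounter' category, flagged with is_pdcounter so that tag search
+         # can tell them apart from regular catalogue tags.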
+         for pdtag in PDCounterAuthor.objects.all():
+             doc = Document()
+             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
+             doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
+             doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
+             doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
+             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
+             self.index.addDocument(doc)
+
      def create_book_doc(self, book):
          """
          Create a Lucene document referring to a book id.
  
          # get published date
-         source = book_info.source_name
-         match = self.published_date_re.search(source)
-         if match is not None:
-             fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
+         if hasattr(book_info, 'source_name') and book_info.source_name:
+             match = self.published_date_re.search(book_info.source_name)
+             if match is not None:
+                 fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
  
          return fields
  
              return []
  
          def walker(node, ignore_tags=[]):
 -            yield node, None
 -            for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
 -                for b, e in walker(child):
 -                    yield b, e
 -            yield None, node
 +
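 +            # yields (start_element, text, end_element) triples in document
 +            # order; exactly one of the three is non-None in each item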
 +            if node.tag not in ignore_tags:
 +                yield node, None, None
 +                if node.text is not None:
 +                    yield None, node.text, None
 +                for child in list(node):
 +                    for b, t, e in walker(child, ignore_tags=ignore_tags):
 +                        yield b, t, e
 +                yield None, None, node
 +
 +            if node.tail is not None:
 +                yield None, node.tail, None
              return
  
          def fix_format(text):
  
                  # section content
                  content = []
 -                footnote = None
 -
 -                for start, end in walker(header, ignore_tags=self.ignore_content_tags):
 -                    # handle footnotes
 -                    # if start is not None and start.tag in self.footnote_tags:
 -                    #     footnote = ' '.join(start.itertext())
 -                    # elif end is not None and footnote is not None and end.tag in self.footnote_tags:
 -                    #     doc = add_part(snippets, header_index=position, header_type=header.tag,
 -                    #                    content=footnote)
 +                footnote = []
  
 -                    #     self.index.addDocument(doc)
 +                def all_content(text):
 +                    for frag in fragments.values():
 +                        frag['content'].append(text)
 +                    content.append(text)
 +                handle_text = [all_content]
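 +                # handle_text is a stack of text handlers: the top handler
 +                # receives each piece of text from the walker; pushing None
 +                # silences collection (used for <motyw>), while collect_footnote
 +                # diverts text into the footnote buffer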
  
 -                    #     footnote = None
  
 +                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
 +                    # handle footnotes
 +                    if start is not None and start.tag in self.footnote_tags:
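 +                        # entering a footnote: buffer its text separately
 +                        # instead of mixing it into the section content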
 +                        footnote = []
 +                        def collect_footnote(t):
 +                            footnote.append(t)
 +                        handle_text.append(collect_footnote)
 +                    elif end is not None and footnote and end.tag in self.footnote_tags:
 +                        handle_text.pop()
 +                        doc = add_part(snippets, header_index=position, header_type=header.tag,
 +                                       content=u''.join(footnote),
 +                                       is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
 +                
 +                        self.index.addDocument(doc)
 +                        print "@ footnote text: %s" % footnote
 +                        footnote = []
 +                    
                      # handle fragments and themes.
                      if start is not None and start.tag == 'begin':
                          fid = start.attrib['id'][1:]
                          fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
  
 +                    # themes for this fragment
                      elif start is not None and start.tag == 'motyw':
                          fid = start.attrib['id'][1:]
 +                        handle_text.append(None)
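 +                        # keep theme names out of the collected section text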
                          if start.text is not None:
                              fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
 +                    elif end is not None and end.tag == 'motyw':
 +                        handle_text.pop()
  
                      elif start is not None and start.tag == 'end':
                          fid = start.attrib['id'][1:]
                          if fid not in fragments:
                              continue  # a broken <end> node, skip it
 -                                      #                        import pdb; pdb.set_trace()
                          frag = fragments[fid]
                          if frag['themes'] == []:
                              continue  # empty themes list.
                                         fragment_anchor=fid,
                                         content=fix_format(frag['content']),
                                         themes=frag['themes'])
 -
 +                        print '@ FRAG %s' % frag['content']
                          self.index.addDocument(doc)
  
                          # Collect content.
 -                    elif start is not None:
 -                        for frag in fragments.values():
 -                            frag['content'].append(start.text)
 -                        content.append(start.text)
 -                    elif end is not None:
 -                        for frag in fragments.values():
 -                            frag['content'].append(end.tail)
 -                        content.append(end.tail)
 +
 +                    if text is not None and handle_text:
 +                        hdl = handle_text[-1]
 +                        if hdl is not None:
 +                            hdl(text)
  
                          # in the end, add a section text.
                  doc = add_part(snippets, header_index=position, header_type=header.tag,
                                 content=fix_format(content))
 +                print '@ CONTENT: %s' % fix_format(content)
  
                  self.index.addDocument(doc)
  
@@@ -624,26 -615,25 +635,25 @@@ class SearchResult(object)
          stored = search.searcher.doc(scoreDocs.doc)
          self.book_id = int(stored.get("book_id"))
  
-         header_type = stored.get("header_type")
-         if not header_type:
-             return
-         sec = (header_type, int(stored.get("header_index")))
-         header_span = stored.get('header_span')
-         header_span = header_span is not None and int(header_span) or 1
-         fragment = stored.get("fragment_anchor")
          pd = stored.get("published_date")
          if pd is None:
              pd = 0
          self.published_date = int(pd)
  
-         if snippets:
-             snippets = snippets.replace("/\n", "\n")
-         hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
+         header_type = stored.get("header_type")
+         # we have a content hit in some header or fragment
+         if header_type is not None:
+             sec = (header_type, int(stored.get("header_index")))
+             header_span = stored.get('header_span')
+             header_span = header_span is not None and int(header_span) or 1
+             fragment = stored.get("fragment_anchor")
  
-         self._hits.append(hit)
+             if snippets:
+                 snippets = snippets.replace("/\n", "\n")
+             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
+             self._hits.append(hit)
  
          self.search = search
          self.searched = searched
@@@ -1228,19 -1218,27 +1238,27 @@@ class Search(IndexStore)
          if terms:
              return JArray('object')(terms, Term)
  
-     def search_tags(self, query, filter=None, max_results=40):
+     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
          """
          Search for Tag objects using query.
          """
-         tops = self.searcher.search(query, filter, max_results)
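+         # unless pdcounter tags were requested, filter out documents
+         # flagged with is_pdcounter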
+         if not pdcounter:
+             filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
+         tops = self.searcher.search(query, filters, max_results)
  
          tags = []
          for found in tops.scoreDocs:
              doc = self.searcher.doc(found.doc)
-             tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
-             tags.append(tag)
-             print "%s (%d) -> %f" % (tag, tag.id, found.score)
+             is_pdcounter = doc.get('is_pdcounter')
+             if is_pdcounter:
+                 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
+             else:
+                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
+             # don't add the pdcounter tag if the same tag already exists
+             if not (is_pdcounter and any(t.slug == tag.slug for t in tags)):
+                 tags.append(tag)
+                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
+         print 'returning %s' % tags
          return tags
  
      def search_books(self, query, filter=None, max_results=10):
              bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
          return bks
  
-     def create_prefix_phrase(self, toks, field):
+     def make_prefix_phrase(self, toks, field):
          q = MultiPhraseQuery()
          for i in range(len(toks)):
              t = Term(field, toks[i])
  
          return only_term
  
-     def hint_tags(self, string, max_results=50):
+     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
          """
          Return auto-complete hints for tags,
          using prefix search by default (term search when prefix=False).
          top = BooleanQuery()
  
          for field in ['tag_name', 'tag_name_pl']:
-             q = self.create_prefix_phrase(toks, field)
+             if prefix:
+                 q = self.make_prefix_phrase(toks, field)
+             else:
+                 q = self.make_term_query(toks, field)
              top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
  
          no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
  
-         return self.search_tags(top, no_book_cat, max_results=max_results)
+         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
  
-     def hint_books(self, string, max_results=50):
+     def hint_books(self, string, max_results=50, prefix=True):
          """
          Return auto-complete hints for book titles, searched directly
          because we do not index 'pseudo' title-tags.
          """
          toks = self.get_tokens(string, field='SIMPLE')
  
-         q = self.create_prefix_phrase(toks, 'title')
+         if prefix:
+             q = self.make_prefix_phrase(toks, 'title')
+         else:
+             q = self.make_term_query(toks, 'title')
  
          return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)