Merge branch 'pretty' of github.com:fnp/wolnelektury into pretty
author     Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
           Mon, 30 Jan 2012 16:44:44 +0000 (17:44 +0100)
committer  Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
           Mon, 30 Jan 2012 16:44:44 +0000 (17:44 +0100)
Conflicts:
wolnelektury/templates/catalogue/book_searched.html

apps/search/index.py

diff --combined apps/search/index.py
@@@ -27,6 -27,7 +27,7 @@@ from librarian import dcparse
  from librarian.parser import WLDocument
  from lxml import etree
  import catalogue.models
+ from pdcounter.models import Author as PDCounterAuthor
  from multiprocessing.pool import ThreadPool
  from threading import current_thread
  import atexit
@@@ -219,6 -220,15 +220,15 @@@ class Index(BaseIndex)
              doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
              self.index.addDocument(doc)
  
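+         # Public-domain counter authors are indexed as pseudo-tags in the
+         # 'pdcounter' category, flagged with is_pdcounter so that tag search
+         # can tell them apart from regular catalogue tags.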
+         for pdtag in PDCounterAuthor.objects.all():
+             doc = Document()
+             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
+             doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
+             doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
+             doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
+             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
+             self.index.addDocument(doc)
+
      def create_book_doc(self, book):
          """
          Create a Lucene document referring to a book id.
  
          # get published date
-         source = book_info.source_name
-         match = self.published_date_re.search(source)
-         if match is not None:
-             fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
+         if hasattr(book_info, 'source_name') and book_info.source_name:
+             match = self.published_date_re.search(book_info.source_name)
+             if match is not None:
+                 fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
  
          return fields
  
              return []
  
          def walker(node, ignore_tags=[]):
 -            yield node, None
 -            for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
 -                for b, e in walker(child):
 -                    yield b, e
 -            yield None, node
 +
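 +            # yields (start_element, text, end_element) triples in document
 +            # order; exactly one of the three is non-None in each item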
 +            if node.tag not in ignore_tags:
 +                yield node, None, None
 +                if node.text is not None:
 +                    yield None, node.text, None
 +                for child in list(node):
 +                    for b, t, e in walker(child, ignore_tags=ignore_tags):
 +                        yield b, t, e
 +                yield None, None, node
 +
 +            if node.tail is not None:
 +                yield None, node.tail, None
              return
  
          def fix_format(text):
  
                  # section content
                  content = []
 -                footnote = None
 -
 -                for start, end in walker(header, ignore_tags=self.ignore_content_tags):
 -                    # handle footnotes
 -                    # if start is not None and start.tag in self.footnote_tags:
 -                    #     footnote = ' '.join(start.itertext())
 -                    # elif end is not None and footnote is not None and end.tag in self.footnote_tags:
 -                    #     doc = add_part(snippets, header_index=position, header_type=header.tag,
 -                    #                    content=footnote)
 +                footnote = []
  
 -                    #     self.index.addDocument(doc)
 +                def all_content(text):
 +                    for frag in fragments.values():
 +                        frag['content'].append(text)
 +                    content.append(text)
 +                handle_text = [all_content]
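 +                # handle_text is a stack of text handlers: the top handler
 +                # receives each piece of text from the walker; pushing None
 +                # silences collection (used for <motyw>), while collect_footnote
 +                # diverts text into the footnote buffer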
  
 -                    #     footnote = None
  
 +                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
 +                    # handle footnotes
 +                    if start is not None and start.tag in self.footnote_tags:
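 +                        # entering a footnote: buffer its text separately
 +                        # instead of mixing it into the section content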
 +                        footnote = []
 +                        def collect_footnote(t):
 +                            footnote.append(t)
 +                        handle_text.append(collect_footnote)
 +                    elif end is not None and footnote and end.tag in self.footnote_tags:
 +                        handle_text.pop()
 +                        doc = add_part(snippets, header_index=position, header_type=header.tag,
 +                                       content=u''.join(footnote),
 +                                       is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
 +                
 +                        self.index.addDocument(doc)
 +                        print "@ footnote text: %s" % footnote
 +                        footnote = []
 +                    
                      # handle fragments and themes.
                      if start is not None and start.tag == 'begin':
                          fid = start.attrib['id'][1:]
                          fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
  
 +                    # themes for this fragment
                      elif start is not None and start.tag == 'motyw':
                          fid = start.attrib['id'][1:]
 +                        handle_text.append(None)
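 +                        # keep theme names out of the collected section text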
                          if start.text is not None:
                              fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
 +                    elif end is not None and end.tag == 'motyw':
 +                        handle_text.pop()
  
                      elif start is not None and start.tag == 'end':
                          fid = start.attrib['id'][1:]
                          if fid not in fragments:
                              continue  # a broken <end> node, skip it
 -                                      #                        import pdb; pdb.set_trace()
                          frag = fragments[fid]
                          if frag['themes'] == []:
                              continue  # empty themes list.
                                         fragment_anchor=fid,
                                         content=fix_format(frag['content']),
                                         themes=frag['themes'])
 -
 +                        print '@ FRAG %s' % frag['content']
                          self.index.addDocument(doc)
  
                          # Collect content.
 -                    elif start is not None:
 -                        for frag in fragments.values():
 -                            frag['content'].append(start.text)
 -                        content.append(start.text)
 -                    elif end is not None:
 -                        for frag in fragments.values():
 -                            frag['content'].append(end.tail)
 -                        content.append(end.tail)
 +
 +                    if text is not None and handle_text:
 +                        hdl = handle_text[-1]
 +                        if hdl is not None:
 +                            hdl(text)
  
                          # in the end, add a section text.
                  doc = add_part(snippets, header_index=position, header_type=header.tag,
                                 content=fix_format(content))
 +                print '@ CONTENT: %s' % fix_format(content)
  
                  self.index.addDocument(doc)
  
@@@ -624,26 -615,25 +635,25 @@@ class SearchResult(object)
          stored = search.searcher.doc(scoreDocs.doc)
          self.book_id = int(stored.get("book_id"))
  
-         header_type = stored.get("header_type")
-         if not header_type:
-             return
-         sec = (header_type, int(stored.get("header_index")))
-         header_span = stored.get('header_span')
-         header_span = header_span is not None and int(header_span) or 1
-         fragment = stored.get("fragment_anchor")
          pd = stored.get("published_date")
          if pd is None:
              pd = 0
          self.published_date = int(pd)
  
-         if snippets:
-             snippets = snippets.replace("/\n", "\n")
-         hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
+         header_type = stored.get("header_type")
+         # we have a content hit in some header or fragment
+         if header_type is not None:
+             sec = (header_type, int(stored.get("header_index")))
+             header_span = stored.get('header_span')
+             header_span = header_span is not None and int(header_span) or 1
+             fragment = stored.get("fragment_anchor")
  
-         self._hits.append(hit)
+             if snippets:
+                 snippets = snippets.replace("/\n", "\n")
+             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
+             self._hits.append(hit)
  
          self.search = search
          self.searched = searched
@@@ -1228,19 -1218,27 +1238,27 @@@ class Search(IndexStore)
          if terms:
              return JArray('object')(terms, Term)
  
-     def search_tags(self, query, filter=None, max_results=40):
+     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
          """
          Search for Tag objects using query.
          """
-         tops = self.searcher.search(query, filter, max_results)
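+         # unless pdcounter tags were requested, filter out documents
+         # flagged with is_pdcounter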
+         if not pdcounter:
+             filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
+         tops = self.searcher.search(query, filters, max_results)
  
          tags = []
          for found in tops.scoreDocs:
              doc = self.searcher.doc(found.doc)
-             tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
-             tags.append(tag)
-             print "%s (%d) -> %f" % (tag, tag.id, found.score)
+             is_pdcounter = doc.get('is_pdcounter')
+             if is_pdcounter:
+                 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
+             else:
+                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
+             # don't add the pdcounter tag if the same tag already exists
+             if not (is_pdcounter and any(t.slug == tag.slug for t in tags)):
+                 tags.append(tag)
+                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
+         print 'returning %s' % tags
          return tags
  
      def search_books(self, query, filter=None, max_results=10):
              bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
          return bks
  
-     def create_prefix_phrase(self, toks, field):
+     def make_prefix_phrase(self, toks, field):
          q = MultiPhraseQuery()
          for i in range(len(toks)):
              t = Term(field, toks[i])
  
          return only_term
  
-     def hint_tags(self, string, max_results=50):
+     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
          """
          Return auto-complete hints for tags,
          using prefix search by default (term search when prefix=False).
          top = BooleanQuery()
  
          for field in ['tag_name', 'tag_name_pl']:
-             q = self.create_prefix_phrase(toks, field)
+             if prefix:
+                 q = self.make_prefix_phrase(toks, field)
+             else:
+                 q = self.make_term_query(toks, field)
              top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
  
          no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
  
-         return self.search_tags(top, no_book_cat, max_results=max_results)
+         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
  
-     def hint_books(self, string, max_results=50):
+     def hint_books(self, string, max_results=50, prefix=True):
          """
          Return auto-complete hints for book titles, searched directly
          because we do not index 'pseudo' title-tags.
          """
          toks = self.get_tokens(string, field='SIMPLE')
  
-         q = self.create_prefix_phrase(toks, 'title')
+         if prefix:
+             q = self.make_prefix_phrase(toks, 'title')
+         else:
+             q = self.make_term_query(toks, 'title')
  
          return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)