def add_part(snippets, **fields):
    """Build the index document for a single part of the book.

    Relies on ``self``, ``book`` and ``book_fields`` from the enclosing
    scope. The text is stored in ``snippets`` and the position/length
    returned by ``snippets.add`` is recorded in the document.

    Required keyword arguments: ``header_index``, ``header_type``, ``text``.
    Optional keyword arguments: ``header_span`` (a missing or falsy value
    becomes 1), ``fragment_anchor``, ``themes``. Any other keyword argument
    is accepted but not stored.

    Returns the populated document; adding it to the index is left to
    the caller.
    """
    part = self.create_book_doc(book)
    for key, value in book_fields.items():
        part[key] = value

    part['header_index'] = fields["header_index"]
    part['header_span'] = fields.get('header_span') or 1
    part['header_type'] = fields['header_type']

    part['text'] = fields['text']

    # Store the text among the snippets and remember where it landed.
    location = snippets.add(fields["text"])
    part['snippets_position'] = location[0]
    part['snippets_length'] = location[1]
    if snippets.revision:
        part["snippets_revision"] = snippets.revision

    for optional_key in ('fragment_anchor', 'themes'):
        if optional_key in fields:
            part[optional_key] = fields[optional_key]

    # The uid combines book id, header position/span and the fragment
    # anchor (empty string when the part is not a fragment).
    uid_parts = (
        book.id,
        part['header_index'],
        part['header_span'],
        part.get('fragment_anchor', ''),
    )
    part['uid'] = "part%s-%s-%s-%s" % uid_parts
    return part
-
- fragments = {}
- snippets = Snippets(book.id).open('w')
- try:
- for header, position in zip(list(master), range(len(master))):
-
- if header.tag in self.skip_header_tags:
- continue
- if header.tag is etree.Comment:
- continue
-
- # section content
- content = []
- footnote = []
-
- def all_content(text):
- for frag in fragments.values():
- frag['text'].append(text)
- content.append(text)
- handle_text = [all_content]
-
- for start, text, end in walker(header):
- # handle footnotes
- if start is not None and start.tag in self.footnote_tags:
- footnote = []
-
- def collect_footnote(t):
- footnote.append(t)
-
- handle_text.append(collect_footnote)
- elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
- handle_text.pop()
- doc = add_part(snippets, header_index=position, header_type=header.tag,
- text=''.join(footnote),
- is_footnote=True)
- self.index.add(doc)
- footnote = []
-
- # handle fragments and themes.
- if start is not None and start.tag == 'begin':
- fid = start.attrib['id'][1:]
- fragments[fid] = {
- 'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
-
- # themes for this fragment
- elif start is not None and start.tag == 'motyw':
- fid = start.attrib['id'][1:]
- handle_text.append(lambda text: None)
- if start.text is not None:
- fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
- elif end is not None and end.tag == 'motyw':
- handle_text.pop()
-
- elif start is not None and start.tag == 'end':
- fid = start.attrib['id'][1:]
- if fid not in fragments:
- continue # a broken <end> node, skip it
- frag = fragments[fid]
- if not frag['themes']:
- continue # empty themes list.
- del fragments[fid]
-
- doc = add_part(snippets,
- header_type=frag['start_header'],
- header_index=frag['start_section'],
- header_span=position - frag['start_section'] + 1,
- fragment_anchor=fid,
- text=fix_format(frag['text']),
- themes=frag['themes'])
- self.index.add(doc)
-
- # Collect content.
-
- if text is not None and handle_text is not []:
- hdl = handle_text[-1]
- hdl(text)
-
- # in the end, add a section text.
- doc = add_part(snippets, header_index=position,
- header_type=header.tag, text=fix_format(content))
-
- self.index.add(doc)
-
- finally:
- snippets.close()
-
def remove_picture(self, picture_or_id):
    """Remove a picture from the search index.

    Accepts either a ``picture.models.Picture`` instance or a bare
    picture id, and deletes every document matching that ``picture_id``.
    """
    picture_id = (
        picture_or_id.id
        if isinstance(picture_or_id, picture.models.Picture)
        else picture_or_id
    )
    query = self.index.Q(picture_id=picture_id)
    self.delete_query(query)
-
def index_picture(self, picture, picture_info=None, overwrite=True):
    """Index a picture together with all of its areas.

    Builds one document from the picture id and the metadata returned by
    ``self.extract_metadata``, adds it to the index, then calls
    ``self.index_area`` for every area of the picture with that document
    (minus its ``is_book`` key) as the shared base fields.

    When ``overwrite`` is true the picture is first removed from the index.
    """
    if overwrite:
        # Snippets are deliberately left in place: threads that have not
        # reopened the index yet might still need them.
        self.remove_picture(picture)

    base_doc = {'picture_id': int(picture.id)}
    wanted_dc_fields = ['authors', 'title', 'epochs', 'kinds', 'genres']
    metadata = self.extract_metadata(picture, picture_info, dc_only=wanted_dc_fields)
    base_doc.update(metadata)
    base_doc['uid'] = "picture%s" % base_doc['picture_id']

    self.index.add(base_doc)

    # Area documents reuse the picture fields, without the 'is_book' key.
    del base_doc['is_book']
    for area in picture.areas.all():
        self.index_area(area, picture_fields=base_doc)
-
def index_area(self, area, picture_fields):
    """Index the themes and objects tagged on a picture area.

    The area document is a copy of ``picture_fields`` extended with the
    area id, the names of the area's tags in the 'theme' and 'thing'
    categories, and an ``area<id>`` uid. ``picture_fields`` itself is
    not modified.
    """
    area_doc = dict(picture_fields)
    area_doc['area_id'] = area.id

    theme_tags = area.tags.filter(category__in=('theme', 'thing'))
    area_doc['themes'] = list(theme_tags.values_list('name', flat=True))

    area_doc['uid'] = 'area%s' % area.id
    self.index.add(area_doc)
-
-
-@total_ordering
-class SearchResult(object):
def __init__(self, doc, how_found=None, query_terms=None):
    """Create a search result from a single index document.

    ``doc`` is a mapping that must contain ``book_id``. ``score`` and
    ``published_date`` are optional; a missing, ``None`` or non-numeric
    ``published_date`` is stored as 0.

    When ``doc`` has a ``header_type`` it describes a content hit and must
    also provide ``header_index``, ``snippets_position`` and
    ``snippets_length``. One hit tuple is then recorded in ``self._hits``::

        ((header_type, header_index, header_span), fragment_anchor,
         score, details)

    ``how_found`` is stored in the hit details; ``query_terms`` is kept
    on the instance.
    """
    self.boost = 1.0
    self._hits = []
    self._processed_hits = None  # filled lazily once hits are processed
    self.snippets = []
    self.query_terms = query_terms
    self._book = None

    self._score = doc['score'] if 'score' in doc else 0
    self.book_id = int(doc["book_id"])

    # int(None) raises TypeError rather than ValueError, so both must be
    # caught for documents that carry no published_date at all.
    try:
        self.published_date = int(doc.get("published_date"))
    except (TypeError, ValueError):
        self.published_date = 0

    header_type = doc.get("header_type", None)
    if header_type is None:
        # Not a content hit: nothing more to record.
        return

    # A content hit in some header or fragment.
    section = (header_type, int(doc["header_index"]))

    # A missing, None or zero span means a single header.
    raw_span = doc.get('header_span')
    header_span = (int(raw_span) or 1) if raw_span is not None else 1

    fragment = doc.get("fragment_anchor", None)
    details = {
        'how_found': how_found,
        'snippets_pos': (doc['snippets_position'], doc['snippets_length']),
        'snippets_revision': doc.get('snippets_revision', None),
        'themes': doc.get('themes', []),
        'themes_pl': doc.get('themes_pl', []),
    }

    self._hits.append((section + (header_span,), fragment, self._score, details))
-
@classmethod
def from_book(cls, book, how_found=None, query_terms=None):
    """Alternate constructor: build a result directly from a book object.

    The synthetic document uses ``book.popularity.count`` as the score,
    ``book.id`` as the book id and 0 as the published date. The book is
    stored on the result so that it does not need to be looked up later.
    """
    synthetic_doc = dict(
        score=book.popularity.count,
        book_id=book.id,
        published_date=0,
    )
    instance = cls(synthetic_doc, how_found=how_found, query_terms=query_terms)
    instance._book = book
    return instance
-
def __str__(self):
    """Return a short debugging summary of the result.

    Shows the book id, the number of raw hits, the number of processed
    hits (-1 when there are none or they are not computed yet), the raw
    score and the number of snippets.
    """
    processed_count = len(self._processed_hits) if self._processed_hits else -1
    summary = (
        self.book_id,
        len(self._hits),
        processed_count,
        self._score,
        len(self.snippets),
    )
    return "<SR id=%d %d(%d) hits score=%f %d snippets>" % summary
-
def __bytes__(self):
    """Return the ``str()`` summary of this result encoded as UTF-8."""
    text = str(self)
    return text.encode('utf-8')
-
@property
def score(self):
    """Effective score: the raw score multiplied by ``self.boost``."""
    raw, multiplier = self._score, self.boost
    return raw * multiplier
-
def merge(self, other):
    """Fold another result for the same book into this one.

    The other result's hits are appended to this result's hits and its
    score is added to this result's score (negative scores count as 0).

    Returns ``self``. Raises ``ValueError`` when the two results refer
    to different books.
    """
    mine, theirs = self.book_id, other.book_id
    if mine != theirs:
        raise ValueError(
            "this search result is for book %d; tried to merge with %d" % (mine, theirs))

    self._hits += other._hits
    self._score = self._score + max(other._score, 0)
    return self
-
def get_book(self):
    """Return the book this result refers to, or ``None``.

    A book already stored on the result is returned as is. Otherwise
    the book is looked up by ``self.book_id`` among findable books and
    remembered; when no such book exists ``None`` is returned.
    """
    cached = self._book
    if cached is not None:
        return cached

    book_model = catalogue.models.Book
    try:
        found = book_model.objects.get(id=self.book_id, findable=True)
    except book_model.DoesNotExist:
        found = None
    self._book = found
    return self._book

book = property(get_book)
-
# Indices into a hit tuple:
#   (position, fragment_anchor, score, details)
POSITION, FRAGMENT, SCORE, OTHER = 0, 1, 2, 3

# Indices into the position tuple of a hit:
#   (header_type, header_index, header_span)
POSITION_INDEX, POSITION_SPAN = 1, 2
-
- @property
- def hits(self):
- if self._processed_hits is not None:
- return self._processed_hits
-
- # to sections and fragments
- frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
-
- sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]
-
- # sections not covered by fragments
- sect = filter(lambda s: 0 == len(list(filter(
- lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
- f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)
-
- def remove_duplicates(lst, keyfn, larger):
- els = {}
- for e in lst:
- eif = keyfn(e)
- if eif in els:
- if larger(els[eif], e):
- continue
- els[eif] = e
- return els.values()
-
- # remove fragments with duplicated fid's and duplicated snippets
- frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])
-
- # remove duplicate sections
- sections = {}
-
- for s in sect:
- si = s[self.POSITION][self.POSITION_INDEX]
- # skip existing
- if si in sections:
- if sections[si]['score'] >= s[self.SCORE]:
- continue
-
- m = {'score': s[self.SCORE],
- 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
- }
- m.update(s[self.OTHER])
- sections[si] = m
-
- hits = list(sections.values())
-
- for f in frags:
- try:
- frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
- except catalogue.models.Fragment.DoesNotExist:
- # stale index