def add_part(snippets, **fields):
    """Build the index document for one part of the book.

    The document starts as ``self.create_book_doc(book)`` extended with
    every entry of ``book_fields`` (``self``, ``book`` and ``book_fields``
    come from the enclosing scope).  The part's text is also stored in
    ``snippets`` and its position and length there are recorded.

    Required keyword fields: ``header_index``, ``header_type``, ``text``.
    Optional keyword fields: ``header_span`` (missing or falsy means 1),
    ``fragment_anchor`` and ``themes``.  Any other keyword is ignored.

    Returns the populated document, including its ``uid``.
    """
    part = self.create_book_doc(book)
    for name, value in book_fields.items():
        part[name] = value

    text = fields['text']
    part['header_index'] = fields["header_index"]
    part['header_span'] = fields.get('header_span') or 1
    part['header_type'] = fields['header_type']
    part['text'] = text

    # Store the text in the snippets file and remember where it landed.
    stored_at = snippets.add(text)
    part['snippets_position'] = stored_at[0]
    part['snippets_length'] = stored_at[1]
    revision = snippets.revision
    if revision:
        part["snippets_revision"] = revision

    for optional in ('fragment_anchor', 'themes'):
        if optional in fields:
            part[optional] = fields[optional]

    # NOTE(review): two parts with the same header index and span and no
    # fragment anchor get the same uid -- confirm this is intended.
    uid_parts = (book.id, part['header_index'], part['header_span'],
                 part.get('fragment_anchor', ''))
    part['uid'] = "part%s-%s-%s-%s" % uid_parts
    return part
-
def give_me_utf8(s):
    """Return ``s`` encoded as UTF-8 if it is ``unicode``, else ``s`` itself."""
    return s.encode('utf-8') if isinstance(s, unicode) else s
-
# Index every child of ``master`` as a section document and, while walking
# its content, emit extra documents for footnotes and themed fragments.
#
# ``fragments`` maps the id of each fragment that has been opened (and not
# yet emitted) to its collected state.  It is created outside the header
# loop, so a fragment may span several headers.
fragments = {}
snippets = Snippets(book.id).open('w')
try:
    # enumerate() yields the same (index, header) pairing as zipping the
    # children with range(len(master)).
    for position, header in enumerate(list(master)):

        if header.tag in self.skip_header_tags:
            continue
        if header.tag is etree.Comment:
            continue

        # section content
        content = []
        footnote = []

        def all_content(text):
            # Regular text goes to the section and to every open fragment.
            for frag in fragments.values():
                frag['text'].append(text)
            content.append(text)

        # Stack of text handlers; only the topmost one receives text.
        handle_text = [all_content]

        for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
            # handle footnotes
            if start is not None and start.tag in self.footnote_tags:
                footnote = []

                def collect_footnote(t):
                    footnote.append(t)

                handle_text.append(collect_footnote)
            elif end is not None and end.tag in self.footnote_tags:
                # The original guard also tested ``footnote is not []``.
                # That compares identity with a brand-new list and is always
                # true, so it is dropped.  The handler pushed at the start
                # of the footnote must be popped even when the footnote
                # turned out to be empty.
                handle_text.pop()
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               text=u''.join(footnote),
                               is_footnote=True)
                self.index.add(doc)
                footnote = []

            # handle fragments and themes.
            if start is not None and start.tag == 'begin':
                # The first character of the id attribute is dropped.
                fid = start.attrib['id'][1:]
                fragments[fid] = {
                    'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

            # themes for this fragment
            elif start is not None and start.tag == 'motyw':
                fid = start.attrib['id'][1:]
                # Text inside the theme tag is swallowed, not indexed as content.
                handle_text.append(lambda text: None)
                if start.text is not None:
                    fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
            elif end is not None and end.tag == 'motyw':
                handle_text.pop()

            elif start is not None and start.tag == 'end':
                fid = start.attrib['id'][1:]
                if fid not in fragments:
                    continue  # a broken <end> node, skip it
                frag = fragments[fid]
                if not frag['themes']:
                    # NOTE(review): the fragment stays in ``fragments`` and
                    # keeps collecting text -- confirm this is intended.
                    continue  # empty themes list.
                del fragments[fid]

                doc = add_part(snippets,
                               header_type=frag['start_header'],
                               header_index=frag['start_section'],
                               header_span=position - frag['start_section'] + 1,
                               fragment_anchor=fid,
                               text=fix_format(frag['text']),
                               themes=frag['themes'])
                self.index.add(doc)

            # Collect content.
            # ``handle_text is not []`` was always true; test emptiness so an
            # unbalanced pop cannot turn into an IndexError here.
            if text is not None and handle_text:
                handle_text[-1](text)

        # in the end, add a section text.
        doc = add_part(snippets, header_index=position,
                       header_type=header.tag, text=fix_format(content))

        self.index.add(doc)

finally:
    # Always release the snippets file, even when indexing fails.
    snippets.close()
-
-
-class SearchResult(object):
def __init__(self, doc, how_found=None, query_terms=None):
    """Wrap one index document as a search result for its book.

    ``doc`` is a mapping-like index document supporting ``in``, ``[]`` and
    ``.get``.  It must contain ``book_id``.  When it has a ``header_type``
    it must also contain ``header_index``, ``header_span``,
    ``snippets_position`` and ``snippets_length``, and one hit is recorded
    for it.

    ``how_found`` is stored with the recorded hit; ``query_terms`` is kept
    on the result unchanged.

    A missing ``score`` counts as 0.  A missing or non-numeric
    ``published_date`` counts as 0.
    """
    self.boost = 1.0
    self._hits = []
    self._processed_hits = None  # processed hits
    self.snippets = []
    self.query_terms = query_terms
    self._book = None

    self._score = doc['score'] if 'score' in doc else 0

    self.book_id = int(doc["book_id"])

    try:
        self.published_date = int(doc.get("published_date"))
    except (TypeError, ValueError):
        # int(None) raises TypeError when the field is missing;
        # a non-numeric string raises ValueError.
        self.published_date = 0

    # content hits
    header_type = doc.get("header_type", None)
    if header_type is None:
        return

    # we have a content hit in some header or fragment
    raw_span = doc['header_span']
    # A missing (None) or zero span is treated as a span of 1.
    header_span = (int(raw_span) if raw_span is not None else 0) or 1
    position = (header_type, int(doc["header_index"]), header_span)
    fragment = doc.get("fragment_anchor", None)
    snippets_pos = (doc['snippets_position'], doc['snippets_length'])
    snippets_rev = doc.get('snippets_revision', None)

    # Hit layout: (position, fragment anchor, score, other details).
    hit = (position, fragment, self._score, {
        'how_found': how_found,
        'snippets_pos': snippets_pos,
        'snippets_revision': snippets_rev,
        'themes': doc.get('themes', []),
        'themes_pl': doc.get('themes_pl', [])
    })

    self._hits.append(hit)
-
def __unicode__(self):
    """Return a short debugging summary of this result.

    The number in parentheses is the count of processed hits, or -1 when
    they are unset or empty.
    """
    if self._processed_hits:
        processed_count = len(self._processed_hits)
    else:
        processed_count = -1
    summary = (self.book_id, len(self._hits), processed_count,
               self._score, len(self.snippets))
    return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % summary
-
def __str__(self):
    """Return the unicode summary of this result encoded as UTF-8."""
    description = unicode(self)
    return description.encode('utf-8')
-
@property
def score(self):
    """Score of this result: the stored score multiplied by ``boost``."""
    raw, factor = self._score, self.boost
    return raw * factor
-
def merge(self, other):
    """Fold the hits of ``other`` into this result and return ``self``.

    Both results must be for the same book, otherwise ``ValueError`` is
    raised.  The hits of ``other`` are appended to this result's hits, and
    the stored score is replaced by ``other``'s when ``other`` has the
    higher (boosted) ``score``.
    """
    if self.book_id != other.book_id:
        raise ValueError(
            "this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
    self._hits += other._hits
    # The ``hits`` property returns ``_processed_hits`` whenever it is not
    # None, so drop it now that the underlying hit list has changed.
    self._processed_hits = None
    if other.score > self.score:
        self._score = other._score
    return self
-
def get_book(self):
    """Return the ``Book`` for ``book_id``, fetching it on first use.

    The fetched object is remembered in ``_book`` and reused by later calls.
    """
    if self._book is None:
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
    return self._book

book = property(get_book)
-
- POSITION = 0  # hit[POSITION]: the (header_type, header_index, header_span) tuple
- FRAGMENT = 1  # hit[FRAGMENT]: fragment anchor, or None when the hit has no fragment
- POSITION_INDEX = 1  # index of header_index inside the position tuple
- POSITION_SPAN = 2  # index of header_span inside the position tuple
- SCORE = 2  # hit[SCORE]: score of the document the hit was built from
- OTHER = 3  # hit[OTHER]: dict with how_found, snippets_pos, snippets_revision, themes, themes_pl
-
- @property
- def hits(self):
- if self._processed_hits is not None:
- return self._processed_hits
-
- # to sections and fragments
- frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
-
- sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
-
- # sections not covered by fragments
- sect = filter(lambda s: 0 == len(filter(
- lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
- f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
-
- def remove_duplicates(lst, keyfn, compare):
- els = {}
- for e in lst:
- eif = keyfn(e)
- if eif in els:
- if compare(els[eif], e) >= 1:
- continue
- els[eif] = e
- return els.values()
-
- # remove fragments with duplicated fid's and duplicated snippets
- frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
- # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
- # lambda a, b: cmp(a[SCORE], b[SCORE]))
-
- # remove duplicate sections
- sections = {}
-
- for s in sect:
- si = s[self.POSITION][self.POSITION_INDEX]
- # skip existing
- if si in sections:
- if sections[si]['score'] >= s[self.SCORE]:
- continue
-
- m = {'score': s[self.SCORE],
- 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
- }
- m.update(s[self.OTHER])
- sections[si] = m
-
- hits = sections.values()
-
- for f in frags:
- try:
- frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
- except catalogue.models.Fragment.DoesNotExist:
- # stale index