- header_docs = []
- for header, position in zip(list(master), range(len(master))):
- if header.tag in self.skip_header_tags:
- continue
- doc = self.create_book_doc(book)
- doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
- doc.add(Field("header_type", header.tag, Field.Store.YES, Field.Index.NOT_ANALYZED))
- content = u' '.join([t for t in header.itertext()])
- doc.add(Field("content", content, Field.Store.YES, Field.Index.ANALYZED))
- header_docs.append(doc)
-
- def walker(node):
- yield node, None
- for child in list(node):
- for b, e in walker(child):
- yield b, e
- yield None, node
- return
-
- # Then we create a document for each fragment
- # fragment_anchor - the anchor
- # themes - list of themes [not indexed]
- fragment_docs = []
- # will contain (fragment id -> { content: [], themes: [] })
- fragments = {}
- for start, end in walker(master):
- if start is not None and start.tag == 'begin':
- fid = start.attrib['id'][1:]
- fragments[fid] = {'content': [], 'themes': []}
- fragments[fid]['content'].append(start.tail)
- elif start is not None and start.tag == 'motyw':
- fid = start.attrib['id'][1:]
- fragments[fid]['themes'].append(start.text)
- fragments[fid]['content'].append(start.tail)
- elif start is not None and start.tag == 'end':
- fid = start.attrib['id'][1:]
- if fid not in fragments:
- continue # a broken <end> node, skip it
- frag = fragments[fid]
- del fragments[fid]
-
- def jstr(l):
- return u' '.join(map(
- lambda x: x == None and u'(none)' or unicode(x),
- l))
-
- doc = self.create_book_doc(book)
- doc.add(Field("fragment_anchor", fid,
- Field.Store.YES, Field.Index.NOT_ANALYZED))
- doc.add(Field("content",
- u' '.join(filter(lambda s: s is not None, frag['content'])),
- Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
- doc.add(Field("themes",
- u' '.join(filter(lambda s: s is not None, frag['themes'])),
- Field.Store.NO, Field.Index.ANALYZED))
-
- fragment_docs.append(doc)
- elif start is not None:
- for frag in fragments.values():
- frag['content'].append(start.text)
- elif end is not None:
- for frag in fragments.values():
- frag['content'].append(end.tail)
-
- return header_docs + fragment_docs
-
def __enter__(self):
    """Enter a ``with`` block: open this object and hand it back.

    Returns:
        This same instance, so ``with ... as name`` binds it.
    """
    self.open()
    return self
-
def __exit__(self, type, value, tb):
    """Leave a ``with`` block: close this object.

    The exception details passed in are not inspected. Nothing is
    returned, so an exception raised inside the block keeps propagating.
    """
    self.close()
-
-
def log_exception_wrapper(f):
    """Wrap *f* so any exception it raises is reported before propagating.

    On failure the wrapper prints a one-line message followed by the full
    traceback, then re-raises the same exception so callers still see it.

    :param f: the callable to wrap.
    :returns: a callable that forwards positional and keyword arguments
        to *f* and returns whatever *f* returns.
    :raises Exception: whatever *f* raised, unchanged.
    """
    def _wrap(*a, **kw):
        try:
            # Propagate the wrapped callable's result instead of dropping it.
            return f(*a, **kw)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            # A bare ``raise`` re-raises the active exception with its
            # original traceback; ``raise e`` would reset it on Python 2.
            raise
    return _wrap