+ fields[field.name] = dt
+
+ # get published date
+ pd = None
+ if hasattr(book_info, 'source_name') and book_info.source_name:
+ match = self.published_date_re.search(book_info.source_name)
+ if match is not None:
+ pd = str(match.groups()[0])
+ if not pd: pd = ""
+ fields["published_date"] = pd
+
+ return fields
+
+ # def add_gaps(self, fields, fieldname):
+ # """
+ # Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
+ # This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
+ # """
+ # def gap():
+ # while True:
+ # yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
+ # return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
+
def get_master(self, root):
    """
    Returns the first master element found in an etree.

    Walks ``root.iter()`` (the root itself, then its descendants in
    document order) and returns the first element whose ``tag`` is listed
    in ``self.master_tags``.

    Returns ``None`` when no element matches; callers are expected to
    check for that.
    """
    for element in root.iter():
        if element.tag in self.master_tags:
            return element
    # Explicit "not found" result instead of silently falling off the end.
    return None
+
+ def index_content(self, book, book_fields={}):
+ """
+ Walks the book XML and extract content from it.
+ Adds parts for each header tag and for each fragment.
+ """
+ wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
+ root = wld.edoc.getroot()
+
+ master = self.get_master(root)
+ if master is None:
+ return []
+
def walker(node, ignore_tags=()):
    """
    Walks the subtree of ``node`` depth-first, yielding
    ``(start, text, end)`` triples in which exactly one slot is set:

    * ``(element, None, None)`` when an element is entered,
    * ``(None, string, None)`` for an element's text or tail,
    * ``(None, None, element)`` when an element is left.

    Elements whose tag is in ``ignore_tags`` are skipped together with
    their text and all of their descendants. Their tail is still yielded,
    because the tail is emitted outside the ignore check.
    """
    if node.tag not in ignore_tags:
        yield node, None, None
        if node.text is not None:
            yield None, node.text, None
        # Iterate over a snapshot of the children.
        for child in list(node):
            # Hand ignore_tags down; without it only the top-level node
            # was ever checked and nested ignored tags leaked through.
            for event in walker(child, ignore_tags):
                yield event
        yield None, None, node

    # Emitted regardless of whether the element itself was ignored.
    if node.tail is not None:
        yield None, node.tail, None
+
def fix_format(text):
    """
    Normalizes collected text before it is stored.

    ``text`` may be a single string or a list of strings. A list is
    joined with single spaces, dropping ``None`` entries first. After
    that, a ``/`` standing at the end of any line is removed.
    """
    if isinstance(text, list):
        # Join the pieces that were passed in. The previous code filtered
        # the enclosing scope's ``content`` list here, which discarded
        # the argument entirely.
        text = u' '.join(piece for piece in text if piece is not None)

    # (?m) makes ``$`` match at every line end, not only at the very end.
    return re.sub("(?m)/$", "", text)
+
def add_part(snippets, **fields):
    """
    Builds the index document for one part of the book.

    Starts from ``self.create_book_doc(book)``, copies every entry of
    ``book_fields`` into it, stores the part's text through
    ``snippets.add`` and records the two values it returns as
    ``snippets_position`` and ``snippets_length``.

    Required keywords: ``header_index``, ``header_type``, ``text``.
    Optional keywords: ``header_span`` (a missing or falsy value becomes
    1), ``fragment_anchor``, ``themes``. Any other keyword is accepted
    but not used.

    Returns the document; adding it to the index is left to the caller.
    """
    part = self.create_book_doc(book)
    for key, value in book_fields.items():
        part[key] = value

    part['header_index'] = fields["header_index"]
    part['header_span'] = fields.get('header_span') or 1
    part['header_type'] = fields['header_type']
    part['text'] = fields['text']

    # Store the text as a snippet and remember where it was put.
    location = snippets.add(fields["text"])
    part['snippets_position'] = location[0]
    part['snippets_length'] = location[1]
    if snippets.revision:
        part["snippets_revision"] = snippets.revision

    # Copied only when the caller supplied them.
    for optional in ('fragment_anchor', 'themes'):
        if optional in fields:
            part[optional] = fields[optional]

    uid_parts = (part['header_index'],
                 part['header_span'],
                 part.get('fragment_anchor', ''))
    part['uid'] = "part%s%s%s" % uid_parts
    return part
+
def give_me_utf8(s):
    """
    Returns ``s`` encoded as UTF-8 when it is a ``unicode`` object;
    any other value is handed back untouched.
    """
    return s.encode('utf-8') if isinstance(s, unicode) else s
+
+ fragments = {}
+ snippets = Snippets(book.id).open('w')
+ try:
+ for header, position in zip(list(master), range(len(master))):
+
+ if header.tag in self.skip_header_tags:
+ continue
+ if header.tag is etree.Comment:
+ continue
+
+ # section content
+ content = []
+ footnote = []
+
+ def all_content(text):
+ for frag in fragments.values():
+ frag['text'].append(text)
+ content.append(text)
+ handle_text = [all_content]
+
+ for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
+ # handle footnotes
+ if start is not None and start.tag in self.footnote_tags:
+ footnote = []
+
+ def collect_footnote(t):
+ footnote.append(t)
+
+ handle_text.append(collect_footnote)
+ elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
+ handle_text.pop()
+ doc = add_part(snippets, header_index=position, header_type=header.tag,
+ text=u''.join(footnote),
+ is_footnote=True)
+ self.index.add(doc)
+ #print "@ footnote text: %s" % footnote
+ footnote = []
+
+ # handle fragments and themes.
+ if start is not None and start.tag == 'begin':
+ fid = start.attrib['id'][1:]
+ fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
+
+ # themes for this fragment
+ elif start is not None and start.tag == 'motyw':
+ fid = start.attrib['id'][1:]
+ handle_text.append(None)
+ if start.text is not None:
+ fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
+ elif end is not None and end.tag == 'motyw':
+ handle_text.pop()
+
+ elif start is not None and start.tag == 'end':
+ fid = start.attrib['id'][1:]
+ if fid not in fragments:
+ continue # a broken <end> node, skip it
+ frag = fragments[fid]
+ if frag['themes'] == []:
+ continue # empty themes list.
+ del fragments[fid]
+
+ doc = add_part(snippets,
+ header_type=frag['start_header'],
+ header_index=frag['start_section'],
+ header_span=position - frag['start_section'] + 1,
+ fragment_anchor=fid,
+ text=fix_format(frag['text']),
+ themes=frag['themes'])
+ #print '@ FRAG %s' % frag['content']
+ self.index.add(doc)
+
+ # Collect content.
+
+ if text is not None and handle_text is not []:
+ hdl = handle_text[-1]
+ if hdl is not None:
+ hdl(text)