1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
11 from lxml import etree
12 from librarian.parser import WLDocument
13 from librarian import XHTMLNS, ParseError
15 from lxml.etree import XMLSyntaxError, XSLTApplyError
17 ENTITY_SUBSTITUTIONS = [
26 'legacy': 'xslt/book2html.xslt',
27 'full': 'xslt/wl2html_full.xslt',
28 'partial': 'xslt/wl2html_partial.xslt'
def get_stylesheet(name):
    """Return the path of the stylesheet registered under *name*.

    The relative path is looked up in ``STYLESHEETS`` and joined onto the
    directory that contains this module.  An unknown *name* is not handled
    here: whatever the ``STYLESHEETS`` lookup raises propagates to the caller.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
34 def substitute_entities(context, text):
35 """XPath extension function converting all entites in passed text."""
36 if isinstance(text, list):
38 for entity, substitutution in ENTITY_SUBSTITUTIONS:
39 text = text.replace(entity, substitutution)
# Register substitute_entities as an lxml extension function.  The function
# namespace is keyed by URI; the item key is the name the function is
# registered under inside that namespace.
ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
ns['substitute_entities'] = substitute_entities
46 def transform(input, output_filename=None, is_file=True, \
47 parse_dublincore=True, stylesheet='legacy', options={}):
48 """Transforms file input_filename in XML to output_filename in XHTML."""
51 style_filename = get_stylesheet(stylesheet)
52 style = etree.parse(style_filename)
55 document = WLDocument.from_file(input, True, \
56 parse_dublincore=parse_dublincore)
58 document = WLDocument.from_string(input, True, \
59 parse_dublincore=parse_dublincore)
61 result = document.transform(style, **options)
62 del document # no longer needed large object :)
64 if etree.ETXPath('//p|//{%s}p' % str(XHTMLNS))(result) is not None:
65 add_anchors(result.getroot())
66 add_table_of_contents(result.getroot())
68 if output_filename is not None:
69 result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8')
76 raise ValueError("'%s' is not a valid stylesheet.")
77 except (XMLSyntaxError, XSLTApplyError), e:
80 class Fragment(object):
81 def __init__(self, id, themes):
82 super(Fragment, self).__init__()
def append(self, event, element):
    """Record one ``(event, element)`` pair at the end of ``self.events``."""
    entry = (event, element)
    self.events.append(entry)
90 def closed_events(self):
92 for event, element in self.events:
94 stack.append(('end', element))
99 print 'CLOSED NON-OPEN TAG:', element
102 return self.events + stack
106 for event, element in self.closed_events():
108 result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
110 result.append(element.text)
112 result.append(u'</%s>' % element.tag)
114 result.append(element.tail)
116 result.append(element)
118 return ''.join(result)
def __unicode__(self):
    """Return the text form of this object, as produced by ``to_string()``."""
    rendered = self.to_string()
    return rendered
124 def extract_fragments(input_filename):
125 """Extracts theme fragments from input_filename."""
127 closed_fragments = {}
129 for event, element in etree.iterparse(input_filename, events=('start', 'end')):
130 # Process begin and end elements
131 if element.get('class', '') in ('theme-begin', 'theme-end'):
132 if not event == 'end': continue # Process elements only once, on end event
135 if element.get('class', '') == 'theme-begin':
136 fragment = Fragment(id=element.get('fid'), themes=element.text)
139 if element.getparent().get('id', None) != 'book-text':
140 parents = [element.getparent()]
141 while parents[-1].getparent().get('id', None) != 'book-text':
142 parents.append(parents[-1].getparent())
145 for parent in parents:
146 fragment.append('start', parent)
148 open_fragments[fragment.id] = fragment
150 # Close existing fragment
153 fragment = open_fragments[element.get('fid')]
155 print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
157 closed_fragments[fragment.id] = fragment
158 del open_fragments[fragment.id]
160 # Append element tail to lost_text (we don't want to lose any text)
162 for fragment_id in open_fragments:
163 open_fragments[fragment_id].append('text', element.tail)
166 # Process all elements except begin and end
168 # Omit annotation tags
169 if len(element.get('name', '')) or element.get('class', '') == 'annotation':
170 if event == 'end' and element.tail:
171 for fragment_id in open_fragments:
172 open_fragments[fragment_id].append('text', element.tail)
174 for fragment_id in open_fragments:
175 open_fragments[fragment_id].append(event, copy.copy(element))
177 return closed_fragments, open_fragments
180 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
182 if link_text is None:
184 anchor = etree.Element('a', href='#%s' % prefix)
185 anchor.set('class', 'anchor')
186 anchor.text = unicode(link_text)
188 anchor.tail = element.text
190 element.insert(0, anchor)
193 anchor_target = etree.Element('a', name='%s' % prefix)
194 anchor_target.set('class', 'target')
195 anchor_target.text = u' '
197 anchor_target.tail = element.text
199 element.insert(0, anchor_target)
202 def any_ancestor(element, test):
203 for ancestor in element.iterancestors():
209 def add_anchors(root):
211 for element in root.iterdescendants():
212 if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
213 or e.tag == 'blockquote'):
216 if element.tag == 'p' and 'verse' in element.get('class', ''):
217 if counter == 1 or counter % 5 == 0:
218 add_anchor(element, "f%d" % counter, link_text=counter)
220 elif 'paragraph' in element.get('class', ''):
221 add_anchor(element, "f%d" % counter, link_text=counter)
225 def add_table_of_contents(root):
228 for element in root.iterdescendants():
229 if element.tag in ('h2', 'h3'):
230 if any_ancestor(element, lambda e: e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)):
233 if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
234 sections[-1][3].append((counter, element.tag, ''.join(element.xpath('text()')), []))
236 sections.append((counter, element.tag, ''.join(element.xpath('text()')), []))
237 add_anchor(element, "s%d" % counter, with_link=False)
240 toc = etree.Element('div')
242 toc_header = etree.SubElement(toc, 'h2')
243 toc_header.text = u'Spis treści'
244 toc_list = etree.SubElement(toc, 'ol')
246 for n, section, text, subsections in sections:
247 section_element = etree.SubElement(toc_list, 'li')
248 add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
251 subsection_list = etree.SubElement(section_element, 'ol')
252 for n, subsection, text, _ in subsections:
253 subsection_element = etree.SubElement(subsection_list, 'li')
254 add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)