1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
11 from lxml import etree
12 from librarian.parser import WLDocument
13 from librarian import XHTMLNS, ParseError
15 from lxml.etree import XMLSyntaxError, XSLTApplyError
17 ENTITY_SUBSTITUTIONS = [
26 'legacy': 'xslt/book2html.xslt',
27 'full': 'xslt/wl2html_full.xslt',
28 'partial': 'xslt/wl2html_partial.xslt'
def get_stylesheet(name):
    """Return the path of the XSLT stylesheet registered under `name`.

    The relative path stored in STYLESHEETS is resolved against the directory
    that contains this module. An unknown `name` raises KeyError.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
def substitute_entities(context, text):
    """XPath extension function converting all entities in passed text.

    `context` is accepted for the extension-function calling convention and
    is not used. When `text` arrives as a list its items are concatenated
    first; every (entity, replacement) pair from ENTITY_SUBSTITUTIONS is then
    applied in order.

    NOTE(review): the list-joining statement and the final return fall in
    gaps of the reviewed listing and were inferred from the surrounding code
    -- confirm against the repository copy.
    """
    if isinstance(text, list):
        text = ''.join(text)
    for source, replacement in ENTITY_SUBSTITUTIONS:
        text = text.replace(source, replacement)
    return text


# Register substitute_entities function with lxml
ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
ns['substitute_entities'] = substitute_entities
def transform(input, output_filename=None, is_file=True,
              parse_dublincore=True, stylesheet='legacy', options=None):
    """Transform a WL XML document into XHTML.

    `input` is handed to WLDocument.from_file when `is_file` is true and to
    WLDocument.from_string otherwise. `stylesheet` is a name understood by
    get_stylesheet(); `options` (a dict, optional) is passed as keyword
    arguments to the document's transform() call.

    If output_filename is None, returns an XML; otherwise returns True if the
    file has been written, False if it hasn't. The file won't be written if
    the result has no content, i.e. contains no <p> element (with or without
    the XHTML namespace).

    Raises ValueError when `stylesheet` is not a known stylesheet name and
    ParseError when lxml reports an XML syntax or XSLT application error.

    NOTE(review): the branch structure around the visible statements
    (is_file dispatch, return values, the exception re-raised as ParseError)
    falls in gaps of the reviewed listing and was inferred from the docstring
    and the visible handlers -- confirm against the repository copy.
    """
    if options is None:
        # A fresh dict per call; a shared mutable default could leak state.
        options = {}

    # Only the stylesheet lookup may turn a KeyError into "invalid stylesheet";
    # a KeyError raised deeper in the pipeline must not be mislabelled.
    try:
        style_filename = get_stylesheet(stylesheet)
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)

    try:
        style = etree.parse(style_filename)

        if is_file:
            document = WLDocument.from_file(
                input, True, parse_dublincore=parse_dublincore)
        else:
            document = WLDocument.from_string(
                input, True, parse_dublincore=parse_dublincore)

        result = document.transform(style, **options)
        del document  # no longer needed large object :)

        has_content = etree.ETXPath('//p|//{%s}p' % str(XHTMLNS))(result)
        if not has_content:
            if output_filename is not None:
                return False
            # NOTE(review): the value returned for "no content, no output
            # file" is not visible in the reviewed listing; "<empty />" is
            # assumed -- confirm.
            return "<empty />"

        add_anchors(result.getroot())
        add_table_of_contents(result.getroot())

        if output_filename is None:
            return result
        result.write(output_filename, xml_declaration=False,
                     pretty_print=True, encoding='utf-8')
        return True
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
class Fragment(object):
    """A theme fragment recorded as a flat list of (event, element) pairs.

    Events are 'start' and 'end' (paired with an element object exposing
    `tag`, `attrib`, `text` and `tail`) or any other label, in which case the
    paired value is treated as a plain string and emitted verbatim.

    NOTE(review): attribute initialisation, the event-type tests and the
    reversal of the closing stack fall in gaps of the reviewed listing and
    were inferred from how the visible statements use them -- confirm against
    the repository copy.
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one more (event, element) pair at the end of the fragment."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the recorded events followed by the missing 'end' events.

        Every 'start' without a matching 'end' gets a synthetic 'end' event,
        innermost element first, so the resulting stream is balanced. An
        'end' with nothing open is reported on stdout and otherwise ignored.
        """
        pending = []
        for event, element in self.events:
            if event == 'start':
                pending.append(('end', element))
            elif event == 'end':
                try:
                    pending.pop()
                except IndexError:
                    # Single-argument call: same output on Python 2 and 3.
                    print('CLOSED NON-OPEN TAG: %s' % (element,))

        # Elements opened last have to be closed first.
        pending.reverse()
        return self.events + pending

    def to_string(self):
        """Serialize the balanced event stream to markup text.

        Attribute values and text are emitted verbatim (no escaping is done).
        """
        result = []
        for event, element in self.closed_events():
            if event == 'start':
                result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
                if element.text:
                    result.append(element.text)
            elif event == 'end':
                result.append(u'</%s>' % element.tag)
                if element.tail:
                    result.append(element.tail)
            else:
                # Any other event carries a plain string.
                result.append(element)

        return ''.join(result)

    def __unicode__(self):
        return self.to_string()
def extract_fragments(input_filename):
    """Extracts theme fragments from input_filename.

    The file is streamed with etree.iterparse. Elements whose class is
    'theme-begin' open a Fragment (id taken from the 'fid' attribute, themes
    from the element text); elements whose class is 'theme-end' close the
    fragment with the same 'fid'. Everything parsed while a fragment is open
    is recorded in it.

    Returns a (closed_fragments, open_fragments) pair of dicts keyed by
    fragment id; open_fragments holds the fragments that were never closed.

    NOTE(review): several structural lines (dict initialisation, else/try
    branches, the reversal of the parent list) fall in gaps of the reviewed
    listing and were inferred from the visible statements -- confirm against
    the repository copy.
    """
    open_fragments = {}
    closed_fragments = {}

    for event, element in etree.iterparse(input_filename, events=('start', 'end')):
        css_class = element.get('class', '')

        # Process begin and end elements
        if css_class in ('theme-begin', 'theme-end'):
            # Process elements only once, on end event
            if event != 'end':
                continue

            if css_class == 'theme-begin':
                # Open new fragment
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Collect the ancestors below the 'book-text' element. Walking
                # iterancestors() stops cleanly at the document root, so a
                # marker outside 'book-text' no longer fails on a None parent.
                parents = []
                for ancestor in element.iterancestors():
                    if ancestor.get('id', None) == 'book-text':
                        break
                    parents.append(ancestor)

                # Ancestors were gathered innermost first; open them
                # outermost first.
                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment
            else:
                # Close existing fragment
                fid = element.get('fid')
                try:
                    fragment = open_fragments[fid]
                except KeyError:
                    # Single-argument call: same output on Python 2 and 3.
                    print('%s:closed not open fragment #%s' % (input_filename, fid))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Keep the text following the marker in every open fragment
            # (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        elif len(element.get('name', '')) or css_class == 'annotation':
            # Omit annotation tags, but keep the text that follows them
            if event == 'end' and element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)
        else:
            for fragment_id in open_fragments:
                open_fragments[fragment_id].append(event, copy.copy(element))

    return closed_fragments, open_fragments
def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
    """Prepend anchor <a> elements to `element`.

    With `with_link`, an <a class="anchor" href="#prefix"> is inserted whose
    text is `link_text`. With `with_target`, an <a class="target"
    name="prefix"> containing a single space is inserted. Each new anchor
    becomes the first child of `element`; when both are requested the target
    therefore ends up before the link.

    NOTE(review): the `with_link` / `with_target` guards, the fallback of
    `link_text` to `prefix` and the clearing of `element.text` fall in gaps
    of the reviewed listing and were inferred from the signature and the
    visible statements -- confirm against the repository copy.
    """
    text_type = type(u'')  # unicode on Python 2, str on Python 3

    if with_link:
        if link_text is None:
            link_text = prefix
        anchor = etree.Element('a', href='#%s' % prefix)
        anchor.set('class', 'anchor')
        anchor.text = text_type(link_text)
        _prepend_anchor(element, anchor)

    if with_target:
        anchor_target = etree.Element('a', name='%s' % prefix)
        anchor_target.set('class', 'target')
        anchor_target.text = u' '
        _prepend_anchor(element, anchor_target)


def _prepend_anchor(element, anchor):
    """Insert `anchor` as first child of `element`, keeping the text order."""
    if element.text:
        # Leading text must follow the new first child, so it moves from the
        # parent's text to the anchor's tail.
        anchor.tail = element.text
        element.text = u''
    element.insert(0, anchor)
def any_ancestor(element, test):
    """Return True if `test(ancestor)` is truthy for any ancestor of `element`.

    Ancestors are taken from `element.iterancestors()`; the element itself is
    not tested. Returns False when no ancestor matches or there are none.

    NOTE(review): the loop body and the fall-through result fall in a gap of
    the reviewed listing; "True on first match, otherwise False" is inferred
    from the function name and its use as a condition -- confirm.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
def add_anchors(root):
    """Number verses and paragraphs below `root` and attach anchors to them.

    Elements having an ancestor with class 'note', 'motto', 'motto_podpis' or
    'dedication', or a <blockquote> ancestor, are left alone. A <p> whose
    class contains 'verse' gets an anchor "f<N>" only when N is 1 or a
    multiple of 5; any other element whose class contains 'paragraph' always
    gets one.

    NOTE(review): the counter initialisation, its increments and the skip
    statement fall in gaps of the reviewed listing and were inferred from the
    visible conditions -- confirm against the repository copy.
    """
    excluded_classes = ('note', 'motto', 'motto_podpis', 'dedication')

    def in_excluded_context(node):
        return node.get('class') in excluded_classes or node.tag == 'blockquote'

    number = 1
    for node in root.iterdescendants():
        if any_ancestor(node, in_excluded_context):
            continue

        css_class = node.get('class', '')
        if node.tag == 'p' and 'verse' in css_class:
            # Verses are all counted, but only the first and every fifth
            # one is anchored.
            if number == 1 or number % 5 == 0:
                add_anchor(node, "f%d" % number, link_text=number)
            number += 1
        elif 'paragraph' in css_class:
            add_anchor(node, "f%d" % number, link_text=number)
            number += 1
233 def add_table_of_contents(root):
236 for element in root.iterdescendants():
237 if element.tag in ('h2', 'h3'):
238 if any_ancestor(element, lambda e: e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)):
241 if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
242 sections[-1][3].append((counter, element.tag, ''.join(element.xpath('text()')), []))
244 sections.append((counter, element.tag, ''.join(element.xpath('text()')), []))
245 add_anchor(element, "s%d" % counter, with_link=False)
248 toc = etree.Element('div')
250 toc_header = etree.SubElement(toc, 'h2')
251 toc_header.text = u'Spis treści'
252 toc_list = etree.SubElement(toc, 'ol')
254 for n, section, text, subsections in sections:
255 section_element = etree.SubElement(toc_list, 'li')
256 add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
259 subsection_list = etree.SubElement(section_element, 'ol')
260 for n, subsection, text, _ in subsections:
261 subsection_element = etree.SubElement(subsection_list, 'li')
262 add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)