1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
11 from lxml import etree
12 from librarian.parser import WLDocument
13 from librarian import XHTMLNS, ParseError
15 from lxml.etree import XMLSyntaxError, XSLTApplyError
# Table of (entity, replacement) pairs; substitute_entities() walks it in
# list order and applies each pair with str.replace.
# NOTE(review): the list items themselves are not visible in this listing.
17 ENTITY_SUBSTITUTIONS = [
# NOTE(review): the three entries below look like the body of the STYLESHEETS
# mapping that get_stylesheet() indexes (short name -> XSLT path joined onto
# this module's directory); its opening line is not visible here -- confirm.
# 'legacy' is the default `stylesheet` argument of transform().
26 'legacy': 'xslt/book2html.xslt',
27 'full': 'xslt/wl2html_full.xslt',
28 'partial': 'xslt/wl2html_partial.xslt'
def get_stylesheet(name):
    """Return the path of the XSLT stylesheet registered under *name*.

    The relative path is looked up in STYLESHEETS and joined onto the
    directory that contains this module.  A failed lookup is not handled
    here and propagates to the caller.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
# XPath extension function: rewrites `text` by applying every
# (entity, replacement) pair from ENTITY_SUBSTITUTIONS.
#   context -- first positional argument; not used in the visible body
#              (presumably the lxml evaluation context -- confirm).
#   text    -- value to convert; a list argument is special-cased first.
34 def substitute_entities(context, text):
35 """XPath extension function converting all entites in passed text."""
36 if isinstance(text, list):
# NOTE(review): the body of the list branch above and the function's return
# statement are not visible in this listing.
# Replacements run sequentially in table order, so each one operates on the
# output of the previous replacement.
38 for entity, substitutution in ENTITY_SUBSTITUTIONS:
39 text = text.replace(entity, substitutution)
42 # Register substitute_entities function with lxml
# The function is published in the 'http://wolnelektury.pl/functions'
# function namespace under the key 'substitute_entities'; this runs as a
# module-level side effect at import time.
43 ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
44 ns['substitute_entities'] = substitute_entities
def html_has_content(text):
    """Evaluate an XPath that looks for paragraph or h1 elements in *text*.

    ``p`` and ``h1`` are matched both without a namespace and inside the
    XHTMLNS namespace.  The value produced by the compiled ETXPath is
    returned unchanged, so callers can use it as a truth value.
    """
    xhtml_namespace = str(XHTMLNS)
    query = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': xhtml_namespace}
    find_content = etree.ETXPath(query)
    return find_content(text)
# Transform a WL document to XHTML using one of the registered XSLT
# stylesheets.
#
# Visible steps:
#   1. get_stylesheet(stylesheet) resolves the short name to a path, which is
#      parsed with etree.parse().
#   2. A WLDocument is built with from_file() or from_string(), both given
#      `input`, True and parse_dublincore.
#   3. document.transform(style, **options) produces the result tree; the
#      document is deleted afterwards to release memory.
#   4. When html_has_content(result) is truthy, add_anchors() and
#      add_table_of_contents() post-process the root; if output_filename is
#      not None the tree is written there (UTF-8, pretty-printed, no XML
#      declaration).
#
# Parameters:
#   input            -- handed to WLDocument.from_file / from_string.
#   output_filename  -- target path, or None for no file output.
#   is_file          -- presumably selects from_file vs from_string; the
#                       branching lines are not visible here -- confirm.
#   parse_dublincore -- forwarded to the WLDocument factory call.
#   stylesheet       -- short stylesheet name, default 'legacy'.
#   options          -- unpacked as keyword arguments of document.transform().
#
# NOTE(review): several control-flow lines and all return statements are
# missing from this listing; see the docstring for the intended return values.
# NOTE(review): the ValueError message contains a literal '%s' and nothing is
# interpolated into it, so the stylesheet name never appears in the message.
# NOTE(review): `options={}` is a mutable default argument; the visible code
# only unpacks it and does not mutate it.
# NOTE(review): `except (...), e:` is Python 2-only syntax; the handler body
# is not visible here.
49 def transform(input, output_filename=None, is_file=True, \
50 parse_dublincore=True, stylesheet='legacy', options={}):
51 """Transforms file input_filename in XML to output_filename in XHTML.
53 If output_filename is None, returns an XML,
54 otherwise returns True if file has been written,False if it hasn't.
55 File won't be written if it has no content.
59 style_filename = get_stylesheet(stylesheet)
60 style = etree.parse(style_filename)
63 document = WLDocument.from_file(input, True, \
64 parse_dublincore=parse_dublincore)
66 document = WLDocument.from_string(input, True, \
67 parse_dublincore=parse_dublincore)
69 result = document.transform(style, **options)
70 del document # no longer needed large object :)
72 if html_has_content(result):
73 add_anchors(result.getroot())
74 add_table_of_contents(result.getroot())
76 if output_filename is not None:
77 result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8')
82 if output_filename is not None:
87 raise ValueError("'%s' is not a valid stylesheet.")
88 except (XMLSyntaxError, XSLTApplyError), e:
# Accumulates a stream of (event, payload) pairs for one theme fragment and
# can serialise that stream back into markup text.
91 class Fragment(object):
# NOTE(review): the attribute assignments are not visible in this listing;
# other code reads `fragment.id` and `self.events`, so they are presumably
# initialised here from the arguments -- confirm.
92 def __init__(self, id, themes):
93 super(Fragment, self).__init__()
# Record one event.  extract_fragments() passes 'start'/'end' together with an
# element, and 'text' together with a plain string (an element's tail).
98 def append(self, event, element):
99 self.events.append((event, element))
# Returns the recorded events followed by the contents of `stack`, which
# receives an ('end', element) entry for elements that were opened.
# NOTE(review): the branch conditions and the handling of `stack` between the
# visible lines are missing from this listing.
101 def closed_events(self):
103 for event, element in self.events:
105 stack.append(('end', element))
# Diagnostic written with the Python 2 print statement.
110 print 'CLOSED NON-OPEN TAG:', element
113 return self.events + stack
# NOTE(review): the lines below build a list of markup pieces and join them.
# They appear to be the body of to_string(), which __unicode__ calls, but its
# `def` line is not visible in this listing.
117 for event, element in self.closed_events():
# Opening tag: attributes are rendered as k="v" without escaping, and a space
# always follows the tag name even when there are no attributes.
119 result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
121 result.append(element.text)
123 result.append(u'</%s>' % element.tag)
125 result.append(element.tail)
# Here the payload itself is appended (extract_fragments stores tail text
# under the 'text' event).
127 result.append(element)
129 return ''.join(result)
# Unicode conversion delegates to to_string().
131 def __unicode__(self):
132 return self.to_string()
# Streams input_filename with etree.iterparse ('start' and 'end' events) and
# collects Fragment objects keyed by fragment id.
# Returns the tuple (closed_fragments, open_fragments); anything left in
# open_fragments was begun but never closed.
# NOTE(review): several lines (initialisers, try/except/else, some branch
# headers) are missing from this listing.
135 def extract_fragments(input_filename):
136 """Extracts theme fragments from input_filename."""
138 closed_fragments = {}
140 for event, element in etree.iterparse(input_filename, events=('start', 'end')):
141 # Process begin and end elements
# Marker elements are recognised by their class attribute.
142 if element.get('class', '') in ('theme-begin', 'theme-end'):
143 if not event == 'end': continue # Process elements only once, on end event
# A begin marker opens a new Fragment: id comes from the 'fid' attribute,
# themes from the marker's text.
146 if element.get('class', '') == 'theme-begin':
147 fragment = Fragment(id=element.get('fid'), themes=element.text)
# Ancestors of the marker, up to but excluding the element whose id is
# 'book-text', are replayed into the new fragment as 'start' events.
# The list is built innermost-first.  NOTE(review): whether it is reordered
# before the replay is not visible in this listing.
150 if element.getparent().get('id', None) != 'book-text':
151 parents = [element.getparent()]
152 while parents[-1].getparent().get('id', None) != 'book-text':
153 parents.append(parents[-1].getparent())
156 for parent in parents:
157 fragment.append('start', parent)
159 open_fragments[fragment.id] = fragment
161 # Close existing fragment
# The fragment is looked up by the end marker's 'fid'; a diagnostic is printed
# (Python 2 print statement) when it is not open.
164 fragment = open_fragments[element.get('fid')]
166 print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
# A closed fragment moves from open_fragments to closed_fragments.
168 closed_fragments[fragment.id] = fragment
169 del open_fragments[fragment.id]
171 # Append element tail to lost_text (we don't want to lose any text)
# The marker's tail text is added to every open fragment as a 'text' event.
173 for fragment_id in open_fragments:
174 open_fragments[fragment_id].append('text', element.tail)
177 # Process all elements except begin and end
179 # Omit annotation tags
# Elements with a non-empty 'name' attribute or class 'annotation' are not
# recorded themselves, but their tail text is kept on the 'end' event.
180 if len(element.get('name', '')) or element.get('class', '') == 'annotation':
181 if event == 'end' and element.tail:
182 for fragment_id in open_fragments:
183 open_fragments[fragment_id].append('text', element.tail)
# Otherwise a copy.copy() of the element is recorded in every open fragment
# under the current event.
185 for fragment_id in open_fragments:
186 open_fragments[fragment_id].append(event, copy.copy(element))
188 return closed_fragments, open_fragments
# Inserts helper <a> elements as the first child of `element`:
#   * a link: class 'anchor', href '#<prefix>', text unicode(link_text)
#   * a target: class 'target', name '<prefix>', text u' '
# In both cases the element's own leading text is assigned to the new anchor's
# tail so that it follows the inserted <a>.
# NOTE(review): with_link / with_target presumably gate the two halves, and
# link_text gets a fallback when it is None; the corresponding lines are not
# visible in this listing -- confirm.
191 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
193 if link_text is None:
195 anchor = etree.Element('a', href='#%s' % prefix)
196 anchor.set('class', 'anchor')
197 anchor.text = unicode(link_text)
199 anchor.tail = element.text
201 element.insert(0, anchor)
204 anchor_target = etree.Element('a', name='%s' % prefix)
205 anchor_target.set('class', 'target')
206 anchor_target.text = u' '
208 anchor_target.tail = element.text
210 element.insert(0, anchor_target)
# Walks element.iterancestors().  `test` is a one-argument callable (callers
# pass lambdas taking an element), and callers use the result as a truth value.
# NOTE(review): the loop body and the return statements are not visible in
# this listing.
213 def any_ancestor(element, test):
214 for ancestor in element.iterancestors():
# Adds numbered "f<counter>" anchors to verses and paragraphs below `root`.
#   * Elements with an ancestor of class note / motto / motto_podpis /
#     dedication, or inside a <blockquote>, are detected via any_ancestor()
#     (presumably skipped -- the branch body is not visible here).
#   * <p> elements whose class contains 'verse' get an anchor only when the
#     counter is 1 or a multiple of 5.
#   * Other elements whose class contains 'paragraph' always get an anchor.
# The anchor's link text is the counter value.
# NOTE(review): initialisation and incrementing of `counter` are not visible
# in this listing.
220 def add_anchors(root):
222 for element in root.iterdescendants():
223 if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
224 or e.tag == 'blockquote'):
227 if element.tag == 'p' and 'verse' in element.get('class', ''):
228 if counter == 1 or counter % 5 == 0:
229 add_anchor(element, "f%d" % counter, link_text=counter)
231 elif 'paragraph' in element.get('class', ''):
232 add_anchor(element, "f%d" % counter, link_text=counter)
# Builds a table-of-contents <div> from the h2/h3 headings below `root`.
# The header literal u'Spis treści' is Polish for "Table of contents".
#
# Each section is a tuple (counter, tag, text, subsections), where text is the
# heading's direct text nodes joined together.  An h3 that follows an h2
# section is stored in that section's subsections list; other headings become
# top-level sections.
# Every collected heading gets a target-only anchor "s<counter>"; every TOC
# entry gets a link-only anchor pointing at it.
# Headings inside an element with id 'footnotes' or class 'person-list' are
# detected via any_ancestor() (presumably skipped -- the branch body is not
# visible here).
# NOTE(review): initialisation of `sections` / `counter`, some branch headers,
# and the code that attaches `toc` to the tree are not visible in this listing.
236 def add_table_of_contents(root):
239 for element in root.iterdescendants():
240 if element.tag in ('h2', 'h3'):
241 if any_ancestor(element, lambda e: e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)):
244 if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
245 sections[-1][3].append((counter, element.tag, ''.join(element.xpath('text()')), []))
247 sections.append((counter, element.tag, ''.join(element.xpath('text()')), []))
248 add_anchor(element, "s%d" % counter, with_link=False)
251 toc = etree.Element('div')
253 toc_header = etree.SubElement(toc, 'h2')
254 toc_header.text = u'Spis treści'
255 toc_list = etree.SubElement(toc, 'ol')
257 for n, section, text, subsections in sections:
258 section_element = etree.SubElement(toc_list, 'li')
259 add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
262 subsection_list = etree.SubElement(section_element, 'ol')
# The inner loop rebinds `n` and `text` from the outer loop; the outer values
# are not read again in the visible code after this point.
263 for n, subsection, text, _ in subsections:
264 subsection_element = etree.SubElement(subsection_list, 'li')
265 add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)