1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
10 from lxml import etree
11 from librarian import XHTMLNS, ParseError, IOFile
12 from librarian import functions
14 from lxml.etree import XMLSyntaxError, XSLTApplyError
# Import-time side effect: run the two registration hooks exposed by
# ``librarian.functions`` once, when this module is loaded.
# NOTE(review): presumably these register the XSLT extension functions the
# stylesheets rely on -- confirm against librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()
# Maps a stylesheet name (as accepted by get_stylesheet()) to the path of the
# XSLT file, relative to the directory containing this module.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt',
}
def get_stylesheet(name):
    """Return the filesystem path of the XSLT stylesheet registered as ``name``.

    The path is built by joining the directory of this module with the
    relative path stored in ``STYLESHEETS``.

    Raises KeyError when ``name`` is not a key of ``STYLESHEETS``.
    """
    return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
def html_has_content(text):
    """Return the ``p`` and ``h1`` elements found anywhere in ``text``.

    ``text`` is an lxml tree (despite the name); the lookup matches both
    un-namespaced elements and elements in the ``XHTMLNS`` namespace.
    The result is the XPath node list, so it is falsy when the document has
    no such element.
    """
    return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
def transform(wldoc, stylesheet='legacy', options=None, flags=None):
    """Transforms the WL document to XHTML.

    The work is done on a deep copy of ``wldoc``, so the caller's document is
    not modified.

    :param wldoc: the WL document to convert.
    :param stylesheet: key of ``STYLESHEETS`` selecting the XSLT file.
    :param options: optional dict passed as keyword arguments to the
        document's ``transform()`` call.
    :param flags: optional iterable of attribute names; each is set to
        ``'yes'`` on the root element of the copied document.
    :returns: an ``IOFile`` built from the UTF-8 HTML serialisation of the
        result, or ``None`` when the result has no content
        (see ``html_has_content``).
    :raises ValueError: if ``stylesheet`` is not a known stylesheet name.
    :raises ParseError: if lxml reports an XML syntax error or an XSLT
        application error.
    """
    # Only the lookup can signal an unknown stylesheet; keeping it in its own
    # ``try`` stops unrelated KeyErrors from being reported as a bad name.
    try:
        style_filename = get_stylesheet(stylesheet)
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)

    try:
        # Parse XSLT
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        document.swap_endlines()

        for flag in flags or ():
            document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()

        result = document.transform(style, **(options or {}))
        del document  # no longer needed large object :)

        if not html_has_content(result):
            return None

        add_anchors(result.getroot())
        add_table_of_contents(result.getroot())

        return IOFile.from_string(
            etree.tostring(result, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8'))
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
class Fragment(object):
    """A recorded run of parse events that can be rendered back to markup.

    Events are ``(event, element)`` pairs. For ``'start'`` and ``'end'``
    events ``element`` is an element-like object (``tag``, ``attrib``,
    ``text``, ``tail``); for any other event name it is a plain string that
    is emitted verbatim.
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        # Recorded (event, element) pairs, in arrival order.
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the recorded events plus the 'end' events still missing.

        Every 'start' that has no matching 'end' gets a synthetic
        ``('end', element)`` appended, innermost element first, so the
        resulting sequence is balanced.
        """
        stack = []
        for event, element in self.events:
            if event == 'start':
                stack.append(('end', element))
            elif event == 'end':
                try:
                    stack.pop()
                except IndexError:
                    # An 'end' without a recorded 'start': report and go on.
                    print('CLOSED NON-OPEN TAG: %s' % (element,))

        # The stack holds pending closers outermost-first; emit innermost first.
        stack.reverse()
        return self.events + stack

    def to_string(self):
        """Render the balanced event sequence as a markup string.

        Attribute values are written as-is (no escaping is applied).
        """
        result = []
        for event, element in self.closed_events():
            if event == 'start':
                result.append(u'<%s %s>' % (
                    element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
                if element.text:
                    result.append(element.text)
            elif event == 'end':
                result.append(u'</%s>' % element.tag)
                if element.tail:
                    result.append(element.tail)
            else:
                # Any other event carries raw text.
                result.append(element)

        return ''.join(result)

    def __unicode__(self):
        return self.to_string()
def extract_fragments(input_filename):
    """Extracts theme fragments from input_filename.

    The HTML file is parsed, the first grandchild of its root element is
    re-serialised, and that markup is scanned for elements whose ``class`` is
    ``theme-begin`` / ``theme-end``; their ``fid`` attribute identifies the
    fragment being opened or closed. Everything seen while a fragment is open
    is recorded on it.

    Returns a ``(closed_fragments, open_fragments)`` pair of dicts mapping
    fragment id to ``Fragment``; ``open_fragments`` holds fragments that were
    opened but never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document
    parser = etree.HTMLParser(encoding='utf-8')
    buf = cStringIO.StringIO()
    buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
    buf.seek(0)  # rewind so iterparse reads what was just written

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        css_class = element.get('class', '')

        # Process begin and end elements
        if css_class in ('theme-begin', 'theme-end'):
            if event != 'end':
                continue  # Process elements only once, on end event

            if css_class == 'theme-begin':
                # Open new fragment
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Append parents: replay the 'start' of every ancestor below
                # the 'book-text' element, outermost first.
                if element.getparent().get('id', None) != 'book-text':
                    parents = [element.getparent()]
                    while parents[-1].getparent().get('id', None) != 'book-text':
                        parents.append(parents[-1].getparent())

                    parents.reverse()
                    for parent in parents:
                        fragment.append('start', parent)

                open_fragments[fragment.id] = fragment
            else:
                # Close existing fragment
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (input_filename, element.get('fid')))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append element tail to lost_text (we don't want to lose any text)
            if element.tail:
                for open_fragment in open_fragments.values():
                    open_fragment.append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags, but keep the text that follows them
            if element.get('name', '') or css_class in ('annotation', 'anchor'):
                if event == 'end' and element.tail:
                    for open_fragment in open_fragments.values():
                        open_fragment.append('text', element.tail)
            else:
                for open_fragment in open_fragments.values():
                    # Shallow copy per fragment, as each keeps its own event.
                    open_fragment.append(event, copy.copy(element))

    return closed_fragments, open_fragments
def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
    """Prepend anchor ``<a>`` elements for ``prefix`` to ``element``.

    :param element: lxml element that receives the anchors as first children.
    :param prefix: anchor identifier; used as ``href="#prefix"`` for the link
        and ``name="prefix"`` for the target.
    :param with_link: when true, add ``<a class="anchor" href="#prefix">``.
    :param with_target: when true, add ``<a class="target" name="prefix">``.
    :param link_text: text of the link; defaults to ``prefix``.

    When both are requested the target ends up before the link, and the
    element's original leading text follows the link.
    """
    def prepend(child):
        # Move the leading text behind the new first child so it keeps its
        # place in the rendered output.
        if element.text:
            child.tail = element.text
            element.text = u''
        element.insert(0, child)

    if with_link:
        if link_text is None:
            link_text = prefix
        anchor = etree.Element('a', href='#%s' % prefix)
        anchor.set('class', 'anchor')
        anchor.text = unicode(link_text)
        prepend(anchor)

    if with_target:
        anchor_target = etree.Element('a', name='%s' % prefix)
        anchor_target.set('class', 'target')
        anchor_target.text = u' '
        prepend(anchor_target)
def any_ancestor(element, test):
    """Return True if ``test(ancestor)`` is truthy for any ancestor of ``element``.

    Ancestors are taken from ``element.iterancestors()``; the element itself
    is not tested. Returns False when there are no ancestors.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
def add_anchors(root):
    """Number verses and paragraphs under ``root`` and add anchors to them.

    Descendants are visited in document order with one shared counter
    starting at 1. Elements that have a side-text ancestor are skipped and do
    not advance the counter. Every ``p`` whose class contains ``verse`` is
    counted, but only the first one and every fifth get an anchor; every
    other element whose class contains ``paragraph`` is counted and anchored.
    Anchors are named ``f<counter>``.
    """
    counter = 1

    def is_side_text(e):
        side_classes = ('note', 'motto', 'motto_podpis', 'dedication')
        return e.get('class') in side_classes or e.get('id') == 'nota_red' or e.tag == 'blockquote'

    for element in root.iterdescendants():
        if any_ancestor(element, is_side_text):
            continue

        if element.tag == 'p' and 'verse' in element.get('class', ''):
            if counter == 1 or counter % 5 == 0:
                add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
        elif 'paragraph' in element.get('class', ''):
            add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
def raw_printable_text(element):
    """Return the stripped text content of ``element`` as a unicode string.

    Operates on a deep copy, so ``element`` itself is left untouched.
    """
    working = copy.deepcopy(element)
    for e in working.findall('a'):
        if e.get('class') == 'annotation':
            # NOTE(review): blanks the annotation link text so it does not
            # leak into the result -- confirm against the original source.
            e.text = ''
    return etree.tostring(working, method='text', encoding=unicode).strip()
def add_table_of_contents(root):
    """Build a table of contents from ``h2``/``h3`` headings under ``root``.

    Each heading that is not inside side text gets a target anchor named
    ``s<counter>``. An ``h3`` that follows an ``h2`` section is listed as
    that section's subsection; any other heading becomes a top-level entry.
    The resulting ``div`` (header plus nested ordered lists of links) is
    inserted as the first child of ``root``.
    """
    # Entries are (counter, tag, text, subsections) tuples.
    sections = []
    counter = 1

    def is_side_text(e):
        return e.get('id') in ('footnotes', 'nota_red') or e.get('class') == 'person-list'

    for element in root.iterdescendants():
        if element.tag in ('h2', 'h3'):
            if any_ancestor(element, is_side_text):
                continue

            element_text = raw_printable_text(element)
            if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
                sections[-1][3].append((counter, element.tag, element_text, []))
            else:
                sections.append((counter, element.tag, element_text, []))
            add_anchor(element, "s%d" % counter, with_link=False)
            counter += 1

    toc = etree.Element('div')
    # NOTE(review): id value reconstructed -- confirm against the stylesheet/CSS.
    toc.set('id', 'toc')
    toc_header = etree.SubElement(toc, 'h2')
    toc_header.text = u'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    for n, section, text, subsections in sections:
        section_element = etree.SubElement(toc_list, 'li')
        add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)

        if len(subsections):
            subsection_list = etree.SubElement(section_element, 'ol')
            for n1, subsection, text1, _ in subsections:
                subsection_element = etree.SubElement(subsection_list, 'li')
                add_anchor(subsection_element, "s%d" % n1, with_target=False, link_text=text1)

    root.insert(0, toc)
def extract_annotations(html_path):
    """For each annotation, yields a tuple: anchor, text, html.

    The HTML file at ``html_path`` is parsed and the element with
    ``id="footnotes"`` is looked up; nothing is yielded when it is absent.
    For every ``div`` child, ``anchor`` is the ``name`` attribute of its
    first ``a[@name]`` child, and ``text`` / ``html`` are UTF-8 encoded
    serialisations of the footnote (``text`` is stripped).
    """
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)
    footnotes = tree.find('//*[@id="footnotes"]')
    if footnotes is not None:
        for footnote in footnotes.findall('div'):
            anchor = footnote.find('a[@name]').get('name')
            # NOTE(review): drops the two leading children (presumably the
            # anchor links) before serialising -- confirm against the
            # original source.
            del footnote[:2]
            text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip()
            html_str = etree.tostring(footnote, method='html', encoding='utf-8')
            yield anchor, text_str, html_str