1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
11 from lxml import etree
12 from librarian.parser import WLDocument
13 from librarian import XHTMLNS, ParseError
14 from librarian import functions
16 from lxml.etree import XMLSyntaxError, XSLTApplyError
# Run librarian's registration hooks once, at import time.
# NOTE(review): presumably these install the XSLT/XPath extension functions
# the stylesheets below rely on -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()

# Stylesheet name -> XSLT file path, relative to this module's directory
# (resolved to an absolute path by get_stylesheet).
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt',
}
def get_stylesheet(name):
    """Return the path of the XSLT file registered under ``name``.

    The path is built from this module's directory and the relative path
    stored in ``STYLESHEETS``.

    Raises:
        KeyError: if ``name`` is not a key of ``STYLESHEETS``.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
def html_has_content(text):
    """Evaluate the "has content" XPath against ``text`` and return the result.

    The expression selects every ``p`` and ``h1`` element, both without a
    namespace and in the ``XHTMLNS`` namespace.  Despite the parameter name,
    ``transform`` passes the XSLT result tree here, and uses the return value
    only for its truthiness.
    """
    expression = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
    find_content = etree.ETXPath(expression)
    return find_content(text)
def transform(input, output_filename=None, is_file=True,
              parse_dublincore=True, stylesheet='legacy', options=None,
              flags=None):
    """Transforms file input_filename in XML to output_filename in XHTML.

    If output_filename is None, returns an XML,
    otherwise returns True if file has been written, False if it hasn't.
    File won't be written if it has no content.

    Args:
        input: what to convert; handed to ``WLDocument.from_file`` when
            ``is_file`` is true, otherwise to ``WLDocument.from_string``.
        output_filename: where to write the HTML, or None to get the result
            back instead of writing it.
        is_file: selects which ``WLDocument`` constructor receives ``input``.
        parse_dublincore: forwarded unchanged to the ``WLDocument``
            constructor.
        stylesheet: a key of ``STYLESHEETS``.
        options: mapping forwarded as keyword arguments to
            ``document.transform``; None means no options.
        flags: optional iterable of attribute names; each is set to
            ``'yes'`` on the document's root element before transforming.

    Returns:
        With ``output_filename`` set: True if the file was written, False
        if the result had no content.  With ``output_filename`` None: the
        transformed tree, or the string ``"<empty />"`` if it had no content.

    Raises:
        ValueError: if a KeyError is raised while processing (intended for
            an unknown ``stylesheet`` name).
        ParseError: if lxml reports an XML syntax or XSLT application error.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if options is None:
        options = {}

    try:
        # Parse XSLT
        style_filename = get_stylesheet(stylesheet)
        style = etree.parse(style_filename)

        if is_file:
            document = WLDocument.from_file(
                input, True, parse_dublincore=parse_dublincore)
        else:
            document = WLDocument.from_string(
                input, True, parse_dublincore=parse_dublincore)

        if flags:
            for flag in flags:
                document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()

        result = document.transform(style, **options)
        del document  # no longer needed large object :)

        if html_has_content(result):
            add_anchors(result.getroot())
            add_table_of_contents(result.getroot())

            if output_filename is not None:
                result.write(output_filename, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8')
            else:
                return result
            return True
        else:
            if output_filename is not None:
                return False
            else:
                return "<empty />"
    except KeyError:
        # NOTE(review): this also catches a KeyError raised anywhere else in
        # the pipeline above, not only the stylesheet lookup.
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
class Fragment(object):
    """A theme fragment stored as a flat list of ``(event, payload)`` pairs.

    ``'start'`` and ``'end'`` events carry an element (anything exposing
    ``tag``, ``attrib``, ``text`` and ``tail``); every other event kind
    carries a piece of text that is emitted verbatim by ``to_string``.
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair at the end of the fragment."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events followed by an 'end' for every still-open 'start'.

        An 'end' event with no open 'start' is reported on standard output
        and otherwise left in place.
        """
        stack = []
        for event, element in self.events:
            if event == 'start':
                stack.append(('end', element))
            elif event == 'end':
                try:
                    stack.pop()
                except IndexError:
                    # Same text as the old print statement, in a form that
                    # also parses on Python 3.
                    print('CLOSED NON-OPEN TAG: %s' % (element,))

        # Innermost element must be closed first.
        stack.reverse()
        return self.events + stack

    def to_string(self):
        """Serialise the (closed) event list to markup.

        Attribute values and text are written as-is, without escaping.
        """
        result = []
        for event, element in self.closed_events():
            if event == 'start':
                result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
                if element.text:
                    result.append(element.text)
            elif event == 'end':
                result.append(u'</%s>' % element.tag)
                if element.tail:
                    result.append(element.tail)
            else:
                # Any other event kind: the payload is already text.
                result.append(element)

        return ''.join(result)

    def __unicode__(self):
        return self.to_string()
def extract_fragments(input_filename):
    """Extracts theme fragments from input_filename.

    Elements whose ``class`` is ``theme-begin`` / ``theme-end`` open and
    close a fragment identified by their ``fid`` attribute.  While a fragment
    is open, every other parse event is recorded into it.

    Returns:
        A ``(closed_fragments, open_fragments)`` pair of dicts mapping
        fragment id to ``Fragment``; the second holds fragments that were
        opened but never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document, so parse it with the HTML
    # parser first and feed iterparse a re-serialised copy of the element
    # at root[0][0].
    parser = etree.HTMLParser(encoding='utf-8')
    buf = cStringIO.StringIO()
    buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
    buf.seek(0)

    def append_text(text):
        # Hand a piece of text to every fragment that is currently open.
        for fragment_id in open_fragments:
            open_fragments[fragment_id].append('text', text)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        css_class = element.get('class', '')

        # Process begin and end elements
        if css_class in ('theme-begin', 'theme-end'):
            # Process elements only once, on end event
            if event != 'end':
                continue

            # Open new fragment
            if css_class == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Open the fragment with its enclosing elements, outermost
                # first, up to (not including) the one with id "book-text".
                # Stopping at the tree root as well avoids an AttributeError
                # when no such ancestor exists.
                parents = []
                parent = element.getparent()
                while parent is not None and parent.get('id', None) != 'book-text':
                    parents.append(parent)
                    parent = parent.getparent()

                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (input_filename, element.get('fid')))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # The marker itself is not recorded, but the text that follows
            # it belongs to the open fragments (we don't want to lose any
            # text).
            if element.tail:
                append_text(element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags, keeping only the text that follows them
            if len(element.get('name', '')) or css_class == 'annotation':
                if event == 'end' and element.tail:
                    append_text(element.tail)
            else:
                # Each open fragment gets its own shallow copy of the element.
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append(event, copy.copy(element))

    return closed_fragments, open_fragments
def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
    """Insert anchor ``<a>`` elements at the very start of ``element``.

    Args:
        element: the element to modify in place.
        prefix: anchor name; the link points to ``#prefix`` and the target
            is named ``prefix``.
        with_link: add an ``<a class="anchor" href="#prefix">`` link.
        with_target: add an ``<a class="target" name="prefix">`` target.
        link_text: visible text of the link; ``prefix`` is used when None.

    When both are requested the target ends up before the link, because it
    is inserted last.
    """
    def prepend(node):
        # The element's leading text must keep following the new first
        # child, so it is moved into that child's tail.
        if element.text:
            node.tail = element.text
            element.text = u''
        element.insert(0, node)

    if with_link:
        if link_text is None:
            link_text = prefix
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = unicode(link_text)
        prepend(link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = u' '
        prepend(target)
def any_ancestor(element, test):
    """Return True if ``test(ancestor)`` is truthy for any ancestor of ``element``.

    Ancestors are taken from ``element.iterancestors()`` and checking stops
    at the first match.  Returns False when nothing matches or there are no
    ancestors.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
def add_anchors(root):
    """Number verses and paragraphs under ``root`` and anchor them as ``f<N>``.

    A single counter, starting at 1, is shared by both kinds:

    * ``p`` elements whose class contains ``verse`` always advance the
      counter but only get an anchor at number 1 and at every multiple of 5;
    * other elements whose class contains ``paragraph`` get an anchor every
      time.

    Elements nested inside a note, motto, motto signature, dedication, the
    element with id ``nota_red`` or a ``blockquote`` are skipped and not
    counted.
    """
    skipped_classes = ('note', 'motto', 'motto_podpis', 'dedication')

    def is_skipped_container(e):
        return (e.get('class') in skipped_classes
                or e.get('id') == 'nota_red'
                or e.tag == 'blockquote')

    counter = 1
    for element in root.iterdescendants():
        if any_ancestor(element, is_skipped_container):
            continue

        css_class = element.get('class', '')
        if element.tag == 'p' and 'verse' in css_class:
            if counter == 1 or counter % 5 == 0:
                add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
        elif 'paragraph' in css_class:
            add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
def add_table_of_contents(root):
    """Anchor the ``h2``/``h3`` headings under ``root`` and prepend a TOC.

    Every heading that is not inside the element with id ``footnotes`` or an
    element with class ``person-list`` receives a target anchor ``s<N>``.
    An ``h3`` directly following an ``h2`` entry becomes that entry's
    subsection; every other heading is a top-level entry.  The resulting
    ``div`` (a header plus nested ordered lists of links) is inserted as the
    first child of ``root``.
    """
    def is_excluded_container(e):
        return e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)

    # Entries are (number, tag, title, subsections) tuples.
    sections = []
    counter = 1
    for element in root.iterdescendants():
        if element.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(element, is_excluded_container):
            continue

        # Read the title before add_anchor changes the heading's content.
        entry = (counter, element.tag, ''.join(element.xpath('text()')), [])
        if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(element, "s%d" % counter, with_link=False)
        counter += 1

    toc = etree.Element('div')
    toc.set('id', 'toc')
    header = etree.SubElement(toc, 'h2')
    header.text = u'Spis treści'
    top_list = etree.SubElement(toc, 'ol')

    for number, _tag, title, subsections in sections:
        item = etree.SubElement(top_list, 'li')
        add_anchor(item, "s%d" % number, with_target=False, link_text=title)

        if len(subsections):
            nested_list = etree.SubElement(item, 'ol')
            for sub_number, _sub_tag, sub_title, _ in subsections:
                sub_item = etree.SubElement(nested_list, 'li')
                add_anchor(sub_item, "s%d" % sub_number, with_target=False, link_text=sub_title)

    root.insert(0, toc)
def extract_annotations(html_path):
    """For each annotation, yields a tuple: anchor, text, html.

    The file at ``html_path`` is parsed as UTF-8 HTML and every ``div``
    directly under the element with id ``footnotes`` is treated as one
    annotation:

    * ``anchor`` is the ``href`` of the first direct ``a`` child carrying one;
    * ``text`` and ``html`` are UTF-8 serialisations of the footnote (text
      only / HTML markup) taken after its first two children are removed.

    Yields nothing when the document has no ``footnotes`` element.
    """
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)

    footnotes = tree.find('//*[@id="footnotes"]')
    if footnotes is None:
        # No footnotes section at all: nothing to extract.
        return

    for footnote in footnotes.findall('div'):
        anchor = footnote.find('a[@href]').get('href')
        # NOTE(review): presumably the two leading children are the
        # anchor/back-link markup added by the stylesheet -- confirm.
        del footnote[:2]
        text_str = etree.tostring(footnote, method='text', encoding='utf-8')
        html_str = etree.tostring(footnote, method='html', encoding='utf-8')
        yield anchor, text_str, html_str