1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
10 from lxml import etree
11 from librarian import XHTMLNS, ParseError, OutputFile
12 from librarian import functions
14 from lxml.etree import XMLSyntaxError, XSLTApplyError
# Import-time side effect: call the two registration helpers exposed by
# librarian.functions.
# NOTE(review): presumably these register custom extension functions used by
# the XSLT stylesheets below -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()
20 'legacy': 'xslt/book2html.xslt',
21 'full': 'xslt/wl2html_full.xslt',
22 'partial': 'xslt/wl2html_partial.xslt'
def get_stylesheet(name):
    """Return the filesystem path of the stylesheet registered as `name`.

    The relative path stored in ``STYLESHEETS`` is resolved against the
    directory containing this module.

    :raises KeyError: if `name` is not a key of ``STYLESHEETS``.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
def html_has_content(text):
    """Return the ``p`` / ``h1`` elements found in `text`.

    Both un-namespaced and XHTML-namespaced elements are matched.  The XPath
    result is returned as-is, so callers can use its truthiness to tell
    whether the document has any content.
    """
    namespace = str(XHTMLNS)
    expression = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': namespace}
    find_content = etree.ETXPath(expression)
    return find_content(text)
def transform(wldoc, stylesheet='legacy', options=None, flags=None):
    """Transform a WL document to XHTML.

    All work is done on a deep copy of `wldoc`.

    :param wldoc: source document; its deep copy must provide
        ``swap_endlines()``, ``clean_ed_note()``, ``transform()`` and an
        ``edoc`` tree.
    :param stylesheet: key of ``STYLESHEETS`` selecting the XSLT file.
    :param options: optional dict passed as keyword arguments to
        ``document.transform()``.
    :param flags: optional iterable of attribute names; each one is set to
        ``'yes'`` on the root element of the copied document.
    :returns: an ``OutputFile`` built from the UTF-8 HTML serialization, or
        ``None`` when ``html_has_content()`` finds nothing in the result.
    :raises ValueError: if `stylesheet` is not a known stylesheet name.
    :raises ParseError: if lxml raises ``XMLSyntaxError`` or
        ``XSLTApplyError`` while parsing or applying the stylesheet.
    """
    # Only the stylesheet lookup may legitimately raise KeyError; keeping the
    # handler this narrow stops unrelated KeyErrors from being misreported.
    try:
        style_filename = get_stylesheet(stylesheet)
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)

    try:
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        document.swap_endlines()

        for flag in flags or ():
            document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()

        result = document.transform(style, **(options or {}))
        del document  # the deep copy is a large object; release it early

        if not html_has_content(result):
            return None

        add_anchors(result.getroot())
        add_table_of_contents(result.getroot())

        return OutputFile.from_string(etree.tostring(
            result, method='html', xml_declaration=False,
            pretty_print=True, encoding='utf-8'))
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
class Fragment(object):
    """A theme fragment stored as a flat list of parse events.

    ``events`` holds ``(event, payload)`` pairs.  For ``'start'`` and
    ``'end'`` events the payload is an element (anything with ``tag``,
    ``attrib``, ``text`` and ``tail``); for any other event name the payload
    is emitted verbatim by :meth:`to_string`.
    """

    def __init__(self, id, themes):
        """Create an empty fragment identified by `id`, tagged with `themes`."""
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair at the end of the fragment."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events followed by 'end' events for unclosed elements.

        Every 'start' pushes a matching pending 'end'; every 'end' pops one.
        Whatever is still pending afterwards is appended innermost-first so
        the resulting sequence is balanced.  ``self.events`` is not modified.
        """
        pending = []
        for event, element in self.events:
            if event == 'start':
                pending.append(('end', element))
            elif event == 'end':
                try:
                    pending.pop()
                except IndexError:
                    # Best effort: report the stray close and carry on.
                    # Single-argument form prints the same text on
                    # Python 2 and also parses on Python 3.
                    print('CLOSED NON-OPEN TAG: %s' % (element,))

        pending.reverse()
        return self.events + pending

    def to_string(self):
        """Serialize the balanced event sequence to a markup string.

        NOTE(review): attribute values and text are written without any
        escaping, and a start tag always carries a space before ``>`` even
        when the element has no attributes.  Kept as-is because callers may
        depend on the exact output.
        """
        result = []
        for event, element in self.closed_events():
            if event == 'start':
                attributes = ' '.join(
                    '%s="%s"' % (k, v) for k, v in element.attrib.items())
                result.append(u'<%s %s>' % (element.tag, attributes))
                if element.text:
                    result.append(element.text)
            elif event == 'end':
                result.append(u'</%s>' % element.tag)
                if element.tail:
                    result.append(element.tail)
            else:
                # Non-element events (e.g. 'text') carry a plain string.
                result.append(element)

        return ''.join(result)

    def __unicode__(self):
        return self.to_string()
def extract_fragments(input_filename):
    """Extract theme fragments from the HTML file `input_filename`.

    The file is parsed with lxml's HTML parser, the first child of the
    root's first child is re-serialized, and that serialization is walked
    with ``iterparse``.  Elements whose class is ``theme-begin`` /
    ``theme-end`` open and close fragments keyed by their ``fid`` attribute;
    everything seen in between is recorded on every currently open fragment.

    :returns: ``(closed_fragments, open_fragments)``, two dicts mapping a
        fragment id to its ``Fragment``.  ``open_fragments`` holds the
        fragments that were never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document
    parser = etree.HTMLParser(encoding='utf-8')
    buf = cStringIO.StringIO()
    buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        css_class = element.get('class', '')

        # Process begin and end elements
        if css_class in ('theme-begin', 'theme-end'):
            # Process elements only once, on end event
            if event != 'end':
                continue

            if css_class == 'theme-begin':
                # Open new fragment
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Record the enclosing elements below 'book-text', outermost
                # first.  The None check stops the walk at the tree root
                # instead of raising AttributeError when no ancestor has
                # id="book-text".
                parents = []
                ancestor = element.getparent()
                while ancestor is not None and ancestor.get('id') != 'book-text':
                    parents.append(ancestor)
                    ancestor = ancestor.getparent()
                for parent in reversed(parents):
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment
            else:
                # Close existing fragment
                fid = element.get('fid')
                try:
                    fragment = open_fragments.pop(fid)
                except KeyError:
                    # Single-argument form: same output on Python 2, and
                    # also valid syntax on Python 3.
                    print('%s:closed not open fragment #%s' % (input_filename, fid))
                else:
                    closed_fragments[fragment.id] = fragment

            # Append the marker's tail to every open fragment
            # (we don't want to lose any text)
            if element.tail:
                for open_fragment in open_fragments.values():
                    open_fragment.append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags, but keep the text that follows them
            if (len(element.get('name', '')) or
                    css_class in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for open_fragment in open_fragments.values():
                        open_fragment.append('text', element.tail)
            else:
                # NOTE(review): the element is copied on 'start' events too;
                # lxml's iterparse documentation should be checked for what
                # content is guaranteed to be available at that point.
                for open_fragment in open_fragments.values():
                    open_fragment.append(event, copy.copy(element))

    return closed_fragments, open_fragments
def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
    """Prepend anchor elements named after `prefix` to `element`.

    With `with_link`, an ``<a class="anchor" href="#prefix">`` whose text is
    `link_text` (or `prefix` when `link_text` is None) is inserted as the
    first child.  With `with_target`, an ``<a class="target" name="prefix">``
    containing a single space is inserted as the first child.  Whenever an
    anchor is inserted, any leading text of `element` is moved into that
    anchor's tail so it still follows the anchor.
    """
    def prepend(node):
        # Make `node` the first child, keeping existing leading text after it.
        if element.text:
            node.tail = element.text
            element.text = u''
        element.insert(0, node)

    if with_link:
        label = prefix if link_text is None else link_text
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = unicode(label)
        prepend(link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = u' '
        prepend(target)
def any_ancestor(element, test):
    """Return True if `test` holds for at least one ancestor of `element`.

    Ancestors are taken from ``element.iterancestors()`` and checked in that
    order; the search stops at the first match.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
def add_anchors(root):
    """Number verses and paragraphs below `root` and attach ``f<N>`` anchors.

    Elements located inside notes, mottos, dedications, the ``nota_red``
    element or a blockquote are skipped and not counted.  Every paragraph
    gets an anchor; verses are all counted, but only verse 1 and every fifth
    verse get one.
    """
    skipped_classes = ('note', 'motto', 'motto_podpis', 'dedication')

    def is_excluded_container(node):
        return (node.get('class') in skipped_classes
                or node.get('id') == 'nota_red'
                or node.tag == 'blockquote')

    number = 1
    for element in root.iterdescendants():
        if any_ancestor(element, is_excluded_container):
            continue

        css_class = element.get('class', '')
        if element.tag == 'p' and 'verse' in css_class:
            if number == 1 or number % 5 == 0:
                add_anchor(element, "f%d" % number, link_text=number)
            number += 1
        elif 'paragraph' in css_class:
            add_anchor(element, "f%d" % number, link_text=number)
            number += 1
def add_table_of_contents(root):
    """Build a table of contents from h2/h3 headings and prepend it to `root`.

    Headings inside the ``footnotes`` element or a ``person-list`` are
    ignored.  Each remaining heading gets an ``s<N>`` target anchor.  An h3
    that directly follows an h2 section is listed as that section's
    subsection; every other heading becomes a top-level entry.  The finished
    ``<div id="toc">`` is inserted as the first child of `root`.
    """
    def in_excluded_area(node):
        return node.get('id') in ('footnotes',) or node.get('class') in ('person-list',)

    sections = []
    number = 1
    for element in root.iterdescendants():
        if element.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(element, in_excluded_area):
            continue

        # Take the heading text before the target anchor is added to it.
        title = etree.tostring(element, method='text',
                               encoding=unicode).strip()
        entry = (number, element.tag, title, [])
        if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(element, "s%d" % number, with_link=False)
        number += 1

    toc = etree.Element('div')
    toc.set('id', 'toc')
    header = etree.SubElement(toc, 'h2')
    header.text = u'Spis treści'
    top_list = etree.SubElement(toc, 'ol')

    for section_no, _section_tag, section_title, subsections in sections:
        item = etree.SubElement(top_list, 'li')
        add_anchor(item, "s%d" % section_no, with_target=False, link_text=section_title)

        if len(subsections):
            sub_list = etree.SubElement(item, 'ol')
            for sub_no, _sub_tag, sub_title, _ in subsections:
                sub_item = etree.SubElement(sub_list, 'li')
                add_anchor(sub_item, "s%d" % sub_no, with_target=False, link_text=sub_title)

    root.insert(0, toc)
def extract_annotations(html_path):
    """For each annotation, yield a tuple: anchor, text, html.

    The file at `html_path` is parsed with lxml's HTML parser.  For every
    ``div`` directly below the element with ``id="footnotes"`` this yields
    the ``name`` of the footnote's ``a[@name]`` child, then the footnote
    serialized as UTF-8 text (stripped) and as UTF-8 HTML.  Nothing is
    yielded when the document has no footnotes element.

    The parsed tree is modified while iterating: the first two children of
    each footnote are removed before it is serialized.
    """
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)
    # Use a relative './/' path: ElementTree.find() turns an absolute '//'
    # path into exactly this form and emits a FutureWarning while doing so.
    footnotes = tree.find('.//*[@id="footnotes"]')
    if footnotes is None:
        return

    for footnote in footnotes.findall('div'):
        anchor = footnote.find('a[@name]').get('name')
        # NOTE(review): presumably the two leading children are the anchor
        # links, dropped so they do not appear in the output -- confirm.
        del footnote[:2]
        text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip()
        html_str = etree.tostring(footnote, method='html', encoding='utf-8')
        yield anchor, text_str, html_str