# -*- coding: utf-8 -*-
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
import copy
import cStringIO
import os

from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError

from librarian import XHTMLNS, ParseError, OutputFile
from librarian import functions
# Module-level registration calls from librarian.functions; presumably they
# expose helper functions to the XSLT stylesheets -- confirm against
# librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()

# Stylesheet name -> XSLT file path, relative to this module's directory
# (get_stylesheet() joins it with os.path.dirname(__file__)).
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt',
}
def get_stylesheet(name):
    """Return the path of the stylesheet registered under ``name``.

    The path is built from this module's directory and the relative path
    stored in STYLESHEETS. An unknown ``name`` raises KeyError.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
def html_has_content(text):
    """Evaluate an XPath query for <p> and <h1> elements against ``text``.

    Both un-namespaced and XHTML-namespaced elements are matched. The
    result of the XPath evaluation is returned as-is, so callers can use
    its truthiness to tell whether any such element exists.
    """
    namespace = {'ns': str(XHTMLNS)}
    query = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % namespace
    find_content = etree.ETXPath(query)
    return find_content(text)
def transform(wldoc, stylesheet='legacy', options=None, flags=None):
    """Transforms the WL document to XHTML.

    ``wldoc`` is deep-copied first, so the caller's document is not
    modified by the preprocessing done here.

    ``stylesheet`` is a key of STYLESHEETS.
    ``options`` is an optional dict passed as keyword arguments to the
    document's ``transform`` method.
    ``flags`` is an optional iterable of attribute names; each one is set
    to 'yes' on the document's root element before the transformation.

    Returns an OutputFile with the UTF-8 encoded HTML, or None if the
    transformed document has no content (see html_has_content).

    Raises ValueError for an unknown stylesheet name and ParseError when
    lxml reports an XML syntax error or an XSLT application error.
    """
    # Only the stylesheet lookup is allowed to turn a KeyError into the
    # "not a valid stylesheet" error; a KeyError raised further down would
    # otherwise be misreported.
    try:
        style_filename = get_stylesheet(stylesheet)
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % (stylesheet,))

    try:
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        document.swap_endlines()

        for flag in flags or ():
            document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()

        result = document.transform(style, **(options or {}))
        del document  # no longer needed large object :)

        if not html_has_content(result):
            return None

        add_anchors(result.getroot())
        add_table_of_contents(result.getroot())

        return OutputFile.from_string(etree.tostring(
            result, method='html', xml_declaration=False,
            pretty_print=True, encoding='utf-8'))
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
class Fragment(object):
    """An ordered log of parse events collected for one theme fragment.

    Each entry of ``events`` is an ``(event, element)`` pair. For 'start'
    and 'end' events ``element`` is an element-like object (``tag``,
    ``attrib``, ``text``, ``tail``); for any other event it is a text
    string that is emitted verbatim.
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair at the end of the log."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events followed by 'end' events for still-open tags.

        An 'end' event with no matching 'start' is reported on stdout and
        otherwise ignored.
        """
        stack = []
        for event, element in self.events:
            if event == 'start':
                stack.append(('end', element))
            elif event == 'end':
                try:
                    stack.pop()
                except IndexError:
                    print('CLOSED NON-OPEN TAG: %s' % (element,))

        # The innermost open tag has to be closed first.
        stack.reverse()
        return self.events + stack

    def to_string(self):
        """Serialise the (auto-closed) event log as markup text."""
        result = []
        for event, element in self.closed_events():
            if event == 'start':
                result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
                if element.text:
                    result.append(element.text)
            elif event == 'end':
                result.append(u'</%s>' % element.tag)
                if element.tail:
                    result.append(element.tail)
            else:
                # Any other event carries plain text.
                result.append(element)

        return ''.join(result)

    def __unicode__(self):
        return self.to_string()
def extract_fragments(input_filename):
    """Extracts theme fragments from input_filename.

    Elements with class 'theme-begin' open a fragment identified by their
    'fid' attribute; elements with class 'theme-end' close the fragment
    with the same 'fid'. Every other element seen while a fragment is
    open is recorded in it, except elements with a 'name' attribute or
    class 'annotation'/'anchor', of which only the trailing text is kept.

    Returns a ``(closed_fragments, open_fragments)`` pair of dicts keyed
    by fragment id; ``open_fragments`` holds fragments never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document, so the file is first parsed
    # with the HTML parser and the first grandchild of the root is
    # re-serialised for iterparse.
    parser = etree.HTMLParser(encoding='utf-8')
    book_root = etree.parse(input_filename, parser).getroot()[0][0]
    buf = cStringIO.StringIO(etree.tostring(book_root, encoding='utf-8'))

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        css_class = element.get('class', '')

        # Process begin and end elements
        if css_class in ('theme-begin', 'theme-end'):
            # Process elements only once, on end event
            if event != 'end':
                continue

            if css_class == 'theme-begin':
                # Open new fragment
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Re-open every ancestor below #book-text inside the
                # fragment. The walk also stops at the tree root, so a
                # marker outside #book-text cannot run past the top.
                parents = []
                ancestor = element.getparent()
                while ancestor is not None and ancestor.get('id', None) != 'book-text':
                    parents.append(ancestor)
                    ancestor = ancestor.getparent()

                # Collected innermost-first; tags must open outermost-first.
                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment
            else:
                # Close existing fragment
                fid = element.get('fid')
                fragment = open_fragments.pop(fid, None)
                if fragment is None:
                    print('%s:closed not open fragment #%s' % (input_filename, fid))
                else:
                    closed_fragments[fragment.id] = fragment

            # Append element tail to every open fragment
            # (we don't want to lose any text)
            if element.tail:
                for fragment in open_fragments.values():
                    fragment.append('text', element.tail)

        # Process all elements except begin and end
        elif element.get('name', '') or css_class in ('annotation', 'anchor'):
            # Omit annotation tags, but keep the text that follows them
            if event == 'end' and element.tail:
                for fragment in open_fragments.values():
                    fragment.append('text', element.tail)
        else:
            for fragment in open_fragments.values():
                fragment.append(event, copy.copy(element))

    return closed_fragments, open_fragments
def _prepend_child(element, child):
    """Insert ``child`` as the first node of ``element``.

    Leading text of ``element`` is moved to the child's tail, so it follows
    the inserted node instead of preceding it and is not duplicated.
    """
    if element.text:
        child.tail = element.text
        element.text = u''
    element.insert(0, child)


def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
    """Prepend anchor elements named after ``prefix`` to ``element``.

    With ``with_link`` an ``<a class="anchor" href="#prefix">`` is added,
    whose text is ``link_text`` (``prefix`` when ``link_text`` is None).
    With ``with_target`` an ``<a class="target" name="prefix">`` is added.
    When both are requested the target ends up before the link, because
    it is inserted last.
    """
    if with_link:
        if link_text is None:
            link_text = prefix
        anchor = etree.Element('a', href='#%s' % prefix)
        anchor.set('class', 'anchor')
        anchor.text = unicode(link_text)
        _prepend_child(element, anchor)

    if with_target:
        anchor_target = etree.Element('a', name='%s' % prefix)
        anchor_target.set('class', 'target')
        anchor_target.text = u' '
        _prepend_child(element, anchor_target)
def any_ancestor(element, test):
    """Return True if ``test(ancestor)`` is true for any ancestor of ``element``.

    Ancestors come from ``element.iterancestors()``; evaluation stops at
    the first match. Returns False when there is no match or no ancestor.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
def add_anchors(root):
    """Number verses and paragraphs under ``root`` and add anchors to them.

    Elements are counted with a single counter starting at 1. Every
    element whose class contains 'paragraph' gets an anchor ``f<N>``;
    ``<p>`` elements whose class contains 'verse' get one only for
    N == 1 and every fifth N. Elements inside notes, mottos, dedications,
    the element with id 'nota_red' or a blockquote are skipped entirely.
    """
    def in_excluded_context(ancestor):
        return (ancestor.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
                or ancestor.get('id') == 'nota_red'
                or ancestor.tag == 'blockquote')

    counter = 1
    for element in root.iterdescendants():
        if any_ancestor(element, in_excluded_context):
            continue

        css_class = element.get('class', '')
        if element.tag == 'p' and 'verse' in css_class:
            if counter == 1 or counter % 5 == 0:
                add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
        elif 'paragraph' in css_class:
            add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
def raw_printable_text(element):
    """Return the stripped text content of ``element``.

    Works on a deep copy, so ``element`` itself is left untouched. The
    text of direct ``<a class="annotation">`` children is blanked first,
    so it does not show up in the result.
    """
    working = copy.deepcopy(element)
    for link in working.findall('a'):
        if link.get('class') == 'annotation':
            link.text = ''
    return etree.tostring(working, method='text', encoding=unicode).strip()
def add_table_of_contents(root):
    """Build a table of contents from h2/h3 headers and insert it in ``root``.

    Every h2/h3 outside the elements with id 'footnotes'/'nota_red' and
    outside 'person-list' elements gets a target anchor ``s<N>``. An h3
    that follows an h2 entry is listed as a subsection of that entry;
    any other header becomes a top-level entry. The resulting ``<div>``
    is inserted as the first child of ``root``.
    """
    def outside_main_text(ancestor):
        return (ancestor.get('id') in ('footnotes', 'nota_red')
                or ancestor.get('class') in ('person-list',))

    # Entries are (number, tag, title, subsections) tuples.
    sections = []
    counter = 1
    for element in root.iterdescendants():
        if element.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(element, outside_main_text):
            continue

        element_text = raw_printable_text(element)
        entry = (counter, element.tag, element_text, [])
        if element.tag == 'h3' and sections and sections[-1][1] == 'h2':
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(element, "s%d" % counter, with_link=False)
        counter += 1

    toc = etree.Element('div')
    toc_header = etree.SubElement(toc, 'h2')
    # Polish for "Table of contents".
    toc_header.text = u'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    for number, _tag, title, subsections in sections:
        section_element = etree.SubElement(toc_list, 'li')
        add_anchor(section_element, "s%d" % number, with_target=False, link_text=title)

        if subsections:
            subsection_list = etree.SubElement(section_element, 'ol')
            for sub_number, _sub_tag, sub_title, _ in subsections:
                subsection_element = etree.SubElement(subsection_list, 'li')
                add_anchor(subsection_element, "s%d" % sub_number, with_target=False, link_text=sub_title)

    root.insert(0, toc)
def extract_annotations(html_path):
    """For each annotation, yields a tuple: anchor, text, html.

    The file is parsed with the HTML parser. Annotations are the ``<div>``
    children of the element with id 'footnotes'; nothing is yielded when
    that element is absent. ``anchor`` is the 'name' attribute of the
    footnote's ``<a name=...>`` child; ``text`` and ``html`` are UTF-8
    encoded serialisations of the footnote.
    """
    html_parser = etree.HTMLParser(encoding='utf-8')
    document = etree.parse(html_path, html_parser)
    container = document.find('//*[@id="footnotes"]')
    if container is None:
        return

    for note in container.findall('div'):
        named_link = note.find('a[@name]')
        anchor = named_link.get('name')
        # NOTE(review): the footnote's own <a> children are still part of
        # both serialisations below -- confirm that this is intended.
        as_text = etree.tostring(note, method='text', encoding='utf-8').strip()
        as_html = etree.tostring(note, method='html', encoding='utf-8')
        yield anchor, as_text, as_html