1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
10 from lxml import etree
11 from librarian import XHTMLNS, ParseError, OutputFile
12 from librarian import functions
14 from lxml.etree import XMLSyntaxError, XSLTApplyError
# Import-time side effect: two registration hooks from librarian.functions are called.
# NOTE(review): presumably these make helper functions available to the XSLT stylesheets -- confirm in librarian.functions.
16 functions.reg_substitute_entities()
17 functions.reg_person_name()
# Stylesheet name -> XSLT file path entries. get_stylesheet() looks names up in STYLESHEETS
# and joins the path onto this module's directory.
# NOTE(review): the enclosing `STYLESHEETS = {` and `}` lines are not visible in this listing -- confirm.
20 'legacy': 'xslt/book2html.xslt',
21 'full': 'xslt/wl2html_full.xslt',
22 'partial': 'xslt/wl2html_partial.xslt'
def get_stylesheet(name):
    """Return the path of the XSLT stylesheet registered under *name*.

    The path stored in STYLESHEETS is joined onto the directory that
    contains this module.  An unknown *name* propagates the lookup error
    raised by STYLESHEETS (transform() relies on this to report an
    invalid stylesheet).
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
def html_has_content(text):
    """Return the <p> and <h1> nodes found anywhere in *text*.

    Both un-namespaced elements and elements in the XHTMLNS namespace are
    matched.  The XPath result is returned as-is; callers use it as a
    truth value (empty result means "no content").
    """
    namespace = str(XHTMLNS)
    query = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': namespace}
    find_content = etree.ETXPath(query)
    return find_content(text)
# Render a WL document to XHTML using the XSLT stylesheet selected by name.
#
#   wldoc      -- source document; all processing below happens on a deep copy of it.
#   stylesheet -- name resolved to a stylesheet file by get_stylesheet() (default 'legacy').
#   options    -- forwarded as keyword arguments to document.transform().
#   flags      -- each flag is set to 'yes' as an attribute of the document root
#                 (the loop over flags is not visible in this listing).
#
# When the transformed result has content (see html_has_content), anchors and a
# table of contents are added to it in place and the HTML serialization is
# returned wrapped in an OutputFile.
#
# NOTE(review): the docstring below talks about an `output_filename` parameter and
# True/False return values, none of which match this signature or the visible
# return statement -- it looks stale; confirm and update.
# NOTE(review): several statements of this function (the `try:`, the guards around
# flags/options, the no-content return and the exception handler bodies) are not
# visible in this listing; the comments describe only the visible lines.
31 def transform(wldoc, stylesheet='legacy', options=None, flags=None):
32 """Transforms the WL document to XHTML.
34 If output_filename is None, returns an XML,
35 otherwise returns True if file has been written,False if it hasn't.
36 File won't be written if it has no content.
# Locate and parse the XSLT stylesheet.
40 style_filename = get_stylesheet(stylesheet)
41 style = etree.parse(style_filename)
# Work on a private deep copy of the input document.
43 document = copy.deepcopy(wldoc)
45 document.swap_endlines()
# Expose the flag to the stylesheet as a root-element attribute with value 'yes'.
49 document.edoc.getroot().set(flag, 'yes')
51 document.clean_ed_note()
# `options` is unpacked with **, so it must be a mapping at this point
# (presumably an elided guard replaces the None default -- confirm).
55 result = document.transform(style, **options)
56 del document # no longer needed large object :)
# Only results containing at least one paragraph or top-level heading are post-processed.
58 if html_has_content(result):
59 add_anchors(result.getroot())
60 add_table_of_contents(result.getroot())
# Serialize as pretty-printed UTF-8 HTML without an XML declaration.
62 return OutputFile.from_string(etree.tostring(result, method='html',
63 xml_declaration=False, pretty_print=True, encoding='utf-8'))
# NOTE(review): the '%s' placeholder is never interpolated, so the message contains a
# literal %s instead of the stylesheet name -- likely meant `% stylesheet`.
67 raise ValueError("'%s' is not a valid stylesheet.")
# Python 2 `except ..., e` syntax; the handler body is not visible in this listing.
68 except (XMLSyntaxError, XSLTApplyError), e:
# Fragment: an ordered record of (event, element) pairs that can be serialized
# back into markup text.
# NOTE(review): many lines of this class are missing from this listing (the
# attribute assignments in __init__, the branch conditions inside the loops, and
# the header of the method that __unicode__ calls as to_string()).  The comments
# below describe only the visible lines.
71 class Fragment(object):
# `id` and `themes` are supplied by the caller; extract_fragments passes the
# element's 'fid' attribute and its text.  Their storage on self is not visible here.
72 def __init__(self, id, themes):
73 super(Fragment, self).__init__()
# Record one (event, element) pair.  For 'text' events the "element" is a plain
# string (extract_fragments passes element.tail).
78 def append(self, event, element):
79 self.events.append((event, element))
# Return the recorded events followed by the contents of `stack`.
# NOTE(review): presumably `stack` ends up holding ('end', element) pairs for tags
# that were opened but never closed -- confirm against the full source.
81 def closed_events(self):
83 for event, element in self.events:
85 stack.append(('end', element))
# Python 2 print statement used as a diagnostic.
90 print 'CLOSED NON-OPEN TAG:', element
93 return self.events + stack
# Serialization loop: builds a list of text pieces from the closed event stream
# and joins them.  Presumably this is the body of to_string(); its `def` line is
# not visible in this listing.
97 for event, element in self.closed_events():
# Opening tag with all attributes.
# NOTE(review): attribute values are interpolated without any escaping.
99 result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
101 result.append(element.text)
103 result.append(u'</%s>' % element.tag)
105 result.append(element.tail)
# Here `element` is appended directly, so it is expected to be a string
# (presumably the 'text' event branch -- confirm).
107 result.append(element)
109 return ''.join(result)
# Unicode conversion delegates to to_string().
111 def __unicode__(self):
112 return self.to_string()
# Scan an HTML file for theme markers (elements whose class is 'theme-begin' or
# 'theme-end') and collect the markup between each pair into Fragment objects.
#
# Returns a tuple (closed_fragments, open_fragments): fragments that were properly
# closed, and fragments still open when the input ended, each keyed by fragment id.
#
# NOTE(review): several lines are missing from this listing (the creation of
# `open_fragments`, rewinding `buf` before it is read, and the try/except/else
# lines around the lookups); the comments describe only what is visible.
115 def extract_fragments(input_filename):
116 """Extracts theme fragments from input_filename."""
118 closed_fragments = {}
120 # iterparse would die on a HTML document
# So the file is parsed with the lenient HTML parser first, one subtree is
# re-serialized into an in-memory buffer, and iterparse runs over that buffer.
121 parser = etree.HTMLParser(encoding='utf-8')
122 buf = cStringIO.StringIO()
# [0][0] selects the first child of the root's first child.
# NOTE(review): presumably the book container inside <body> -- confirm.
123 buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
126 for event, element in etree.iterparse(buf, events=('start', 'end')):
127 # Process begin and end elements
128 if element.get('class', '') in ('theme-begin', 'theme-end'):
129 if not event == 'end': continue # Process elements only once, on end event
# Open a new fragment identified by the marker's 'fid' attribute.
132 if element.get('class', '') == 'theme-begin':
133 fragment = Fragment(id=element.get('fid'), themes=element.text)
# Collect the marker's ancestors up to (not including) the element with
# id 'book-text', so the fragment can start with their opening tags.
136 if element.getparent().get('id', None) != 'book-text':
137 parents = [element.getparent()]
138 while parents[-1].getparent().get('id', None) != 'book-text':
139 parents.append(parents[-1].getparent())
142 for parent in parents:
143 fragment.append('start', parent)
145 open_fragments[fragment.id] = fragment
147 # Close existing fragment
150 fragment = open_fragments[element.get('fid')]
# Python 2 print statement: reports an end marker whose fragment is not open.
152 print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
# Move the fragment from the open set to the closed set.
154 closed_fragments[fragment.id] = fragment
155 del open_fragments[fragment.id]
157 # Append element tail to lost_text (we don't want to lose any text)
# The marker's tail text is added to every fragment that is currently open.
159 for fragment_id in open_fragments:
160 open_fragments[fragment_id].append('text', element.tail)
163 # Process all elements except begin and end
165 # Omit annotation tags
# An element counts as an annotation when it has a non-empty 'name' attribute or
# class 'annotation'; only its tail text is kept, and only on the 'end' event.
166 if len(element.get('name', '')) or element.get('class', '') == 'annotation':
167 if event == 'end' and element.tail:
168 for fragment_id in open_fragments:
169 open_fragments[fragment_id].append('text', element.tail)
# Every other element is recorded, as a shallow copy, in all open fragments.
171 for fragment_id in open_fragments:
172 open_fragments[fragment_id].append(event, copy.copy(element))
174 return closed_fragments, open_fragments
# Insert helper <a> elements at the beginning of `element`:
#   * a visible link   -- class "anchor", href "#<prefix>", text = link_text
#   * a named target   -- class "target", name "<prefix>", text = a single space
#
# NOTE(review): the `if with_link:` / `if with_target:` guards implied by the
# parameters, and the handling around element.text, are not fully visible in this
# listing; the comments describe only the visible lines.
177 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
# Default for the link text (the assigned value is not visible in this listing).
179 if link_text is None:
181 anchor = etree.Element('a', href='#%s' % prefix)
182 anchor.set('class', 'anchor')
# Python 2 `unicode`: link_text may be a number (add_anchors passes a counter).
183 anchor.text = unicode(link_text)
# The element's leading text becomes the tail of the inserted anchor.
185 anchor.tail = element.text
187 element.insert(0, anchor)
190 anchor_target = etree.Element('a', name='%s' % prefix)
191 anchor_target.set('class', 'target')
192 anchor_target.text = u' '
# Same text hand-over for the target; it is inserted at index 0 as well, so it
# ends up before a link inserted earlier.
194 anchor_target.tail = element.text
196 element.insert(0, anchor_target)
# Iterate over the ancestors of `element`.
# NOTE(review): the loop body and the return statements are not visible in this
# listing.  Callers use the result as a boolean and pass a predicate as `test`,
# so presumably this returns whether `test` holds for any ancestor -- confirm.
199 def any_ancestor(element, test):
200 for ancestor in element.iterancestors():
# Add numbered anchors ("f<counter>") to verses and paragraphs below `root`.
# NOTE(review): the initialisation and increments of `counter`, and the statement
# executed for excluded elements, are not visible in this listing.
206 def add_anchors(root):
208 for element in root.iterdescendants():
# Elements nested inside notes, mottos, dedications, the element with id
# 'nota_red' or a blockquote are tested for here (presumably to skip them -- confirm).
209 if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
210 or e.get('id') == 'nota_red'
211 or e.tag == 'blockquote'):
# Verses: only the first one and every fifth one get an anchor.
214 if element.tag == 'p' and 'verse' in element.get('class', ''):
215 if counter == 1 or counter % 5 == 0:
216 add_anchor(element, "f%d" % counter, link_text=counter)
# Paragraphs: every one gets an anchor.
218 elif 'paragraph' in element.get('class', ''):
219 add_anchor(element, "f%d" % counter, link_text=counter)
# Build a table of contents from the h2/h3 headings below `root`.
#
# Each collected heading is a tuple (counter, tag, text, subsections); an h3 that
# follows an h2 section is stored in that section's subsection list.  Every
# collected heading also gets a link-less target anchor "s<counter>".
#
# NOTE(review): the initialisation/increment of `counter` and `sections`, the
# statement executed for excluded headings, and the place where `toc` is attached
# to the document are not visible in this listing.
223 def add_table_of_contents(root):
226 for element in root.iterdescendants():
227 if element.tag in ('h2', 'h3'):
# Headings inside the element with id 'footnotes' or inside a 'person-list'
# element are tested for here (presumably to skip them -- confirm).
228 if any_ancestor(element, lambda e: e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)):
# Plain-text content of the heading, as unicode (Python 2), stripped.
231 element_text = etree.tostring(element, method='text',
232 encoding=unicode).strip()
233 if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
234 sections[-1][3].append((counter, element.tag, element_text, []))
236 sections.append((counter, element.tag, element_text, []))
237 add_anchor(element, "s%d" % counter, with_link=False)
# Build the TOC markup: <div><h2>title</h2><ol>...</ol></div>.
240 toc = etree.Element('div')
242 toc_header = etree.SubElement(toc, 'h2')
# User-facing title, Polish for "Table of contents".
243 toc_header.text = u'Spis treści'
244 toc_list = etree.SubElement(toc, 'ol')
246 for n, section, text, subsections in sections:
247 section_element = etree.SubElement(toc_list, 'li')
# Link only (no target): points at the "s<n>" target added to the heading.
248 add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
251 subsection_list = etree.SubElement(section_element, 'ol')
# Note: this inner loop rebinds `n` and `text` from the outer loop; the outer loop
# reassigns them on its next iteration.
252 for n, subsection, text, _ in subsections:
253 subsection_element = etree.SubElement(subsection_list, 'li')
254 add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)
# Generator over the footnotes of an HTML file: parses `html_path` with the HTML
# parser, looks up the element with id "footnotes" and walks its direct <div>
# children.  Nothing is yielded when there is no such element.
# NOTE(review): one source line between the anchor lookup and the serialization
# is not visible in this listing -- confirm nothing relevant is missing.
259 def extract_annotations(html_path):
260 """For each annotation, yields a tuple: anchor, text, html."""
261 parser = etree.HTMLParser(encoding='utf-8')
262 tree = etree.parse(html_path, parser)
263 footnotes = tree.find('//*[@id="footnotes"]')
264 if footnotes is not None:
265 for footnote in footnotes.findall('div'):
# The anchor is the 'name' attribute of the first direct <a> child that has one.
# NOTE(review): find() returns None when there is no such child, which would make
# .get() raise AttributeError -- confirm every footnote div carries a named <a>.
266 anchor = footnote.find('a[@name]').get('name')
# Both serializations are UTF-8 encoded byte strings; the text form is stripped.
268 text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip()
269 html_str = etree.tostring(footnote, method='html', encoding='utf-8')
270 yield anchor, text_str, html_str