# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
import copy
import io
import os
from xml.sax.saxutils import escape

from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError

from librarian import XHTMLNS, ParseError, OutputFile
from librarian import functions
# Module-level setup calls into ``librarian.functions``.
# NOTE(review): presumably these register the extension functions the XSLT
# stylesheets rely on -- confirm against ``librarian.functions``.
functions.reg_substitute_entities()
functions.reg_person_name()

# Maps a stylesheet name to the path of its XSLT file, relative to the
# directory containing this module (see ``get_stylesheet``).
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt',
}
def get_stylesheet(name):
    """Return the path of the XSLT stylesheet registered under ``name``.

    The path is built by joining the directory of this module with the
    relative path stored in ``STYLESHEETS``.

    :param name: a key of ``STYLESHEETS``
    :raises KeyError: if ``name`` is not a key of ``STYLESHEETS``
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
def html_has_content(text):
    """Return the ``p`` and ``h1`` elements found in ``text``.

    Both the un-namespaced and the XHTML-namespaced forms of the tags are
    searched for. Callers use the truthiness of the result to decide
    whether the document has any content.

    :param text: the object the compiled XPath is evaluated on (despite the
        name it is handed to ``etree.ETXPath``, not treated as a string)
    """
    namespace = str(XHTMLNS)
    expression = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': namespace}
    find_content = etree.ETXPath(expression)
    return find_content(text)
def transform(wldoc, stylesheet='legacy', options=None, flags=None):
    """Transform the WL document to XHTML.

    The work is done on a deep copy of ``wldoc``.

    :param wldoc: source document; it must provide ``swap_endlines()``,
        ``clean_ed_note()``, ``transform()`` and an ``edoc`` tree
    :param stylesheet: key of ``STYLESHEETS`` selecting the XSLT to apply
    :param options: optional mapping passed to the document's
        ``transform()`` as keyword arguments
    :param flags: optional iterable of attribute names; each one is set to
        ``'yes'`` on the root element before the transformation
    :returns: an ``OutputFile`` built from the UTF-8 encoded HTML, or
        ``None`` if the transformed document has no content
    :raises ValueError: if ``stylesheet`` is not a known stylesheet name
    :raises ParseError: if the stylesheet cannot be parsed or applied
    """
    # ``get_stylesheet`` looks the name up in a dict, so an unknown name
    # surfaces as KeyError. The message now names the offending value; it
    # used to contain a literal, never-interpolated "%s".
    # NOTE(review): the ``except KeyError`` clause is assumed; confirm it
    # is the intended trigger for this ValueError.
    try:
        style_filename = get_stylesheet(stylesheet)
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % (stylesheet,))

    try:
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        document.swap_endlines()

        for flag in flags or ():
            document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()

        result = document.transform(style, **(options or {}))
        del document  # no longer needed large object :)

        if not html_has_content(result):
            return None

        add_anchors(result.getroot())
        add_table_of_contents(result.getroot())

        return OutputFile.from_string(etree.tostring(
            result, method='html', xml_declaration=False,
            pretty_print=True, encoding='utf-8'))
    except (XMLSyntaxError, XSLTApplyError) as e:
        # NOTE(review): wrapping the lxml error in ParseError is assumed
        # from the imported names -- confirm.
        raise ParseError(e)
class Fragment(object):
    """A theme fragment recorded as a flat list of parse events.

    Every event is an ``(event, element)`` pair. ``'start'`` and ``'end'``
    events carry an element (anything exposing ``tag``, ``attrib``,
    ``text`` and ``tail``); any other event name carries a plain string.
    """

    def __init__(self, id, themes):
        """Create an empty fragment.

        :param id: identifier of the fragment
        :param themes: themes of the fragment, stored as given
        """
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair at the end of the fragment."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the recorded events plus the ``'end'`` events still missing.

        Every ``'start'`` without a matching ``'end'`` gets a synthetic
        ``('end', element)`` appended, innermost element first, so that the
        returned sequence is balanced. An ``'end'`` event arriving while
        nothing is open is reported on standard output and otherwise
        ignored. ``self.events`` itself is not modified.
        """
        pending = []
        for event, element in self.events:
            if event == 'start':
                pending.append(('end', element))
            elif event == 'end':
                if pending:
                    pending.pop()
                else:
                    print('CLOSED NON-OPEN TAG: %s' % (element,))

        # The element opened last has to be closed first.
        pending.reverse()
        return self.events + pending

    def to_string(self):
        """Serialise the balanced event sequence to a markup string.

        Attribute values and text are escaped (``&``, ``<``, ``>`` and, in
        attribute values, ``"``) so that special characters cannot break
        the generated markup.
        """
        result = []
        for event, element in self.closed_events():
            if event == 'start':
                attributes = ' '.join(
                    '%s="%s"' % (name, escape(value, {'"': '&quot;'}))
                    for name, value in element.attrib.items())
                # The space after the tag name is emitted even when the
                # element has no attributes.
                result.append(u'<%s %s>' % (element.tag, attributes))
                if element.text:
                    result.append(escape(element.text))
            elif event == 'end':
                result.append(u'</%s>' % element.tag)
                if element.tail:
                    result.append(escape(element.tail))
            else:
                # Any other event carries a plain string, not an element.
                result.append(escape(element))

        return ''.join(result)

    def __unicode__(self):
        """Return the fragment's markup (see ``to_string``)."""
        return self.to_string()
def extract_fragments(input_filename):
    """Extract theme fragments from the HTML file ``input_filename``.

    Elements with class ``theme-begin`` open a fragment identified by their
    ``fid`` attribute; elements with class ``theme-end`` close it. While a
    fragment is open, every parse event is recorded in it.

    :returns: a pair ``(closed_fragments, open_fragments)`` of dicts mapping
        fragment id to ``Fragment``; the second holds fragments that were
        opened but never closed
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document, so the file is read with the
    # HTML parser first and the selected subtree is re-serialised for it.
    parser = etree.HTMLParser(encoding='utf-8')
    subtree = etree.parse(input_filename, parser).getroot()[0][0]
    buf = io.BytesIO(etree.tostring(subtree, encoding='utf-8'))

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        css_class = element.get('class', '')

        # Process begin and end elements
        if css_class in ('theme-begin', 'theme-end'):
            # Process elements only once, on end event
            if event != 'end':
                continue

            if css_class == 'theme-begin':
                # Open new fragment
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Collect the enclosing elements up to (not including) the
                # one with id "book-text". Stopping at the tree root too
                # avoids calling ``.get`` on None when no such ancestor
                # exists.
                parents = []
                parent = element.getparent()
                while parent is not None and parent.get('id') != 'book-text':
                    parents.append(parent)
                    parent = parent.getparent()

                # NOTE(review): emitting the outermost parent first is
                # assumed (needed for the start tags to nest) -- confirm.
                for parent in reversed(parents):
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment
            else:
                # Close existing fragment
                fid = element.get('fid')
                fragment = open_fragments.pop(fid, None)
                if fragment is None:
                    print('%s:closed not open fragment #%s' % (input_filename, fid))
                else:
                    closed_fragments[fragment.id] = fragment

            # Append the marker's tail to the fragments that are still open
            # (we don't want to lose any text)
            if element.tail:
                for open_fragment in open_fragments.values():
                    open_fragment.append('text', element.tail)

        # Process all elements except begin and end
        elif element.get('name', '') or css_class == 'annotation':
            # Omit annotation tags, but keep the text that follows them
            if event == 'end' and element.tail:
                for open_fragment in open_fragments.values():
                    open_fragment.append('text', element.tail)
        else:
            # Each open fragment gets its own shallow copy of the element.
            for open_fragment in open_fragments.values():
                open_fragment.append(event, copy.copy(element))

    return closed_fragments, open_fragments
try:
    _text_type = unicode  # Python 2
except NameError:
    _text_type = str  # Python 3


def _prepend_child(element, child):
    """Insert ``child`` as the first child of ``element``.

    Text that preceded the first child is moved to the tail of ``child`` so
    that it still follows the inserted node.
    """
    # NOTE(review): clearing ``element.text`` after moving it is assumed;
    # without it the text would appear twice -- confirm.
    if element.text:
        child.tail = element.text
        element.text = u''
    element.insert(0, child)


def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
    """Prepend anchor elements named after ``prefix`` to ``element``.

    :param element: element that receives the new ``a`` children
    :param prefix: anchor name; the link points to ``#<prefix>`` and the
        target is named ``<prefix>``
    :param with_link: add an ``<a class="anchor" href="#prefix">`` link
    :param with_target: add an ``<a class="target" name="prefix">`` target
    :param link_text: text of the link, converted to text if needed
    """
    if with_link:
        if link_text is None:
            # NOTE(review): falling back to the prefix is assumed -- confirm.
            link_text = prefix
        anchor = etree.Element('a', href='#%s' % prefix)
        anchor.set('class', 'anchor')
        anchor.text = _text_type(link_text)
        _prepend_child(element, anchor)

    if with_target:
        anchor_target = etree.Element('a', name='%s' % prefix)
        anchor_target.set('class', 'target')
        anchor_target.text = u' '
        # Inserted after the link, so the target ends up in front of it.
        _prepend_child(element, anchor_target)
def any_ancestor(element, test):
    """Return True if ``test`` holds for at least one ancestor of ``element``.

    :param element: element providing ``iterancestors()``
    :param test: callable taking an ancestor and returning a truth value
    :returns: ``True`` as soon as one ancestor satisfies ``test``, otherwise
        ``False`` (also when there are no ancestors)
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
def add_anchors(root):
    """Number the verses and paragraphs below ``root`` and anchor them.

    Elements with an ancestor of class ``note``, ``motto``, ``motto_podpis``
    or ``dedication``, an ancestor with id ``nota_red``, or a ``blockquote``
    ancestor are skipped and not counted. Every paragraph gets an anchor;
    among verses only the first counted element and every fifth one do.

    :param root: element whose descendants are processed in place
    """
    excluded_classes = ('note', 'motto', 'motto_podpis', 'dedication')

    def is_excluded(ancestor):
        # Built once instead of once per visited element.
        return (ancestor.get('class') in excluded_classes
                or ancestor.get('id') == 'nota_red'
                or ancestor.tag == 'blockquote')

    # NOTE(review): starting at 1 and counting every verse and paragraph is
    # assumed from the ``counter == 1`` / ``counter % 5`` checks -- confirm.
    counter = 1
    for element in root.iterdescendants():
        if any_ancestor(element, is_excluded):
            continue

        # Treat a missing class as empty so the ``in`` tests cannot fail.
        css_class = element.get('class') or ''

        if element.tag == 'p' and 'verse' in css_class:
            if counter == 1 or counter % 5 == 0:
                add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
        elif 'paragraph' in css_class:
            add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
def add_table_of_contents(root):
    """Build a table of contents from the ``h2``/``h3`` headings below ``root``.

    Headings inside an element with id ``footnotes`` or class
    ``person-list`` are ignored. Every remaining heading receives a target
    anchor ``s<number>``. An ``h3`` that follows an ``h2`` section is listed
    as a subsection of that section; any other heading starts a new
    top-level entry.

    :param root: element that is scanned and modified in place
    """
    def is_excluded(ancestor):
        return (ancestor.get('id') in ('footnotes',)
                or ancestor.get('class') in ('person-list',))

    # Entries are (number, tag, title, subsections) tuples.
    sections = []
    counter = 1
    for element in root.iterdescendants():
        if element.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(element, is_excluded):
            continue

        entry = (counter, element.tag, ''.join(element.xpath('text()')), [])
        if element.tag == 'h3' and sections and sections[-1][1] == 'h2':
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(element, "s%d" % counter, with_link=False)
        counter += 1

    # NOTE(review): no attributes are set on the container here; confirm
    # whether it is expected to carry an id for styling or lookup.
    toc = etree.Element('div')
    toc_header = etree.SubElement(toc, 'h2')
    toc_header.text = u'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    for number, _tag, title, subsections in sections:
        section_element = etree.SubElement(toc_list, 'li')
        add_anchor(section_element, "s%d" % number, with_target=False, link_text=title)

        # NOTE(review): skipping the nested list for sections without
        # subsections is assumed -- confirm.
        if subsections:
            subsection_list = etree.SubElement(section_element, 'ol')
            # Distinct names: the inner loop no longer rebinds the outer
            # loop's variables.
            for sub_number, _sub_tag, sub_title, _ in subsections:
                subsection_element = etree.SubElement(subsection_list, 'li')
                add_anchor(subsection_element, "s%d" % sub_number, with_target=False, link_text=sub_title)

    # NOTE(review): placing the table of contents as the first child of
    # ``root`` is assumed -- confirm.
    root.insert(0, toc)
def extract_annotations(html_path):
    """For each annotation, yield a tuple: anchor, text, html.

    The annotations are the ``div`` children of the element with id
    ``footnotes``; nothing is yielded when the document has no such element.

    :param html_path: HTML source handed to ``etree.parse``
    :returns: generator of ``(anchor, text_str, html_str)`` where ``anchor``
        is the ``name`` of the footnote's ``a`` child and the two strings
        are the UTF-8 encoded text and HTML serialisations of the footnote
    """
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)

    # A relative path is used on purpose: lxml's ElementTree.find() rewrites
    # an absolute "//..." path to ".//..." and emits a FutureWarning.
    footnotes = tree.find('.//*[@id="footnotes"]')
    if footnotes is None:
        return

    for footnote in footnotes.findall('div'):
        anchor = footnote.find('a[@name]').get('name')
        # NOTE(review): the serialised footnote still contains its anchor
        # element(s); confirm whether callers expect them stripped first.
        text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip()
        html_str = etree.tostring(footnote, method='html', encoding='utf-8')
        yield anchor, text_str, html_str