1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
11 from lxml import etree
12 from librarian import XHTMLNS, ParseError, OutputFile
13 from librarian import functions
15 from lxml.etree import XMLSyntaxError, XSLTApplyError
# Import-time side effects: both helpers come from librarian.functions and are
# invoked once when this module is loaded.
# NOTE(review): presumably they register XSLT extension functions used by the
# stylesheets below -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()
# Maps the stylesheet names accepted by get_stylesheet() to XSLT file paths
# relative to the directory of this module.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt',
}
def get_stylesheet(name):
    """Return the path of the XSLT stylesheet registered under `name`.

    The path is built by joining the directory that contains this module
    with the relative path stored in STYLESHEETS.

    Raises KeyError when `name` is not a key of STYLESHEETS.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
def html_has_content(text):
    """Return the <p> and <h1> elements found in `text`.

    Both un-namespaced and XHTML-namespaced elements are matched.  The value
    is whatever the compiled ETXPath expression returns, so callers can use
    its truthiness as a "document has content" test.
    """
    namespace = {'ns': str(XHTMLNS)}
    find_content = etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % namespace)
    return find_content(text)
def transform(wldoc, stylesheet='legacy', options=None, flags=None):
    """Transform the WL document to XHTML.

    `wldoc` is deep-copied before processing, so the caller's document is
    left untouched.  `stylesheet` is a key of STYLESHEETS.  `options` is an
    optional dict of keyword arguments passed to the document's transform();
    a 'gallery' entry defaulting to "''" is added to a copy of it.  Every
    name in `flags` is set to 'yes' as an attribute of the document root.

    Returns an OutputFile built from the serialized HTML, or None when the
    transformation result has no content (see html_has_content()).

    Raises ValueError for an unknown stylesheet name and ParseError when
    lxml reports an XML syntax or XSLT application error.
    """
    try:
        style_filename = get_stylesheet(stylesheet)
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        document.swap_endlines()

        for flag in flags or ():
            document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')

        # Work on a copy so the caller's dict is not modified by setdefault().
        options = dict(options) if options else {}
        options.setdefault('gallery', "''")
        result = document.transform(style, **options)
        del document  # no longer needed large object :)

        if html_has_content(result):
            add_anchors(result.getroot())
            add_table_of_themes(result.getroot())
            add_table_of_contents(result.getroot())

            return OutputFile.from_string(etree.tostring(
                result, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8'))
        return None
    except KeyError:
        # NOTE(review): any KeyError raised inside the try block is reported
        # as an invalid stylesheet, not only the STYLESHEETS lookup.
        raise ValueError("'%s' is not a valid stylesheet." % (stylesheet,))
    except (XMLSyntaxError, XSLTApplyError) as e:
        # NOTE(review): the handler body is not visible in SOURCE; wrapping
        # the error in the imported ParseError is assumed -- confirm.
        raise ParseError(e)
class Fragment(object):
    """Ordered list of (event, element) pairs collected for one fragment.

    Events are 'start' and 'end' (paired with an element-like object that
    has `tag`, `attrib`, `text` and `tail`) or any other event name, whose
    payload is treated as a plain text chunk.
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one (event, element) pair at the end of the fragment."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events followed by 'end' events for still-open tags.

        Every 'start' pushes a matching ('end', element) pair and every 'end'
        pops one; what is left over is appended innermost-first.  An 'end'
        with nothing open is reported on stdout and otherwise ignored.
        """
        stack = []
        for event, element in self.events:
            if event == 'start':
                stack.append(('end', element))
            elif event == 'end':
                if stack:
                    stack.pop()
                else:
                    # Single-argument form prints the same text under the
                    # Python 2 print statement and the Python 3 function.
                    print('CLOSED NON-OPEN TAG: %s' % (element,))

        stack.reverse()
        return self.events + stack

    def to_string(self):
        """Serialize the closed event list to a markup string.

        NOTE(review): attribute values and text are written without any
        escaping, exactly as stored on the elements.
        """
        result = []
        for event, element in self.closed_events():
            if event == 'start':
                result.append(u'<%s %s>' % (
                    element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
                if element.text:
                    result.append(element.text)
            elif event == 'end':
                result.append(u'</%s>' % element.tag)
                if element.tail:
                    result.append(element.tail)
            elif element is not None:
                # Text chunk; a None payload would make join() raise.
                result.append(element)

        return ''.join(result)

    def __unicode__(self):
        return self.to_string()
def extract_fragments(input_filename):
    """Extract theme fragments from input_filename.

    The file is parsed as HTML and the first child of the root's first child
    is re-serialized so that it can be walked with iterparse.  Elements whose
    class is 'theme-begin' open a Fragment keyed by their 'fid' attribute;
    'theme-end' elements close it.  While a fragment is open, every other
    start/end event (and the relevant tail text) is appended to it.

    Returns a (closed_fragments, open_fragments) pair of dicts keyed by
    fragment id; the second holds fragments that were never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document
    parser = etree.HTMLParser(encoding='utf-8')
    serialized = etree.tostring(
        etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8')
    # Built directly from the serialized bytes, so no rewind is needed.
    buf = cStringIO.StringIO(serialized)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        element_class = element.get('class', '')

        # Process begin and end elements
        if element_class in ('theme-begin', 'theme-end'):
            if event != 'end':
                continue  # Process elements only once, on end event

            if element_class == 'theme-begin':
                # Open new fragment
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Collect copies of the ancestors below the 'book-text' element.
                parents = []
                parent = element.getparent()
                while parent is not None and parent.get('id', None) != 'book-text':
                    cparent = copy.deepcopy(parent)
                    # NOTE(review): a short statement is missing from SOURCE
                    # here; clearing the copied text (so text preceding the
                    # marker is not replayed) is assumed -- confirm.
                    cparent.text = None
                    parents.append(cparent)
                    parent = parent.getparent()

                # NOTE(review): ordering step reconstructed -- ancestors are
                # opened outermost first; confirm against the full file.
                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment
            else:
                # Close existing fragment
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (input_filename, element.get('fid')))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append element tail to lost_text (we don't want to lose any text)
            # NOTE(review): the guard line is not visible in SOURCE; it mirrors
            # the tail check used for annotation tags below.
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags
            if (len(element.get('name', '')) or
                    element_class in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append('text', element.tail)
            else:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append(event, copy.copy(element))

    return closed_fragments, open_fragments
def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
    """Insert anchor <a> elements into the parent of `element`, at its index.

    With `with_link`, an <a class="anchor" href="#prefix"> is inserted whose
    text is `link_text` (or `prefix` when `link_text` is None).  With
    `with_target`, an <a class="target" name="prefix"> containing a single
    space is inserted at the same index, so it ends up before the link.
    """
    container = element.getparent()
    position = container.index(element)

    if with_link:
        label = prefix if link_text is None else link_text
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = unicode(label)
        container.insert(position, link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = u' '
        container.insert(position, target)
def any_ancestor(element, test):
    """Return True if `test(ancestor)` is truthy for any ancestor of `element`.

    Ancestors are taken from `element.iterancestors()`; the search stops at
    the first match.  Returns False when there is no matching ancestor.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
def add_anchors(root):
    """Add numbered anchors to verses and paragraphs below `root`.

    Elements with an ancestor of class note/motto/motto_podpis/dedication/
    frame, with id 'nota_red', or inside a <blockquote> are skipped.  A
    running counter numbers the remaining <p> elements whose class contains
    'verse' and any element whose class contains 'paragraph'.  Verses get an
    anchor only for counter 1 and every multiple of 5; paragraphs always do.
    """
    skipped_classes = ('note', 'motto', 'motto_podpis', 'dedication', 'frame')

    # Defined once instead of on every loop iteration.
    def in_skipped_container(e):
        return e.get('class') in skipped_classes or \
            e.get('id') == 'nota_red' or e.tag == 'blockquote'

    counter = 1
    for element in root.iterdescendants():
        if any_ancestor(element, in_skipped_container):
            continue

        # `or ''` keeps the `in` tests safe when get() yields None.
        css_class = element.get('class', '') or ''
        if element.tag == 'p' and 'verse' in css_class:
            if counter == 1 or counter % 5 == 0:
                add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
        elif 'paragraph' in css_class:
            add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
def raw_printable_text(element):
    """Return the stripped text content of a deep copy of `element`.

    Direct <a> children of class 'annotation' or 'theme-begin' are
    neutralized on the copy first; the original element is not modified.
    """
    hidden_classes = ('annotation', 'theme-begin')
    working = copy.deepcopy(element)
    for link in working.findall('a'):
        if link.get('class') in hidden_classes:
            # NOTE(review): the body of this `if` is missing from SOURCE;
            # blanking the link's own text is assumed -- confirm.
            link.text = ''
    text = etree.tostring(working, method='text', encoding=unicode)
    return text.strip()
def add_table_of_contents(root):
    """Build a table of contents from <h2>/<h3> headers and prepend it to `root`.

    Headers with an ancestor of id 'footnotes' or 'nota_red', or of class
    'person-list', are ignored.  Each remaining header gets a target anchor
    named "s<counter>"; an <h3> that follows an <h2> section is listed as
    that section's subsection.
    """
    def is_excluded(e):
        return e.get('id') in ('footnotes', 'nota_red') or e.get('class') in ('person-list',)

    # Entries are (number, tag, text, subsections) tuples.
    sections = []
    counter = 1
    for header in root.iterdescendants():
        if header.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(header, is_excluded):
            continue

        entry = (counter, header.tag, raw_printable_text(header), [])
        if header.tag == 'h3' and sections and sections[-1][1] == 'h2':
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(header, "s%d" % counter, with_link=False)
        counter += 1

    toc = etree.Element('div')
    # NOTE(review): a short statement is missing from SOURCE here; setting
    # the container id to 'toc' is assumed -- confirm.
    toc.set('id', 'toc')
    toc_header = etree.SubElement(toc, 'h2')
    toc_header.text = u'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    # NOTE(review): add_anchor() inserts into the parent of the element it is
    # given, so the links become siblings of the <li> items -- confirm intent.
    for number, _tag, title, subsections in sections:
        section_item = etree.SubElement(toc_list, 'li')
        add_anchor(section_item, "s%d" % number, with_target=False, link_text=title)

        if subsections:
            subsection_list = etree.SubElement(section_item, 'ol')
            for sub_number, _sub_tag, sub_title, _ in subsections:
                subsection_item = etree.SubElement(subsection_list, 'li')
                add_anchor(subsection_item, "s%d" % sub_number, with_target=False, link_text=sub_title)

    root.insert(0, toc)
def add_table_of_themes(root):
    """Build an index of themes and prepend it to `root` as <div id="themes">.

    Every <a class="theme-begin"> with text contributes its 'name' attribute
    to each comma-separated theme listed in that text.  The index holds one
    <li> per theme, sorted by sortify(theme name), with numbered links to
    the fragments.
    """
    # NOTE(review): only the import line is visible in SOURCE; the identity
    # fallback for a missing `sortify` module is assumed -- confirm.
    try:
        from sortify import sortify
    except ImportError:
        def sortify(x):
            return x

    book_themes = {}
    for fragment in root.findall('.//a[@class="theme-begin"]'):
        if not fragment.text:
            continue
        theme_names = [s.strip() for s in fragment.text.split(',')]
        for theme_name in theme_names:
            book_themes.setdefault(theme_name, []).append(fragment.get('name'))

    # sorted() works on dict views as well as lists.
    sorted_themes = sorted(book_themes.items(), key=lambda s: sortify(s[0]))

    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, fragments in sorted_themes:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % theme_name
        for i, fragment in enumerate(fragments):
            item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
            item.text = str(i + 1)
            # NOTE(review): a short statement is missing from SOURCE here; a
            # separating space after each link is assumed -- confirm.
            item.tail = ' '

    root.insert(0, themes_div)
def extract_annotations(html_path):
    """Extract annotations from HTML for annotations dictionary.

    For each annotation, yields a tuple of:
    anchor, footnote type, valid qualifiers, text, html.

    Annotations are the <div> children of the element with id "footnotes";
    nothing is yielded when that element is absent.  Qualifiers are read
    from a leading "term (q1, q2) --" pattern (with an em dash) in the
    footnote text and kept only when listed in FN_QUALIFIERS; a candidate
    of the form "z <name>" is checked by its second word.
    """
    from .fn_qualifiers import FN_QUALIFIERS

    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)
    footnotes = tree.find('//*[@id="footnotes"]')
    if footnotes is None:
        return

    # Same pattern as the former ur'' literal, spelled so that it is valid
    # on both Python 2 and Python 3 (\u2014 is the em dash).
    re_qualifier = re.compile(u'[^\u2014]+\\s+\\(([^\\)]+)\\)\\s+\u2014')

    for footnote in footnotes.findall('div'):
        fn_type = footnote.get('class').split('-')[1]
        anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
        # NOTE(review): two short statements are missing from SOURCE here;
        # dropping the two leading child elements and the leading text, so
        # that only the note body is serialized, is assumed -- confirm.
        del footnote[:2]
        footnote.text = None
        if len(footnote) and footnote[-1].tail == '\n':
            footnote[-1].tail = None
        text_str = etree.tostring(footnote, method='text', encoding=unicode).strip()
        html_str = etree.tostring(footnote, method='html', encoding=unicode).strip()

        qualifiers = []
        match = re_qualifier.match(text_str)
        if match:
            qualifier_str = match.group(1)
            for candidate in re.split('[;,]', qualifier_str):
                candidate = candidate.strip()
                if candidate in FN_QUALIFIERS:
                    qualifiers.append(candidate)
                elif candidate.startswith('z '):
                    subcandidate = candidate.split()[1]
                    if subcandidate in FN_QUALIFIERS:
                        qualifiers.append(subcandidate)

        yield anchor, fn_type, qualifiers, text_str, html_str