1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
import copy
import cStringIO
import os
import re

from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError

from librarian import XHTMLNS, ParseError, OutputFile
from librarian import functions
# Run the registration hooks from librarian.functions once, at import time.
# NOTE(review): presumably these make helper functions available to the XSLT
# stylesheets applied by this module -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()
# Maps a stylesheet name to the path of its XSLT file, relative to the
# directory containing this module.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt'
}
def get_stylesheet(name):
    """Return the path of the XSLT file registered under `name`.

    The path is built from the directory of this module and the relative
    path stored in STYLESHEETS.  An unknown `name` raises KeyError.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
def html_has_content(text):
    """Return the <p> and <h1> elements found anywhere in `text`.

    Both un-namespaced and XHTML-namespaced elements are matched.  The
    XPath result is returned as-is, so it is falsy when nothing matches.
    """
    query = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
    find_content = etree.ETXPath(query)
    return find_content(text)
def transform(wldoc, stylesheet='legacy', options=None, flags=None):
    """Transform a WL document to XHTML.

    Args:
        wldoc: the source document.  It is deep-copied first, and all
            modifications are made on the copy.
        stylesheet: key of STYLESHEETS selecting the XSLT file to apply.
        options: optional dict passed as keyword arguments to the
            document's transform() call.
        flags: optional iterable of attribute names; each one is set to
            'yes' on the root element of the copy before transforming.

    Returns:
        An OutputFile built from the UTF-8 encoded HTML serialization of
        the result, or None if the result has no content (see
        html_has_content).

    Raises:
        ValueError: if `stylesheet` is not a key of STYLESHEETS.
        ParseError: if lxml reports an XML syntax or XSLT application error.
    """
    # Resolve the stylesheet on its own, so that only an unknown name -- and
    # not a KeyError raised deep inside the transformation -- is reported as
    # an invalid stylesheet.
    try:
        style_filename = get_stylesheet(stylesheet)
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)

    try:
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        del wldoc  # from here on only the copy is used
        document.swap_endlines()

        for flag in flags or ():
            document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')

        result = document.transform(style, **(options or {}))
        del document  # no longer needed large object :)

        if not html_has_content(result):
            return None

        add_anchors(result.getroot())
        add_table_of_themes(result.getroot())
        add_table_of_contents(result.getroot())

        return OutputFile.from_string(etree.tostring(
            result, method='html', xml_declaration=False,
            pretty_print=True, encoding='utf-8'))
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
class Fragment(object):
    """A themed fragment of a document, stored as a list of parse events.

    Each event is an ``(event, payload)`` tuple.  For 'start' and 'end'
    events the payload is an element (an object with ``tag``, ``attrib``,
    ``text`` and ``tail``); for any other event name the payload is a
    string that is emitted verbatim by to_string().
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one more ``(event, element)`` pair at the end."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events plus 'end' events for tags left open.

        The synthetic 'end' events are appended innermost-first, so the
        resulting stream is properly nested.  An 'end' event arriving
        while no tag is open is reported on stdout and otherwise kept.
        """
        pending = []
        for event, element in self.events:
            if event == 'start':
                pending.append(('end', element))
            elif event == 'end':
                try:
                    pending.pop()
                except IndexError:
                    # A single pre-formatted argument prints the same text
                    # whether `print` is a statement or a function.
                    print('CLOSED NON-OPEN TAG: %s' % (element,))

        pending.reverse()
        return self.events + pending

    def to_string(self):
        """Serialize the closed event stream to markup.

        Attribute values are written as-is (no escaping), and a tag
        without attributes is rendered with a trailing space (``<p >``).
        """
        parts = []
        for event, element in self.closed_events():
            if event == 'start':
                attributes = ' '.join(
                    '%s="%s"' % (k, v) for k, v in element.attrib.items())
                parts.append(u'<%s %s>' % (element.tag, attributes))
                if element.text:
                    parts.append(element.text)
            elif event == 'end':
                parts.append(u'</%s>' % element.tag)
                if element.tail:
                    parts.append(element.tail)
            else:
                # Any other event carries plain text as its payload.
                parts.append(element)

        return ''.join(parts)

    def __unicode__(self):
        return self.to_string()
def extract_fragments(input_filename):
    """Extract theme fragments from the HTML file `input_filename`.

    Elements with class 'theme-begin' open a fragment and elements with
    class 'theme-end' close it; the two are matched by their 'fid'
    attribute.  While a fragment is open, every parse event is recorded
    in it, except for elements that have a 'name' attribute or the class
    'annotation' or 'anchor' -- for those only the tail text is kept.

    Returns:
        A ``(closed_fragments, open_fragments)`` pair of dicts mapping the
        fragment id to its Fragment.  `open_fragments` holds fragments
        that were never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document
    parser = etree.HTMLParser(encoding='utf-8')
    buf = cStringIO.StringIO()
    buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        # Process begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if not event == 'end':
                continue  # Process elements only once, on end event

            # Open new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Append parents: text-less copies of the enclosing
                # elements, up to (not including) the one with id
                # 'book-text'.  Stop at the root too, so a marker outside
                # 'book-text' does not crash on a None parent.
                parent = element.getparent()
                parents = []
                while parent is not None and parent.get('id', None) != 'book-text':
                    cparent = copy.deepcopy(parent)
                    cparent.text = None
                    parents.append(cparent)
                    parent = parent.getparent()

                # Collected innermost-first; open them outermost-first.
                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    # A single pre-formatted argument prints the same text
                    # whether `print` is a statement or a function.
                    print('%s:closed not open fragment #%s' % (input_filename, element.get('fid')))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append element tail to lost_text (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags
            if (len(element.get('name', '')) or
                    element.get('class', '') in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append('text', element.tail)
            else:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append(event, copy.copy(element))

    return closed_fragments, open_fragments
def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
    """Insert anchor <a> elements into `element`'s parent, before `element`.

    With `with_link`, a link ``<a class="anchor" href="#prefix">`` is
    inserted, labelled with `link_text` (or `prefix` when it is None).
    With `with_target`, a ``<a class="target" name="prefix">`` holding a
    single space is inserted.  Both go in at the index `element` had on
    entry, so when both are requested the target precedes the link.
    """
    container = element.getparent()
    position = container.index(element)

    if with_link:
        label = prefix if link_text is None else link_text
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = unicode(label)
        container.insert(position, link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = u' '
        container.insert(position, target)
def any_ancestor(element, test):
    """Return True if `test` holds for at least one ancestor of `element`.

    Ancestors come from ``element.iterancestors()``; the search stops at
    the first one for which `test(ancestor)` is truthy.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
def add_anchors(root):
    """Number the verses and paragraphs below `root` and anchor them.

    An element counts when it is a <p> whose class contains 'verse', or
    any element whose class contains 'paragraph'.  Elements that have an
    ancestor with class note/motto/motto_podpis/dedication, with id
    'nota_red', or with tag blockquote are skipped and do not advance
    the counter.  Paragraphs always get an anchor ``f<N>``; verses get
    one only for N == 1 and for every fifth N.
    """
    skipped_classes = ('note', 'motto', 'motto_podpis', 'dedication')

    def _is_excluded(e):
        return (e.get('class') in skipped_classes
                or e.get('id') == 'nota_red'
                or e.tag == 'blockquote')

    counter = 1
    for element in root.iterdescendants():
        if any_ancestor(element, _is_excluded):
            continue

        if element.tag == 'p' and 'verse' in element.get('class', ''):
            wants_anchor = counter == 1 or counter % 5 == 0
        elif 'paragraph' in element.get('class', ''):
            wants_anchor = True
        else:
            continue

        if wants_anchor:
            add_anchor(element, "f%d" % counter, link_text=counter)
        counter += 1
def raw_printable_text(element):
    """Return the stripped text content of `element` as unicode.

    Works on a deep copy.  The own text of direct <a> children with class
    'annotation' or 'theme-begin' is blanked before serializing; their
    tail text is kept.
    """
    clone = copy.deepcopy(element)
    hidden_classes = ('annotation', 'theme-begin')
    for link in clone.findall('a'):
        if link.get('class') in hidden_classes:
            link.text = ''
    text = etree.tostring(clone, method='text', encoding=unicode)
    return text.strip()
def add_table_of_contents(root):
    """Anchor the h2/h3 headings below `root` and prepend a contents list.

    Headings that have an ancestor with id 'footnotes' or 'nota_red', or
    with class 'person-list', are ignored.  Each remaining heading gets a
    target anchor ``s<N>``.  An h3 directly following an h2 section is
    listed as its subsection; any other heading becomes a top-level
    entry.  The resulting ``<div id="toc">`` is inserted as the first
    child of `root`.
    """
    def _outside_main_text(e):
        return e.get('id') in ('footnotes', 'nota_red') or e.get('class') in ('person-list',)

    # Entries are (number, tag, title, subsections) tuples.
    sections = []
    counter = 1
    for element in root.iterdescendants():
        if element.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(element, _outside_main_text):
            continue

        entry = (counter, element.tag, raw_printable_text(element), [])
        if element.tag == 'h3' and sections and sections[-1][1] == 'h2':
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(element, "s%d" % counter, with_link=False)
        counter += 1

    toc = etree.Element('div')
    toc.set('id', 'toc')
    toc_header = etree.SubElement(toc, 'h2')
    toc_header.text = u'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    for number, _tag, title, subsections in sections:
        section_element = etree.SubElement(toc_list, 'li')
        add_anchor(section_element, "s%d" % number, with_target=False, link_text=title)

        if not subsections:
            continue
        subsection_list = etree.SubElement(section_element, 'ol')
        for sub_number, _sub_tag, sub_title, _ in subsections:
            subsection_element = etree.SubElement(subsection_list, 'li')
            add_anchor(subsection_element, "s%d" % sub_number, with_target=False, link_text=sub_title)

    root.insert(0, toc)
def add_table_of_themes(root):
    """Prepend a ``<div id="themes">`` index of themes to `root`.

    Every ``<a class="theme-begin">`` below `root` with non-empty text
    contributes its 'name' attribute to each comma-separated theme in
    that text.  Themes are listed in the order given by `sortify` when
    that module can be imported, and by plain comparison otherwise; each
    theme links to its fragments with the labels 1, 2, 3, ...
    """
    try:
        from sortify import sortify
    except ImportError:
        def sortify(x):
            return x

    fragments_by_theme = {}
    for marker in root.findall('.//a[@class="theme-begin"]'):
        if not marker.text:
            continue
        for raw_name in marker.text.split(','):
            theme_name = raw_name.strip()
            fragments_by_theme.setdefault(theme_name, []).append(marker.get('name'))

    ordered_themes = sorted(fragments_by_theme.items(), key=lambda s: sortify(s[0]))

    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, fragment_names in ordered_themes:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % theme_name
        for i, fragment_name in enumerate(fragment_names):
            item = etree.SubElement(themes_li, 'a', href="#%s" % fragment_name)
            item.text = str(i + 1)
            item.tail = ' '

    root.insert(0, themes_div)
def extract_annotations(html_path):
    """Extract annotations from HTML for the annotations dictionary.

    The file at `html_path` is parsed as UTF-8 HTML and the <div>
    children of the element with id 'footnotes' are read.  Nothing is
    yielded when there is no such element.

    For each annotation, yields a tuple of:
    anchor, footnote type, valid qualifiers, text, html.

    The footnote type is the second dash-separated part of the div's
    class, and the anchor is the annotation link's href without its
    leading character.  Qualifiers are taken from a parenthesized list
    that precedes the first em dash of the text; only those present in
    FN_QUALIFIERS are kept, and a candidate of the form 'z X' is
    reduced to 'X'.
    """
    from .fn_qualifiers import FN_QUALIFIERS

    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)
    footnotes = tree.find('//*[@id="footnotes"]')
    # Written as a plain unicode literal with doubled backslashes: the
    # pattern is the same as the raw ur'...' spelling, which Python 3
    # rejects as a syntax error.
    re_qualifier = re.compile(u'[^\u2014]+\\s+\\(([^\\)]+)\\)\\s+\u2014')
    if footnotes is None:
        return

    for footnote in footnotes.findall('div'):
        fn_type = footnote.get('class').split('-')[1]
        anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
        # Drop the first two children and the leading text so that only
        # the body of the footnote is serialized.
        del footnote[:2]
        footnote.text = None
        if len(footnote) and footnote[-1].tail == '\n':
            footnote[-1].tail = None
        text_str = etree.tostring(footnote, method='text', encoding=unicode).strip()
        html_str = etree.tostring(footnote, method='html', encoding=unicode).strip()

        qualifiers = []
        match = re_qualifier.match(text_str)
        if match:
            qualifier_str = match.group(1)
            for candidate in re.split('[;,]', qualifier_str):
                candidate = candidate.strip()
                if candidate in FN_QUALIFIERS:
                    qualifiers.append(candidate)
                elif candidate.startswith('z '):
                    subcandidate = candidate.split()[1]
                    if subcandidate in FN_QUALIFIERS:
                        qualifiers.append(subcandidate)

        yield anchor, fn_type, qualifiers, text_str, html_str