1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
11 from lxml import etree
12 from librarian import XHTMLNS, ParseError, OutputFile
13 from librarian import functions
15 from lxml.etree import XMLSyntaxError, XSLTApplyError
# Module-level side effects executed at import time.
# NOTE(review): presumably these register the custom extension functions that
# the XSLT stylesheets rely on -- confirm in librarian.functions.
17 functions.reg_substitute_entities()
18 functions.reg_person_name()
# Entries mapping a stylesheet name to an XSLT path; get_stylesheet() resolves
# these paths against the directory of this module.
# NOTE(review): the enclosing assignment (presumably `STYLESHEETS = {` ... `}`)
# is not visible in this excerpt.
21 'legacy': 'xslt/book2html.xslt',
22 'full': 'xslt/wl2html_full.xslt',
23 'partial': 'xslt/wl2html_partial.xslt'
def get_stylesheet(name):
    """Return the path of the XSLT stylesheet registered under ``name``.

    The relative path is looked up in STYLESHEETS and joined onto the
    directory that contains this module.  An unknown ``name`` propagates
    the KeyError raised by the lookup.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
def html_has_content(text):
    """Evaluate an XPath looking for <p> or <h1> nodes in ``text``.

    Both the un-namespaced and the XHTML-namespaced variants of the two
    tags are searched.  The raw XPath result is returned; transform() uses
    it as a truth value and passes the XSLT result tree as ``text``.
    """
    xhtml_ns = str(XHTMLNS)
    expression = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': xhtml_ns}
    find_content = etree.ETXPath(expression)
    return find_content(text)
# Transform a WL document to HTML with one of the bundled XSLT stylesheets.
#
# Visible steps: load the stylesheet selected by `stylesheet`, deep-copy
# `wldoc`, run the XSLT with `options` as keyword parameters and, when the
# result has content (html_has_content), add anchors, the table of themes and
# the table of contents before serializing with method='html' into an
# OutputFile.
#
# NOTE(review): the docstring below mentions `output_filename`, which is not a
# parameter of this function; the visible code returns an OutputFile.  The
# docstring looks stale -- confirm and update.
# NOTE(review): several lines of this function are absent from this excerpt
# (gaps in the embedded numbering), e.g. the `try:` that the `except` clause
# at the end belongs to; the comments describe only what is visible.
35 def transform(wldoc, stylesheet='legacy', options=None, flags=None):
36 """Transforms the WL document to XHTML.
38 If output_filename is None, returns an XML,
39 otherwise returns True if file has been written,False if it hasn't.
40 File won't be written if it has no content.
44 style_filename = get_stylesheet(stylesheet)
45 style = etree.parse(style_filename)
# All following in-place modifications are applied to a deep copy.
47 document = copy.deepcopy(wldoc)
49 document.swap_endlines()
# Sets attribute `flag` to 'yes' on the document root; presumably run once per
# entry of `flags` (the loop header is not visible here).
53 document.edoc.getroot().set(flag, 'yes')
55 document.clean_ed_note()
56 document.clean_ed_note('abstrakt')
# `options` is unpacked as keyword arguments.  NOTE(review): with the default
# options=None this would fail unless a line missing from this excerpt
# substitutes an empty dict -- confirm.
60 result = document.transform(style, **options)
61 del document # no longer needed large object :)
# Post-process and serialize only when the result contains <p>/<h1> nodes.
63 if html_has_content(result):
64 add_anchors(result.getroot())
65 add_table_of_themes(result.getroot())
66 add_table_of_contents(result.getroot())
68 return OutputFile.from_string(etree.tostring(
69 result, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8'))
# NOTE(review): the '%s' placeholder is never interpolated, so the message
# contains a literal '%s' instead of the stylesheet name.
73 raise ValueError("'%s' is not a valid stylesheet.")
# NOTE(review): `except X, e` is Python 2-only syntax; the handler body is not
# visible in this excerpt.
74 except (XMLSyntaxError, XSLTApplyError), e:
# A recorded sequence of parse events (event name, element-or-text) that can
# be rendered back to markup.  extract_fragments() creates one per theme
# marker and feeds it via append().
# NOTE(review): parts of this class are absent from this excerpt (the body of
# __init__ after the super() call, the initialisation of `stack`, and the
# header of the method that builds `result`, which __unicode__ calls as
# to_string()); comments describe only the visible lines.
78 class Fragment(object):
79 def __init__(self, id, themes):
80 super(Fragment, self).__init__()
# Record one (event, element) pair at the end of the event list.
85 def append(self, event, element):
86 self.events.append((event, element))
# Return the recorded events followed by the contents of `stack`, onto which
# ('end', element) entries are pushed while walking the events.  Presumably
# this closes elements that were opened but never ended -- the branching
# logic is not visible here.
88 def closed_events(self):
90 for event, element in self.events:
92 stack.append(('end', element))
# Python 2 print statement used as a diagnostic.
97 print 'CLOSED NON-OPEN TAG:', element
100 return self.events + stack
# Render the balanced event list to a string: opening tag with attributes,
# element text, closing tag, tail text, or (last append below) the event
# payload itself.
# NOTE(review): attribute values are interpolated without escaping, so a value
# containing '"' would produce malformed markup -- confirm whether that can
# occur.
104 for event, element in self.closed_events():
106 result.append(u'<%s %s>' % (
107 element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
109 result.append(element.text)
111 result.append(u'</%s>' % element.tag)
113 result.append(element.tail)
115 result.append(element)
117 return ''.join(result)
# Python 2 unicode() protocol: same output as to_string().
119 def __unicode__(self):
120 return self.to_string()
# Build Fragment objects from an HTML file by replaying parse events between
# 'theme-begin' and 'theme-end' marker elements.  Returns the pair
# (closed_fragments, open_fragments); fragments are keyed by their id.
# NOTE(review): many lines of this function are absent from this excerpt
# (e.g. the initialisation of open_fragments and the try/except/else
# scaffolding); comments describe only the visible lines.
123 def extract_fragments(input_filename):
124 """Extracts theme fragments from input_filename."""
126 closed_fragments = {}
# The file is parsed with the HTML parser, the subtree at getroot()[0][0] is
# re-serialized into an in-memory buffer, and that buffer is what iterparse
# walks below.
128 # iterparse would die on a HTML document
129 parser = etree.HTMLParser(encoding='utf-8')
130 buf = cStringIO.StringIO()
131 buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
134 for event, element in etree.iterparse(buf, events=('start', 'end')):
135 # Process begin and end elements
136 if element.get('class', '') in ('theme-begin', 'theme-end'):
137 if not event == 'end':
138 continue # Process elements only once, on end event
# A theme-begin marker opens a new fragment identified by its 'fid' attribute;
# the marker's text is stored as the fragment's themes.
141 if element.get('class', '') == 'theme-begin':
142 fragment = Fragment(id=element.get('fid'), themes=element.text)
# Walk up from the marker to the element with id 'book-text', deep-copying
# each ancestor; the copies are then replayed as 'start' events so the
# fragment begins inside its enclosing context.
# NOTE(review): assumes every marker has an ancestor with id 'book-text';
# otherwise getparent() would eventually return None -- confirm.
145 parent = element.getparent()
147 while parent.get('id', None) != 'book-text':
148 cparent = copy.deepcopy(parent)
150 parents.append(cparent)
151 parent = parent.getparent()
154 for parent in parents:
155 fragment.append('start', parent)
157 open_fragments[fragment.id] = fragment
159 # Close existing fragment
# Look up the open fragment by the marker's 'fid'; the print below reports an
# end marker without a matching open fragment (the surrounding error handling
# is not visible here).
162 fragment = open_fragments[element.get('fid')]
164 print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
# Move the fragment from the open to the closed mapping.
166 closed_fragments[fragment.id] = fragment
167 del open_fragments[fragment.id]
# The marker's tail text is appended as a 'text' event to every fragment that
# is still open.
169 # Append element tail to lost_text (we don't want to lose any text)
171 for fragment_id in open_fragments:
172 open_fragments[fragment_id].append('text', element.tail)
174 # Process all elements except begin and end
# Elements with a non-empty 'name' attribute or class 'annotation'/'anchor'
# are left out; only their tail text is kept, on the 'end' event.
176 # Omit annotation tags
177 if (len(element.get('name', '')) or
178 element.get('class', '') in ('annotation', 'anchor')):
179 if event == 'end' and element.tail:
180 for fragment_id in open_fragments:
181 open_fragments[fragment_id].append('text', element.tail)
# Any other element is recorded (as a shallow copy) in every open fragment.
183 for fragment_id in open_fragments:
184 open_fragments[fragment_id].append(event, copy.copy(element))
186 return closed_fragments, open_fragments
# Insert helper <a> elements into the parent of `element`, at the position
# `element` currently occupies (i.e. immediately before it):
#   - a link   <a href="#<prefix>" class="anchor"> with text `link_text`
#   - a target <a name="<prefix>"  class="target"> with text u' '
# NOTE(review): the conditions guarding the two insertions (presumably the
# with_link / with_target flags) and the fallback assigned when link_text is
# None are on lines absent from this excerpt -- confirm.
189 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
190 parent = element.getparent()
191 index = parent.index(element)
194 if link_text is None:
196 anchor = etree.Element('a', href='#%s' % prefix)
197 anchor.set('class', 'anchor')
# unicode() is the Python 2 builtin; link_text may be a number (see add_anchors).
198 anchor.text = unicode(link_text)
199 parent.insert(index, anchor)
# The target is inserted at the same index, so it ends up before a link
# inserted above.
202 anchor_target = etree.Element('a', name='%s' % prefix)
203 anchor_target.set('class', 'target')
204 anchor_target.text = u' '
205 parent.insert(index, anchor_target)
# Iterate over the ancestors of `element`; `test` is a callable supplied by
# the callers (see add_anchors / add_table_of_contents).
# NOTE(review): the loop body and the fallback return are absent from this
# excerpt; presumably returns True when test(ancestor) holds for some ancestor
# and False otherwise -- confirm.
208 def any_ancestor(element, test):
209 for ancestor in element.iterancestors():
# Walk all descendants of `root` and add numbered "f<counter>" anchors via
# add_anchor(): verse paragraphs get one when counter is 1 or a multiple of 5,
# elements whose class contains 'paragraph' get one every time.
# NOTE(review): the initialisation and increments of `counter`, the header of
# the nested predicate `f` and the action taken when any_ancestor() matches
# (presumably skipping the element) are absent from this excerpt -- confirm.
215 def add_anchors(root):
217 for element in root.iterdescendants():
# Predicate: element is a note, motto, motto signature or dedication (by
# class), has id 'nota_red', or is a <blockquote>.
219 return e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication') or \
220 e.get('id') == 'nota_red' or e.tag == 'blockquote'
221 if any_ancestor(element, f):
224 if element.tag == 'p' and 'verse' in element.get('class', ''):
225 if counter == 1 or counter % 5 == 0:
226 add_anchor(element, "f%d" % counter, link_text=counter)
228 elif 'paragraph' in element.get('class', ''):
229 add_anchor(element, "f%d" % counter, link_text=counter)
# Return the text content of `element` as a stripped unicode string, computed
# on a deep copy so the original element is left untouched.
# NOTE(review): the statement applied to direct-child <a> elements with class
# 'annotation' or 'theme-begin' is absent from this excerpt; presumably it
# removes their text from the output -- confirm.
233 def raw_printable_text(element):
234 working = copy.deepcopy(element)
# findall('a') matches direct children only.
235 for e in working.findall('a'):
236 if e.get('class') in ('annotation', 'theme-begin'):
# encoding=unicode (Python 2 builtin) makes tostring return a unicode string.
238 return etree.tostring(working, method='text', encoding=unicode).strip()
# Collect <h2>/<h3> headings below `root`, give each a target anchor
# "s<counter>", and build a <div> holding an <h2> title and a nested <ol> of
# links to those anchors.
# NOTE(review): the initialisation of `sections`/`counter`, the action taken
# for headings inside footnotes / nota_red / person-list (presumably skipped),
# the `else:` branches and the final insertion of `toc` are absent from this
# excerpt -- confirm.
241 def add_table_of_contents(root):
244 for element in root.iterdescendants():
245 if element.tag in ('h2', 'h3'):
246 if any_ancestor(element,
247 lambda e: e.get('id') in ('footnotes', 'nota_red') or e.get('class') in ('person-list',)):
250 element_text = raw_printable_text(element)
# Section tuples are (counter, tag, text, subsections); an <h3> that follows
# an <h2> section is nested in that section's subsection list.
251 if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
252 sections[-1][3].append((counter, element.tag, element_text, []))
254 sections.append((counter, element.tag, element_text, []))
# Target only; the matching link is created in the list built below.
255 add_anchor(element, "s%d" % counter, with_link=False)
258 toc = etree.Element('div')
260 toc_header = etree.SubElement(toc, 'h2')
261 toc_header.text = u'Spis treści'
262 toc_list = etree.SubElement(toc, 'ol')
# NOTE(review): per the visible lines of add_anchor(), the link is inserted
# into the parent of the <li> (before it), not inside the <li> -- confirm that
# this is the intended markup.
264 for n, section, text, subsections in sections:
265 section_element = etree.SubElement(toc_list, 'li')
266 add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
269 subsection_list = etree.SubElement(section_element, 'ol')
270 for n1, subsection, subtext, _ in subsections:
271 subsection_element = etree.SubElement(subsection_list, 'li')
272 add_anchor(subsection_element, "s%d" % n1, with_target=False, link_text=subtext)
# Build a <div id="themes"> listing every theme found in the document with
# numbered links to the fragments carrying it, and insert it as the first
# child of `root`.
# NOTE(review): the handling around the local `sortify` import, the
# initialisation of `book_themes` and the action for markers without text
# (presumably skipped) are absent from this excerpt -- confirm.
277 def add_table_of_themes(root):
279 from sortify import sortify
# Each theme-begin marker lists comma-separated theme names in its text; map
# every name to the list of marker 'name' attributes.
285 for fragment in root.findall('.//a[@class="theme-begin"]'):
286 if not fragment.text:
288 theme_names = [s.strip() for s in fragment.text.split(',')]
289 for theme_name in theme_names:
290 book_themes.setdefault(theme_name, []).append(fragment.get('name'))
# NOTE(review): Python 2 only -- items() returns a list here, which is then
# sorted in place by the sortify() key of the theme name.
291 book_themes = book_themes.items()
292 book_themes.sort(key=lambda s: sortify(s[0]))
293 themes_div = etree.Element('div', id="themes")
294 themes_ol = etree.SubElement(themes_div, 'ol')
295 for theme_name, fragments in book_themes:
296 themes_li = etree.SubElement(themes_ol, 'li')
297 themes_li.text = "%s: " % theme_name
# One link per fragment, labelled 1, 2, 3, ...
298 for i, fragment in enumerate(fragments):
299 item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
300 item.text = str(i + 1)
302 root.insert(0, themes_div)
# Generator over the footnotes of an HTML file: parses `html_path`, locates
# the element with id "footnotes" and yields one tuple per child <div>.
# NOTE(review): some lines are absent from this excerpt (the end of the
# docstring, the check on `match`, the initialisation of `qualifiers`);
# comments describe only the visible lines.
305 def extract_annotations(html_path):
306 """Extracts annotations from HTML for annotations dictionary.
308 For each annotation, yields a tuple of:
309 anchor, footnote type, valid qualifiers, text, html.
312 from .fn_qualifiers import FN_QUALIFIERS
314 parser = etree.HTMLParser(encoding='utf-8')
315 tree = etree.parse(html_path, parser)
316 footnotes = tree.find('//*[@id="footnotes"]')
# Matches leading text without an em dash (U+2014), then a parenthesised group
# (captured), then an em dash.
# NOTE(review): the ur'' literal prefix is Python 2-only syntax.
317 re_qualifier = re.compile(ur'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
318 if footnotes is not None:
319 for footnote in footnotes.findall('div'):
# Footnote type is the second '-'-separated part of the div's class; the
# anchor is the annotation link's href without its first character.
320 fn_type = footnote.get('class').split('-')[1]
321 anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
# Drop a tail that is exactly a newline after the last child before
# serializing.
324 if len(footnote) and footnote[-1].tail == '\n':
325 footnote[-1].tail = None
326 text_str = etree.tostring(footnote, method='text', encoding=unicode).strip()
327 html_str = etree.tostring(footnote, method='html', encoding=unicode).strip()
329 match = re_qualifier.match(text_str)
331 qualifier_str = match.group(1)
# Candidates are separated by ';' or ','; one is kept when it is a known
# qualifier, or when it starts with 'z ' and its second word is known.
333 for candidate in re.split('[;,]', qualifier_str):
334 candidate = candidate.strip()
335 if candidate in FN_QUALIFIERS:
336 qualifiers.append(candidate)
337 elif candidate.startswith('z '):
338 subcandidate = candidate.split()[1]
339 if subcandidate in FN_QUALIFIERS:
340 qualifiers.append(subcandidate)
344 yield anchor, fn_type, qualifiers, text_str, html_str