1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
11 from lxml import etree
12 from librarian import XHTMLNS, ParseError, OutputFile
13 from librarian import functions
15 from lxml.etree import XMLSyntaxError, XSLTApplyError
# Register the librarian helper functions at import time.
# NOTE(review): presumably these are XPath/XSLT extension functions that the
# stylesheets listed below rely on -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()

# Maps a stylesheet name (as accepted by get_stylesheet() and transform())
# to the path of its XSLT file, relative to this module's directory.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt'
}
def get_stylesheet(name):
    """Return the filesystem path of the stylesheet registered as ``name``.

    The path is the ``STYLESHEETS`` entry joined onto the directory that
    contains this module.  An unknown ``name`` raises ``KeyError``.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
def html_has_content(text):
    """Return the ``p`` and ``h1`` elements found anywhere in ``text``.

    Both un-namespaced and XHTML-namespaced elements are matched.  The
    XPath result is used by callers as a truth value: nothing matched
    means the document has no content.
    """
    namespace = {'ns': str(XHTMLNS)}
    find_content = etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % namespace)
    return find_content(text)
def transform(wldoc, stylesheet='legacy', options=None, flags=None):
    """Transform a WL document to XHTML.

    The work is done on a deep copy of ``wldoc``.  Before the stylesheet is
    applied, the copy has ``swap_endlines()`` called on it, every name in
    ``flags`` set to ``'yes'`` as an attribute of the root element, and
    ``clean_ed_note()`` called on it.

    ``stylesheet`` is a key of ``STYLESHEETS``.  ``options`` is an optional
    dict passed as keyword arguments to the document's ``transform()``.

    Returns an ``OutputFile`` created from the UTF-8 HTML serialization of
    the result (with anchors, table of themes and table of contents added),
    or ``None`` if the result has no content according to
    ``html_has_content()``.

    Raises ``ValueError`` when a ``KeyError`` escapes the processing (in
    particular for an unknown stylesheet name) and ``ParseError`` when lxml
    reports an XML syntax error or an XSLT application error.
    """
    try:
        # Parse XSLT
        style = etree.parse(get_stylesheet(stylesheet))

        document = copy.deepcopy(wldoc)
        document.swap_endlines()

        for flag in flags or ():
            document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()

        result = document.transform(style, **(options or {}))
        del document  # no longer needed large object :)

        if not html_has_content(result):
            return None

        add_anchors(result.getroot())
        add_table_of_themes(result.getroot())
        add_table_of_contents(result.getroot())

        return OutputFile.from_string(etree.tostring(
            result, method='html', xml_declaration=False,
            pretty_print=True, encoding='utf-8'))
    except KeyError:
        # The message used to be raised with a literal, uninterpolated '%s'.
        raise ValueError("'%s' is not a valid stylesheet." % (stylesheet,))
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
class Fragment(object):
    """A theme fragment: an ordered list of recorded parse events.

    Each event is an ``(event, payload)`` tuple.  For ``'start'`` and
    ``'end'`` events the payload is an element (an object with ``tag``,
    ``attrib``, ``text`` and ``tail``); for any other event name (e.g.
    ``'text'``) the payload is a string that is emitted as-is.
    """

    def __init__(self, id, themes):
        # ``id`` shadows the builtin, but callers pass it by keyword,
        # so the parameter name is part of the interface.
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair at the end of the fragment."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the recorded events followed by the missing 'end' events.

        Every 'start' without a later 'end' gets a synthetic 'end' event
        appended, innermost element first, so the result is balanced.  An
        'end' event that arrives while nothing is open is logged as a
        warning and otherwise ignored.
        """
        import logging

        pending = []
        for event, element in self.events:
            if event == 'start':
                pending.append(('end', element))
            elif event == 'end':
                try:
                    pending.pop()
                except IndexError:
                    logging.getLogger(__name__).warning(
                        'CLOSED NON-OPEN TAG: %s', element)

        # Close the innermost open element first.
        pending.reverse()
        return self.events + pending

    def to_string(self):
        """Serialize the balanced event list to a unicode markup string.

        NOTE: attribute values and text are interpolated as-is, without
        any HTML escaping, and a start tag always contains a space after
        the tag name, even when the element has no attributes.
        """
        result = []
        for event, element in self.closed_events():
            if event == 'start':
                attributes = u' '.join(
                    u'%s="%s"' % (k, v) for k, v in element.attrib.items())
                result.append(u'<%s %s>' % (element.tag, attributes))
                if element.text:
                    result.append(element.text)
            elif event == 'end':
                result.append(u'</%s>' % element.tag)
                if element.tail:
                    result.append(element.tail)
            else:
                # Plain text event: the payload is the text itself.
                result.append(element)

        # Join on a unicode separator so the result is always unicode,
        # even when the fragment holds nothing but byte strings.
        return u''.join(result)

    def __unicode__(self):
        return self.to_string()
def extract_fragments(input_filename):
    """Extracts theme fragments from input_filename.

    The HTML file is scanned for marker elements with class ``theme-begin``
    and ``theme-end`` that share a ``fid`` attribute.  Everything between a
    pair of markers is recorded as events on a ``Fragment``.

    Returns a ``(closed_fragments, open_fragments)`` tuple of dicts keyed by
    fragment id: fragments whose ``theme-end`` marker was seen, and
    fragments still open when the document ended.
    """
    import io
    import logging

    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document, so parse it with the lenient
    # HTML parser first and feed the re-serialized subtree to iterparse.
    parser = etree.HTMLParser(encoding='utf-8')
    book = etree.parse(input_filename, parser).getroot()[0][0]
    buf = io.BytesIO(etree.tostring(book, encoding='utf-8'))

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        css_class = element.get('class', '')

        # Process begin and end elements
        if css_class in ('theme-begin', 'theme-end'):
            if event != 'end':
                continue  # Process elements only once, on end event

            if css_class == 'theme-begin':
                # Open new fragment
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Append parents: walk up to (but not including) the
                # element with id "book-text".  Stop at the tree root if
                # there is no such ancestor instead of failing on None.
                parents = []
                parent = element.getparent()
                while parent is not None and parent.get('id', None) != 'book-text':
                    cparent = copy.deepcopy(parent)
                    # NOTE(review): the copy only serves as an opening tag;
                    # its leading text precedes the marker and is dropped.
                    # Confirm against the upstream file.
                    cparent.text = None
                    parents.append(cparent)
                    parent = parent.getparent()

                # Ancestors were collected innermost first; open them
                # outermost first.
                for parent in reversed(parents):
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment
            else:
                # Close existing fragment
                fid = element.get('fid')
                try:
                    fragment = open_fragments.pop(fid)
                except KeyError:
                    logging.getLogger(__name__).warning(
                        '%s:closed not open fragment #%s', input_filename, fid)
                else:
                    closed_fragments[fragment.id] = fragment

            # Append element tail to lost_text (we don't want to lose any text)
            if element.tail:
                for fragment in open_fragments.values():
                    fragment.append('text', element.tail)

        # Process all elements except begin and end
        elif element.get('name', '') or css_class in ('annotation', 'anchor'):
            # Omit annotation tags, but keep the text that follows them
            if event == 'end' and element.tail:
                for fragment in open_fragments.values():
                    fragment.append('text', element.tail)
        else:
            # Every open fragment gets its own copy of the element
            for fragment in open_fragments.values():
                fragment.append(event, copy.copy(element))

    return closed_fragments, open_fragments
def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
    """Insert anchor ``<a>`` elements into the parent, just before ``element``.

    With ``with_link`` an ``<a class="anchor" href="#prefix">`` is inserted;
    its text is ``link_text``, or ``prefix`` when ``link_text`` is None.
    With ``with_target`` an ``<a class="target" name="prefix">`` containing
    a single space is inserted.  Both go in at the original position of
    ``element``, so when both are requested the target precedes the link.
    """
    container = element.getparent()
    position = container.index(element)

    if with_link:
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = unicode(prefix if link_text is None else link_text)
        container.insert(position, link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = u' '
        container.insert(position, target)
def any_ancestor(element, test):
    """Return True if ``test(ancestor)`` is true for any ancestor of ``element``.

    Ancestors are visited in the order ``element.iterancestors()`` yields
    them, and the search stops at the first match.
    """
    return any(test(candidate) for candidate in element.iterancestors())
def add_anchors(root):
    """Number the verses and paragraphs below ``root`` and anchor them.

    Elements inside notes, mottos, motto signatures, dedications, the
    element with id ``nota_red`` or any ``blockquote`` are skipped and not
    counted.  Every counted paragraph gets an ``f<number>`` anchor; verses
    get one only for number 1 and for every multiple of 5.
    """
    skipped_classes = ('note', 'motto', 'motto_podpis', 'dedication')

    def is_excluded(ancestor):
        return (ancestor.get('class') in skipped_classes
                or ancestor.get('id') == 'nota_red'
                or ancestor.tag == 'blockquote')

    number = 1
    for element in root.iterdescendants():
        if any_ancestor(element, is_excluded):
            continue

        css_class = element.get('class', '')
        if element.tag == 'p' and 'verse' in css_class:
            if number == 1 or number % 5 == 0:
                add_anchor(element, "f%d" % number, link_text=number)
            number += 1
        elif 'paragraph' in css_class:
            add_anchor(element, "f%d" % number, link_text=number)
            number += 1
def raw_printable_text(element):
    """Return the stripped text content of ``element`` as unicode.

    The element itself is left untouched; the text is taken from a deep
    copy in which direct ``<a>`` children of class ``annotation`` or
    ``theme-begin`` have had their own text blanked.
    """
    clone = copy.deepcopy(element)
    hidden_classes = ('annotation', 'theme-begin')
    for link in clone.findall('a'):
        if link.get('class') in hidden_classes:
            # NOTE(review): only the link's own text is blanked, so its tail
            # stays in the output -- confirm against the upstream file.
            link.text = ''
    return etree.tostring(clone, method='text', encoding=unicode).strip()
def add_table_of_contents(root):
    """Build a table of contents from h2/h3 headers and insert it into ``root``.

    Headers inside the elements with id ``footnotes`` or ``nota_red``, or
    inside an element of class ``person-list``, are ignored.  Each counted
    header gets an ``s<number>`` target anchor.  An ``h3`` is listed as a
    subsection when the latest top-level entry is an ``h2``; otherwise it
    becomes a top-level entry itself.  The resulting ``div`` is inserted as
    the first child of ``root``.

    NOTE(review): add_anchor() inserts before the given element in its
    parent, so each TOC link ends up as a sibling preceding its (empty)
    ``li`` rather than inside it -- confirm this is intended.
    """
    def in_excluded_block(ancestor):
        return (ancestor.get('id') in ('footnotes', 'nota_red')
                or ancestor.get('class') in ('person-list',))

    sections = []
    number = 1
    for element in root.iterdescendants():
        if element.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(element, in_excluded_block):
            continue

        entry = (number, element.tag, raw_printable_text(element), [])
        if element.tag == 'h3' and sections and sections[-1][1] == 'h2':
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(element, "s%d" % number, with_link=False)
        number += 1

    toc = etree.Element('div')
    # NOTE(review): id of the TOC container -- confirm against the upstream file.
    toc.set('id', 'toc')
    heading = etree.SubElement(toc, 'h2')
    heading.text = u'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    for section_no, _tag, title, subsections in sections:
        item = etree.SubElement(toc_list, 'li')
        add_anchor(item, "s%d" % section_no, with_target=False, link_text=title)

        if subsections:
            sublist = etree.SubElement(item, 'ol')
            for sub_no, _sub_tag, sub_title, _ in subsections:
                sub_item = etree.SubElement(sublist, 'li')
                add_anchor(sub_item, "s%d" % sub_no, with_target=False, link_text=sub_title)

    root.insert(0, toc)
def add_table_of_themes(root):
    """Build an index of themes and insert it as the first child of ``root``.

    Every ``<a class="theme-begin">`` with text contributes its ``name``
    attribute to each comma-separated theme listed in that text.  The index
    is a ``<div id="themes">`` holding an ordered list with one item per
    theme, sorted by theme name (through ``sortify`` when that package can
    be imported), each followed by numbered links to its fragments.
    """
    try:
        from sortify import sortify
    except ImportError:
        def sortify(x):
            return x

    book_themes = {}
    for marker in root.findall('.//a[@class="theme-begin"]'):
        if not marker.text:
            continue
        for raw_name in marker.text.split(','):
            book_themes.setdefault(raw_name.strip(), []).append(marker.get('name'))

    ordered_themes = sorted(book_themes.items(), key=lambda pair: sortify(pair[0]))

    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, targets in ordered_themes:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % theme_name
        for position, target in enumerate(targets, 1):
            link = etree.SubElement(themes_li, 'a', href="#%s" % target)
            link.text = str(position)
            # NOTE(review): separator between consecutive numbered links --
            # confirm against the upstream file.
            link.tail = ' '
    root.insert(0, themes_div)
296 def extract_annotations(html_path):
297 """Extracts annotations from HTML for annotations dictionary.
299 For each annotation, yields a tuple of:
300 anchor, footnote type, valid qualifiers, text, html.
303 from .fn_qualifiers import FN_QUALIFIERS
305 parser = etree.HTMLParser(encoding='utf-8')
306 tree = etree.parse(html_path, parser)
307 footnotes = tree.find('//*[@id="footnotes"]')
308 re_qualifier = re.compile(ur'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
309 if footnotes is not None:
310 for footnote in footnotes.findall('div'):
311 fn_type = footnote.get('class').split('-')[1]
312 anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
315 if len(footnote) and footnote[-1].tail == '\n':
316 footnote[-1].tail = None
317 text_str = etree.tostring(footnote, method='text', encoding=unicode).strip()
318 html_str = etree.tostring(footnote, method='html', encoding=unicode).strip()
320 match = re_qualifier.match(text_str)
322 qualifier_str = match.group(1)
324 for candidate in re.split('[;,]', qualifier_str):
325 candidate = candidate.strip()
326 if candidate in FN_QUALIFIERS:
327 qualifiers.append(candidate)
328 elif candidate.startswith('z '):
329 subcandidate = candidate.split()[1]
330 if subcandidate in FN_QUALIFIERS:
331 qualifiers.append(subcandidate)
335 yield anchor, fn_type, qualifiers, text_str, html_str