1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
10 from lxml import etree
11 from librarian import XHTMLNS, ParseError, OutputFile
12 from librarian import functions
14 from lxml.etree import XMLSyntaxError, XSLTApplyError
# Module-level registration calls from librarian.functions.
# NOTE(review): presumably these register the extension functions the XSLT
# stylesheets rely on -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()

# Stylesheet name -> XSLT path relative to this module's directory
# (resolved by get_stylesheet()).
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt',
}
def get_stylesheet(name):
    """Return the path of the stylesheet registered under ``name``.

    The path is built from this module's directory and the entry found in
    ``STYLESHEETS``; an unknown ``name`` raises ``KeyError``.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
def html_has_content(text):
    """Return the ``p`` and ``h1`` elements found in ``text``.

    Both un-namespaced and XHTML-namespaced elements are matched; the
    result of the XPath evaluation is returned as-is, so callers can use
    its truthiness as a "has content" check.
    """
    namespace = str(XHTMLNS)
    query = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': namespace}
    find_content = etree.ETXPath(query)
    return find_content(text)
def transform(wldoc, stylesheet='legacy', options=None, flags=None):
    """Transform a WL document to XHTML.

    The work is done on a deep copy of ``wldoc``.

    :param wldoc: document object providing ``swap_endlines()``,
        ``clean_ed_note()``, ``transform()`` and an ``edoc`` tree.
    :param stylesheet: name of the stylesheet, resolved by
        ``get_stylesheet()``.
    :param options: optional dict passed to the document's ``transform()``
        as keyword arguments.
    :param flags: optional iterable of attribute names; each is set to
        ``'yes'`` on the document root before transforming.
    :returns: an ``OutputFile`` built from the UTF-8 encoded HTML, or
        ``None`` when the transformed document has no content.
    :raises ValueError: if ``stylesheet`` is not a known stylesheet name.
    :raises ParseError: if lxml reports an XML syntax or XSLT apply error.
    """
    try:
        style_filename = get_stylesheet(stylesheet)
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        document.swap_endlines()

        # flags defaults to None, so fall back to an empty iterable.
        for flag in flags or ():
            document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()

        # options defaults to None, which cannot be **-expanded.
        result = document.transform(style, **(options or {}))
        del document  # no longer needed large object :)

        if not html_has_content(result):
            return None

        add_anchors(result.getroot())
        add_table_of_themes(result.getroot())
        add_table_of_contents(result.getroot())

        return OutputFile.from_string(etree.tostring(
            result, method='html', xml_declaration=False,
            pretty_print=True, encoding='utf-8'))
    except KeyError:
        # The message used to be raised with its '%s' placeholder unfilled.
        raise ValueError("'%s' is not a valid stylesheet." % (stylesheet,))
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
class Fragment(object):
    """A theme fragment stored as a flat list of ``(event, payload)`` pairs.

    ``'start'`` and ``'end'`` events carry an element (anything with
    ``tag``, ``attrib``, ``text`` and ``tail``); every other event carries
    a plain text string.
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair at the end of the fragment."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events plus 'end' events for elements left open.

        The closing events are added innermost-first. An 'end' event with
        no matching 'start' is reported on stdout and otherwise ignored.
        """
        pending = []
        for event, element in self.events:
            if event == 'start':
                pending.append(('end', element))
            elif event == 'end':
                if pending:
                    pending.pop()
                else:
                    # Single-argument form prints the same text on
                    # Python 2 and Python 3.
                    print('CLOSED NON-OPEN TAG: %s' % (element,))

        pending.reverse()
        return self.events + pending

    def to_string(self):
        """Serialise the fragment (with unclosed elements closed) to markup.

        Text, tails and attribute values are escaped so that characters
        such as ``<``, ``&`` and ``"`` cannot break the generated markup.
        """
        # Local import: the module's import block is left untouched.
        from xml.sax.saxutils import escape, quoteattr

        parts = []
        for event, element in self.closed_events():
            if event == 'start':
                attributes = ' '.join(
                    '%s=%s' % (name, quoteattr(value))
                    for name, value in element.attrib.items())
                parts.append(u'<%s %s>' % (element.tag, attributes))
                if element.text:
                    parts.append(escape(element.text))
            elif event == 'end':
                parts.append(u'</%s>' % element.tag)
                if element.tail:
                    parts.append(escape(element.tail))
            else:
                # Any other event carries raw text.
                parts.append(escape(element))

        return ''.join(parts)

    def __unicode__(self):
        return self.to_string()
def extract_fragments(input_filename):
    """Extract theme fragments from the HTML file ``input_filename``.

    Elements with class ``theme-begin`` open a fragment (keyed by their
    ``fid`` attribute) and elements with class ``theme-end`` close it.
    While a fragment is open, every other parse event is recorded in it.

    :returns: a ``(closed_fragments, open_fragments)`` pair of dicts
        mapping fragment id to ``Fragment``; ``open_fragments`` holds the
        fragments that were never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document, so parse it with the HTML
    # parser first and feed iterparse the re-serialised first grandchild
    # of the root.
    parser = etree.HTMLParser(encoding='utf-8')
    html_root = etree.parse(input_filename, parser).getroot()
    buf = cStringIO.StringIO()
    buf.write(etree.tostring(html_root[0][0], encoding='utf-8'))
    buf.seek(0)  # rewind, otherwise iterparse starts reading at the end

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        css_class = element.get('class', '')

        # Process begin and end elements
        if css_class in ('theme-begin', 'theme-end'):
            if event != 'end':
                continue  # Process elements only once, on end event

            if css_class == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Collect the enclosing elements up to (not including) the
                # one with id="book-text". Stop at the tree root as well,
                # so a missing book-text ancestor cannot cause a call on
                # None.
                parents = []
                parent = element.getparent()
                while parent is not None and parent.get('id', None) != 'book-text':
                    parents.append(parent)
                    parent = parent.getparent()

                # Outermost first, so the recorded start events nest
                # properly when the fragment is closed.
                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (input_filename, element.get('fid')))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append element tail to lost_text (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags
            if element.get('name', '') or css_class in ('annotation', 'anchor'):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append('text', element.tail)
            else:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append(event, copy.copy(element))

    return closed_fragments, open_fragments
def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
    """Insert anchor ``<a>`` elements immediately before ``element``.

    :param element: element that must already have a parent.
    :param prefix: anchor name; used for ``href="#prefix"`` and
        ``name="prefix"``.
    :param with_link: insert a visible link (class ``anchor``).
    :param with_target: insert a named target (class ``target``).
    :param link_text: text of the link; defaults to ``prefix``.
    """
    parent = element.getparent()
    index = parent.index(element)

    if with_link:
        if link_text is None:
            link_text = prefix
        anchor = etree.Element('a', href='#%s' % prefix)
        anchor.set('class', 'anchor')
        anchor.text = unicode(link_text)
        parent.insert(index, anchor)

    if with_target:
        # Inserted at the same index, so the target ends up in front of
        # the link added above.
        anchor_target = etree.Element('a', name='%s' % prefix)
        anchor_target.set('class', 'target')
        anchor_target.text = u' '
        parent.insert(index, anchor_target)
def any_ancestor(element, test):
    """Return True if ``test(ancestor)`` is truthy for any ancestor.

    Ancestors are visited in the order ``element.iterancestors()`` yields
    them and the search stops at the first match. The element itself is
    not tested.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
def add_anchors(root):
    """Number the verses and paragraphs under ``root`` and anchor them.

    Elements inside notes, mottos, dedications, the ``nota_red`` block or
    a ``blockquote`` are skipped. Every paragraph gets an anchor named
    ``f<n>``; verses are all counted, but only verse 1 and every fifth
    verse get one.
    """
    def is_excluded(ancestor):
        return (ancestor.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
                or ancestor.get('id') == 'nota_red'
                or ancestor.tag == 'blockquote')

    counter = 1
    for element in root.iterdescendants():
        if any_ancestor(element, is_excluded):
            continue

        css_class = element.get('class', '')
        if element.tag == 'p' and 'verse' in css_class:
            if counter == 1 or counter % 5 == 0:
                add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
        elif 'paragraph' in css_class:
            add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
def raw_printable_text(element):
    """Return the stripped text of ``element`` without annotation links.

    Works on a deep copy, so ``element`` itself is not modified. Only
    direct ``a`` children with class ``annotation`` are affected.
    """
    working = copy.deepcopy(element)
    for link in working.findall('a'):
        if link.get('class') == 'annotation':
            # NOTE(review): the body of this branch is missing from the
            # reviewed copy. Blanking the link text (rather than removing
            # the link) keeps the text that follows it -- confirm.
            link.text = ''
    return etree.tostring(working, method='text', encoding=unicode).strip()
def add_table_of_contents(root):
    """Build a table of contents from h2/h3 headers and prepend it to ``root``.

    Headers inside the ``footnotes`` or ``nota_red`` blocks, or inside a
    ``person-list``, are skipped. Every listed header gets a target anchor
    ``s<n>``; an ``h3`` that follows an ``h2`` section becomes its
    subsection.
    """
    def is_excluded(ancestor):
        return (ancestor.get('id') in ('footnotes', 'nota_red')
                or ancestor.get('class') in ('person-list',))

    # Each entry: (number, tag, title, subsections)
    sections = []
    counter = 1
    for element in root.iterdescendants():
        if element.tag in ('h2', 'h3'):
            if any_ancestor(element, is_excluded):
                continue

            element_text = raw_printable_text(element)
            entry = (counter, element.tag, element_text, [])
            if element.tag == 'h3' and sections and sections[-1][1] == 'h2':
                sections[-1][3].append(entry)
            else:
                sections.append(entry)
            add_anchor(element, "s%d" % counter, with_link=False)
            counter += 1

    toc = etree.Element('div')
    # NOTE(review): the reviewed copy skips one line at this point; confirm
    # against the upstream file whether the container gets an id here.
    toc_header = etree.SubElement(toc, 'h2')
    toc_header.text = u'Spis treści'  # Polish: "Table of contents"
    toc_list = etree.SubElement(toc, 'ol')

    for number, _tag, title, subsections in sections:
        section_element = etree.SubElement(toc_list, 'li')
        add_anchor(section_element, "s%d" % number, with_target=False, link_text=title)

        if subsections:
            subsection_list = etree.SubElement(section_element, 'ol')
            for sub_number, _sub_tag, sub_title, _ in subsections:
                subsection_element = etree.SubElement(subsection_list, 'li')
                add_anchor(subsection_element, "s%d" % sub_number, with_target=False, link_text=sub_title)

    root.insert(0, toc)
def add_table_of_themes(root):
    """Prepend to ``root`` a list of themes linking to their fragments.

    Theme names come from the comma-separated text of every
    ``a.theme-begin`` element; each theme is listed with numbered links to
    the ``name`` of the fragments it occurs in. Themes are sorted with
    ``sortify`` when that module is importable, otherwise by plain name.
    """
    try:
        from sortify import sortify
    except ImportError:
        def sortify(x):
            return x

    book_themes = {}
    for fragment in root.findall('.//a[@class="theme-begin"]'):
        if not fragment.text:
            continue
        theme_names = [s.strip() for s in fragment.text.split(',')]
        for theme_name in theme_names:
            book_themes.setdefault(theme_name, []).append(fragment.get('name'))

    # sorted() also works where items() returns a view without .sort().
    sorted_themes = sorted(book_themes.items(), key=lambda item: sortify(item[0]))

    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, fragments in sorted_themes:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % theme_name
        for i, fragment in enumerate(fragments):
            item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
            item.text = str(i + 1)
            # Keep consecutive numbers from running together ("123").
            item.tail = ' '

    root.insert(0, themes_div)
def extract_annotations(html_path):
    """For each annotation, yields a tuple: anchor, text, html.

    Annotations are the ``div`` children of the element with
    ``id="footnotes"``; nothing is yielded when that element is absent.
    ``anchor`` is the ``name`` of the footnote's first named link, and
    ``text`` and ``html`` are UTF-8 encoded serialisations of the footnote.
    Footnotes without a named link are skipped.
    """
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)
    # A relative path selects the same nodes as the old absolute
    # '//*[...]' form without lxml's FutureWarning.
    footnotes = tree.find('.//*[@id="footnotes"]')
    if footnotes is None:
        return

    for footnote in footnotes.findall('div'):
        named_link = footnote.find('a[@name]')
        if named_link is None:
            # No anchor to report this footnote under.
            continue
        anchor = named_link.get('name')
        # NOTE(review): the reviewed copy skips one line at this point;
        # confirm against the upstream file that nothing is stripped from
        # the footnote before it is serialised.
        text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip()
        html_str = etree.tostring(footnote, method='html', encoding='utf-8')
        yield anchor, text_str, html_str