1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
10 from lxml import etree
11 from librarian import XHTMLNS, ParseError, OutputFile
12 from librarian import functions
14 from lxml.etree import XMLSyntaxError, XSLTApplyError
# Module-level setup calls into librarian.functions; presumably these register
# the helper functions the XSLT stylesheets rely on (verify in that module).
functions.reg_substitute_entities()
functions.reg_person_name()

# Stylesheet key -> XSLT file path, relative to this module's directory.
STYLESHEETS = dict(
    legacy='xslt/book2html.xslt',
    full='xslt/wl2html_full.xslt',
    partial='xslt/wl2html_partial.xslt',
)
def get_stylesheet(name):
    """Return the filesystem path of the stylesheet registered under `name`.

    The relative path stored in STYLESHEETS is resolved against the
    directory containing this module.  Raises KeyError for an unknown name.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
def html_has_content(text):
    """Return the <p> and <h1> elements found anywhere in `text`.

    Both un-namespaced and XHTML-namespaced elements are matched.  The
    result is a list, so it is falsy when the document has no such element.
    """
    query = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
    find_content = etree.ETXPath(query)
    return find_content(text)
def transform(wldoc, stylesheet='legacy', options=None, flags=None):
    """Transform a WL document into XHTML.

    Args:
        wldoc: the source document.  It is deep-copied first, so the
            caller's object is not modified.
        stylesheet: key of STYLESHEETS selecting the XSLT file to apply.
        options: optional mapping passed to the XSLT transformation as
            keyword parameters.
        flags: optional iterable of attribute names; each one is set to
            'yes' on the root element of the copied document.

    Returns:
        An OutputFile built from the serialized HTML, or None when
        html_has_content() finds nothing in the transformation result.

    Raises:
        ValueError: if `stylesheet` is not a key of STYLESHEETS.
        ParseError: if lxml reports an XML syntax error or an XSLT
            application error.
    """
    # Only the stylesheet lookup is guarded, so that a KeyError raised by
    # unrelated code is not misreported as a bad stylesheet name.
    try:
        style_filename = get_stylesheet(stylesheet)
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)

    try:
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        document.swap_endlines()

        for flag in flags or ():
            document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()

        result = document.transform(style, **(options or {}))
        del document  # no longer needed large object :)

        if not html_has_content(result):
            return None

        add_anchors(result.getroot())
        add_table_of_themes(result.getroot())
        add_table_of_contents(result.getroot())

        return OutputFile.from_string(etree.tostring(
            result, method='html', xml_declaration=False,
            pretty_print=True, encoding='utf-8'))
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
class Fragment(object):
    """A theme fragment recorded as a flat list of (event, payload) pairs.

    'start' and 'end' events carry an element (anything exposing `tag`,
    `attrib`, `text` and `tail`); every other event label, such as
    'text', carries a string that is emitted verbatim.
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one event together with its payload."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events plus an 'end' for every still-open 'start'.

        The synthetic 'end' events are appended innermost-first, so the
        resulting sequence is properly nested.
        """
        stack = []
        for event, element in self.events:
            if event == 'start':
                stack.append(('end', element))
            elif event == 'end':
                try:
                    stack.pop()
                except IndexError:
                    # An 'end' without a matching 'start': report and go on.
                    print('CLOSED NON-OPEN TAG: %s' % (element,))

        stack.reverse()
        return self.events + stack

    def to_string(self):
        """Serialize the balanced event list to a markup string."""
        result = []
        for event, element in self.closed_events():
            if event == 'start':
                attributes = ' '.join(
                    '%s="%s"' % (k, v) for k, v in element.attrib.items())
                result.append(u'<%s %s>' % (element.tag, attributes))
                if element.text:
                    result.append(element.text)
            elif event == 'end':
                result.append(u'</%s>' % element.tag)
                if element.tail:
                    result.append(element.tail)
            elif element is not None:
                # Raw text payload; a None payload would break the join.
                result.append(element)

        return ''.join(result)

    def __unicode__(self):
        return self.to_string()
def _parents_below_book_text(element):
    """Return the ancestors of `element` below #book-text, outermost first.

    The walk also stops at the tree root, so an element without a
    'book-text' ancestor yields all of its ancestors instead of failing.
    """
    chain = []
    parent = element.getparent()
    while parent is not None and parent.get('id', None) != 'book-text':
        chain.append(parent)
        parent = parent.getparent()
    chain.reverse()
    return chain


def extract_fragments(input_filename):
    """Extracts theme fragments from input_filename.

    Returns a pair of dicts, (closed_fragments, open_fragments), both
    mapping a fragment id (the 'fid' attribute) to a Fragment.  Fragments
    whose 'theme-end' marker was seen are in the first dict; those still
    open when the document ends are in the second.
    """
    import io  # local import: the in-memory buffer is only needed here

    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document, so the file is read with the
    # lenient HTML parser and re-serialized before being walked.
    parser = etree.HTMLParser(encoding='utf-8')
    book = etree.parse(input_filename, parser).getroot()[0][0]
    buf = io.BytesIO(etree.tostring(book, encoding='utf-8'))

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        css_class = element.get('class', '')

        # Process begin and end elements
        if css_class in ('theme-begin', 'theme-end'):
            # Process elements only once, on end event
            if event != 'end':
                continue

            if css_class == 'theme-begin':
                # Open new fragment, seeded with its enclosing elements
                fragment = Fragment(id=element.get('fid'), themes=element.text)
                for parent in _parents_below_book_text(element):
                    fragment.append('start', parent)
                open_fragments[fragment.id] = fragment
            else:
                # Close existing fragment
                fid = element.get('fid')
                fragment = open_fragments.pop(fid, None)
                if fragment is None:
                    print('%s:closed not open fragment #%s' % (input_filename, fid))
                else:
                    closed_fragments[fragment.id] = fragment

            # Append element tail to the open fragments
            # (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        elif (len(element.get('name', '')) or
                css_class in ('annotation', 'anchor')):
            # Omit annotation tags, but keep the text that follows them
            if event == 'end' and element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)
        else:
            for fragment_id in open_fragments:
                open_fragments[fragment_id].append(event, copy.copy(element))

    return closed_fragments, open_fragments
def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
    """Insert anchor <a> elements at the very beginning of `element`.

    With `with_link`, an <a class="anchor" href="#prefix"> is added whose
    text is `link_text` (defaulting to `prefix`).  With `with_target`, an
    <a class="target" name="prefix"> is added in front of it.
    """
    def prepend(node):
        # Leading text of the element must stay after the new node, so it
        # is moved into the node's tail before the node is inserted.
        if element.text:
            node.tail = element.text
            element.text = u''
        element.insert(0, node)

    if with_link:
        label = prefix if link_text is None else link_text
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = unicode(label)
        prepend(link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = u' '
        prepend(target)
def any_ancestor(element, test):
    """Return True if `test(ancestor)` is truthy for any ancestor of `element`.

    Ancestors are visited in the order produced by element.iterancestors()
    and the search stops at the first match.  Returns False otherwise.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
def add_anchors(root):
    """Number the verses and paragraphs below `root` and anchor them.

    Elements inside notes, mottos, dedications, the 'nota_red' element or
    a blockquote are skipped.  Every paragraph receives an anchor named
    "f<number>"; verses are all counted, but only the first one and every
    fifth one receive an anchor.
    """
    skipped_classes = ('note', 'motto', 'motto_podpis', 'dedication')

    def in_skipped_section(node):
        return (node.get('class') in skipped_classes
                or node.get('id') == 'nota_red'
                or node.tag == 'blockquote')

    number = 1
    for element in root.iterdescendants():
        if any_ancestor(element, in_skipped_section):
            continue

        if element.tag == 'p' and 'verse' in element.get('class', ''):
            if number == 1 or number % 5 == 0:
                add_anchor(element, "f%d" % number, link_text=number)
            number += 1
        elif 'paragraph' in element.get('class', ''):
            add_anchor(element, "f%d" % number, link_text=number)
            number += 1
def raw_printable_text(element):
    """Return the stripped text content of `element` without annotation marks.

    The element is deep-copied first, so the original tree is untouched.
    Only <a class="annotation"> elements that are direct children are
    affected.
    """
    scratch = copy.deepcopy(element)
    markers = [a for a in scratch.findall('a')
               if a.get('class') == 'annotation']
    for marker in markers:
        # NOTE(review): blanking the marker text (rather than removing the
        # node) keeps the text that follows it -- confirm this is intended.
        marker.text = ''
    text = etree.tostring(scratch, method='text', encoding=unicode)
    return text.strip()
def add_table_of_contents(root):
    """Build a table of contents from h2/h3 headings and prepend it to `root`.

    Headings inside the footnotes, the 'nota_red' element or a
    'person-list' are ignored.  Every remaining heading gets a target
    anchor "s<number>".  An h3 directly following an h2 section is listed
    as its subsection; any other heading becomes a top-level entry.
    """
    def outside_main_text(node):
        return (node.get('id') in ('footnotes', 'nota_red')
                or node.get('class') in ('person-list',))

    # Entries are (number, tag, title, subsections) tuples.
    sections = []
    number = 1
    for heading in root.iterdescendants():
        if heading.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(heading, outside_main_text):
            continue

        entry = (number, heading.tag, raw_printable_text(heading), [])
        if heading.tag == 'h3' and sections and sections[-1][1] == 'h2':
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(heading, "s%d" % number, with_link=False)
        number += 1

    toc = etree.Element('div')
    # NOTE(review): confirm the container id against the CSS/JS that
    # targets the table of contents.
    toc.set('id', 'toc')
    toc_header = etree.SubElement(toc, 'h2')
    toc_header.text = u'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    def add_entry(parent_list, entry_number, title):
        item = etree.SubElement(parent_list, 'li')
        add_anchor(item, "s%d" % entry_number, with_target=False, link_text=title)
        return item

    for section_number, _tag, title, subsections in sections:
        section_item = add_entry(toc_list, section_number, title)
        if subsections:
            subsection_list = etree.SubElement(section_item, 'ol')
            for sub_number, _sub_tag, sub_title, _ in subsections:
                add_entry(subsection_list, sub_number, sub_title)

    root.insert(0, toc)
def add_table_of_themes(root):
    """Prepend a <div id="themes"> index of themes to `root`.

    Every <a class="theme-begin"> element contributes its 'name'
    attribute to each comma-separated theme listed in its text.  The
    index lists the themes in sorted order, each followed by numbered
    links to its fragments.
    """
    try:
        from sortify import sortify
    except ImportError:
        # Optional dependency: fall back to plain ordering without it.
        sortify = lambda x: x

    book_themes = {}
    for fragment in root.findall('.//a[@class="theme-begin"]'):
        if not fragment.text:
            continue
        for theme_name in (s.strip() for s in fragment.text.split(',')):
            # A stray comma must not create a nameless theme entry.
            if not theme_name:
                continue
            book_themes.setdefault(theme_name, []).append(fragment.get('name'))

    sorted_themes = sorted(book_themes.items(), key=lambda s: sortify(s[0]))

    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, fragments in sorted_themes:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % theme_name
        for i, fragment in enumerate(fragments):
            item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
            item.text = str(i + 1)
            # NOTE(review): separator between consecutive links -- confirm
            # against the expected markup.
            item.tail = ' '

    root.insert(0, themes_div)
def extract_annotations(html_path):
    """For each annotation, yields a tuple: anchor, text, html.

    The file is read with the lenient HTML parser.  Annotations are the
    <div> children of the element with id="footnotes"; `anchor` is the
    'name' attribute of the footnote's first <a name=...> child.  `text`
    and `html` are UTF-8 encoded serializations of the footnote.
    Footnotes without a named anchor are skipped, and nothing is yielded
    when the document has no footnotes element.
    """
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)
    footnotes = tree.find('//*[@id="footnotes"]')
    if footnotes is None:
        return

    for footnote in footnotes.findall('div'):
        target = footnote.find('a[@name]')
        if target is None:
            # Malformed footnote: there is no anchor to report it under.
            continue
        anchor = target.get('name')
        # NOTE(review): drops the footnote's first two children (presumably
        # the target anchor and the back-link) before serializing -- confirm
        # against the footnote markup the stylesheet produces.
        del footnote[:2]
        text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip()
        html_str = etree.tostring(footnote, method='html', encoding='utf-8')
        yield anchor, text_str, html_str