1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
10 from lxml import etree
11 from librarian import XHTMLNS, ParseError, OutputFile
12 from librarian import functions
14 from lxml.etree import XMLSyntaxError, XSLTApplyError
# Import-time setup: call the two registration helpers from
# librarian.functions (presumably they register XSLT extension functions
# used by the stylesheets -- confirm in librarian/functions.py).
16 functions.reg_substitute_entities()
17 functions.reg_person_name()
# Stylesheet name -> XSLT path entries. get_stylesheet() looks names up in
# STYLESHEETS and joins the path to this module's directory.
# NOTE(review): the lines that open and close this mapping (listing lines
# 19 and 23) are not visible in this listing.
20 'legacy': 'xslt/book2html.xslt',
21 'full': 'xslt/wl2html_full.xslt',
22 'partial': 'xslt/wl2html_partial.xslt'
def get_stylesheet(name):
    """Return the path of the XSLT stylesheet registered under `name`.

    The relative path stored in STYLESHEETS is resolved against the
    directory that contains this module.

    Raises KeyError when `name` is not a key of STYLESHEETS.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
def html_has_content(text):
    """Look for paragraph and first-level heading elements in `text`.

    `text` is the parsed document (or element) the XPath is evaluated on.
    The expression matches `p` and `h1` elements both without a namespace
    and in the XHTML namespace. The XPath result is returned as is, so
    callers can use it as a truth value: an empty result means no content.
    """
    namespace = str(XHTMLNS)
    expression = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': namespace}
    find_content = etree.ETXPath(expression)
    return find_content(text)
# Transform a WL document into HTML using the named XSLT stylesheet.
#
# NOTE(review): this listing skips several original lines of the function
# (e.g. 37-39, 46-48, 52-54, 65-67), including the `try:` that the trailing
# `except` clause belongs to and the end of the docstring. The comments
# below describe only the statements that are visible.
# NOTE(review): the docstring talks about `output_filename` and True/False
# results, but the signature has no such parameter and the visible return
# hands back OutputFile.from_string(...) -- the docstring looks stale.
31 def transform(wldoc, stylesheet='legacy', options=None, flags=None):
32 """Transforms the WL document to XHTML.
34 If output_filename is None, returns an XML,
35 otherwise returns True if file has been written,False if it hasn't.
36 File won't be written if it has no content.
# Resolve the stylesheet name to a file and parse the XSLT.
40 style_filename = get_stylesheet(stylesheet)
41 style = etree.parse(style_filename)
# All following modifications are applied to a deep copy of `wldoc`.
43 document = copy.deepcopy(wldoc)
45 document.swap_endlines()
# Set `flag` as an attribute with value 'yes' on the document root
# (presumably once per entry of `flags`; the loop header is not visible).
49 document.edoc.getroot().set(flag, 'yes')
51 document.clean_ed_note()
# `options` is expanded into keyword arguments of the document's transform().
55 result = document.transform(style, **options)
56 del document # no longer needed large object :)
# Post-process only when the result contains p/h1 content.
58 if html_has_content(result):
59 add_anchors(result.getroot())
60 add_table_of_themes(result.getroot())
61 add_table_of_contents(result.getroot())
# Serialize as pretty-printed UTF-8 HTML without an XML declaration.
63 return OutputFile.from_string(etree.tostring(result, method='html',
64 xml_declaration=False, pretty_print=True, encoding='utf-8'))
# NOTE(review): the message keeps a literal '%s'; no value is interpolated
# into it, so the offending stylesheet name is never reported.
68 raise ValueError("'%s' is not a valid stylesheet.")
# Python 2 `except ..., e` syntax; the handler body is not visible here.
69 except (XMLSyntaxError, XSLTApplyError), e:
# A themed fragment recorded as a list of (event, element) pairs in
# `self.events`, where event is one of the values appended by callers
# ('start', 'end', 'text').
#
# NOTE(review): this listing skips many original lines of the class
# (e.g. 75-78, 87-90, 95-97): the attribute assignments in __init__, the
# branch conditions, and the header of the method that __unicode__ calls
# (`to_string`) are not visible. Comments describe visible statements only.
72 class Fragment(object):
# `id` shadows the builtin; the parameter name is kept because
# extract_fragments passes it by keyword.
73 def __init__(self, id, themes):
74 super(Fragment, self).__init__()
# Record one more (event, element) pair at the end of the fragment.
79 def append(self, event, element):
80 self.events.append((event, element))
# Return the recorded events followed by the extra ('end', element) events
# collected on `stack` (presumably for elements opened but never closed --
# the conditions that fill and drain the stack are not visible).
82 def closed_events(self):
84 for event, element in self.events:
86 stack.append(('end', element))
# Diagnostic written to stdout (Python 2 print statement).
91 print 'CLOSED NON-OPEN TAG:', element
94 return self.events + stack
# Serialization loop: each event contributes one piece of markup/text to
# `result`, which is joined into a single string at the end.
98 for event, element in self.closed_events():
# Opening tag with attributes; attribute values are interpolated without
# escaping.
100 result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
102 result.append(element.text)
104 result.append(u'</%s>' % element.tag)
106 result.append(element.tail)
# Here `element` itself is appended, so it is expected to be a string
# (extract_fragments appends element.tail with the 'text' event).
108 result.append(element)
110 return ''.join(result)
# Python 2 unicode() hook: delegates to to_string().
112 def __unicode__(self):
113 return self.to_string()
# Parse the HTML file `input_filename` and collect its theme fragments.
# Returns a pair (closed_fragments, open_fragments); closed_fragments maps
# a fragment id to its Fragment, open_fragments holds the fragments that
# were started but not closed by the end of the walk.
#
# NOTE(review): this listing skips several original lines (e.g. 118,
# 125-126, 152-153, 155, 175): the initialisation of `open_fragments` and
# `parents`, the try/except around the lookup of an open fragment, and the
# `else` branches are not visible. Comments describe visible statements only.
116 def extract_fragments(input_filename):
117 """Extracts theme fragments from input_filename."""
119 closed_fragments = {}
121 # iterparse would die on a HTML document
122 parser = etree.HTMLParser(encoding='utf-8')
123 buf = cStringIO.StringIO()
# Parse with the HTML parser, then re-serialize the first grandchild of the
# root (getroot()[0][0]) into the buffer that iterparse walks below.
124 buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
127 for event, element in etree.iterparse(buf, events=('start', 'end')):
128 # Process begin and end elements
129 if element.get('class', '') in ('theme-begin', 'theme-end'):
130 if not event == 'end': continue # Process elements only once, on end event
# A 'theme-begin' marker opens a new fragment keyed by its `fid` attribute;
# the marker's text is passed on as the fragment's themes.
133 if element.get('class', '') == 'theme-begin':
134 fragment = Fragment(id=element.get('fid'), themes=element.text)
# Climb from the marker to the ancestor with id 'book-text', taking a deep
# copy of every ancestor passed on the way.
137 parent = element.getparent()
139 while parent.get('id', None) != 'book-text':
140 cparent = copy.deepcopy(parent)
142 parents.append(cparent)
143 parent = parent.getparent()
# Replay the copied ancestors as 'start' events of the new fragment.
146 for parent in parents:
147 fragment.append('start', parent)
149 open_fragments[fragment.id] = fragment
151 # Close existing fragment
154 fragment = open_fragments[element.get('fid')]
# Diagnostic written to stdout (Python 2 print statement).
156 print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
# Move the fragment from the open set to the closed set.
158 closed_fragments[fragment.id] = fragment
159 del open_fragments[fragment.id]
161 # Append element tail to lost_text (we don't want to lose any text)
163 for fragment_id in open_fragments:
164 open_fragments[fragment_id].append('text', element.tail)
167 # Process all elements except begin and end
169 # Omit annotation tags
# Elements with a non-empty `name` attribute or class 'annotation'/'anchor'
# are skipped; only their tail text is forwarded to the open fragments.
170 if (len(element.get('name', '')) or
171 element.get('class', '') in ('annotation', 'anchor')):
172 if event == 'end' and element.tail:
173 for fragment_id in open_fragments:
174 open_fragments[fragment_id].append('text', element.tail)
# Every open fragment receives the event with a shallow copy of the element.
176 for fragment_id in open_fragments:
177 open_fragments[fragment_id].append(event, copy.copy(element))
179 return closed_fragments, open_fragments
# Insert <a> helper elements into the parent of `element`, at the position
# `element` had when the function was called: a link (href '#<prefix>',
# class 'anchor') and/or a named target (name '<prefix>', class 'target').
#
# NOTE(review): this listing skips original lines 185-186, 188 and 193-194,
# so the conditions on `with_link` / `with_target` and the default used
# when `link_text` is None are not visible (presumably the flags gate the
# two insertions -- confirm against the full file).
182 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
183 parent = element.getparent()
184 index = parent.index(element)
187 if link_text is None:
# Link element pointing at the target named `prefix`.
189 anchor = etree.Element('a', href='#%s' % prefix)
190 anchor.set('class', 'anchor')
# unicode() is the Python 2 builtin; callers pass ints as well as text.
191 anchor.text = unicode(link_text)
192 parent.insert(index, anchor)
# Target element carrying name=<prefix>.
195 anchor_target = etree.Element('a', name='%s' % prefix)
196 anchor_target.set('class', 'target')
197 anchor_target.text = u' '
# Inserted at the same saved index as the link above.
198 parent.insert(index, anchor_target)
# Walk the ancestors of `element`; `test` is a callable supplied by the
# caller (add_anchors and add_table_of_contents pass lambdas that take an
# element).
# NOTE(review): the loop body and the return statements (listing lines
# 203-206) are not visible -- presumably the function returns whether
# `test` holds for any ancestor; confirm against the full file.
201 def any_ancestor(element, test):
202 for ancestor in element.iterancestors():
# Add numbered "f<N>" anchors to verse and paragraph elements below `root`.
#
# NOTE(review): this listing skips original lines 209, 214-215, 219 and
# 222-223, so the initialisation and increments of `counter` and the
# statement executed for excluded elements are not visible.
208 def add_anchors(root):
210 for element in root.iterdescendants():
# Exclusion test: the element sits inside a note, motto, motto signature
# or dedication (by class), inside #nota_red, or inside a blockquote.
211 if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
212 or e.get('id') == 'nota_red'
213 or e.tag == 'blockquote'):
# Verses: an anchor only for counter value 1 and for multiples of 5.
216 if element.tag == 'p' and 'verse' in element.get('class', ''):
217 if counter == 1 or counter % 5 == 0:
218 add_anchor(element, "f%d" % counter, link_text=counter)
# Paragraphs: an anchor for every counter value.
220 elif 'paragraph' in element.get('class', ''):
221 add_anchor(element, "f%d" % counter, link_text=counter)
# Return the stripped text content of `element` as a unicode string.
# The work is done on a deep copy, so `element` itself is not modified.
225 def raw_printable_text(element):
226 working = copy.deepcopy(element)
# Only direct <a> children are examined (findall('a') is not recursive).
227 for e in working.findall('a'):
# NOTE(review): the statement applied to annotation / theme-begin links
# (listing line 229) is not visible here -- presumably it removes their
# text from the output; confirm against the full file.
228 if e.get('class') in ('annotation', 'theme-begin'):
# method='text' drops the markup; encoding=unicode (Python 2 builtin) makes
# tostring return a unicode string.
230 return etree.tostring(working, method='text', encoding=unicode).strip()
# Collect the h2/h3 headings below `root`, give each an "s<N>" target
# anchor and build a nested <ol> table of contents that links to them.
#
# NOTE(review): this listing skips original lines 234-235, 239-240, 244,
# 247-248, 258-259 and 264 onwards, so the initialisation of `sections`
# and `counter`, the `else` branch header, the counter increment and the
# final placement of `toc` are not visible.
233 def add_table_of_contents(root):
236 for element in root.iterdescendants():
237 if element.tag in ('h2', 'h3'):
# Exclusion test: heading inside #footnotes, #nota_red or a 'person-list'.
238 if any_ancestor(element, lambda e: e.get('id') in ('footnotes', 'nota_red') or e.get('class') in ('person-list',)):
241 element_text = raw_printable_text(element)
# Entries are (counter, tag, text, subsections) tuples. An h3 is attached
# to the last collected entry when that entry is an h2.
242 if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
243 sections[-1][3].append((counter, element.tag, element_text, []))
245 sections.append((counter, element.tag, element_text, []))
# Target only (no visible link) next to the heading itself.
246 add_anchor(element, "s%d" % counter, with_link=False)
249 toc = etree.Element('div')
251 toc_header = etree.SubElement(toc, 'h2')
# Heading text is Polish for "Table of contents".
252 toc_header.text = u'Spis treści'
253 toc_list = etree.SubElement(toc, 'ol')
255 for n, section, text, subsections in sections:
256 section_element = etree.SubElement(toc_list, 'li')
# Link only; the matching target was added next to the heading above.
257 add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
260 subsection_list = etree.SubElement(section_element, 'ol')
# The inner loop reuses the names `n` and `text` of the outer loop.
261 for n, subsection, text, _ in subsections:
262 subsection_element = etree.SubElement(subsection_list, 'li')
263 add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)
# Build a <div id="themes"> index of the themes found below `root` and
# insert it as the first child of `root`.
#
# NOTE(review): this listing skips original lines 269, 271, 273-274, 277
# and 291, so the try/except around the optional import, the
# initialisation of `book_themes` and the statement for markers without
# text are not visible.
268 def add_table_of_themes(root):
# `sortify` provides the sort key; the identity lambda below is the
# replacement (presumably used when the import fails).
270 from sortify import sortify
272 sortify = lambda x: x
# Each theme-begin marker lists its themes as comma-separated text.
275 for fragment in root.findall('.//a[@class="theme-begin"]'):
276 if not fragment.text:
278 theme_names = [s.strip() for s in fragment.text.split(',')]
# Group the markers' `name` attributes by theme name.
279 for theme_name in theme_names:
280 book_themes.setdefault(theme_name, []).append(fragment.get('name'))
# Python 2: dict.items() returns a list, which is then sorted in place by
# theme name.
281 book_themes = book_themes.items()
282 book_themes.sort(key=lambda s: sortify(s[0]))
283 themes_div = etree.Element('div', id="themes")
284 themes_ol = etree.SubElement(themes_div, 'ol')
# One <li> per theme: "<theme>: " followed by numbered links (1, 2, ...)
# to the theme's fragments.
285 for theme_name, fragments in book_themes:
286 themes_li = etree.SubElement(themes_ol, 'li')
287 themes_li.text = "%s: " % theme_name
288 for i, fragment in enumerate(fragments):
289 item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
290 item.text = str(i + 1)
292 root.insert(0, themes_div)
# Generator over the footnotes of the HTML file at `html_path`.
# Yields nothing when the document has no element with id "footnotes".
296 def extract_annotations(html_path):
297 """For each annotation, yields a tuple: anchor, text, html."""
298 parser = etree.HTMLParser(encoding='utf-8')
299 tree = etree.parse(html_path, parser)
300 footnotes = tree.find('//*[@id="footnotes"]')
301 if footnotes is not None:
# Only direct <div> children of the footnotes container are visited.
302 for footnote in footnotes.findall('div'):
# NOTE(review): find() returns None when the div has no direct <a name=...>
# child, in which case .get() raises AttributeError.
303 anchor = footnote.find('a[@name]').get('name')
# NOTE(review): listing line 304 is not visible here; it may modify
# `footnote` before it is serialized below -- confirm against the full file.
# Both serializations are UTF-8 encoded byte strings.
305 text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip()
306 html_str = etree.tostring(footnote, method='html', encoding='utf-8')
307 yield anchor, text_str, html_str