1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
11 from lxml import etree
12 from librarian import XHTMLNS, ParseError, OutputFile
13 from librarian import functions
15 from lxml.etree import XMLSyntaxError, XSLTApplyError
# Module-level setup: two registration helpers from librarian.functions are
# called here (presumably registering XSLT extension functions -- confirm in
# librarian.functions).
17 functions.reg_substitute_entities()
18 functions.reg_person_name()
# Stylesheet name -> XSLT path entries. get_stylesheet() looks a name up via
# STYLESHEETS[name] and joins the path onto this module's directory.
# NOTE(review): assumes these entries belong to the STYLESHEETS mapping.
21 'legacy': 'xslt/book2html.xslt',
22 'full': 'xslt/wl2html_full.xslt',
23 'partial': 'xslt/wl2html_partial.xslt'
def get_stylesheet(name):
    """Return the path of the stylesheet registered under ``name``.

    The relative path stored in ``STYLESHEETS`` is joined onto the directory
    that contains this module. A ``name`` missing from ``STYLESHEETS`` makes
    the lookup raise (``KeyError`` if it is a plain dict).
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
def html_has_content(text):
    """Look for ``p`` and ``h1`` elements in ``text``.

    Both the un-namespaced and the XHTML-namespaced spellings of each tag are
    searched for anywhere in the tree. The value returned is whatever the
    ETXPath evaluation yields; transform() uses it as a truth value.
    """
    expression = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
    find_content = etree.ETXPath(expression)
    return find_content(text)
def transform_abstrakt(abstrakt_element):
    """Render an abstrakt element to markup using the 'legacy' stylesheet.

    The element is serialized, re-parsed with every occurrence of the
    substring 'abstrakt' replaced by 'dlugi_cytat', and run through the
    stylesheet. Empty ``<a name="sec...">`` anchors and any ``blockquote``
    opening/closing tags are then stripped from the serialized result, which
    is returned as a string.
    """
    from cStringIO import StringIO

    stylesheet = etree.parse(get_stylesheet('legacy'))
    serialized = etree.tostring(abstrakt_element)
    # HACK: plain substring replacement on the serialized XML, so it touches
    # every occurrence of 'abstrakt', not only the tag name.
    renamed = serialized.replace('abstrakt', 'dlugi_cytat')
    document = etree.parse(StringIO(renamed))
    rendered = etree.tostring(document.xslt(stylesheet))
    without_anchors = re.sub('<a name="sec[0-9]*"/>', '', rendered)
    return re.sub('</?blockquote[^>]*>', '', without_anchors)
# transform(): run a WL document through one of the stylesheets known to
# get_stylesheet() and serialize the result as HTML.
#   wldoc      -- source document; the work is done on a copy.deepcopy() of it.
#   stylesheet -- name handed to get_stylesheet() (default 'legacy').
#   options    -- keyword arguments forwarded to document.transform(); a
#                 'gallery' entry with the value "''" is added when absent.
#   flags      -- NOTE(review): presumably an iterable of attribute names, each
#                 set to 'yes' on the document root; the loop header is not
#                 shown here -- confirm.
# Before the XSLT step the copy gets swap_endlines(), clean_ed_note() and
# clean_ed_note('abstrakt'). When html_has_content() reports content, anchors,
# the table of themes and the table of contents are added to the result root.
# The value returned is OutputFile.from_string() of the UTF-8, pretty-printed
# HTML serialization (presumably only in that same branch -- confirm).
# NOTE(review): the docstring mentions output_filename, which is not a
# parameter of this function -- it looks stale.
# NOTE(review): the ValueError message "'%s' is not a valid stylesheet." is
# raised without a % argument, so the literal '%s' reaches the caller;
# consider interpolating the stylesheet name.
# NOTE(review): `except (A, B), e` is Python 2-only syntax.
46 def transform(wldoc, stylesheet='legacy', options=None, flags=None):
47 """Transforms the WL document to XHTML.
49 If output_filename is None, returns an XML,
50 otherwise returns True if file has been written,False if it hasn't.
51 File won't be written if it has no content.
55 style_filename = get_stylesheet(stylesheet)
56 style = etree.parse(style_filename)
58 document = copy.deepcopy(wldoc)
60 document.swap_endlines()
64 document.edoc.getroot().set(flag, 'yes')
66 document.clean_ed_note()
67 document.clean_ed_note('abstrakt')
71 options.setdefault('gallery', "''")
72 result = document.transform(style, **options)
73 del document # no longer needed large object :)
75 if html_has_content(result):
76 add_anchors(result.getroot())
77 add_table_of_themes(result.getroot())
78 add_table_of_contents(result.getroot())
80 return OutputFile.from_string(etree.tostring(
81 result, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8'))
85 raise ValueError("'%s' is not a valid stylesheet.")
86 except (XMLSyntaxError, XSLTApplyError), e:
# Fragment: an ordered record of (event, element) pairs collected for one
# theme fragment, which can be turned back into markup text.
# NOTE(review): the attribute assignments of __init__ are not shown here; the
# methods below rely on self.events being a list, and extract_fragments()
# reads fragment.id.
90 class Fragment(object):
91 def __init__(self, id, themes):
92 super(Fragment, self).__init__()
# Record one event. extract_fragments() passes 'start', 'end' or 'text' as
# `event`; for 'text' the second item is a string rather than an element.
97 def append(self, event, element):
98 self.events.append((event, element))
# Return the recorded events followed by the ('end', element) entries left on
# a local stack (presumably closers for elements that were opened but never
# closed inside the fragment -- the branch conditions are not shown).
100 def closed_events(self):
102 for event, element in self.events:
104 stack.append(('end', element))
# Diagnostic for an 'end' event without a matching 'start' (Python 2 print).
109 print 'CLOSED NON-OPEN TAG:', element
112 return self.events + stack
# Serialization loop (presumably the body of to_string(), whose header is not
# shown): every item from closed_events() contributes an opening tag with its
# attributes, the element text, a closing tag, the element tail, or the item
# itself, and the pieces are joined into one string.
# NOTE(review): attribute values are interpolated into the tag without any
# escaping, and element.text / element.tail are appended as-is.
116 for event, element in self.closed_events():
118 result.append(u'<%s %s>' % (
119 element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
121 result.append(element.text)
123 result.append(u'</%s>' % element.tag)
125 result.append(element.tail)
127 result.append(element)
129 return ''.join(result)
# Python 2 text conversion hook: delegates to to_string().
131 def __unicode__(self):
132 return self.to_string()
# extract_fragments(): parse an HTML file and collect the content found
# between theme-begin / theme-end marker elements into Fragment objects keyed
# by the markers' 'fid' attribute.
# Returns a (closed_fragments, open_fragments) pair of dicts: fragments whose
# end marker was seen, and fragments still open when parsing finished.
135 def extract_fragments(input_filename):
136 """Extracts theme fragments from input_filename."""
138 closed_fragments = {}
140 # iterparse would die on a HTML document
# The file is therefore parsed with HTMLParser first; the [0][0] node of the
# root is re-serialized into an in-memory buffer that iterparse reads below.
141 parser = etree.HTMLParser(encoding='utf-8')
142 buf = cStringIO.StringIO()
143 buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
146 for event, element in etree.iterparse(buf, events=('start', 'end')):
147 # Process begin and end elements
148 if element.get('class', '') in ('theme-begin', 'theme-end'):
149 if not event == 'end':
150 continue # Process elements only once, on end event
# A begin marker opens a new Fragment: id comes from 'fid', themes from the
# marker's text.
153 if element.get('class', '') == 'theme-begin':
154 fragment = Fragment(id=element.get('fid'), themes=element.text)
# Walk up the ancestors until the element with id 'book-text' and keep deep
# copies of them, which are then replayed as 'start' events on the fragment.
# NOTE(review): the loop assumes a 'book-text' ancestor exists; getparent()
# returning None would make parent.get fail -- confirm the input guarantees it.
157 parent = element.getparent()
159 while parent.get('id', None) != 'book-text':
160 cparent = copy.deepcopy(parent)
162 parents.append(cparent)
163 parent = parent.getparent()
166 for parent in parents:
167 fragment.append('start', parent)
169 open_fragments[fragment.id] = fragment
171 # Close existing fragment
174 fragment = open_fragments[element.get('fid')]
# Diagnostic for an end marker whose fragment is not open (Python 2 print).
176 print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
178 closed_fragments[fragment.id] = fragment
179 del open_fragments[fragment.id]
181 # Append element tail to lost_text (we don't want to lose any text)
183 for fragment_id in open_fragments:
184 open_fragments[fragment_id].append('text', element.tail)
186 # Process all elements except begin and end
188 # Omit annotation tags
# Elements carrying a non-empty 'name' attribute or the class 'annotation' /
# 'anchor' are skipped themselves; only their tail text is kept.
189 if (len(element.get('name', '')) or
190 element.get('class', '') in ('annotation', 'anchor')):
191 if event == 'end' and element.tail:
192 for fragment_id in open_fragments:
193 open_fragments[fragment_id].append('text', element.tail)
# Every open fragment receives the event with a shallow copy of the element.
195 for fragment_id in open_fragments:
196 open_fragments[fragment_id].append(event, copy.copy(element))
198 return closed_fragments, open_fragments
# add_anchor(): insert helper <a> elements into element's parent, at the
# index element currently occupies (i.e. in front of it).
#   prefix    -- anchor identifier, used as '#prefix' in href and as the name
#                of the target.
#   link_text -- text of the visible link, converted with unicode().
# Two kinds of element are built: a link (<a href="#prefix" class="anchor">)
# and a target (<a name="prefix" class="target"> with a single-space text).
# NOTE(review): presumably with_link / with_target select which of the two is
# inserted, and a None link_text gets a default -- those statements are not
# shown here; confirm.
201 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
202 parent = element.getparent()
203 index = parent.index(element)
206 if link_text is None:
208 anchor = etree.Element('a', href='#%s' % prefix)
209 anchor.set('class', 'anchor')
210 anchor.text = unicode(link_text)
211 parent.insert(index, anchor)
214 anchor_target = etree.Element('a', name='%s' % prefix)
215 anchor_target.set('class', 'target')
216 anchor_target.text = u' '
217 parent.insert(index, anchor_target)
# any_ancestor(): walk the ancestors of element via iterancestors().
# NOTE(review): the loop body and return value are not shown here; callers use
# the result as a truth value, so presumably it reports whether test(ancestor)
# holds for any ancestor -- confirm.
220 def any_ancestor(element, test):
221 for ancestor in element.iterancestors():
# add_anchors(): walk all descendants of root and place numbered anchors
# (named "f<counter>", with the counter as link text) in front of verses and
# paragraphs.
# NOTE(review): the counter initialisation/increment and the statement that
# follows the any_ancestor() check are not shown here -- confirm.
227 def add_anchors(root):
229 for element in root.iterdescendants():
# Predicate for any_ancestor(): matches the classes note, motto, motto_podpis,
# dedication and frame, the id 'nota_red', and blockquote elements.
231 return e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication', 'frame') or \
232 e.get('id') == 'nota_red' or e.tag == 'blockquote'
233 if any_ancestor(element, f):
# <p> elements whose class contains 'verse' get an anchor only for counter 1
# and for multiples of 5.
236 if element.tag == 'p' and 'verse' in element.get('class', ''):
237 if counter == 1 or counter % 5 == 0:
238 add_anchor(element, "f%d" % counter, link_text=counter)
# Elements whose class contains 'paragraph' get an anchor every time.
240 elif 'paragraph' in element.get('class', ''):
241 add_anchor(element, "f%d" % counter, link_text=counter)
# raw_printable_text(): return the stripped text serialization (unicode) of a
# deep copy of element, so the element passed in is left untouched.
# Direct <a> children with class 'annotation' or 'theme-begin' are singled out
# first.
# NOTE(review): what is done to those children is not shown here (presumably
# their text is dropped from the output) -- confirm.
245 def raw_printable_text(element):
246 working = copy.deepcopy(element)
247 for e in working.findall('a'):
248 if e.get('class') in ('annotation', 'theme-begin'):
250 return etree.tostring(working, method='text', encoding=unicode).strip()
# add_table_of_contents(): collect the h2/h3 headings below root, give each a
# target anchor named "s<counter>", and build a <div> with an <h2> title and a
# nested <ol> list linking to those anchors.
# NOTE(review): counter handling, the statement after the any_ancestor()
# check, the condition guarding the nested list, and the insertion of the
# finished div are not shown here -- confirm.
253 def add_table_of_contents(root):
256 for element in root.iterdescendants():
257 if element.tag in ('h2', 'h3'):
# Headings with an ancestor whose id is 'footnotes' / 'nota_red' or whose
# class is 'person-list' are tested for here.
258 if any_ancestor(element,
259 lambda e: e.get('id') in ('footnotes', 'nota_red') or e.get('class') in ('person-list',)):
262 element_text = raw_printable_text(element)
# Each entry is (counter, tag, text, subsections); an h3 is filed under the
# most recent entry when that entry is an h2.
263 if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
264 sections[-1][3].append((counter, element.tag, element_text, []))
266 sections.append((counter, element.tag, element_text, []))
267 add_anchor(element, "s%d" % counter, with_link=False)
270 toc = etree.Element('div')
272 toc_header = etree.SubElement(toc, 'h2')
273 toc_header.text = u'Spis treści'
274 toc_list = etree.SubElement(toc, 'ol')
276 for n, section, text, subsections in sections:
277 section_element = etree.SubElement(toc_list, 'li')
# add_anchor() inserts into the parent of the element it is given, so the
# link lands in the list, in front of the <li>.
278 add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
281 subsection_list = etree.SubElement(section_element, 'ol')
282 for n1, subsection, subtext, _ in subsections:
283 subsection_element = etree.SubElement(subsection_list, 'li')
284 add_anchor(subsection_element, "s%d" % n1, with_target=False, link_text=subtext)
# add_table_of_themes(): build an index of themes and insert it as the first
# child of root.
# Theme names are read from the comma-separated text of every
# <a class="theme-begin"> element; each name maps to the 'name' attributes of
# the markers that mention it. The result is <div id="themes"> holding an <ol>
# with one <li> per theme ("<theme>: " followed by links numbered from 1),
# ordered by sortify() of the theme name.
# NOTE(review): the initialisation of book_themes and the statement taken for
# markers without text are not shown here -- confirm.
# NOTE(review): calling .sort() on the result of .items() relies on Python 2,
# where items() returns a list.
289 def add_table_of_themes(root):
291 from sortify import sortify
297 for fragment in root.findall('.//a[@class="theme-begin"]'):
298 if not fragment.text:
300 theme_names = [s.strip() for s in fragment.text.split(',')]
301 for theme_name in theme_names:
302 book_themes.setdefault(theme_name, []).append(fragment.get('name'))
303 book_themes = book_themes.items()
304 book_themes.sort(key=lambda s: sortify(s[0]))
305 themes_div = etree.Element('div', id="themes")
306 themes_ol = etree.SubElement(themes_div, 'ol')
307 for theme_name, fragments in book_themes:
308 themes_li = etree.SubElement(themes_ol, 'li')
309 themes_li.text = "%s: " % theme_name
310 for i, fragment in enumerate(fragments):
311 item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
312 item.text = str(i + 1)
314 root.insert(0, themes_div)
# extract_annotations(): generator over the footnotes of an HTML file.
# The file is parsed with HTMLParser (utf-8) and the element with
# id="footnotes" is located; nothing is yielded when it is absent.
# For every direct <div> child of that element:
#   - fn_type is the second '-'-separated part of the div's class,
#   - anchor is the href of its <a class="annotation"> minus the first
#     character (the leading '#', presumably),
#   - a tail of exactly '\n' on the last child is dropped before serializing,
#   - text_str / html_str are the stripped text and HTML serializations,
#   - the text is matched against a pattern capturing a parenthesised group
#     that precedes an em dash (U+2014); the group is split on ';' and ',' and
#     each candidate (or the word after a leading 'z ') is kept when it is in
#     FN_QUALIFIERS.
# NOTE(review): a div without a class attribute, without a '-' in its class,
# or without an annotation link would raise here -- confirm the input
# guarantees them.
# NOTE(review): the handling of a failed match and the initialisation of
# `qualifiers` are not shown here; ur'' literals are Python 2-only syntax.
317 def extract_annotations(html_path):
318 """Extracts annotations from HTML for annotations dictionary.
320 For each annotation, yields a tuple of:
321 anchor, footnote type, valid qualifiers, text, html.
324 from .fn_qualifiers import FN_QUALIFIERS
326 parser = etree.HTMLParser(encoding='utf-8')
327 tree = etree.parse(html_path, parser)
328 footnotes = tree.find('//*[@id="footnotes"]')
329 re_qualifier = re.compile(ur'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
330 if footnotes is not None:
331 for footnote in footnotes.findall('div'):
332 fn_type = footnote.get('class').split('-')[1]
333 anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
336 if len(footnote) and footnote[-1].tail == '\n':
337 footnote[-1].tail = None
338 text_str = etree.tostring(footnote, method='text', encoding=unicode).strip()
339 html_str = etree.tostring(footnote, method='html', encoding=unicode).strip()
341 match = re_qualifier.match(text_str)
343 qualifier_str = match.group(1)
345 for candidate in re.split('[;,]', qualifier_str):
346 candidate = candidate.strip()
347 if candidate in FN_QUALIFIERS:
348 qualifiers.append(candidate)
349 elif candidate.startswith('z '):
350 subcandidate = candidate.split()[1]
351 if subcandidate in FN_QUALIFIERS:
352 qualifiers.append(subcandidate)
356 yield anchor, fn_type, qualifiers, text_str, html_str