# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
import copy
import os

from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError

from librarian import XHTMLNS, ParseError
from librarian import functions
from librarian.parser import WLDocument
# Module-level side effect executed at import time.
# NOTE(review): presumably registers the entity-substitution extension
# function used by the XSLT stylesheets -- confirm in librarian.functions.
functions.reg_substitute_entities()
# Maps a stylesheet name to the path of its XSLT file, relative to the
# directory that contains this module.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt',
}
def get_stylesheet(name):
    """Return the filesystem path of the XSLT stylesheet registered as *name*.

    The path is built by joining the directory of this module with the
    relative path stored under *name* in ``STYLESHEETS``.

    Raises:
        KeyError: if *name* is not a key of ``STYLESHEETS``.
    """
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, STYLESHEETS[name])
def html_has_content(text):
    """Select the ``p`` and ``h1`` elements found anywhere in *text*.

    Both un-namespaced elements and elements in the ``XHTMLNS`` namespace
    are matched.  The XPath result is returned unchanged; callers rely on
    its truthiness to decide whether the document has any content.
    """
    query = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
    return etree.ETXPath(query)(text)
def transform(input, output_filename=None, is_file=True,
              parse_dublincore=True, stylesheet='legacy', options=None):
    """Transform a document from XML into XHTML.

    Args:
        input: path of the XML file when *is_file* is true, otherwise the
            XML source itself.
        output_filename: file to write the XHTML to; ``None`` means the
            result is returned instead of being written.
        is_file: selects ``WLDocument.from_file`` (true) or
            ``WLDocument.from_string`` (false) for loading *input*.
        parse_dublincore: forwarded to the ``WLDocument`` loader.
        stylesheet: name of the XSLT stylesheet, resolved by
            ``get_stylesheet``.
        options: keyword arguments forwarded to ``document.transform``;
            ``None`` means no extra arguments.

    Returns:
        With *output_filename* given: ``True`` if the file has been written,
        ``False`` if it has not.  The file is not written when the result
        has no content.
        With *output_filename* ``None``: the transformed tree, or a
        placeholder string when the result has no content.

    Raises:
        ValueError: *stylesheet* is not a valid stylesheet name.
        ParseError: the XML could not be parsed or the XSLT failed.
    """
    # A fresh dict per call; a shared ``{}`` default would leak state
    # between calls if it were ever mutated.
    if options is None:
        options = {}

    # Only the stylesheet lookup may legitimately raise KeyError here, so
    # only that lookup is translated into "invalid stylesheet".
    try:
        style_filename = get_stylesheet(stylesheet)
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)

    try:
        style = etree.parse(style_filename)

        if is_file:
            document = WLDocument.from_file(
                input, True, parse_dublincore=parse_dublincore)
        else:
            document = WLDocument.from_string(
                input, True, parse_dublincore=parse_dublincore)

        document.clean_ed_note()

        result = document.transform(style, **options)
        del document  # large object, no longer needed

        if html_has_content(result):
            add_anchors(result.getroot())
            add_table_of_contents(result.getroot())

            if output_filename is None:
                return result
            result.write(output_filename, method='html',
                         xml_declaration=False, pretty_print=True,
                         encoding='utf-8')
            return True

        if output_filename is not None:
            return False
        # NOTE(review): placeholder returned for content-less documents;
        # confirm that callers expect exactly this string.
        return "<empty />"
    except (XMLSyntaxError, XSLTApplyError) as e:
        # NOTE(review): wrapping in ParseError is inferred from the import
        # of ParseError -- confirm against callers.
        raise ParseError(e)
class Fragment(object):
    """A theme fragment recorded as a flat list of parse events.

    Each recorded event is an ``(event, element)`` pair.  For ``'start'``
    and ``'end'`` events *element* is an element object (``tag``,
    ``attrib``, ``text`` and ``tail`` are read from it); for any other
    event name *element* is a piece of text that is emitted as is.
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        # Recorded (event, element) pairs, in document order.
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair at the end of the fragment."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the recorded events followed by the missing ``'end'`` events.

        Every ``'start'`` without a matching ``'end'`` gets an ``'end'``
        event appended, innermost element first, so that the sequence is
        balanced.  ``self.events`` itself is not modified.
        """
        pending = []
        for event, element in self.events:
            if event == 'start':
                pending.append(('end', element))
            elif event == 'end':
                try:
                    pending.pop()
                except IndexError:
                    # An 'end' with nothing open: report it and carry on.
                    # Single-argument form prints identically on Python 2
                    # and Python 3.
                    print('CLOSED NON-OPEN TAG: %s' % (element,))

        # Elements opened last have to be closed first.
        pending.reverse()
        return self.events + pending

    def to_string(self):
        """Serialize the balanced event sequence into a markup string.

        Attribute values and text are written without escaping.
        """
        result = []
        for event, element in self.closed_events():
            if event == 'start':
                attributes = ' '.join(
                    '%s="%s"' % (k, v) for k, v in element.attrib.items())
                result.append(u'<%s %s>' % (element.tag, attributes))
                if element.text:
                    result.append(element.text)
            elif event == 'end':
                result.append(u'</%s>' % element.tag)
                if element.tail:
                    result.append(element.tail)
            else:
                # Text event: ``element`` is the text itself.
                result.append(element)

        return ''.join(result)

    def __unicode__(self):
        return self.to_string()
def extract_fragments(input_filename):
    """Extract theme fragments from input_filename.

    The file is parsed incrementally.  An element with class
    ``theme-begin`` opens a fragment identified by its ``fid`` attribute
    (the element text is stored as the fragment's themes); an element with
    class ``theme-end`` and the same ``fid`` closes it.  Events seen in
    between are recorded in every fragment that is open at that moment.

    Returns:
        A ``(closed_fragments, open_fragments)`` pair of dicts mapping
        fragment id to ``Fragment``.  ``open_fragments`` holds fragments
        that were never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    for event, element in etree.iterparse(input_filename, events=('start', 'end')):
        element_class = element.get('class', '')

        if element_class in ('theme-begin', 'theme-end'):
            # Theme markers are handled only once, on their 'end' event.
            if event != 'end':
                continue

            if element_class == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Collect the ancestors up to (excluding) #book-text so the
                # fragment starts inside the same nesting as the marker.
                # The walk also stops at the root, so a document without
                # #book-text does not fail on ``None.get``.
                parents = []
                parent = element.getparent()
                while parent is not None and parent.get('id', None) != 'book-text':
                    parents.append(parent)
                    parent = parent.getparent()

                # Collected innermost first; open them outermost first.
                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment
            else:
                # Close an existing fragment.
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    # Single-argument form prints identically on Python 2
                    # and Python 3.
                    print('%s:closed not open fragment #%s' % (input_filename, element.get('fid')))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # The marker itself is not recorded, but the text following it
            # must not be lost.
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Elements carrying a non-empty ``name`` attribute and annotations
        # are omitted; only the text following them is kept.
        elif len(element.get('name', '')) or element_class == 'annotation':
            if event == 'end' and element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Every other element is recorded in all open fragments.
        else:
            for fragment_id in open_fragments:
                open_fragments[fragment_id].append(event, copy.copy(element))

    return closed_fragments, open_fragments
def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
    """Insert anchor ``a`` elements at the beginning of *element*.

    Args:
        element: element that receives the anchors as its first children.
        prefix: anchor name; used as ``#prefix`` in the link and as the
            ``name`` attribute of the target.
        with_link: insert ``<a class="anchor" href="#prefix">``.
        with_target: insert ``<a class="target" name="prefix">``.
        link_text: text of the link; defaults to *prefix*.

    When both are requested the target ends up before the link.  Leading
    text of *element* is moved into the tail of the inserted anchor so it
    stays after the anchor in document order.
    """
    if with_link:
        if link_text is None:
            link_text = prefix
        anchor = etree.Element('a', href='#%s' % prefix)
        anchor.set('class', 'anchor')
        # Text conversion that works on both Python 2 (same result as
        # ``unicode(link_text)``) and Python 3.
        anchor.text = u'%s' % (link_text,)
        if element.text:
            anchor.tail = element.text
            element.text = u''
        element.insert(0, anchor)

    if with_target:
        anchor_target = etree.Element('a', name='%s' % prefix)
        anchor_target.set('class', 'target')
        anchor_target.text = u' '
        if element.text:
            anchor_target.tail = element.text
            element.text = u''
        element.insert(0, anchor_target)
def any_ancestor(element, test):
    """Return True if *test* holds for at least one ancestor of *element*.

    Ancestors are taken from ``element.iterancestors()``; *element* itself
    is not tested.  Returns False when there is no matching ancestor.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
def add_anchors(root):
    """Number verses and paragraphs below *root* and add anchors to them.

    Elements are counted in document order, starting from 1.  A ``p`` whose
    class contains ``verse`` is counted every time but receives an anchor
    only when its number is 1 or a multiple of 5; any other element whose
    class contains ``paragraph`` is counted and always receives an anchor.
    Anchors are named ``f<number>`` and show the number as link text.

    Elements nested inside notes, mottos, dedications, the ``nota_red``
    element or a ``blockquote`` are skipped and not counted.
    """
    skipped_classes = ('note', 'motto', 'motto_podpis', 'dedication')

    def is_skipped_container(e):
        return (e.get('class') in skipped_classes
                or e.get('id') == 'nota_red'
                or e.tag == 'blockquote')

    counter = 1
    for element in root.iterdescendants():
        if any_ancestor(element, is_skipped_container):
            continue

        element_class = element.get('class', '')
        if element.tag == 'p' and 'verse' in element_class:
            if counter == 1 or counter % 5 == 0:
                add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
        elif 'paragraph' in element_class:
            add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
def add_table_of_contents(root):
    """Build a table of contents from the ``h2``/``h3`` headings below *root*.

    Every heading that is not inside the ``footnotes`` element or a
    ``person-list`` element is numbered in document order and gets a target
    anchor named ``s<number>``.  An ``h3`` directly following an ``h2``
    section becomes a subsection of that section; every other heading
    becomes a top-level entry.  The table is a ``div`` holding an ``h2``
    header and an ordered list of links to the anchors.
    """
    # Each entry is (number, tag, heading text, list of subsection entries).
    sections = []
    counter = 1
    for element in root.iterdescendants():
        if element.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(element, lambda e: e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)):
            continue

        # Read the heading text before add_anchor rearranges the element.
        entry = (counter, element.tag, ''.join(element.xpath('text()')), [])
        if element.tag == 'h3' and sections and sections[-1][1] == 'h2':
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(element, "s%d" % counter, with_link=False)
        counter += 1

    toc = etree.Element('div')
    # NOTE(review): the id given to the container is reconstructed --
    # confirm against the stylesheets/CSS that style the table of contents.
    toc.set('id', 'toc')
    toc_header = etree.SubElement(toc, 'h2')
    toc_header.text = u'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    for number, _tag, text, subsections in sections:
        section_element = etree.SubElement(toc_list, 'li')
        add_anchor(section_element, "s%d" % number, with_target=False, link_text=text)

        if subsections:
            subsection_list = etree.SubElement(section_element, 'ol')
            for sub_number, _sub_tag, sub_text, _ in subsections:
                subsection_element = etree.SubElement(subsection_list, 'li')
                add_anchor(subsection_element, "s%d" % sub_number, with_target=False, link_text=sub_text)

    # NOTE(review): attaching the table as the first child of *root* is
    # reconstructed -- confirm the intended position.
    root.insert(0, toc)