1 """ Transform TeXML SAX stream """
2 # $Id: handler.py,v 1.14 2006-06-14 04:45:06 olpa Exp $
5 from xml.sax.handler import feature_namespaces
7 from Texml import texmlwr
8 from Texml import specmap
11 # Unbreakable spaces should not be deleted by strip(), but it happens:
12 # http://uucode.com/blog/2010/06/01/python-wtf-strip-eats-too-much/
13 # The solution from the web page does not work with old versions
14 # of python, therefore let's define a fallback functionality.
# Probe whether str.strip() accepts an explicit character set and, if so,
# remember that set as the argument tuple used by the *strip() calls below.
# NOTE(review): listing lines 15 and 18-19 are not shown in this view. The
# comment above describes a fallback, so these statements are presumably
# wrapped in a try/except that assigns an alternative strip_args -- confirm.
# NOTE(review): no `import string` is visible in this view -- confirm it
# exists in the full file.
16 "dummy".strip(string.whitespace)
17 strip_args = (string.whitespace, )
22 # The TeXML SAX handler works correctly, but awkwardly, when the SAX parser
23 # reports characters in several calls instead of one call.
24 # This wrapper fixes the problem.
30 Wrapper class to make the library easier to use.
32 See the above notes for use.
def parse_file(self, texml_writer, read_obj, use_context):
    """Parse a TeXML document and drive the writer from the SAX events.

    texml_writer -- writer object handed to the SAX content handler
    read_obj     -- input source, passed unchanged to the parser's parse()
    use_context  -- flag forwarded to the content handler
    """
    content_handler = glue_handler(texml_writer, use_context)

    sax_parser = xml.sax.make_parser()
    # With namespace processing on, elements are reported through the
    # startElementNS/endElementNS callbacks of the content handler.
    sax_parser.setFeature(feature_namespaces, 1)
    sax_parser.setContentHandler(content_handler)
    # External general entities are included while parsing.
    external_entities = "http://xml.org/sax/features/external-general-entities"
    sax_parser.setFeature(external_entities, True)

    sax_parser.parse(read_obj)
# Exception type raised by the handler's invalid_xml() and
# invalid_xml_other() methods when the input is rejected.
# NOTE(review): the class body (listing lines 52-56) is not shown in this view.
51 class InvalidXmlException(Exception):
58 class glue_handler(xml.sax.ContentHandler):
61 Not really a public class. Use ParseFile instead.
# Create the wrapped Handler and remember which XML namespace is accepted.
# name_space defaults to the TeXML namespace URI.
65 def __init__(self, texml_writer, use_context,
66 name_space = 'http://getfo.sourceforge.net/texml/ns1'):
# All element and text events are delegated to this Handler instance.
67 self.h = Handler(texml_writer, use_context)
# NOTE(review): listing lines 68-69 are not shown. Other methods read self.c,
# so it is presumably initialised here -- confirm.
70 self.__name_space = name_space
71 self.__current_name_space = None
# SAX callback: forwards the start-of-document event to the wrapped Handler.
73 def startDocument(self):
# NOTE(review): listing line 74 is not shown.
75 self.h.startDocument()
# NOTE(review): listing lines 76-78 are not shown. The call below hands the
# buffered text self.c to the Handler; it presumably belongs to a separate
# helper method whose `def` line is missing from this view -- confirm.
79 self.h.characters(self.c)
# SAX callback for the end of the document.
# NOTE(review): the body (listing lines 83-85) is not shown in this view.
82 def endDocument(self):
# Forwards a start-element event (plain name, no namespace) to the wrapped
# Handler.
# NOTE(review): the "_off" suffix suggests this callback is deliberately
# disabled in favour of startElementNS -- confirm.
87 def startElement_off(self, name, attrs):
# NOTE(review): listing line 88 is not shown.
89 self.h.startElement(name, attrs)
def setDocumentLocator(self, locator):
    """Keep the SAX locator so later callbacks can query line and column."""
    self.locator = locator
# SAX callback for the start of an element when namespace processing is on.
# Records the parser position and the element namespace on the wrapped
# Handler, then either forwards the element or reports it as invalid.
94 def startElementNS(self, name, qname, attrs):
95 # change attrs to regular dictionary
# NOTE(review): listing lines 96-100 are not shown; the_attrs, att and value
# are presumably set up there (a loop over attrs) -- confirm.
101 the_attrs[att] = value
# NOTE(review): listing lines 102-103 are not shown; name_space is presumably
# extracted from `name` there -- confirm.
104 self.__current_name_space = name_space
# NOTE(review): listing line 105 is not shown; local_name is presumably
# defined there -- confirm.
107 # get the column and line number and use the handler
108 col_num = self.locator.getColumnNumber()
109 line_num = self.locator.getLineNumber()
110 self.h.set_location(col_num, line_num)
111 self.h.set_namespace(name_space)
# Elements in the configured namespace, or without a namespace, are accepted.
113 if name_space == self.__name_space or name_space == None:
# NOTE(review): listing line 114 is not shown.
115 self.h.startElement(local_name, the_attrs)
116 # report an error and quit
# NOTE(review): listing line 117 is not shown; presumably an `else:` so that
# only elements from other namespaces are rejected -- confirm.
118 self.h.invalid_xml(local_name)
# Forwards an end-element event (plain name, no namespace) to the wrapped
# Handler.
# NOTE(review): the "_off" suffix suggests this callback is deliberately
# disabled in favour of endElementNS -- confirm.
121 def endElement_off(self, name):
# NOTE(review): listing line 122 is not shown.
123 self.h.endElement(name)
# SAX callback for the end of an element when namespace processing is on.
# Updates the Handler's location, then forwards the event for elements in the
# configured namespace or in no namespace.
125 def endElementNS(self, name, qname):
126 col_num = self.locator.getColumnNumber()
127 line_num = self.locator.getLineNumber()
128 self.h.set_location(col_num, line_num)
# NOTE(review): listing lines 129-130 are not shown; name_space and local_name
# are presumably extracted from `name` there -- confirm.
131 if name_space == self.__name_space or name_space == None:
# NOTE(review): listing line 132 is not shown.
133 self.h.endElement(local_name)
# NOTE(review): no handling of other namespaces is visible here, unlike
# startElementNS which calls invalid_xml -- confirm this is intended.
# SAX callback for processing instructions.
137 def processingInstruction(self, target, data):
# NOTE(review): listing line 138 is not shown; it presumably holds the only
# statement of this method -- confirm.
139 # No action. The only effect is that chunk
140 # ... aa <!-- xx --> bb ...
141 # is reported twice ('... aa ' and ' bb ...')
142 # instead of once ('... aa bb ...')
# SAX callback for character data: records the parser position on the Handler
# and accumulates the text in self.c instead of forwarding it immediately.
144 def characters(self, content):
145 col_num = self.locator.getColumnNumber()
146 line_num = self.locator.getLineNumber()
147 self.h.set_location(col_num, line_num)
# NOTE(review): listing lines 148-150 are not shown; presumably they handle
# the case where self.c holds no text yet -- confirm.
151 self.c = self.c + content
153 # WhiteSpace (WS) elimination
154 # In most cases, WS around tags (both opening and closing) are removed.
155 # But these tags save ws: <ctrl/> and <spec/>.
156 # WS processing is allowed or disallowed by "process_ws".
161 Not really a public class.
163 Handles the infile, using the glue_handler class to get the data as
164 elements or characters.
171 # text_is_only_spaces
173 # Whitespace handling:
179 # For <env/> support:
185 # For <cmd/> support:
186 # has_parm # Stacking is not required: if <cmd/> is in <cmd/>,
187 # # then it is wrapped by <parm/> or <opt/>
# Store the writer and the ConTeXt flag, reset the parsing state and build
# the dictionaries that map element names to start/end handler methods.
189 def __init__(self, texml_writer, use_context):
190 """ Create writer, create maps """
191 self.__use_context = use_context
192 # Paul Tremblay added this on 2005-03-08
193 self.writer = texml_writer
# Stacks that save the enclosing environment's name and end command.
194 self.cmdname_stack = []
195 self.endenv_stack = []
# NOTE(review): listing lines 196-198 are not shown; attributes read elsewhere
# (cmdname, endenv, has_parm) are presumably initialised there -- confirm.
199 self.no_text_content = 0
200 self.text_is_only_spaces = 0
# NOTE(review): listing line 201 is not shown (presumably process_ws).
202 self.process_ws_stack = []
# NOTE(review): listing line 203 is not shown (presumably nl_spec).
204 self.nl_spec_stack = []
205 self.__name_space = None
207 # Create handler maps
# Each model maps the element names allowed in a context to their handlers.
209 self.model_nomath = {
210 'TeXML': self.on_texml,
# NOTE(review): listing lines 211-212 are not shown.
213 'group': self.on_group,
214 'ctrl': self.on_ctrl,
215 'spec': self.on_spec,
# NOTE(review): listing lines 216-217 are not shown; the closing brace of the
# dictionary is presumably among them -- confirm.
# General content additionally allows the two math elements.
218 self.model_content = self.model_nomath.copy()
219 self.model_content['math'] = self.on_math
220 self.model_content['dmath'] = self.on_dmath
# NOTE(review): listing lines 221-224 are not shown; self.model_cmd, used
# below, is presumably defined there -- confirm.
225 self.model_env = self.model_content.copy() # copy, so that == is true only for the environment model, not for any tag that shares model_content
226 self.model_env.update(self.model_cmd)
# <opt/> and <parm/> share the very same dictionary object as model_content.
227 self.model_opt = self.model_content
228 self.model_parm = self.model_content
# End-of-element handlers, looked up by element name in endElement().
229 self.end_handlers = {
230 'TeXML': self.on_texml_end,
231 'cmd': self.on_cmd_end,
232 'env': self.on_env_end,
233 'group': self.on_group_end,
234 'ctrl': self.on_ctrl_end,
235 'spec': self.on_spec_end,
236 'opt': self.on_opt_end,
237 'parm': self.on_parm_end,
238 'math': self.on_math_end,
239 'dmath': self.on_dmath_end,
240 'pdf': self.on_pdf_end
# NOTE(review): listing line 241 (presumably the closing brace) is not shown.
# Remember the parser position used in the 'Invalid XML' error messages.
243 def set_location(self, col, line):
# NOTE(review): listing line 244 is not shown; it presumably stores `col`
# (error messages read self.__col_num) -- confirm.
245 self.__line_num = line
def set_namespace(self, name):
    """Remember the namespace of the element being processed.

    The stored value is used by invalid_xml() when building its message.
    """
    self.__name_space = name
# Report an unexpected element: build a message containing the stored column
# and line numbers and raise InvalidXmlException.
250 def invalid_xml(self, local_name):
251 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
# The namespace is mentioned only when one has been recorded.
252 if self.__name_space:
253 msg += 'Element "%s" for namespace "%s" not expected' % (local_name, self.__name_space)
# NOTE(review): listing line 254 is not shown; presumably an `else:` so that
# only one of the two descriptions is appended -- confirm.
255 msg += '%s not expected' % (local_name)
257 raise InvalidXmlException(msg)
def invalid_xml_other(self, msg):
    """Raise InvalidXmlException carrying the ready-made message ``msg``.

    Used for invalid-XML conditions other than an unexpected element.
    """
    error = InvalidXmlException(msg)
    raise error
263 # -------------------------------------------------------------------
def startDocument(self):
    """Reset the content-model state before a document is parsed."""
    # At the top level only the root <TeXML/> element has a handler.
    root_model = dict(TeXML=self.on_texml)
    self.model = root_model
    # No enclosing content models have been saved yet.
    self.model_stack = list()
def endDocument(self):
    """Finish the document by asking the writer for a conditional newline."""
    out = self.writer
    out.conditionalNewline()
# Dispatch the start of an element through the current content model
# (a dictionary mapping element names to handler methods).
274 def startElement(self, name, attrs):
275 """ Handle start of an element"""
276 if name in self.model:
277 self.model[name](attrs)
# NOTE(review): listing line 278 is not shown; presumably an `else:` so that
# only names missing from the model are reported as invalid -- confirm.
279 self.invalid_xml(name)
# Validate text content against the current element's rules, apply the
# whitespace handling and pass the text to the writer.
281 def characters(self, content):
282 """ Handle text data """
284 # First, check if content allowed at all
286 # Elements like <spec/> should be empty
287 if self.no_text_content:
288 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
# NOTE(review): under Python 3, encode() returns bytes, so the message would
# show a b'...' literal -- confirm the targeted Python version.
289 msg += "Text content is not expected: '%s'" % content.encode('latin-1', 'replace')
290 self.invalid_xml_other(msg)
291 # Element <cmd/> should not have text content,
292 # but it also may contain spaces due to indenting
293 # Element <env/> may have <opt/> and <parm/>, so we do
294 # magic to delete whitespace at beginning of environment
295 if self.text_is_only_spaces:
296 stripped = content.lstrip(*strip_args)
# Anything left after stripping leading whitespace is real text: reject it.
297 if 0 != len(stripped):
298 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
299 msg += "Only whitespaces are expected, not text content: '%s'" % content.encode('latin-1', 'replace')
300 self.invalid_xml_other(msg)
# NOTE(review): listing lines 301-302 are not shown; presumably the
# whitespace-only case ends here without writing anything -- confirm.
303 # Eliminate whitespaces
# NOTE(review): listing lines 304-306 are not shown; the stripping below is
# presumably conditional on the whitespace-processing flag -- confirm.
307 content2 = content.lstrip(*strip_args)
# Removed leading whitespace is replaced by a weak whitespace in the output.
308 if len(content2) != len(content):
309 self.writer.writeWeakWS()
310 content = content2.rstrip(*strip_args)
311 if len(content2) != len(content):
# NOTE(review): listing lines 312-313 are not shown; the body of the `if`
# above (trailing whitespace was removed) is missing from this view.
314 # Finally, write content
316 self.writer.write(content)
# NOTE(review): listing line 317 is not shown; the call below is presumably
# conditional on trailing whitespace having been stripped -- confirm.
318 self.writer.writeWeakWS()
# Dispatch the end of an element to the handler registered in end_handlers.
320 def endElement(self, name):
321 """ Handle end of an element """
322 self.end_handlers[name]()
# NOTE(review): listing line 323 is not shown; the content model pushed by the
# element's start handler is presumably restored there -- confirm.
# Push the current content model on model_stack before a child element's
# model takes over.
325 def stack_model(self, model):
326 """ Remember content model of parent and set model for current node """
327 self.model_stack.append(self.model)
# NOTE(review): listing line 328 is not shown; per the docstring it presumably
# installs `model` as the current model -- confirm.
def unstack_model(self):
    """Reinstate the parent's content model saved on model_stack."""
    parent_model = self.model_stack.pop()
    self.model = parent_model
334 # -----------------------------------------------------------------
# Read a '0'/'1' attribute from attrs and report other values as invalid XML.
336 def get_boolean(self, attrs, aname, default):
337 """ Returns true if value of attribute "aname" is "1", false if "0" and None if attribute not exists. Raises error in other cases."""
338 aval = attrs.get(aname, None)
# NOTE(review): listing lines 339-344 are not shown; the handling of a missing
# attribute (and of `default`) and of the values '1' and '0' is presumably
# there -- confirm.
345 raise ValueError("Value of boolean attribute '%s' is not '0' or '1', but '%s'" % (aname, aval))
# NOTE(review): listing line 346 is not shown. As listed, the statements below
# follow a `raise`; how the two error paths are separated cannot be determined
# from this view -- confirm.
347 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
348 msg += "Value of boolean attribute '%s' is not '0' or '1', but '%s'" % (aname, aval)
349 self.invalid_xml_other(msg)
# Start of <TeXML/>: switch to the general content model and push the mode,
# emptylines, escape, ligatures and whitespace settings taken from attributes.
351 def on_texml(self, attrs):
352 """ Handle TeXML element """
353 self.stack_model(self.model_content)
355 # Set new mode ("text" or "math")
# NOTE(review): the local name `str` shadows the builtin inside this method.
357 str = attrs.get('mode', None)
# NOTE(review): listing line 358 is not shown (presumably the test for a
# missing @mode attribute).
359 mode = texmlwr.DEFAULT
# NOTE(review): listing lines 360-364 are not shown; the recognised @mode
# values are presumably mapped there, with the error below as the fallback.
365 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
366 msg += "Unknown value of TeXML/@mode attribute: '%s'" % str
367 self.invalid_xml_other(msg)
# Optional boolean attributes; get_boolean is called with None as default.
368 emptylines = self.get_boolean(attrs, 'emptylines', None)
369 escape = self.get_boolean(attrs, 'escape', None)
370 ligatures = self.get_boolean(attrs, 'ligatures', None)
# The writer keeps a stack per setting; on_texml_end() pops them again.
371 self.writer.stack_mode(mode)
372 self.writer.stack_emptylines(emptylines)
373 self.writer.stack_escape(escape)
374 self.writer.stack_ligatures(ligatures)
375 ws = self.get_boolean(attrs, 'ws', None)
376 self.process_ws_stack.append(self.process_ws)
# NOTE(review): listing line 377 is not shown; the two statements below are
# presumably executed only when @ws is present -- confirm.
# Whitespace processing is enabled exactly when ws is 0.
378 self.process_ws = 0 == ws
379 self.writer.set_allow_weak_ws_to_nl(not ws)
def on_texml_end(self):
    """Handle the end of a TeXML element by restoring the previous settings."""
    out = self.writer
    # Pop the writer's setting stacks.
    out.unstack_ligatures()
    out.unstack_escape()
    out.unstack_emptylines()
    out.unstack_mode()
    # Restore the whitespace-processing flag and tell the writer about it.
    restored_ws = self.process_ws_stack.pop()
    self.process_ws = restored_ws
    out.set_allow_weak_ws_to_nl(restored_ws)
390 # -----------------------------------------------------------------
# Start of <cmd/>: write a backslash followed by the command name and set up
# the state used while the command's children are processed.
392 def on_cmd(self, attrs):
393 """ Handle 'cmd' element """
394 self.stack_model(self.model_cmd)
396 # Get name of the command
398 name = attrs.get('name', '')
# NOTE(review): listing line 399 is not shown; presumably the test for an
# empty name that guards the error below -- confirm.
400 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
401 msg += "Attribute cmd/@name is empty"
402 self.invalid_xml_other(msg)
# @nl1 requests a conditional newline before the command; 0 is passed as default.
403 if self.get_boolean(attrs, 'nl1', 0):
404 self.writer.conditionalNewline()
405 self.writer.writech('\\', 0)
406 self.writer.write(name, 0)
408 # Setup in-cmd processing
# NOTE(review): listing lines 409-410 are not shown; has_parm, read in
# on_cmd_end(), is presumably reset there -- confirm.
# Inside <cmd/>, only whitespace is accepted as text.
411 self.text_is_only_spaces = 1
# Save the enclosing nl_spec; for a command it holds the (@nl2, @gr) pair.
412 self.nl_spec_stack.append(self.nl_spec)
413 self.nl_spec = (self.get_boolean(attrs, 'nl2', 0), self.get_boolean(attrs, 'gr', 1))
# End of <cmd/>: leave whitespace-only mode, restore the enclosing nl_spec
# and terminate the command in the output.
415 def on_cmd_end(self):
416 self.text_is_only_spaces = 0
418 # Write additional space or newline if command has no parameters
# nl_spec was set by on_cmd() to the (@nl2, @gr) pair.
420 (nl, gr) = self.nl_spec
421 self.nl_spec = self.nl_spec_stack.pop()
422 if not(self.has_parm):
# NOTE(review): listing lines 423, 425 and 427 are not shown; the conditions
# that choose between '{}', a weak whitespace and a conditional newline
# (presumably based on gr and nl) are missing from this view -- confirm.
424 self.writer.write('{}', 0)
426 self.writer.writeWeakWS()
428 self.writer.conditionalNewline()
def on_opt(self, attrs):
    """Start of an <opt/> element: delegate with '[' as the opening bracket."""
    opening = '['
    self.on_opt_parm(opening, attrs)
def on_parm(self, attrs):
    """Start of a <parm/> element: delegate with '{' as the opening bracket."""
    opening = '{'
    self.on_opt_parm(opening, attrs)
def on_opt_end(self):
    """End of an <opt/> element: delegate with ']' as the closing bracket."""
    closing = ']'
    self.on_opt_parm_end(closing)
def on_parm_end(self):
    """End of a <parm/> element: delegate with '}' as the closing bracket."""
    closing = '}'
    self.on_opt_parm_end(closing)
# Shared start handler for <opt/> and <parm/>; `ch` is the opening bracket
# character to write ('[' or '{').
# NOTE(review): indentation is lost in this listing, so the extent of the
# `if` body below cannot be read off the text. on_opt_parm_end() pops
# nl_spec_stack only in its environment branch, which suggests that just the
# two nl_spec statements are conditional -- confirm against the original file.
444 def on_opt_parm(self, ch, attrs):
445 """ Handle 'parm' and 'opt' """
446 self.stack_model(self.model_opt)
# After stack_model(), model_stack[-1] is the model of the parent element.
447 if self.model_stack[-1] == self.model_env:
# Save nl_spec and replace it with the value returned by ungetWeakWS().
448 self.nl_spec_stack.append(self.nl_spec)
449 self.nl_spec = self.writer.ungetWeakWS()
450 self.writer.writech(ch, 0)
# Text is allowed inside the option/parameter.
451 self.text_is_only_spaces = 0
# Shared end handler for <opt/> and <parm/>; `ch` is the closing bracket
# character to write (']' or '}').
453 def on_opt_parm_end(self, ch):
454 self.writer.writech(ch, 0)
455 self.has_parm = 1 # At the end to avoid collision of nesting
456 # <opt/> can be only inside <cmd/> or (very rarely) in <env/>
457 if self.model_stack[-1] != self.model_env:
# Not inside an environment: only whitespace may follow as text.
458 self.text_is_only_spaces = 1
# NOTE(review): listing line 459 is not shown; presumably an `else:` starting
# the environment branch -- confirm. Line 461 is not shown either.
460 self.text_is_only_spaces = 0
# Write the weak whitespace saved by on_opt_parm() and restore nl_spec.
462 self.writer.writeWeakWS(self.nl_spec)
463 self.nl_spec = self.nl_spec_stack.pop()
465 # -----------------------------------------------------------------
# Start of <env/>: write the command that opens the environment and save the
# state needed by on_env_end() to close it.
467 def on_env(self, attrs):
468 """ Handle 'env' element """
469 self.stack_model(self.model_env)
471 # Get name of the environment, and begin and end commands
473 name = attrs.get('name', '')
# NOTE(review): listing line 474 is not shown; presumably the test for an
# empty name that guards the error below -- confirm.
475 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
476 msg += 'Attribute env/@name is empty'
477 self.invalid_xml_other(msg)
478 # added by Paul Tremblay on 2004-02-19
479 # the environment in context is \startenvironmentname ...
480 # \stopenvironmentname
481 if self.__use_context:
482 begenv = attrs.get('start', 'start')
# NOTE(review): listing line 483 is not shown; presumably an `else:` -- confirm.
484 begenv = attrs.get('begin', 'begin')
# Save the enclosing environment's name and end command for nesting.
485 self.cmdname_stack.append(self.cmdname)
486 self.endenv_stack.append(self.endenv)
# NOTE(review): listing lines 487-488 are not shown; self.cmdname, read by
# on_env_end(), is presumably set to `name` there -- confirm.
489 # added by Paul Tremblay on 2004-02-19
490 if self.__use_context:
491 self.endenv = attrs.get('stop', 'stop')
# NOTE(review): listing line 492 is not shown; presumably an `else:` -- confirm.
493 self.endenv = attrs.get('end', 'end')
495 # Write <env/> and setup newline processing
497 if self.get_boolean(attrs, 'nl1', 1):
498 self.writer.conditionalNewline()
500 # added by Paul Tremblay on 2004-02-19
# NOTE(review): '\%' is not a recognised escape sequence; it currently yields
# a backslash followed by '%', but recent Python versions warn about it.
502 if self.__use_context:
503 self.writer.write('\%s%s' % (begenv, name), 0)
# NOTE(review): listing line 504 is not shown; presumably an `else:` -- confirm.
505 self.writer.write('\%s{%s}' % (begenv, name), 0)
506 if self.get_boolean(attrs, 'nl2', 1):
507 self.writer.writeWeakWS(texmlwr.WEAK_WS_IS_NEWLINE)
# Save the enclosing nl_spec; for an environment it holds the (@nl3, @nl4) pair.
508 self.nl_spec_stack.append(self.nl_spec)
509 self.nl_spec = (self.get_boolean(attrs, 'nl3', 1), self.get_boolean(attrs, 'nl4', 1))
# End of <env/>: write the command that closes the environment and restore
# the state saved by on_env().
511 def on_env_end(self):
# nl_spec was set by on_env() to the (@nl3, @nl4) pair.
512 nl3, nl4 = self.nl_spec
513 self.nl_spec = self.nl_spec_stack.pop()
# NOTE(review): listing line 514 is not shown; the newline below is presumably
# conditional on nl3 -- confirm.
515 self.writer.conditionalNewline()
517 # added by Paul Tremblay on 2004-02-19
# NOTE(review): '\%' is not a recognised escape sequence; it currently yields
# a backslash followed by '%', but recent Python versions warn about it.
518 if self.__use_context:
519 self.writer.write('\%s%s' % (self.endenv, self.cmdname), 0)
# NOTE(review): listing line 520 is not shown; presumably an `else:` -- confirm.
521 self.writer.write('\%s{%s}' % (self.endenv, self.cmdname), 0)
# NOTE(review): listing line 522 is not shown; the newline below is presumably
# conditional on nl4 -- confirm.
523 self.writer.conditionalNewline()
# Restore the enclosing environment's name and end command.
524 self.cmdname = self.cmdname_stack.pop()
525 self.endenv = self.endenv_stack.pop()
def on_group(self, attrs):
    """Start of a <group/> element: use the content model and write '{'."""
    content_model = self.model_content
    self.stack_model(content_model)
    out = self.writer
    out.writech('{', 0)
def on_group_end(self):
    """End of a <group/> element: write the closing '}'."""
    out = self.writer
    out.writech('}', 0)
535 # -----------------------------------------------------------------
# Start of <ctrl/>: write a backslash followed by the character given in @ch
# and forbid text content until the element ends.
537 def on_ctrl(self, attrs):
539 # Get character, check and print tex command
541 ch = attrs.get('ch', '')
# NOTE(review): listing line 542 is not shown; presumably the test that @ch is
# a single character, guarding the error below -- confirm.
543 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
544 msg += "Attribute ctrl/@ch is not a char: '%s'" % ch
545 self.invalid_xml_other(msg)
546 self.writer.writech('\\', 0)
547 self.writer.writech(ch, 0)
549 # Content of this element is empty
# NOTE(review): listing lines 550-551 are not shown.
# characters() rejects any text while this flag is set.
552 self.no_text_content = 1
def on_ctrl_end(self):
    """End of a <ctrl/> element: text content is allowed again."""
    self.no_text_content = 0
# Start of <spec/>: look up the character for the category given in @cat,
# write it, and forbid text content until the element ends.
557 def on_spec(self, attrs):
559 # Get category, get corresponding character
561 cat = attrs.get('cat', '')
# NOTE(review): listing line 562 is not shown; the newline below is presumably
# conditional on a particular category -- confirm.
563 self.writer.conditionalNewline()
# Unknown categories are reported as invalid XML.
565 if not (cat in specmap.tocharmap):
566 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
567 msg += "Attribute spec/@cat unknown: '%s'" % cat
568 self.invalid_xml_other(msg)
569 ch = specmap.tocharmap[cat]
# NOTE(review): listing lines 570 and 573 are not shown; the emptylines
# stacking around the write may be conditional -- confirm.
571 self.writer.stack_emptylines(1)
572 self.writer.write(ch, 0)
574 self.writer.unstack_emptylines()
576 # Content of this element is empty
# NOTE(review): listing lines 577-578 are not shown.
# characters() rejects any text while this flag is set.
579 self.no_text_content = 1
def on_spec_end(self):
    """End of a <spec/> element: text content is allowed again."""
    self.no_text_content = 0
584 # -----------------------------------------------------------------
def on_math(self, attrs):
    """Start of a <math/> element: write '$' and switch the writer to MATH mode.

    Children are handled with the no-math content model.
    """
    self.stack_model(self.model_nomath)
    out = self.writer
    out.writech('$', 0)
    out.stack_mode(texmlwr.MATH)
def on_math_end(self):
    """End of a <math/> element: leave the math mode, then write '$'."""
    out = self.writer
    out.unstack_mode()
    out.writech('$', 0)
# Start of <dmath/>: writes a '$' to the output.
595 def on_dmath(self, attrs):
596 self.writer.writech('$', 0)
# NOTE(review): listing lines 597-598 are not shown; by analogy with
# on_math(), further output and the model/mode stacking are presumably
# there -- confirm.
# End of <dmath/>: writes a '$' to the output.
599 def on_dmath_end(self):
# NOTE(review): listing line 600 is not shown; by analogy with on_math_end()
# the writer mode is presumably restored there -- confirm.
601 self.writer.writech('$', 0)
603 # -----------------------------------------------------------------
# Start of <pdf/>: switch the writer to PDF mode (undone by on_pdf_end()).
605 def on_pdf(self, attrs):
# NOTE(review): listing line 606 is not shown; a content model is presumably
# stacked there, as in the other start handlers -- confirm.
607 self.writer.stack_mode(texmlwr.PDF)
def on_pdf_end(self):
    """End of a <pdf/> element: restore the writer's previous mode."""
    out = self.writer
    out.unstack_mode()