1 """ Transform TeXML SAX stream """
2 # $Id: handler.py,v 1.14 2006-06-14 04:45:06 olpa Exp $
5 from xml.sax.handler import feature_namespaces
13 # Unbreakable spaces should not be deleted by strip(), but it happens:
14 # http://uucode.com/blog/2010/06/01/python-wtf-strip-eats-too-much/
15 # The solution from the web page does not work with old versions
16 # of python, therefore let's define a fallback functionality.
18 "dummy".strip(string.whitespace)
19 strip_args = (string.whitespace, )
24 # The TeXML SAX handler works correctly, but awkwardly, when the SAX parser
25 # reports characters in several calls instead of one call.
26 # This wrapper fixes the problem.
32 Wrapper class to make the library easier to use.
34 See the above notes for use.
41 def parse_file(self, texml_writer, read_obj, use_context):
43 handle = glue_handler(texml_writer, use_context)
45 parser = xml.sax.make_parser()
46 parser.setFeature(feature_namespaces, 1)
47 parser.setContentHandler(handle)
48 parser.setFeature("http://xml.org/sax/features/external-general-entities", True)
51 parser.parse(read_obj)
53 class InvalidXmlException(Exception):
60 class glue_handler(xml.sax.ContentHandler):
63 Not really a public class. use ParseFile instead.
67 def __init__(self, texml_writer, use_context,
68 name_space = 'http://getfo.sourceforge.net/texml/ns1'):
69 self.h = Handler(texml_writer, use_context)
72 self.__name_space = name_space
73 self.__current_name_space = None
75 def startDocument(self):
77 self.h.startDocument()
81 self.h.characters(self.c)
84 def endDocument(self):
89 def startElement_off(self, name, attrs):
91 self.h.startElement(name, attrs)
def setDocumentLocator(self, locator):
    """Keep the locator object handed over by the parser.

    The element and character callbacks of this class ask the stored
    object for the current column and line number.
    """
    self.locator = locator
96 def startElementNS(self, name, qname, attrs):
97 # change attrs to regular dictionary
103 the_attrs[att] = value
106 self.__current_name_space = name_space
109 # get the column and line number and use the handler
110 col_num = self.locator.getColumnNumber()
111 line_num = self.locator.getLineNumber()
112 self.h.set_location(col_num, line_num)
113 self.h.set_namespace(name_space)
115 if name_space == self.__name_space or name_space == None:
117 self.h.startElement(local_name, the_attrs)
118 # report an error and quit
120 self.h.invalid_xml(local_name)
123 def endElement_off(self, name):
125 self.h.endElement(name)
127 def endElementNS(self, name, qname):
128 col_num = self.locator.getColumnNumber()
129 line_num = self.locator.getLineNumber()
130 self.h.set_location(col_num, line_num)
133 if name_space == self.__name_space or name_space == None:
135 self.h.endElement(local_name)
139 def processingInstruction(self, target, data):
141 # No action. The only effect is that chunk
142 # ... aa <!-- xx --> bb ...
143 # is reported twice ('... aa ' and ' bb ...')
144 # instead of once ('... aa bb ...')
146 def characters(self, content):
147 col_num = self.locator.getColumnNumber()
148 line_num = self.locator.getLineNumber()
149 self.h.set_location(col_num, line_num)
153 self.c = self.c + content
155 # WhiteSpace (WS) elimination
156 # In most cases, WS around tags (both opening and closing) are removed.
157 # But these tags save ws: <ctrl/> and <spec/>.
158 # WS processing is allowed or disallowed by "process_ws".
163 Not really a public class.
165 Handles the infile, using the glue_handle class to get the data as
166 elements or characters.
173 # text_is_only_spaces
175 # Whitespace handling:
181 # For <env/> support:
187 # For <cmd/> support:
188 # has_parm # Stacking is not required: if <cmd/> is in <cmd/>,
189 # # then is it wrapped by <parm/> or <opt/>
191 def __init__(self, texml_writer, use_context):
192 """ Create writer, create maps """
193 self.__use_context = use_context
194 # Paul Tremblay added this on 2005-03-08
195 self.writer = texml_writer
196 self.cmdname_stack = []
197 self.endenv_stack = []
201 self.no_text_content = 0
202 self.text_is_only_spaces = 0
204 self.process_ws_stack = []
206 self.nl_spec_stack = []
207 self.__name_space = None
209 # Create handler maps
211 self.model_nomath = {
212 'TeXML': self.on_texml,
215 'group': self.on_group,
216 'ctrl': self.on_ctrl,
217 'spec': self.on_spec,
220 self.model_content = self.model_nomath.copy()
221 self.model_content['math'] = self.on_math
222 self.model_content['dmath'] = self.on_dmath
227 self.model_env = self.model_content.copy() # copy, so == will true only for environment, not for any tag that shares model_content
228 self.model_env.update(self.model_cmd)
229 self.model_opt = self.model_content
230 self.model_parm = self.model_content
231 self.end_handlers = {
232 'TeXML': self.on_texml_end,
233 'cmd': self.on_cmd_end,
234 'env': self.on_env_end,
235 'group': self.on_group_end,
236 'ctrl': self.on_ctrl_end,
237 'spec': self.on_spec_end,
238 'opt': self.on_opt_end,
239 'parm': self.on_parm_end,
240 'math': self.on_math_end,
241 'dmath': self.on_dmath_end,
242 'pdf': self.on_pdf_end
245 def set_location(self, col, line):
247 self.__line_num = line
def set_namespace(self, name):
    """Remember the namespace of the element being processed.

    The stored value is read by invalid_xml() when it builds its
    error message.
    """
    self.__name_space = name
252 def invalid_xml(self, local_name):
253 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
254 if self.__name_space:
255 msg += 'Element "%s" for namespace "%s" not expected' % (local_name, self.__name_space)
257 msg += '%s not expected' % (local_name)
259 raise InvalidXmlException, msg
def invalid_xml_other(self, msg):
    """Report invalid XML that is not tied to an unexpected element name.

    msg -- complete, ready-to-show error text

    Raises InvalidXmlException carrying msg.
    """
    # Call form instead of the "raise Class, value" statement: the latter
    # is a syntax error on Python 3, the call form works on Python 2 too.
    raise InvalidXmlException(msg)
265 # -------------------------------------------------------------------
def startDocument(self):
    """Set up the parsing state at the beginning of a document.

    The current content model accepts only the 'TeXML' element, and
    the stack of saved content models starts out empty.
    """
    self.model = dict(TeXML=self.on_texml)
    self.model_stack = list()
def endDocument(self):
    """Finish the document by asking the writer for a conditional newline."""
    out = self.writer
    out.conditionalNewline()
276 def startElement(self, name, attrs):
277 """ Handle start of an element"""
278 if name in self.model:
279 self.model[name](attrs)
281 self.invalid_xml(name)
283 def characters(self, content):
284 """ Handle text data """
286 # First, check if content allowed at all
288 # Elements like <spec/> should be empty
289 if self.no_text_content:
290 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
291 msg += "Text content is not expected: '%s'" % content.encode('latin-1', 'replace')
292 self.invalid_xml_other(msg)
293 # Element <cmd/> should not have text content,
294 # but it also may contain spaces due to indenting
295 # Element <env/> may have <opt/> and <parm/>, so we do
296 # magic to delete whitespace at beginning of environment
297 if self.text_is_only_spaces:
298 stripped = content.lstrip(*strip_args)
299 if 0 != len(stripped):
300 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
301 msg += "Only whitespaces are expected, not text content: '%s'" % content.encode('latin-1', 'replace')
302 self.invalid_xml_other(msg)
305 # Eliminate whitespaces
309 content2 = content.lstrip(*strip_args)
310 if len(content2) != len(content):
311 self.writer.writeWeakWS()
312 content = content2.rstrip(*strip_args)
313 if len(content2) != len(content):
316 # Finally, write content
318 self.writer.write(content)
320 self.writer.writeWeakWS()
322 def endElement(self, name):
323 """ Handle end of an element """
324 self.end_handlers[name]()
327 def stack_model(self, model):
328 """ Remember content model of parent and set model for current node """
329 self.model_stack.append(self.model)
def unstack_model(self):
    """Make the most recently saved content model the current one again."""
    saved = self.model_stack.pop()
    self.model = saved
336 # -----------------------------------------------------------------
338 def get_boolean(self, attrs, aname, default):
339 """ Returns true if value of attribute "aname" is "1", false if "0" and None if attribute not exists. Raises error in other cases."""
340 aval = attrs.get(aname, None)
347 raise ValueError("Value of boolean attribute '%s' is not '0' or '1', but '%s'" % (aname, aval))
349 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
350 msg += "Value of boolean attribute '%s' is not '0' or '1', but '%s'" % (aname, aval)
351 self.invalid_xml_other(msg)
353 def on_texml(self, attrs):
354 """ Handle TeXML element """
355 self.stack_model(self.model_content)
357 # Set new mode ("text" or "math")
359 str = attrs.get('mode', None)
361 mode = texmlwr.DEFAULT
367 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
368 msg += "Unknown value of TeXML/@mode attribute: '%s'" % str
369 self.invalid_xml_other(msg)
370 emptylines = self.get_boolean(attrs, 'emptylines', None)
371 escape = self.get_boolean(attrs, 'escape', None)
372 ligatures = self.get_boolean(attrs, 'ligatures', None)
373 self.writer.stack_mode(mode)
374 self.writer.stack_emptylines(emptylines)
375 self.writer.stack_escape(escape)
376 self.writer.stack_ligatures(ligatures)
377 ws = self.get_boolean(attrs, 'ws', None)
378 self.process_ws_stack.append(self.process_ws)
380 self.process_ws = 0 == ws
381 self.writer.set_allow_weak_ws_to_nl(not ws)
def on_texml_end(self):
    """Leave a TeXML element and restore the previous settings.

    The writer settings are unstacked in the order ligatures, escape,
    emptylines, mode. Then the previous whitespace-processing flag is
    popped and also passed to the writer's set_allow_weak_ws_to_nl().
    """
    out = self.writer
    out.unstack_ligatures()
    out.unstack_escape()
    out.unstack_emptylines()
    out.unstack_mode()
    restored = self.process_ws_stack.pop()
    self.process_ws = restored
    out.set_allow_weak_ws_to_nl(restored)
392 # -----------------------------------------------------------------
394 def on_cmd(self, attrs):
395 """ Handle 'cmd' element """
396 self.stack_model(self.model_cmd)
398 # Get name of the command
400 name = attrs.get('name', '')
402 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
403 msg += "Attribute cmd/@name is empty"
404 self.invalid_xml_other(msg)
405 if self.get_boolean(attrs, 'nl1', 0):
406 self.writer.conditionalNewline()
407 self.writer.writech('\\', 0)
408 self.writer.write(name, 0)
410 # Setup in-cmd processing
413 self.text_is_only_spaces = 1
414 self.nl_spec_stack.append(self.nl_spec)
415 self.nl_spec = (self.get_boolean(attrs, 'nl2', 0), self.get_boolean(attrs, 'gr', 1))
417 def on_cmd_end(self):
418 self.text_is_only_spaces = 0
420 # Write additional space or newline if command has no parameters
422 (nl, gr) = self.nl_spec
423 self.nl_spec = self.nl_spec_stack.pop()
424 if not(self.has_parm):
426 self.writer.write('{}', 0)
428 self.writer.writeWeakWS()
430 self.writer.conditionalNewline()
def on_opt(self, attrs):
    """Start an 'opt' element: shared opt/parm handling with '[' as the opening character."""
    opening = '['
    self.on_opt_parm(opening, attrs)
def on_parm(self, attrs):
    """Start a 'parm' element: shared opt/parm handling with '{' as the opening character."""
    opening = '{'
    self.on_opt_parm(opening, attrs)
def on_opt_end(self):
    """End an 'opt' element: shared opt/parm end handling with ']' as the closing character."""
    closing = ']'
    self.on_opt_parm_end(closing)
def on_parm_end(self):
    """End a 'parm' element: shared opt/parm end handling with '}' as the closing character."""
    closing = '}'
    self.on_opt_parm_end(closing)
def on_opt_parm(self, ch, attrs):
    """Common start handling for 'parm' and 'opt'.

    ch    -- opening character to write ('[' from on_opt, '{' from on_parm)
    attrs -- element attributes; not used here
    """
    self.stack_model(self.model_opt)
    # stack_model() has just saved the model that was current before this
    # element, so the top of the stack tells where the element occurs.
    enclosing = self.model_stack[-1]
    if enclosing == self.model_env:
        # Inside an environment: save the current nl_spec and replace it
        # with whatever the writer's ungetWeakWS() returns.
        self.nl_spec_stack.append(self.nl_spec)
        self.nl_spec = self.writer.ungetWeakWS()
    self.writer.writech(ch, 0)
    self.text_is_only_spaces = 0
455 def on_opt_parm_end(self, ch):
456 self.writer.writech(ch, 0)
457 self.has_parm = 1 # At the end to avoid collision of nesting
458 # <opt/> can be only inside <cmd/> or (very rarely) in <env/>
459 if self.model_stack[-1] != self.model_env:
460 self.text_is_only_spaces = 1
462 self.text_is_only_spaces = 0
464 self.writer.writeWeakWS(self.nl_spec)
465 self.nl_spec = self.nl_spec_stack.pop()
467 # -----------------------------------------------------------------
469 def on_env(self, attrs):
470 """ Handle 'env' element """
471 self.stack_model(self.model_env)
473 # Get name of the environment, and begin and end commands
475 name = attrs.get('name', '')
477 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
478 msg += 'Attribute env/@name is empty'
479 self.invalid_xml_other(msg)
480 # added by Paul Tremblay on 2004-02-19
481 # the environment in context is \startenvironmentname ...
482 # \stopenvironmentname
483 if self.__use_context:
484 begenv = attrs.get('start', 'start')
486 begenv = attrs.get('begin', 'begin')
487 self.cmdname_stack.append(self.cmdname)
488 self.endenv_stack.append(self.endenv)
491 # added by Paul Tremblay on 2004-02-19
492 if self.__use_context:
493 self.endenv = attrs.get('stop', 'stop')
495 self.endenv = attrs.get('end', 'end')
497 # Write <env/> and setup newline processing
499 if self.get_boolean(attrs, 'nl1', 1):
500 self.writer.conditionalNewline()
502 # added by Paul Tremblay on 2004-02-19
504 if self.__use_context:
505 self.writer.write('\%s%s' % (begenv, name), 0)
507 self.writer.write('\%s{%s}' % (begenv, name), 0)
508 if self.get_boolean(attrs, 'nl2', 1):
509 self.writer.writeWeakWS(texmlwr.WEAK_WS_IS_NEWLINE)
510 self.nl_spec_stack.append(self.nl_spec)
511 self.nl_spec = (self.get_boolean(attrs, 'nl3', 1), self.get_boolean(attrs, 'nl4', 1))
513 def on_env_end(self):
514 nl3, nl4 = self.nl_spec
515 self.nl_spec = self.nl_spec_stack.pop()
517 self.writer.conditionalNewline()
519 # added by Paul Tremblay on 2004-02-19
520 if self.__use_context:
521 self.writer.write('\%s%s' % (self.endenv, self.cmdname), 0)
523 self.writer.write('\%s{%s}' % (self.endenv, self.cmdname), 0)
525 self.writer.conditionalNewline()
526 self.cmdname = self.cmdname_stack.pop()
527 self.endenv = self.endenv_stack.pop()
def on_group(self, attrs):
    """Start a 'group' element: switch to the general content model and write '{'."""
    self.stack_model(self.model_content)
    opening = '{'
    self.writer.writech(opening, 0)
def on_group_end(self):
    """End a 'group' element by writing '}'."""
    closing = '}'
    self.writer.writech(closing, 0)
537 # -----------------------------------------------------------------
539 def on_ctrl(self, attrs):
541 # Get character, check and print tex command
543 ch = attrs.get('ch', '')
545 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
546 msg += "Attribute ctrl/@ch is not a char: '%s'" % ch
547 self.invalid_xml_other(msg)
548 self.writer.writech('\\', 0)
549 self.writer.writech(ch, 0)
551 # Content of this element is empty
554 self.no_text_content = 1
def on_ctrl_end(self):
    """End a 'ctrl' element: clear the flag that forbids text content."""
    # The flag is raised by the 'ctrl' start handler and checked in characters().
    self.no_text_content = 0
559 def on_spec(self, attrs):
561 # Get category, get corresponding character
563 cat = attrs.get('cat', '')
565 self.writer.conditionalNewline()
567 if not (cat in specmap.tocharmap):
568 msg = 'Invalid XML %s, %s: ' % (self.__col_num, self.__line_num)
569 msg += "Attribute spec/@cat unknown: '%s'" % cat
570 self.invalid_xml_other(msg)
571 ch = specmap.tocharmap[cat]
573 self.writer.stack_emptylines(1)
574 self.writer.write(ch, 0)
576 self.writer.unstack_emptylines()
578 # Content of this element is empty
581 self.no_text_content = 1
def on_spec_end(self):
    """End a 'spec' element: clear the flag that forbids text content."""
    # The flag is raised by the 'spec' start handler and checked in characters().
    self.no_text_content = 0
586 # -----------------------------------------------------------------
def on_math(self, attrs):
    """Start a 'math' element.

    The content model without 'math'/'dmath' becomes current, '$' is
    written, and the writer stacks the MATH mode.
    """
    self.stack_model(self.model_nomath)
    out = self.writer
    out.writech('$', 0)
    out.stack_mode(texmlwr.MATH)
def on_math_end(self):
    """End a 'math' element: unstack the writer mode, then write the closing '$'."""
    out = self.writer
    out.unstack_mode()
    out.writech('$', 0)
597 def on_dmath(self, attrs):
598 self.writer.writech('$', 0)
601 def on_dmath_end(self):
603 self.writer.writech('$', 0)
605 # -----------------------------------------------------------------
607 def on_pdf(self, attrs):
609 self.writer.stack_mode(texmlwr.PDF)
def on_pdf_end(self):
    """End a 'pdf' element by unstacking the writer mode stacked at its start."""
    out = self.writer
    out.unstack_mode()