New Element-based builder API (WiP).
[librarian.git] / src / librarian / elements / base.py
1 # -*- coding: utf-8
2
3 import re
4 from lxml import etree
5 from librarian import dcparser, RDFNS
6
7
class WLElement(etree.ElementBase):
    """lxml element subclass that renders itself through a builder object.

    Output is tuned with the class-level constants below. ``txt_build``
    and ``html_build`` walk this element and its children, pushing text,
    margins and markup to the builder they are given.
    """

    # Plain-text output: margins pushed before/after the element and the
    # literal strings pushed around its content (see txt_build).
    TXT_TOP_MARGIN = 0
    TXT_BOTTOM_MARGIN = 0
    TXT_PREFIX = ""
    TXT_SUFFIX = ""

    # HTML output: wrapping tag (a falsy value means no wrapper), its base
    # attributes and CSS class, and whether section anchors are emitted
    # (see html_build / get_html_attr).
    HTML_TAG = None
    HTML_ATTR = {}
    HTML_CLASS = None
    HTML_SECTION = False

    # Text handling: whether text/tail content is pushed at all, and
    # whether the outer whitespace of the content is stripped.
    CAN_HAVE_TEXT = True
    STRIP = False

    # (needle, replacement) pairs applied in list order by normalize_text();
    # '---' is listed before '--' so the longer run is replaced first.
    text_substitutions = [
        (u'---', u'—'),
        (u'--', u'–'),
        (u'...', u'…'),
        (u',,', u'„'),
        (u'"', u'”'),
        ('\ufeff', ''),
    ]

    @property
    def meta_object(self):
        """Return metadata built from this element's own RDF child, or None.

        The value is computed on first access and then kept on the
        instance as ``_meta_object``.
        """
        if hasattr(self, '_meta_object'):
            return self._meta_object
        rdf = self.find(RDFNS('RDF'))
        self._meta_object = (
            None if rdf is None else dcparser.BookInfo.from_element(rdf)
        )
        return self._meta_object

    @property
    def meta(self):
        """Return the closest available metadata for this element.

        Uses this element's own ``meta_object`` when it has one, otherwise
        the parent's ``meta``; an element without a parent falls back to
        ``self.document.base_meta``.
        """
        own = self.meta_object
        if own is not None:
            return own
        parent = self.getparent()
        if parent is None:
            # NOTE(review): ``document`` is not set anywhere in this class;
            # presumably attached by the code that parses the tree.
            return self.document.base_meta
        return parent.meta

    def normalize_text(self, text):
        """Apply ``text_substitutions`` and collapse whitespace runs.

        A falsy ``text`` (e.g. None) is treated as the empty string. Every
        run of whitespace in the result is replaced by a single space.
        """
        result = text if text else ''
        for needle, replacement in self.text_substitutions:
            result = result.replace(needle, replacement)
        return re.sub(r'\s+', ' ', result)

    def _build_inner(self, builder, build_method):
        """Push this element's text and children to ``builder``.

        ``build_method`` is the name of the method invoked on each child
        that is a WLElement. Text and tails are only pushed when
        CAN_HAVE_TEXT is set; with STRIP set, whitespace is removed from
        the start of the leading text and from the end of the last piece
        of content (the leading text when there are no children, else the
        last child's tail).
        """
        n_children = len(self)

        if self.CAN_HAVE_TEXT and self.text:
            leading = self.normalize_text(self.text)
            if self.STRIP:
                leading = leading.lstrip()
                if n_children == 0:
                    leading = leading.rstrip()
            builder.push_text(leading)

        for index, node in enumerate(self):
            # Children that are not WLElement instances are not rendered,
            # but their tail text is still considered below.
            if isinstance(node, WLElement):
                render = getattr(node, build_method)
                render(builder)
            if not (self.CAN_HAVE_TEXT and node.tail):
                continue
            trailing = self.normalize_text(node.tail)
            if self.STRIP and index + 1 == n_children:
                trailing = trailing.rstrip()
            builder.push_text(trailing)

    def _txt_build_inner(self, builder):
        """Render the element's content using each child's ``txt_build``."""
        self._build_inner(builder, 'txt_build')

    def txt_build(self, builder):
        """Render this element as plain text into ``builder``.

        Pushes the top margin, TXT_PREFIX, the inner content, TXT_SUFFIX
        and the bottom margin, in that order. When the instance has a
        ``TXT_LEGACY_TOP_MARGIN`` / ``TXT_LEGACY_BOTTOM_MARGIN`` attribute,
        it is pushed with ``push_legacy_margin`` instead of the regular
        margin.
        """
        if not hasattr(self, 'TXT_LEGACY_TOP_MARGIN'):
            builder.push_margin(self.TXT_TOP_MARGIN)
        else:
            builder.push_legacy_margin(self.TXT_LEGACY_TOP_MARGIN)

        builder.push_text(self.TXT_PREFIX, True)
        self._txt_build_inner(builder)
        builder.push_text(self.TXT_SUFFIX, True)

        if not hasattr(self, 'TXT_LEGACY_BOTTOM_MARGIN'):
            builder.push_margin(self.TXT_BOTTOM_MARGIN)
        else:
            builder.push_legacy_margin(self.TXT_LEGACY_BOTTOM_MARGIN)

    def _html_build_inner(self, builder):
        """Render the element's content using each child's ``html_build``."""
        self._build_inner(builder, 'html_build')

    def get_html_attr(self, builder):
        """Return the attribute dict for this element's HTML tag.

        Starts from a copy of HTML_ATTR, sets ``class`` when HTML_CLASS is
        truthy and copies a non-empty ``id`` attribute from the source
        element. ``builder`` is not used here.
        """
        attrs = self.HTML_ATTR.copy()
        if self.HTML_CLASS:
            attrs['class'] = self.HTML_CLASS
        # always copy the id attribute (?)
        element_id = self.attrib.get('id')
        if element_id:
            attrs['id'] = element_id
        return attrs

    def html_build(self, builder):
        """Render this element as HTML into ``builder``.

        For HTML_SECTION elements two anchors are emitted before the tag
        and one empty named anchor right after it is opened. The element's
        content is wrapped in HTML_TAG only when that is truthy.
        """
        is_section = self.HTML_SECTION
        tag = self.HTML_TAG

        if is_section:
            # NOTE(review): the anchor names/labels are fixed literals, so
            # every section emits the same ones -- presumably placeholders
            # in this work-in-progress API; confirm before relying on them.
            for anchor_attrs, label in (
                ({"name": "f18", "class": "target"}, " "),
                ({"href": "#f18", "class": "anchor"}, "18"),
            ):
                builder.start_element('a', anchor_attrs)
                builder.push_text(label)
                builder.end_element()

        if tag:
            builder.start_element(tag, self.get_html_attr(builder))

        if is_section:
            builder.start_element("a", {"name": "sec34"})
            builder.end_element()

        self._html_build_inner(builder)

        if tag:
            builder.end_element()

    def sanitize(self):
        """Call ``sanitize`` on every child that is a WLElement."""
        # TODO: Remove insanity here.
        wl_children = (node for node in self if isinstance(node, WLElement))
        for node in wl_children:
            node.sanitize()