1 # -*- coding: utf-8 -*-
2 from copy import deepcopy
6 from catalogue.constants import TRIM_BEGIN, TRIM_END, MASTERS
# Each pattern matches a trim-marker XML comment that occupies a whole line
# on its own; MULTILINE makes ^ and $ anchor at every line boundary, not just
# at the start and end of the whole string.
RE_TRIM_BEGIN = re.compile(
    "^<!--%s-->$" % TRIM_BEGIN,
    flags=re.MULTILINE,
)
RE_TRIM_END = re.compile(
    "^<!--%s-->$" % TRIM_END,
    flags=re.MULTILINE,
)
class ParseError(Exception):
    """Raised when a document does not have the structure this module needs.

    In this module it is raised with the message 'No master tag found!'.

    Derives from Exception rather than BaseException: BaseException is
    reserved for system-exiting exceptions, and subclassing it directly made
    this error slip past ordinary ``except Exception`` handlers.  Handlers
    written as ``except ParseError`` or ``except BaseException`` behave
    exactly as before.
    """
# Cuts `text` down using the module-level patterns RE_TRIM_BEGIN and
# RE_TRIM_END and (per its docstring) keeps only what lies between them.
# NOTE(review): the embedded line numbers skip 17, 20, 21, 23 and 25, so the
# docstring quotes, whatever consults `trim_begin` / `trim_end`, and the
# return statement are not visible in this listing.
16 def _trim(text, trim_begin=True, trim_end=True):
18 Cut off everything before RE_TRIM_BEGIN and after RE_TRIM_END, so
19 that eg. one big XML file can be compiled from many small XML files.
# Split once at the first begin marker and keep the last piece; if the
# pattern does not match, split() yields [text] and text stays unchanged.
# Presumably guarded by `trim_begin` (guard line not visible) — confirm.
22 text = RE_TRIM_BEGIN.split(text, maxsplit=1)[-1]
# Split once at the first end marker and keep the piece before it.
# Presumably guarded by `trim_end` (guard line not visible) — confirm.
24 text = RE_TRIM_END.split(text, maxsplit=1)[0]
# Per its docstring, builds one full text out of an iterable of parts; each
# part is cut with _trim() and collected in `texts`.
# NOTE(review): the embedded line numbers skip 29, 32-35, 37-39, 43-44 and
# 48, so the initialisation of `texts`, `text` and `trim_begin`, the
# conditions inside the loop and the return statement are not visible here.
28 def compile_text(parts):
30 Compiles full text from an iterable of parts,
31 trimming where applicable.
36 for next_text in parts:
40 # trim the end, because there's more non-empty text
41 # don't trim beginning, if `text' is the first non-empty part
# trim_end is left at _trim()'s default (True) for this call.
42 texts.append(_trim(text, trim_begin=trim_begin))
45 # don't trim the end, because there's no more text coming after `text'
46 # only trim beginning if it's not still the first non-empty
# Last collected part: trim_end=False keeps whatever follows its end marker.
47 texts.append(_trim(text, trim_begin=trim_begin, trim_end=False))
# Parses the XML in `text`, inserts a TRIM_BEGIN comment as the first child
# of the master element and returns the re-serialized document as unicode.
# Raises ParseError('No master tag found!') when the element left in `master`
# after the search loop has a tag that is not in MASTERS.
# NOTE(review): embedded line numbers 56, 59 and 62 are missing from this
# listing; 56 is presumably the loop's exit statement — confirm.
51 def add_trim_begin(text):
52 trim_tag = etree.Comment(TRIM_BEGIN)
53 e = etree.fromstring(text)
# Walk the root's children from last to first, looking for a master tag.
54 for master in e[::-1]:
55 if master.tag in MASTERS:
# NOTE(review): `master` is bound only by the loop above, so a root element
# without children would fail on the next line with an UnboundLocalError
# instead of ParseError — confirm and consider guarding.
57 if master.tag not in MASTERS:
58 raise ParseError('No master tag found!')
# The marker becomes the master's first child; its tail is set to three
# newlines followed by the master's own leading text (if any).
60 master.insert(0, trim_tag)
61 trim_tag.tail = '\n\n\n' + (master.text or '')
# Serialize to UTF-8 bytes, then decode; unicode() is the Python 2 builtin.
63 return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
# Parses the XML in `text`, appends a TRIM_END comment as the last child of
# the master element and returns the re-serialized document as unicode.
# Raises ParseError('No master tag found!') when the element left in `master`
# after the search loop has a tag that is not in MASTERS.
# NOTE(review): embedded line numbers 71, 74, 76, 78 and 80 are missing from
# this listing, so some statements of this function are not visible.
66 def add_trim_end(text):
67 trim_tag = etree.Comment(TRIM_END)
68 e = etree.fromstring(text)
# Walk the root's children from last to first, looking for a master tag.
69 for master in e[::-1]:
70 if master.tag in MASTERS:
# NOTE(review): `master` is bound only by the loop above, so a root element
# without children would fail on the next line with an UnboundLocalError
# instead of ParseError — confirm and consider guarding.
72 if master.tag not in MASTERS:
73 raise ParseError('No master tag found!')
# The marker becomes the master's last child.
75 master.append(trim_tag)
# `prev` is the sibling directly before the marker (None if there is none).
77 prev = trim_tag.getprevious()
# Three newlines are added in front of the marker: to the previous sibling's
# tail, or to the master's text. NOTE(review): the lines choosing between
# the two assignments (78, 80) are not visible; presumably the first applies
# when `prev` exists and the second otherwise — confirm.
79 prev.tail = (prev.tail or '') + '\n\n\n'
81 master.text = (master.text or '') + '\n\n\n'
# Serialize to UTF-8 bytes, then decode; unicode() is the Python 2 builtin.
82 return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
# Body of the chapter-splitting function: builds `chunks`, a list of mutable
# [name, xml_text] pairs, by cutting the document at 'naglowek_rozdzial'
# elements, then adds trim markers to the pieces.
# NOTE(review): the `def` line (embedded line 85) and many short lines (e.g.
# 87, 89-90, 92-93, 96-97, 99-101, 106-107, 109, 117, 122, 124, 126,
# 128-129, 131, 134-135, 140 onwards) are not visible in this listing;
# comments below describe only the visible statements.
86 """Splits text into chapters.
88 All this stuff really must go somewhere else.
# Parse the input; getparent() is used below, so `etree` is presumably
# lxml.etree (import line not visible) — confirm.
91 src = etree.fromstring(text)
# NOTE(review): `splitter` is not referenced by any visible line; the XPath
# strings below repeat the tag name literally.
94 splitter = u'naglowek_rozdzial'
# All chapter-heading elements anywhere below the root.
95 parts = src.findall('.//naglowek_rozdzial')
# Work on two trees: `src` keeps what precedes the chapter, `copied` is cut
# down to the chapter itself (see the comments on 110 and 118).
98 copied = deepcopy(src)
102 # find the chapter's title
# Operates on a copy of the heading so the document itself is not altered.
103 name_elem = deepcopy(element)
# NOTE(review): what is done with each matched sub-element (106-107) is not
# visible; presumably they are dropped from the title — confirm.
104 for tag in 'extra', 'motyw', 'pa', 'pe', 'pr', 'pt', 'uwaga':
105 for a in name_elem.findall('.//' + tag):
# Title = text content of the heading copy, UTF-8 encoded and stripped.
108 name = etree.tostring(name_elem, method='text', encoding='utf-8').strip()
110 # in the original, remove everything from the start of the last chapter
# Delete the heading and every sibling after it ...
111 parent = element.getparent()
112 del parent[parent.index(element):]
# ... then climb towards the root, deleting everything that follows each
# ancestor at its own level.
113 element, parent = parent, parent.getparent()
114 while parent is not None:
115 del parent[parent.index(element) + 1:]
116 element, parent = parent, parent.getparent()
118 # in the copy, remove everything before the last chapter
119 element = copied.findall('.//naglowek_rozdzial')[-1]
120 parent = element.getparent()
121 while parent is not None:
# NOTE(review): the body of this inner loop (124) is not visible; presumably
# it removes parent[0] until `element` is the first child — confirm.
123 while parent[0] is not element:
125 element, parent = parent, parent.getparent()
# Serialized form of the trimmed copy; the line that opens this statement
# (126) is not visible.
127 unicode(etree.tostring(copied, encoding='utf-8'), 'utf-8')
# Look for remaining chapter headings in what is left of `src`.
130 parts = src.findall('.//naglowek_rozdzial')
# Prepend an entry holding whatever is left of `src`.
# NOTE(review): the label literal below looks mis-decoded (possibly
# u'początek'); it is runtime text, so it is left untouched — verify the
# file's encoding.
132 chunks[:0] = [[u'poczÄ…tek',
133 unicode(etree.tostring(src, encoding='utf-8'), 'utf-8')
# Every chunk except the first gets a begin marker ...
136 for ch in chunks[1:]:
137 ch[1] = add_trim_begin(ch[1])
# ... and every chunk except the last gets an end marker.
138 for ch in chunks[:-1]:
139 ch[1] = add_trim_end(ch[1])