X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/3a0c83394d5783715fab2be29fa1a9cfc3574e28..877639d2f061295739bd15615c3f69d69c758d8e:/src/librarian/parser.py?ds=sidebyside diff --git a/src/librarian/parser.py b/src/librarian/parser.py index 484b8f9..1f18dbd 100644 --- a/src/librarian/parser.py +++ b/src/librarian/parser.py @@ -67,64 +67,6 @@ class WLDocument: else: self.book_info = None - def get_statistics(self): - def count_text(text, counter, in_fn=False, stanza=False): - if text: - text = re.sub(r'\s+', ' ', text) - - chars = len(text) if text.strip() else 0 - words = len(text.split()) if text.strip() else 0 - - counter['chars_with_fn'] += chars - counter['words_with_fn'] += words - if not in_fn: - counter['chars'] += chars - counter['words'] += words - if not stanza: - counter['chars_out_verse_with_fn'] += chars - if not in_fn: - counter['chars_out_verse'] += chars - - def count(elem, counter, in_fn=False, stanza=False): - if elem.tag in (RDFNS('RDF'), 'nota_red', 'abstrakt', 'uwaga', 'ekstra'): - return - if not in_fn and elem.tag in ('pa', 'pe', 'pr', 'pt', 'motyw'): - in_fn = True - if elem.tag == 'strofa': - # count verses now - verses = len(elem.findall('.//br')) + 1 - counter['verses_with_fn'] += verses - if not in_fn: - counter['verses'] += verses - stanza = True - count_text(elem.text, counter, in_fn=in_fn, stanza=stanza) - for child in elem: - count(child, counter, in_fn=in_fn, stanza=stanza) - count_text(child.tail, counter, in_fn=in_fn, stanza=stanza) - - self.swap_endlines() - - data = { - "self": Counter(), - "parts": [], - "total": { - } - } - - count(self.edoc.getroot(), data['self']) - for k, v in data['self'].items(): - data['total'][k] = v - - for part in self.parts(pass_part_errors=True): - if isinstance(part, Exception): - data['parts'].append((None, {})) - else: - data['parts'].append((part, part.get_statistics())) - for k, v in data['parts'][-1][1]['total'].items(): - data['total'][k] = data['total'].get(k, 0) + v - - return data - @classmethod def from_bytes(cls, xml, *args, **kwargs): return cls.from_file(io.BytesIO(xml), *args, **kwargs) @@ -250,19 +192,26 @@ class WLDocument: for node in self.edoc.xpath('|'.join( '//%s//%s' % (note_tag, tag) for tag in - ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))): + ('pa', 'pe', 'pr', 'pt', 'ptrad', 'begin', 'end', 'motyw'))): tail = node.tail node.clear() node.tag = 'span' node.tail = tail def fix_pa_akap(self): - for pa in ('pa','pe','pr','pt'): + for pa in ('pa','pe','pr','pt', 'ptrad'): for akap in self.edoc.findall(f'//{pa}/akap'): akap.getparent().set('blocks', 'true') if not akap.getparent().index(akap): akap.set('inline', 'true') + def hebr_protect(self): + for s in self.edoc.findall('//slowo_obce'): + if not s.text and len(s) == 1 and s[0].tag == 'slowo_obce': + continue + if re.match(r'^[\s\u0590-\u05ff]+$', s.text or ''): + s.attrib['protect'] = 'true' + def editors(self): """Returns a set of all editors for book and its children.