X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/1fc6730667e7018f363771ab01584ffa716812de..2991698067a246ea9b99b4e668d261af6d418eca:/src/librarian/parser.py diff --git a/src/librarian/parser.py b/src/librarian/parser.py index 5e2fb08..f4288f8 100644 --- a/src/librarian/parser.py +++ b/src/librarian/parser.py @@ -77,30 +77,42 @@ class WLDocument(object): self.book_info = None def get_statistics(self): - def count_text(text, counter, in_fn=False): + def count_text(text, counter, in_fn=False, stanza=False): if text: text = re.sub(r'\s+', ' ', text) chars = len(text) if text.strip() else 0 words = len(text.split()) if text.strip() else 0 - counter['chars'] += chars - counter['words'] += words + counter['chars_with_fn'] += chars + counter['words_with_fn'] += words if not in_fn: - counter['chars_with_fn'] += chars - counter['words_with_fn'] += words + counter['chars'] += chars + counter['words'] += words + if not stanza: + counter['chars_out_verse_with_fn'] += chars + if not in_fn: + counter['chars_out_verse'] += chars - def count(elem, counter, in_fn=False): + def count(elem, counter, in_fn=False, stanza=False): if elem.tag in (RDFNS('RDF'), 'nota_red', 'abstrakt', 'uwaga', 'ekstra'): return if not in_fn and elem.tag in ('pa', 'pe', 'pr', 'pt', 'motyw'): in_fn = True - count_text(elem.text, counter, in_fn=in_fn) + if elem.tag == 'strofa': + # count verses now + verses = len(elem.findall('.//br')) + 1 + counter['verses_with_fn'] += verses + if not in_fn: + counter['verses'] += verses + stanza = True + count_text(elem.text, counter, in_fn=in_fn, stanza=stanza) for child in elem: - count(child, counter, in_fn=in_fn) - count_text(child.tail, counter, in_fn=in_fn) - - + count(child, counter, in_fn=in_fn, stanza=stanza) + count_text(child.tail, counter, in_fn=in_fn, stanza=stanza) + + self.swap_endlines() + data = { "self": Counter(), "parts": [], @@ -181,7 +193,7 @@ class WLDocument(object): for part_uri in self.book_info.parts: try: yield self.from_file( - self.provider.by_uri(part_uri), provider=self.provider + self.provider.by_slug(part_uri.slug), provider=self.provider ) except Exception as e: if pass_part_errors: