X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/52de56522d8d29146b3be2266f57ccef0abe417a..86becb1ae4d215dd782dfc054471d724f6dcd1d7:/src/librarian/parser.py?ds=inline diff --git a/src/librarian/parser.py b/src/librarian/parser.py index 8adde33..f4288f8 100644 --- a/src/librarian/parser.py +++ b/src/librarian/parser.py @@ -30,7 +30,10 @@ class WLElementLookup(etree.CustomElementClassLookup): return if namespace: return - return WL_ELEMENTS[name] + try: + return WL_ELEMENTS[name] + except KeyError: + return parser = etree.XMLParser() @@ -74,30 +77,42 @@ class WLDocument(object): self.book_info = None def get_statistics(self): - def count_text(text, counter, in_fn=False): + def count_text(text, counter, in_fn=False, stanza=False): if text: text = re.sub(r'\s+', ' ', text) chars = len(text) if text.strip() else 0 words = len(text.split()) if text.strip() else 0 - counter['chars'] += chars - counter['words'] += words + counter['chars_with_fn'] += chars + counter['words_with_fn'] += words if not in_fn: - counter['chars_with_fn'] += chars - counter['words_with_fn'] += words + counter['chars'] += chars + counter['words'] += words + if not stanza: + counter['chars_out_verse_with_fn'] += chars + if not in_fn: + counter['chars_out_verse'] += chars - def count(elem, counter, in_fn=False): + def count(elem, counter, in_fn=False, stanza=False): if elem.tag in (RDFNS('RDF'), 'nota_red', 'abstrakt', 'uwaga', 'ekstra'): return if not in_fn and elem.tag in ('pa', 'pe', 'pr', 'pt', 'motyw'): in_fn = True - count_text(elem.text, counter, in_fn=in_fn) + if elem.tag == 'strofa': + # count verses now + verses = len(elem.findall('.//br')) + 1 + counter['verses_with_fn'] += verses + if not in_fn: + counter['verses'] += verses + stanza = True + count_text(elem.text, counter, in_fn=in_fn, stanza=stanza) for child in elem: - count(child, counter, in_fn=in_fn) - count_text(child.tail, counter, in_fn=in_fn) - - + count(child, counter, in_fn=in_fn, stanza=stanza) + count_text(child.tail, counter, in_fn=in_fn, stanza=stanza) + + self.swap_endlines() + data = { "self": Counter(), "parts": [], @@ -115,7 +130,7 @@ class WLDocument(object): else: data['parts'].append((part, part.get_statistics())) for k, v in data['parts'][-1][1]['total'].items(): - data['total'][k] += v + data['total'][k] = data['total'].get(k, 0) + v return data @@ -178,7 +193,7 @@ class WLDocument(object): for part_uri in self.book_info.parts: try: yield self.from_file( - self.provider.by_uri(part_uri), provider=self.provider + self.provider.by_slug(part_uri.slug), provider=self.provider ) except Exception as e: if pass_part_errors: