move stats to L2 23.08
author Radek Czajka <rczajka@rczajka.pl>
Thu, 10 Aug 2023 16:21:01 +0000 (18:21 +0200)
committer Radek Czajka <rczajka@rczajka.pl>
Thu, 10 Aug 2023 20:35:19 +0000 (22:35 +0200)
CHANGELOG.md
setup.py
src/librarian/document.py
src/librarian/parser.py

index a36fd2f..a1cff15 100644 (file)
@@ -2,6 +2,55 @@
 
 This document records all notable changes to Librarian.
 
+## 23.08
+
+- Move statistics counter to L2 WLDocument.
+
+## 23.07.1
+
+- Add <category.thema.main>.
+- Support Python 3.7--3.11.
+
+## 2.6.1
+
+- Fix for better ignoring <extra>.
+
+## 2.6
+
+- Change default cover to marquise.
+- Add support for full predesigned covers to marquise.
+- Remove support for changing actual cover class via coverClass.
+
+## 2.5.1
+
+- Bugfix release.
+
+## 2.5
+
+- Add html-snippet builder.
+- Remove DateValue class.
+- Fix some texts and tests.
+- Drop Python < 3.6. Up to 3.9 is supported.
+
+## 2.4.13
+
+- Added thema meta field.
+
+## 2.4.12
+
+- Fix for marquise cover: allow scaling title text in all layouts.
+
+## 2.4.11.1
+
+- Added assigning and preserving id attribute.
+
+## 2.4.10
+
+- Added <wers_srodek>, <tab>, <rownolegle> and <blok>.
+
+## 2.4.9
+
+- Added verse counters to document statistics.
 
 ## 2.4.8 (2022-07-23)
 
index b4eb161..0f7be1b 100755 (executable)
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@ def whole_tree(prefix, path):
 
 setup(
     name='librarian',
-    version='23.07.1',
+    version='23.8',
     description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats',
     author="Marek Stępniowski",
     author_email='marek@stepniowski.com',
index aa6f37f..2e9a4a5 100644 (file)
@@ -1,13 +1,14 @@
 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
 #
+from collections import Counter
 import gettext
 import os
 import re
 import urllib.request
 from lxml import etree
 from .parser import parser
-from . import dcparser, DCNS, DirDocProvider
+from . import dcparser, DCNS, RDFNS, DirDocProvider
 from .functions import lang_code_3to2
 
 
@@ -36,8 +37,12 @@ class WLDocument:
     def children(self):
         for part_uri in self.meta.parts or []:
             with self.provider.by_slug(part_uri.slug) as f:
-                yield type(self)(filename=f, provider=self.provider)
-    
+                try:
+                    yield type(self)(filename=f, provider=self.provider)
+                except Exception as e:
+
+                    yield e
+
     def build(self, builder, base_url=None, **kwargs):
         return builder(base_url=base_url).build(self, **kwargs)
 
@@ -67,7 +72,7 @@ class WLDocument:
                 i += 1
             item.attrib['id'] = f'e{i}'
             i += 1
-    
+
     def _compat_assign_ordered_ids(self):
         """
         Compatibility: ids in document order, to be roughly compatible with legacy
@@ -121,3 +126,64 @@ class WLDocument:
 
     def references(self):
         return self.tree.findall('.//ref')
+
+    def get_statistics(self):
+        def count_text(text, counter, in_fn=False, stanza=False):
+            if text:
+                text = re.sub(r'\s+', ' ', text)
+
+                chars = len(text) if text.strip() else 0
+                words = len(text.split()) if text.strip() else 0
+
+                counter['chars_with_fn'] += chars
+                counter['words_with_fn'] += words
+                if not in_fn:
+                    counter['chars'] += chars
+                    counter['words'] += words
+                if not stanza:
+                    counter['chars_out_verse_with_fn'] += chars
+                    if not in_fn:
+                        counter['chars_out_verse'] += chars
+
+        def count(elem, counter, in_fn=False, stanza=False):
+            if elem.tag in (RDFNS('RDF'), 'nota_red', 'abstrakt', 'uwaga', 'ekstra'):
+                return
+            if not in_fn and elem.tag in ('pa', 'pe', 'pr', 'pt', 'motyw'):
+                in_fn = True
+            if elem.tag == 'strofa':
+                # count verses now
+                #verses = len(elem.findall('.//br')) + 1
+                verses = list(elem.get_verses())
+                counter['verses_with_fn'] += len(verses)
+                if not in_fn:
+                    counter['verses'] += len(verses)
+                stanza = True
+
+                for child in verses:
+                    count(child, counter, in_fn=in_fn, stanza=True)
+            else:
+                count_text(elem.text, counter, in_fn=in_fn, stanza=stanza)
+                for child in elem:
+                    count(child, counter, in_fn=in_fn, stanza=stanza)
+                    count_text(child.tail, counter, in_fn=in_fn, stanza=stanza)
+
+        data = {
+            "self": Counter(),
+            "parts": [],
+            "total": {
+            }
+        }
+
+        count(self.tree.getroot(), data['self'])
+        for k, v in data['self'].items():
+            data['total'][k] = v
+
+        for part in self.children:
+            if isinstance(part, Exception):
+                data['parts'].append((None, {'error': part}))
+            else:
+                data['parts'].append((part, part.get_statistics()))
+                for k, v in data['parts'][-1][1]['total'].items():
+                    data['total'][k] = data['total'].get(k, 0) + v
+
+        return data
index 484b8f9..b4e4c5c 100644 (file)
@@ -67,64 +67,6 @@ class WLDocument:
         else:
             self.book_info = None
 
-    def get_statistics(self):
-        def count_text(text, counter, in_fn=False, stanza=False):
-            if text:
-                text = re.sub(r'\s+', ' ', text)
-
-                chars = len(text) if text.strip() else 0
-                words = len(text.split()) if text.strip() else 0
-                
-                counter['chars_with_fn'] += chars
-                counter['words_with_fn'] += words
-                if not in_fn:
-                    counter['chars'] += chars
-                    counter['words'] += words
-                if not stanza:
-                    counter['chars_out_verse_with_fn'] += chars
-                    if not in_fn:
-                        counter['chars_out_verse'] += chars
-                
-        def count(elem, counter, in_fn=False, stanza=False):
-            if elem.tag in (RDFNS('RDF'), 'nota_red', 'abstrakt', 'uwaga', 'ekstra'):
-                return
-            if not in_fn and elem.tag in ('pa', 'pe', 'pr', 'pt', 'motyw'):
-                in_fn = True
-            if elem.tag == 'strofa':
-                # count verses now
-                verses = len(elem.findall('.//br')) + 1
-                counter['verses_with_fn'] += verses
-                if not in_fn:
-                    counter['verses'] += verses
-                stanza = True
-            count_text(elem.text, counter, in_fn=in_fn, stanza=stanza)
-            for child in elem:
-                count(child, counter, in_fn=in_fn, stanza=stanza)
-                count_text(child.tail, counter, in_fn=in_fn, stanza=stanza)
-
-        self.swap_endlines()
-
-        data = {
-            "self": Counter(),
-            "parts": [],
-            "total": {
-            }
-        }
-
-        count(self.edoc.getroot(), data['self'])
-        for k, v in data['self'].items():
-            data['total'][k] = v
-        
-        for part in self.parts(pass_part_errors=True):
-            if isinstance(part, Exception):
-                data['parts'].append((None, {}))
-            else:
-                data['parts'].append((part, part.get_statistics()))
-                for k, v in data['parts'][-1][1]['total'].items():
-                    data['total'][k] = data['total'].get(k, 0) + v
-            
-        return data
-
     @classmethod
     def from_bytes(cls, xml, *args, **kwargs):
         return cls.from_file(io.BytesIO(xml), *args, **kwargs)