Basic biblical tools.
[librarian.git] / src / librarian / document.py
index aa6f37f..7f468fa 100644 (file)
@@ -1,13 +1,14 @@
 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
 #
 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
 #
+from collections import defaultdict, Counter
 import gettext
 import os
 import re
 import urllib.request
 from lxml import etree
 from .parser import parser
 import gettext
 import os
 import re
 import urllib.request
 from lxml import etree
 from .parser import parser
-from . import dcparser, DCNS, DirDocProvider
+from . import dcparser, DCNS, RDFNS, DirDocProvider
 from .functions import lang_code_3to2
 
 
 from .functions import lang_code_3to2
 
 
@@ -16,7 +17,11 @@ class WLDocument:
         source = filename or urllib.request.urlopen(url)
         tree = etree.parse(source, parser=parser)
         self.tree = tree
         source = filename or urllib.request.urlopen(url)
         tree = etree.parse(source, parser=parser)
         self.tree = tree
+        self.counters = defaultdict(lambda: 1)
         tree.getroot().document = self
         tree.getroot().document = self
+
+        self.preprocess()
+
         self.base_meta = dcparser.BookInfo({}, {
             DCNS('language'): ["pol"],
         }, validate_required=False)
         self.base_meta = dcparser.BookInfo({}, {
             DCNS('language'): ["pol"],
         }, validate_required=False)
@@ -32,83 +37,29 @@ class WLDocument:
         return self.tree.getroot().meta
         return master.meta
 
         return self.tree.getroot().meta
         return master.meta
 
+    def preprocess(self):
+        # Visit every <strofa> element in the parsed tree and call that
+        # element's own preprocess() hook. The constructor invokes this
+        # right after the tree has been parsed.
+        # NOTE(review): the hook is defined on the element class, outside
+        # this file; per the note below it presumably rewrites the stanza
+        # in place -- confirm there.
+        # Change slash-verses into real verses.
+        # Only 'start' events for the 'strofa' tag are requested, so each
+        # stanza is handled when the walk first reaches it.
+        for _e, elem in etree.iterwalk(self.tree, ('start',), 'strofa'):
+            elem.preprocess()
+
+    def assign_ids(self):
+        # Ask every element that declares a truthy NUMBERING attribute to
+        # assign its own identifier, passing this document as the argument.
+        # NOTE(review): assign_id() is defined on the element classes and
+        # presumably uses the document's `counters` mapping -- confirm there.
+        # Assign IDs depth-first, to account for any <numeracja> inside.
+        # An 'end' event fires only after all descendants of an element
+        # have been walked, so children are processed before their parent.
+        for _e, elem in etree.iterwalk(self.tree, events=('end',)):
+            # Elements without NUMBERING (or with a falsy one) are skipped.
+            if getattr(elem, 'NUMBERING', None):
+                elem.assign_id(self)
+
     @property
     def children(self):
         for part_uri in self.meta.parts or []:
             with self.provider.by_slug(part_uri.slug) as f:
     @property
     def children(self):
         for part_uri in self.meta.parts or []:
             with self.provider.by_slug(part_uri.slug) as f:
-                yield type(self)(filename=f, provider=self.provider)
-    
-    def build(self, builder, base_url=None, **kwargs):
-        return builder(base_url=base_url).build(self, **kwargs)
-
-    def assign_ids(self, existing=None):
-        # Find all existing IDs.
-        existing = existing or set()
-        que = [self.tree.getroot()]
-        while que:
-            item = que.pop(0)
-            try:
-                item.normalize_insides()
-            except AttributeError:
-                pass
-            existing.add(item.attrib.get('id'))
-            que.extend(item)
-
-        i = 1
-        que = [self.tree.getroot()]
-        while que:
-            item = que.pop(0)
-            que.extend(item)
-            if item.attrib.get('id'):
-                continue
-            if not getattr(item, 'SHOULD_HAVE_ID', False):
-                continue
-            while f'e{i}' in existing:
-                i += 1
-            item.attrib['id'] = f'e{i}'
-            i += 1
-    
-    def _compat_assign_ordered_ids(self):
-        """
-        Compatibility: ids in document order, to be roughly compatible with legacy
-        footnote ids. Just for testing consistency, change to some sane identifiers
-        at convenience.
-        """
-        EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
-        def _compat_assign_ordered_ids_in_elem(elem, i):
-            elem.attrib['_compat_ordered_id'] = str(i)
-            i += 1
-            if getattr(elem, 'HTML_CLASS', None) == 'stanza':
-                if elem.text:
-                    i += len(EXPR.split(elem.text)) - 1
-                for sub in elem:
-                    i = _compat_assign_ordered_ids_in_elem(sub, i)
-                    if sub.tail:
-                        i += len(EXPR.split(sub.tail)) - 1
-            else:
-                if elem.tag in ('uwaga', 'extra'):
-                    return i
-                for sub in elem:
-                    i = _compat_assign_ordered_ids_in_elem(sub, i)
-            return i
-
-        _compat_assign_ordered_ids_in_elem(self.tree.getroot(), 4)
-
-    def _compat_assign_section_ids(self):
-        """
-        Ids in master-section order. These need to be compatible with the
-        #secN anchors used by WL search results page to link to fragments.
-        """
-        def _compat_assigns_section_ids_in_elem(elem, prefix='sec'):
-            for i, child in enumerate(elem):
-                idfier = '{}{}'.format(prefix, i + 1)
                 try:
                 try:
-                    child.attrib['_compat_section_id'] = idfier
-                except:
-                    pass
-                _compat_assigns_section_ids_in_elem(child, idfier + '-')
-        _compat_assigns_section_ids_in_elem(self.tree.getroot().master)
+                    yield type(self)(filename=f, provider=self.provider)
+                except Exception as e:
+
+                    yield e
 
 
+    def build(self, builder, base_url=None, **kwargs):
+        # Instantiate `builder` with the given base_url, run its build()
+        # on this document with the remaining keyword arguments, and return
+        # whatever that call returns.
+        return builder(base_url=base_url).build(self, **kwargs)
 
     def editors(self):
         persons = set(self.meta.editors
 
     def editors(self):
         persons = set(self.meta.editors
@@ -121,3 +72,64 @@ class WLDocument:
 
     def references(self):
         return self.tree.findall('.//ref')
 
     def references(self):
         return self.tree.findall('.//ref')
+
+    def get_statistics(self):
+        # Collect text statistics for this document and its parts.
+        #
+        # Returns a dict with three keys:
+        #   "self"  - Counter for this document's own tree,
+        #   "parts" - list of (part, part.get_statistics()) tuples, or
+        #             (None, {'error': exc}) for a part that came back
+        #             from `self.children` as an Exception instance,
+        #   "total" - this document's counts plus each part's "total".
+        #
+        # Counter keys used below: chars, words, verses (text outside the
+        # "in_fn" elements), their *_with_fn variants (all text), and
+        # chars_out_verse / chars_out_verse_with_fn (text outside stanzas).
+        def count_text(text, counter, in_fn=False, stanza=False):
+            # Add one text chunk (an element's .text or .tail) to `counter`.
+            # Empty or None text is ignored.
+            if text:
+                # Collapse every whitespace run to a single space first.
+                text = re.sub(r'\s+', ' ', text)
+
+                # Whitespace-only chunks count as zero; otherwise the length
+                # of the collapsed (not stripped) text is used.
+                chars = len(text) if text.strip() else 0
+                words = len(text.split()) if text.strip() else 0
+
+                counter['chars_with_fn'] += chars
+                counter['words_with_fn'] += words
+                if not in_fn:
+                    counter['chars'] += chars
+                    counter['words'] += words
+                if not stanza:
+                    counter['chars_out_verse_with_fn'] += chars
+                    if not in_fn:
+                        counter['chars_out_verse'] += chars
+
+        def count(elem, counter, in_fn=False, stanza=False):
+            # Recursively count `elem` and its subtree into `counter`.
+            # These subtrees are skipped entirely. The caller's loop still
+            # counts the tail text that follows such a child.
+            if elem.tag in (RDFNS('RDF'), 'nota_red', 'abstrakt', 'uwaga', 'ekstra'):
+                return
+            # Once inside one of these tags, the flag stays set for the
+            # whole subtree.
+            # NOTE(review): presumably footnote and theme markup -- confirm.
+            if not in_fn and elem.tag in ('pa', 'pe', 'pr', 'pt', 'motyw'):
+                in_fn = True
+            if elem.tag == 'strofa':
+                # count verses now
+                #verses = len(elem.findall('.//br')) + 1
+                # get_verses() is provided by the element class (not shown
+                # here); it is materialised so it can be both measured and
+                # iterated.
+                verses = list(elem.get_verses())
+                counter['verses_with_fn'] += len(verses)
+                if not in_fn:
+                    counter['verses'] += len(verses)
+                # NOTE(review): this local is not read afterwards; the
+                # recursive call below passes stanza=True explicitly.
+                stanza = True
+
+                # Only the verses are walked; the stanza's own text and
+                # direct children are not counted separately.
+                for child in verses:
+                    count(child, counter, in_fn=in_fn, stanza=True)
+            else:
+                count_text(elem.text, counter, in_fn=in_fn, stanza=stanza)
+                for child in elem:
+                    count(child, counter, in_fn=in_fn, stanza=stanza)
+                    # Tail text belongs to the parent's content, so it is
+                    # counted with the parent's flags.
+                    count_text(child.tail, counter, in_fn=in_fn, stanza=stanza)
+
+        data = {
+            "self": Counter(),
+            "parts": [],
+            "total": {
+            }
+        }
+
+        count(self.tree.getroot(), data['self'])
+        # Seed the totals with this document's own counts.
+        for k, v in data['self'].items():
+            data['total'][k] = v
+
+        for part in self.children:
+            # `children` yields an Exception instance, instead of raising,
+            # when a part cannot be loaded; record it and keep going.
+            if isinstance(part, Exception):
+                data['parts'].append((None, {'error': part}))
+            else:
+                data['parts'].append((part, part.get_statistics()))
+                # Fold the part's recursive totals into this document's.
+                for k, v in data['parts'][-1][1]['total'].items():
+                    data['total'][k] = data['total'].get(k, 0) + v
+
+        return data