From: Radek Czajka <rczajka@rczajka.pl>
Date: Wed, 18 Sep 2024 12:53:27 +0000 (+0200)
Subject: Update to new librarian api for html, txt.
X-Git-Url: https://git.mdrn.pl/wolnelektury.git/commitdiff_plain/4faaa8414a1038804df9999a094c8dfe4c5f513f?ds=inline

Update to new librarian api for html, txt.
---

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 8c508540a..54b199b6f 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -39,7 +39,7 @@ mutagen==1.45.1
 sorl-thumbnail==12.8.0
 
 # home-brewed & dependencies
-librarian==24.5
+librarian==24.9
 
 # celery tasks
 celery[redis]==5.2.7
diff --git a/src/catalogue/fields.py b/src/catalogue/fields.py
index 9c5696fa6..ebe5cf478 100644
--- a/src/catalogue/fields.py
+++ b/src/catalogue/fields.py
@@ -229,10 +229,12 @@ class XmlField(EbookField):
 class TxtField(EbookField):
     ext = 'txt'
     for_parents = False
+    librarian2_api = True
 
     @staticmethod
     def transform(wldoc, book):
-        return wldoc.as_text()
+        from librarian.builders.txt import TxtBuilder
+        return TxtBuilder().build(wldoc)
 
 
 class Fb2Field(EbookField):
@@ -299,6 +301,7 @@ class MobiField(EbookField):
 class HtmlField(EbookField):
     ext = 'html'
     for_parents = False
+    librarian2_api = True
 
     def build(self, fieldfile):
         from django.core.files.base import ContentFile
@@ -309,7 +312,7 @@ class HtmlField(EbookField):
 
         book = fieldfile.instance
 
-        html_output = self.transform(book.wldocument(parse_dublincore=False), book)
+        html_output = self.transform(book.wldocument2(), book)
 
         # Delete old fragments, create from scratch if necessary.
         book.fragments.all().delete()
@@ -385,17 +388,15 @@ class HtmlField(EbookField):
 
     @staticmethod
     def transform(wldoc, book):
-        # ugly, but we can't use wldoc.book_info here
-        from librarian import DCNS
-        url_elem = wldoc.edoc.getroot().find('.//' + DCNS('identifier.url'))
-        if url_elem is None:
+        from librarian.builders.html import HtmlBuilder
+        url = wldoc.meta.url
+        if not url:
             gal_url = ''
             gal_path = ''
         else:
-            slug = url_elem.text.rstrip('/').rsplit('/', 1)[1]
-            gal_url = gallery_url(slug=slug)
-            gal_path = gallery_path(slug=slug)
-        return wldoc.as_html(gallery_path=gal_path, gallery_url=gal_url, base_url=absolute_url(gal_url))
+            gal_url = gallery_url(slug=url.slug)
+            gal_path = gallery_path(slug=url.slug)
+        return HtmlBuilder(gallery_path=gal_path, gallery_url=gal_url, base_url=absolute_url(gal_url)).build(wldoc)
 
 
 class CoverField(EbookField):
diff --git a/src/catalogue/management/commands/load_abstracts.py b/src/catalogue/management/commands/load_abstracts.py
deleted file mode 100644
index f9fb2c155..000000000
--- a/src/catalogue/management/commands/load_abstracts.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# This file is part of Wolne Lektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
-#
-from django.core.management.base import BaseCommand
-
-from catalogue.models import Book
-
-
-class Command(BaseCommand):
-    def handle(self, *args, **options):
-        for b in Book.objects.order_by('slug'):
-            print(b.slug)
-            b.load_abstract()
-            b.save()
diff --git a/src/catalogue/management/commands/update_tag_description.py b/src/catalogue/management/commands/update_tag_description.py
deleted file mode 100644
index 0fb2ffd95..000000000
--- a/src/catalogue/management/commands/update_tag_description.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# This file is part of Wolne Lektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
-#
-from django.core.management import BaseCommand
-from catalogue.models import Tag
-
-
-class Command(BaseCommand):
-    help = "Update description for given tag."
-
-    def add_arguments(self, parser):
-        parser.add_argument('category')
-        parser.add_argument('slug')
-        parser.add_argument('description_filename')
-
-    def handle(self, category, slug, description_filename, **options):
-        tag = Tag.objects.get(category=category, slug=slug)
-        description = open(description_filename).read().decode('utf-8')
-        tag.description = description
-        tag.save()
diff --git a/src/catalogue/migrations/0049_snippet_anchor.py b/src/catalogue/migrations/0049_snippet_anchor.py
new file mode 100644
index 000000000..c3e11d121
--- /dev/null
+++ b/src/catalogue/migrations/0049_snippet_anchor.py
@@ -0,0 +1,19 @@
+# Generated by Django 4.0.8 on 2024-09-17 14:14
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('catalogue', '0048_remove_collection_kind_remove_tag_for_books_and_more'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='snippet',
+            name='anchor',
+            field=models.CharField(default='', max_length=64),
+            preserve_default=False,
+        ),
+    ]
diff --git a/src/catalogue/models/book.py b/src/catalogue/models/book.py
index 29e375464..7d3666277 100644
--- a/src/catalogue/models/book.py
+++ b/src/catalogue/models/book.py
@@ -19,7 +19,7 @@ from django.utils.translation import gettext_lazy as _, get_language
 from fnpdjango.storage import BofhFileSystemStorage
 from lxml import html
 from librarian.cover import WLCover
-from librarian.html import transform_abstrakt
+from librarian.builders.html import AbstraktHtmlBuilder
 from librarian.builders import builders
 from newtagging import managers
 from catalogue import constants
@@ -327,7 +327,10 @@ class Book(models.Model):
         return int(total)
 
     def get_time(self):
-        return round(self.xml_file.size / 1000 * 40)
+        try:
+            return round(self.xml_file.size / 1000 * 40)
+        except ValueError:
+            return 0
     
     def has_media(self, type_):
         if type_ in Book.formats:
@@ -554,11 +557,8 @@ class Book(models.Model):
                 urlretrieve('%s/%s' % (remote_gallery_url, ilustr_src), ilustr_path)
 
     def load_abstract(self):
-        abstract = self.wldocument(parse_dublincore=False).edoc.getroot().find('.//abstrakt')
-        if abstract is not None:
-            self.abstract = transform_abstrakt(abstract)
-        else:
-            self.abstract = ''
+        self.abstract = AbstraktHtmlBuilder().build(
+            self.wldocument2()).get_bytes().decode('utf-8')
 
     def load_toc(self):
         self.toc = ''
@@ -717,13 +717,14 @@ class Book(models.Model):
         cls.published.send(sender=cls, instance=book)
         return book
 
+    # TODO TEST
     def update_references(self):
         Entity = apps.get_model('references', 'Entity')
         doc = self.wldocument2()
-        doc._compat_assign_section_ids()
-        doc._compat_assign_ordered_ids()
+        doc.assign_ids()
+
         refs = {}
-        for ref_elem in doc.references():
+        for i, ref_elem in enumerate(doc.references()):
             uri = ref_elem.attrib.get('href', '')
             if not uri:
                 continue
@@ -742,10 +743,8 @@ class Book(models.Model):
                 refs[uri] = ref
                 if not ref_created:
                     ref.occurence_set.all().delete()
-            sec = ref_elem.get_link()
-            m = re.match(r'sec(\d+)', sec)
-            assert m is not None
-            sec = int(m.group(1))
+            anchor = ref_elem.get_link()
+
             snippet = ref_elem.get_snippet()
             b = builders['html-snippet']()
             for s in snippet:
@@ -753,7 +752,8 @@ class Book(models.Model):
             html = b.output().get_bytes().decode('utf-8')
 
             ref.occurence_set.create(
-                section=sec,
+                section=i,
+                anchor=anchor,
                 html=html
             )
         self.reference_set.exclude(entity__uri__in=refs).delete()
diff --git a/src/catalogue/models/snippet.py b/src/catalogue/models/snippet.py
index 4c25b8c97..ec7e2f9d1 100644
--- a/src/catalogue/models/snippet.py
+++ b/src/catalogue/models/snippet.py
@@ -7,6 +7,7 @@ from search.utils import UnaccentSearchVector
 class Snippet(models.Model):
     book = models.ForeignKey('Book', models.CASCADE)
     sec = models.IntegerField()
+    anchor = models.CharField(max_length=64)
     text = models.TextField()
     search_vector = SearchVectorField()
 
diff --git a/src/catalogue/templatetags/catalogue_tags.py b/src/catalogue/templatetags/catalogue_tags.py
index d2298d9f5..d70ad970e 100644
--- a/src/catalogue/templatetags/catalogue_tags.py
+++ b/src/catalogue/templatetags/catalogue_tags.py
@@ -307,6 +307,8 @@ def plain_list(context, object_list, with_initials=True, by_author=False, choice
 @register.simple_tag
 def related_books_2022(book=None, limit=4, taken=0):
     limit -= taken
+    if limit < 0:
+        return []
     max_books = limit
 
     books_qs = Book.objects.filter(findable=True)
diff --git a/src/references/migrations/0003_occurence_anchor.py b/src/references/migrations/0003_occurence_anchor.py
new file mode 100644
index 000000000..05e60a4a3
--- /dev/null
+++ b/src/references/migrations/0003_occurence_anchor.py
@@ -0,0 +1,19 @@
+# Generated by Django 4.0.8 on 2024-09-18 11:29
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('references', '0002_remove_reference_first_section_occurence'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='occurence',
+            name='anchor',
+            field=models.CharField(default='', max_length=64),
+            preserve_default=False,
+        ),
+    ]
diff --git a/src/references/migrations/0004_update_anchor.py b/src/references/migrations/0004_update_anchor.py
new file mode 100644
index 000000000..cd971857f
--- /dev/null
+++ b/src/references/migrations/0004_update_anchor.py
@@ -0,0 +1,22 @@
+# Generated by Django 4.0.8 on 2024-09-18 11:29
+
+from django.db import migrations, models
+from django.db.models.functions import Concat
+
+
+def update_anchor(apps, schema_editor):
+    Occurence = apps.get_model('references', 'Occurence')
+    Occurence.objects.filter(anchor='').update(
+        anchor=Concat(models.Value('sec'), 'section')
+    )
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('references', '0003_occurence_anchor'),
+    ]
+
+    operations = [
+        migrations.RunPython(update_anchor, migrations.RunPython.noop)
+    ]
diff --git a/src/references/models.py b/src/references/models.py
index d4733ffcf..f83518dcd 100644
--- a/src/references/models.py
+++ b/src/references/models.py
@@ -84,6 +84,7 @@ class Reference(models.Model):
 class Occurence(models.Model):
     reference = models.ForeignKey(Reference, models.CASCADE)
     section = models.IntegerField()
+    anchor = models.CharField(max_length=64)
     html = models.TextField()
 
     class Meta:
diff --git a/src/references/templates/references/popup.html b/src/references/templates/references/popup.html
index f8977e6d8..3655309e3 100644
--- a/src/references/templates/references/popup.html
+++ b/src/references/templates/references/popup.html
@@ -28,7 +28,7 @@
     </div>
 
     {% for occ in ref.occurence_set.all %}
-      <a target="_blank" href="/katalog/lektura/{{ ref.book.slug }}.html#sec{{ occ.section }}" class="c-search-result-fragment-text">
+      <a target="_blank" href="/katalog/lektura/{{ ref.book.slug }}.html#{{ occ.anchor }}" class="c-search-result-fragment-text">
         {{ occ.html|safe }}
       </a>
     {% endfor %}
diff --git a/src/search/index.py b/src/search/index.py
index e0a727ca2..3b0edebf8 100644
--- a/src/search/index.py
+++ b/src/search/index.py
@@ -2,7 +2,8 @@
 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
 #
 import re
-from librarian.parser import WLDocument
+from librarian.elements.base import WLElement
+from librarian.document import WLDocument
 from lxml import etree
 
 
@@ -31,22 +32,15 @@ class Index:
     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 
-    @classmethod
-    def get_master(cls, root):
-        """
-        Returns the first master tag from an etree.
-        """
-        for master in root.iter():
-            if master.tag in cls.master_tags:
-                return master
-
     @staticmethod
-    def add_snippet(book, text, position):
+    def add_snippet(book, text, position, anchor):
         book.snippet_set.create(
             sec=position + 1,
-            text=text
+            text=text,
+            anchor=anchor
         )
 
+    # TODO: The section links stuff won't work.
     @classmethod
     def index_book(cls, book):
         """
@@ -57,13 +51,23 @@ class Index:
 
         book.snippet_set.all().delete()
 
-        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
-        root = wld.edoc.getroot()
+        wld = WLDocument(filename=book.xml_file.path)
+        wld.assign_ids()
 
-        master = cls.get_master(root)
+        master = wld.tree.getroot().master
         if master is None:
             return []
 
+        def get_indexable(element):
+            for child in element:
+                if not isinstance(child, WLElement):
+                    continue
+                if not child.attrib.get('_id'):
+                    for e in get_indexable(child):
+                        yield e
+                else:
+                    yield child
+
         def walker(node):
             if node.tag not in cls.ignore_content_tags:
                 yield node, None, None
@@ -85,12 +89,14 @@ class Index:
 
             return re.sub("(?m)/$", "", text)
 
-        for position, header in enumerate(master):
+        for position, header in enumerate(get_indexable(master)):
             if header.tag in cls.skip_header_tags:
                 continue
             if header.tag is etree.Comment:
                 continue
 
+            el_id = header.attrib['_id']
+
             # section content
             content = []
             footnote = []
@@ -110,7 +116,7 @@ class Index:
                     handle_text.append(collect_footnote)
                 elif end is not None and footnote is not [] and end.tag in cls.footnote_tags:
                     handle_text.pop()
-                    cls.add_snippet(book, ''.join(footnote), position)
+                    cls.add_snippet(book, ''.join(footnote), position, el_id)
                     footnote = []
 
                 if text is not None and handle_text is not []:
@@ -118,4 +124,4 @@ class Index:
                     hdl(text)
 
             # in the end, add a section text.
-            cls.add_snippet(book, fix_format(content), position)
+            cls.add_snippet(book, fix_format(content), position, el_id)
diff --git a/src/search/templates/search/results.html b/src/search/templates/search/results.html
index c71cf45c8..2902f2601 100644
--- a/src/search/templates/search/results.html
+++ b/src/search/templates/search/results.html
@@ -126,7 +126,7 @@
               {{ book.title }}
             </a>
             {% for f in snippets %}
-              <a class="c-search-result-fragment-text" href='{% url 'book_text' f.book.slug %}#sec{{ f.sec }}'>
+              <a class="c-search-result-fragment-text" href='{% url 'book_text' f.book.slug %}#{{ f.anchor }}'>
                 {{ f.headline|safe }}
               </a>
             {% endfor %}