Update to new librarian api for html, txt.

author Radek Czajka <rczajka@rczajka.pl>
Wed, 18 Sep 2024 12:53:27 +0000 (14:53 +0200)
committer Radek Czajka <rczajka@rczajka.pl>
Wed, 18 Sep 2024 12:53:27 +0000 (14:53 +0200)
diff --git a/requirements/requirements.txt b/requirements/requirements.txt

index 8c50854..54b199b 100644 (file)
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -39,7 +39,7 @@ mutagen==1.45.1
  sorl-thumbnail==12.8.0
  
  # home-brewed & dependencies
-librarian==24.5
+librarian==24.9
  
  # celery tasks
  celery[redis]==5.2.7
diff --git a/src/catalogue/fields.py b/src/catalogue/fields.py

index 9c5696f..ebe5cf4 100644 (file)
--- a/src/catalogue/fields.py
+++ b/src/catalogue/fields.py
@@ -229,10 +229,12 @@ class XmlField(EbookField):
  class TxtField(EbookField):
      ext = 'txt'
      for_parents = False
+    librarian2_api = True
  
      @staticmethod
      def transform(wldoc, book):
-        return wldoc.as_text()
+        from librarian.builders.txt import TxtBuilder
+        return TxtBuilder().build(wldoc)
  
  
  class Fb2Field(EbookField):
@@ -299,6 +301,7 @@ class MobiField(EbookField):
  class HtmlField(EbookField):
      ext = 'html'
      for_parents = False
+    librarian2_api = True
  
      def build(self, fieldfile):
          from django.core.files.base import ContentFile
@@ -309,7 +312,7 @@ class HtmlField(EbookField):
  
          book = fieldfile.instance
  
-        html_output = self.transform(book.wldocument(parse_dublincore=False), book)
+        html_output = self.transform(book.wldocument2(), book)
  
          # Delete old fragments, create from scratch if necessary.
          book.fragments.all().delete()
@@ -385,17 +388,15 @@ class HtmlField(EbookField):
  
      @staticmethod
      def transform(wldoc, book):
-        # ugly, but we can't use wldoc.book_info here
-        from librarian import DCNS
-        url_elem = wldoc.edoc.getroot().find('.//' + DCNS('identifier.url'))
-        if url_elem is None:
+        from librarian.builders.html import HtmlBuilder
+        url = wldoc.meta.url
+        if not url:
              gal_url = ''
              gal_path = ''
          else:
-            slug = url_elem.text.rstrip('/').rsplit('/', 1)[1]
-            gal_url = gallery_url(slug=slug)
-            gal_path = gallery_path(slug=slug)
-        return wldoc.as_html(gallery_path=gal_path, gallery_url=gal_url, base_url=absolute_url(gal_url))
+            gal_url = gallery_url(slug=url.slug)
+            gal_path = gallery_path(slug=url.slug)
+        return HtmlBuilder(gallery_path=gal_path, gallery_url=gal_url, base_url=absolute_url(gal_url)).build(wldoc)
  
  
  class CoverField(EbookField):
diff --git a/src/catalogue/management/commands/load_abstracts.py b/src/catalogue/management/commands/load_abstracts.py

deleted file mode 100644 (file)

index f9fb2c1..0000000
--- a/src/catalogue/management/commands/load_abstracts.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# This file is part of Wolne Lektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
-#
-from django.core.management.base import BaseCommand
-
-from catalogue.models import Book
-
-
-class Command(BaseCommand):
-    def handle(self, *args, **options):
-        for b in Book.objects.order_by('slug'):
-            print(b.slug)
-            b.load_abstract()
-            b.save()
diff --git a/src/catalogue/management/commands/update_tag_description.py b/src/catalogue/management/commands/update_tag_description.py

deleted file mode 100644 (file)

index 0fb2ffd..0000000
--- a/src/catalogue/management/commands/update_tag_description.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# This file is part of Wolne Lektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
-#
-from django.core.management import BaseCommand
-from catalogue.models import Tag
-
-
-class Command(BaseCommand):
-    help = "Update description for given tag."
-
-    def add_arguments(self, parser):
-        parser.add_argument('category')
-        parser.add_argument('slug')
-        parser.add_argument('description_filename')
-
-    def handle(self, category, slug, description_filename, **options):
-        tag = Tag.objects.get(category=category, slug=slug)
-        description = open(description_filename).read().decode('utf-8')
-        tag.description = description
-        tag.save()
diff --git a/src/catalogue/migrations/0049_snippet_anchor.py b/src/catalogue/migrations/0049_snippet_anchor.py

new file mode 100644 (file)

index 0000000..c3e11d1
--- /dev/null
+++ b/src/catalogue/migrations/0049_snippet_anchor.py
@@ -0,0 +1,19 @@
+# Generated by Django 4.0.8 on 2024-09-17 14:14
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('catalogue', '0048_remove_collection_kind_remove_tag_for_books_and_more'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='snippet',
+            name='anchor',
+            field=models.CharField(default='', max_length=64),
+            preserve_default=False,
+        ),
+    ]
diff --git a/src/catalogue/models/book.py b/src/catalogue/models/book.py

index 29e3754..7d36662 100644 (file)
--- a/src/catalogue/models/book.py
+++ b/src/catalogue/models/book.py
@@ -19,7 +19,7 @@ from django.utils.translation import gettext_lazy as _, get_language
  from fnpdjango.storage import BofhFileSystemStorage
  from lxml import html
  from librarian.cover import WLCover
-from librarian.html import transform_abstrakt
+from librarian.builders.html import AbstraktHtmlBuilder
  from librarian.builders import builders
  from newtagging import managers
  from catalogue import constants
@@ -327,7 +327,10 @@ class Book(models.Model):
          return int(total)
  
      def get_time(self):
-        return round(self.xml_file.size / 1000 * 40)
+        try:
+            return round(self.xml_file.size / 1000 * 40)
+        except ValueError:
+            return 0
      
      def has_media(self, type_):
          if type_ in Book.formats:
@@ -554,11 +557,8 @@ class Book(models.Model):
                  urlretrieve('%s/%s' % (remote_gallery_url, ilustr_src), ilustr_path)
  
      def load_abstract(self):
-        abstract = self.wldocument(parse_dublincore=False).edoc.getroot().find('.//abstrakt')
-        if abstract is not None:
-            self.abstract = transform_abstrakt(abstract)
-        else:
-            self.abstract = ''
+        self.abstract = AbstraktHtmlBuilder().build(
+            self.wldocument2()).get_bytes().decode('utf-8')
  
      def load_toc(self):
          self.toc = ''
@@ -717,13 +717,14 @@ class Book(models.Model):
          cls.published.send(sender=cls, instance=book)
          return book
  
+    # TODO TEST
      def update_references(self):
          Entity = apps.get_model('references', 'Entity')
          doc = self.wldocument2()
-        doc._compat_assign_section_ids()
-        doc._compat_assign_ordered_ids()
+        doc.assign_ids()
+
          refs = {}
-        for ref_elem in doc.references():
+        for i, ref_elem in enumerate(doc.references()):
              uri = ref_elem.attrib.get('href', '')
              if not uri:
                  continue
@@ -742,10 +743,8 @@ class Book(models.Model):
                  refs[uri] = ref
                  if not ref_created:
                      ref.occurence_set.all().delete()
-            sec = ref_elem.get_link()
-            m = re.match(r'sec(\d+)', sec)
-            assert m is not None
-            sec = int(m.group(1))
+            anchor = ref_elem.get_link()
+
              snippet = ref_elem.get_snippet()
              b = builders['html-snippet']()
              for s in snippet:
@@ -753,7 +752,8 @@ class Book(models.Model):
              html = b.output().get_bytes().decode('utf-8')
  
              ref.occurence_set.create(
-                section=sec,
+                section=i,
+                anchor=anchor,
                  html=html
              )
          self.reference_set.exclude(entity__uri__in=refs).delete()
diff --git a/src/catalogue/models/snippet.py b/src/catalogue/models/snippet.py

index 4c25b8c..ec7e2f9 100644 (file)
--- a/src/catalogue/models/snippet.py
+++ b/src/catalogue/models/snippet.py
@@ -7,6 +7,7 @@ from search.utils import UnaccentSearchVector
  class Snippet(models.Model):
      book = models.ForeignKey('Book', models.CASCADE)
      sec = models.IntegerField()
+    anchor = models.CharField(max_length=64)
      text = models.TextField()
      search_vector = SearchVectorField()
  
diff --git a/src/catalogue/templatetags/catalogue_tags.py b/src/catalogue/templatetags/catalogue_tags.py

index d2298d9..d70ad97 100644 (file)
--- a/src/catalogue/templatetags/catalogue_tags.py
+++ b/src/catalogue/templatetags/catalogue_tags.py
@@ -307,6 +307,8 @@ def plain_list(context, object_list, with_initials=True, by_author=False, choice
  @register.simple_tag
  def related_books_2022(book=None, limit=4, taken=0):
      limit -= taken
+    if limit < 0:
+        return []
      max_books = limit
  
      books_qs = Book.objects.filter(findable=True)
diff --git a/src/references/migrations/0003_occurence_anchor.py b/src/references/migrations/0003_occurence_anchor.py

new file mode 100644 (file)

index 0000000..05e60a4
--- /dev/null
+++ b/src/references/migrations/0003_occurence_anchor.py
@@ -0,0 +1,19 @@
+# Generated by Django 4.0.8 on 2024-09-18 11:29
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('references', '0002_remove_reference_first_section_occurence'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='occurence',
+            name='anchor',
+            field=models.CharField(default='', max_length=64),
+            preserve_default=False,
+        ),
+    ]
diff --git a/src/references/migrations/0004_update_anchor.py b/src/references/migrations/0004_update_anchor.py

new file mode 100644 (file)

index 0000000..cd97185
--- /dev/null
+++ b/src/references/migrations/0004_update_anchor.py
@@ -0,0 +1,22 @@
+# Generated by Django 4.0.8 on 2024-09-18 11:29
+
+from django.db import migrations, models
+from django.db.models.functions import Concat
+
+
+def update_anchor(apps, schema_editor):
+    Occurence = apps.get_model('references', 'Occurence')
+    Occurence.objects.filter(anchor='').update(
+        anchor=Concat(models.Value('sec'), 'section')
+    )
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('references', '0003_occurence_anchor'),
+    ]
+
+    operations = [
+        migrations.RunPython(update_anchor, migrations.RunPython.noop)
+    ]
diff --git a/src/references/models.py b/src/references/models.py

index d4733ff..f83518d 100644 (file)
--- a/src/references/models.py
+++ b/src/references/models.py
@@ -84,6 +84,7 @@ class Reference(models.Model):
  class Occurence(models.Model):
      reference = models.ForeignKey(Reference, models.CASCADE)
      section = models.IntegerField()
+    anchor = models.CharField(max_length=64)
      html = models.TextField()
  
      class Meta:
diff --git a/src/references/templates/references/popup.html b/src/references/templates/references/popup.html

index f8977e6..3655309 100644 (file)
--- a/src/references/templates/references/popup.html
+++ b/src/references/templates/references/popup.html
@@ -28,7 +28,7 @@
      </div>
  
      {% for occ in ref.occurence_set.all %}
-      <a target="_blank" href="/katalog/lektura/{{ ref.book.slug }}.html#sec{{ occ.section }}" class="c-search-result-fragment-text">
+      <a target="_blank" href="/katalog/lektura/{{ ref.book.slug }}.html#{{ occ.anchor }}" class="c-search-result-fragment-text">
          {{ occ.html|safe }}
        </a>
      {% endfor %}
diff --git a/src/search/index.py b/src/search/index.py

index e0a727c..3b0edeb 100644 (file)
--- a/src/search/index.py
+++ b/src/search/index.py
@@ -2,7 +2,8 @@
  # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
  #
  import re
-from librarian.parser import WLDocument
+from librarian.elements.base import WLElement
+from librarian.document import WLDocument
  from lxml import etree
  
  
@@ -31,22 +32,15 @@ class Index:
      skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                          '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
  
-    @classmethod
-    def get_master(cls, root):
-        """
-        Returns the first master tag from an etree.
-        """
-        for master in root.iter():
-            if master.tag in cls.master_tags:
-                return master
-
      @staticmethod
-    def add_snippet(book, text, position):
+    def add_snippet(book, text, position, anchor):
          book.snippet_set.create(
              sec=position + 1,
-            text=text
+            text=text,
+            anchor=anchor
          )
  
+    # TODO: The section links stuff won't work.
      @classmethod
      def index_book(cls, book):
          """
@@ -57,13 +51,23 @@ class Index:
  
          book.snippet_set.all().delete()
  
-        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
-        root = wld.edoc.getroot()
+        wld = WLDocument(filename=book.xml_file.path)
+        wld.assign_ids()
  
-        master = cls.get_master(root)
+        master = wld.tree.getroot().master
          if master is None:
              return []
  
+        def get_indexable(element):
+            for child in element:
+                if not isinstance(child, WLElement):
+                    continue
+                if not child.attrib.get('_id'):
+                    for e in get_indexable(child):
+                        yield e
+                else:
+                    yield child
+
          def walker(node):
              if node.tag not in cls.ignore_content_tags:
                  yield node, None, None
@@ -85,12 +89,14 @@ class Index:
  
              return re.sub("(?m)/$", "", text)
  
-        for position, header in enumerate(master):
+        for position, header in enumerate(get_indexable(master)):
              if header.tag in cls.skip_header_tags:
                  continue
              if header.tag is etree.Comment:
                  continue
  
+            el_id = header.attrib['_id']
+
              # section content
              content = []
              footnote = []
@@ -110,7 +116,7 @@ class Index:
                      handle_text.append(collect_footnote)
                  elif end is not None and footnote is not [] and end.tag in cls.footnote_tags:
                      handle_text.pop()
-                    cls.add_snippet(book, ''.join(footnote), position)
+                    cls.add_snippet(book, ''.join(footnote), position, el_id)
                      footnote = []
  
                  if text is not None and handle_text is not []:
@@ -118,4 +124,4 @@ class Index:
                      hdl(text)
  
              # in the end, add a section text.
-            cls.add_snippet(book, fix_format(content), position)
+            cls.add_snippet(book, fix_format(content), position, el_id)
diff --git a/src/search/templates/search/results.html b/src/search/templates/search/results.html

index c71cf45..2902f26 100644 (file)
--- a/src/search/templates/search/results.html
+++ b/src/search/templates/search/results.html
@@ -126,7 +126,7 @@
                {{ book.title }}
              </a>
              {% for f in snippets %}
-              <a class="c-search-result-fragment-text" href='{% url 'book_text' f.book.slug %}#sec{{ f.sec }}'>
+              <a class="c-search-result-fragment-text" href='{% url 'book_text' f.book.slug %}#{{ f.anchor }}'>
                  {{ f.headline|safe }}
                </a>
              {% endfor %}
author	Radek Czajka <rczajka@rczajka.pl>
	Wed, 18 Sep 2024 12:53:27 +0000 (14:53 +0200)
committer	Radek Czajka <rczajka@rczajka.pl>
	Wed, 18 Sep 2024 12:53:27 +0000 (14:53 +0200)
requirements/requirements.txt		patch | blob | history
src/catalogue/fields.py		patch | blob | history
src/catalogue/management/commands/load_abstracts.py	[deleted file]	patch | blob | history
src/catalogue/management/commands/update_tag_description.py	[deleted file]	patch | blob | history
src/catalogue/migrations/0049_snippet_anchor.py	[new file with mode: 0644]	patch | blob
src/catalogue/models/book.py		patch | blob | history
src/catalogue/models/snippet.py		patch | blob | history
src/catalogue/templatetags/catalogue_tags.py		patch | blob | history
src/references/migrations/0003_occurence_anchor.py	[new file with mode: 0644]	patch | blob
src/references/migrations/0004_update_anchor.py	[new file with mode: 0644]	patch | blob
src/references/models.py		patch | blob | history
src/references/templates/references/popup.html		patch | blob | history
src/search/index.py		patch | blob | history
src/search/templates/search/results.html		patch | blob | history