From: Radek Czajka Date: Wed, 18 Sep 2024 12:53:27 +0000 (+0200) Subject: Update to new librarian api for html, txt. X-Git-Url: https://git.mdrn.pl/wolnelektury.git/commitdiff_plain/4faaa8414a1038804df9999a094c8dfe4c5f513f?ds=sidebyside Update to new librarian api for html, txt. --- diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 8c508540a..54b199b6f 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -39,7 +39,7 @@ mutagen==1.45.1 sorl-thumbnail==12.8.0 # home-brewed & dependencies -librarian==24.5 +librarian==24.9 # celery tasks celery[redis]==5.2.7 diff --git a/src/catalogue/fields.py b/src/catalogue/fields.py index 9c5696fa6..ebe5cf478 100644 --- a/src/catalogue/fields.py +++ b/src/catalogue/fields.py @@ -229,10 +229,12 @@ class XmlField(EbookField): class TxtField(EbookField): ext = 'txt' for_parents = False + librarian2_api = True @staticmethod def transform(wldoc, book): - return wldoc.as_text() + from librarian.builders.txt import TxtBuilder + return TxtBuilder().build(wldoc) class Fb2Field(EbookField): @@ -299,6 +301,7 @@ class MobiField(EbookField): class HtmlField(EbookField): ext = 'html' for_parents = False + librarian2_api = True def build(self, fieldfile): from django.core.files.base import ContentFile @@ -309,7 +312,7 @@ class HtmlField(EbookField): book = fieldfile.instance - html_output = self.transform(book.wldocument(parse_dublincore=False), book) + html_output = self.transform(book.wldocument2(), book) # Delete old fragments, create from scratch if necessary. book.fragments.all().delete() @@ -385,17 +388,15 @@ class HtmlField(EbookField): @staticmethod def transform(wldoc, book): - # ugly, but we can't use wldoc.book_info here - from librarian import DCNS - url_elem = wldoc.edoc.getroot().find('.//' + DCNS('identifier.url')) - if url_elem is None: + from librarian.builders.html import HtmlBuilder + url = wldoc.meta.url + if not url: gal_url = '' gal_path = '' else: - slug = url_elem.text.rstrip('/').rsplit('/', 1)[1] - gal_url = gallery_url(slug=slug) - gal_path = gallery_path(slug=slug) - return wldoc.as_html(gallery_path=gal_path, gallery_url=gal_url, base_url=absolute_url(gal_url)) + gal_url = gallery_url(slug=url.slug) + gal_path = gallery_path(slug=url.slug) + return HtmlBuilder(gallery_path=gal_path, gallery_url=gal_url, base_url=absolute_url(gal_url)).build(wldoc) class CoverField(EbookField): diff --git a/src/catalogue/management/commands/load_abstracts.py b/src/catalogue/management/commands/load_abstracts.py deleted file mode 100644 index f9fb2c155..000000000 --- a/src/catalogue/management/commands/load_abstracts.py +++ /dev/null @@ -1,14 +0,0 @@ -# This file is part of Wolne Lektury, licensed under GNU Affero GPLv3 or later. -# Copyright © Fundacja Wolne Lektury. See NOTICE for more information. -# -from django.core.management.base import BaseCommand - -from catalogue.models import Book - - -class Command(BaseCommand): - def handle(self, *args, **options): - for b in Book.objects.order_by('slug'): - print(b.slug) - b.load_abstract() - b.save() diff --git a/src/catalogue/management/commands/update_tag_description.py b/src/catalogue/management/commands/update_tag_description.py deleted file mode 100644 index 0fb2ffd95..000000000 --- a/src/catalogue/management/commands/update_tag_description.py +++ /dev/null @@ -1,20 +0,0 @@ -# This file is part of Wolne Lektury, licensed under GNU Affero GPLv3 or later. -# Copyright © Fundacja Wolne Lektury. See NOTICE for more information. -# -from django.core.management import BaseCommand -from catalogue.models import Tag - - -class Command(BaseCommand): - help = "Update description for given tag." - - def add_arguments(self, parser): - parser.add_argument('category') - parser.add_argument('slug') - parser.add_argument('description_filename') - - def handle(self, category, slug, description_filename, **options): - tag = Tag.objects.get(category=category, slug=slug) - description = open(description_filename).read().decode('utf-8') - tag.description = description - tag.save() diff --git a/src/catalogue/migrations/0049_snippet_anchor.py b/src/catalogue/migrations/0049_snippet_anchor.py new file mode 100644 index 000000000..c3e11d121 --- /dev/null +++ b/src/catalogue/migrations/0049_snippet_anchor.py @@ -0,0 +1,19 @@ +# Generated by Django 4.0.8 on 2024-09-17 14:14 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('catalogue', '0048_remove_collection_kind_remove_tag_for_books_and_more'), + ] + + operations = [ + migrations.AddField( + model_name='snippet', + name='anchor', + field=models.CharField(default='', max_length=64), + preserve_default=False, + ), + ] diff --git a/src/catalogue/models/book.py b/src/catalogue/models/book.py index 29e375464..7d3666277 100644 --- a/src/catalogue/models/book.py +++ b/src/catalogue/models/book.py @@ -19,7 +19,7 @@ from django.utils.translation import gettext_lazy as _, get_language from fnpdjango.storage import BofhFileSystemStorage from lxml import html from librarian.cover import WLCover -from librarian.html import transform_abstrakt +from librarian.builders.html import AbstraktHtmlBuilder from librarian.builders import builders from newtagging import managers from catalogue import constants @@ -327,7 +327,10 @@ class Book(models.Model): return int(total) def get_time(self): - return round(self.xml_file.size / 1000 * 40) + try: + return round(self.xml_file.size / 1000 * 40) + except ValueError: + return 0 def has_media(self, type_): if type_ in Book.formats: @@ -554,11 +557,8 @@ class Book(models.Model): urlretrieve('%s/%s' % (remote_gallery_url, ilustr_src), ilustr_path) def load_abstract(self): - abstract = self.wldocument(parse_dublincore=False).edoc.getroot().find('.//abstrakt') - if abstract is not None: - self.abstract = transform_abstrakt(abstract) - else: - self.abstract = '' + self.abstract = AbstraktHtmlBuilder().build( + self.wldocument2()).get_bytes().decode('utf-8') def load_toc(self): self.toc = '' @@ -717,13 +717,14 @@ class Book(models.Model): cls.published.send(sender=cls, instance=book) return book + # TODO TEST def update_references(self): Entity = apps.get_model('references', 'Entity') doc = self.wldocument2() - doc._compat_assign_section_ids() - doc._compat_assign_ordered_ids() + doc.assign_ids() + refs = {} - for ref_elem in doc.references(): + for i, ref_elem in enumerate(doc.references()): uri = ref_elem.attrib.get('href', '') if not uri: continue @@ -742,10 +743,8 @@ class Book(models.Model): refs[uri] = ref if not ref_created: ref.occurence_set.all().delete() - sec = ref_elem.get_link() - m = re.match(r'sec(\d+)', sec) - assert m is not None - sec = int(m.group(1)) + anchor = ref_elem.get_link() + snippet = ref_elem.get_snippet() b = builders['html-snippet']() for s in snippet: @@ -753,7 +752,8 @@ class Book(models.Model): html = b.output().get_bytes().decode('utf-8') ref.occurence_set.create( - section=sec, + section=i, + anchor=anchor, html=html ) self.reference_set.exclude(entity__uri__in=refs).delete() diff --git a/src/catalogue/models/snippet.py b/src/catalogue/models/snippet.py index 4c25b8c97..ec7e2f9d1 100644 --- a/src/catalogue/models/snippet.py +++ b/src/catalogue/models/snippet.py @@ -7,6 +7,7 @@ from search.utils import UnaccentSearchVector class Snippet(models.Model): book = models.ForeignKey('Book', models.CASCADE) sec = models.IntegerField() + anchor = models.CharField(max_length=64) text = models.TextField() search_vector = SearchVectorField() diff --git a/src/catalogue/templatetags/catalogue_tags.py b/src/catalogue/templatetags/catalogue_tags.py index d2298d9f5..d70ad970e 100644 --- a/src/catalogue/templatetags/catalogue_tags.py +++ b/src/catalogue/templatetags/catalogue_tags.py @@ -307,6 +307,8 @@ def plain_list(context, object_list, with_initials=True, by_author=False, choice @register.simple_tag def related_books_2022(book=None, limit=4, taken=0): limit -= taken + if limit < 0: + return [] max_books = limit books_qs = Book.objects.filter(findable=True) diff --git a/src/references/migrations/0003_occurence_anchor.py b/src/references/migrations/0003_occurence_anchor.py new file mode 100644 index 000000000..05e60a4a3 --- /dev/null +++ b/src/references/migrations/0003_occurence_anchor.py @@ -0,0 +1,19 @@ +# Generated by Django 4.0.8 on 2024-09-18 11:29 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('references', '0002_remove_reference_first_section_occurence'), + ] + + operations = [ + migrations.AddField( + model_name='occurence', + name='anchor', + field=models.CharField(default='', max_length=64), + preserve_default=False, + ), + ] diff --git a/src/references/migrations/0004_update_anchor.py b/src/references/migrations/0004_update_anchor.py new file mode 100644 index 000000000..cd971857f --- /dev/null +++ b/src/references/migrations/0004_update_anchor.py @@ -0,0 +1,22 @@ +# Generated by Django 4.0.8 on 2024-09-18 11:29 + +from django.db import migrations, models +from django.db.models.functions import Concat + + +def update_anchor(apps, schema_editor): + Occurence = apps.get_model('references', 'Occurence') + Occurence.objects.filter(anchor='').update( + anchor=Concat(models.Value('sec'), 'section') + ) + + +class Migration(migrations.Migration): + + dependencies = [ + ('references', '0003_occurence_anchor'), + ] + + operations = [ + migrations.RunPython(update_anchor, migrations.RunPython.noop) + ] diff --git a/src/references/models.py b/src/references/models.py index d4733ffcf..f83518dcd 100644 --- a/src/references/models.py +++ b/src/references/models.py @@ -84,6 +84,7 @@ class Reference(models.Model): class Occurence(models.Model): reference = models.ForeignKey(Reference, models.CASCADE) section = models.IntegerField() + anchor = models.CharField(max_length=64) html = models.TextField() class Meta: diff --git a/src/references/templates/references/popup.html b/src/references/templates/references/popup.html index f8977e6d8..3655309e3 100644 --- a/src/references/templates/references/popup.html +++ b/src/references/templates/references/popup.html @@ -28,7 +28,7 @@ {% for occ in ref.occurence_set.all %} - + {{ occ.html|safe }} {% endfor %} diff --git a/src/search/index.py b/src/search/index.py index e0a727ca2..3b0edebf8 100644 --- a/src/search/index.py +++ b/src/search/index.py @@ -2,7 +2,8 @@ # Copyright © Fundacja Wolne Lektury. See NOTICE for more information. # import re -from librarian.parser import WLDocument +from librarian.elements.base import WLElement +from librarian.document import WLDocument from lxml import etree @@ -31,22 +32,15 @@ class Index: skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF'] - @classmethod - def get_master(cls, root): - """ - Returns the first master tag from an etree. - """ - for master in root.iter(): - if master.tag in cls.master_tags: - return master - @staticmethod - def add_snippet(book, text, position): + def add_snippet(book, text, position, anchor): book.snippet_set.create( sec=position + 1, - text=text + text=text, + anchor=anchor ) + # TODO: The section links stuff won't work. @classmethod def index_book(cls, book): """ @@ -57,13 +51,23 @@ class Index: book.snippet_set.all().delete() - wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False) - root = wld.edoc.getroot() + wld = WLDocument(filename=book.xml_file.path) + wld.assign_ids() - master = cls.get_master(root) + master = wld.tree.getroot().master if master is None: return [] + def get_indexable(element): + for child in element: + if not isinstance(child, WLElement): + continue + if not child.attrib.get('_id'): + for e in get_indexable(child): + yield e + else: + yield child + def walker(node): if node.tag not in cls.ignore_content_tags: yield node, None, None @@ -85,12 +89,14 @@ class Index: return re.sub("(?m)/$", "", text) - for position, header in enumerate(master): + for position, header in enumerate(get_indexable(master)): if header.tag in cls.skip_header_tags: continue if header.tag is etree.Comment: continue + el_id = header.attrib['_id'] + # section content content = [] footnote = [] @@ -110,7 +116,7 @@ class Index: handle_text.append(collect_footnote) elif end is not None and footnote is not [] and end.tag in cls.footnote_tags: handle_text.pop() - cls.add_snippet(book, ''.join(footnote), position) + cls.add_snippet(book, ''.join(footnote), position, el_id) footnote = [] if text is not None and handle_text is not []: @@ -118,4 +124,4 @@ class Index: hdl(text) # in the end, add a section text. - cls.add_snippet(book, fix_format(content), position) + cls.add_snippet(book, fix_format(content), position, el_id) diff --git a/src/search/templates/search/results.html b/src/search/templates/search/results.html index c71cf45c8..2902f2601 100644 --- a/src/search/templates/search/results.html +++ b/src/search/templates/search/results.html @@ -126,7 +126,7 @@ {{ book.title }} {% for f in snippets %} - + {{ f.headline|safe }} {% endfor %}