From 2c88cd11935d3011fadcc1667ac886652ba189ca Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Tue, 14 Apr 2020 16:08:02 +0200 Subject: [PATCH 1/1] Fixes #4021: Ebook files regeneration mechanism. --- src/catalogue/fields.py | 69 +++++++++++++++++-- .../commands/schedule_stale_ebooks.py | 14 ++++ .../migrations/0029_auto_20200414_1516.py | 68 ++++++++++++++++++ src/catalogue/models/book.py | 7 ++ 4 files changed, 154 insertions(+), 4 deletions(-) create mode 100644 src/catalogue/management/commands/schedule_stale_ebooks.py create mode 100644 src/catalogue/migrations/0029_auto_20200414_1516.py diff --git a/src/catalogue/fields.py b/src/catalogue/fields.py index 80d016c58..d88787e63 100644 --- a/src/catalogue/fields.py +++ b/src/catalogue/fields.py @@ -7,7 +7,7 @@ from django.core.files.storage import FileSystemStorage from django.db import models from django.db.models.fields.files import FieldFile from catalogue import app_settings -from catalogue.constants import LANGUAGES_3TO2 +from catalogue.constants import LANGUAGES_3TO2, EBOOK_FORMATS_WITH_CHILDREN, EBOOK_FORMATS_WITHOUT_CHILDREN from catalogue.utils import remove_zip, truncate_html_words, gallery_path, gallery_url from celery.task import Task, task from celery.utils.log import get_task_logger @@ -15,6 +15,10 @@ from waiter.utils import clear_cache task_logger = get_task_logger(__name__) +ETAG_SCHEDULED_SUFFIX = '-scheduled' +EBOOK_BUILD_PRIORITY = 0 +EBOOK_REBUILD_PRIORITY = 9 + class EbookFieldFile(FieldFile): """Represents contents of an ebook file field.""" @@ -23,9 +27,15 @@ class EbookFieldFile(FieldFile): """Build the ebook immediately.""" return self.field.builder.build(self) - def build_delay(self): + def build_delay(self, priority=EBOOK_BUILD_PRIORITY): """Builds the ebook in a delayed task.""" - return self.field.builder.delay(self.instance, self.field.attname) + self.update_etag( + "".join([self.field.get_current_etag(), ETAG_SCHEDULED_SUFFIX]) + ) + return self.field.builder.apply_async( + 
[self.instance, self.field.attname], + priority=priority + ) def get_url(self): return self.instance.media_url(self.field.attname.split('_')[0]) @@ -35,10 +45,16 @@ class EbookFieldFile(FieldFile): permissions = 0o644 if readable else 0o600 os.chmod(self.path, permissions) + def update_etag(self, etag): + setattr(self.instance, self.field.etag_field_name, etag) + if self.instance.pk: + self.instance.save(update_fields=[self.field.etag_field_name]) + class EbookField(models.FileField): """Represents an ebook file field, attachable to a model.""" attr_class = EbookFieldFile + registry = [] def __init__(self, format_name, *args, **kwargs): super(EbookField, self).__init__(*args, **kwargs) @@ -57,14 +73,54 @@ class EbookField(models.FileField): def contribute_to_class(self, cls, name): super(EbookField, self).contribute_to_class(cls, name) + self.etag_field_name = f'{name}_etag' + def has(model_instance): return bool(getattr(model_instance, self.attname, None)) has.__doc__ = None has.__name__ = str("has_%s" % self.attname) has.short_description = self.name has.boolean = True + + self.registry.append(self) + setattr(cls, 'has_%s' % self.attname, has) + def get_current_etag(self): + import pkg_resources + librarian_version = pkg_resources.get_distribution("librarian").version + return librarian_version + + def schedule_stale(self, queryset=None): + """Schedule building this format for all the books where etag is stale.""" + # If there is no ETag field, bail. That's true for the xml file field. 
+ if not hasattr(self.model, f'{self.attname}_etag'): + return + + etag = self.get_current_etag() + if queryset is None: + queryset = self.model.objects.all() + + if self.format_name in EBOOK_FORMATS_WITHOUT_CHILDREN + ['html']: + queryset = queryset.filter(children=None) + + queryset = queryset.exclude(**{ + f'{self.etag_field_name}__in': [ + etag, f'{etag}{ETAG_SCHEDULED_SUFFIX}' + ] + }) + for obj in queryset: + fieldfile = getattr(obj, self.attname) + priority = EBOOK_REBUILD_PRIORITY if fieldfile else EBOOK_BUILD_PRIORITY + fieldfile.build_delay(priority=priority) + + @classmethod + def schedule_all_stale(cls): + """Schedules all stale ebooks of all formats to rebuild.""" + for field in cls.registry: + field.schedule_stale() + + class BuildEbook(Task): formats = {} @@ -93,8 +149,13 @@ class BuildEbook(Task): def run(self, obj, field_name): """Just run `build` on FieldFile, can't pass it directly to Celery.""" - task_logger.info("%s -> %s" % (obj.slug, field_name)) + fieldfile = getattr(obj, field_name) + + # Get etag value before actually building the file. + etag = fieldfile.field.get_current_etag() + task_logger.info("%s -> %s@%s" % (obj.slug, field_name, etag)) ret = self.build(getattr(obj, field_name)) + fieldfile.update_etag(etag) obj.clear_cache() return ret diff --git a/src/catalogue/management/commands/schedule_stale_ebooks.py b/src/catalogue/management/commands/schedule_stale_ebooks.py new file mode 100644 index 000000000..91836fbf3 --- /dev/null +++ b/src/catalogue/management/commands/schedule_stale_ebooks.py @@ -0,0 +1,14 @@ +# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# +from django.conf import settings +from django.core.management.base import BaseCommand + +from catalogue.fields import EbookField + + +class Command(BaseCommand): + help = 'Schedule regenerating stale ebook files.' 
+ + def handle(self, **options): + EbookField.schedule_all_stale() diff --git a/src/catalogue/migrations/0029_auto_20200414_1516.py b/src/catalogue/migrations/0029_auto_20200414_1516.py new file mode 100644 index 000000000..b77b97bfd --- /dev/null +++ b/src/catalogue/migrations/0029_auto_20200414_1516.py @@ -0,0 +1,68 @@ +# Generated by Django 2.2.10 on 2020-04-14 13:16 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('catalogue', '0028_book_cover_ebookpoint'), + ] + + operations = [ + migrations.AddField( + model_name='book', + name='cover_api_thumb_etag', + field=models.CharField(db_index=True, default='', editable=False, max_length=255), + ), + migrations.AddField( + model_name='book', + name='cover_ebookpoint_etag', + field=models.CharField(db_index=True, default='', editable=False, max_length=255), + ), + migrations.AddField( + model_name='book', + name='cover_etag', + field=models.CharField(db_index=True, default='', editable=False, max_length=255), + ), + migrations.AddField( + model_name='book', + name='cover_thumb_etag', + field=models.CharField(db_index=True, default='', editable=False, max_length=255), + ), + migrations.AddField( + model_name='book', + name='epub_file_etag', + field=models.CharField(db_index=True, default='', editable=False, max_length=255), + ), + migrations.AddField( + model_name='book', + name='fb2_file_etag', + field=models.CharField(db_index=True, default='', editable=False, max_length=255), + ), + migrations.AddField( + model_name='book', + name='html_file_etag', + field=models.CharField(db_index=True, default='', editable=False, max_length=255), + ), + migrations.AddField( + model_name='book', + name='mobi_file_etag', + field=models.CharField(db_index=True, default='', editable=False, max_length=255), + ), + migrations.AddField( + model_name='book', + name='pdf_file_etag', + field=models.CharField(db_index=True, default='', editable=False, max_length=255), + ), + 
migrations.AddField( + model_name='book', + name='simple_cover_etag', + field=models.CharField(db_index=True, default='', editable=False, max_length=255), + ), + migrations.AddField( + model_name='book', + name='txt_file_etag', + field=models.CharField(db_index=True, default='', editable=False, max_length=255), + ), + ] diff --git a/src/catalogue/models/book.py b/src/catalogue/models/book.py index 9535e9024..371bb3e2d 100644 --- a/src/catalogue/models/book.py +++ b/src/catalogue/models/book.py @@ -83,27 +83,32 @@ class Book(models.Model): null=True, blank=True, upload_to=_cover_upload_to, storage=bofh_storage, max_length=255) + cover_etag = models.CharField(max_length=255, editable=False, default='', db_index=True) # Cleaner version of cover for thumbs cover_thumb = EbookField( 'cover_thumb', _('cover thumbnail'), null=True, blank=True, upload_to=_cover_thumb_upload_to, max_length=255) + cover_thumb_etag = models.CharField(max_length=255, editable=False, default='', db_index=True) cover_api_thumb = EbookField( 'cover_api_thumb', _('cover thumbnail for mobile app'), null=True, blank=True, upload_to=_cover_api_thumb_upload_to, max_length=255) + cover_api_thumb_etag = models.CharField(max_length=255, editable=False, default='', db_index=True) simple_cover = EbookField( 'simple_cover', _('cover for mobile app'), null=True, blank=True, upload_to=_simple_cover_upload_to, max_length=255) + simple_cover_etag = models.CharField(max_length=255, editable=False, default='', db_index=True) cover_ebookpoint = EbookField( 'cover_ebookpoint', _('cover for Ebookpoint'), null=True, blank=True, upload_to=_cover_ebookpoint_upload_to, max_length=255) + cover_ebookpoint_etag = models.CharField(max_length=255, editable=False, default='', db_index=True) ebook_formats = constants.EBOOK_FORMATS formats = ebook_formats + ['html', 'xml'] @@ -856,6 +861,8 @@ def add_file_fields(): blank=True, default='' ).contribute_to_class(Book, field_name) + if format_ != 'xml': + 
models.CharField(max_length=255, editable=False, default='', db_index=True).contribute_to_class(Book, f'{field_name}_etag') add_file_fields() -- 2.20.1