Fixes #4021: Ebook files regeneration mechanism.
[wolnelektury.git] / src / catalogue / fields.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from django.conf import settings
5 from django.core.files import File
6 from django.core.files.storage import FileSystemStorage
7 from django.db import models
8 from django.db.models.fields.files import FieldFile
9 from catalogue import app_settings
10 from catalogue.constants import LANGUAGES_3TO2, EBOOK_FORMATS_WITH_CHILDREN, EBOOK_FORMATS_WITHOUT_CHILDREN
11 from catalogue.utils import remove_zip, truncate_html_words, gallery_path, gallery_url
12 from celery.task import Task, task
13 from celery.utils.log import get_task_logger
14 from waiter.utils import clear_cache
15
# Suffix appended to an etag once a build for that version has been queued
# (see EbookFieldFile.build_delay); EbookField.schedule_stale skips such rows.
ETAG_SCHEDULED_SUFFIX = '-scheduled'
# Celery task priorities: the first is used when a file does not exist yet,
# the second when an already existing file is being rebuilt.
EBOOK_BUILD_PRIORITY = 0
EBOOK_REBUILD_PRIORITY = 9

# Logger used by the build tasks defined in this module.
task_logger = get_task_logger(__name__)
21
22
class EbookFieldFile(FieldFile):
    """File object stored in an ebook field; it can build itself."""

    def build(self):
        """Run the field's builder on this file synchronously."""
        builder = self.field.builder
        return builder.build(self)

    def build_delay(self, priority=EBOOK_BUILD_PRIORITY):
        """Queue a task that builds this file and return the async result.

        Before queueing, the etag is set to the current etag followed by
        ETAG_SCHEDULED_SUFFIX, which marks the file as already scheduled.
        """
        scheduled_etag = self.field.get_current_etag() + ETAG_SCHEDULED_SUFFIX
        self.update_etag(scheduled_etag)
        task_args = [self.instance, self.field.attname]
        return self.field.builder.apply_async(task_args, priority=priority)

    def get_url(self):
        """Return the instance's media URL for this field.

        The key passed to `media_url` is the part of the attribute name
        that precedes the first underscore.
        """
        key, _, _ = self.field.attname.partition('_')
        return self.instance.media_url(key)

    def set_readable(self, readable):
        """Set file mode to 0o644 when `readable` is true, 0o600 otherwise."""
        import os
        mode = 0o600
        if readable:
            mode = 0o644
        os.chmod(self.path, mode)

    def update_etag(self, etag):
        """Store `etag` on the instance; persist it if the instance is saved.

        Only the etag column is written, and only when the instance
        already has a primary key.
        """
        etag_attr = self.field.etag_field_name
        setattr(self.instance, etag_attr, etag)
        if not self.instance.pk:
            return
        self.instance.save(update_fields=[etag_attr])
52
53
class EbookField(models.FileField):
    """File field holding one generated format of a book.

    Every field attached to a model is recorded in `registry`, which lets
    `schedule_all_stale` walk over all formats at once.
    """
    attr_class = EbookFieldFile
    # Shared by all instances on purpose: filled in contribute_to_class().
    registry = []

    def __init__(self, format_name, *args, **kwargs):
        """Create the field; `format_name` selects the builder task."""
        super().__init__(*args, **kwargs)
        self.format_name = format_name

    def deconstruct(self):
        """Return the deconstructed field with `format_name` as first arg."""
        name, path, args, kwargs = super().deconstruct()
        args.insert(0, self.format_name)
        return name, path, args, kwargs

    @property
    def builder(self):
        """Finds a celery task suitable for the format of the field."""
        return BuildEbook.for_format(self.format_name)

    def contribute_to_class(self, cls, name):
        """Attach the field to `cls` and add a `has_<attname>` method to it.

        Also remembers the name of the companion etag attribute
        (`<name>_etag`) and registers the field in `registry`.
        """
        super().contribute_to_class(cls, name)

        self.etag_field_name = f'{name}_etag'
        has_name = 'has_%s' % self.attname

        def has(model_instance):
            return bool(getattr(model_instance, self.attname, None))
        has.__doc__ = None
        has.__name__ = has_name
        # Display hints (label and boolean flag) attached to the method.
        has.short_description = self.name
        has.boolean = True

        self.registry.append(self)

        setattr(cls, has_name, has)

    def get_current_etag(self):
        """Return the installed `librarian` version, used as the etag."""
        # NOTE(review): pkg_resources is deprecated in recent setuptools;
        # importlib.metadata.version() is the stdlib replacement. Left as is,
        # because any change in the returned string marks every file stale.
        import pkg_resources
        return pkg_resources.get_distribution("librarian").version

    def schedule_stale(self, queryset=None):
        """Schedule building this format for all the books where etag is stale.

        `queryset` limits the books considered; by default all objects of
        the field's model are checked.
        """
        # If there is no etag attribute on the model, bail.
        # That's true for the xml file field.
        if not hasattr(self.model, self.etag_field_name):
            return

        etag = self.get_current_etag()
        if queryset is None:
            queryset = self.model.objects.all()

        if self.format_name in EBOOK_FORMATS_WITHOUT_CHILDREN + ['html']:
            # These formats are only built for books without children.
            queryset = queryset.filter(children=None)

        # Skip books that are up to date or already queued for this etag.
        queryset = queryset.exclude(**{
            f'{self.etag_field_name}__in': [
                etag, f'{etag}{ETAG_SCHEDULED_SUFFIX}'
            ]
        })
        for obj in queryset:
            fieldfile = getattr(obj, self.attname)
            priority = EBOOK_REBUILD_PRIORITY if fieldfile else EBOOK_BUILD_PRIORITY
            fieldfile.build_delay(priority=priority)

    @classmethod
    def schedule_all_stale(cls):
        """Schedules all stale ebooks of all formats to rebuild."""
        for field in cls.registry:
            field.schedule_stale()
122
123
124
class BuildEbook(Task):
    """Celery task that builds one file format for a book.

    Subclasses registered with `register` handle particular formats;
    any other format falls back to `BuildEbookTask`.
    """
    # Maps format name -> registered builder; shared by all subclasses.
    formats = {}

    @classmethod
    def register(cls, format_name):
        """A decorator for registering subclasses for particular formats."""
        def wrapper(builder):
            cls.formats[format_name] = builder
            return builder
        return wrapper

    @classmethod
    def for_format(cls, format_name):
        """Returns a celery task suitable for specified format."""
        return cls.formats.get(format_name, BuildEbookTask)

    @staticmethod
    def transform(wldoc, fieldfile):
        """Transforms an librarian.WLDocument into an librarian.OutputFile.

        By default, it just calls relevant wldoc.as_??? method.

        """
        return getattr(wldoc, "as_%s" % fieldfile.field.format_name)()

    def run(self, obj, field_name):
        """Just run `build` on FieldFile, can't pass it directly to Celery.

        After a build the file's etag is updated and the object's cache is
        cleared. Returns whatever `build` returned.
        """
        fieldfile = getattr(obj, field_name)

        # Get etag value before actually building the file.
        etag = fieldfile.field.get_current_etag()
        task_logger.info("%s -> %s@%s", obj.slug, field_name, etag)
        ret = self.build(fieldfile)
        fieldfile.update_etag(etag)
        obj.clear_cache()
        return ret

    def set_file_permissions(self, fieldfile):
        """Mark the file as not readable when its book is a preview."""
        if fieldfile.instance.preview:
            fieldfile.set_readable(False)

    def build(self, fieldfile):
        """Generate the file, store it in `fieldfile` and update the book row.

        If the format has an entry in app_settings.FORMAT_ZIPS, that zip is
        removed afterwards.
        """
        book = fieldfile.instance
        out = self.transform(book.wldocument(), fieldfile)
        # Close the generated file once it has been copied into storage.
        with open(out.get_filename(), 'rb') as built_file:
            fieldfile.save(None, File(built_file), save=False)
        self.set_file_permissions(fieldfile)
        if book.pk is not None:
            books = type(book).objects.filter(pk=book.pk)
            books.update(**{
                fieldfile.field.attname: fieldfile
            })
            for saved_book in books:
                saved_book.save()  # just to trigger post-save
        format_name = fieldfile.field.format_name
        if format_name in app_settings.FORMAT_ZIPS:
            remove_zip(app_settings.FORMAT_ZIPS[format_name])


# Don't decorate BuildEbook, because we want to subclass it.
BuildEbookTask = task(BuildEbook, ignore_result=True)
182
183
@BuildEbook.register('txt')
@task(ignore_result=True)
class BuildTxt(BuildEbook):
    """Builder registered for the 'txt' format."""

    @staticmethod
    def transform(wldoc, fieldfile):
        """Return the document rendered as text; `fieldfile` is not used."""
        text_output = wldoc.as_text()
        return text_output
190
191
@BuildEbook.register('pdf')
@task(ignore_result=True)
class BuildPdf(BuildEbook):
    """Builder registered for the 'pdf' format."""

    @staticmethod
    def transform(wldoc, fieldfile):
        """Return the document rendered as PDF, with cover and 'notoc'."""
        pdf_options = {
            'morefloats': settings.LIBRARIAN_PDF_MOREFLOATS,
            'cover': True,
            'ilustr_path': gallery_path(wldoc.book_info.url.slug),
            'customizations': ['notoc'],
        }
        return wldoc.as_pdf(**pdf_options)

    def build(self, fieldfile):
        """Build like the base class, then call clear_cache for the slug."""
        # The base class is named explicitly; this class name is rebound
        # by the decorators above.
        BuildEbook.build(self, fieldfile)
        slug = fieldfile.instance.slug
        clear_cache(slug)
204
205
@BuildEbook.register('epub')
@task(ignore_result=True)
class BuildEpub(BuildEbook):
    """Builder registered for the 'epub' format."""

    @staticmethod
    def transform(wldoc, fieldfile):
        """Return the document rendered as EPUB with a cover."""
        illustrations = gallery_path(wldoc.book_info.url.slug)
        return wldoc.as_epub(cover=True, ilustr_path=illustrations)
212
213
@BuildEbook.register('mobi')
@task(ignore_result=True)
class BuildMobi(BuildEbook):
    """Builder registered for the 'mobi' format."""

    @staticmethod
    def transform(wldoc, fieldfile):
        """Return the document rendered as MOBI with a cover."""
        illustrations = gallery_path(wldoc.book_info.url.slug)
        return wldoc.as_mobi(cover=True, ilustr_path=illustrations)
220
221
@BuildEbook.register('html')
@task(ignore_result=True)
class BuildHtml(BuildEbook):
    """Builder for the 'html' format; it also recreates book fragments."""

    def build(self, fieldfile):
        """Build the HTML file and the themed fragments extracted from it.

        Existing fragments of the book are always deleted first. Returns
        True when HTML was produced, False when the transform gave nothing.
        """
        from django.core.files.base import ContentFile
        from slugify import slugify
        from sortify import sortify
        from librarian import html
        from catalogue.models import Fragment, Tag

        book = fieldfile.instance

        html_output = self.transform(book.wldocument(parse_dublincore=False), fieldfile)

        # Delete old fragments, create from scratch if necessary.
        book.fragments.all().delete()

        if not html_output:
            return False

        meta_tags = list(book.tags.filter(
            category__in=('author', 'epoch', 'genre', 'kind')))

        # Map the book language to a code from settings.LANGUAGES, or None.
        book_language = book.language
        lang = LANGUAGES_3TO2.get(book_language, book_language)
        site_languages = [entry[0] for entry in settings.LANGUAGES]
        if lang not in site_languages:
            lang = None

        def find_themes(names):
            """Return theme tags for `names`, creating them when allowed."""
            found = []
            for theme_name in names:
                if not theme_name:
                    continue
                if lang == settings.LANGUAGE_CODE:
                    # Allow creating themes if book in default language.
                    tag, created = Tag.objects.get_or_create(
                        slug=slugify(theme_name),
                        category='theme'
                    )
                    if created:
                        tag.name = theme_name
                        setattr(tag, "name_%s" % lang, theme_name)
                        tag.sort_key = sortify(theme_name.lower())
                        tag.for_books = True
                        tag.save()
                    found.append(tag)
                elif lang is not None:
                    # Don't create unknown themes in non-default languages.
                    try:
                        tag = Tag.objects.get(
                            category='theme',
                            **{"name_%s" % lang: theme_name}
                        )
                    except Tag.DoesNotExist:
                        pass
                    else:
                        found.append(tag)
            return found

        fieldfile.save(None, ContentFile(html_output.get_bytes()), save=False)
        self.set_file_permissions(fieldfile)
        type(book).objects.filter(pk=book.pk).update(**{
            fieldfile.field.attname: fieldfile
        })

        # Extract fragments
        closed_fragments, open_fragments = html.extract_fragments(fieldfile.path)
        for fragment in closed_fragments.values():
            try:
                theme_names = [part.strip() for part in fragment.themes.split(',')]
            except AttributeError:
                # No usable themes attribute on this fragment: skip it.
                continue

            themes = find_themes(theme_names)
            if not themes:
                continue

            text = fragment.to_string()
            truncated = truncate_html_words(text, 15)
            # The short text is left empty when truncation changed nothing.
            short_text = '' if truncated == text else truncated
            new_fragment = Fragment.objects.create(
                anchor=fragment.id,
                book=book,
                text=text,
                short_text=short_text
            )

            new_fragment.save()
            new_fragment.tags = set(meta_tags + themes)
            for theme in themes:
                if theme.for_books:
                    continue
                theme.for_books = True
                theme.save()

        book.html_built.send(sender=type(self), instance=book)
        return True

    @staticmethod
    def transform(wldoc, fieldfile):
        """Return the document rendered as HTML with the gallery option set.

        The gallery URL comes from the slug found in the document's
        `identifier.url` element; it is empty when there is no such element.
        """
        # ugly, but we can't use wldoc.book_info here
        from librarian import DCNS
        url_elem = wldoc.edoc.getroot().find('.//' + DCNS('identifier.url'))
        if url_elem is not None:
            gallery = gallery_url(slug=url_elem.text.rsplit('/', 1)[1])
        else:
            gallery = ''
        return wldoc.as_html(options={'gallery': "'%s'" % gallery})
323
324
class BuildCover(BuildEbook):
    """Common base of the cover builders; leaves file permissions alone."""

    def set_file_permissions(self, fieldfile):
        """Do nothing, so the cover file's mode is never changed here."""
        return None
328
329
@BuildEbook.register('cover_thumb')
@task(ignore_result=True)
class BuildCoverThumb(BuildCover):
    """Builder registered for the 'cover_thumb' format."""

    @classmethod
    def transform(cls, wldoc, fieldfile):
        """Return the output file of a WLCover with height 193."""
        from librarian.cover import WLCover
        cover = WLCover(wldoc.book_info, height=193)
        return cover.output_file()
337
338
@BuildEbook.register('cover_api_thumb')
@task(ignore_result=True)
class BuildCoverApiThumb(BuildCover):
    """Builder registered for the 'cover_api_thumb' format."""

    @classmethod
    def transform(cls, wldoc, fieldfile):
        """Return the output file of a WLNoBoxCover with height 500."""
        from librarian.cover import WLNoBoxCover
        cover = WLNoBoxCover(wldoc.book_info, height=500)
        return cover.output_file()
346
347
@BuildEbook.register('simple_cover')
@task(ignore_result=True)
class BuildSimpleCover(BuildCover):
    """Builder registered for the 'simple_cover' format."""

    @classmethod
    def transform(cls, wldoc, fieldfile):
        """Return the output file of a WLNoBoxCover with height 1000."""
        from librarian.cover import WLNoBoxCover
        cover = WLNoBoxCover(wldoc.book_info, height=1000)
        return cover.output_file()
355
356
@BuildEbook.register('cover_ebookpoint')
@task(ignore_result=True)
class BuildCoverEbookpoint(BuildCover):
    """Builder registered for the 'cover_ebookpoint' format."""

    @classmethod
    def transform(cls, wldoc, fieldfile):
        """Return the output file of an EbookpointCover for the book."""
        from librarian.cover import EbookpointCover
        cover = EbookpointCover(wldoc.book_info)
        return cover.output_file()
364
365
# not used, but needed for migrations
class OverwritingFieldFile(FieldFile):
    """Field file that deletes the old file before saving the new one."""

    def save(self, name, content, *args, **kwargs):
        """Save `content`, deleting the currently stored file first.

        Pass `leave=True` to keep the old file. The old file is also kept
        when `content` has a `path` equal to the current file's path.
        """
        keep_existing = kwargs.pop('leave', None)
        # delete if there's a file already and there's a new one coming
        if not keep_existing and self:
            same_file = hasattr(content, 'path') and content.path == self.path
            if not same_file:
                self.delete(save=False)
        return super().save(name, content, *args, **kwargs)
378
379
class OverwritingFileField(models.FileField):
    """File field whose file objects are OverwritingFieldFile instances."""

    attr_class = OverwritingFieldFile
382
383
class OverwriteStorage(FileSystemStorage):
    """File system storage that reuses the requested name as given."""

    def get_available_name(self, name, max_length=None):
        """Delete what is stored under `name`, then return `name` unchanged.

        `max_length` is accepted but not used.
        """
        self.delete(name)
        return name