celery fixes
[wolnelektury.git] / src / catalogue / fields.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 import os
5 from django.conf import settings
6 from django.core.files import File
7 from django.core.files.storage import FileSystemStorage
8 from django.db import models
9 from django.db.models.fields.files import FieldFile
10 from catalogue import app_settings
11 from catalogue.constants import LANGUAGES_3TO2, EBOOK_FORMATS_WITH_CHILDREN, EBOOK_FORMATS_WITHOUT_CHILDREN
12 from catalogue.utils import absolute_url, remove_zip, truncate_html_words, gallery_path, gallery_url
13 #from celery import Task, shared_task
14 from celery.task import Task, task
15 from celery.utils.log import get_task_logger
16 from waiter.utils import clear_cache
17
# Celery task logger used by the build tasks defined in this module.
task_logger = get_task_logger(__name__)

# Appended to the current ETag when a build is queued (see
# EbookFieldFile.build_delay); EbookField.schedule_stale skips objects whose
# ETag already carries this suffix for the current version.
ETAG_SCHEDULED_SUFFIX = '-scheduled'
# Priority passed to apply_async when the field has no file yet
# (also the default of EbookFieldFile.build_delay).
EBOOK_BUILD_PRIORITY = 0
# Priority passed to apply_async when an already existing file is rebuilt.
EBOOK_REBUILD_PRIORITY = 9
23
24
class EbookFieldFile(FieldFile):
    """File object stored by an ``EbookField`` on a model instance."""

    def build(self):
        """Run the field's builder on this file right away and return its result."""
        builder = self.field.builder
        return builder.build(self)

    def build_delay(self, priority=EBOOK_BUILD_PRIORITY):
        """Queue a build of this file as an asynchronous task.

        The instance's ETag is first set to the current ETag plus
        ``ETAG_SCHEDULED_SUFFIX``; the builder task is then submitted with
        the owning instance and the field's attribute name as arguments.
        Returns whatever the builder's ``apply_async`` returns.
        """
        scheduled_etag = self.field.get_current_etag() + ETAG_SCHEDULED_SUFFIX
        self.update_etag(scheduled_etag)
        task_args = [self.instance, self.field.attname]
        return self.field.builder.apply_async(task_args, priority=priority)

    def get_url(self):
        """Return the instance's media URL for this field.

        The media type handed to ``media_url`` is the part of the attribute
        name that precedes the first underscore.
        """
        media_type, _, _ = self.field.attname.partition('_')
        return self.instance.media_url(media_type)

    def set_readable(self, readable):
        """Set the file mode to 0o644 when ``readable`` is true, else 0o600."""
        if readable:
            mode = 0o644
        else:
            mode = 0o600
        os.chmod(self.path, mode)

    def update_etag(self, etag):
        """Store ``etag`` on the instance; persist it if the instance has a pk."""
        instance = self.instance
        etag_attr = self.field.etag_field_name
        setattr(instance, etag_attr, etag)
        if instance.pk:
            # Only the ETag column is written.
            instance.save(update_fields=[etag_attr])
55
class EbookField(models.FileField):
    """Model file field that holds one ebook format and knows how to build it."""
    attr_class = EbookFieldFile
    # Every EbookField attached to a model class is appended here
    # (see contribute_to_class); the list is shared by all instances.
    registry = []

    def __init__(self, format_name, *args, **kwargs):
        """Create the field; ``format_name`` selects the builder task."""
        super().__init__(*args, **kwargs)
        self.format_name = format_name

    def deconstruct(self):
        """Return the deconstructed field with ``format_name`` as first positional arg."""
        name, path, args, kwargs = super().deconstruct()
        return name, path, [self.format_name, *args], kwargs

    @property
    def builder(self):
        """The celery task registered for this field's format."""
        return BuildEbook.for_format(self.format_name)

    def contribute_to_class(self, cls, name):
        """Attach the field to ``cls`` and add a ``has_<attname>`` method to it.

        Also records the name of the companion ETag attribute
        (``<name>_etag``) and registers the field in ``registry``.
        """
        super().contribute_to_class(cls, name)

        self.etag_field_name = f'{name}_etag'
        method_name = 'has_%s' % self.attname

        def has(model_instance):
            return bool(getattr(model_instance, self.attname, None))
        # Attributes set on the generated method.
        has.__doc__ = None
        has.__name__ = method_name
        has.short_description = self.name
        has.boolean = True

        self.registry.append(self)

        setattr(cls, method_name, has)

    def get_current_etag(self):
        """Return the installed ``librarian`` version, used as the ETag value."""
        import pkg_resources
        return pkg_resources.get_distribution("librarian").version

    def schedule_stale(self, queryset=None):
        """Queue a build of this format for every object whose ETag is stale.

        ``queryset`` narrows the set of objects considered; by default all
        objects of the field's model are checked.
        """
        # Models without the ETag attribute (that's true for the xml file
        # field) are skipped entirely.
        if not hasattr(self.model, f'{self.attname}_etag'):
            return

        current_etag = self.get_current_etag()
        candidates = self.model.objects.all() if queryset is None else queryset

        if self.format_name in EBOOK_FORMATS_WITHOUT_CHILDREN + ['html']:
            candidates = candidates.filter(children=None)

        # Objects already at the current ETag, or already scheduled for it,
        # are left alone.
        fresh_etags = [current_etag, f'{current_etag}{ETAG_SCHEDULED_SUFFIX}']
        candidates = candidates.exclude(
            **{f'{self.etag_field_name}__in': fresh_etags}
        )
        for obj in candidates:
            fieldfile = getattr(obj, self.attname)
            if fieldfile:
                priority = EBOOK_REBUILD_PRIORITY
            else:
                priority = EBOOK_BUILD_PRIORITY
            fieldfile.build_delay(priority=priority)

    @classmethod
    def schedule_all_stale(cls):
        """Run ``schedule_stale`` for every registered ebook field."""
        for registered_field in cls.registry:
            registered_field.schedule_stale()
125
126
class BuildEbook(Task):
    """Base celery task that builds one ebook format for a model instance.

    Subclasses are registered per format with ``BuildEbook.register`` and
    usually only override ``transform``.
    """
    # When true, build() passes book.wldocument2() to transform() instead of
    # book.wldocument().
    librarian2_api = False

    # Maps a format name to the builder registered for it.
    formats = {}

    @classmethod
    def register(cls, format_name):
        """A decorator for registering subclasses for particular formats."""
        def wrapper(builder):
            cls.formats[format_name] = builder
            return builder
        return wrapper

    @classmethod
    def for_format(cls, format_name):
        """Returns a celery task suitable for specified format.

        Falls back to ``BuildEbookTask`` when nothing is registered for
        ``format_name``.
        """
        return cls.formats.get(format_name, BuildEbookTask)

    @staticmethod
    def transform(wldoc, fieldfile):
        """Transforms an librarian.WLDocument into an librarian.OutputFile.

        By default, it just calls relevant wldoc.as_??? method.

        """
        return getattr(wldoc, "as_%s" % fieldfile.field.format_name)()

    def run(self, obj, field_name):
        """Build the file stored in ``field_name`` of ``obj``.

        The FieldFile can't be passed to Celery directly, so the task gets
        the object and the field name instead. After the build the ETag is
        stored on the object and the object's cache is cleared.
        Returns the result of ``build``.
        """
        fieldfile = getattr(obj, field_name)

        # Get etag value before actually building the file.
        etag = fieldfile.field.get_current_etag()
        # Arguments are passed separately so the message is only formatted
        # when the record is actually emitted.
        task_logger.info("%s -> %s@%s", obj.slug, field_name, etag)
        ret = self.build(fieldfile)
        fieldfile.update_etag(etag)
        obj.clear_cache()
        return ret

    def set_file_permissions(self, fieldfile):
        """Make the file non-readable when the owning instance is a preview."""
        if fieldfile.instance.preview:
            fieldfile.set_readable(False)

    def build(self, fieldfile):
        """Generate the output file and store it in ``fieldfile``.

        The document is transformed, the result is saved into the field,
        file permissions are adjusted and, for instances that have a pk,
        the field column is saved. If the format is listed in
        ``app_settings.FORMAT_ZIPS``, the corresponding zip is removed.
        """
        book = fieldfile.instance
        out = self.transform(
            book.wldocument2() if self.librarian2_api else book.wldocument(),
            fieldfile)
        # Close the generated file once it has been copied into storage;
        # previously the handle was left open.
        with open(out.get_filename(), 'rb') as source:
            fieldfile.save(None, File(source), save=False)
        self.set_file_permissions(fieldfile)
        if book.pk is not None:
            book.save(update_fields=[fieldfile.field.attname])
        if fieldfile.field.format_name in app_settings.FORMAT_ZIPS:
            remove_zip(app_settings.FORMAT_ZIPS[fieldfile.field.format_name])
# Don't decorate BuildEbook itself, because we want to subclass it.
# BuildEbookTask is what BuildEbook.for_format() returns for formats that
# have no builder registered.
BuildEbookTask = task(BuildEbook, ignore_result=True)
183
184
@BuildEbook.register('txt')
@task(ignore_result=True)
class BuildTxt(BuildEbook):
    """Builder registered for the ``txt`` format."""

    @staticmethod
    def transform(wldoc, fieldfile):
        """Return ``wldoc.as_text()``; ``fieldfile`` is not used."""
        text_output = wldoc.as_text()
        return text_output
191
192
@BuildEbook.register('pdf')
@task(ignore_result=True)
class BuildPdf(BuildEbook):
    """Builder registered for the ``pdf`` format."""

    @staticmethod
    def transform(wldoc, fieldfile):
        """Render the document with ``wldoc.as_pdf``.

        The base URL is the absolute gallery URL of the book's slug;
        ``fieldfile`` is not used.
        """
        pdf_options = {
            'morefloats': settings.LIBRARIAN_PDF_MOREFLOATS,
            'cover': True,
            'base_url': absolute_url(gallery_url(wldoc.book_info.url.slug)),
            'customizations': ['notoc'],
        }
        return wldoc.as_pdf(**pdf_options)

    def build(self, fieldfile):
        """Run the base build, then clear the cache for the instance's slug."""
        BuildEbook.build(self, fieldfile)
        book_slug = fieldfile.instance.slug
        clear_cache(book_slug)
205
206
@BuildEbook.register('epub')
@task(ignore_result=True)
class BuildEpub(BuildEbook):
    """Builder registered for the ``epub`` format; uses ``book.wldocument2()``."""
    librarian2_api = True

    @staticmethod
    def transform(wldoc, fieldfile):
        """Build the document with librarian's ``EpubBuilder``.

        The base URL is a ``file://`` URL of the book's gallery directory;
        ``fieldfile`` is not used.
        """
        from librarian.builders import EpubBuilder
        gallery_dir = os.path.abspath(gallery_path(wldoc.meta.url.slug))
        epub_builder = EpubBuilder(
            base_url='file://' + gallery_dir + '/',
            fundraising=settings.EPUB_FUNDRAISING,
        )
        return epub_builder.build(wldoc)
219
220
@BuildEbook.register('mobi')
@task(ignore_result=True)
class BuildMobi(BuildEbook):
    """Builder registered for the ``mobi`` format; uses ``book.wldocument2()``."""
    librarian2_api = True

    @staticmethod
    def transform(wldoc, fieldfile):
        """Build the document with librarian's ``MobiBuilder``.

        The base URL is a ``file://`` URL of the book's gallery directory;
        ``fieldfile`` is not used.
        """
        from librarian.builders import MobiBuilder
        gallery_dir = os.path.abspath(gallery_path(wldoc.meta.url.slug))
        mobi_builder = MobiBuilder(
            base_url='file://' + gallery_dir + '/',
            fundraising=settings.EPUB_FUNDRAISING,
        )
        return mobi_builder.build(wldoc)
233
234
@BuildEbook.register('html')
@task(ignore_result=True)
class BuildHtml(BuildEbook):
    """Builder registered for the ``html`` format.

    Besides saving the HTML file it recreates the book's fragments from the
    themed fragments found in the generated HTML.
    """

    def build(self, fieldfile):
        """Build the HTML file and recreate the book's fragments.

        Existing fragments of the book are always deleted. Returns True when
        ``transform`` produced output (file saved, fragments recreated and
        the ``html_built`` signal sent), False otherwise.
        """
        from django.core.files.base import ContentFile
        from librarian import html
        from catalogue.models import Fragment

        book = fieldfile.instance

        html_output = self.transform(book.wldocument(parse_dublincore=False), fieldfile)

        # Delete old fragments, create from scratch if necessary.
        book.fragments.all().delete()

        if not html_output:
            return False

        meta_tags = list(book.tags.filter(
            category__in=('author', 'epoch', 'genre', 'kind')))

        # Translate the book language through LANGUAGES_3TO2; a language that
        # is not listed in settings.LANGUAGES is treated as unknown (None).
        lang = book.language
        lang = LANGUAGES_3TO2.get(lang, lang)
        if lang not in [ln[0] for ln in settings.LANGUAGES]:
            lang = None

        fieldfile.save(None, ContentFile(html_output.get_bytes()), save=False)
        self.set_file_permissions(fieldfile)
        # Only the file column is written, through a queryset update.
        type(book).objects.filter(pk=book.pk).update(**{
            fieldfile.field.attname: fieldfile
        })

        # Extract fragments
        closed_fragments, open_fragments = html.extract_fragments(fieldfile.path)
        for fragment in closed_fragments.values():
            try:
                theme_names = [s.strip() for s in fragment.themes.split(',')]
            except AttributeError:
                # No usable ``themes`` string on this fragment.
                continue
            themes = self._theme_tags(theme_names, lang)
            if not themes:
                continue

            text = fragment.to_string()
            short_text = truncate_html_words(text, 15)
            if text == short_text:
                # No separate short version when nothing was truncated.
                short_text = ''
            new_fragment = Fragment.objects.create(
                anchor=fragment.id,
                book=book,
                text=text,
                short_text=short_text
            )

            new_fragment.save()
            new_fragment.tags = set(meta_tags + themes)
            for theme in themes:
                if not theme.for_books:
                    theme.for_books = True
                    theme.save()
        book.html_built.send(sender=type(self), instance=book)
        return True

    @staticmethod
    def _theme_tags(theme_names, lang):
        """Resolve the theme names of one fragment into theme tags.

        For a book in the default language (``settings.LANGUAGE_CODE``)
        missing theme tags are created. For other known languages only
        existing tags, matched by their ``name_<lang>`` value, are used.
        With ``lang`` set to None no tags are resolved. Empty names are
        skipped. Returns a list of tags.
        """
        from slugify import slugify
        from sortify import sortify
        from catalogue.models import Tag

        themes = []
        for theme_name in theme_names:
            if not theme_name:
                continue
            if lang == settings.LANGUAGE_CODE:
                # Allow creating themes if book in default language.
                tag, created = Tag.objects.get_or_create(
                    slug=slugify(theme_name),
                    category='theme'
                )
                if created:
                    tag.name = theme_name
                    setattr(tag, "name_%s" % lang, theme_name)
                    tag.sort_key = sortify(theme_name.lower())
                    tag.for_books = True
                    tag.save()
                themes.append(tag)
            elif lang is not None:
                # Don't create unknown themes in non-default languages.
                try:
                    tag = Tag.objects.get(
                        category='theme',
                        **{"name_%s" % lang: theme_name}
                    )
                except Tag.DoesNotExist:
                    pass
                else:
                    themes.append(tag)
        return themes

    @staticmethod
    def transform(wldoc, fieldfile):
        """Render the document with ``wldoc.as_html``.

        The gallery URL and path are derived from the last path segment of
        the document's ``identifier.url`` element; both are empty strings
        when the element is missing. ``fieldfile`` is not used.
        """
        # ugly, but we can't use wldoc.book_info here
        from librarian import DCNS
        url_elem = wldoc.edoc.getroot().find('.//' + DCNS('identifier.url'))
        if url_elem is None:
            gal_url = ''
            gal_path = ''
        else:
            # NOTE: raises IndexError when the URL text contains no '/'.
            slug = url_elem.text.rstrip('/').rsplit('/', 1)[1]
            gal_url = gallery_url(slug=slug)
            gal_path = gallery_path(slug=slug)
        return wldoc.as_html(gallery_path=gal_path, gallery_url=gal_url, base_url=absolute_url(gal_url))
339
340
class BuildCover(BuildEbook):
    """Common base of the cover builders; never changes file permissions."""

    def set_file_permissions(self, fieldfile):
        """Do nothing: the permissions of the saved file are left as they are."""
        return None
344
345
@BuildEbook.register('cover_clean')
@task(ignore_result=True)
class BuildCoverClean(BuildCover):
    """Builder registered for the ``cover_clean`` format."""

    @classmethod
    def transform(cls, wldoc, fieldfile):
        """Return a 240-wide cover output file.

        ``WLCover`` is used when the book's ``cover_box_position`` is
        ``'none'``, ``MarquiseCover`` otherwise. ``fieldfile`` is not used.
        """
        if wldoc.book_info.cover_box_position == 'none':
            from librarian.cover import WLCover as cover_class
        else:
            from librarian.covers.marquise import MarquiseCover as cover_class
        cover = cover_class(wldoc.book_info, width=240)
        return cover.output_file()
356
357
@BuildEbook.register('cover_thumb')
@task(ignore_result=True)
class BuildCoverThumb(BuildCover):
    """Builder registered for the ``cover_thumb`` format."""

    @classmethod
    def transform(cls, wldoc, fieldfile):
        """Return the output file of a ``WLCover`` of height 193."""
        from librarian.cover import WLCover
        cover = WLCover(wldoc.book_info, height=193)
        return cover.output_file()
365
366
@BuildEbook.register('cover_api_thumb')
@task(ignore_result=True)
class BuildCoverApiThumb(BuildCover):
    """Builder registered for the ``cover_api_thumb`` format."""

    @classmethod
    def transform(cls, wldoc, fieldfile):
        """Return the output file of a ``WLNoBoxCover`` of height 500."""
        from librarian.cover import WLNoBoxCover
        cover = WLNoBoxCover(wldoc.book_info, height=500)
        return cover.output_file()
374
375
@BuildEbook.register('simple_cover')
@task(ignore_result=True)
class BuildSimpleCover(BuildCover):
    """Builder registered for the ``simple_cover`` format."""

    @classmethod
    def transform(cls, wldoc, fieldfile):
        """Return the output file of a ``WLNoBoxCover`` of height 1000."""
        from librarian.cover import WLNoBoxCover
        cover = WLNoBoxCover(wldoc.book_info, height=1000)
        return cover.output_file()
383
384
@BuildEbook.register('cover_ebookpoint')
@task(ignore_result=True)
class BuildCoverEbookpoint(BuildCover):
    """Builder registered for the ``cover_ebookpoint`` format."""

    @classmethod
    def transform(cls, wldoc, fieldfile):
        """Return the output file of an ``EbookpointCover`` for the book."""
        from librarian.cover import EbookpointCover
        cover = EbookpointCover(wldoc.book_info)
        return cover.output_file()
392
393
# not used, but needed for migrations
class OverwritingFieldFile(FieldFile):
    """Field file that deletes the old file before saving the new one."""

    def save(self, name, content, *args, **kwargs):
        """Save ``content``, removing the currently stored file first.

        The old file is kept when the ``leave`` keyword is truthy, when no
        file is stored yet, or when ``content`` has a ``path`` equal to the
        path of the stored file.
        """
        leave = kwargs.pop('leave', None)
        # delete if there's a file already and there's a new one coming
        if not leave and self:
            same_file = hasattr(content, 'path') and content.path == self.path
            if not same_file:
                self.delete(save=False)
        return super().save(name, content, *args, **kwargs)
406
407
class OverwritingFileField(models.FileField):
    """File field whose file objects are ``OverwritingFieldFile`` instances."""

    attr_class = OverwritingFieldFile
410
411
class OverwriteStorage(FileSystemStorage):
    """File system storage that always stores a file under the requested name."""

    def get_available_name(self, name, max_length=None):
        """Delete whatever is stored under ``name`` and return ``name`` unchanged.

        ``max_length`` is accepted but not used.
        """
        requested_name = name
        self.delete(requested_name)
        return requested_name