Content warnings.
[wolnelektury.git] / src / catalogue / fields.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from django.conf import settings
5 from django.core.files import File
6 from django.core.files.storage import FileSystemStorage
7 from django.db import models
8 from django.db.models.fields.files import FieldFile
9 from catalogue import app_settings
10 from catalogue.constants import LANGUAGES_3TO2, EBOOK_FORMATS_WITH_CHILDREN, EBOOK_FORMATS_WITHOUT_CHILDREN
11 from catalogue.utils import remove_zip, truncate_html_words, gallery_path, gallery_url
12 from celery.task import Task, task
13 from celery.utils.log import get_task_logger
14 from waiter.utils import clear_cache
15
# Celery priorities passed to ``apply_async``: the first is the default for
# builds, the second is used when a file already exists and is only rebuilt.
EBOOK_BUILD_PRIORITY = 0
EBOOK_REBUILD_PRIORITY = 9
# Appended to an etag value once a build for it has been queued.
ETAG_SCHEDULED_SUFFIX = '-scheduled'

# Logger used by the build tasks defined in this module.
task_logger = get_task_logger(__name__)
21
22
class EbookFieldFile(FieldFile):
    """Value of an ebook file field on a model instance.

    Adds building (immediate or delayed), URL lookup, permission switching
    and etag bookkeeping on top of the regular ``FieldFile``.
    """

    def build(self):
        """Build the ebook right away using the field's builder.

        Returns whatever the builder's ``build`` returns.
        """
        builder = self.field.builder
        return builder.build(self)

    def build_delay(self, priority=EBOOK_BUILD_PRIORITY):
        """Queue building of the ebook as a delayed task.

        The etag is first set to the current etag plus the "scheduled"
        suffix, then the builder task is submitted with ``priority``.
        Returns the result of ``apply_async``.
        """
        scheduled_etag = self.field.get_current_etag() + ETAG_SCHEDULED_SUFFIX
        self.update_etag(scheduled_etag)
        builder = self.field.builder
        return builder.apply_async(
            [self.instance, self.field.attname],
            priority=priority,
        )

    def get_url(self):
        """Return the instance's media URL for this field.

        The part of the attribute name before the first underscore is what
        gets passed to ``instance.media_url``.
        """
        prefix, _, _ = self.field.attname.partition('_')
        return self.instance.media_url(prefix)

    def set_readable(self, readable):
        """Set file mode to 0o644 when ``readable`` is true, 0o600 otherwise."""
        import os
        mode = 0o600
        if readable:
            mode = 0o644
        os.chmod(self.path, mode)

    def update_etag(self, etag):
        """Store ``etag`` on the instance; persist it if the instance is saved.

        Only the etag column is written, and only when the instance already
        has a primary key.
        """
        etag_attr = self.field.etag_field_name
        setattr(self.instance, etag_attr, etag)
        if self.instance.pk:
            self.instance.save(update_fields=[etag_attr])
52
53
class EbookField(models.FileField):
    """File field for a single ebook format, attachable to a model.

    Each field contributed to a model class is appended to ``registry`` so
    that ``schedule_all_stale`` can walk over all of them.
    """
    attr_class = EbookFieldFile
    registry = []

    def __init__(self, format_name, *args, **kwargs):
        """Remember ``format_name``; remaining arguments go to ``FileField``."""
        super().__init__(*args, **kwargs)
        self.format_name = format_name

    def deconstruct(self):
        """Return the deconstructed field with ``format_name`` as first arg."""
        name, path, args, kwargs = super().deconstruct()
        return name, path, [self.format_name, *args], kwargs

    @property
    def builder(self):
        """The celery task registered for this field's format."""
        return BuildEbook.for_format(self.format_name)

    def contribute_to_class(self, cls, name):
        """Attach the field to ``cls`` and add a ``has_<attname>`` helper.

        Also derives the name of the companion etag field and records this
        field in the shared registry.
        """
        super().contribute_to_class(cls, name)

        self.etag_field_name = f'{name}_etag'
        helper_name = 'has_%s' % self.attname

        def has(model_instance):
            return bool(getattr(model_instance, self.attname, None))
        has.__doc__ = None
        has.__name__ = str(helper_name)
        has.short_description = self.name
        has.boolean = True

        self.registry.append(self)

        setattr(cls, helper_name, has)

    def get_current_etag(self):
        """Return the installed ``librarian`` distribution's version string."""
        import pkg_resources
        return pkg_resources.get_distribution("librarian").version

    def schedule_stale(self, queryset=None):
        """Queue builds of this format for every object with a stale etag.

        ``queryset`` defaults to all objects of the field's model. Objects
        whose etag equals the current etag, or the current etag with the
        "scheduled" suffix, are skipped.
        """
        # Without an etag attribute on the model there is nothing to compare
        # against (that is the case for the xml file field).
        if not hasattr(self.model, f'{self.attname}_etag'):
            return

        etag = self.get_current_etag()
        candidates = self.model.objects.all() if queryset is None else queryset

        # These formats are scheduled only for objects without children.
        if self.format_name in EBOOK_FORMATS_WITHOUT_CHILDREN + ['html']:
            candidates = candidates.filter(children=None)

        up_to_date = [etag, f'{etag}{ETAG_SCHEDULED_SUFFIX}']
        stale = candidates.exclude(
            **{f'{self.etag_field_name}__in': up_to_date}
        )
        for obj in stale:
            fieldfile = getattr(obj, self.attname)
            # An existing file means a rebuild; a missing one, a first build.
            if fieldfile:
                priority = EBOOK_REBUILD_PRIORITY
            else:
                priority = EBOOK_BUILD_PRIORITY
            fieldfile.build_delay(priority=priority)

    @classmethod
    def schedule_all_stale(cls):
        """Run ``schedule_stale`` for every registered ebook field."""
        for ebook_field in cls.registry:
            ebook_field.schedule_stale()
122
123
124
class BuildEbook(Task):
    """Base celery task building an ebook file for a model instance.

    Subclasses are registered per format name with ``register`` and looked
    up with ``for_format``; formats without a dedicated subclass fall back
    to ``BuildEbookTask``.
    """
    # Maps format name -> builder registered for it.
    formats = {}

    @classmethod
    def register(cls, format_name):
        """A decorator for registering subclasses for particular formats."""
        def wrapper(builder):
            cls.formats[format_name] = builder
            return builder
        return wrapper

    @classmethod
    def for_format(cls, format_name):
        """Returns a celery task suitable for specified format."""
        return cls.formats.get(format_name, BuildEbookTask)

    @staticmethod
    def transform(wldoc, fieldfile):
        """Transforms a librarian.WLDocument into a librarian.OutputFile.

        By default, it just calls relevant wldoc.as_??? method.

        """
        return getattr(wldoc, "as_%s" % fieldfile.field.format_name)()

    def run(self, obj, field_name):
        """Build the file stored in ``obj.<field_name>`` and update its etag.

        The FieldFile can't be passed to Celery directly, so the task gets
        the model instance and the field's attribute name instead.
        Returns whatever ``build`` returns.
        """
        fieldfile = getattr(obj, field_name)

        # Get etag value before actually building the file.
        etag = fieldfile.field.get_current_etag()
        task_logger.info("%s -> %s@%s", obj.slug, field_name, etag)
        ret = self.build(fieldfile)
        fieldfile.update_etag(etag)
        obj.clear_cache()
        return ret

    def set_file_permissions(self, fieldfile):
        """Mark the file as not readable when its instance is a preview."""
        if fieldfile.instance.preview:
            fieldfile.set_readable(False)

    def build(self, fieldfile):
        """Generate the file for ``fieldfile`` and store it on the instance.

        The document is transformed, the output is saved into the field,
        permissions are adjusted, the field's column is persisted when the
        instance has a primary key, and the format's zip (if the format has
        one configured in ``FORMAT_ZIPS``) is removed.
        """
        book = fieldfile.instance
        out = self.transform(book.wldocument(), fieldfile)
        # Close the generated file once the storage has consumed it, so the
        # handle is not leaked on every build.
        with open(out.get_filename(), 'rb') as built_file:
            fieldfile.save(None, File(built_file), save=False)
        self.set_file_permissions(fieldfile)
        if book.pk is not None:
            book.save(update_fields=[fieldfile.field.attname])
        format_name = fieldfile.field.format_name
        if format_name in app_settings.FORMAT_ZIPS:
            remove_zip(app_settings.FORMAT_ZIPS[format_name])


# Don't decorate BuildEbook, because we want to subclass it.
BuildEbookTask = task(BuildEbook, ignore_result=True)
177
178
@BuildEbook.register('txt')
@task(ignore_result=True)
class BuildTxt(BuildEbook):
    """Builder registered for the 'txt' format."""

    @staticmethod
    def transform(wldoc, fieldfile):
        """Return the document rendered with ``as_text``.

        ``fieldfile`` is accepted for interface compatibility and not used.
        """
        text_output = wldoc.as_text()
        return text_output
185
186
@BuildEbook.register('pdf')
@task(ignore_result=True)
class BuildPdf(BuildEbook):
    """Builder registered for the 'pdf' format."""

    @staticmethod
    def transform(wldoc, fieldfile):
        """Return the document rendered with ``as_pdf``.

        The PDF gets a cover, no table of contents customization ('notoc'),
        illustrations from the book's gallery path and the ``morefloats``
        option taken from settings. ``fieldfile`` is not used.
        """
        pdf_options = {
            'morefloats': settings.LIBRARIAN_PDF_MOREFLOATS,
            'cover': True,
            'ilustr_path': gallery_path(wldoc.book_info.url.slug),
            'customizations': ['notoc'],
        }
        return wldoc.as_pdf(**pdf_options)

    def build(self, fieldfile):
        """Run the base build, then clear the cache for the book's slug."""
        BuildEbook.build(self, fieldfile)
        book_slug = fieldfile.instance.slug
        clear_cache(book_slug)
199
200
@BuildEbook.register('epub')
@task(ignore_result=True)
class BuildEpub(BuildEbook):
    """Builder registered for the 'epub' format."""

    @staticmethod
    def transform(wldoc, fieldfile):
        """Return the document rendered with ``as_epub``, cover included.

        Illustrations are taken from the gallery path of the book's slug.
        ``fieldfile`` is not used.
        """
        illustrations = gallery_path(wldoc.book_info.url.slug)
        return wldoc.as_epub(cover=True, ilustr_path=illustrations)
207
208
@BuildEbook.register('mobi')
@task(ignore_result=True)
class BuildMobi(BuildEbook):
    """Builder registered for the 'mobi' format."""

    @staticmethod
    def transform(wldoc, fieldfile):
        """Return the document rendered with ``as_mobi``, cover included.

        Illustrations are taken from the gallery path of the book's slug.
        ``fieldfile`` is not used.
        """
        illustrations = gallery_path(wldoc.book_info.url.slug)
        return wldoc.as_mobi(cover=True, ilustr_path=illustrations)
215
216
@BuildEbook.register('html')
@task(ignore_result=True)
class BuildHtml(BuildEbook):
    """Builder registered for the 'html' format.

    Besides storing the HTML file it recreates the book's fragments and
    tags them with the book's meta tags and the fragments' themes.
    """

    def build(self, fieldfile):
        """Build the HTML file and rebuild the book's fragments.

        Existing fragments are always deleted first. Returns True when the
        transformation produced output (file stored, fragments recreated,
        ``html_built`` signal sent) and False otherwise.
        """
        from django.core.files.base import ContentFile
        from librarian import html
        from catalogue.models import Fragment

        book = fieldfile.instance

        html_output = self.transform(book.wldocument(parse_dublincore=False), fieldfile)

        # Delete old fragments, create from scratch if necessary.
        book.fragments.all().delete()

        if not html_output:
            return False

        meta_tags = list(book.tags.filter(
            category__in=('author', 'epoch', 'genre', 'kind')))
        lang = self._theme_language(book)

        fieldfile.save(None, ContentFile(html_output.get_bytes()), save=False)
        self.set_file_permissions(fieldfile)
        # Write just the file column with a queryset update instead of
        # calling save() on the book.
        type(book).objects.filter(pk=book.pk).update(**{
            fieldfile.field.attname: fieldfile
        })

        # Extract fragments; only the closed ones are turned into objects.
        closed_fragments, _open_fragments = html.extract_fragments(fieldfile.path)
        for fragment in closed_fragments.values():
            try:
                theme_names = [s.strip() for s in fragment.themes.split(',')]
            except AttributeError:
                # The fragment carries no splittable themes value.
                continue
            themes = self._theme_tags(theme_names, lang)
            if not themes:
                continue

            text = fragment.to_string()
            short_text = truncate_html_words(text, 15)
            if text == short_text:
                # Nothing was truncated, so a separate short text is pointless.
                short_text = ''
            new_fragment = Fragment.objects.create(
                anchor=fragment.id,
                book=book,
                text=text,
                short_text=short_text
            )

            # NOTE(review): create() has already saved the object; this extra
            # save is kept so the sequence of saves stays as it was.
            new_fragment.save()
            new_fragment.tags = set(meta_tags + themes)
            for theme in themes:
                if not theme.for_books:
                    theme.for_books = True
                    theme.save()
        book.html_built.send(sender=type(self), instance=book)
        return True

    @staticmethod
    def _theme_language(book):
        """Return the book's language mapped through LANGUAGES_3TO2.

        Returns None when the resulting code is not listed in
        ``settings.LANGUAGES``.
        """
        lang = LANGUAGES_3TO2.get(book.language, book.language)
        if lang not in [ln[0] for ln in settings.LANGUAGES]:
            return None
        return lang

    @staticmethod
    def _theme_tags(theme_names, lang):
        """Resolve theme names to Tag objects for a book in language ``lang``.

        For the default site language missing theme tags are created; for
        other known languages only tags already having that localized name
        are used; with ``lang`` of None nothing is resolved. Empty names are
        skipped.
        """
        from slugify import slugify
        from sortify import sortify
        from catalogue.models import Tag

        themes = []
        for theme_name in theme_names:
            if not theme_name:
                continue
            if lang == settings.LANGUAGE_CODE:
                # Allow creating themes if book in default language.
                tag, created = Tag.objects.get_or_create(
                    slug=slugify(theme_name),
                    category='theme'
                )
                if created:
                    tag.name = theme_name
                    setattr(tag, "name_%s" % lang, theme_name)
                    tag.sort_key = sortify(theme_name.lower())
                    tag.for_books = True
                    tag.save()
                themes.append(tag)
            elif lang is not None:
                # Don't create unknown themes in non-default languages.
                try:
                    tag = Tag.objects.get(
                        category='theme',
                        **{"name_%s" % lang: theme_name}
                    )
                except Tag.DoesNotExist:
                    pass
                else:
                    themes.append(tag)
        return themes

    @staticmethod
    def transform(wldoc, fieldfile):
        """Return the document rendered with ``as_html``.

        The gallery URL option is derived from the last path segment of the
        document's ``identifier.url`` element; it is empty when the element
        is missing or has no text. ``fieldfile`` is not used.
        """
        # ugly, but we can't use wldoc.book_info here
        # (build() parses the document with parse_dublincore=False).
        from librarian import DCNS
        url_elem = wldoc.edoc.getroot().find('.//' + DCNS('identifier.url'))
        url = url_elem.text if url_elem is not None else None
        if url:
            # [-1] also copes with a value that contains no slash at all.
            gallery = gallery_url(slug=url.rstrip('/').rsplit('/', 1)[-1])
        else:
            gallery = ''
        return wldoc.as_html(options={'gallery': "'%s'" % gallery})
318
319
class BuildCover(BuildEbook):
    """Common base of the cover builders."""

    def set_file_permissions(self, fieldfile):
        """Leave the file's permissions untouched (overrides the base)."""
        return None
323
324
@BuildEbook.register('cover_thumb')
@task(ignore_result=True)
class BuildCoverThumb(BuildCover):
    """Builder registered for the 'cover_thumb' format."""

    @classmethod
    def transform(cls, wldoc, fieldfile):
        """Return the output file of a ``WLCover`` with height 193.

        ``fieldfile`` is not used.
        """
        from librarian.cover import WLCover
        cover = WLCover(wldoc.book_info, height=193)
        return cover.output_file()
332
333
@BuildEbook.register('cover_api_thumb')
@task(ignore_result=True)
class BuildCoverApiThumb(BuildCover):
    """Builder registered for the 'cover_api_thumb' format."""

    @classmethod
    def transform(cls, wldoc, fieldfile):
        """Return the output file of a ``WLNoBoxCover`` with height 500.

        ``fieldfile`` is not used.
        """
        from librarian.cover import WLNoBoxCover
        cover = WLNoBoxCover(wldoc.book_info, height=500)
        return cover.output_file()
341
342
@BuildEbook.register('simple_cover')
@task(ignore_result=True)
class BuildSimpleCover(BuildCover):
    """Builder registered for the 'simple_cover' format."""

    @classmethod
    def transform(cls, wldoc, fieldfile):
        """Return the output file of a ``WLNoBoxCover`` with height 1000.

        ``fieldfile`` is not used.
        """
        from librarian.cover import WLNoBoxCover
        cover = WLNoBoxCover(wldoc.book_info, height=1000)
        return cover.output_file()
350
351
@BuildEbook.register('cover_ebookpoint')
@task(ignore_result=True)
class BuildCoverEbookpoint(BuildCover):
    """Builder registered for the 'cover_ebookpoint' format."""

    @classmethod
    def transform(cls, wldoc, fieldfile):
        """Return the output file of an ``EbookpointCover`` for the book.

        ``fieldfile`` is not used.
        """
        from librarian.cover import EbookpointCover
        cover = EbookpointCover(wldoc.book_info)
        return cover.output_file()
359
360
361 # not used, but needed for migrations
class OverwritingFieldFile(FieldFile):
    """Field file that removes its current file before storing a new one."""

    def save(self, name, content, *args, **kwargs):
        """Save ``content``, deleting the previously stored file first.

        The old file is kept when the ``leave`` keyword is truthy, when no
        file is set yet, or when ``content`` already lives at this file's
        path. ``leave`` is consumed here and not passed on.
        """
        leave = kwargs.pop('leave', None)
        if not leave and self:
            # Only a genuinely different incoming file replaces the old one.
            is_other_file = not hasattr(content, 'path') or content.path != self.path
            if is_other_file:
                self.delete(save=False)
        return super().save(name, content, *args, **kwargs)
373
374
class OverwritingFileField(models.FileField):
    """File field whose values are ``OverwritingFieldFile`` objects."""

    attr_class = OverwritingFieldFile
377
378
class OverwriteStorage(FileSystemStorage):
    """File-system storage that reuses a requested name as it is."""

    def get_available_name(self, name, max_length=None):
        """Delete anything stored under ``name`` and return ``name`` unchanged.

        ``max_length`` is accepted for interface compatibility and ignored.
        """
        self.delete(name)
        return name