423be1d82ab652594e6d1c10488aea0b9ca7f814
[wolnelektury.git] / src / catalogue / fields.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 import os
5 import pkg_resources
6 import random
7 from django.apps import apps
8 from django.conf import settings
9 from django.core.files import File
10 from django.db import models
11 from django.db.models.fields.files import FieldFile
12 from django.utils.deconstruct import deconstructible
13 from catalogue.constants import LANGUAGES_3TO2
14 from catalogue.utils import absolute_url, remove_zip, truncate_html_words, gallery_path, gallery_url
15 from waiter.utils import clear_cache
16
# Suffix appended to the current etag by EbookFieldFile.build_delay when a
# build has been queued; EbookField.find_stale treats such etags as not stale.
ETAG_SCHEDULED_SUFFIX = '-scheduled'
# Default task priority used by EbookFieldFile.build_delay.
EBOOK_BUILD_PRIORITY = 0
# NOTE(review): presumably the task priority for rebuilding stale ebooks;
# it is not referenced in this file -- confirm against callers.
EBOOK_REBUILD_PRIORITY = 9
20
21
@deconstructible
class UploadToPath(object):
    """Callable ``upload_to`` that fills a path template with the instance slug.

    ``path`` is a %-style template with a single placeholder, which is
    replaced by ``instance.slug`` when the object is called.
    """

    def __init__(self, path):
        self.path = path

    def __call__(self, instance, filename):
        """Return the storage path for ``instance``; ``filename`` is ignored."""
        return self.path % instance.slug

    def __eq__(self, other):
        """Two instances are equal when they carry the same path template."""
        return isinstance(other, type(self)) and other.path == self.path

    def __hash__(self):
        # Defining __eq__ alone makes instances unhashable in Python 3.
        # Hash only on the path so that equal objects always hash equally.
        return hash(self.path)
32
33
class EbookFieldFile(FieldFile):
    """Represents contents of an ebook file field."""

    def build(self):
        """Build the ebook immediately.

        The current etag is read before the build starts and recorded once
        the build has finished; the instance's cache is cleared afterwards.
        """
        etag = self.field.get_current_etag()
        self.field.build(self)
        self.update_etag(etag)
        self.instance.clear_cache()

    def build_delay(self, priority=EBOOK_BUILD_PRIORITY):
        """Build the ebook in a delayed task.

        The etag is first set to the current etag plus
        ``ETAG_SCHEDULED_SUFFIX``, then the ``build_field`` task is queued
        with the given ``priority``. Returns the result of ``apply_async``.
        """
        # Imported locally, as in the original code.
        # NOTE(review): presumably to avoid a circular import -- confirm.
        from .tasks import build_field

        self.update_etag(
            f"{self.field.get_current_etag()}{ETAG_SCHEDULED_SUFFIX}"
        )
        return build_field.apply_async(
            [self.instance.pk, self.field.attname],
            priority=priority
        )

    def set_readable(self, readable):
        """Set the file mode to 0o644 if ``readable``, otherwise 0o600."""
        permissions = 0o644 if readable else 0o600
        os.chmod(self.path, permissions)

    def update_etag(self, etag):
        """Store ``etag`` in the companion etag field, if the field has one.

        The value is set on the instance and, when the instance already has
        a primary key, written to the database for that single column.
        """
        etag_field_name = self.field.etag_field_name
        if not etag_field_name:
            # Fields declared with with_etag=False have no etag column;
            # setattr() with a None name would raise TypeError.
            return
        setattr(self.instance, etag_field_name, etag)
        if self.instance.pk:
            self.instance.save(update_fields=[etag_field_name])
65
66
class EbookField(models.FileField):
    """Represents an ebook file field, attachable to a model.

    Concrete formats subclass this, set ``ext`` and override ``transform``.
    Unless ``with_etag`` is false, a companion ``<name>_etag`` CharField is
    added to the model to record the etag the file was built with.
    """
    attr_class = EbookFieldFile
    # File extension of the format; also the default upload directory.
    ext = None
    # When False, find_stale() only considers books without children.
    for_parents = True
    # When True, build() feeds book.wldocument2() to transform() instead of
    # book.wldocument().
    librarian2_api = False
    # Name passed to remove_zip() after a build; falsy means nothing to remove.
    ZIP = None

    def __init__(self, verbose_name=None, with_etag=True, etag_field_name=None, **kwargs):
        """Set up the field with ebook-specific defaults.

        ``max_length``, ``blank``, ``default`` and ``upload_to`` receive
        defaults unless given explicitly. ``with_etag`` controls whether a
        companion etag field is created; ``etag_field_name`` names an
        already existing one.
        """
        kwargs.setdefault('verbose_name', verbose_name)
        self.with_etag = with_etag
        self.etag_field_name = etag_field_name
        kwargs.setdefault('max_length', 255)
        kwargs.setdefault('blank', True)
        kwargs.setdefault('default', '')
        kwargs.setdefault('upload_to', self.get_upload_to(self.ext))

        super().__init__(**kwargs)

    def deconstruct(self):
        """Return the deconstructed field, omitting kwargs equal to our defaults."""
        name, path, args, kwargs = super().deconstruct()
        if kwargs.get('max_length') == 255:
            del kwargs['max_length']
        if kwargs.get('blank') is True:
            del kwargs['blank']
        if kwargs.get('default') == '':
            del kwargs['default']
        if self.get_upload_to(self.ext) == kwargs.get('upload_to'):
            del kwargs['upload_to']
        # with_etag creates a second field, which then deconstructs to manage
        # its own migrations. So for migrations, etag_field_name is explicitly
        # set to avoid double creation of the etag field.
        if self.with_etag:
            kwargs['etag_field_name'] = self.etag_field_name
        else:
            kwargs['with_etag'] = self.with_etag

        return name, path, args, kwargs

    @classmethod
    def get_upload_to(cls, directory):
        """Return the ``UploadToPath`` for this format.

        NOTE: the ``directory`` argument is kept for interface compatibility
        but is ignored; the class attribute ``directory`` is used instead,
        falling back to ``ext``.
        """
        target_dir = getattr(cls, 'directory', cls.ext)
        upload_template = f'book/{target_dir}/%s.{cls.ext}'
        return UploadToPath(upload_template)

    def contribute_to_class(self, cls, name):
        """Attach the field, its etag field and a ``has_<attname>`` method to ``cls``."""
        super().contribute_to_class(cls, name)

        if self.with_etag and not self.etag_field_name:
            self.etag_field_name = f'{name}_etag'
            self.etag_field = models.CharField(max_length=255, editable=False, default='', db_index=True)
            self.etag_field.contribute_to_class(cls, self.etag_field_name)

        def has(model_instance):
            return bool(getattr(model_instance, self.attname, None))
        has.__doc__ = None
        has.__name__ = 'has_%s' % self.attname
        has.short_description = self.name
        has.boolean = True

        setattr(cls, 'has_%s' % self.attname, has)

    def get_current_etag(self):
        """Return the etag a freshly built file of this format would get.

        It is the installed ``librarian`` version, followed by ``_`` and the
        etag of the MediaInsertSet for this format when there is one.
        """
        MediaInsertSet = apps.get_model('annoy', 'MediaInsertSet')
        librarian_version = pkg_resources.get_distribution("librarian").version
        etag = librarian_version
        mis = MediaInsertSet.get_for_format(self.ext)
        if mis is not None:
            etag += '_' + mis.etag
        return etag

    def find_stale(self, limit):
        """Find some books where this format is stale.

        Returns at most ``limit`` model instances, in random order, whose
        etag is neither the current one nor the current one marked as
        scheduled. Returns an empty list for fields without an etag.
        """
        # If there is no ETag field, bail. That's true for xml file field.
        if not self.with_etag:
            return []

        etag = self.get_current_etag()

        queryset = self.model.objects.all()
        if not self.for_parents:
            queryset = queryset.filter(children=None)

        queryset = queryset.exclude(**{
            f'{self.etag_field_name}__in': [
                etag, f'{etag}{ETAG_SCHEDULED_SUFFIX}'
            ]
        })

        queryset = queryset.order_by('?')[:limit]
        return queryset

    @classmethod
    def find_all_stale(cls, model, limit):
        """Find stale ebooks of all formats on ``model``.

        Returns a shuffled list of at most ``limit`` ``(field_name,
        instance)`` tuples. Nothing is scheduled here; that is up to the
        caller.
        """
        found = []
        for field in model._meta.fields:
            if isinstance(field, cls):
                for instance in field.find_stale(limit):
                    found.append((
                        field.name,
                        instance
                    ))
        random.shuffle(found)
        found = found[:limit]
        return found

    @staticmethod
    def transform(wldoc):
        """Transform a librarian.WLDocument into a librarian.OutputFile.

        Must be overridden by concrete formats.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        # `NotImplemented` is a non-callable constant; calling it raised a
        # confusing TypeError instead of signalling an abstract method.
        raise NotImplementedError()

    def set_file_permissions(self, fieldfile):
        """Make the file unreadable (see set_readable) for preview books."""
        if fieldfile.instance.preview:
            fieldfile.set_readable(False)

    def build(self, fieldfile):
        """Build the file for ``fieldfile``'s book and store it.

        The transformed output is saved into the field without saving the
        model, permissions are adjusted, the field is saved on the book if
        it has a primary key, and finally ``remove_zip`` is called with
        ``ZIP`` when that is set.
        """
        book = fieldfile.instance
        out = self.transform(
            book.wldocument2() if self.librarian2_api else book.wldocument(),
        )
        with open(out.get_filename(), 'rb') as f:
            fieldfile.save(None, File(f), save=False)
        self.set_file_permissions(fieldfile)
        if book.pk is not None:
            book.save(update_fields=[self.attname])
        if self.ZIP:
            remove_zip(self.ZIP)
196
197
class XmlField(EbookField):
    """Field for the XML file; building it is a no-op."""
    ext = 'xml'

    def build(self, fieldfile):
        """Do nothing; this format is not generated by a build."""
        return None
203
204
class TxtField(EbookField):
    """Field for the ``txt`` format (``for_parents`` is off)."""
    for_parents = False
    ext = 'txt'

    @staticmethod
    def transform(wldoc):
        """Return the result of ``wldoc.as_text()``."""
        text_output = wldoc.as_text()
        return text_output
212
213
class Fb2Field(EbookField):
    """Field for the ``fb2`` format (``for_parents`` is off)."""
    ZIP = 'wolnelektury_pl_fb2'
    for_parents = False
    ext = 'fb2'

    @staticmethod
    def transform(wldoc):
        """Return the result of ``wldoc.as_fb2()``."""
        fb2_output = wldoc.as_fb2()
        return fb2_output
222
223
class PdfField(EbookField):
    """Field for the ``pdf`` format."""
    ZIP = 'wolnelektury_pl_pdf'
    ext = 'pdf'

    @staticmethod
    def transform(wldoc):
        """Render ``wldoc`` to PDF with a cover and the ``notoc`` customization.

        The base URL is the absolute gallery URL for the document's slug.
        """
        morefloats = settings.LIBRARIAN_PDF_MOREFLOATS
        gallery_base = absolute_url(gallery_url(wldoc.book_info.url.slug))
        return wldoc.as_pdf(
            morefloats=morefloats,
            cover=True,
            base_url=gallery_base,
            customizations=['notoc'],
        )

    def build(self, fieldfile):
        """Run the regular build, then call ``clear_cache`` with the book's slug."""
        super().build(fieldfile)
        book_slug = fieldfile.instance.slug
        clear_cache(book_slug)
237
238
class EpubField(EbookField):
    """Field for the ``epub`` format, built with ``EpubBuilder``."""
    ZIP = 'wolnelektury_pl_epub'
    librarian2_api = True
    ext = 'epub'

    @staticmethod
    def transform(wldoc):
        """Build ``wldoc`` with ``EpubBuilder``.

        The builder gets a ``file://`` base URL pointing at the book's
        gallery directory and the fundraising texts for ``epub``.
        """
        from librarian.builders import EpubBuilder
        insert_sets = apps.get_model('annoy', 'MediaInsertSet')
        gallery_dir = os.path.abspath(gallery_path(wldoc.meta.url.slug))
        builder = EpubBuilder(
            base_url='file://' + gallery_dir + '/',
            fundraising=insert_sets.get_texts_for('epub'),
        )
        return builder.build(wldoc)
252
253
class MobiField(EbookField):
    """Field for the ``mobi`` format, built with ``MobiBuilder``."""
    ZIP = 'wolnelektury_pl_mobi'
    librarian2_api = True
    ext = 'mobi'

    @staticmethod
    def transform(wldoc):
        """Build ``wldoc`` with ``MobiBuilder``.

        The builder gets a ``file://`` base URL pointing at the book's
        gallery directory and the fundraising texts for ``mobi``.
        """
        from librarian.builders import MobiBuilder
        insert_sets = apps.get_model('annoy', 'MediaInsertSet')
        gallery_dir = os.path.abspath(gallery_path(wldoc.meta.url.slug))
        builder = MobiBuilder(
            base_url='file://' + gallery_dir + '/',
            fundraising=insert_sets.get_texts_for('mobi'),
        )
        return builder.build(wldoc)
267
268
class HtmlField(EbookField):
    """Field for the ``html`` format; building it also recreates fragments."""
    for_parents = False
    ext = 'html'

    def build(self, fieldfile):
        """Render the book to HTML, store it and recreate its fragments.

        The book's existing fragments are always deleted first. When the
        transform yields output, the file is saved, fragments with at least
        one resolvable theme are created and tagged, and the book's
        ``html_built`` signal is sent.

        Returns True when HTML output was produced, False otherwise.
        """
        from django.core.files.base import ContentFile
        from slugify import slugify
        from sortify import sortify
        from librarian import html
        from catalogue.models import Fragment, Tag

        book = fieldfile.instance
        html_output = self.transform(book.wldocument(parse_dublincore=False))

        # Old fragments are dropped whether or not new HTML was produced.
        book.fragments.all().delete()

        if not html_output:
            return False

        meta_tags = list(book.tags.filter(
            category__in=('author', 'epoch', 'genre', 'kind')))

        book_language = book.language
        lang = LANGUAGES_3TO2.get(book_language, book_language)
        site_language_codes = [entry[0] for entry in settings.LANGUAGES]
        if lang not in site_language_codes:
            lang = None

        fieldfile.save(None, ContentFile(html_output.get_bytes()), save=False)
        self.set_file_permissions(fieldfile)
        type(book).objects.filter(pk=book.pk).update(**{
            fieldfile.field.attname: fieldfile
        })

        # Extract fragments; only the closed ones are turned into records.
        closed_fragments, _open_fragments = html.extract_fragments(fieldfile.path)
        for fragment in closed_fragments.values():
            try:
                theme_names = [part.strip() for part in fragment.themes.split(',')]
            except AttributeError:
                # No usable themes attribute on this fragment: skip it.
                continue

            themes = []
            for theme_name in theme_names:
                if not theme_name:
                    continue
                if lang == settings.LANGUAGE_CODE:
                    # Allow creating themes if book in default language.
                    tag, created = Tag.objects.get_or_create(
                        slug=slugify(theme_name),
                        category='theme'
                    )
                    if created:
                        tag.name = theme_name
                        setattr(tag, "name_%s" % lang, theme_name)
                        tag.sort_key = sortify(theme_name.lower())
                        tag.for_books = True
                        tag.save()
                    themes.append(tag)
                elif lang is not None:
                    # Don't create unknown themes in non-default languages.
                    try:
                        tag = Tag.objects.get(
                            category='theme',
                            **{"name_%s" % lang: theme_name}
                        )
                    except Tag.DoesNotExist:
                        continue
                    themes.append(tag)

            if not themes:
                continue

            text = fragment.to_string()
            truncated = truncate_html_words(text, 15)
            # An empty short text means the full text is already short enough.
            short_text = '' if text == truncated else truncated
            new_fragment = Fragment.objects.create(
                anchor=fragment.id,
                book=book,
                text=text,
                short_text=short_text
            )

            new_fragment.save()
            new_fragment.tags = set(meta_tags + themes)
            for theme in themes:
                if theme.for_books:
                    continue
                theme.for_books = True
                theme.save()

        book.html_built.send(sender=type(self), instance=book)
        return True

    @staticmethod
    def transform(wldoc):
        """Render ``wldoc`` to HTML, pointing it at the book's gallery.

        The slug is the last path segment of the document's
        ``identifier.url`` element; without that element, empty gallery
        path and URL are used.
        """
        # ugly, but we can't use wldoc.book_info here
        from librarian import DCNS
        url_elem = wldoc.edoc.getroot().find('.//' + DCNS('identifier.url'))
        gal_url = ''
        gal_path = ''
        if url_elem is not None:
            slug = url_elem.text.rstrip('/').rsplit('/', 1)[1]
            gal_url = gallery_url(slug=slug)
            gal_path = gallery_path(slug=slug)
        return wldoc.as_html(
            gallery_path=gal_path,
            gallery_url=gal_url,
            base_url=absolute_url(gal_url),
        )
374
375
class CoverField(EbookField):
    """Field for a ``jpg`` cover kept in the ``cover`` directory."""
    directory = 'cover'
    ext = 'jpg'

    @staticmethod
    def transform(wldoc):
        """Return the result of ``wldoc.as_cover()``."""
        cover_output = wldoc.as_cover()
        return cover_output

    def set_file_permissions(self, fieldfile):
        """Leave the file's permissions untouched."""
        return None
386
387
class CoverCleanField(CoverField):
    """Cover variant kept in ``cover_clean``, rendered with ``MarquiseCover``."""
    directory = 'cover_clean'

    @staticmethod
    def transform(wldoc):
        """Return the output file of a 360-wide ``MarquiseCover``."""
        from librarian.covers.marquise import MarquiseCover
        cover = MarquiseCover(wldoc.book_info, width=360)
        return cover.output_file()
395
396
class CoverThumbField(CoverField):
    """Cover variant kept in ``cover_thumb``, rendered with ``WLCover``."""
    directory = 'cover_thumb'

    @staticmethod
    def transform(wldoc):
        """Return the output file of a ``WLCover`` with height 193."""
        from librarian.cover import WLCover
        cover = WLCover(wldoc.book_info, height=193)
        return cover.output_file()
404
405
class CoverApiThumbField(CoverField):
    """Cover variant kept in ``cover_api_thumb``, rendered with ``WLNoBoxCover``."""
    directory = 'cover_api_thumb'

    @staticmethod
    def transform(wldoc):
        """Return the output file of a ``WLNoBoxCover`` with height 500."""
        from librarian.cover import WLNoBoxCover
        cover = WLNoBoxCover(wldoc.book_info, height=500)
        return cover.output_file()
413
414
class SimpleCoverField(CoverField):
    """Cover variant kept in ``cover_simple``, rendered with ``WLNoBoxCover``."""
    directory = 'cover_simple'

    @staticmethod
    def transform(wldoc):
        """Return the output file of a ``WLNoBoxCover`` with height 1000."""
        from librarian.cover import WLNoBoxCover
        cover = WLNoBoxCover(wldoc.book_info, height=1000)
        return cover.output_file()
422
423
class CoverEbookpointField(CoverField):
    """Cover variant kept in ``cover_ebookpoint``, rendered with ``EbookpointCover``."""
    directory = 'cover_ebookpoint'

    @staticmethod
    def transform(wldoc):
        """Return the output file of an ``EbookpointCover``."""
        from librarian.cover import EbookpointCover
        cover = EbookpointCover(wldoc.book_info)
        return cover.output_file()