9c5696fa677a0264c45e4dff8524825bb9b18055
[wolnelektury.git] / src / catalogue / fields.py
1 # This file is part of Wolne Lektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 import io
5 import os
6 import pkg_resources
7 import random
8 import time
9 from urllib.request import urlopen
10 from django.apps import apps
11 from django.conf import settings
12 from django.core.files import File
13 from django.db import models
14 from django.db.models.fields.files import FieldFile
15 from django.utils.deconstruct import deconstructible
16 from librarian.cover import make_cover
17 from catalogue.constants import LANGUAGES_3TO2
18 from catalogue.utils import absolute_url, remove_zip, truncate_html_words, gallery_path, gallery_url
19 from waiter.utils import clear_cache
20
# Suffix appended to the current etag when a build has been queued
# (see EbookFieldFile.build_delay); find_stale() treats such values as not stale.
ETAG_SCHEDULED_SUFFIX = '-scheduled'
# Default task priority used by EbookFieldFile.build_delay().
EBOOK_BUILD_PRIORITY = 0
# NOTE(review): not referenced in this file; presumably the task priority
# used when rebuilding stale files -- confirm against callers.
EBOOK_REBUILD_PRIORITY = 9
24
25
@deconstructible
class UploadToPath(object):
    """Callable ``upload_to`` building a file path from the instance's slug.

    ``path`` is a %-format template; calling the object fills it in with
    ``instance.slug``. Two instances compare equal when their templates
    are equal.
    """

    def __init__(self, path):
        self.path = path

    def __call__(self, instance, filename):
        """Return the path for ``instance``; ``filename`` is ignored."""
        return self.path % instance.slug

    def __eq__(self, other):
        return isinstance(other, type(self)) and other.path == self.path

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None and makes instances
        # unhashable. Hash only the template, so that any two objects which
        # compare equal (including subclass instances) share a hash.
        return hash(self.path)
36
37
def get_make_cover(book):
    """Return a ``make_cover`` wrapper with the book's logo pre-bound.

    The logo URL is taken from the book's extra info: the ``logo_mono``
    key if present, otherwise ``logo``. When a URL is found, the logo is
    downloaded once, up front, into an in-memory buffer which is then
    passed as the ``cover_logo`` keyword argument on every call of the
    returned function.

    The download is retried every 2 seconds until it succeeds, so this
    call blocks for as long as the logo URL stays unavailable.
    """
    extra = book.get_extra_info_json()
    cover_logo = extra.get('logo_mono', extra.get('logo'))
    if cover_logo:
        logo_url = cover_logo
        while True:
            try:
                # Use the response as a context manager so the connection
                # is closed instead of being left to the garbage collector.
                with urlopen(logo_url, timeout=3) as response:
                    cover_logo = io.BytesIO(response.read())
            except Exception:
                # Deliberately not a bare ``except``: KeyboardInterrupt and
                # SystemExit must be able to break out of the retry loop.
                time.sleep(2)
            else:
                break

    def mc(*args, **kwargs):
        if cover_logo:
            kwargs['cover_logo'] = cover_logo
        return make_cover(*args, **kwargs)
    return mc
55     
56
class EbookFieldFile(FieldFile):
    """File handle of an ebook field, able to (re)build its own contents."""

    def build(self):
        """Generate the ebook synchronously, then store the etag."""
        # The etag is captured before the build runs.
        current_etag = self.field.get_current_etag()
        self.field.build(self)
        self.update_etag(current_etag)
        self.instance.clear_cache()

    def build_delay(self, priority=EBOOK_BUILD_PRIORITY):
        """Queue a background build and mark the etag as scheduled.

        Returns whatever ``build_field.apply_async`` returns.
        """
        from .tasks import build_field

        scheduled_etag = self.field.get_current_etag() + ETAG_SCHEDULED_SUFFIX
        self.update_etag(scheduled_etag)
        task_args = [self.instance.pk, self.field.attname]
        return build_field.apply_async(task_args, priority=priority)

    def set_readable(self, readable):
        """Set the file mode to 0o644 if ``readable`` is true, else 0o600."""
        import os
        if readable:
            mode = 0o644
        else:
            mode = 0o600
        os.chmod(self.path, mode)

    def update_etag(self, etag):
        """Set the etag on the instance; persist it if the instance has a pk."""
        etag_attr = self.field.etag_field_name
        setattr(self.instance, etag_attr, etag)
        if self.instance.pk:
            self.instance.save(update_fields=[etag_attr])
88
89
class EbookField(models.FileField):
    """Represents an ebook file field, attachable to a model.

    Concrete formats subclass this, set ``ext`` and override
    ``transform()``. Unless ``with_etag`` is false, the field is accompanied
    by an etag field, used to detect files which need rebuilding.
    """
    attr_class = EbookFieldFile
    # File extension of the format; also the storage directory name,
    # unless the class defines a ``directory`` attribute.
    ext = None
    # When false, find_stale() only considers books without children.
    for_parents = True
    # When true, build() passes book.wldocument2() to transform(),
    # otherwise book.wldocument().
    librarian2_api = False
    # If set, passed to remove_zip() after each build.
    ZIP = None

    def __init__(self, verbose_name=None, with_etag=True, etag_field_name=None, **kwargs):
        """Set up the field with ebook-specific defaults.

        ``with_etag`` controls whether an etag field accompanies this one;
        ``etag_field_name`` names an already existing etag field (see
        ``deconstruct()``). Defaults applied unless given explicitly:
        ``max_length=255``, ``blank=True``, ``default=''`` and an
        ``upload_to`` derived from the class.
        """
        kwargs.setdefault('verbose_name', verbose_name)
        self.with_etag = with_etag
        self.etag_field_name = etag_field_name
        kwargs.setdefault('max_length', 255)
        kwargs.setdefault('blank', True)
        kwargs.setdefault('default', '')
        kwargs.setdefault('upload_to', self.get_upload_to(self.ext))

        super().__init__(**kwargs)

    def deconstruct(self):
        """Deconstruct the field, omitting kwargs equal to our defaults."""
        name, path, args, kwargs = super().deconstruct()
        if kwargs.get('max_length') == 255:
            del kwargs['max_length']
        if kwargs.get('blank') is True:
            del kwargs['blank']
        if kwargs.get('default') == '':
            del kwargs['default']
        if self.get_upload_to(self.ext) == kwargs.get('upload_to'):
            del kwargs['upload_to']
        # with_etag creates a second field, which then deconstructs to manage
        # its own migrations. So for migrations, etag_field_name is explicitly
        # set to avoid double creation of the etag field.
        if self.with_etag:
            kwargs['etag_field_name'] = self.etag_field_name
        else:
            kwargs['with_etag'] = self.with_etag

        return name, path, args, kwargs

    @classmethod
    def get_upload_to(cls, directory):
        """Return the ``UploadToPath`` for this field class.

        NOTE: the ``directory`` argument is ignored. The directory always
        comes from the class: its ``directory`` attribute if defined,
        otherwise ``ext``.
        """
        directory = getattr(cls, 'directory', cls.ext)
        upload_template = f'book/{directory}/%s.{cls.ext}'
        return UploadToPath(upload_template)

    def contribute_to_class(self, cls, name):
        """Attach the field, its etag field and a ``has_<attname>`` method.

        The etag field (``<name>_etag``) is only created when ``with_etag``
        is true and no ``etag_field_name`` was given explicitly.
        """
        super().contribute_to_class(cls, name)

        if self.with_etag and not self.etag_field_name:
            self.etag_field_name = f'{name}_etag'
            self.etag_field = models.CharField(max_length=255, editable=False, default='', db_index=True)
            self.etag_field.contribute_to_class(cls, f'{name}_etag')

        def has(model_instance):
            return bool(getattr(model_instance, self.attname, None))
        has_name = "has_%s" % self.attname
        has.__doc__ = None
        has.__name__ = str(has_name)
        has.short_description = self.name
        has.boolean = True

        setattr(cls, has_name, has)

    def get_current_etag(self):
        """Return the etag an up-to-date file of this format should have.

        It is the installed librarian version, followed by ``_`` and the
        etag of the MediaInsertSet for this format, if there is one.
        """
        MediaInsertSet = apps.get_model('annoy', 'MediaInsertSet')
        librarian_version = pkg_resources.get_distribution("librarian").version
        etag = librarian_version
        mis = MediaInsertSet.get_for_format(self.ext)
        if mis is not None:
            etag += '_' + mis.etag
        return etag

    def find_stale(self, limit):
        """Find some books where this format is stale.

        Returns at most ``limit`` randomly ordered model instances whose
        etag is neither the current one nor the current one marked as
        scheduled. Returns an empty list for fields without an etag.
        """
        # If there is no etag field, bail. That's true for the xml file field.
        if not self.with_etag:
            return []

        etag = self.get_current_etag()

        queryset = self.model.objects.all()
        if not self.for_parents:
            queryset = queryset.filter(children=None)

        queryset = queryset.exclude(**{
            f'{self.etag_field_name}__in': [
                etag, f'{etag}{ETAG_SCHEDULED_SUFFIX}'
            ]
        })

        queryset = queryset.order_by('?')[:limit]
        return queryset

    @classmethod
    def find_all_stale(cls, model, limit):
        """Collect stale ebooks of all formats of ``model``.

        Returns a shuffled list of at most ``limit`` pairs
        ``(field name, instance)``. Nothing is scheduled here.
        """
        found = []
        for field in model._meta.fields:
            if isinstance(field, cls):
                for instance in field.find_stale(limit):
                    found.append((
                        field.name,
                        instance
                    ))
        random.shuffle(found)
        found = found[:limit]
        return found

    @staticmethod
    def transform(wldoc, book):
        """Transform a librarian.WLDocument into a librarian.OutputFile.

        Must be overridden by concrete formats.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        # ``NotImplemented`` is a comparison sentinel, not an exception, and
        # calling it fails with TypeError; NotImplementedError is intended.
        raise NotImplementedError()

    def set_file_permissions(self, fieldfile):
        """Mark the file as not readable when the book is a preview."""
        if fieldfile.instance.preview:
            fieldfile.set_readable(False)

    def build(self, fieldfile):
        """Build the file for ``fieldfile.instance`` and store it.

        The transformed output is saved into the field, permissions are
        applied, and only this field is saved on the model (if the book
        has a pk). Finally, the format's ZIP, if any, is removed.
        """
        book = fieldfile.instance
        out = self.transform(
            book.wldocument2() if self.librarian2_api else book.wldocument(),
            book,
        )
        with open(out.get_filename(), 'rb') as f:
            fieldfile.save(None, File(f), save=False)
        self.set_file_permissions(fieldfile)
        if book.pk is not None:
            book.save(update_fields=[self.attname])
        if self.ZIP:
            remove_zip(self.ZIP)
220
221
class XmlField(EbookField):
    """Field for the source XML file; nothing is generated for it."""
    ext = 'xml'

    def build(self, fieldfile):
        """Do nothing: there is no build step for the XML file."""
        return None
227
228
class TxtField(EbookField):
    """Plain-text rendition of a book."""
    for_parents = False
    ext = 'txt'

    @staticmethod
    def transform(wldoc, book):
        """Return the result of ``wldoc.as_text()``; ``book`` is unused."""
        text_output = wldoc.as_text()
        return text_output
236
237
class Fb2Field(EbookField):
    """FB2 rendition of a book."""
    ZIP = 'wolnelektury_pl_fb2'
    for_parents = False
    ext = 'fb2'

    @staticmethod
    def transform(wldoc, book):
        """Return the result of ``wldoc.as_fb2()``; ``book`` is unused."""
        fb2_output = wldoc.as_fb2()
        return fb2_output
246
247
class PdfField(EbookField):
    """PDF rendition of a book."""
    ZIP = 'wolnelektury_pl_pdf'
    ext = 'pdf'

    @staticmethod
    def transform(wldoc, book):
        """Render ``wldoc`` to PDF with the book's cover and fundraising texts."""
        MediaInsertSet = apps.get_model('annoy', 'MediaInsertSet')
        render_pdf = wldoc.as_pdf
        # Options are computed in the same order as they are passed.
        morefloats = settings.LIBRARIAN_PDF_MOREFLOATS
        cover = get_make_cover(book)
        base_url = absolute_url(gallery_url(wldoc.book_info.url.slug))
        fundraising = MediaInsertSet.get_texts_for('pdf')
        return render_pdf(
            morefloats=morefloats,
            cover=cover,
            base_url=base_url,
            customizations=['notoc'],
            fundraising=fundraising,
        )

    def build(self, fieldfile):
        """Build as usual, then clear the cache for the book's slug."""
        super().build(fieldfile)
        book_slug = fieldfile.instance.slug
        clear_cache(book_slug)
265
266
class EpubField(EbookField):
    """EPUB rendition of a book, built with librarian's ``EpubBuilder``."""
    ZIP = 'wolnelektury_pl_epub'
    librarian2_api = True
    ext = 'epub'

    @staticmethod
    def transform(wldoc, book):
        """Build an EPUB from ``wldoc``, with local gallery files as base URL."""
        from librarian.builders import EpubBuilder
        MediaInsertSet = apps.get_model('annoy', 'MediaInsertSet')
        gallery_dir = os.path.abspath(gallery_path(wldoc.meta.url.slug))
        builder = EpubBuilder(
            base_url='file://' + gallery_dir + '/',
            fundraising=MediaInsertSet.get_texts_for('epub'),
            cover=get_make_cover(book),
        )
        return builder.build(wldoc)
281
282
class MobiField(EbookField):
    """MOBI rendition of a book, built with librarian's ``MobiBuilder``."""
    ZIP = 'wolnelektury_pl_mobi'
    librarian2_api = True
    ext = 'mobi'

    @staticmethod
    def transform(wldoc, book):
        """Build a MOBI from ``wldoc``, with local gallery files as base URL."""
        from librarian.builders import MobiBuilder
        MediaInsertSet = apps.get_model('annoy', 'MediaInsertSet')
        gallery_dir = os.path.abspath(gallery_path(wldoc.meta.url.slug))
        builder = MobiBuilder(
            base_url='file://' + gallery_dir + '/',
            fundraising=MediaInsertSet.get_texts_for('mobi'),
            cover=get_make_cover(book),
        )
        return builder.build(wldoc)
297
298
class HtmlField(EbookField):
    """HTML rendition of a book; building it also recreates its fragments."""
    for_parents = False
    ext = 'html'

    def build(self, fieldfile):
        """Render the HTML file and recreate the book's themed fragments.

        Existing fragments are always deleted. Returns True when the
        transform produced output (the file is then saved, fragments are
        created and ``html_built`` is sent), False otherwise.
        """
        from django.core.files.base import ContentFile
        from slugify import slugify
        from sortify import sortify
        from librarian import html
        from catalogue.models import Fragment, Tag

        book = fieldfile.instance
        document = book.wldocument(parse_dublincore=False)
        html_output = self.transform(document, book)

        # Old fragments go away regardless of whether new ones get created.
        book.fragments.all().delete()

        if not html_output:
            return False

        meta_tags = list(book.tags.filter(
            category__in=('author', 'epoch', 'genre', 'kind')))

        # Map the book's language code through LANGUAGES_3TO2;
        # drop it if it is not among settings.LANGUAGES.
        book_language = book.language
        lang = LANGUAGES_3TO2.get(book_language, book_language)
        site_languages = [entry[0] for entry in settings.LANGUAGES]
        if lang not in site_languages:
            lang = None

        fieldfile.save(None, ContentFile(html_output.get_bytes()), save=False)
        self.set_file_permissions(fieldfile)
        type(book).objects.filter(pk=book.pk).update(**{
            fieldfile.field.attname: fieldfile
        })

        # Extract fragments from the freshly saved file.
        closed_fragments, _open_fragments = html.extract_fragments(fieldfile.path)
        for fragment in closed_fragments.values():
            try:
                theme_names = [s.strip() for s in fragment.themes.split(',')]
            except AttributeError:
                continue

            themes = []
            for theme_name in filter(None, theme_names):
                if lang == settings.LANGUAGE_CODE:
                    # Themes may be created for books in the default language.
                    tag, created = Tag.objects.get_or_create(
                        slug=slugify(theme_name),
                        category='theme'
                    )
                    if created:
                        tag.name = theme_name
                        setattr(tag, "name_%s" % lang, theme_name)
                        tag.sort_key = sortify(theme_name.lower())
                        tag.save()
                    themes.append(tag)
                elif lang is not None:
                    # Unknown themes are not created in other languages.
                    try:
                        tag = Tag.objects.get(
                            category='theme',
                            **{"name_%s" % lang: theme_name}
                        )
                    except Tag.DoesNotExist:
                        continue
                    themes.append(tag)

            # Fragments without any resolved theme are skipped.
            if not themes:
                continue

            text = fragment.to_string()
            truncated = truncate_html_words(text, 15)
            # No short text is stored if truncating changed nothing.
            short_text = '' if text == truncated else truncated
            new_fragment = Fragment.objects.create(
                anchor=fragment.id,
                book=book,
                text=text,
                short_text=short_text
            )

            new_fragment.save()
            new_fragment.tags = set(meta_tags + themes)

        book.html_built.send(sender=type(self), instance=book)
        return True

    @staticmethod
    def transform(wldoc, book):
        """Render ``wldoc`` as HTML, with gallery locations taken from its URL.

        The slug is the last path segment of the document's
        ``identifier.url`` element; without that element, empty gallery
        path and URL are used.
        """
        # ugly, but we can't use wldoc.book_info here
        from librarian import DCNS
        url_elem = wldoc.edoc.getroot().find('.//' + DCNS('identifier.url'))
        gal_url = ''
        gal_path = ''
        if url_elem is not None:
            slug = url_elem.text.rstrip('/').rsplit('/', 1)[1]
            gal_url = gallery_url(slug=slug)
            gal_path = gallery_path(slug=slug)
        return wldoc.as_html(
            gallery_path=gal_path,
            gallery_url=gal_url,
            base_url=absolute_url(gal_url),
        )
399
400
class CoverField(EbookField):
    """Cover image of a book, stored as JPG in the ``cover`` directory."""
    directory = 'cover'
    ext = 'jpg'

    @staticmethod
    def transform(wldoc, book):
        """Return the output file of a cover of width 360 for the book."""
        cover_factory = get_make_cover(book)
        cover = cover_factory(wldoc.book_info, width=360)
        return cover.output_file()

    def set_file_permissions(self, fieldfile):
        """Leave file permissions untouched for covers."""
        return None
411
412
class CoverCleanField(CoverField):
    """Cover image stored in the ``cover_clean`` directory."""
    directory = 'cover_clean'

    @staticmethod
    def transform(wldoc, book):
        """Return the output file of a cover of width 360 for the book."""
        cover_factory = get_make_cover(book)
        cover = cover_factory(wldoc.book_info, width=360)
        return cover.output_file()
419
420
class CoverThumbField(CoverField):
    """Cover thumbnail stored in the ``cover_thumb`` directory."""
    directory = 'cover_thumb'

    @staticmethod
    def transform(wldoc, book):
        """Return the output file of a ``WLCover`` of height 193."""
        from librarian.cover import WLCover
        thumb = WLCover(wldoc.book_info, height=193)
        return thumb.output_file()
428
429
class CoverApiThumbField(CoverField):
    """Cover thumbnail stored in the ``cover_api_thumb`` directory."""
    directory = 'cover_api_thumb'

    @staticmethod
    def transform(wldoc, book):
        """Return the output file of a ``WLNoBoxCover`` of height 500."""
        from librarian.cover import WLNoBoxCover
        thumb = WLNoBoxCover(wldoc.book_info, height=500)
        return thumb.output_file()
437
438
class SimpleCoverField(CoverField):
    """Cover image stored in the ``cover_simple`` directory."""
    directory = 'cover_simple'

    @staticmethod
    def transform(wldoc, book):
        """Return the output file of a ``WLNoBoxCover`` of height 1000."""
        from librarian.cover import WLNoBoxCover
        simple_cover = WLNoBoxCover(wldoc.book_info, height=1000)
        return simple_cover.output_file()
446
447
class CoverEbookpointField(CoverField):
    """Cover image stored in the ``cover_ebookpoint`` directory."""
    directory = 'cover_ebookpoint'

    @staticmethod
    def transform(wldoc, book):
        """Return the output file of an ``EbookpointCover`` for the document."""
        from librarian.cover import EbookpointCover
        ebookpoint_cover = EbookpointCover(wldoc.book_info)
        return ebookpoint_cover.output_file()