Update to new librarian api for html, txt.
[wolnelektury.git] / src / catalogue / fields.py
1 # This file is part of Wolne Lektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 import io
5 import os
6 import pkg_resources
7 import random
8 import time
9 from urllib.request import urlopen
10 from django.apps import apps
11 from django.conf import settings
12 from django.core.files import File
13 from django.db import models
14 from django.db.models.fields.files import FieldFile
15 from django.utils.deconstruct import deconstructible
16 from librarian.cover import make_cover
17 from catalogue.constants import LANGUAGES_3TO2
18 from catalogue.utils import absolute_url, remove_zip, truncate_html_words, gallery_path, gallery_url
19 from waiter.utils import clear_cache
20
# Suffix appended to the current ETag by EbookFieldFile.build_delay() while a
# build is queued; EbookField.find_stale() skips ETags carrying it.
ETAG_SCHEDULED_SUFFIX = '-scheduled'
# Default task priority used by EbookFieldFile.build_delay().
EBOOK_BUILD_PRIORITY = 0
# NOTE(review): presumably the priority for rebuild tasks; it is not
# referenced in this module -- confirm against callers.
EBOOK_REBUILD_PRIORITY = 9
24
25
@deconstructible
class UploadToPath(object):
    """Callable ``upload_to`` that interpolates the instance slug into a path.

    ``path`` is a %-style template (e.g. ``'book/pdf/%s.pdf'``).  Instances
    compare equal when they are of a compatible type and share the same
    template.
    """

    def __init__(self, path):
        self.path = path

    def __call__(self, instance, filename):
        """Return the storage path for ``instance``; ``filename`` is ignored."""
        return self.path % instance.slug

    def __eq__(self, other):
        return isinstance(other, type(self)) and other.path == self.path

    def __hash__(self):
        # Defining __eq__ alone makes the class unhashable; hash only on the
        # template so that any two objects that compare equal hash alike.
        return hash(self.path)
36
37
def get_make_cover(book):
    """Return a ``make_cover`` wrapper preloaded with the book's cover logo.

    The logo URL is read from the book's extra info under ``logo_mono``,
    falling back to ``logo`` when that key is absent.  If a URL is found it
    is downloaded once, up front, and passed as the ``cover_logo`` keyword
    argument on every call of the returned function.

    The download is retried every 2 seconds until it succeeds, so this call
    blocks for as long as the logo cannot be fetched.
    """
    extra = book.get_extra_info_json()
    cover_logo = extra.get('logo_mono', extra.get('logo'))
    if cover_logo:
        while True:
            try:
                # Use the response as a context manager so the connection is
                # closed right away instead of lingering until collected.
                with urlopen(cover_logo, timeout=3) as response:
                    cover_logo = io.BytesIO(response.read())
            except Exception:
                # Unlike a bare ``except:``, this lets KeyboardInterrupt and
                # SystemExit propagate, so the retry loop can be interrupted.
                time.sleep(2)
            else:
                break

    def mc(*args, **kwargs):
        """Call ``make_cover``, adding the downloaded logo when there is one."""
        if cover_logo:
            kwargs['cover_logo'] = cover_logo
        return make_cover(*args, **kwargs)
    return mc
55     
56
class EbookFieldFile(FieldFile):
    """File wrapper for an ebook field: building, ETag tracking, permissions."""

    def build(self):
        """Build the file right now and store the ETag it was built against.

        The ETag is read before the build starts and written afterwards;
        finally the instance's cache is cleared.
        """
        current_etag = self.field.get_current_etag()
        self.field.build(self)
        self.update_etag(current_etag)
        self.instance.clear_cache()

    def build_delay(self, priority=EBOOK_BUILD_PRIORITY):
        """Queue a build task and mark the stored ETag as scheduled.

        Returns the result of ``build_field.apply_async``.
        """
        from .tasks import build_field

        scheduled_etag = self.field.get_current_etag() + ETAG_SCHEDULED_SUFFIX
        self.update_etag(scheduled_etag)
        task_args = [self.instance.pk, self.field.attname]
        return build_field.apply_async(task_args, priority=priority)

    def set_readable(self, readable):
        """Chmod the stored file: 0o644 when ``readable``, else 0o600."""
        import os
        mode = 0o600
        if readable:
            mode = 0o644
        os.chmod(self.path, mode)

    def update_etag(self, etag):
        """Set the ETag on the instance; persist it only if the row exists."""
        etag_attr = self.field.etag_field_name
        setattr(self.instance, etag_attr, etag)
        if not self.instance.pk:
            return
        self.instance.save(update_fields=[etag_attr])
88
89
class EbookField(models.FileField):
    """Represents an ebook file field, attachable to a model.

    Subclasses set ``ext`` and implement ``transform()``.  Unless the field
    is created with ``with_etag=False`` (or an explicit ``etag_field_name``),
    a companion ``<name>_etag`` CharField is added to the model to record
    which build environment produced the stored file.
    """
    attr_class = EbookFieldFile
    # File extension; also the default storage directory and the format key
    # passed to MediaInsertSet.get_for_format().
    ext = None
    # When False, find_stale() only considers books without children.
    for_parents = True
    # When True, build() passes book.wldocument2() to transform() instead of
    # book.wldocument().
    librarian2_api = False
    # If set, passed to remove_zip() after a build.
    ZIP = None

    def __init__(self, verbose_name=None, with_etag=True, etag_field_name=None, **kwargs):
        kwargs.setdefault('verbose_name', verbose_name)
        self.with_etag = with_etag
        self.etag_field_name = etag_field_name
        kwargs.setdefault('max_length', 255)
        kwargs.setdefault('blank', True)
        kwargs.setdefault('default', '')
        kwargs.setdefault('upload_to', self.get_upload_to(self.ext))

        super().__init__(**kwargs)

    def deconstruct(self):
        """Deconstruct the field, omitting kwargs that equal our defaults."""
        name, path, args, kwargs = super().deconstruct()
        if kwargs.get('max_length') == 255:
            del kwargs['max_length']
        if kwargs.get('blank') is True:
            del kwargs['blank']
        if kwargs.get('default') == '':
            del kwargs['default']
        if self.get_upload_to(self.ext) == kwargs.get('upload_to'):
            del kwargs['upload_to']
        # with_etag creates a second field, which then deconstructs to manage
        # its own migrations. So for migrations, etag_field_name is explicitly
        # set to avoid double creation of the etag field.
        if self.with_etag:
            kwargs['etag_field_name'] = self.etag_field_name
        else:
            kwargs['with_etag'] = self.with_etag

        return name, path, args, kwargs

    @classmethod
    def get_upload_to(cls, directory):
        """Return the ``UploadToPath`` for this field class.

        The ``directory`` argument is ignored: the directory always comes
        from the class attribute ``directory``, falling back to ``ext``.
        """
        directory = getattr(cls, 'directory', cls.ext)
        upload_template = f'book/{directory}/%s.{cls.ext}'
        return UploadToPath(upload_template)

    def contribute_to_class(self, cls, name):
        """Attach the field, its ETag field and a ``has_<name>`` helper."""
        super().contribute_to_class(cls, name)

        if self.with_etag and not self.etag_field_name:
            self.etag_field_name = f'{name}_etag'
            self.etag_field = models.CharField(max_length=255, editable=False, default='', db_index=True)
            self.etag_field.contribute_to_class(cls, f'{name}_etag')

        def has(model_instance):
            return bool(getattr(model_instance, self.attname, None))
        has.__doc__ = None
        has.__name__ = str("has_%s" % self.attname)
        has.short_description = self.name
        has.boolean = True

        setattr(cls, 'has_%s' % self.attname, has)

    def get_current_etag(self):
        """Return the ETag describing the current build environment.

        It is the installed librarian version, followed by ``_`` and the
        ETag of the MediaInsertSet for this format when one exists.
        """
        MediaInsertSet = apps.get_model('annoy', 'MediaInsertSet')
        librarian_version = pkg_resources.get_distribution("librarian").version
        etag = librarian_version
        mis = MediaInsertSet.get_for_format(self.ext)
        if mis is not None:
            etag += '_' + mis.etag
        return etag

    def find_stale(self, limit):
        """Find some books where this format is stale.

        Returns at most ``limit`` randomly ordered instances whose stored
        ETag is neither the current ETag nor its scheduled variant.
        """
        # If there is no ETag field, bail. That's true for xml file field.
        if not self.with_etag:
            return []

        etag = self.get_current_etag()

        queryset = self.model.objects.all()
        if not self.for_parents:
            queryset = queryset.filter(children=None)

        queryset = queryset.exclude(**{
            f'{self.etag_field_name}__in': [
                etag, f'{etag}{ETAG_SCHEDULED_SUFFIX}'
            ]
        })

        queryset = queryset.order_by('?')[:limit]
        return queryset

    @classmethod
    def find_all_stale(cls, model, limit):
        """Collect stale ebooks across all fields of this class on ``model``.

        Returns a shuffled list of at most ``limit`` pairs of
        ``(field name, instance)``.  Nothing is scheduled here.
        """
        found = []
        for field in model._meta.fields:
            if isinstance(field, cls):
                for instance in field.find_stale(limit):
                    found.append((
                        field.name,
                        instance
                    ))
        random.shuffle(found)
        found = found[:limit]
        return found

    @staticmethod
    def transform(wldoc, book):
        """Transforms a librarian.WLDocument into a librarian.OutputFile.

        Must be overridden by subclasses.
        """
        # NotImplemented is a non-callable constant; the exception type for
        # an abstract method is NotImplementedError.
        raise NotImplementedError()

    def set_file_permissions(self, fieldfile):
        """Make the stored file unreadable when the book is a preview."""
        if fieldfile.instance.preview:
            fieldfile.set_readable(False)

    def build(self, fieldfile):
        """Transform the book, store the result and save the file column."""
        book = fieldfile.instance
        out = self.transform(
            book.wldocument2() if self.librarian2_api else book.wldocument(),
            book,
        )
        with open(out.get_filename(), 'rb') as f:
            fieldfile.save(None, File(f), save=False)
        self.set_file_permissions(fieldfile)
        if book.pk is not None:
            book.save(update_fields=[self.attname])
        if self.ZIP:
            remove_zip(self.ZIP)
220
221
class XmlField(EbookField):
    """Field holding the source XML file; it is never generated."""
    ext = 'xml'

    def build(self, fieldfile):
        """Do nothing: there is no build step for this field."""
        return None
227
228
class TxtField(EbookField):
    """Plain-text rendition, produced by librarian's ``TxtBuilder``."""
    ext = 'txt'
    for_parents = False
    librarian2_api = True

    @staticmethod
    def transform(wldoc, book):
        """Build the text output for ``wldoc``; ``book`` is not used."""
        from librarian.builders.txt import TxtBuilder
        builder = TxtBuilder()
        return builder.build(wldoc)
238
239
class Fb2Field(EbookField):
    """FB2 rendition, produced by the document's ``as_fb2()`` method."""
    ext = 'fb2'
    for_parents = False
    ZIP = 'wolnelektury_pl_fb2'

    @staticmethod
    def transform(wldoc, book):
        """Return the FB2 output for ``wldoc``; ``book`` is not used."""
        fb2_output = wldoc.as_fb2()
        return fb2_output
248
249
class PdfField(EbookField):
    """PDF rendition, produced by the document's ``as_pdf()`` method."""
    ext = 'pdf'
    ZIP = 'wolnelektury_pl_pdf'

    @staticmethod
    def transform(wldoc, book):
        """Render ``wldoc`` to PDF with the book's cover and fundraising texts."""
        insert_sets = apps.get_model('annoy', 'MediaInsertSet')
        render_pdf = wldoc.as_pdf
        # Dict entries are evaluated top to bottom, in the same order the
        # keyword arguments would be.
        pdf_options = {
            'morefloats': settings.LIBRARIAN_PDF_MOREFLOATS,
            'cover': get_make_cover(book),
            'base_url': absolute_url(gallery_url(wldoc.book_info.url.slug)),
            'customizations': ['notoc'],
            'fundraising': insert_sets.get_texts_for('pdf'),
        }
        return render_pdf(**pdf_options)

    def build(self, fieldfile):
        """Run the regular build, then clear the cache for the book's slug."""
        super().build(fieldfile)
        book_slug = fieldfile.instance.slug
        clear_cache(book_slug)
267
268
class EpubField(EbookField):
    """EPUB rendition, produced by librarian's ``EpubBuilder``."""
    ext = 'epub'
    librarian2_api = True
    ZIP = 'wolnelektury_pl_epub'

    @staticmethod
    def transform(wldoc, book):
        """Build the EPUB, resolving images from the local gallery directory."""
        from librarian.builders import EpubBuilder
        insert_sets = apps.get_model('annoy', 'MediaInsertSet')
        gallery_dir = os.path.abspath(gallery_path(wldoc.meta.url.slug))
        builder = EpubBuilder(
            base_url='file://' + gallery_dir + '/',
            fundraising=insert_sets.get_texts_for('epub'),
            cover=get_make_cover(book),
        )
        return builder.build(wldoc)
283
284
class MobiField(EbookField):
    """MOBI rendition, produced by librarian's ``MobiBuilder``."""
    ext = 'mobi'
    librarian2_api = True
    ZIP = 'wolnelektury_pl_mobi'

    @staticmethod
    def transform(wldoc, book):
        """Build the MOBI, resolving images from the local gallery directory."""
        from librarian.builders import MobiBuilder
        insert_sets = apps.get_model('annoy', 'MediaInsertSet')
        gallery_dir = os.path.abspath(gallery_path(wldoc.meta.url.slug))
        builder = MobiBuilder(
            base_url='file://' + gallery_dir + '/',
            fundraising=insert_sets.get_texts_for('mobi'),
            cover=get_make_cover(book),
        )
        return builder.build(wldoc)
299
300
class HtmlField(EbookField):
    """HTML rendition; building it also recreates the book's fragments."""
    ext = 'html'
    for_parents = False
    librarian2_api = True

    def build(self, fieldfile):
        """Render the book to HTML, store it and rebuild themed fragments.

        Existing fragments are always deleted first.  Returns True when
        HTML output was produced and stored, False otherwise.
        """
        from django.core.files.base import ContentFile
        from slugify import slugify
        from sortify import sortify
        from librarian import html
        from catalogue.models import Fragment, Tag

        book = fieldfile.instance
        html_output = self.transform(book.wldocument2(), book)

        # Old fragments go away whether or not new ones get created.
        book.fragments.all().delete()

        if not html_output:
            return False

        meta_tags = list(book.tags.filter(
            category__in=('author', 'epoch', 'genre', 'kind')))

        # Map the book language to a 2-letter code when a mapping exists;
        # drop it entirely if the site does not list that language.
        book_language = book.language
        lang = LANGUAGES_3TO2.get(book_language, book_language)
        site_codes = [entry[0] for entry in settings.LANGUAGES]
        if lang not in site_codes:
            lang = None

        fieldfile.save(None, ContentFile(html_output.get_bytes()), save=False)
        self.set_file_permissions(fieldfile)
        # Update only the file column, bypassing the model's save().
        type(book).objects.filter(pk=book.pk).update(**{
            fieldfile.field.attname: fieldfile
        })

        # Extract fragments
        closed_fragments, _open_fragments = html.extract_fragments(fieldfile.path)
        for fragment in closed_fragments.values():
            try:
                theme_names = [s.strip() for s in fragment.themes.split(',')]
            except AttributeError:
                continue

            themes = []
            for theme_name in filter(None, theme_names):
                if lang == settings.LANGUAGE_CODE:
                    # Allow creating themes if book in default language.
                    tag, created = Tag.objects.get_or_create(
                        slug=slugify(theme_name),
                        category='theme'
                    )
                    if created:
                        tag.name = theme_name
                        setattr(tag, "name_%s" % lang, theme_name)
                        tag.sort_key = sortify(theme_name.lower())
                        tag.save()
                    themes.append(tag)
                elif lang is not None:
                    # Don't create unknown themes in non-default languages.
                    try:
                        tag = Tag.objects.get(
                            category='theme',
                            **{"name_%s" % lang: theme_name}
                        )
                    except Tag.DoesNotExist:
                        continue
                    themes.append(tag)

            if not themes:
                continue

            text = fragment.to_string()
            short_text = truncate_html_words(text, 15)
            if text == short_text:
                short_text = ''
            new_fragment = Fragment.objects.create(
                anchor=fragment.id,
                book=book,
                text=text,
                short_text=short_text
            )

            new_fragment.save()
            new_fragment.tags = set(meta_tags + themes)

        book.html_built.send(sender=type(self), instance=book)
        return True

    @staticmethod
    def transform(wldoc, book):
        """Build the HTML output, pointing images at the book's gallery.

        Gallery path and URL are empty strings when the document has no URL.
        """
        from librarian.builders.html import HtmlBuilder
        url = wldoc.meta.url
        if url:
            gal_url = gallery_url(slug=url.slug)
            gal_path = gallery_path(slug=url.slug)
        else:
            gal_url = ''
            gal_path = ''
        builder = HtmlBuilder(
            gallery_path=gal_path,
            gallery_url=gal_url,
            base_url=absolute_url(gal_url),
        )
        return builder.build(wldoc)
400
401
class CoverField(EbookField):
    """Cover image (JPEG) generated with the book's ``make_cover`` wrapper."""
    ext = 'jpg'
    directory = 'cover'

    @staticmethod
    def transform(wldoc, book):
        """Return the output file of a 360-wide cover for the book."""
        cover_factory = get_make_cover(book)
        cover = cover_factory(wldoc.book_info, width=360)
        return cover.output_file()

    def set_file_permissions(self, fieldfile):
        """Leave permissions untouched, even for preview books."""
        return None
412
413
class CoverCleanField(CoverField):
    """Cover stored under ``cover_clean``; built the same way as CoverField."""
    directory = 'cover_clean'

    @staticmethod
    def transform(wldoc, book):
        """Return the output file of a 360-wide cover for the book."""
        cover_factory = get_make_cover(book)
        cover = cover_factory(wldoc.book_info, width=360)
        return cover.output_file()
420
421
class CoverThumbField(CoverField):
    """Thumbnail cover: a ``WLCover`` rendered at height 193."""
    directory = 'cover_thumb'

    @staticmethod
    def transform(wldoc, book):
        """Return the thumbnail's output file; ``book`` is not used."""
        from librarian.cover import WLCover
        thumbnail = WLCover(wldoc.book_info, height=193)
        return thumbnail.output_file()
429
430
class CoverApiThumbField(CoverField):
    """API thumbnail cover: a ``WLNoBoxCover`` rendered at height 500."""
    directory = 'cover_api_thumb'

    @staticmethod
    def transform(wldoc, book):
        """Return the thumbnail's output file; ``book`` is not used."""
        from librarian.cover import WLNoBoxCover
        thumbnail = WLNoBoxCover(wldoc.book_info, height=500)
        return thumbnail.output_file()
438
439
class SimpleCoverField(CoverField):
    """Simple cover: a ``WLNoBoxCover`` rendered at height 1000."""
    directory = 'cover_simple'

    @staticmethod
    def transform(wldoc, book):
        """Return the cover's output file; ``book`` is not used."""
        from librarian.cover import WLNoBoxCover
        simple_cover = WLNoBoxCover(wldoc.book_info, height=1000)
        return simple_cover.output_file()
447
448
class CoverEbookpointField(CoverField):
    """Cover rendered with librarian's ``EbookpointCover``."""
    directory = 'cover_ebookpoint'

    @staticmethod
    def transform(wldoc, book):
        """Return the cover's output file; ``book`` is not used."""
        from librarian.cover import EbookpointCover
        ebookpoint_cover = EbookpointCover(wldoc.book_info)
        return ebookpoint_cover.output_file()