Find unused tags.
[wolnelektury.git] / src / catalogue / fields.py
1 # This file is part of Wolne Lektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 import io
5 import os
6 import pkg_resources
7 import random
8 import time
9 from urllib.request import urlopen
10 from django.apps import apps
11 from django.conf import settings
12 from django.core.files import File
13 from django.db import models
14 from django.db.models.fields.files import FieldFile
15 from django.utils.deconstruct import deconstructible
16 from librarian.cover import make_cover
17 from catalogue.constants import LANGUAGES_3TO2
18 from catalogue.utils import absolute_url, remove_zip, truncate_html_words, gallery_path, gallery_url
19 from waiter.utils import clear_cache
20
# Appended to the current ETag while a rebuild is queued (see
# EbookFieldFile.build_delay), so EbookField.find_stale does not pick the
# same file again before the queued build has run.
ETAG_SCHEDULED_SUFFIX = '-scheduled'
# Default task priority used by EbookFieldFile.build_delay.
EBOOK_BUILD_PRIORITY = 0
# Not referenced in this module; presumably passed as ``priority`` to
# build_delay by callers that rebuild stale files — TODO confirm.
EBOOK_REBUILD_PRIORITY = 9
24
25
@deconstructible
class UploadToPath(object):
    """Callable ``upload_to`` that derives the file path from the instance slug.

    ``path`` is a %-style template with one placeholder, which is filled
    with ``instance.slug`` when the object is called.
    """

    def __init__(self, path):
        self.path = path

    def __call__(self, instance, filename):
        # ``filename`` is part of the upload_to call signature but is not
        # used: the resulting path depends only on the slug.
        return self.path % instance.slug

    def __eq__(self, other):
        return isinstance(other, type(self)) and other.path == self.path

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None and makes instances
        # unhashable. Hash on the same value equality compares, so that
        # equal objects always hash equal.
        return hash(self.path)
36
37
def get_make_cover(book):
    """Return a ``make_cover`` wrapper that injects the book's logo, if any.

    The logo URL is taken from the book's extra info: the ``logo_mono`` key
    if present, otherwise ``logo``. When the value is truthy, the image is
    downloaded once, up front, and passed to ``make_cover`` as the
    ``cover_logo`` keyword argument by the returned function.

    The download is retried every 2 seconds until it succeeds, so this call
    blocks for as long as fetching the logo keeps failing.
    """
    extra = book.get_extra_info_json()
    cover_logo = extra.get('logo_mono', extra.get('logo'))
    if cover_logo:
        while True:
            try:
                # Use the response as a context manager so the connection
                # is closed right after reading.
                with urlopen(cover_logo, timeout=3) as response:
                    cover_logo = io.BytesIO(response.read())
            except Exception:
                # Not a bare ``except:``: KeyboardInterrupt and SystemExit
                # must still be able to break out of the retry loop.
                time.sleep(2)
            else:
                break

    def mc(*args, **kwargs):
        if cover_logo:
            kwargs['cover_logo'] = cover_logo
        return make_cover(*args, **kwargs)
    return mc
55     
56
class EbookFieldFile(FieldFile):
    """File wrapper for EbookField values that knows how to build itself."""

    def build(self):
        """Generate the file now and store the ETag it was built for."""
        # The ETag is read before building and written afterwards.
        current_etag = self.field.get_current_etag()
        self.field.build(self)
        self.update_etag(current_etag)
        self.instance.clear_cache()

    def build_delay(self, priority=EBOOK_BUILD_PRIORITY):
        """Queue a background build and return the task's async result."""
        from .tasks import build_field

        # Mark the file as scheduled by suffixing the current ETag.
        scheduled_etag = self.field.get_current_etag() + ETAG_SCHEDULED_SUFFIX
        self.update_etag(scheduled_etag)

        task_args = [self.instance.pk, self.field.attname]
        return build_field.apply_async(task_args, priority=priority)

    def set_readable(self, readable):
        """Set file mode to 0o644 when ``readable``, otherwise to 0o600."""
        if readable:
            mode = 0o644
        else:
            mode = 0o600
        os.chmod(self.path, mode)

    def update_etag(self, etag):
        """Set the ETag on the instance; persist it if the instance is saved."""
        etag_attr = self.field.etag_field_name
        setattr(self.instance, etag_attr, etag)
        if self.instance.pk:
            self.instance.save(update_fields=[etag_attr])
87             self.instance.save(update_fields=[self.field.etag_field_name])
88
89
class EbookField(models.FileField):
    """Represents an ebook file field, attachable to a model.

    Concrete formats subclass this, set ``ext`` and implement ``transform``.
    Unless ``with_etag`` is False, a companion ``<name>_etag`` CharField is
    added to the model; it records the ETag the file was built for.
    """
    attr_class = EbookFieldFile
    # File extension; also the default storage subdirectory (get_upload_to).
    ext = None
    # When False, find_stale() only considers books that have no children.
    for_parents = True
    # When True, build() passes book.wldocument2() to transform() instead of
    # book.wldocument().
    librarian2_api = False
    # If set, passed to remove_zip() at the end of build().
    ZIP = None

    def __init__(self, verbose_name=None, with_etag=True, etag_field_name=None, **kwargs):
        """Set up the field, filling in defaults for unset FileField options.

        ``with_etag`` controls whether a companion ETag field is used;
        ``etag_field_name`` names an ETag field that already exists (it is
        supplied by deconstruct() for migrations).
        """
        kwargs.setdefault('verbose_name', verbose_name)
        self.with_etag = with_etag
        self.etag_field_name = etag_field_name
        kwargs.setdefault('max_length', 255)
        kwargs.setdefault('blank', True)
        kwargs.setdefault('default', '')
        kwargs.setdefault('upload_to', self.get_upload_to(self.ext))

        super().__init__(**kwargs)

    def deconstruct(self):
        """Return the deconstructed field, omitting this class's defaults."""
        name, path, args, kwargs = super().deconstruct()
        if kwargs.get('max_length') == 255:
            del kwargs['max_length']
        if kwargs.get('blank') is True:
            del kwargs['blank']
        if kwargs.get('default') == '':
            del kwargs['default']
        if self.get_upload_to(self.ext) == kwargs.get('upload_to'):
            del kwargs['upload_to']
        # with_etag creates a second field, which then deconstructs to manage
        # its own migrations. So for migrations, etag_field_name is explicitly
        # set to avoid double creation of the etag field.
        if self.with_etag:
            kwargs['etag_field_name'] = self.etag_field_name
        else:
            kwargs['with_etag'] = self.with_etag

        return name, path, args, kwargs

    @classmethod
    def get_upload_to(cls, directory):
        """Return the UploadToPath for this format.

        NOTE: the ``directory`` argument is ignored. The directory always
        comes from the class's ``directory`` attribute, falling back to
        ``ext``. The parameter is kept so existing callers keep working.
        """
        directory = getattr(cls, 'directory', cls.ext)
        upload_template = f'book/{directory}/%s.{cls.ext}'
        return UploadToPath(upload_template)

    def contribute_to_class(self, cls, name):
        """Attach the field, its ETag field and a ``has_<name>`` helper."""
        super().contribute_to_class(cls, name)

        # Only create the ETag field when no name was given explicitly
        # (a given name means the field is declared elsewhere).
        if self.with_etag and not self.etag_field_name:
            self.etag_field_name = f'{name}_etag'
            self.etag_field = models.CharField(max_length=255, editable=False, default='', db_index=True)
            self.etag_field.contribute_to_class(cls, f'{name}_etag')

        def has(model_instance):
            return bool(getattr(model_instance, self.attname, None))
        has.__doc__ = None
        has.__name__ = str("has_%s" % self.attname)
        has.short_description = self.name
        has.boolean = True

        setattr(cls, 'has_%s' % self.attname, has)

    def get_current_etag(self):
        """Return the ETag a freshly built file of this format would get.

        It is the installed librarian version, followed by ``_`` and the
        ETag of the MediaInsertSet for this format when one exists.
        """
        MediaInsertSet = apps.get_model('annoy', 'MediaInsertSet')
        librarian_version = pkg_resources.get_distribution("librarian").version
        etag = librarian_version
        mis = MediaInsertSet.get_for_format(self.ext)
        if mis is not None:
            etag += '_' + mis.etag
        return etag

    def find_stale(self, limit):
        """Find some books where this format is stale.

        Returns up to ``limit`` randomly ordered model instances whose ETag
        is neither the current one nor the current one marked as scheduled.
        Returns an empty list when the field has no ETag.
        """
        # If there is not ETag field, bail. That's true for xml file field.
        if not self.with_etag:
            return []

        etag = self.get_current_etag()

        queryset = self.model.objects.all()
        if not self.for_parents:
            queryset = queryset.filter(children=None)

        queryset = queryset.exclude(**{
            f'{self.etag_field_name}__in': [
                etag, f'{etag}{ETAG_SCHEDULED_SUFFIX}'
            ]
        })

        queryset = queryset.order_by('?')[:limit]
        return queryset

    @classmethod
    def find_all_stale(cls, model, limit):
        """Find stale files of all formats on ``model``.

        Returns a shuffled list of at most ``limit`` ``(field name,
        instance)`` pairs. Nothing is scheduled here; that is left to the
        caller.
        """
        found = []
        for field in model._meta.fields:
            if isinstance(field, cls):
                for instance in field.find_stale(limit):
                    found.append((
                        field.name,
                        instance
                    ))
        random.shuffle(found)
        found = found[:limit]
        return found

    @staticmethod
    def transform(wldoc, book):
        """Transforms a librarian.WLDocument into a librarian.OutputFile.

        Must be overridden by subclasses that rely on the default build().
        """
        # NotImplemented is a comparison sentinel, not an exception: calling
        # it raised a confusing TypeError instead of the intended error.
        raise NotImplementedError()

    def set_file_permissions(self, fieldfile):
        """Make the file non-readable when the book is a preview."""
        if fieldfile.instance.preview:
            fieldfile.set_readable(False)

    def build(self, fieldfile):
        """Run transform() for the book and store the result in ``fieldfile``."""
        book = fieldfile.instance
        out = self.transform(
            book.wldocument2() if self.librarian2_api else book.wldocument(),
            book,
        )
        with open(out.get_filename(), 'rb') as f:
            fieldfile.save(None, File(f), save=False)
        self.set_file_permissions(fieldfile)
        # Only this field's column is written, and only for saved books.
        if book.pk is not None:
            book.save(update_fields=[self.attname])
        if self.ZIP:
            remove_zip(self.ZIP)
218         if self.ZIP:
219             remove_zip(self.ZIP)
220
221
class XmlField(EbookField):
    """Field for the XML file itself; nothing is generated for it."""
    ext = 'xml'

    def build(self, fieldfile):
        """Do nothing: this format has no build step."""
        return None
227
228
class TxtField(EbookField):
    """Plain-text format field."""
    for_parents = False
    ext = 'txt'

    @staticmethod
    def transform(wldoc, book):
        """Return the document rendered as text; ``book`` is not used."""
        text_output = wldoc.as_text()
        return text_output
236
237
class Fb2Field(EbookField):
    """FB2 format field."""
    ZIP = 'wolnelektury_pl_fb2'
    for_parents = False
    ext = 'fb2'

    @staticmethod
    def transform(wldoc, book):
        """Return the document rendered as FB2; ``book`` is not used."""
        fb2_output = wldoc.as_fb2()
        return fb2_output
246
247
class PdfField(EbookField):
    """PDF format field."""
    ZIP = 'wolnelektury_pl_pdf'
    ext = 'pdf'

    @staticmethod
    def transform(wldoc, book):
        """Render the document as PDF with cover and fundraising inserts."""
        media_inserts = apps.get_model('annoy', 'MediaInsertSet')
        pdf_options = {
            'morefloats': settings.LIBRARIAN_PDF_MOREFLOATS,
            'cover': get_make_cover(book),
            'base_url': absolute_url(gallery_url(wldoc.book_info.url.slug)),
            'customizations': ['notoc'],
            'fundraising': media_inserts.get_texts_for('pdf'),
        }
        return wldoc.as_pdf(**pdf_options)

    def build(self, fieldfile):
        """Build as usual, then clear the cache entry for the book's slug."""
        super().build(fieldfile)
        book_slug = fieldfile.instance.slug
        clear_cache(book_slug)
265
266
class EpubField(EbookField):
    """EPUB format field, built with librarian's EpubBuilder."""
    ZIP = 'wolnelektury_pl_epub'
    librarian2_api = True
    ext = 'epub'

    @staticmethod
    def transform(wldoc, book):
        """Build an EPUB from ``wldoc`` using the book's gallery and cover."""
        from librarian.builders import EpubBuilder
        media_inserts = apps.get_model('annoy', 'MediaInsertSet')

        # Images are referenced through a file:// URL of the local gallery.
        gallery_dir = os.path.abspath(gallery_path(wldoc.meta.url.slug))
        builder = EpubBuilder(
            base_url='file://' + gallery_dir + '/',
            fundraising=media_inserts.get_texts_for('epub'),
            cover=get_make_cover(book),
        )
        return builder.build(wldoc)
281
282
class MobiField(EbookField):
    """MOBI format field, built with librarian's MobiBuilder."""
    ZIP = 'wolnelektury_pl_mobi'
    librarian2_api = True
    ext = 'mobi'

    @staticmethod
    def transform(wldoc, book):
        """Build a MOBI from ``wldoc`` using the book's gallery and cover."""
        from librarian.builders import MobiBuilder
        media_inserts = apps.get_model('annoy', 'MediaInsertSet')

        # Images are referenced through a file:// URL of the local gallery.
        gallery_dir = os.path.abspath(gallery_path(wldoc.meta.url.slug))
        builder = MobiBuilder(
            base_url='file://' + gallery_dir + '/',
            fundraising=media_inserts.get_texts_for('mobi'),
            cover=get_make_cover(book),
        )
        return builder.build(wldoc)
297
298
class HtmlField(EbookField):
    """HTML format field; building it also recreates the book's fragments."""
    ext = 'html'
    for_parents = False

    def build(self, fieldfile):
        """Build the HTML file and recreate themed fragments for the book.

        Existing fragments are always deleted first. Returns True when HTML
        was produced (file saved, fragments created, ``html_built`` sent),
        or False when transform() returned a falsy result.
        """
        from django.core.files.base import ContentFile
        from slugify import slugify
        from sortify import sortify
        from librarian import html
        from catalogue.models import Fragment, Tag

        book = fieldfile.instance

        html_output = self.transform(book.wldocument(parse_dublincore=False), book)

        # Delete old fragments, create from scratch if necessary.
        book.fragments.all().delete()

        if html_output:
            meta_tags = list(book.tags.filter(
                category__in=('author', 'epoch', 'genre', 'kind')))

            # Map the book language through LANGUAGES_3TO2 and drop it if it
            # is not one of the configured settings.LANGUAGES.
            lang = book.language
            lang = LANGUAGES_3TO2.get(lang, lang)
            if lang not in [ln[0] for ln in settings.LANGUAGES]:
                lang = None

            fieldfile.save(None, ContentFile(html_output.get_bytes()), save=False)
            self.set_file_permissions(fieldfile)
            # Write just this column with a queryset update rather than
            # calling book.save().
            type(book).objects.filter(pk=book.pk).update(**{
                fieldfile.field.attname: fieldfile
            })

            # Extract fragments
            closed_fragments, open_fragments = html.extract_fragments(fieldfile.path)
            for fragment in closed_fragments.values():
                try:
                    theme_names = [s.strip() for s in fragment.themes.split(',')]
                except AttributeError:
                    # ``themes`` cannot be split (e.g. it is missing): skip.
                    continue
                themes = []
                for theme_name in theme_names:
                    if not theme_name:
                        continue
                    if lang == settings.LANGUAGE_CODE:
                        # Allow creating themes if book in default language.
                        tag, created = Tag.objects.get_or_create(
                            slug=slugify(theme_name),
                            category='theme'
                        )
                        if created:
                            tag.name = theme_name
                            setattr(tag, "name_%s" % lang, theme_name)
                            tag.sort_key = sortify(theme_name.lower())
                            tag.for_books = True
                            tag.save()
                        themes.append(tag)
                    elif lang is not None:
                        # Don't create unknown themes in non-default languages.
                        try:
                            tag = Tag.objects.get(
                                category='theme',
                                **{"name_%s" % lang: theme_name}
                            )
                        except Tag.DoesNotExist:
                            pass
                        else:
                            themes.append(tag)
                if not themes:
                    continue

                text = fragment.to_string()
                short_text = truncate_html_words(text, 15)
                # An empty short_text means the full text is already short.
                if text == short_text:
                    short_text = ''
                new_fragment = Fragment.objects.create(
                    anchor=fragment.id,
                    book=book,
                    text=text,
                    short_text=short_text
                )

                # NOTE(review): create() already saves the row, so this
                # save() looks redundant — confirm nothing depends on the
                # second save before removing it.
                new_fragment.save()
                new_fragment.tags = set(meta_tags + themes)
                for theme in themes:
                    if not theme.for_books:
                        theme.for_books = True
                        theme.save()
            book.html_built.send(sender=type(self), instance=book)
            return True
        return False

    @staticmethod
    def transform(wldoc, book):
        """Render ``wldoc`` as HTML, pointing images at the book's gallery.

        The gallery slug is the last path segment of the document's
        ``identifier.url`` element. When the element is missing or has no
        usable text, empty gallery path and URL are used. ``book`` is not
        used.
        """
        # ugly, but we can't use wldoc.book_info here
        from librarian import DCNS
        url_elem = wldoc.edoc.getroot().find('.//' + DCNS('identifier.url'))
        url_text = ''
        if url_elem is not None and url_elem.text:
            url_text = url_elem.text
        # [-1] instead of [1]: an identifier without any slash is taken as
        # the slug itself rather than raising IndexError.
        slug = url_text.rstrip('/').rsplit('/', 1)[-1]
        if slug:
            gal_url = gallery_url(slug=slug)
            gal_path = gallery_path(slug=slug)
        else:
            gal_url = ''
            gal_path = ''
        return wldoc.as_html(gallery_path=gal_path, gallery_url=gal_url, base_url=absolute_url(gal_url))
404
405
class CoverField(EbookField):
    """JPEG cover image field; base class for the other cover variants."""
    directory = 'cover'
    ext = 'jpg'

    @staticmethod
    def transform(wldoc, book):
        """Render a 360-wide cover using the book's cover factory."""
        cover_factory = get_make_cover(book)
        cover = cover_factory(wldoc.book_info, width=360)
        return cover.output_file()

    def set_file_permissions(self, fieldfile):
        """Leave file permissions untouched for covers."""
        return None
416
417
class CoverCleanField(CoverField):
    """Cover variant stored under ``cover_clean``."""
    directory = 'cover_clean'

    @staticmethod
    def transform(wldoc, book):
        """Render a 360-wide cover using the book's cover factory."""
        cover_factory = get_make_cover(book)
        cover = cover_factory(wldoc.book_info, width=360)
        return cover.output_file()
424
425
class CoverThumbField(CoverField):
    """Cover thumbnail stored under ``cover_thumb``."""
    directory = 'cover_thumb'

    @staticmethod
    def transform(wldoc, book):
        """Render a WLCover of height 193; ``book`` is not used."""
        from librarian.cover import WLCover
        thumbnail = WLCover(wldoc.book_info, height=193)
        return thumbnail.output_file()
433
434
class CoverApiThumbField(CoverField):
    """Cover thumbnail stored under ``cover_api_thumb``."""
    directory = 'cover_api_thumb'

    @staticmethod
    def transform(wldoc, book):
        """Render a WLNoBoxCover of height 500; ``book`` is not used."""
        from librarian.cover import WLNoBoxCover
        thumbnail = WLNoBoxCover(wldoc.book_info, height=500)
        return thumbnail.output_file()
442
443
class SimpleCoverField(CoverField):
    """Cover variant stored under ``cover_simple``."""
    directory = 'cover_simple'

    @staticmethod
    def transform(wldoc, book):
        """Render a WLNoBoxCover of height 1000; ``book`` is not used."""
        from librarian.cover import WLNoBoxCover
        simple_cover = WLNoBoxCover(wldoc.book_info, height=1000)
        return simple_cover.output_file()
451
452
class CoverEbookpointField(CoverField):
    """Cover variant stored under ``cover_ebookpoint``."""
    directory = 'cover_ebookpoint'

    @staticmethod
    def transform(wldoc, book):
        """Render an EbookpointCover for the document; ``book`` is not used."""
        from librarian.cover import EbookpointCover
        ebookpoint_cover = EbookpointCover(wldoc.book_info)
        return ebookpoint_cover.output_file()