ignore abstrakt tags when indexing book
[wolnelektury.git] / src / catalogue / models / book.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from collections import OrderedDict
6 from random import randint
7 import os.path
8 import re
9 import urllib
10 from django.conf import settings
11 from django.db import connection, models, transaction
12 from django.db.models import permalink
13 import django.dispatch
14 from django.contrib.contenttypes.fields import GenericRelation
15 from django.core.urlresolvers import reverse
16 from django.utils.translation import ugettext_lazy as _, get_language
17 import jsonfield
18 from fnpdjango.storage import BofhFileSystemStorage
19 from ssify import flush_ssi_includes
20 from newtagging import managers
21 from catalogue import constants
22 from catalogue.fields import EbookField
23 from catalogue.models import Tag, Fragment, BookMedia
24 from catalogue.utils import create_zip, gallery_url, gallery_path, split_tags
25 from catalogue.models.tag import prefetched_relations
26 from catalogue import app_settings
27 from catalogue import tasks
28 from wolnelektury.utils import makedirs
29
# Storage instance shared by the file fields declared in this module
# (the cover field and the per-format ebook fields).
bofh_storage = BofhFileSystemStorage()
31
32
33 def _make_upload_to(path):
34     def _upload_to(i, n):
35         return path % i.slug
36     return _upload_to
37
38
# Upload path builders for the cover image and its thumbnail; each stores
# the file as <slug>.jpg under its own directory.
_cover_upload_to = _make_upload_to('book/cover/%s.jpg')
_cover_thumb_upload_to = _make_upload_to('book/cover_thumb/%s.jpg')
41
42
def _ebook_upload_to(upload_path):
    """Return an ``upload_to`` callable for an ebook file stored at `upload_path`.

    `upload_path` is a %-format template taking the book's slug.
    """
    upload_to = _make_upload_to(upload_path)
    return upload_to
45
46
class Book(models.Model):
    """Represents a book imported from WL-XML."""
    title = models.CharField(_('title'), max_length=32767)
    # Both sort keys are not editable; save() recomputes them every time.
    sort_key = models.CharField(_('sort key'), max_length=120, db_index=True, editable=False)
    sort_key_author = models.CharField(
        _('sort key by author'), max_length=120, db_index=True, editable=False, default=u'')
    slug = models.SlugField(_('slug'), max_length=120, db_index=True, unique=True)
    # Slug shared by variants of the same work (see other_versions());
    # set on import to the `variant_of` slug, or to the book's own slug.
    common_slug = models.SlugField(_('slug'), max_length=120, db_index=True)
    language = models.CharField(_('language code'), max_length=3, db_index=True, default=app_settings.DEFAULT_LANGUAGE)
    description = models.TextField(_('description'), blank=True)
    created_at = models.DateTimeField(_('creation date'), auto_now_add=True, db_index=True)
    changed_at = models.DateTimeField(_('change date'), auto_now=True, db_index=True)
    # Position among the parent's children, assigned on import.
    parent_number = models.IntegerField(_('parent number'), default=0)
    # Dictionary form of the parsed book metadata (book_info.to_dict()).
    extra_info = jsonfield.JSONField(_('extra information'), default={})
    gazeta_link = models.CharField(blank=True, max_length=240)
    wiki_link = models.CharField(blank=True, max_length=240)
    print_on_demand = models.BooleanField(_('print on demand'), default=False)
    recommended = models.BooleanField(_('recommended'), default=False)

    # files generated during publication
    cover = EbookField(
        'cover', _('cover'),
        null=True, blank=True,
        upload_to=_cover_upload_to,
        storage=bofh_storage, max_length=255)
    # Cleaner version of cover for thumbs
    cover_thumb = EbookField(
        'cover_thumb', _('cover thumbnail'),
        null=True, blank=True,
        upload_to=_cover_thumb_upload_to,
        max_length=255)
    # For every name in `formats` a `<format>_file` field is attached to
    # the class by add_file_fields() at the bottom of this module.
    ebook_formats = constants.EBOOK_FORMATS
    formats = ebook_formats + ['html', 'xml']

    parent = models.ForeignKey('self', blank=True, null=True, related_name='children')
    # Cache of all ancestors (not only the direct parent); rebuilt by
    # repopulate_ancestors().
    ancestor = models.ManyToManyField('self', blank=True, editable=False, related_name='descendant', symmetrical=False)

    # Denormalized values, refreshed in save().
    cached_author = models.CharField(blank=True, max_length=240, db_index=True)
    has_audience = models.BooleanField(default=False)

    objects = models.Manager()
    tagged = managers.ModelTaggedItemManager(Tag)
    tags = managers.TagDescriptor(Tag)
    tag_relations = GenericRelation(Tag.intermediary_table_model)

    # Signals; `published` is sent at the end of from_text_and_meta().
    html_built = django.dispatch.Signal()
    published = django.dispatch.Signal()

    short_html_url_name = 'catalogue_book_short'
96
    class AlreadyExists(Exception):
        """Raised on import when the slug is already taken and overwriting is off."""
        pass
99
    class Meta:
        # Default ordering: by the author sort key, then by the book's own sort key.
        ordering = ('sort_key_author', 'sort_key')
        verbose_name = _('book')
        verbose_name_plural = _('books')
        app_label = 'catalogue'
105
    def __unicode__(self):
        """Return the book's title as its text representation."""
        return self.title
108
109     def get_initial(self):
110         try:
111             return re.search(r'\w', self.title, re.U).group(0)
112         except AttributeError:
113             return ''
114
    def authors(self):
        """Return the book's tags of the 'author' category."""
        return self.tags.filter(category='author')
117
118     def tag_unicode(self, category):
119         relations = prefetched_relations(self, category)
120         if relations:
121             return ', '.join(rel.tag.name for rel in relations)
122         else:
123             return ', '.join(self.tags.filter(category=category).values_list('name', flat=True))
124
    def tags_by_category(self):
        """Return the result of split_tags() for all tags except 'set' and 'theme' ones."""
        return split_tags(self.tags.exclude(category__in=('set', 'theme')))
127
    def author_unicode(self):
        """Return the author string cached on the book by save()."""
        return self.cached_author
130
131     def translator(self):
132         translators = self.extra_info.get('translators')
133         if not translators:
134             return None
135         if len(translators) > 3:
136             translators = translators[:2]
137             others = ' i inni'
138         else:
139             others = ''
140         return ', '.join(u'\xa0'.join(reversed(translator.split(', ', 1))) for translator in translators) + others
141
142     def cover_source(self):
143         return self.extra_info.get('cover_source', self.parent.cover_source() if self.parent else '')
144
145     def save(self, force_insert=False, force_update=False, **kwargs):
146         from sortify import sortify
147
148         self.sort_key = sortify(self.title)[:120]
149         self.title = unicode(self.title)  # ???
150
151         try:
152             author = self.authors().first().sort_key
153         except AttributeError:
154             author = u''
155         self.sort_key_author = author
156
157         self.cached_author = self.tag_unicode('author')
158         self.has_audience = 'audience' in self.extra_info
159
160         ret = super(Book, self).save(force_insert, force_update, **kwargs)
161
162         return ret
163
    @permalink
    def get_absolute_url(self):
        """Return the URL of the book detail view for this book's slug."""
        return 'catalogue.views.book_detail', [self.slug]
167
    @staticmethod
    @permalink
    def create_url(slug):
        """Return the URL of the book detail view for `slug`, without needing a Book instance."""
        return 'catalogue.views.book_detail', [slug]
172
    def gallery_path(self):
        """Return the result of catalogue.utils.gallery_path() for this book's slug."""
        return gallery_path(self.slug)
175
    def gallery_url(self):
        """Return the result of catalogue.utils.gallery_url() for this book's slug."""
        return gallery_url(self.slug)
178
    @property
    def name(self):
        """Alias for `title`, exposed as a read-only property."""
        return self.title
182
    def language_code(self):
        """Return the language mapped through constants.LANGUAGES_3TO2.

        Falls back to the stored `language` value when there is no mapping.
        """
        return constants.LANGUAGES_3TO2.get(self.language, self.language)
185
    def language_name(self):
        """Return the name settings.LANGUAGES gives for language_code(), or "" if absent."""
        return dict(settings.LANGUAGES).get(self.language_code(), "")
188
    def is_foreign(self):
        """Tell whether the book's language code differs from settings.LANGUAGE_CODE."""
        return self.language_code() != settings.LANGUAGE_CODE
191
192     def has_media(self, type_):
193         if type_ in Book.formats:
194             return bool(getattr(self, "%s_file" % type_))
195         else:
196             return self.media.filter(type=type_).exists()
197
    def has_audio(self):
        """Tell whether the book has any 'mp3' media."""
        return self.has_media('mp3')
200
201     def get_media(self, type_):
202         if self.has_media(type_):
203             if type_ in Book.formats:
204                 return getattr(self, "%s_file" % type_)
205             else:
206                 return self.media.filter(type=type_)
207         else:
208             return None
209
    def get_mp3(self):
        """Shortcut for get_media("mp3")."""
        return self.get_media("mp3")
212
    def get_odt(self):
        """Shortcut for get_media("odt")."""
        return self.get_media("odt")
215
    def get_ogg(self):
        """Shortcut for get_media("ogg")."""
        return self.get_media("ogg")
218
    def get_daisy(self):
        """Shortcut for get_media("daisy")."""
        return self.get_media("daisy")
221
222     def has_description(self):
223         return len(self.description) > 0
224     has_description.short_description = _('description')
225     has_description.boolean = True
226
227     # ugly ugly ugly
    def has_mp3_file(self):
        """Tell whether the book has 'mp3' media."""
        return bool(self.has_media("mp3"))
    # Label and boolean rendering hints -- presumably for the admin list; verify.
    has_mp3_file.short_description = 'MP3'
    has_mp3_file.boolean = True
232
    def has_ogg_file(self):
        """Tell whether the book has 'ogg' media."""
        return bool(self.has_media("ogg"))
    # Label and boolean rendering hints -- presumably for the admin list; verify.
    has_ogg_file.short_description = 'OGG'
    has_ogg_file.boolean = True
237
    def has_daisy_file(self):
        """Tell whether the book has 'daisy' media."""
        return bool(self.has_media("daisy"))
    # Label and boolean rendering hints -- presumably for the admin list; verify.
    has_daisy_file.short_description = 'DAISY'
    has_daisy_file.boolean = True
242
243     def get_audiobooks(self):
244         ogg_files = {}
245         for m in self.media.filter(type='ogg').order_by().iterator():
246             ogg_files[m.name] = m
247
248         audiobooks = []
249         projects = set()
250         for mp3 in self.media.filter(type='mp3').iterator():
251             # ogg files are always from the same project
252             meta = mp3.extra_info
253             project = meta.get('project')
254             if not project:
255                 # temporary fallback
256                 project = u'CzytamySłuchając'
257
258             projects.add((project, meta.get('funded_by', '')))
259
260             media = {'mp3': mp3}
261
262             ogg = ogg_files.get(mp3.name)
263             if ogg:
264                 media['ogg'] = ogg
265             audiobooks.append(media)
266
267         projects = sorted(projects)
268         return audiobooks, projects
269
270     def wldocument(self, parse_dublincore=True, inherit=True):
271         from catalogue.import_utils import ORMDocProvider
272         from librarian.parser import WLDocument
273
274         if inherit and self.parent:
275             meta_fallbacks = self.parent.cover_info()
276         else:
277             meta_fallbacks = None
278
279         return WLDocument.from_file(
280             self.xml_file.path,
281             provider=ORMDocProvider(self),
282             parse_dublincore=parse_dublincore,
283             meta_fallbacks=meta_fallbacks)
284
285     @staticmethod
286     def zip_format(format_):
287         def pretty_file_name(book):
288             return "%s/%s.%s" % (
289                 book.extra_info['author'],
290                 book.slug,
291                 format_)
292
293         field_name = "%s_file" % format_
294         books = Book.objects.filter(parent=None).exclude(**{field_name: ""})
295         paths = [(pretty_file_name(b), getattr(b, field_name).path) for b in books.iterator()]
296         return create_zip(paths, app_settings.FORMAT_ZIPS[format_])
297
298     def zip_audiobooks(self, format_):
299         bm = BookMedia.objects.filter(book=self, type=format_)
300         paths = map(lambda bm: (None, bm.file.path), bm)
301         return create_zip(paths, "%s_%s" % (self.slug, format_))
302
303     def search_index(self, book_info=None, index=None, index_tags=True, commit=True):
304         if index is None:
305             from search.index import Index
306             index = Index()
307         try:
308             index.index_book(self, book_info)
309             if index_tags:
310                 index.index_tags()
311             if commit:
312                 index.index.commit()
313         except Exception, e:
314             index.index.rollback()
315             raise e
316
317     def download_pictures(self, remote_gallery_url):
318         gallery_path = self.gallery_path()
319         # delete previous files, so we don't include old files in ebooks
320         if os.path.isdir(gallery_path):
321             for filename in os.listdir(gallery_path):
322                 file_path = os.path.join(gallery_path, filename)
323                 os.unlink(file_path)
324         ilustr_elements = list(self.wldocument().edoc.findall('//ilustr'))
325         if ilustr_elements:
326             makedirs(gallery_path)
327             for ilustr in ilustr_elements:
328                 ilustr_src = ilustr.get('src')
329                 ilustr_path = os.path.join(gallery_path, ilustr_src)
330                 urllib.urlretrieve('%s/%s' % (remote_gallery_url, ilustr_src), ilustr_path)
331
    @classmethod
    def from_xml_file(cls, xml_file, **kwargs):
        """Import a book from a WL-XML file.

        `xml_file` is either a Django File or something open() accepts.
        The metadata is parsed with librarian's dcparser and the rest of
        the work is delegated to from_text_and_meta(), which also receives
        `kwargs`.  Returns the imported book.

        The file is closed before returning, also when the caller passed
        an already opened File.
        """
        from django.core.files import File
        from librarian import dcparser

        # use librarian to parse meta-data
        book_info = dcparser.parse(xml_file)

        if not isinstance(xml_file, File):
            xml_file = File(open(xml_file))

        try:
            return cls.from_text_and_meta(xml_file, book_info, **kwargs)
        finally:
            xml_file.close()
347
    @classmethod
    def from_text_and_meta(cls, raw_file, book_info, overwrite=False, dont_build=None, search_index=True,
                           search_index_tags=True, remote_gallery_url=None):
        """Create or update a book from its XML file and parsed metadata.

        `raw_file` is saved as the book's XML file; `book_info` supplies the
        slug, title, language, tags and (optionally) the list of parts.

        An existing book with the same slug is updated only if `overwrite`
        is true.  `dont_build` is an iterable of names ('cover' or format
        names) whose files should not be scheduled for building; it is
        merged with app_settings.DONT_BUILD.  Search indexing is scheduled
        when `search_index` is true and settings.NO_SEARCH_INDEX is not
        set; `search_index_tags` is forwarded to the indexing task.  When
        `remote_gallery_url` is given, illustrations are downloaded from it.

        Raises Book.DoesNotExist if a listed part is not in the database,
        ValueError if the slug contains characters other than a-z, 0-9
        and '-', and Book.AlreadyExists if the book exists and `overwrite`
        is false.

        Returns the saved book, after sending the `published` signal.
        """
        if dont_build is None:
            dont_build = set()
        dont_build = set.union(set(dont_build), set(app_settings.DONT_BUILD))

        # check for parts before we do anything
        children = []
        if hasattr(book_info, 'parts'):
            for part_url in book_info.parts:
                try:
                    children.append(Book.objects.get(slug=part_url.slug))
                except Book.DoesNotExist:
                    raise Book.DoesNotExist(_('Book "%s" does not exist.') % part_url.slug)

        # Read book metadata
        book_slug = book_info.url.slug
        if re.search(r'[^a-z0-9-]', book_slug):
            raise ValueError('Invalid characters in slug')
        book, created = Book.objects.get_or_create(slug=book_slug)

        if created:
            book_shelves = []
            old_cover = None
        else:
            if not overwrite:
                raise Book.AlreadyExists(_('Book %s already exists') % book_slug)
            # Save shelves for this book
            book_shelves = list(book.tags.filter(category='set'))
            old_cover = book.cover_info()

        # Save XML file
        book.xml_file.save('%s.xml' % book.slug, raw_file, save=False)

        book.language = book_info.language
        book.title = book_info.title
        if book_info.variant_of:
            book.common_slug = book_info.variant_of.slug
        else:
            book.common_slug = book.slug
        book.extra_info = book_info.to_dict()
        book.save()

        meta_tags = Tag.tags_from_info(book_info)

        # Replace the tags with those from the metadata, keeping the
        # 'set' tags the book had before.
        book.tags = set(meta_tags + book_shelves)

        cover_changed = old_cover != book.cover_info()
        obsolete_children = set(b for b in book.children.all()
                                if b not in children)
        notify_cover_changed = []
        # (Re)attach the listed parts in order; remember those whose
        # inherited cover may have changed.
        for n, child_book in enumerate(children):
            new_child = child_book.parent != book
            child_book.parent = book
            child_book.parent_number = n
            child_book.save()
            if new_child or cover_changed:
                notify_cover_changed.append(child_book)
        # Disown unfaithful children and let them cope on their own.
        for child in obsolete_children:
            child.parent = None
            child.parent_number = 0
            child.save()
            if old_cover:
                notify_cover_changed.append(child)

        cls.repopulate_ancestors()
        tasks.update_counters.delay()

        if remote_gallery_url:
            book.download_pictures(remote_gallery_url)

        # No saves beyond this point.
        # NOTE(review): book.save() is nevertheless called near the end of
        # this method -- confirm which of the two is intended.

        # Build cover.
        if 'cover' not in dont_build:
            book.cover.build_delay()
            book.cover_thumb.build_delay()

        # Build HTML and ebooks.
        book.html_file.build_delay()
        if not children:
            for format_ in constants.EBOOK_FORMATS_WITHOUT_CHILDREN:
                if format_ not in dont_build:
                    getattr(book, '%s_file' % format_).build_delay()
        for format_ in constants.EBOOK_FORMATS_WITH_CHILDREN:
            if format_ not in dont_build:
                getattr(book, '%s_file' % format_).build_delay()

        if not settings.NO_SEARCH_INDEX and search_index:
            tasks.index_book.delay(book.id, book_info=book_info, index_tags=search_index_tags)

        for child in notify_cover_changed:
            child.parent_cover_changed()

        book.save()  # update sort_key_author
        cls.published.send(sender=cls, instance=book)
        return book
447
448     @classmethod
449     @transaction.atomic
450     def repopulate_ancestors(cls):
451         """Fixes the ancestry cache."""
452         # TODO: table names
453         cursor = connection.cursor()
454         if connection.vendor == 'postgres':
455             cursor.execute("TRUNCATE catalogue_book_ancestor")
456             cursor.execute("""
457                 WITH RECURSIVE ancestry AS (
458                     SELECT book.id, book.parent_id
459                     FROM catalogue_book AS book
460                     WHERE book.parent_id IS NOT NULL
461                     UNION
462                     SELECT ancestor.id, book.parent_id
463                     FROM ancestry AS ancestor, catalogue_book AS book
464                     WHERE ancestor.parent_id = book.id
465                         AND book.parent_id IS NOT NULL
466                     )
467                 INSERT INTO catalogue_book_ancestor
468                     (from_book_id, to_book_id)
469                     SELECT id, parent_id
470                     FROM ancestry
471                     ORDER BY id;
472                 """)
473         else:
474             cursor.execute("DELETE FROM catalogue_book_ancestor")
475             for b in cls.objects.exclude(parent=None):
476                 parent = b.parent
477                 while parent is not None:
478                     b.ancestor.add(parent)
479                     parent = parent.parent
480
481     def flush_includes(self, languages=True):
482         if not languages:
483             return
484         if languages is True:
485             languages = [lc for (lc, _ln) in settings.LANGUAGES]
486         flush_ssi_includes([
487             template % (self.pk, lang)
488             for template in [
489                 '/katalog/b/%d/mini.%s.html',
490                 '/katalog/b/%d/mini_nolink.%s.html',
491                 '/katalog/b/%d/short.%s.html',
492                 '/katalog/b/%d/wide.%s.html',
493                 '/api/include/book/%d.%s.json',
494                 '/api/include/book/%d.%s.xml',
495                 ]
496             for lang in languages
497             ])
498
499     def cover_info(self, inherit=True):
500         """Returns a dictionary to serve as fallback for BookInfo.
501
502         For now, the only thing inherited is the cover image.
503         """
504         need = False
505         info = {}
506         for field in ('cover_url', 'cover_by', 'cover_source'):
507             val = self.extra_info.get(field)
508             if val:
509                 info[field] = val
510             else:
511                 need = True
512         if inherit and need and self.parent is not None:
513             parent_info = self.parent.cover_info()
514             parent_info.update(info)
515             info = parent_info
516         return info
517
518     def related_themes(self):
519         return Tag.objects.usage_for_queryset(
520             Fragment.objects.filter(models.Q(book=self) | models.Q(book__ancestor=self)),
521             counts=True).filter(category='theme')
522
523     def parent_cover_changed(self):
524         """Called when parent book's cover image is changed."""
525         if not self.cover_info(inherit=False):
526             if 'cover' not in app_settings.DONT_BUILD:
527                 self.cover.build_delay()
528                 self.cover_thumb.build_delay()
529             for format_ in constants.EBOOK_FORMATS_WITH_COVERS:
530                 if format_ not in app_settings.DONT_BUILD:
531                     getattr(self, '%s_file' % format_).build_delay()
532             for child in self.children.all():
533                 child.parent_cover_changed()
534
    def other_versions(self):
        """Find other versions (i.e. in other languages) of the book.

        These are the other books that share this book's `common_slug`.
        """
        return type(self).objects.filter(common_slug=self.common_slug).exclude(pk=self.pk)
538
539     def parents(self):
540         books = []
541         parent = self.parent
542         while parent is not None:
543             books.insert(0, parent)
544             parent = parent.parent
545         return books
546
547     def pretty_title(self, html_links=False):
548         names = [(tag.name, tag.get_absolute_url()) for tag in self.authors().only('name', 'category', 'slug')]
549         books = self.parents() + [self]
550         names.extend([(b.title, b.get_absolute_url()) for b in books])
551
552         if html_links:
553             names = ['<a href="%s">%s</a>' % (tag[1], tag[0]) for tag in names]
554         else:
555             names = [tag[0] for tag in names]
556         return ', '.join(names)
557
558     def publisher(self):
559         publisher = self.extra_info['publisher']
560         if isinstance(publisher, basestring):
561             return publisher
562         elif isinstance(publisher, list):
563             return ', '.join(publisher)
564
565     @classmethod
566     def tagged_top_level(cls, tags):
567         """ Returns top-level books tagged with `tags`.
568
569         It only returns those books which don't have ancestors which are
570         also tagged with those tags.
571
572         """
573         objects = cls.tagged.with_all(tags)
574         return objects.exclude(ancestor__in=objects)
575
576     @classmethod
577     def book_list(cls, book_filter=None):
578         """Generates a hierarchical listing of all books.
579
580         Books are optionally filtered with a test function.
581
582         """
583
584         books_by_parent = {}
585         books = cls.objects.order_by('parent_number', 'sort_key').only('title', 'parent', 'slug')
586         if book_filter:
587             books = books.filter(book_filter).distinct()
588
589             book_ids = set(b['pk'] for b in books.values("pk").iterator())
590             for book in books.iterator():
591                 parent = book.parent_id
592                 if parent not in book_ids:
593                     parent = None
594                 books_by_parent.setdefault(parent, []).append(book)
595         else:
596             for book in books.iterator():
597                 books_by_parent.setdefault(book.parent_id, []).append(book)
598
599         orphans = []
600         books_by_author = OrderedDict()
601         for tag in Tag.objects.filter(category='author').iterator():
602             books_by_author[tag] = []
603
604         for book in books_by_parent.get(None, ()):
605             authors = list(book.authors().only('pk'))
606             if authors:
607                 for author in authors:
608                     books_by_author[author].append(book)
609             else:
610                 orphans.append(book)
611
612         return books_by_author, orphans, books_by_parent
613
    # Maps audience codes to (sort rank, display name) pairs; used by
    # audiences_pl(), where unknown codes get rank 99 and are shown as is.
    _audiences_pl = {
        "SP": (1, u"szkoła podstawowa"),
        "SP1": (1, u"szkoła podstawowa"),
        "SP2": (1, u"szkoła podstawowa"),
        "SP3": (1, u"szkoła podstawowa"),
        "P": (1, u"szkoła podstawowa"),
        "G": (2, u"gimnazjum"),
        "L": (3, u"liceum"),
        "LP": (3, u"liceum"),
    }
624
625     def audiences_pl(self):
626         audiences = self.extra_info.get('audiences', [])
627         audiences = sorted(set([self._audiences_pl.get(a, (99, a)) for a in audiences]))
628         return [a[1] for a in audiences]
629
630     def stage_note(self):
631         stage = self.extra_info.get('stage')
632         if stage and stage < '0.4':
633             return (_('This work needs modernisation'),
634                     reverse('infopage', args=['wymagajace-uwspolczesnienia']))
635         else:
636             return None, None
637
638     def choose_fragment(self):
639         fragments = self.fragments.order_by()
640         fragments_count = fragments.count()
641         if not fragments_count and self.children.exists():
642             fragments = Fragment.objects.filter(book__ancestor=self).order_by()
643             fragments_count = fragments.count()
644         if fragments_count:
645             return fragments[randint(0, fragments_count - 1)]
646         elif self.parent:
647             return self.parent.choose_fragment()
648         else:
649             return None
650
651     def fragment_data(self):
652         fragment = self.choose_fragment()
653         if fragment:
654             return {'title': fragment.book.pretty_title(), 'html': fragment.get_short_text()}
655         else:
656             return None
657
658     def update_popularity(self):
659         count = self.tags.filter(category='set').values('user').order_by('user').distinct().count()
660         try:
661             pop = self.popularity
662             pop.count = count
663             pop.save()
664         except BookPopularity.DoesNotExist:
665             BookPopularity.objects.create(book=self, count=count)
666
667     def ridero_link(self):
668         return 'https://ridero.eu/%s/books/wl_%s/' % (get_language(), self.slug.replace('-', '_'))
669
670
def add_file_fields():
    """Attach a ``<format>_file`` EbookField to Book for every entry of Book.formats.

    Each field's upload_to callable is also published as a module-level
    name, ``_<format>_upload_to``.
    """
    for format_ in Book.formats:
        # This weird globals() assignment makes Django migrations comfortable.
        upload_to = _ebook_upload_to('book/%s/%%s.%s' % (format_, format_))
        upload_to.__name__ = '_%s_upload_to' % format_
        globals()[upload_to.__name__] = upload_to

        field = EbookField(
            format_, _("%s file" % format_.upper()),
            upload_to=upload_to,
            storage=bofh_storage,
            max_length=255,
            blank=True,
            default=''
        )
        field.contribute_to_class(Book, "%s_file" % format_)


add_file_fields()
689
690
class BookPopularity(models.Model):
    """Popularity counter of a book, kept up to date by Book.update_popularity()."""
    book = models.OneToOneField(Book, related_name='popularity')
    count = models.IntegerField(default=0)