limited stopwords + better search by author + remove descendant results
[wolnelektury.git] / src / catalogue / models / book.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from collections import OrderedDict
6 from random import randint
7 import os.path
8 import re
9 import urllib
10 from django.conf import settings
11 from django.db import connection, models, transaction
12 from django.db.models import permalink
13 import django.dispatch
14 from django.contrib.contenttypes.fields import GenericRelation
15 from django.core.urlresolvers import reverse
16 from django.utils.translation import ugettext_lazy as _, get_language
17 from django.utils.deconstruct import deconstructible
18 import jsonfield
19 from fnpdjango.storage import BofhFileSystemStorage
20 from ssify import flush_ssi_includes
21 from newtagging import managers
22 from catalogue import constants
23 from catalogue.fields import EbookField
24 from catalogue.models import Tag, Fragment, BookMedia
25 from catalogue.utils import create_zip, gallery_url, gallery_path, split_tags
26 from catalogue.models.tag import prefetched_relations
27 from catalogue import app_settings
28 from catalogue import tasks
29 from wolnelektury.utils import makedirs
30
31 bofh_storage = BofhFileSystemStorage()
32
33
@deconstructible
class UploadToPath(object):
    """Callable ``upload_to`` target that fills an instance's slug into a path template."""

    def __init__(self, path):
        # ``path`` is a %-style template with a single placeholder for the slug.
        self.path = path

    def __call__(self, instance, filename):
        # ``filename`` is accepted to satisfy the upload_to call signature,
        # but the resulting path depends only on the instance's slug.
        template = self.path
        return template % instance.slug
41
42
# Upload targets for the generated cover variants; the ``%s`` placeholder in
# each template is filled with the book's slug by UploadToPath.__call__.
_cover_upload_to = UploadToPath('book/cover/%s.jpg')
_cover_thumb_upload_to = UploadToPath('book/cover_thumb/%s.jpg')
_cover_api_thumb_upload_to = UploadToPath('book/cover_api_thumb/%s.jpg')
_simple_cover_upload_to = UploadToPath('book/cover_simple/%s.jpg')
47
48
def _ebook_upload_to(upload_path):
    """Wrap *upload_path* (a slug path template) in an UploadToPath callable."""
    uploader = UploadToPath(upload_path)
    return uploader
51
52
53 class Book(models.Model):
54     """Represents a book imported from WL-XML."""
55     title = models.CharField(_('title'), max_length=32767)
56     sort_key = models.CharField(_('sort key'), max_length=120, db_index=True, editable=False)
57     sort_key_author = models.CharField(
58         _('sort key by author'), max_length=120, db_index=True, editable=False, default=u'')
59     slug = models.SlugField(_('slug'), max_length=120, db_index=True, unique=True)
60     common_slug = models.SlugField(_('slug'), max_length=120, db_index=True)
61     language = models.CharField(_('language code'), max_length=3, db_index=True, default=app_settings.DEFAULT_LANGUAGE)
62     description = models.TextField(_('description'), blank=True)
63     created_at = models.DateTimeField(_('creation date'), auto_now_add=True, db_index=True)
64     changed_at = models.DateTimeField(_('change date'), auto_now=True, db_index=True)
65     parent_number = models.IntegerField(_('parent number'), default=0)
66     extra_info = jsonfield.JSONField(_('extra information'), default={})
67     gazeta_link = models.CharField(blank=True, max_length=240)
68     wiki_link = models.CharField(blank=True, max_length=240)
69     print_on_demand = models.BooleanField(_('print on demand'), default=False)
70     recommended = models.BooleanField(_('recommended'), default=False)
71
72     # files generated during publication
73     cover = EbookField(
74         'cover', _('cover'),
75         null=True, blank=True,
76         upload_to=_cover_upload_to,
77         storage=bofh_storage, max_length=255)
78     # Cleaner version of cover for thumbs
79     cover_thumb = EbookField(
80         'cover_thumb', _('cover thumbnail'),
81         null=True, blank=True,
82         upload_to=_cover_thumb_upload_to,
83         max_length=255)
84     cover_api_thumb = EbookField(
85         'cover_api_thumb', _('cover thumbnail for mobile app'),
86         null=True, blank=True,
87         upload_to=_cover_api_thumb_upload_to,
88         max_length=255)
89     simple_cover = EbookField(
90         'simple_cover', _('cover for mobile app'),
91         null=True, blank=True,
92         upload_to=_simple_cover_upload_to,
93         max_length=255)
94     ebook_formats = constants.EBOOK_FORMATS
95     formats = ebook_formats + ['html', 'xml']
96
97     parent = models.ForeignKey('self', blank=True, null=True, related_name='children')
98     ancestor = models.ManyToManyField('self', blank=True, editable=False, related_name='descendant', symmetrical=False)
99
100     cached_author = models.CharField(blank=True, max_length=240, db_index=True)
101     has_audience = models.BooleanField(default=False)
102
103     objects = models.Manager()
104     tagged = managers.ModelTaggedItemManager(Tag)
105     tags = managers.TagDescriptor(Tag)
106     tag_relations = GenericRelation(Tag.intermediary_table_model)
107
108     html_built = django.dispatch.Signal()
109     published = django.dispatch.Signal()
110
111     short_html_url_name = 'catalogue_book_short'
112
113     class AlreadyExists(Exception):
114         pass
115
116     class Meta:
117         ordering = ('sort_key_author', 'sort_key')
118         verbose_name = _('book')
119         verbose_name_plural = _('books')
120         app_label = 'catalogue'
121
122     def __unicode__(self):
123         return self.title
124
125     def get_initial(self):
126         try:
127             return re.search(r'\w', self.title, re.U).group(0)
128         except AttributeError:
129             return ''
130
131     def authors(self):
132         return self.tags.filter(category='author')
133
134     def tag_unicode(self, category):
135         relations = prefetched_relations(self, category)
136         if relations:
137             return ', '.join(rel.tag.name for rel in relations)
138         else:
139             return ', '.join(self.tags.filter(category=category).values_list('name', flat=True))
140
141     def tags_by_category(self):
142         return split_tags(self.tags.exclude(category__in=('set', 'theme')))
143
144     def author_unicode(self):
145         return self.cached_author
146
147     def translator(self):
148         translators = self.extra_info.get('translators')
149         if not translators:
150             return None
151         if len(translators) > 3:
152             translators = translators[:2]
153             others = ' i inni'
154         else:
155             others = ''
156         return ', '.join(u'\xa0'.join(reversed(translator.split(', ', 1))) for translator in translators) + others
157
158     def cover_source(self):
159         return self.extra_info.get('cover_source', self.parent.cover_source() if self.parent else '')
160
161     def save(self, force_insert=False, force_update=False, **kwargs):
162         from sortify import sortify
163
164         self.sort_key = sortify(self.title)[:120]
165         self.title = unicode(self.title)  # ???
166
167         try:
168             author = self.authors().first().sort_key
169         except AttributeError:
170             author = u''
171         self.sort_key_author = author
172
173         self.cached_author = self.tag_unicode('author')
174         self.has_audience = 'audience' in self.extra_info
175
176         ret = super(Book, self).save(force_insert, force_update, **kwargs)
177
178         return ret
179
180     @permalink
181     def get_absolute_url(self):
182         return 'catalogue.views.book_detail', [self.slug]
183
184     @staticmethod
185     @permalink
186     def create_url(slug):
187         return 'catalogue.views.book_detail', [slug]
188
189     def gallery_path(self):
190         return gallery_path(self.slug)
191
192     def gallery_url(self):
193         return gallery_url(self.slug)
194
195     @property
196     def name(self):
197         return self.title
198
199     def language_code(self):
200         return constants.LANGUAGES_3TO2.get(self.language, self.language)
201
202     def language_name(self):
203         return dict(settings.LANGUAGES).get(self.language_code(), "")
204
205     def is_foreign(self):
206         return self.language_code() != settings.LANGUAGE_CODE
207
208     def has_media(self, type_):
209         if type_ in Book.formats:
210             return bool(getattr(self, "%s_file" % type_))
211         else:
212             return self.media.filter(type=type_).exists()
213
214     def has_audio(self):
215         return self.has_media('mp3')
216
217     def get_media(self, type_):
218         if self.has_media(type_):
219             if type_ in Book.formats:
220                 return getattr(self, "%s_file" % type_)
221             else:
222                 return self.media.filter(type=type_)
223         else:
224             return None
225
226     def get_mp3(self):
227         return self.get_media("mp3")
228
229     def get_odt(self):
230         return self.get_media("odt")
231
232     def get_ogg(self):
233         return self.get_media("ogg")
234
235     def get_daisy(self):
236         return self.get_media("daisy")
237
238     def has_description(self):
239         return len(self.description) > 0
240     has_description.short_description = _('description')
241     has_description.boolean = True
242
243     # ugly ugly ugly
244     def has_mp3_file(self):
245         return bool(self.has_media("mp3"))
246     has_mp3_file.short_description = 'MP3'
247     has_mp3_file.boolean = True
248
249     def has_ogg_file(self):
250         return bool(self.has_media("ogg"))
251     has_ogg_file.short_description = 'OGG'
252     has_ogg_file.boolean = True
253
254     def has_daisy_file(self):
255         return bool(self.has_media("daisy"))
256     has_daisy_file.short_description = 'DAISY'
257     has_daisy_file.boolean = True
258
259     def get_audiobooks(self):
260         ogg_files = {}
261         for m in self.media.filter(type='ogg').order_by().iterator():
262             ogg_files[m.name] = m
263
264         audiobooks = []
265         projects = set()
266         for mp3 in self.media.filter(type='mp3').iterator():
267             # ogg files are always from the same project
268             meta = mp3.extra_info
269             project = meta.get('project')
270             if not project:
271                 # temporary fallback
272                 project = u'CzytamySłuchając'
273
274             projects.add((project, meta.get('funded_by', '')))
275
276             media = {'mp3': mp3}
277
278             ogg = ogg_files.get(mp3.name)
279             if ogg:
280                 media['ogg'] = ogg
281             audiobooks.append(media)
282
283         projects = sorted(projects)
284         return audiobooks, projects
285
286     def wldocument(self, parse_dublincore=True, inherit=True):
287         from catalogue.import_utils import ORMDocProvider
288         from librarian.parser import WLDocument
289
290         if inherit and self.parent:
291             meta_fallbacks = self.parent.cover_info()
292         else:
293             meta_fallbacks = None
294
295         return WLDocument.from_file(
296             self.xml_file.path,
297             provider=ORMDocProvider(self),
298             parse_dublincore=parse_dublincore,
299             meta_fallbacks=meta_fallbacks)
300
301     @staticmethod
302     def zip_format(format_):
303         def pretty_file_name(book):
304             return "%s/%s.%s" % (
305                 book.extra_info['author'],
306                 book.slug,
307                 format_)
308
309         field_name = "%s_file" % format_
310         books = Book.objects.filter(parent=None).exclude(**{field_name: ""})
311         paths = [(pretty_file_name(b), getattr(b, field_name).path) for b in books.iterator()]
312         return create_zip(paths, app_settings.FORMAT_ZIPS[format_])
313
314     def zip_audiobooks(self, format_):
315         bm = BookMedia.objects.filter(book=self, type=format_)
316         paths = map(lambda bm: (None, bm.file.path), bm)
317         return create_zip(paths, "%s_%s" % (self.slug, format_))
318
319     def search_index(self, book_info=None, index=None, index_tags=True, commit=True):
320         if index is None:
321             from search.index import Index
322             index = Index()
323         try:
324             index.index_book(self, book_info)
325             if index_tags:
326                 index.index_tags()
327             if commit:
328                 index.index.commit()
329         except Exception, e:
330             index.index.rollback()
331             raise e
332
333     def download_pictures(self, remote_gallery_url):
334         gallery_path = self.gallery_path()
335         # delete previous files, so we don't include old files in ebooks
336         if os.path.isdir(gallery_path):
337             for filename in os.listdir(gallery_path):
338                 file_path = os.path.join(gallery_path, filename)
339                 os.unlink(file_path)
340         ilustr_elements = list(self.wldocument().edoc.findall('//ilustr'))
341         if ilustr_elements:
342             makedirs(gallery_path)
343             for ilustr in ilustr_elements:
344                 ilustr_src = ilustr.get('src')
345                 ilustr_path = os.path.join(gallery_path, ilustr_src)
346                 urllib.urlretrieve('%s/%s' % (remote_gallery_url, ilustr_src), ilustr_path)
347
348     @classmethod
349     def from_xml_file(cls, xml_file, **kwargs):
350         from django.core.files import File
351         from librarian import dcparser
352
353         # use librarian to parse meta-data
354         book_info = dcparser.parse(xml_file)
355
356         if not isinstance(xml_file, File):
357             xml_file = File(open(xml_file))
358
359         try:
360             return cls.from_text_and_meta(xml_file, book_info, **kwargs)
361         finally:
362             xml_file.close()
363
    @classmethod
    def from_text_and_meta(cls, raw_file, book_info, overwrite=False, dont_build=None, search_index=True,
                           search_index_tags=True, remote_gallery_url=None):
        """Create or update a book from its raw XML file and parsed metadata.

        :param raw_file: file object saved as the book's XML file
        :param book_info: metadata object; this method reads its ``url.slug``,
            ``language``, ``title``, ``variant_of``, optional ``parts`` and
            ``to_dict()``
        :param overwrite: allow updating a book that already exists
        :param dont_build: iterable of build names to skip, merged with
            ``app_settings.DONT_BUILD``
        :param search_index: schedule search indexing (unless
            ``settings.NO_SEARCH_INDEX`` is set)
        :param search_index_tags: passed to the indexing task as ``index_tags``
        :param remote_gallery_url: when given, pictures are downloaded from it
        :returns: the saved book
        :raises Book.DoesNotExist: when a part listed in ``book_info.parts``
            is not in the database
        :raises ValueError: when the slug contains characters outside a-z, 0-9, '-'
        :raises Book.AlreadyExists: when the book exists and ``overwrite`` is false
        """
        if dont_build is None:
            dont_build = set()
        # Merge caller-supplied exclusions with the globally configured ones.
        dont_build = set.union(set(dont_build), set(app_settings.DONT_BUILD))

        # check for parts before we do anything
        children = []
        if hasattr(book_info, 'parts'):
            for part_url in book_info.parts:
                try:
                    children.append(Book.objects.get(slug=part_url.slug))
                except Book.DoesNotExist:
                    # Re-raise with a translated message naming the missing part.
                    raise Book.DoesNotExist(_('Book "%s" does not exist.') % part_url.slug)

        # Read book metadata
        book_slug = book_info.url.slug
        if re.search(r'[^a-z0-9-]', book_slug):
            raise ValueError('Invalid characters in slug')
        book, created = Book.objects.get_or_create(slug=book_slug)

        if created:
            book_shelves = []
            old_cover = None
        else:
            if not overwrite:
                raise Book.AlreadyExists(_('Book %s already exists') % book_slug)
            # Save shelves for this book
            book_shelves = list(book.tags.filter(category='set'))
            # Remember the cover info so a change can be detected after the update.
            old_cover = book.cover_info()

        # Save XML file
        book.xml_file.save('%s.xml' % book.slug, raw_file, save=False)

        book.language = book_info.language
        book.title = book_info.title
        # A variant shares its common slug with the book it is a variant of.
        if book_info.variant_of:
            book.common_slug = book_info.variant_of.slug
        else:
            book.common_slug = book.slug
        book.extra_info = book_info.to_dict()
        book.save()

        meta_tags = Tag.tags_from_info(book_info)

        # Flag every metadata tag as used for books.
        for tag in meta_tags:
            if not tag.for_books:
                tag.for_books = True
                tag.save()

        # Replace the book's tags with the metadata tags plus the preserved shelves.
        book.tags = set(meta_tags + book_shelves)

        cover_changed = old_cover != book.cover_info()
        # Current children that are no longer listed among the parts.
        obsolete_children = set(b for b in book.children.all()
                                if b not in children)
        notify_cover_changed = []
        # Attach the listed parts in order; parent_number records the position.
        for n, child_book in enumerate(children):
            new_child = child_book.parent != book
            child_book.parent = book
            child_book.parent_number = n
            child_book.save()
            if new_child or cover_changed:
                notify_cover_changed.append(child_book)
        # Disown unfaithful children and let them cope on their own.
        for child in obsolete_children:
            child.parent = None
            child.parent_number = 0
            child.save()
            if old_cover:
                notify_cover_changed.append(child)

        cls.repopulate_ancestors()
        tasks.update_counters.delay()

        if remote_gallery_url:
            book.download_pictures(remote_gallery_url)

        # No saves beyond this point.
        # NOTE(review): book.save() is nevertheless called near the end of
        # this method -- confirm whether the remark above still holds.

        # Build cover.
        if 'cover' not in dont_build:
            book.cover.build_delay()
            book.cover_thumb.build_delay()
            book.cover_api_thumb.build_delay()
            book.simple_cover.build_delay()

        # Build HTML and ebooks.
        book.html_file.build_delay()
        # Formats in EBOOK_FORMATS_WITHOUT_CHILDREN are built only for books without parts.
        if not children:
            for format_ in constants.EBOOK_FORMATS_WITHOUT_CHILDREN:
                if format_ not in dont_build:
                    getattr(book, '%s_file' % format_).build_delay()
        for format_ in constants.EBOOK_FORMATS_WITH_CHILDREN:
            if format_ not in dont_build:
                getattr(book, '%s_file' % format_).build_delay()

        if not settings.NO_SEARCH_INDEX and search_index:
            tasks.index_book.delay(book.id, book_info=book_info, index_tags=search_index_tags)

        for child in notify_cover_changed:
            child.parent_cover_changed()

        book.save()  # update sort_key_author
        cls.published.send(sender=cls, instance=book)
        return book
470
471     @classmethod
472     @transaction.atomic
473     def repopulate_ancestors(cls):
474         """Fixes the ancestry cache."""
475         # TODO: table names
476         cursor = connection.cursor()
477         if connection.vendor == 'postgres':
478             cursor.execute("TRUNCATE catalogue_book_ancestor")
479             cursor.execute("""
480                 WITH RECURSIVE ancestry AS (
481                     SELECT book.id, book.parent_id
482                     FROM catalogue_book AS book
483                     WHERE book.parent_id IS NOT NULL
484                     UNION
485                     SELECT ancestor.id, book.parent_id
486                     FROM ancestry AS ancestor, catalogue_book AS book
487                     WHERE ancestor.parent_id = book.id
488                         AND book.parent_id IS NOT NULL
489                     )
490                 INSERT INTO catalogue_book_ancestor
491                     (from_book_id, to_book_id)
492                     SELECT id, parent_id
493                     FROM ancestry
494                     ORDER BY id;
495                 """)
496         else:
497             cursor.execute("DELETE FROM catalogue_book_ancestor")
498             for b in cls.objects.exclude(parent=None):
499                 parent = b.parent
500                 while parent is not None:
501                     b.ancestor.add(parent)
502                     parent = parent.parent
503
504     def flush_includes(self, languages=True):
505         if not languages:
506             return
507         if languages is True:
508             languages = [lc for (lc, _ln) in settings.LANGUAGES]
509         flush_ssi_includes([
510             template % (self.pk, lang)
511             for template in [
512                 '/katalog/b/%d/mini.%s.html',
513                 '/katalog/b/%d/mini_nolink.%s.html',
514                 '/katalog/b/%d/short.%s.html',
515                 '/katalog/b/%d/wide.%s.html',
516                 '/api/include/book/%d.%s.json',
517                 '/api/include/book/%d.%s.xml',
518                 ]
519             for lang in languages
520             ])
521
522     def cover_info(self, inherit=True):
523         """Returns a dictionary to serve as fallback for BookInfo.
524
525         For now, the only thing inherited is the cover image.
526         """
527         need = False
528         info = {}
529         for field in ('cover_url', 'cover_by', 'cover_source'):
530             val = self.extra_info.get(field)
531             if val:
532                 info[field] = val
533             else:
534                 need = True
535         if inherit and need and self.parent is not None:
536             parent_info = self.parent.cover_info()
537             parent_info.update(info)
538             info = parent_info
539         return info
540
541     def related_themes(self):
542         return Tag.objects.usage_for_queryset(
543             Fragment.objects.filter(models.Q(book=self) | models.Q(book__ancestor=self)),
544             counts=True).filter(category='theme')
545
546     def parent_cover_changed(self):
547         """Called when parent book's cover image is changed."""
548         if not self.cover_info(inherit=False):
549             if 'cover' not in app_settings.DONT_BUILD:
550                 self.cover.build_delay()
551                 self.cover_thumb.build_delay()
552                 self.cover_api_thumb.build_delay()
553                 self.simple_cover.build_delay()
554             for format_ in constants.EBOOK_FORMATS_WITH_COVERS:
555                 if format_ not in app_settings.DONT_BUILD:
556                     getattr(self, '%s_file' % format_).build_delay()
557             for child in self.children.all():
558                 child.parent_cover_changed()
559
560     def other_versions(self):
561         """Find other versions (i.e. in other languages) of the book."""
562         return type(self).objects.filter(common_slug=self.common_slug).exclude(pk=self.pk)
563
564     def parents(self):
565         books = []
566         parent = self.parent
567         while parent is not None:
568             books.insert(0, parent)
569             parent = parent.parent
570         return books
571
572     def pretty_title(self, html_links=False):
573         names = [(tag.name, tag.get_absolute_url()) for tag in self.authors().only('name', 'category', 'slug')]
574         books = self.parents() + [self]
575         names.extend([(b.title, b.get_absolute_url()) for b in books])
576
577         if html_links:
578             names = ['<a href="%s">%s</a>' % (tag[1], tag[0]) for tag in names]
579         else:
580             names = [tag[0] for tag in names]
581         return ', '.join(names)
582
583     def publisher(self):
584         publisher = self.extra_info['publisher']
585         if isinstance(publisher, basestring):
586             return publisher
587         elif isinstance(publisher, list):
588             return ', '.join(publisher)
589
590     @classmethod
591     def tagged_top_level(cls, tags):
592         """ Returns top-level books tagged with `tags`.
593
594         It only returns those books which don't have ancestors which are
595         also tagged with those tags.
596
597         """
598         objects = cls.tagged.with_all(tags)
599         return objects.exclude(ancestor__in=objects)
600
    @classmethod
    def book_list(cls, book_filter=None):
        """Generates a hierarchical listing of all books.

        Books are optionally filtered with a test function.

        NOTE(review): ``book_filter`` is passed to ``QuerySet.filter`` as a
        positional argument, so it presumably has to be a ``Q`` object rather
        than a plain function -- confirm against callers.

        Returns a triple ``(books_by_author, orphans, books_by_parent)``:

        * ``books_by_author`` -- ordered mapping from each author tag to the
          list of top-level books by that author,
        * ``orphans`` -- top-level books without any author,
        * ``books_by_parent`` -- mapping from parent id (None for top level)
          to the list of its child books.
        """

        books_by_parent = {}
        # Only the fields needed for the listing are loaded.
        books = cls.objects.order_by('parent_number', 'sort_key').only('title', 'parent', 'slug')
        if book_filter:
            books = books.filter(book_filter).distinct()

            book_ids = set(b['pk'] for b in books.values("pk").iterator())
            for book in books.iterator():
                parent = book.parent_id
                # A book whose parent was filtered out is listed at the top level.
                if parent not in book_ids:
                    parent = None
                books_by_parent.setdefault(parent, []).append(book)
        else:
            for book in books.iterator():
                books_by_parent.setdefault(book.parent_id, []).append(book)

        orphans = []
        books_by_author = OrderedDict()
        # Every author tag gets an entry, even when no book ends up in it.
        for tag in Tag.objects.filter(category='author').iterator():
            books_by_author[tag] = []

        # Only top-level books (those listed under None) are grouped by author.
        for book in books_by_parent.get(None, ()):
            authors = list(book.authors().only('pk'))
            if authors:
                for author in authors:
                    books_by_author[author].append(book)
            else:
                orphans.append(book)

        return books_by_author, orphans, books_by_parent
638
639     _audiences_pl = {
640         "SP": (1, u"szkoła podstawowa"),
641         "SP1": (1, u"szkoła podstawowa"),
642         "SP2": (1, u"szkoła podstawowa"),
643         "SP3": (1, u"szkoła podstawowa"),
644         "P": (1, u"szkoła podstawowa"),
645         "G": (2, u"gimnazjum"),
646         "L": (3, u"liceum"),
647         "LP": (3, u"liceum"),
648     }
649
650     def audiences_pl(self):
651         audiences = self.extra_info.get('audiences', [])
652         audiences = sorted(set([self._audiences_pl.get(a, (99, a)) for a in audiences]))
653         return [a[1] for a in audiences]
654
655     def stage_note(self):
656         stage = self.extra_info.get('stage')
657         if stage and stage < '0.4':
658             return (_('This work needs modernisation'),
659                     reverse('infopage', args=['wymagajace-uwspolczesnienia']))
660         else:
661             return None, None
662
663     def choose_fragment(self):
664         fragments = self.fragments.order_by()
665         fragments_count = fragments.count()
666         if not fragments_count and self.children.exists():
667             fragments = Fragment.objects.filter(book__ancestor=self).order_by()
668             fragments_count = fragments.count()
669         if fragments_count:
670             return fragments[randint(0, fragments_count - 1)]
671         elif self.parent:
672             return self.parent.choose_fragment()
673         else:
674             return None
675
676     def fragment_data(self):
677         fragment = self.choose_fragment()
678         if fragment:
679             return {'title': fragment.book.pretty_title(), 'html': fragment.get_short_text()}
680         else:
681             return None
682
683     def update_popularity(self):
684         count = self.tags.filter(category='set').values('user').order_by('user').distinct().count()
685         try:
686             pop = self.popularity
687             pop.count = count
688             pop.save()
689         except BookPopularity.DoesNotExist:
690             BookPopularity.objects.create(book=self, count=count)
691
692     def ridero_link(self):
693         return 'https://ridero.eu/%s/books/wl_%s/' % (get_language(), self.slug.replace('-', '_'))
694
695
def add_file_fields():
    """Attach a ``<format>_file`` EbookField to Book for every format in ``Book.formats``."""
    for format_ in Book.formats:
        field_name = "%s_file" % format_
        # This weird globals() assignment makes Django migrations comfortable.
        uploader = _ebook_upload_to('book/%s/%%s.%s' % (format_, format_))
        uploader.__name__ = '_%s_upload_to' % format_
        globals()[uploader.__name__] = uploader

        file_field = EbookField(
            format_, _("%s file" % format_.upper()),
            upload_to=uploader,
            storage=bofh_storage,
            max_length=255,
            blank=True,
            default=''
        )
        file_field.contribute_to_class(Book, field_name)
712
713 add_file_fields()
714
715
class BookPopularity(models.Model):
    """Popularity counter kept for a single book (see Book.update_popularity)."""
    # The book this counter belongs to; reachable from the book as ``book.popularity``.
    book = models.OneToOneField(Book, related_name='popularity')
    # Value written by Book.update_popularity.
    count = models.IntegerField(default=0)