#933,#934: xmls and epubs to download
[wolnelektury.git] / apps / catalogue / models.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.db import models
6 from django.db.models import permalink, Q
7 from django.utils.translation import ugettext_lazy as _
8 from django.contrib.auth.models import User
9 from django.core.files import File
10 from django.template.loader import render_to_string
11 from django.utils.safestring import mark_safe
12 from django.utils.translation import get_language
13 from django.core.urlresolvers import reverse
14 from datetime import datetime
15
16 from django.conf import settings
17
18 from newtagging.models import TagBase, tags_updated
19 from newtagging import managers
20 from catalogue.fields import JSONField
21
22 from librarian import dcparser, html, epub, NoDublinCore
23 from mutagen import id3
24
25
# (value, translatable label) pairs used as the ``choices`` of Tag.category.
TAG_CATEGORIES = (
    ('author', _('author')),
    ('epoch', _('epoch')),
    ('kind', _('kind')),
    ('genre', _('genre')),
    ('theme', _('theme')),
    ('set', _('set')),
    ('book', _('book')),
)
35
36
class TagSubcategoryManager(models.Manager):
    """Manager whose base queryset only contains tags of one category."""

    def __init__(self, subcategory):
        """Store ``subcategory``, the category value queries are filtered by."""
        super(TagSubcategoryManager, self).__init__()
        self.subcategory = subcategory

    def get_query_set(self):
        """Return the parent manager's queryset narrowed to ``self.subcategory``."""
        unfiltered = super(TagSubcategoryManager, self).get_query_set()
        return unfiltered.filter(category=self.subcategory)
44
45
class Tag(TagBase):
    """A catalogue tag; ``category`` is one of the TAG_CATEGORIES values.

    ``book_count`` is a cache filled lazily by ``get_count``; ``None`` means
    the value has to be recomputed.
    """
    name = models.CharField(_('name'), max_length=50, db_index=True)
    slug = models.SlugField(_('slug'), max_length=120, db_index=True)
    sort_key = models.CharField(_('sort key'), max_length=120, db_index=True)
    category = models.CharField(_('category'), max_length=50, blank=False, null=False,
        db_index=True, choices=TAG_CATEGORIES)
    description = models.TextField(_('description'), blank=True)
    main_page = models.BooleanField(_('main page'), default=False, db_index=True, help_text=_('Show tag on main page'))

    user = models.ForeignKey(User, blank=True, null=True)
    book_count = models.IntegerField(_('book count'), blank=False, null=True)
    death = models.IntegerField(_(u'year of death'), blank=True, null=True)
    gazeta_link = models.CharField(blank=True, max_length=240)
    wiki_link = models.CharField(blank=True, max_length=240)

    # URL path segment -> category value, used when parsing tag URLs.
    categories_rev = {
        'autor': 'author',
        'epoka': 'epoch',
        'rodzaj': 'kind',
        'gatunek': 'genre',
        'motyw': 'theme',
        'polka': 'set',
    }
    # Inverse mapping: category value -> URL path segment.
    categories_dict = dict((category, segment) for segment, category in categories_rev.iteritems())

    class Meta:
        ordering = ('sort_key',)
        verbose_name = _('tag')
        verbose_name_plural = _('tags')
        unique_together = (("slug", "category"),)

    def __unicode__(self):
        return self.name

    def __repr__(self):
        return "Tag(slug=%r)" % self.slug

    @permalink
    def get_absolute_url(self):
        """Return the reverse() arguments for the tagged-object list of this tag."""
        return ('catalogue.views.tagged_object_list', [self.url_chunk])

    def has_description(self):
        """Tell whether ``description`` is non-empty."""
        return len(self.description) > 0
    has_description.short_description = _('description')
    has_description.boolean = True

    def alive(self):
        """Tell whether no year of death is recorded."""
        return self.death is None

    def in_pd(self):
        """ tests whether an author is in public domain """
        if self.death is None:
            return False
        return self.goes_to_pd() <= datetime.now().year

    def goes_to_pd(self):
        """ calculates the year of public domain entry for an author """
        if self.death is None:
            return None
        return self.death + 71

    def get_count(self):
        """ returns global book count for book tags, fragment count for themes

        The result is cached in ``book_count`` and saved; a cached value is
        returned without querying.
        """
        if self.book_count is not None:
            return self.book_count

        if self.category == 'book':
            # never used
            matching = Book.objects.none()
        elif self.category == 'theme':
            matching = Fragment.tagged.with_all((self,))
        else:
            matching = Book.tagged.with_all((self,)).order_by()
            if self.category != 'set':
                # eliminate descendants: books carrying the book tag of
                # another matching book are not counted
                book_tag_slugs = [book.book_tag_slug() for book in matching]
                l_tags = Tag.objects.filter(slug__in=book_tag_slugs)
                descendant_pks = [book.pk for book in Book.tagged.with_any(l_tags)]
                if descendant_pks:
                    matching = matching.exclude(pk__in=descendant_pks)

        self.book_count = matching.count()
        self.save()
        return self.book_count

    @staticmethod
    def get_tag_list(tags):
        """Resolve ``tags`` into a list of Tag objects.

        A string is split on '/'. A segment found in ``categories_rev``
        selects the category of the following slug; other slugs are looked up
        among all non-book tags. Non-string input is passed to
        ``TagBase.get_tag_list``.

        Raises ``Tag.DoesNotExist`` when a slug is unknown or a category
        segment is not followed by a slug, and ``Tag.MultipleObjectsReturned``
        (carrying ``tags`` and ``ambiguous_slugs`` attributes) when an
        unqualified slug matches several tags.
        """
        if not isinstance(tags, basestring):
            return TagBase.get_tag_list(tags)

        resolved = []
        unqualified_duplicates = []
        pending_category = None
        for segment in tags.split('/'):
            if segment in Tag.categories_rev:
                pending_category = Tag.categories_rev[segment]
                continue
            if pending_category:
                resolved.append(Tag.objects.get(slug=segment, category=pending_category))
                pending_category = None
                continue
            try:
                resolved.append(Tag.objects.exclude(category='book').get(slug=segment))
            except Tag.MultipleObjectsReturned:
                unqualified_duplicates.append(segment)

        if pending_category:
            # something strange left off
            raise Tag.DoesNotExist()
        if unqualified_duplicates:
            # some tags should be qualified
            error = Tag.MultipleObjectsReturned()
            error.tags = resolved
            error.ambiguous_slugs = unqualified_duplicates
            raise error
        return resolved

    @property
    def url_chunk(self):
        """Return '<category segment>/<slug>'.

        Raises ``KeyError`` for a category without an entry in
        ``categories_dict`` (e.g. 'book').
        """
        segment = Tag.categories_dict[self.category]
        return '/'.join((segment, self.slug))
161
162
163 # TODO: why is this hard-coded ?
def book_upload_path(ext):
    """Build an ``upload_to`` callable that stores files as 'lektura/<slug>.<ext>'.

    The returned callable takes the book and the uploaded file name; the
    file name is ignored and only ``book.slug`` is used.
    """
    def get_dynamic_path(book, filename):
        stored_name = '%s.%s' % (book.slug, ext)
        return 'lektura/%s' % stored_name
    return get_dynamic_path
168
169
class Book(models.Model):
    """A book with its downloadable files, tags and optional parent book.

    Books form a tree through ``parent``/``children``. ``_tag_counter`` and
    ``_theme_counter`` hold cached per-tag counts; ``None`` means "recompute".
    """
    title = models.CharField(_('title'), max_length=120)
    slug = models.SlugField(_('slug'), max_length=120, unique=True, db_index=True)
    description = models.TextField(_('description'), blank=True)
    created_at = models.DateTimeField(_('creation date'), auto_now_add=True)
    _short_html = models.TextField(_('short HTML'), editable=False)
    parent_number = models.IntegerField(_('parent number'), default=0)
    extra_info = JSONField(_('extra information'))
    gazeta_link = models.CharField(blank=True, max_length=240)
    wiki_link = models.CharField(blank=True, max_length=240)

    # Formats
    xml_file = models.FileField(_('XML file'), upload_to=book_upload_path('xml'), blank=True)
    html_file = models.FileField(_('HTML file'), upload_to=book_upload_path('html'), blank=True)
    pdf_file = models.FileField(_('PDF file'), upload_to=book_upload_path('pdf'), blank=True)
    epub_file = models.FileField(_('EPUB file'), upload_to=book_upload_path('epub'), blank=True)
    odt_file = models.FileField(_('ODT file'), upload_to=book_upload_path('odt'), blank=True)
    txt_file = models.FileField(_('TXT file'), upload_to=book_upload_path('txt'), blank=True)
    mp3_file = models.FileField(_('MP3 file'), upload_to=book_upload_path('mp3'), blank=True)
    ogg_file = models.FileField(_('OGG file'), upload_to=book_upload_path('ogg'), blank=True)
    daisy_file = models.FileField(_('DAISY file'), upload_to=book_upload_path('daisy.zip'), blank=True)

    parent = models.ForeignKey('self', blank=True, null=True, related_name='children')

    objects = models.Manager()
    tagged = managers.ModelTaggedItemManager(Tag)
    tags = managers.TagDescriptor(Tag)

    _tag_counter = JSONField(null=True, editable=False)
    _theme_counter = JSONField(null=True, editable=False)

    class AlreadyExists(Exception):
        """Raised on import when the book exists and overwriting was not requested."""
        pass

    class Meta:
        ordering = ('title',)
        verbose_name = _('book')
        verbose_name_plural = _('books')

    def __unicode__(self):
        return self.title

    def save(self, force_insert=False, force_update=False, reset_short_html=True, refresh_mp3=True, **kwargs):
        """Save the book, optionally clearing cached HTML and refreshing MP3 info.

        reset_short_html -- blank every instance attribute whose name starts
            with '_short_html' and blank the same columns on the book's
            fragments.
        refresh_mp3 -- when an MP3 file is attached, merge its ID3 artist and
            director names into ``extra_info`` and save again.

        Extra keyword arguments are accepted but not forwarded.
        Returns whatever the parent ``save`` returns.
        """
        if reset_short_html:
            # Reset _short_html during save
            cleared = {}
            for key in [attr for attr in self.__dict__ if attr.startswith('_short_html')]:
                cleared[key] = ''
                setattr(self, key, '')
            # Fragment.short_html relies on book's tags, so reset it here too
            self.fragments.all().update(**cleared)

        book = super(Book, self).save(force_insert, force_update)

        if refresh_mp3 and self.mp3_file:
            extra_info = self.get_extra_info_value()
            extra_info.update(self.get_mp3_info())
            self.set_extra_info_value(extra_info)
            # The row already exists after the first save, so an INSERT must
            # not be forced a second time.
            book = super(Book, self).save(False, force_update)

        return book

    @permalink
    def get_absolute_url(self):
        """Return the reverse() arguments for the book detail view."""
        return ('catalogue.views.book_detail', [self.slug])

    @property
    def name(self):
        """Alias of ``title``."""
        return self.title

    def book_tag_slug(self):
        """Return the slug of this book's own tag: 'l-' + slug, cut to 120 chars."""
        return ('l-' + self.slug)[:120]

    def book_tag(self):
        """Return this book's tag of category 'book', creating it if missing."""
        slug = self.book_tag_slug()
        book_tag, created = Tag.objects.get_or_create(slug=slug, category='book')
        if created:
            book_tag.name = self.title[:50]
            book_tag.sort_key = self.title.lower()
            book_tag.save()
        return book_tag

    def short_html(self):
        """Return the short HTML snippet for the active language, rendering it on a miss.

        The snippet is read from / stored in the ``_short_html_<language>``
        attribute (presumably supplied by a translation layer -- not visible
        in this file). A freshly rendered snippet is saved with the book.
        """
        key = '_short_html_%s' % get_language()
        short_html = getattr(self, key)

        if short_html and len(short_html):
            return mark_safe(short_html)

        tags = self.tags.filter(~Q(category__in=('set', 'theme', 'book')))
        tags = [mark_safe(u'<a href="%s">%s</a>' % (tag.get_absolute_url(), tag.name)) for tag in tags]

        formats = []
        if self.html_file:
            formats.append(u'<a href="%s">%s</a>' % (reverse('book_text', kwargs={'slug': self.slug}), _('Read online')))
        if self.pdf_file:
            formats.append(u'<a href="%s">PDF</a>' % self.pdf_file.url)
        # The EPUB link always points at the root ancestor's file.
        if self.root_ancestor.epub_file:
            formats.append(u'<a href="%s">EPUB</a>' % self.root_ancestor.epub_file.url)
        if self.odt_file:
            formats.append(u'<a href="%s">ODT</a>' % self.odt_file.url)
        if self.txt_file:
            formats.append(u'<a href="%s">TXT</a>' % self.txt_file.url)
        if self.mp3_file:
            formats.append(u'<a href="%s">MP3</a>' % self.mp3_file.url)
        if self.ogg_file:
            formats.append(u'<a href="%s">OGG</a>' % self.ogg_file.url)
        if self.daisy_file:
            formats.append(u'<a href="%s">DAISY</a>' % self.daisy_file.url)

        formats = [mark_safe(link) for link in formats]

        setattr(self, key, unicode(render_to_string('catalogue/book_short.html',
            {'book': self, 'tags': tags, 'formats': formats})))
        # Do not wipe the snippet that was just rendered.
        self.save(reset_short_html=False)
        return mark_safe(getattr(self, key))

    @property
    def root_ancestor(self):
        """ returns the oldest ancestor (cached on the instance after first use) """

        if not hasattr(self, '_root_ancestor'):
            book = self
            while book.parent:
                book = book.parent
            self._root_ancestor = book
        return self._root_ancestor

    def get_mp3_info(self):
        """Retrieves artist and director names from audio ID3 tags.

        Returns a dict with 'artist_name' (TPE1 frames) and 'director_name'
        (TPE3 frames), each a comma-separated string.
        """
        audio = id3.ID3(self.mp3_file.path)
        artist_name = ', '.join(', '.join(tag.text) for tag in audio.getall('TPE1'))
        director_name = ', '.join(', '.join(tag.text) for tag in audio.getall('TPE3'))
        return {'artist_name': artist_name, 'director_name': director_name}

    def has_description(self):
        return len(self.description) > 0
    has_description.short_description = _('description')
    has_description.boolean = True

    def has_pdf_file(self):
        return bool(self.pdf_file)
    has_pdf_file.short_description = 'PDF'
    has_pdf_file.boolean = True

    def has_epub_file(self):
        return bool(self.epub_file)
    has_epub_file.short_description = 'EPUB'
    has_epub_file.boolean = True

    def has_odt_file(self):
        return bool(self.odt_file)
    has_odt_file.short_description = 'ODT'
    has_odt_file.boolean = True

    def has_html_file(self):
        return bool(self.html_file)
    has_html_file.short_description = 'HTML'
    has_html_file.boolean = True

    def build_epub(self, remove_descendants=True):
        """ (Re)builds the epub file.
            If book has a parent, does nothing.
            Unless remove_descendants is False, descendants' epubs are removed.

            A FileRecord with the SHA-1 of the new file is stored. A source
            without Dublin Core (NoDublinCore) is skipped silently, but the
            descendants are still re-saved.
        """

        from StringIO import StringIO
        from hashlib import sha1
        from django.core.files.base import ContentFile
        from librarian import DocProvider

        class BookImportDocProvider(DocProvider):
            """ used for joined EPUBs """

            def __init__(self, book):
                self.book = book

            def by_slug(self, slug):
                if slug == self.book.slug:
                    return self.book.xml_file
                else:
                    return Book.objects.get(slug=slug).xml_file

        if self.parent:
            # don't need an epub
            return

        epub_file = StringIO()
        try:
            epub.transform(BookImportDocProvider(self), self.slug, epub_file)
            self.epub_file.save('%s.epub' % self.slug, ContentFile(epub_file.getvalue()), save=False)
            self.save(refresh_mp3=False)
            FileRecord(slug=self.slug, type='epub', sha1=sha1(epub_file.getvalue()).hexdigest()).save()
        except NoDublinCore:
            pass

        # Walk all descendants breadth-first.
        book_descendants = list(self.children.all())
        while len(book_descendants) > 0:
            child_book = book_descendants.pop(0)
            if remove_descendants and child_book.has_epub_file():
                child_book.epub_file.delete()
            # save anyway, to refresh short_html
            child_book.save(refresh_mp3=False)
            book_descendants += list(child_book.children.all())

    @classmethod
    def from_xml_file(cls, xml_file, overwrite=False):
        """Import a book from ``xml_file`` (a path or a Django ``File``).

        The file is closed before returning, also when it was passed in
        already opened. See ``from_text_and_meta`` for the import itself.
        """
        # use librarian to parse meta-data
        book_info = dcparser.parse(xml_file)

        if not isinstance(xml_file, File):
            xml_file = File(open(xml_file))

        try:
            return cls.from_text_and_meta(xml_file, book_info, overwrite)
        finally:
            xml_file.close()

    @classmethod
    def from_text_and_meta(cls, raw_file, book_info, overwrite=False):
        """Create or overwrite a book from its XML source and parsed metadata.

        raw_file -- file object with the XML source; stored as ``xml_file``.
        book_info -- metadata object as returned by ``dcparser.parse``.
        overwrite -- allow replacing an existing book with the same slug.

        Assigns tags, attaches the books listed in ``book_info.parts`` as
        children, generates HTML and themed fragments, rebuilds the root
        ancestor's EPUB unless ``settings.NO_BUILD_EPUB`` is set, and resets
        the cached counters. Returns the saved book.

        Raises ``Book.DoesNotExist`` if a listed part is missing and
        ``Book.AlreadyExists`` if the book exists and ``overwrite`` is false.
        """
        from tempfile import NamedTemporaryFile
        from slughifi import slughifi
        from markupstring import MarkupString

        # check for parts before we do anything
        children = []
        if hasattr(book_info, 'parts'):
            for part_url in book_info.parts:
                base, slug = part_url.rsplit('/', 1)
                try:
                    children.append(Book.objects.get(slug=slug))
                except Book.DoesNotExist:
                    raise Book.DoesNotExist(_('Book with slug = "%s" does not exist.') % slug)

        # Read book metadata
        book_base, book_slug = book_info.url.rsplit('/', 1)
        book, created = Book.objects.get_or_create(slug=book_slug)

        if created:
            book_shelves = []
        else:
            if not overwrite:
                raise Book.AlreadyExists(_('Book %s already exists') % book_slug)
            # Save shelves for this book
            book_shelves = list(book.tags.filter(category='set'))

        book.title = book_info.title
        book.set_extra_info_value(book_info.to_dict())
        book._short_html = ''
        book.save()

        book_tags = []
        categories = (('kinds', 'kind'), ('genres', 'genre'), ('authors', 'author'), ('epochs', 'epoch'))
        for field_name, category in categories:
            try:
                tag_names = getattr(book_info, field_name)
            except AttributeError:
                # Fall back to the singular attribute when the plural is missing.
                tag_names = [getattr(book_info, category)]
            for tag_name in tag_names:
                tag_sort_key = tag_name
                if category == 'author':
                    # Authors are person objects: sort by last name, display full name.
                    tag_sort_key = tag_name.last_name
                    tag_name = ' '.join(tag_name.first_names) + ' ' + tag_name.last_name
                tag, created = Tag.objects.get_or_create(slug=slughifi(tag_name), category=category)
                if created:
                    tag.name = tag_name
                    tag.sort_key = tag_sort_key.lower()
                    tag.save()
                book_tags.append(tag)

        book.tags = book_tags + book_shelves

        book_tag = book.book_tag()

        for n, child_book in enumerate(children):
            child_book.parent = book
            child_book.parent_number = n
            child_book.save()

        # Save XML and HTML files
        book.xml_file.save('%s.xml' % book.slug, raw_file, save=False)

        # delete old fragments when overwriting
        book.fragments.all().delete()

        html_file = NamedTemporaryFile()
        try:
            if html.transform(book.xml_file.path, html_file, parse_dublincore=False):
                book.html_file.save('%s.html' % book.slug, File(html_file), save=False)

                # get ancestor l-tags for adding to new fragments
                ancestor_tags = []
                p = book.parent
                while p:
                    ancestor_tags.append(p.book_tag())
                    p = p.parent

                # Extract fragments
                closed_fragments, open_fragments = html.extract_fragments(book.html_file.path)
                for fragment in closed_fragments.values():
                    try:
                        theme_names = [s.strip() for s in fragment.themes.split(',')]
                    except AttributeError:
                        continue
                    themes = []
                    for theme_name in theme_names:
                        if not theme_name:
                            continue
                        tag, created = Tag.objects.get_or_create(slug=slughifi(theme_name), category='theme')
                        if created:
                            tag.name = theme_name
                            tag.sort_key = theme_name.lower()
                            tag.save()
                        themes.append(tag)
                    if not themes:
                        continue

                    text = fragment.to_string()
                    short_text = ''
                    if (len(MarkupString(text)) > 240):
                        short_text = unicode(MarkupString(text)[:160])
                    new_fragment, created = Fragment.objects.get_or_create(anchor=fragment.id, book=book,
                        defaults={'text': text, 'short_text': short_text})

                    new_fragment.save()
                    new_fragment.tags = set(book_tags + themes + [book_tag] + ancestor_tags)
        finally:
            # Release the temporary HTML file whether or not the import succeeded.
            html_file.close()

        if not getattr(settings, 'NO_BUILD_EPUB', False):
            # root_ancestor is a property, not a method.
            book.root_ancestor.build_epub()

        book_descendants = list(book.children.all())
        # add l-tag to descendants and their fragments
        # delete unnecessary EPUB files
        while len(book_descendants) > 0:
            child_book = book_descendants.pop(0)
            child_book.tags = list(child_book.tags) + [book_tag]
            child_book.save()
            for fragment in child_book.fragments.all():
                fragment.tags = set(list(fragment.tags) + [book_tag])
            book_descendants += list(child_book.children.all())

        # refresh cache
        book.reset_tag_counter()
        book.reset_theme_counter()

        book.save()
        return book

    def refresh_tag_counter(self):
        """Recompute, store and return the {tag pk: count} mapping.

        Children's counters are summed; the book's own tags outside the
        'book', 'theme' and 'set' categories are then set to 1.
        """
        tags = {}
        for child in self.children.all().order_by():
            for tag_pk, value in child.tag_counter.iteritems():
                tags[tag_pk] = tags.get(tag_pk, 0) + value
        for tag in self.tags.exclude(category__in=('book', 'theme', 'set')).order_by():
            tags[tag.pk] = 1
        self.set__tag_counter_value(tags)
        self.save(reset_short_html=False, refresh_mp3=False)
        return tags

    def reset_tag_counter(self):
        """Invalidate the cached tag counter of this book and all its ancestors."""
        self._tag_counter = None
        self.save(reset_short_html=False, refresh_mp3=False)
        if self.parent:
            self.parent.reset_tag_counter()

    @property
    def tag_counter(self):
        """Return the {tag pk: count} mapping, recomputing it when not cached."""
        if self._tag_counter is None:
            return self.refresh_tag_counter()
        # Stored keys are converted back to integers.
        return dict((int(k), v) for k, v in self.get__tag_counter_value().iteritems())

    def refresh_theme_counter(self):
        """Recompute, store and return the {theme tag pk: fragment count} mapping.

        Counts theme tags over every fragment carrying this book's own tag.
        """
        tags = {}
        for fragment in Fragment.tagged.with_any([self.book_tag()]).order_by():
            for tag in fragment.tags.filter(category='theme').order_by():
                tags[tag.pk] = tags.get(tag.pk, 0) + 1
        self.set__theme_counter_value(tags)
        self.save(reset_short_html=False, refresh_mp3=False)
        return tags

    def reset_theme_counter(self):
        """Invalidate the cached theme counter of this book and all its ancestors."""
        self._theme_counter = None
        self.save(reset_short_html=False, refresh_mp3=False)
        if self.parent:
            self.parent.reset_theme_counter()

    @property
    def theme_counter(self):
        """Return the {theme tag pk: count} mapping, recomputing it when not cached."""
        if self._theme_counter is None:
            return self.refresh_theme_counter()
        # Stored keys are converted back to integers.
        return dict((int(k), v) for k, v in self.get__theme_counter_value().iteritems())
568
569
570
class Fragment(models.Model):
    """A passage of a book, identified within it by ``anchor``."""
    text = models.TextField()
    short_text = models.TextField(editable=False)
    _short_html = models.TextField(editable=False)
    anchor = models.CharField(max_length=120)
    book = models.ForeignKey(Book, related_name='fragments')

    objects = models.Manager()
    tagged = managers.ModelTaggedItemManager(Tag)
    tags = managers.TagDescriptor(Tag)

    class Meta:
        ordering = ('book', 'anchor',)
        verbose_name = _('fragment')
        verbose_name_plural = _('fragments')

    def get_absolute_url(self):
        """Return the book's online-text URL with an '#m<anchor>' suffix."""
        text_url = reverse('book_text', kwargs={'slug': self.book.slug})
        return '%s#m%s' % (text_url, self.anchor)

    def short_html(self):
        """Return the short HTML snippet for the active language.

        The value lives in the ``_short_html_<language>`` attribute; when it
        is empty the snippet is rendered, stored there and the fragment saved.
        """
        cache_attr = '_short_html_%s' % get_language()
        cached = getattr(self, cache_attr)
        if cached and len(cached):
            return mark_safe(cached)

        rendered = render_to_string('catalogue/fragment_short.html', {'fragment': self})
        setattr(self, cache_attr, unicode(rendered))
        self.save()
        return mark_safe(getattr(self, cache_attr))
600
601
class BookStub(models.Model):
    """Title, author and public-domain year of a book, stored without any files."""
    title = models.CharField(_('title'), max_length=120)
    author = models.CharField(_('author'), max_length=120)
    pd = models.IntegerField(_('goes to public domain'), null=True, blank=True)
    slug = models.SlugField(_('slug'), max_length=120, unique=True, db_index=True)
    translator = models.TextField(_('translator'), blank=True)
    translator_death = models.TextField(_('year of translator\'s death'), blank=True)

    class Meta:
        ordering = ('title',)
        verbose_name = _('book stub')
        verbose_name_plural = _('book stubs')

    def __unicode__(self):
        return self.title

    @permalink
    def get_absolute_url(self):
        """Return the reverse() arguments for the book detail view."""
        return ('catalogue.views.book_detail', [self.slug])

    def in_pd(self):
        """Tell whether the public-domain year ``pd`` is set and not in the future."""
        if self.pd is None:
            return False
        return self.pd <= datetime.now().year

    @property
    def name(self):
        """Alias of ``title``."""
        return self.title
628
629
class FileRecord(models.Model):
    """Log entry holding the SHA-1 of a generated file, keyed by slug and type."""
    slug = models.SlugField(_('slug'), max_length=120, db_index=True)
    type = models.CharField(_('type'), max_length=20, db_index=True)
    sha1 = models.CharField(_('sha-1 hash'), max_length=40)
    time = models.DateTimeField(_('time'), auto_now_add=True)

    class Meta:
        ordering = ('-time','-slug', '-type')
        verbose_name = _('file record')
        verbose_name_plural = _('file records')

    def __unicode__(self):
        """Return '<sha1> <slug>.<type>'."""
        file_name = "%s.%s" % (self.slug, self.type)
        return "%s %s" % (self.sha1, file_name)
643
644
def _tags_updated_handler(sender, affected_tags, **kwargs):
    """Invalidate cached counters when the tags of an object change.

    Always clears ``book_count`` on the affected tags. For a Book sender the
    book's tag counter is reset if any affected tag is outside the 'book',
    'theme' and 'set' categories; for a Fragment sender the owning book's
    theme counter is reset if any affected tag is a theme.
    """
    affected_pks = [tag.pk for tag in affected_tags]

    # reset tag global counter
    Tag.objects.filter(pk__in=affected_pks).update(book_count=None)

    if isinstance(sender, Book):
        # if book tags changed, reset book tag counter
        countable = Tag.objects.filter(pk__in=affected_pks).exclude(
            category__in=('book', 'theme', 'set'))
        if countable.count():
            sender.reset_tag_counter()
    elif isinstance(sender, Fragment):
        # if fragment theme changed, reset book theme counter
        themes = Tag.objects.filter(pk__in=affected_pks).filter(category='theme')
        if themes.count():
            sender.book.reset_theme_counter()
tags_updated.connect(_tags_updated_handler)
660