importbooks: no-build-epub and wait-until options
[wolnelektury.git] / apps / catalogue / models.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.db import models
6 from django.db.models import permalink, Q
7 from django.utils.translation import ugettext_lazy as _
8 from django.contrib.auth.models import User
9 from django.core.files import File
10 from django.template.loader import render_to_string
11 from django.utils.safestring import mark_safe
12 from django.utils.translation import get_language
13 from django.core.urlresolvers import reverse
14
15 from newtagging.models import TagBase, tags_updated
16 from newtagging import managers
17 from catalogue.fields import JSONField
18
19 from librarian import dcparser, html, epub, NoDublinCore
20 from mutagen import id3
21
22
# (stored value, translatable label) pairs; used as the ``choices`` of
# ``Tag.category``.
TAG_CATEGORIES = (
    ('author', _('author')),
    ('epoch', _('epoch')),
    ('kind', _('kind')),
    ('genre', _('genre')),
    ('theme', _('theme')),
    ('set', _('set')),
    ('book', _('book')),
)
32
33
class TagSubcategoryManager(models.Manager):
    """ Manager whose base queryset is limited to tags of a single category. """

    def __init__(self, subcategory):
        super(TagSubcategoryManager, self).__init__()
        # category value every queryset from this manager is filtered by
        self.subcategory = subcategory

    def get_query_set(self):
        """ Returns the default queryset narrowed to ``self.subcategory``. """
        base_queryset = super(TagSubcategoryManager, self).get_query_set()
        return base_queryset.filter(category=self.subcategory)
41
42
class Tag(TagBase):
    """ A catalogue tag.

        The kind of tag is stored in ``category`` (one of TAG_CATEGORIES).
        A slug is unique only within its category (see Meta.unique_together).
    """
    name = models.CharField(_('name'), max_length=50, db_index=True)
    slug = models.SlugField(_('slug'), max_length=120, db_index=True)
    sort_key = models.SlugField(_('sort key'), max_length=120, db_index=True)
    category = models.CharField(_('category'), max_length=50, blank=False, null=False,
        db_index=True, choices=TAG_CATEGORIES)
    description = models.TextField(_('description'), blank=True)
    main_page = models.BooleanField(_('main page'), default=False, db_index=True, help_text=_('Show tag on main page'))

    # NOTE(review): presumably the owner of a user-created tag -- confirm.
    user = models.ForeignKey(User, blank=True, null=True)
    # Cache filled by get_count(); None means "not computed yet".
    book_count = models.IntegerField(_('book count'), blank=False, null=True)
    gazeta_link = models.CharField(blank=True, max_length=240)
    wiki_link = models.CharField(blank=True, max_length=240)

    # Maps the category keyword used in tag URL paths to the stored category
    # value. There is no entry for the 'book' category.
    categories_rev = {
        'autor': 'author',
        'epoka': 'epoch',
        'rodzaj': 'kind',
        'gatunek': 'genre',
        'motyw': 'theme',
        'polka': 'set',
    }
    # Inverse of categories_rev: stored category value -> URL keyword.
    categories_dict = dict((item[::-1] for item in categories_rev.iteritems()))

    class Meta:
        ordering = ('sort_key',)
        verbose_name = _('tag')
        verbose_name_plural = _('tags')
        unique_together = (("slug", "category"),)

    def __unicode__(self):
        """ Returns the tag's name. """
        return self.name

    def __repr__(self):
        """ Returns a debugging representation showing only the slug. """
        return "Tag(slug=%r)" % self.slug
78
    @permalink
    def get_absolute_url(self):
        """ Returns the (view name, args) pair that ``permalink`` reverses
            into the URL of the tagged-object list for this tag.
        """
        return ('catalogue.views.tagged_object_list', [self.url_chunk])
82
83     def has_description(self):
84         return len(self.description) > 0
85     has_description.short_description = _('description')
86     has_description.boolean = True
87
88     def get_count(self):
89         """ returns global book count for book tags, fragment count for themes """
90
91         if self.book_count is None:
92             if self.category == 'book':
93                 # never used
94                 objects = Book.objects.none()
95             elif self.category == 'theme':
96                 objects = Fragment.tagged.with_all((self,))
97             else:
98                 objects = Book.tagged.with_all((self,)).order_by()
99                 if self.category != 'set':
100                     # eliminate descendants
101                     l_tags = Tag.objects.filter(slug__in=[book.book_tag_slug() for book in objects])
102                     descendants_keys = [book.pk for book in Book.tagged.with_any(l_tags)]
103                     if descendants_keys:
104                         objects = objects.exclude(pk__in=descendants_keys)
105             self.book_count = objects.count()
106             self.save()
107         return self.book_count
108
109     @staticmethod
110     def get_tag_list(tags):
111         if isinstance(tags, basestring):
112             real_tags = []
113             ambiguous_slugs = []
114             category = None
115             tags_splitted = tags.split('/')
116             for index, name in enumerate(tags_splitted):
117                 if name in Tag.categories_rev:
118                     category = Tag.categories_rev[name]
119                 else:
120                     if category:
121                         real_tags.append(Tag.objects.get(slug=name, category=category))
122                         category = None
123                     else:
124                         try:
125                             real_tags.append(Tag.objects.exclude(category='book').get(slug=name))
126                         except Tag.MultipleObjectsReturned, e:
127                             ambiguous_slugs.append(name)
128
129             if category:
130                 # something strange left off
131                 raise Tag.DoesNotExist()
132             if ambiguous_slugs:
133                 # some tags should be qualified
134                 e = Tag.MultipleObjectsReturned()
135                 e.tags = real_tags
136                 e.ambiguous_slugs = ambiguous_slugs
137                 raise e
138             else:
139                 return real_tags
140         else:
141             return TagBase.get_tag_list(tags)
142
143     @property
144     def url_chunk(self):
145         return '/'.join((Tag.categories_dict[self.category], self.slug))
146
147
def book_upload_path(ext, directory='lektura'):
    """ Builds an ``upload_to`` callable for a book file with the given extension.

        The returned function ignores the uploaded file's own name and yields
        '<directory>/<book slug>.<ext>'. ``directory`` defaults to 'lektura',
        the value that used to be hard-coded here.
    """
    def get_dynamic_path(book, filename):
        return '%s/%s.%s' % (directory, book.slug, ext)
    return get_dynamic_path
153
154
class Book(models.Model):
    """ A book in the catalogue, optionally a part of another book (``parent``). """
    title = models.CharField(_('title'), max_length=120)
    slug = models.SlugField(_('slug'), max_length=120, unique=True, db_index=True)
    description = models.TextField(_('description'), blank=True)
    # NOTE(review): auto_now=True refreshes this on every save although the
    # label says 'creation date' -- confirm whether auto_now_add was intended.
    created_at = models.DateTimeField(_('creation date'), auto_now=True)
    # Cached rendered snippet; attributes whose names start with
    # '_short_html' are blanked by save() unless reset_short_html is False.
    _short_html = models.TextField(_('short HTML'), editable=False)
    # Position of this book among its parent's parts.
    parent_number = models.IntegerField(_('parent number'), default=0)
    extra_info = JSONField(_('extra information'))
    gazeta_link = models.CharField(blank=True, max_length=240)
    wiki_link = models.CharField(blank=True, max_length=240)


    # Formats
    xml_file = models.FileField(_('XML file'), upload_to=book_upload_path('xml'), blank=True)
    html_file = models.FileField(_('HTML file'), upload_to=book_upload_path('html'), blank=True)
    pdf_file = models.FileField(_('PDF file'), upload_to=book_upload_path('pdf'), blank=True)
    epub_file = models.FileField(_('EPUB file'), upload_to=book_upload_path('epub'), blank=True)
    odt_file = models.FileField(_('ODT file'), upload_to=book_upload_path('odt'), blank=True)
    txt_file = models.FileField(_('TXT file'), upload_to=book_upload_path('txt'), blank=True)
    mp3_file = models.FileField(_('MP3 file'), upload_to=book_upload_path('mp3'), blank=True)
    ogg_file = models.FileField(_('OGG file'), upload_to=book_upload_path('ogg'), blank=True)
    daisy_file = models.FileField(_('DAISY file'), upload_to=book_upload_path('daisy.zip'), blank=True)

    parent = models.ForeignKey('self', blank=True, null=True, related_name='children')

    objects = models.Manager()
    tagged = managers.ModelTaggedItemManager(Tag)
    tags = managers.TagDescriptor(Tag)

    # Cached per-tag counters; None means the cache has to be rebuilt
    # (see the tag_counter and theme_counter properties).
    _tag_counter = JSONField(null=True, editable=False)
    _theme_counter = JSONField(null=True, editable=False)

    class AlreadyExists(Exception):
        # Raised by from_text_and_meta() when the book exists and
        # overwriting was not requested.
        pass

    class Meta:
        ordering = ('title',)
        verbose_name = _('book')
        verbose_name_plural = _('books')

    def __unicode__(self):
        """ Returns the book's title. """
        return self.title
197
198     def save(self, force_insert=False, force_update=False, reset_short_html=True, refresh_mp3=True, **kwargs):
199         if reset_short_html:
200             # Reset _short_html during save
201             update = {}
202             for key in filter(lambda x: x.startswith('_short_html'), self.__dict__):
203                 update[key] = ''
204                 self.__setattr__(key, '')
205             # Fragment.short_html relies on book's tags, so reset it here too
206             self.fragments.all().update(**update)
207
208         book = super(Book, self).save(force_insert, force_update)
209
210         if refresh_mp3 and self.mp3_file:
211             print self.mp3_file, self.mp3_file.path
212             extra_info = self.get_extra_info_value()
213             extra_info.update(self.get_mp3_info())
214             self.set_extra_info_value(extra_info)
215             book = super(Book, self).save(force_insert, force_update)
216
217         return book
218
    @permalink
    def get_absolute_url(self):
        """ Returns the (view name, args) pair that ``permalink`` reverses
            into the URL of the book detail view for this book's slug.
        """
        return ('catalogue.views.book_detail', [self.slug])
222
    @property
    def name(self):
        """ Alias for ``title``, so a book offers the same ``name`` attribute as a tag. """
        return self.title
226
227     def book_tag_slug(self):
228         return ('l-' + self.slug)[:120]
229
230     def book_tag(self):
231         slug = self.book_tag_slug()
232         book_tag, created = Tag.objects.get_or_create(slug=slug, category='book')
233         if created:
234             book_tag.name = self.title[:50]
235             book_tag.sort_key = slug
236             book_tag.save()
237         return book_tag
238
239     def short_html(self):
240         key = '_short_html_%s' % get_language()
241         short_html = getattr(self, key)
242
243         if short_html and len(short_html):
244             return mark_safe(short_html)
245         else:
246             tags = self.tags.filter(~Q(category__in=('set', 'theme', 'book')))
247             tags = [mark_safe(u'<a href="%s">%s</a>' % (tag.get_absolute_url(), tag.name)) for tag in tags]
248
249             formats = []
250             if self.html_file:
251                 formats.append(u'<a href="%s">%s</a>' % (reverse('book_text', kwargs={'slug': self.slug}), _('Read online')))
252             if self.pdf_file:
253                 formats.append(u'<a href="%s">PDF</a>' % self.pdf_file.url)
254             if self.root_ancestor.epub_file:
255                 formats.append(u'<a href="%s">EPUB</a>' % self.root_ancestor.epub_file.url)
256             if self.odt_file:
257                 formats.append(u'<a href="%s">ODT</a>' % self.odt_file.url)
258             if self.txt_file:
259                 formats.append(u'<a href="%s">TXT</a>' % self.txt_file.url)
260             if self.mp3_file:
261                 formats.append(u'<a href="%s">MP3</a>' % self.mp3_file.url)
262             if self.ogg_file:
263                 formats.append(u'<a href="%s">OGG</a>' % self.ogg_file.url)
264             if self.daisy_file:
265                 formats.append(u'<a href="%s">DAISY</a>' % self.daisy_file.url)
266
267             formats = [mark_safe(format) for format in formats]
268
269             setattr(self, key, unicode(render_to_string('catalogue/book_short.html',
270                 {'book': self, 'tags': tags, 'formats': formats})))
271             self.save(reset_short_html=False)
272             return mark_safe(getattr(self, key))
273
274
275     @property
276     def root_ancestor(self):
277         """ returns the oldest ancestor """
278
279         if not hasattr(self, '_root_ancestor'):
280             book = self
281             while book.parent:
282                 book = book.parent
283             self._root_ancestor = book
284         return self._root_ancestor
285
286
287     def get_mp3_info(self):
288         """Retrieves artist and director names from audio ID3 tags."""
289         audio = id3.ID3(self.mp3_file.path)
290         artist_name = ', '.join(', '.join(tag.text) for tag in audio.getall('TPE1'))
291         director_name = ', '.join(', '.join(tag.text) for tag in audio.getall('TPE3'))
292         return {'artist_name': artist_name, 'director_name': director_name}
293
294     def has_description(self):
295         return len(self.description) > 0
296     has_description.short_description = _('description')
297     has_description.boolean = True
298
    def has_pdf_file(self):
        """ Returns True if a PDF file is attached to the book. """
        return bool(self.pdf_file)
    has_pdf_file.short_description = 'PDF'
    has_pdf_file.boolean = True
303
    def has_epub_file(self):
        """ Returns True if an EPUB file is attached to this very book
            (ancestors are not consulted).
        """
        return bool(self.epub_file)
    has_epub_file.short_description = 'EPUB'
    has_epub_file.boolean = True
308
    def has_odt_file(self):
        """ Returns True if an ODT file is attached to the book. """
        return bool(self.odt_file)
    has_odt_file.short_description = 'ODT'
    has_odt_file.boolean = True
313
    def has_html_file(self):
        """ Returns True if an HTML file is attached to the book. """
        return bool(self.html_file)
    has_html_file.short_description = 'HTML'
    has_html_file.boolean = True
318
319     def build_epub(self, remove_descendants=True):
320         """ (Re)builds the epub file.
321             If book has a parent, does nothing.
322             Unless remove_descendants is False, descendants' epubs are removed.
323         """
324     
325         from StringIO import StringIO
326         from hashlib import sha1
327         from django.core.files.base import ContentFile
328         from librarian import DocProvider
329
330         class BookImportDocProvider(DocProvider):
331             """ used for joined EPUBs """
332
333             def __init__(self, book):
334                 self.book = book
335
336             def by_slug(self, slug):
337                 if slug == self.book.slug:
338                     return self.book.xml_file
339                 else:
340                     return Book.objects.get(slug=slug).xml_file
341
342         if self.parent:
343             # don't need an epub
344             return
345
346         epub_file = StringIO()
347         try:
348             epub.transform(BookImportDocProvider(self), self.slug, output_file=epub_file)
349             self.epub_file.save('%s.epub' % self.slug, ContentFile(epub_file.getvalue()), save=False)
350             self.save()
351             FileRecord(slug=self.slug, type='epub', sha1=sha1(epub_file.getvalue()).hexdigest()).save()
352         except NoDublinCore:
353             pass
354
355         book_descendants = list(self.children.all())
356         while len(book_descendants) > 0:
357             child_book = book_descendants.pop(0)
358             if remove_descendants and child_book.has_epub_file():
359                 child_book.epub_file.delete()
360             # save anyway, to refresh short_html
361             child_book.save()
362             book_descendants += list(child_book.children.all())
363
364
365     @classmethod
366     def from_xml_file(cls, xml_file, overwrite=False, build_epub=True):
367         # use librarian to parse meta-data
368         book_info = dcparser.parse(xml_file)
369
370         if not isinstance(xml_file, File):
371             xml_file = File(open(xml_file))
372
373         try:
374             return cls.from_text_and_meta(xml_file, book_info, overwrite, build_epub=build_epub)
375         finally:
376             xml_file.close()
377
378     @classmethod
379     def from_text_and_meta(cls, raw_file, book_info, overwrite=False, build_epub=True):
380         from tempfile import NamedTemporaryFile
381         from slughifi import slughifi
382         from markupstring import MarkupString
383         from django.core.files.storage import default_storage
384
385         # Read book metadata
386         book_base, book_slug = book_info.url.rsplit('/', 1)
387         book, created = Book.objects.get_or_create(slug=book_slug)
388
389         if created:
390             book_shelves = []
391         else:
392             if not overwrite:
393                 raise Book.AlreadyExists(_('Book %s already exists') % book_slug)
394             # Save shelves for this book
395             book_shelves = list(book.tags.filter(category='set'))
396
397         book.title = book_info.title
398         book.set_extra_info_value(book_info.to_dict())
399         book._short_html = ''
400         book.save()
401
402         book_tags = []
403         categories = (('kinds', 'kind'), ('genres', 'genre'), ('authors', 'author'), ('epochs', 'epoch'))
404         for field_name, category in categories:
405             try:
406                 tag_names = getattr(book_info, field_name)
407             except:
408                 tag_names = [getattr(book_info, category)]
409             for tag_name in tag_names:
410                 tag_sort_key = tag_name
411                 if category == 'author':
412                     tag_sort_key = tag_name.last_name
413                     tag_name = ' '.join(tag_name.first_names) + ' ' + tag_name.last_name
414                 tag, created = Tag.objects.get_or_create(slug=slughifi(tag_name), category=category)
415                 if created:
416                     tag.name = tag_name
417                     tag.sort_key = slughifi(tag_sort_key)
418                     tag.save()
419                 book_tags.append(tag)
420
421         book.tags = book_tags + book_shelves
422
423         book_tag = book.book_tag()
424
425         if hasattr(book_info, 'parts'):
426             for n, part_url in enumerate(book_info.parts):
427                 base, slug = part_url.rsplit('/', 1)
428                 try:
429                     child_book = Book.objects.get(slug=slug)
430                     child_book.parent = book
431                     child_book.parent_number = n
432                     child_book.save()
433                 except Book.DoesNotExist, e:
434                     raise Book.DoesNotExist(_('Book with slug = "%s" does not exist.') % slug)
435
436         # Save XML and HTML files
437         book.xml_file.save('%s.xml' % book.slug, raw_file, save=False)
438
439         # delete old fragments when overwriting
440         book.fragments.all().delete()
441
442         html_file = NamedTemporaryFile()
443         if html.transform(book.xml_file.path, html_file, parse_dublincore=False):
444             book.html_file.save('%s.html' % book.slug, File(html_file), save=False)
445
446             # get ancestor l-tags for adding to new fragments
447             ancestor_tags = []
448             p = book.parent
449             while p:
450                 ancestor_tags.append(p.book_tag())
451                 p = p.parent
452
453             # Extract fragments
454             closed_fragments, open_fragments = html.extract_fragments(book.html_file.path)
455             for fragment in closed_fragments.values():
456                 try:
457                     theme_names = [s.strip() for s in fragment.themes.split(',')]
458                 except AttributeError:
459                     continue
460                 themes = []
461                 for theme_name in theme_names:
462                     if not theme_name:
463                         continue
464                     tag, created = Tag.objects.get_or_create(slug=slughifi(theme_name), category='theme')
465                     if created:
466                         tag.name = theme_name
467                         tag.sort_key = slughifi(theme_name)
468                         tag.save()
469                     themes.append(tag)
470                 if not themes:
471                     continue
472
473                 text = fragment.to_string()
474                 short_text = ''
475                 if (len(MarkupString(text)) > 240):
476                     short_text = unicode(MarkupString(text)[:160])
477                 new_fragment, created = Fragment.objects.get_or_create(anchor=fragment.id, book=book,
478                     defaults={'text': text, 'short_text': short_text})
479
480                 new_fragment.save()
481                 new_fragment.tags = set(book_tags + themes + [book_tag] + ancestor_tags)
482
483         if build_epub and not book.parent:
484             print 'epub'
485             book.build_epub(remove_descendants=False)
486
487         book_descendants = list(book.children.all())
488         # add l-tag to descendants and their fragments
489         # delete unnecessary EPUB files
490         while len(book_descendants) > 0:
491             child_book = book_descendants.pop(0)
492             child_book.tags = list(child_book.tags) + [book_tag]
493             if child_book.has_epub_file():
494                 child_book.epub_file.delete()
495             child_book.save()
496             for fragment in child_book.fragments.all():
497                 fragment.tags = set(list(fragment.tags) + [book_tag])
498             book_descendants += list(child_book.children.all())
499
500         # refresh cache
501         book.reset_tag_counter()
502         book.reset_theme_counter()
503
504         book.save()
505         return book
506
507
508     def refresh_tag_counter(self):
509         tags = {}
510         for child in self.children.all().order_by():
511             for tag_pk, value in child.tag_counter.iteritems():
512                 tags[tag_pk] = tags.get(tag_pk, 0) + value
513         for tag in self.tags.exclude(category__in=('book', 'theme', 'set')).order_by():
514             tags[tag.pk] = 1
515         self.set__tag_counter_value(tags)
516         self.save(reset_short_html=False, refresh_mp3=False)
517         return tags
518
519     def reset_tag_counter(self):
520         self._tag_counter = None
521         self.save(reset_short_html=False, refresh_mp3=False)
522         if self.parent:
523             self.parent.reset_tag_counter()
524
525     @property
526     def tag_counter(self):
527         if self._tag_counter is None:
528             return self.refresh_tag_counter()
529         return dict((int(k), v) for k, v in self.get__tag_counter_value().iteritems())
530
531     def refresh_theme_counter(self):
532         tags = {}
533         for fragment in Fragment.tagged.with_any([self.book_tag()]).order_by():
534             for tag in fragment.tags.filter(category='theme').order_by():
535                 tags[tag.pk] = tags.get(tag.pk, 0) + 1
536         self.set__theme_counter_value(tags)
537         self.save(reset_short_html=False, refresh_mp3=False)
538         return tags
539
540     def reset_theme_counter(self):
541         self._theme_counter = None
542         self.save(reset_short_html=False, refresh_mp3=False)
543         if self.parent:
544             self.parent.reset_theme_counter()
545
546     @property
547     def theme_counter(self):
548         if self._theme_counter is None:
549             return self.refresh_theme_counter()
550         return dict((int(k), v) for k, v in self.get__theme_counter_value().iteritems())
551
552     def pretty_title(self, html_links=False):
553         book = self
554         names = list(book.tags.filter(category='author'))
555
556         books = []
557         while book:
558             books.append(book)
559             book = book.parent
560         names.extend(reversed(books))
561
562         if html_links:
563             names = ['<a href="%s">%s</a>' % (tag.get_absolute_url(), tag.name) for tag in names]
564         else:
565             names = [tag.name for tag in names]
566
567         return ', '.join(names)
568
569
class Fragment(models.Model):
    """ A passage of a book, addressed by an anchor in the book's text. """
    text = models.TextField()
    short_text = models.TextField(editable=False)
    _short_html = models.TextField(editable=False)
    anchor = models.CharField(max_length=120)
    book = models.ForeignKey(Book, related_name='fragments')

    objects = models.Manager()
    tagged = managers.ModelTaggedItemManager(Tag)
    tags = managers.TagDescriptor(Tag)

    class Meta:
        ordering = ('book', 'anchor',)
        verbose_name = _('fragment')
        verbose_name_plural = _('fragments')

    def get_absolute_url(self):
        """ Returns the URL of the book text with '#m<anchor>' appended. """
        book_text_url = reverse('book_text', kwargs={'slug': self.book.slug})
        return '%s#m%s' % (book_text_url, self.anchor)

    def short_html(self):
        """ Returns the short HTML snippet of the fragment for the active language.

            An empty '_short_html_<language>' attribute is filled by rendering
            'catalogue/fragment_short.html' and the fragment is saved.
        """
        key = '_short_html_%s' % get_language()
        cached = getattr(self, key)
        if cached:
            return mark_safe(cached)

        rendered = unicode(render_to_string('catalogue/fragment_short.html',
            {'fragment': self}))
        setattr(self, key, rendered)
        self.save()
        return mark_safe(getattr(self, key))
599
600
class FileRecord(models.Model):
    """ A record of a generated file: slug, file type, SHA-1 of the content
        and the time the record was created.
    """
    slug = models.SlugField(_('slug'), max_length=120, db_index=True)
    type = models.CharField(_('type'), max_length=20, db_index=True)
    sha1 = models.CharField(_('sha-1 hash'), max_length=40)
    time = models.DateTimeField(_('time'), auto_now_add=True)

    class Meta:
        # newest records first
        ordering = ('-time','-slug', '-type')
        verbose_name = _('file record')
        verbose_name_plural = _('file records')

    def __unicode__(self):
        """ Returns '<sha1> <slug>.<type>'. """
        return "%s %s.%s" % (self.sha1,  self.slug, self.type)
614
615
def _tags_updated_handler(sender, affected_tags, **kwargs):
    """ Invalidates cached counters when the tags of an object change.

        Connected to the ``tags_updated`` signal. Clears ``book_count`` of
        every affected tag. If the sender is a Book and any affected tag is
        not a 'book', 'theme' or 'set' tag, the book's tag counter is reset.
        If the sender is a Fragment and any affected tag is a theme, the theme
        counter of the fragment's book is reset.
    """
    # Collect the keys once: affected_tags is then iterated a single time,
    # which also works for a one-shot iterable.
    affected_pks = [tag.pk for tag in affected_tags]
    affected = Tag.objects.filter(pk__in=affected_pks)

    # reset tag global counter
    affected.update(book_count=None)

    if isinstance(sender, Book):
        # if book tags changed, reset book tag counter
        if affected.exclude(category__in=('book', 'theme', 'set')).count():
            sender.reset_tag_counter()
    elif isinstance(sender, Fragment):
        # if fragment theme changed, reset book theme counter
        if affected.filter(category='theme').count():
            sender.book.reset_theme_counter()
tags_updated.connect(_tags_updated_handler)
631