some book import cleanup
[wolnelektury.git] / apps / catalogue / models.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.db import models
6 from django.db.models import permalink, Q
7 from django.utils.translation import ugettext_lazy as _
8 from django.contrib.auth.models import User
9 from django.core.files import File
10 from django.template.loader import render_to_string
11 from django.utils.safestring import mark_safe
12 from django.utils.translation import get_language
13 from django.core.urlresolvers import reverse
14 from datetime import datetime
15
16 from newtagging.models import TagBase, tags_updated
17 from newtagging import managers
18 from catalogue.fields import JSONField
19
20 from librarian import dcparser, html, epub, NoDublinCore
21 from mutagen import id3
22
23
# (value, translatable label) pairs used as the ``choices`` of ``Tag.category``.
# The labels are kept as literal ``_()`` calls so each one is marked for translation.
TAG_CATEGORIES = (
    ('author', _('author')),
    ('epoch', _('epoch')),
    ('kind', _('kind')),
    ('genre', _('genre')),
    ('theme', _('theme')),
    ('set', _('set')),
    ('book', _('book')),
)
33
34
class TagSubcategoryManager(models.Manager):
    """Manager whose querysets are limited to one tag category."""

    def __init__(self, subcategory):
        """Store ``subcategory``, the value matched against ``category``."""
        super(TagSubcategoryManager, self).__init__()
        self.subcategory = subcategory

    def get_query_set(self):
        """Return the default queryset filtered by ``category=self.subcategory``."""
        base_queryset = super(TagSubcategoryManager, self).get_query_set()
        return base_queryset.filter(category=self.subcategory)
42
43
class Tag(TagBase):
    """A catalogue tag; ``category`` is one of the values in TAG_CATEGORIES."""

    name = models.CharField(_('name'), max_length=50, db_index=True)
    slug = models.SlugField(_('slug'), max_length=120, db_index=True)
    sort_key = models.SlugField(_('sort key'), max_length=120, db_index=True)
    category = models.CharField(_('category'), max_length=50, blank=False, null=False,
        db_index=True, choices=TAG_CATEGORIES)
    description = models.TextField(_('description'), blank=True)
    main_page = models.BooleanField(_('main page'), default=False, db_index=True, help_text=_('Show tag on main page'))

    user = models.ForeignKey(User, blank=True, null=True)
    # Cached result of get_count(); None means "not computed yet".
    book_count = models.IntegerField(_('book count'), blank=False, null=True)
    death = models.IntegerField(_(u'year of death'), blank=True, null=True)
    gazeta_link = models.CharField(blank=True, max_length=240)
    wiki_link = models.CharField(blank=True, max_length=240)

    # URL path keyword -> category name.
    categories_rev = {
        'autor': 'author',
        'epoka': 'epoch',
        'rodzaj': 'kind',
        'gatunek': 'genre',
        'motyw': 'theme',
        'polka': 'set',
    }
    # Category name -> URL path keyword (inverse of categories_rev).
    categories_dict = dict((category, keyword) for keyword, category in categories_rev.iteritems())

    class Meta:
        ordering = ('sort_key',)
        verbose_name = _('tag')
        verbose_name_plural = _('tags')
        unique_together = (("slug", "category"),)

    def __unicode__(self):
        return self.name

    def __repr__(self):
        return "Tag(slug=%r)" % self.slug

    @permalink
    def get_absolute_url(self):
        return ('catalogue.views.tagged_object_list', [self.url_chunk])

    def has_description(self):
        """Tell whether the description is non-empty."""
        return len(self.description) > 0
    has_description.short_description = _('description')
    has_description.boolean = True

    def alive(self):
        """Tell whether no year of death is recorded."""
        return self.death is None

    def in_pd(self):
        """ tests whether an author is in public domain """
        return self.death is not None and self.goes_to_pd() <= datetime.now().year

    def goes_to_pd(self):
        """ calculates the year of public domain entry for an author

        Returns ``death + 71``, or None when no year of death is recorded.
        """
        return self.death + 71 if self.death is not None else None

    def get_count(self):
        """ returns global book count for book tags, fragment count for themes

        The value is computed only while ``book_count`` is None; it is then
        stored in ``book_count`` and the tag is saved.
        """

        if self.book_count is None:
            if self.category == 'book':
                # never used
                objects = Book.objects.none()
            elif self.category == 'theme':
                objects = Fragment.tagged.with_all((self,))
            else:
                objects = Book.tagged.with_all((self,)).order_by()
                if self.category != 'set':
                    # eliminate descendants: drop every book that carries the
                    # book tag of one of the books found above
                    l_tags = Tag.objects.filter(slug__in=[book.book_tag_slug() for book in objects])
                    descendants_keys = [book.pk for book in Book.tagged.with_any(l_tags)]
                    if descendants_keys:
                        objects = objects.exclude(pk__in=descendants_keys)
            self.book_count = objects.count()
            self.save()
        return self.book_count

    @staticmethod
    def get_tag_list(tags):
        """Resolve ``tags`` into a list of Tag objects.

        A string is split on '/'.  A chunk that is a key of ``categories_rev``
        selects the category for the chunk that follows it; any other chunk is
        a slug, looked up in that category or, when none was selected, among
        all tags except those of category 'book'.  Anything that is not a
        string is passed on to ``TagBase.get_tag_list``.

        Raises ``Tag.DoesNotExist`` when a category keyword is not followed by
        a slug (a failed ``get`` lookup raises it as well), and
        ``Tag.MultipleObjectsReturned`` when unqualified slugs are ambiguous;
        that exception carries the resolved tags in ``tags`` and the
        offending slugs in ``ambiguous_slugs``.
        """
        if not isinstance(tags, basestring):
            return TagBase.get_tag_list(tags)

        real_tags = []
        ambiguous_slugs = []
        category = None
        for name in tags.split('/'):
            if name in Tag.categories_rev:
                category = Tag.categories_rev[name]
            elif category:
                real_tags.append(Tag.objects.get(slug=name, category=category))
                category = None
            else:
                try:
                    real_tags.append(Tag.objects.exclude(category='book').get(slug=name))
                except Tag.MultipleObjectsReturned:
                    ambiguous_slugs.append(name)

        if category:
            # something strange left off
            raise Tag.DoesNotExist()
        if ambiguous_slugs:
            # some tags should be qualified
            error = Tag.MultipleObjectsReturned()
            error.tags = real_tags
            error.ambiguous_slugs = ambiguous_slugs
            raise error
        return real_tags

    @property
    def url_chunk(self):
        """URL fragment of the form '<category keyword>/<slug>'."""
        return '/'.join((Tag.categories_dict[self.category], self.slug))
159
160
# TODO: why is this hard-coded ?
def book_upload_path(ext):
    """Return an ``upload_to`` callable giving 'lektura/<book slug>.<ext>'."""
    def get_dynamic_path(instance, original_name):
        # The name of the uploaded file is ignored; only the slug matters.
        path_parts = (instance.slug, ext)
        return 'lektura/%s.%s' % path_parts
    return get_dynamic_path
166
167
class Book(models.Model):
    """A catalogue book with its format files, tags and cached counters.

    Books are linked into a tree through ``parent`` (related name ``children``).
    """

    title = models.CharField(_('title'), max_length=120)
    slug = models.SlugField(_('slug'), max_length=120, unique=True, db_index=True)
    description = models.TextField(_('description'), blank=True)
    # NOTE(review): labelled 'creation date', but auto_now refreshes the value
    # on every save (auto_now_add would keep the creation time) -- confirm intent.
    created_at = models.DateTimeField(_('creation date'), auto_now=True)
    _short_html = models.TextField(_('short HTML'), editable=False)
    parent_number = models.IntegerField(_('parent number'), default=0)
    extra_info = JSONField(_('extra information'))
    gazeta_link = models.CharField(blank=True, max_length=240)
    wiki_link = models.CharField(blank=True, max_length=240)


    # Formats
    xml_file = models.FileField(_('XML file'), upload_to=book_upload_path('xml'), blank=True)
    html_file = models.FileField(_('HTML file'), upload_to=book_upload_path('html'), blank=True)
    pdf_file = models.FileField(_('PDF file'), upload_to=book_upload_path('pdf'), blank=True)
    epub_file = models.FileField(_('EPUB file'), upload_to=book_upload_path('epub'), blank=True)
    odt_file = models.FileField(_('ODT file'), upload_to=book_upload_path('odt'), blank=True)
    txt_file = models.FileField(_('TXT file'), upload_to=book_upload_path('txt'), blank=True)
    mp3_file = models.FileField(_('MP3 file'), upload_to=book_upload_path('mp3'), blank=True)
    ogg_file = models.FileField(_('OGG file'), upload_to=book_upload_path('ogg'), blank=True)

    parent = models.ForeignKey('self', blank=True, null=True, related_name='children')

    objects = models.Manager()
    tagged = managers.ModelTaggedItemManager(Tag)
    tags = managers.TagDescriptor(Tag)

    # Cached counters; None means "needs to be recomputed".
    _tag_counter = JSONField(null=True, editable=False)
    _theme_counter = JSONField(null=True, editable=False)

    class AlreadyExists(Exception):
        """Raised by import when the book exists and overwrite was not requested."""

    class Meta:
        ordering = ('title',)
        verbose_name = _('book')
        verbose_name_plural = _('books')

    def __unicode__(self):
        return self.title

    def save(self, force_insert=False, force_update=False, reset_short_html=True, refresh_mp3=True, **kwargs):
        """Save the book, optionally clearing cached HTML and refreshing MP3 info.

        reset_short_html -- blank every ``_short_html*`` attribute of this book
            and the same-named fields of all its fragments before saving.
        refresh_mp3 -- when an MP3 file is attached, merge the result of
            ``get_mp3_info()`` into ``extra_info`` and save a second time.
        Remaining keyword arguments are passed on to ``Model.save``.
        """
        if reset_short_html:
            # Reset _short_html during save
            update = {}
            for key in [name for name in self.__dict__ if name.startswith('_short_html')]:
                update[key] = ''
                setattr(self, key, '')
            # Fragment.short_html relies on book's tags, so reset it here too
            self.fragments.all().update(**update)

        result = super(Book, self).save(force_insert, force_update, **kwargs)

        if refresh_mp3 and self.mp3_file:
            extra_info = self.get_extra_info_value()
            extra_info.update(self.get_mp3_info())
            self.set_extra_info_value(extra_info)
            # The row already exists after the first save, so an INSERT must
            # never be forced here, whatever the caller asked for.
            result = super(Book, self).save(False, force_update, **kwargs)

        return result

    @permalink
    def get_absolute_url(self):
        return ('catalogue.views.book_detail', [self.slug])

    @property
    def name(self):
        return self.title

    def book_tag_slug(self):
        """Slug of this book's own tag: 'l-' + slug, cut to 120 characters."""
        return ('l-' + self.slug)[:120]

    def book_tag(self):
        """Return the 'book' category tag of this book, creating it if needed."""
        slug = self.book_tag_slug()
        book_tag, created = Tag.objects.get_or_create(slug=slug, category='book')
        if created:
            book_tag.name = self.title[:50]
            book_tag.sort_key = slug
            book_tag.save()
        return book_tag

    def short_html(self):
        """Return the short HTML for the active language, rendering it if empty.

        A freshly rendered value is stored in the ``_short_html_<language>``
        attribute and the book is saved without resetting the cache.
        """
        key = '_short_html_%s' % get_language()
        short_html = getattr(self, key)

        if short_html and len(short_html):
            return mark_safe(short_html)

        tags = self.tags.filter(~Q(category__in=('set', 'theme', 'book')))
        tags = [mark_safe(u'<a href="%s">%s</a>' % (tag.get_absolute_url(), tag.name)) for tag in tags]

        formats = []
        if self.html_file:
            formats.append(u'<a href="%s">%s</a>' % (reverse('book_text', kwargs={'slug': self.slug}), _('Read online')))
        if self.pdf_file:
            formats.append(u'<a href="%s">PDF</a>' % self.pdf_file.url)
        # The EPUB link comes from the oldest ancestor, not from this book.
        if self.root_ancestor.epub_file:
            formats.append(u'<a href="%s">EPUB</a>' % self.root_ancestor.epub_file.url)
        if self.odt_file:
            formats.append(u'<a href="%s">ODT</a>' % self.odt_file.url)
        if self.txt_file:
            formats.append(u'<a href="%s">TXT</a>' % self.txt_file.url)
        if self.mp3_file:
            formats.append(u'<a href="%s">MP3</a>' % self.mp3_file.url)
        if self.ogg_file:
            formats.append(u'<a href="%s">OGG</a>' % self.ogg_file.url)

        formats = [mark_safe(link) for link in formats]

        setattr(self, key, unicode(render_to_string('catalogue/book_short.html',
            {'book': self, 'tags': tags, 'formats': formats})))
        self.save(reset_short_html=False)
        return mark_safe(getattr(self, key))


    @property
    def root_ancestor(self):
        """ returns the oldest ancestor (memoized on the instance) """

        if not hasattr(self, '_root_ancestor'):
            book = self
            while book.parent:
                book = book.parent
            self._root_ancestor = book
        return self._root_ancestor


    def get_mp3_info(self):
        """Retrieves artist and director names from audio ID3 tags.

        Reads the TPE1 (artist) and TPE3 (director) frames of ``mp3_file``
        and returns a dict with 'artist_name' and 'director_name'.
        """
        audio = id3.ID3(self.mp3_file.path)
        artist_name = ', '.join(', '.join(tag.text) for tag in audio.getall('TPE1'))
        director_name = ', '.join(', '.join(tag.text) for tag in audio.getall('TPE3'))
        return {'artist_name': artist_name, 'director_name': director_name}

    def has_description(self):
        return len(self.description) > 0
    has_description.short_description = _('description')
    has_description.boolean = True

    def has_pdf_file(self):
        return bool(self.pdf_file)
    has_pdf_file.short_description = 'PDF'
    has_pdf_file.boolean = True

    def has_epub_file(self):
        return bool(self.epub_file)
    has_epub_file.short_description = 'EPUB'
    has_epub_file.boolean = True

    def has_odt_file(self):
        return bool(self.odt_file)
    has_odt_file.short_description = 'ODT'
    has_odt_file.boolean = True

    def has_html_file(self):
        return bool(self.html_file)
    has_html_file.short_description = 'HTML'
    has_html_file.boolean = True

    def build_epub(self, remove_descendants=True):
        """ (Re)builds the epub file.
            If book has a parent, does nothing.
            Unless remove_descendants is False, descendants' epubs are removed.

            On success a FileRecord with the SHA-1 of the generated file is
            saved.  A NoDublinCore error from the transformation is ignored.
        """

        from StringIO import StringIO
        from collections import deque
        from hashlib import sha1
        from django.core.files.base import ContentFile
        from librarian import DocProvider

        class BookImportDocProvider(DocProvider):
            """ used for joined EPUBs """

            def __init__(self, book):
                self.book = book

            def by_slug(self, slug):
                # Serve this (possibly still unsaved) book from memory and
                # every other book from the database.
                if slug == self.book.slug:
                    return self.book.xml_file
                else:
                    return Book.objects.get(slug=slug).xml_file

        if self.parent:
            # don't need an epub
            return

        epub_file = StringIO()
        try:
            epub.transform(BookImportDocProvider(self), self.slug, epub_file)
            epub_data = epub_file.getvalue()
            self.epub_file.save('%s.epub' % self.slug, ContentFile(epub_data), save=False)
            FileRecord(slug=self.slug, type='epub', sha1=sha1(epub_data).hexdigest()).save()
        except NoDublinCore:
            pass

        if remove_descendants:
            # Breadth-first walk over the whole subtree.
            pending = deque(self.children.all())
            while pending:
                child_book = pending.popleft()
                if child_book.has_epub_file():
                    child_book.epub_file.delete()
                pending.extend(child_book.children.all())


    @classmethod
    def from_xml_file(cls, xml_file, overwrite=False):
        """Import a book from ``xml_file``, a path or a Django ``File``.

        The file is closed before returning, also when it was passed in
        already opened.  See ``from_text_and_meta`` for ``overwrite``.
        """
        # use librarian to parse meta-data
        book_info = dcparser.parse(xml_file)

        if not isinstance(xml_file, File):
            xml_file = File(open(xml_file))

        try:
            return cls.from_text_and_meta(xml_file, book_info, overwrite)
        finally:
            xml_file.close()

    @classmethod
    def from_text_and_meta(cls, raw_file, book_info, overwrite=False):
        """Create or update a book from its XML source and parsed metadata.

        raw_file -- file object with the XML, stored as the book's XML file.
        book_info -- parsed metadata; the last path segment of its ``url``
            is used as the book slug.
        overwrite -- allow updating a book that already exists.

        Sets title, extra info and tags (shelves of an existing book are
        kept), attaches the books listed in ``book_info.parts`` as children,
        generates the HTML file and themed fragments, rebuilds the EPUB and
        tags all descendants and their fragments with this book's tag.

        Raises ``Book.AlreadyExists`` when the book exists and ``overwrite``
        is false, and ``Book.DoesNotExist`` when a listed part is missing.
        """
        from collections import deque
        from tempfile import NamedTemporaryFile
        from slughifi import slughifi
        from markupstring import MarkupString

        # Read book metadata
        book_slug = book_info.url.rsplit('/', 1)[-1]
        book, created = Book.objects.get_or_create(slug=book_slug)

        if created:
            book_shelves = []
        else:
            if not overwrite:
                raise Book.AlreadyExists(_('Book %s already exists') % book_slug)
            # Save shelves for this book
            book_shelves = list(book.tags.filter(category='set'))

        book.title = book_info.title
        book.set_extra_info_value(book_info.to_dict())
        book._short_html = ''
        book.save()

        book_tags = []
        categories = (('kinds', 'kind'), ('genres', 'genre'), ('authors', 'author'), ('epochs', 'epoch'))
        for field_name, category in categories:
            try:
                tag_names = getattr(book_info, field_name)
            except AttributeError:
                # No plural attribute: fall back to the single-valued one.
                tag_names = [getattr(book_info, category)]
            for tag_name in tag_names:
                tag_sort_key = tag_name
                if category == 'author':
                    tag_sort_key = tag_name.last_name
                    tag_name = ' '.join(tag_name.first_names) + ' ' + tag_name.last_name
                tag, tag_created = Tag.objects.get_or_create(slug=slughifi(tag_name), category=category)
                if tag_created:
                    tag.name = tag_name
                    tag.sort_key = slughifi(tag_sort_key)
                    tag.save()
                book_tags.append(tag)

        book.tags = book_tags + book_shelves

        book_tag = book.book_tag()

        if hasattr(book_info, 'parts'):
            for n, part_url in enumerate(book_info.parts):
                slug = part_url.rsplit('/', 1)[-1]
                try:
                    child_book = Book.objects.get(slug=slug)
                except Book.DoesNotExist:
                    raise Book.DoesNotExist(_('Book with slug = "%s" does not exist.') % slug)
                child_book.parent = book
                child_book.parent_number = n
                child_book.save()

        # Save XML and HTML files
        book.xml_file.save('%s.xml' % book.slug, raw_file, save=False)

        html_file = NamedTemporaryFile()
        try:
            html_built = html.transform(book.xml_file.path, html_file, parse_dublincore=False)
            if html_built:
                book.html_file.save('%s.html' % book.slug, File(html_file), save=False)
        finally:
            # The temporary file is no longer needed once its content is stored.
            html_file.close()

        if html_built:
            # Extract fragments
            closed_fragments = html.extract_fragments(book.html_file.path)[0]
            for fragment in closed_fragments.values():
                try:
                    theme_names = [s.strip() for s in fragment.themes.split(',')]
                except AttributeError:
                    continue
                themes = []
                for theme_name in theme_names:
                    if not theme_name:
                        continue
                    tag, tag_created = Tag.objects.get_or_create(slug=slughifi(theme_name), category='theme')
                    if tag_created:
                        tag.name = theme_name
                        tag.sort_key = slughifi(theme_name)
                        tag.save()
                    themes.append(tag)
                if not themes:
                    continue

                text = fragment.to_string()
                markup = MarkupString(text)
                short_text = ''
                if len(markup) > 240:
                    short_text = unicode(markup[:160])
                new_fragment, fragment_created = Fragment.objects.get_or_create(anchor=fragment.id, book=book,
                    defaults={'text': text, 'short_text': short_text})

                new_fragment.save()
                new_fragment.tags = set(book_tags + themes + [book_tag])

        book.build_epub(remove_descendants=False)

        # add l-tag to descendants and their fragments
        # delete unnecessary EPUB files
        book_descendants = deque(book.children.all())
        while book_descendants:
            child_book = book_descendants.popleft()
            child_book.tags = list(child_book.tags) + [book_tag]
            if child_book.has_epub_file():
                child_book.epub_file.delete()
            child_book.save()
            for fragment in child_book.fragments.all():
                fragment.tags = set(list(fragment.tags) + [book_tag])
            book_descendants.extend(child_book.children.all())

        # refresh cache: reading the properties recomputes counters that are None
        book.tag_counter
        book.theme_counter

        book.save()
        return book


    def refresh_tag_counter(self):
        """Recompute, store and return the ``{tag pk: count}`` mapping.

        Sums the counters of all children, then sets the entry of each of
        this book's own tags (other than book, theme and set tags) to 1.
        """
        tags = {}
        for child in self.children.all().order_by():
            for tag_pk, value in child.tag_counter.iteritems():
                tags[tag_pk] = tags.get(tag_pk, 0) + value
        for tag in self.tags.exclude(category__in=('book', 'theme', 'set')).order_by():
            tags[tag.pk] = 1
        self.set__tag_counter_value(tags)
        self.save(reset_short_html=False, refresh_mp3=False)
        return tags

    def reset_tag_counter(self):
        """Invalidate the tag counter of this book and of all its ancestors."""
        self._tag_counter = None
        self.save(reset_short_html=False, refresh_mp3=False)
        if self.parent:
            self.parent.reset_tag_counter()

    @property
    def tag_counter(self):
        """``{tag pk: count}`` mapping, recomputed when the cached value is None."""
        if self._tag_counter is None:
            return self.refresh_tag_counter()
        # Stored keys are converted back to integers.
        return dict((int(k), v) for k, v in self.get__tag_counter_value().iteritems())

    def refresh_theme_counter(self):
        """Recompute, store and return the ``{theme tag pk: fragment count}`` mapping.

        Counts theme tags over all fragments carrying this book's tag.
        """
        tags = {}
        for fragment in Fragment.tagged.with_any([self.book_tag()]).order_by():
            for tag in fragment.tags.filter(category='theme').order_by():
                tags[tag.pk] = tags.get(tag.pk, 0) + 1
        self.set__theme_counter_value(tags)
        self.save(reset_short_html=False, refresh_mp3=False)
        return tags

    def reset_theme_counter(self):
        """Invalidate the theme counter of this book and of all its ancestors."""
        self._theme_counter = None
        self.save(reset_short_html=False, refresh_mp3=False)
        if self.parent:
            self.parent.reset_theme_counter()

    @property
    def theme_counter(self):
        """``{theme tag pk: count}`` mapping, recomputed when the cached value is None."""
        if self._theme_counter is None:
            return self.refresh_theme_counter()
        # Stored keys are converted back to integers.
        return dict((int(k), v) for k, v in self.get__theme_counter_value().iteritems())
547
548
549
class Fragment(models.Model):
    """A fragment of a book's text, located by an anchor in the book's HTML."""

    text = models.TextField()
    short_text = models.TextField(editable=False)
    _short_html = models.TextField(editable=False)
    anchor = models.CharField(max_length=120)
    book = models.ForeignKey(Book, related_name='fragments')

    objects = models.Manager()
    tagged = managers.ModelTaggedItemManager(Tag)
    tags = managers.TagDescriptor(Tag)

    class Meta:
        ordering = ('book', 'anchor',)
        verbose_name = _('fragment')
        verbose_name_plural = _('fragments')

    def get_absolute_url(self):
        """URL of the book's text page, pointing at this fragment's anchor."""
        text_url = reverse('book_text', kwargs={'slug': self.book.slug})
        return '%s#m%s' % (text_url, self.anchor)

    def short_html(self):
        """Return the short HTML for the active language, rendering it if empty.

        A freshly rendered value is stored in the ``_short_html_<language>``
        attribute and the fragment is saved.
        """
        key = '_short_html_%s' % get_language()
        cached = getattr(self, key)
        if not (cached and len(cached)):
            rendered = render_to_string('catalogue/fragment_short.html',
                {'fragment': self})
            setattr(self, key, unicode(rendered))
            self.save()
            cached = getattr(self, key)
        return mark_safe(cached)
579
580
class BookStub(models.Model):
    """Stub record of a book: title, author, translator and public-domain year."""

    title = models.CharField(_('title'), max_length=120)
    author = models.CharField(_('author'), max_length=120)
    pd = models.IntegerField(_('goes to public domain'), null=True, blank=True)
    slug = models.SlugField(_('slug'), max_length=120, unique=True, db_index=True)
    translator = models.TextField(_('translator'), blank=True)
    translator_death = models.TextField(_('year of translator\'s death'), blank=True)

    class Meta:
        ordering = ('title',)
        verbose_name = _('book stub')
        verbose_name_plural = _('book stubs')

    def __unicode__(self):
        return self.title

    @permalink
    def get_absolute_url(self):
        view_name = 'catalogue.views.book_detail'
        return (view_name, [self.slug])

    def in_pd(self):
        """Tell whether ``pd`` is set and is not later than the current year."""
        if self.pd is None:
            return False
        current_year = datetime.now().year
        return self.pd <= current_year

    @property
    def name(self):
        """Alias of ``title``."""
        return self.title
607
608
class FileRecord(models.Model):
    """Timestamped SHA-1 of a generated file, keyed by slug and file type."""

    slug = models.SlugField(_('slug'), max_length=120, db_index=True)
    type = models.CharField(_('type'), max_length=20, db_index=True)
    sha1 = models.CharField(_('sha-1 hash'), max_length=40)
    time = models.DateTimeField(_('time'), auto_now_add=True)

    class Meta:
        ordering = ('-time', '-slug', '-type')
        verbose_name = _('file record')
        verbose_name_plural = _('file records')

    def __unicode__(self):
        # Rendered as "<sha1> <slug>.<type>".
        parts = (self.sha1, self.slug, self.type)
        return "%s %s.%s" % parts
622
623
def _tags_updated_handler(sender, affected_tags, **kwargs):
    """Invalidate cached counters after the tags of an object have changed.

    sender -- the object whose tags changed.
    affected_tags -- iterable of the tags involved; it is read only once.

    The cached ``book_count`` of every affected tag is cleared.  When a Book
    changed and any affected tag is not a book, theme or set tag, the book's
    tag counter is reset.  When a Fragment changed and any affected tag is a
    theme, the theme counter of the fragment's book is reset.
    """
    # Materialise the keys once so that one-shot iterables work as well and
    # the ORM always receives a plain list.
    affected_pks = [tag.pk for tag in affected_tags]

    # reset tag global counter
    Tag.objects.filter(pk__in=affected_pks).update(book_count=None)

    if isinstance(sender, Book):
        # if book tags changed, reset book tag counter
        changed = Tag.objects.filter(pk__in=affected_pks).exclude(
            category__in=('book', 'theme', 'set'))
        if changed.count():
            sender.reset_tag_counter()
    elif isinstance(sender, Fragment):
        # if fragment theme changed, reset book theme counter
        changed = Tag.objects.filter(pk__in=affected_pks).filter(category='theme')
        if changed.count():
            sender.book.reset_theme_counter()
tags_updated.connect(_tags_updated_handler)
639