new librarian sub-module: custom pdfs

[wolnelektury.git] / apps / catalogue / models.py
diff --git a/apps/catalogue/models.py b/apps/catalogue/models.py

index 84bcfd3..1a2e8f8 100644 (file)
--- a/apps/catalogue/models.py
+++ b/apps/catalogue/models.py
@@ -2,8 +2,12 @@
  # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
  # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
+from datetime import datetime
+
  from django.db import models
  from django.db.models import permalink, Q
  from django.db import models
  from django.db.models import permalink, Q
+import django.dispatch
+from django.core.cache import cache
  from django.utils.translation import ugettext_lazy as _
  from django.contrib.auth.models import User
  from django.core.files import File
  from django.utils.translation import ugettext_lazy as _
  from django.contrib.auth.models import User
  from django.core.files import File
@@ -17,13 +21,17 @@ from django.conf import settings
  
  from newtagging.models import TagBase, tags_updated
  from newtagging import managers
  
  from newtagging.models import TagBase, tags_updated
  from newtagging import managers
-from catalogue.fields import JSONField
+from catalogue.fields import JSONField, OverwritingFileField
+from catalogue.utils import ExistingFile, ORMDocProvider, create_zip, remove_zip
  
  from librarian import dcparser, html, epub, NoDublinCore
  import mutagen
  from mutagen import id3
  from slughifi import slughifi
  
  from librarian import dcparser, html, epub, NoDublinCore
  import mutagen
  from mutagen import id3
  from slughifi import slughifi
+from sortify import sortify
+from os import unlink
  
  
+import search
  
  TAG_CATEGORIES = (
      ('author', _('author')),
  
  TAG_CATEGORIES = (
      ('author', _('author')),
@@ -42,6 +50,10 @@ MEDIA_FORMATS = (
      ('daisy', _('DAISY file')), 
  )
  
      ('daisy', _('DAISY file')), 
  )
  
+# not quite, but Django wants you to set a timeout
+CACHE_FOREVER = 2419200  # 28 days
+
+
  class TagSubcategoryManager(models.Manager):
      def __init__(self, subcategory):
          super(TagSubcategoryManager, self).__init__()
  class TagSubcategoryManager(models.Manager):
      def __init__(self, subcategory):
          super(TagSubcategoryManager, self).__init__()
@@ -65,6 +77,9 @@ class Tag(TagBase):
      gazeta_link = models.CharField(blank=True, max_length=240)
      wiki_link = models.CharField(blank=True, max_length=240)
  
      gazeta_link = models.CharField(blank=True, max_length=240)
      wiki_link = models.CharField(blank=True, max_length=240)
  
+    created_at    = models.DateTimeField(_('creation date'), auto_now_add=True, db_index=True)
+    # auto_now=True updates this field on every save, so it is labelled as a
+    # change date rather than reusing the creation-date label.
+    changed_at    = models.DateTimeField(_('change date'), auto_now=True, db_index=True)
+
      class UrlDeprecationWarning(DeprecationWarning):
          pass
  
      class UrlDeprecationWarning(DeprecationWarning):
          pass
  
@@ -176,14 +191,14 @@ def book_upload_path(ext=None, maxlen=100):
              name = slughifi(filename.split(".")[0])
          else:
              name = slughifi(media.name)
              name = slughifi(filename.split(".")[0])
          else:
              name = slughifi(media.name)
-        return 'lektura/%s.%s' % (name[:maxlen-len('lektura/.%s' % ext)-4], ext)
+        return 'book/%s/%s.%s' % (ext, name[:maxlen-len('book/%s/.%s' % (ext, ext))-4], ext)
      return get_dynamic_path
  
  
  class BookMedia(models.Model):
      type        = models.CharField(_('type'), choices=MEDIA_FORMATS, max_length="100")
      name        = models.CharField(_('name'), max_length="100")
      return get_dynamic_path
  
  
  class BookMedia(models.Model):
      type        = models.CharField(_('type'), choices=MEDIA_FORMATS, max_length="100")
      name        = models.CharField(_('name'), max_length="100")
-    file        = models.FileField(_('file'), upload_to=book_upload_path())
+    file        = OverwritingFileField(_('file'), upload_to=book_upload_path())
      uploaded_at = models.DateTimeField(_('creation date'), auto_now_add=True, editable=False)
      extra_info  = JSONField(_('extra information'), default='{}', editable=False)
      book = models.ForeignKey('Book', related_name='media')
      uploaded_at = models.DateTimeField(_('creation date'), auto_now_add=True, editable=False)
      extra_info  = JSONField(_('extra information'), default='{}', editable=False)
      book = models.ForeignKey('Book', related_name='media')
@@ -198,7 +213,20 @@ class BookMedia(models.Model):
          verbose_name_plural = _('book media')
  
      def save(self, *args, **kwargs):
          verbose_name_plural = _('book media')
  
      def save(self, *args, **kwargs):
+        try:
+            old = BookMedia.objects.get(pk=self.pk)
+        except BookMedia.DoesNotExist, e:
+            pass
+        else:
+            # if name changed, change the file name, too
+            if slughifi(self.name) != slughifi(old.name):
+                self.file.save(None, ExistingFile(self.file.path), save=False, leave=True)
+
          super(BookMedia, self).save(*args, **kwargs)
          super(BookMedia, self).save(*args, **kwargs)
+
+        # remove the zip package for book with modified media
+        remove_zip(self.book.slug)
+
          extra_info = self.get_extra_info_value()
          extra_info.update(self.read_meta())
          self.set_extra_info_value(extra_info)
          extra_info = self.get_extra_info_value()
          extra_info.update(self.read_meta())
          self.set_extra_info_value(extra_info)
@@ -210,12 +238,16 @@ class BookMedia(models.Model):
              Reads some metadata from the audiobook.
          """
  
              Reads some metadata from the audiobook.
          """
  
-        artist_name = director_name = ''
+        artist_name = director_name = project = funded_by = ''
          if self.type == 'mp3':
              try:
                  audio = id3.ID3(self.file.path)
                  artist_name = ', '.join(', '.join(tag.text) for tag in audio.getall('TPE1'))
                  director_name = ', '.join(', '.join(tag.text) for tag in audio.getall('TPE3'))
          if self.type == 'mp3':
              try:
                  audio = id3.ID3(self.file.path)
                  artist_name = ', '.join(', '.join(tag.text) for tag in audio.getall('TPE1'))
                  director_name = ', '.join(', '.join(tag.text) for tag in audio.getall('TPE3'))
+                project = ", ".join([t.data for t in audio.getall('PRIV') 
+                        if t.owner=='wolnelektury.pl?project'])
+                funded_by = ", ".join([t.data for t in audio.getall('PRIV') 
+                        if t.owner=='wolnelektury.pl?funded_by'])
              except:
                  pass
          elif self.type == 'ogg':
              except:
                  pass
          elif self.type == 'ogg':
@@ -223,11 +255,14 @@ class BookMedia(models.Model):
                  audio = mutagen.File(self.file.path)
                  artist_name = ', '.join(audio.get('artist', []))
                  director_name = ', '.join(audio.get('conductor', []))
                  audio = mutagen.File(self.file.path)
                  artist_name = ', '.join(audio.get('artist', []))
                  director_name = ', '.join(audio.get('conductor', []))
+                project = ", ".join(audio.get('project', []))
+                funded_by = ", ".join(audio.get('funded_by', []))
              except:
                  pass
          else:
              return {}
              except:
                  pass
          else:
              return {}
-        return {'artist_name': artist_name, 'director_name': director_name}
+        return {'artist_name': artist_name, 'director_name': director_name,
+                'project': project, 'funded_by': funded_by}
  
      @staticmethod
      def read_source_sha1(filepath, filetype):
  
      @staticmethod
      def read_source_sha1(filepath, filetype):
@@ -239,7 +274,7 @@ class BookMedia(models.Model):
              try:
                  audio = id3.ID3(filepath)
                  return [t.data for t in audio.getall('PRIV') 
              try:
                  audio = id3.ID3(filepath)
                  return [t.data for t in audio.getall('PRIV') 
-                        if t.owner=='http://wolnelektury.pl?flac_sha1'][0]
+                        if t.owner=='wolnelektury.pl?flac_sha1'][0]
              except:
                  return None
          elif filetype == 'ogg':
              except:
                  return None
          elif filetype == 'ogg':
@@ -254,34 +289,31 @@ class BookMedia(models.Model):
  
  class Book(models.Model):
      title         = models.CharField(_('title'), max_length=120)
  
  class Book(models.Model):
      title         = models.CharField(_('title'), max_length=120)
+    sort_key = models.CharField(_('sort key'), max_length=120, db_index=True, editable=False)
      slug          = models.SlugField(_('slug'), max_length=120, unique=True, db_index=True)
      description   = models.TextField(_('description'), blank=True)
      slug          = models.SlugField(_('slug'), max_length=120, unique=True, db_index=True)
      description   = models.TextField(_('description'), blank=True)
-    created_at    = models.DateTimeField(_('creation date'), auto_now_add=True)
-    _short_html   = models.TextField(_('short HTML'), editable=False)
+    created_at    = models.DateTimeField(_('creation date'), auto_now_add=True, db_index=True)
+    # auto_now=True updates this field on every save, so it is labelled as a
+    # change date rather than reusing the creation-date label.
+    changed_at    = models.DateTimeField(_('change date'), auto_now=True, db_index=True)
      parent_number = models.IntegerField(_('parent number'), default=0)
      parent_number = models.IntegerField(_('parent number'), default=0)
-    extra_info    = JSONField(_('extra information'))
+    extra_info    = JSONField(_('extra information'), default='{}')
      gazeta_link   = models.CharField(blank=True, max_length=240)
      wiki_link     = models.CharField(blank=True, max_length=240)
      # files generated during publication
      gazeta_link   = models.CharField(blank=True, max_length=240)
      wiki_link     = models.CharField(blank=True, max_length=240)
      # files generated during publication
-    xml_file      = models.FileField(_('XML file'), upload_to=book_upload_path('xml'), blank=True)
-    html_file     = models.FileField(_('HTML file'), upload_to=book_upload_path('html'), blank=True)
-    pdf_file      = models.FileField(_('PDF file'), upload_to=book_upload_path('pdf'), blank=True)
-    epub_file     = models.FileField(_('EPUB file'), upload_to=book_upload_path('epub'), blank=True)    
-    txt_file      = models.FileField(_('TXT file'), upload_to=book_upload_path('txt'), blank=True)        
  
  
+    file_types = ['epub', 'html', 'mobi', 'pdf', 'txt', 'xml']
+    
      parent        = models.ForeignKey('self', blank=True, null=True, related_name='children')
      objects  = models.Manager()
      tagged   = managers.ModelTaggedItemManager(Tag)
      tags     = managers.TagDescriptor(Tag)
  
      parent        = models.ForeignKey('self', blank=True, null=True, related_name='children')
      objects  = models.Manager()
      tagged   = managers.ModelTaggedItemManager(Tag)
      tags     = managers.TagDescriptor(Tag)
  
-    _tag_counter = JSONField(null=True, editable=False)
-    _theme_counter = JSONField(null=True, editable=False)
+    html_built = django.dispatch.Signal()
  
      class AlreadyExists(Exception):
          pass
  
      class Meta:
  
      class AlreadyExists(Exception):
          pass
  
      class Meta:
-        ordering = ('title',)
+        ordering = ('sort_key',)
          verbose_name = _('book')
          verbose_name_plural = _('books')
  
          verbose_name = _('book')
          verbose_name_plural = _('books')
  
@@ -289,16 +321,14 @@ class Book(models.Model):
          return self.title
  
      def save(self, force_insert=False, force_update=False, reset_short_html=True, **kwargs):
          return self.title
  
      def save(self, force_insert=False, force_update=False, reset_short_html=True, **kwargs):
+        self.sort_key = sortify(self.title)
+
+        ret = super(Book, self).save(force_insert, force_update)
+
          if reset_short_html:
          if reset_short_html:
-            # Reset _short_html during save
-            update = {}
-            for key in filter(lambda x: x.startswith('_short_html'), self.__dict__):
-                update[key] = ''
-                self.__setattr__(key, '')
-            # Fragment.short_html relies on book's tags, so reset it here too
-            self.fragments.all().update(**update)
+            self.reset_short_html()
  
  
-        return super(Book, self).save(force_insert, force_update)
+        return ret
  
      @permalink
      def get_absolute_url(self):
  
      @permalink
      def get_absolute_url(self):
@@ -321,49 +351,15 @@ class Book(models.Model):
          return book_tag
  
      def has_media(self, type):
          return book_tag
  
      def has_media(self, type):
-        if   type == 'xml':
-            if self.xml_file:
-                return True
-            else:
-                return False
-        elif type == 'html':
-            if self.html_file:
-                return True
-            else:
-                return False        
-        elif type == 'txt':
-            if self.txt_file:
-                return True
-            else:
-                return False        
-        elif type == 'pdf':
-            if self.pdf_file:
-                return True
-            else:
-                return False  
-        elif type == 'epub':
-            if self.epub_file:
-                return True
-            else:
-                return False                          
+        if type in Book.file_types:
+            return bool(getattr(self, "%s_file" % type))
          else:
          else:
-            if self.media.filter(type=type).exists():
-                return True
-            else:
-                return False
+            return self.media.filter(type=type).exists()
  
      def get_media(self, type):
          if self.has_media(type):
  
      def get_media(self, type):
          if self.has_media(type):
-            if   type == "xml":
-                return self.xml_file
-            elif type == "html":
-                return self.html_file
-            elif type == "epub":
-                return self.epub_file
-            elif type == "txt":
-                return self.txt_file
-            elif type == "pdf":
-                return self.pdf_file
+            if type in Book.file_types:
+                return getattr(self, "%s_file" % type)
              else:                                             
                  return self.media.filter(type=type)
          else:
              else:                                             
                  return self.media.filter(type=type)
          else:
@@ -378,22 +374,38 @@ class Book(models.Model):
      def get_daisy(self):
          return self.get_media("daisy")                       
  
      def get_daisy(self):
          return self.get_media("daisy")                       
  
+    def reset_short_html(self):
+        """Delete the cached short HTML of this book and reset its fragments.
+
+        Returns immediately when the book has no ``id`` yet.  Otherwise one
+        cache entry per language in ``settings.LANGUAGES`` is deleted and
+        ``reset_short_html()`` is called on every fragment of the book.
+        """
+        if self.id is None:
+            return
+
+        for code, _name in settings.LANGUAGES:
+            cache.delete("Book.short_html/%d/%s" % (self.id, code))
+
+        # Fragment.short_html relies on book's tags, so reset it here too
+        for fragment in self.fragments.all():
+            fragment.reset_short_html()
+
      def short_html(self):
      def short_html(self):
-        key = '_short_html_%s' % get_language()
-        short_html = getattr(self, key)
+        if self.id:
+            cache_key = "Book.short_html/%d/%s" % (self.id, get_language())
+            short_html = cache.get(cache_key)
+        else:
+            short_html = None
  
  
-        if short_html and len(short_html):
+        if short_html is not None:
              return mark_safe(short_html)
          else:
              tags = self.tags.filter(~Q(category__in=('set', 'theme', 'book')))
              tags = [mark_safe(u'<a href="%s">%s</a>' % (tag.get_absolute_url(), tag.name)) for tag in tags]
  
              formats = []
              return mark_safe(short_html)
          else:
              tags = self.tags.filter(~Q(category__in=('set', 'theme', 'book')))
              tags = [mark_safe(u'<a href="%s">%s</a>' % (tag.get_absolute_url(), tag.name)) for tag in tags]
  
              formats = []
-            # files generated during publication               
+            # files generated during publication
              if self.has_media("html"):
                  formats.append(u'<a href="%s">%s</a>' % (reverse('book_text', kwargs={'slug': self.slug}), _('Read online')))
              if self.has_media("pdf"):
                  formats.append(u'<a href="%s">PDF</a>' % self.get_media('pdf').url)
              if self.has_media("html"):
                  formats.append(u'<a href="%s">%s</a>' % (reverse('book_text', kwargs={'slug': self.slug}), _('Read online')))
              if self.has_media("pdf"):
                  formats.append(u'<a href="%s">PDF</a>' % self.get_media('pdf').url)
+            if self.has_media("mobi"):
+                formats.append(u'<a href="%s">MOBI</a>' % self.get_media('mobi').url)
              if self.root_ancestor.has_media("epub"):
                  formats.append(u'<a href="%s">EPUB</a>' % self.root_ancestor.get_media('epub').url)
              if self.has_media("txt"):
              if self.root_ancestor.has_media("epub"):
                  formats.append(u'<a href="%s">EPUB</a>' % self.root_ancestor.get_media('epub').url)
              if self.has_media("txt"):
@@ -404,11 +416,12 @@ class Book(models.Model):
  
              formats = [mark_safe(format) for format in formats]
  
  
              formats = [mark_safe(format) for format in formats]
  
-            setattr(self, key, unicode(render_to_string('catalogue/book_short.html',
-                {'book': self, 'tags': tags, 'formats': formats})))
-            self.save(reset_short_html=False)
-            return mark_safe(getattr(self, key))
+            short_html = unicode(render_to_string('catalogue/book_short.html',
+                {'book': self, 'tags': tags, 'formats': formats}))
  
  
+            if self.id:
+                cache.set(cache_key, short_html, CACHE_FOREVER)
+            return mark_safe(short_html)
  
      @property
      def root_ancestor(self):
  
      @property
      def root_ancestor(self):
@@ -428,26 +441,6 @@ class Book(models.Model):
      has_description.boolean = True
  
      # ugly ugly ugly
      has_description.boolean = True
  
      # ugly ugly ugly
-    def has_pdf_file(self):
-        return bool(self.pdf_file)
-    has_pdf_file.short_description = 'PDF'
-    has_pdf_file.boolean = True
-
-    def has_epub_file(self):
-        return bool(self.epub_file)
-    has_epub_file.short_description = 'EPUB'
-    has_epub_file.boolean = True
-
-    def has_txt_file(self):
-        return bool(self.txt_file)
-    has_txt_file.short_description = 'HTML'
-    has_txt_file.boolean = True
-
-    def has_html_file(self):
-        return bool(self.html_file)
-    has_html_file.short_description = 'HTML'
-    has_html_file.boolean = True
-
      def has_odt_file(self):
          return bool(self.has_media("odt"))
      has_odt_file.short_description = 'ODT'
      def has_odt_file(self):
          return bool(self.has_media("odt"))
      has_odt_file.short_description = 'ODT'
@@ -462,34 +455,64 @@ class Book(models.Model):
          return bool(self.has_media("ogg"))
      has_ogg_file.short_description = 'OGG'
      has_ogg_file.boolean = True
          return bool(self.has_media("ogg"))
      has_ogg_file.short_description = 'OGG'
      has_ogg_file.boolean = True
-    
+
      def has_daisy_file(self):
          return bool(self.has_media("daisy"))
      has_daisy_file.short_description = 'DAISY'
      def has_daisy_file(self):
          return bool(self.has_media("daisy"))
      has_daisy_file.short_description = 'DAISY'
-    has_daisy_file.boolean = True    
-    
+    has_daisy_file.boolean = True
+
+    def build_pdf(self):
+        """ (Re)builds the pdf file.
+
+        Renders the book's XML with ``librarian.pdf`` into a temporary file,
+        stores the result in ``self.pdf_file`` as ``<slug>.pdf`` and removes
+        the temporary file.  Afterwards ``remove_zip`` is called for
+        ``settings.ALL_PDF_ZIP``.
+        """
+        from librarian import pdf
+        from tempfile import NamedTemporaryFile
+
+        # Create the temporary file before entering ``try``: if creation
+        # itself fails there is nothing to clean up, and ``finally`` must not
+        # refer to an unbound name.
+        pdf_file = NamedTemporaryFile(delete=False)
+        try:
+            pdf.transform(ORMDocProvider(self),
+                          file_path=str(self.xml_file.path),
+                          output_file=pdf_file)
+            # Flush and release our handle before reading the file by name.
+            pdf_file.close()
+
+            # PDF is binary data, so read it back in binary mode and make
+            # sure the handle is closed once it has been stored.
+            with open(pdf_file.name, 'rb') as built:
+                self.pdf_file.save('%s.pdf' % self.slug, File(built))
+        finally:
+            # close() is a no-op when the file has already been closed.
+            pdf_file.close()
+            unlink(pdf_file.name)
+
+        # remove zip with all pdf files
+        remove_zip(settings.ALL_PDF_ZIP)
+
+    def build_mobi(self):
+        """ (Re)builds the MOBI file.
+
+        Renders the book's XML with ``librarian.mobi`` into a temporary
+        ``.mobi`` file, stores the result in ``self.mobi_file`` as
+        ``<slug>.mobi`` and removes the temporary file.  Afterwards
+        ``remove_zip`` is called for ``settings.ALL_MOBI_ZIP``.
+        """
+        from librarian import mobi
+        from tempfile import NamedTemporaryFile
+
+        # Create the temporary file before entering ``try`` so ``finally``
+        # never refers to an unbound name.  Only its path is handed to the
+        # converter, so our own handle is released right away.
+        mobi_file = NamedTemporaryFile(suffix='.mobi', delete=False)
+        mobi_file.close()
+        try:
+            mobi.transform(ORMDocProvider(self), verbose=1,
+                           file_path=str(self.xml_file.path),
+                           output_file=mobi_file.name)
+
+            # MOBI is binary data, so read it back in binary mode and make
+            # sure the handle is closed once it has been stored.
+            with open(mobi_file.name, 'rb') as built:
+                self.mobi_file.save('%s.mobi' % self.slug, File(built))
+        finally:
+            unlink(mobi_file.name)
+
+        # remove zip with all mobi files
+        remove_zip(settings.ALL_MOBI_ZIP)
+
      def build_epub(self, remove_descendants=True):
          """ (Re)builds the epub file.
              If book has a parent, does nothing.
              Unless remove_descendants is False, descendants' epubs are removed.
          """
      def build_epub(self, remove_descendants=True):
          """ (Re)builds the epub file.
              If book has a parent, does nothing.
              Unless remove_descendants is False, descendants' epubs are removed.
          """
-    
          from StringIO import StringIO
          from hashlib import sha1
          from django.core.files.base import ContentFile
          from StringIO import StringIO
          from hashlib import sha1
          from django.core.files.base import ContentFile
-        from librarian import DocProvider
-
-        class BookImportDocProvider(DocProvider):
-            """ used for joined EPUBs """
-
-            def __init__(self, book):
-                self.book = book
-
-            def by_slug(self, slug):
-                if slug == self.book.slug:
-                    return self.book.xml_file
-                else:
-                    return Book.objects.get(slug=slug).xml_file
  
          if self.parent:
              # don't need an epub
  
          if self.parent:
              # don't need an epub
@@ -497,7 +520,7 @@ class Book(models.Model):
  
          epub_file = StringIO()
          try:
  
          epub_file = StringIO()
          try:
-            epub.transform(BookImportDocProvider(self), self.slug, output_file=epub_file)
+            epub.transform(ORMDocProvider(self), self.slug, output_file=epub_file)
              self.epub_file.save('%s.epub' % self.slug, ContentFile(epub_file.getvalue()))
              FileRecord(slug=self.slug, type='epub', sha1=sha1(epub_file.getvalue()).hexdigest()).save()
          except NoDublinCore:
              self.epub_file.save('%s.epub' % self.slug, ContentFile(epub_file.getvalue()))
              FileRecord(slug=self.slug, type='epub', sha1=sha1(epub_file.getvalue()).hexdigest()).save()
          except NoDublinCore:
@@ -512,6 +535,9 @@ class Book(models.Model):
              child_book.save()
              book_descendants += list(child_book.children.all())
  
              child_book.save()
              book_descendants += list(child_book.children.all())
  
+        # remove zip package with all epub files
+        remove_zip(settings.ALL_EPUB_ZIP)
+
      def build_txt(self):
          from StringIO import StringIO
          from django.core.files.base import ContentFile
      def build_txt(self):
          from StringIO import StringIO
          from django.core.files.base import ContentFile
@@ -520,9 +546,100 @@ class Book(models.Model):
          out = StringIO()
          text.transform(open(self.xml_file.path), out)
          self.txt_file.save('%s.txt' % self.slug, ContentFile(out.getvalue()))
          out = StringIO()
          text.transform(open(self.xml_file.path), out)
          self.txt_file.save('%s.txt' % self.slug, ContentFile(out.getvalue()))
-        self.save()
  
  
  
  
+    def build_html(self):
+        """(Re)builds the HTML file and the book's fragments.
+
+        Transforms the book's XML into HTML with ``librarian.html``.  On
+        success the HTML is stored in ``self.html_file`` as ``<slug>.html``,
+        all existing fragments of the book are deleted and recreated from
+        the closed fragments extracted from the new HTML (only those that
+        carry at least one theme), the book is saved and the ``html_built``
+        signal is sent.
+
+        Returns True when the HTML was built, False when the transform
+        returned a false value.
+        """
+        from tempfile import NamedTemporaryFile
+        from markupstring import MarkupString
+
+        meta_tags = list(self.tags.filter(
+            category__in=('author', 'epoch', 'genre', 'kind')))
+        book_tag = self.book_tag()
+
+        html_file = NamedTemporaryFile()
+        try:
+            if not html.transform(self.xml_file.path, html_file, parse_dublincore=False):
+                return False
+            self.html_file.save('%s.html' % self.slug, File(html_file))
+        finally:
+            # Everything below reads the stored copy via self.html_file.path,
+            # so the temporary file can be released here instead of leaking
+            # until garbage collection.
+            html_file.close()
+
+        # get ancestor l-tags for adding to new fragments
+        ancestor_tags = []
+        p = self.parent
+        while p:
+            ancestor_tags.append(p.book_tag())
+            p = p.parent
+
+        # Delete old fragments and create them from scratch
+        self.fragments.all().delete()
+        # Extract fragments
+        closed_fragments, open_fragments = html.extract_fragments(self.html_file.path)
+        for fragment in closed_fragments.values():
+            try:
+                theme_names = [s.strip() for s in fragment.themes.split(',')]
+            except AttributeError:
+                # fragment.themes is not a string: skip this fragment
+                continue
+            themes = []
+            for theme_name in theme_names:
+                if not theme_name:
+                    continue
+                tag, created = Tag.objects.get_or_create(slug=slughifi(theme_name), category='theme')
+                if created:
+                    tag.name = theme_name
+                    tag.sort_key = theme_name.lower()
+                    tag.save()
+                themes.append(tag)
+            if not themes:
+                continue
+
+            text = fragment.to_string()
+            # A short version is stored only for fragments longer than
+            # 240 markup-aware characters; it keeps the first 160.
+            short_text = ''
+            if (len(MarkupString(text)) > 240):
+                short_text = unicode(MarkupString(text)[:160])
+            new_fragment = Fragment.objects.create(anchor=fragment.id, book=self,
+                text=text, short_text=short_text)
+
+            new_fragment.save()
+            new_fragment.tags = set(meta_tags + themes + [book_tag] + ancestor_tags)
+        self.save()
+        self.html_built.send(sender=self)
+        return True
+
+    @staticmethod
+    def zip_format(format_):
+        """Build the zip package with all top-level books in one format.
+
+        ``format_`` is a file type name such as ``'pdf'``; the model field
+        ``<format_>_file`` is read and the package name is taken from
+        ``settings.ALL_<FORMAT_>_ZIP``.  Books that have a parent or an
+        empty file field are left out.  The work is handed to the
+        ``create_zip`` task and this call blocks until it finishes,
+        returning whatever ``wait()`` returns.
+        """
+        def pretty_file_name(book):
+            # Use the parameter itself.  The previous code read ``b``, which
+            # only resolved because Python 2 leaks the list-comprehension
+            # variable into the enclosing function scope.
+            return "%s/%s.%s" % (
+                book.get_extra_info_value()['author'],
+                book.slug,
+                format_)
+
+        field_name = "%s_file" % format_
+        books = Book.objects.filter(parent=None).exclude(**{field_name: ""})
+        paths = [(pretty_file_name(b), getattr(b, field_name).path)
+                    for b in books]
+        result = create_zip.delay(paths,
+                    getattr(settings, "ALL_%s_ZIP" % format_.upper()))
+        return result.wait()
+
+    def zip_audiobooks(self):
+        """Build the zip package with this book's mp3 media.
+
+        Collects the file paths of all ``BookMedia`` of type ``'mp3'``
+        attached to this book, hands them to the ``create_zip`` task under
+        the book's slug and blocks until the task finishes, returning
+        whatever ``wait()`` returns.
+        """
+        mp3_media = BookMedia.objects.filter(book=self, type='mp3')
+        # No pretty name is supplied for the entries, only the file path.
+        paths = [(None, media.file.path) for media in mp3_media]
+        task = create_zip.delay(paths, self.slug)
+        return task.wait()
+
+    def search_index(self):
+        """Add this book to the search index.
+
+        The index class depends on ``settings.SEARCH_INDEX_PARALLEL``: a
+        false value selects ``search.Index``, any true value selects
+        ``search.ReusableIndex``.  The index is always closed again, even
+        when indexing raises.
+        """
+        if settings.SEARCH_INDEX_PARALLEL:
+            # ``isinstance``, not ``instance``: the misspelled name raised
+            # NameError whenever parallel indexing was enabled.
+            if isinstance(settings.SEARCH_INDEX_PARALLEL, int):
+                # NOTE(review): the thread count is hard-coded to 4 although
+                # the setting is an int here; confirm whether the setting's
+                # value was meant to be passed instead.
+                idx = search.ReusableIndex(threads=4)
+            else:
+                idx = search.ReusableIndex()
+        else:
+            idx = search.Index()
+
+        idx.open()
+        try:
+            idx.index_book(self)
+        finally:
+            idx.close()
+
      @classmethod
      def from_xml_file(cls, xml_file, **kwargs):
          # use librarian to parse meta-data
      @classmethod
      def from_xml_file(cls, xml_file, **kwargs):
          # use librarian to parse meta-data
@@ -537,11 +654,10 @@ class Book(models.Model):
              xml_file.close()
  
      @classmethod
              xml_file.close()
  
      @classmethod
-    def from_text_and_meta(cls, raw_file, book_info, overwrite=False, build_epub=True, build_txt=True):
+    def from_text_and_meta(cls, raw_file, book_info, overwrite=False,
+            build_epub=True, build_txt=True, build_pdf=True, build_mobi=True,
+            search_index=True):
          import re
          import re
-        from tempfile import NamedTemporaryFile
-        from markupstring import MarkupString
-        from django.core.files.storage import default_storage
  
          # check for parts before we do anything
          children = []
  
          # check for parts before we do anything
          children = []
@@ -570,10 +686,9 @@ class Book(models.Model):
  
          book.title = book_info.title
          book.set_extra_info_value(book_info.to_dict())
  
          book.title = book_info.title
          book.set_extra_info_value(book_info.to_dict())
-        book._short_html = ''
          book.save()
  
          book.save()
  
-        book_tags = []
+        meta_tags = []
          categories = (('kinds', 'kind'), ('genres', 'genre'), ('authors', 'author'), ('epochs', 'epoch'))
          for field_name, category in categories:
              try:
          categories = (('kinds', 'kind'), ('genres', 'genre'), ('authors', 'author'), ('epochs', 'epoch'))
          for field_name, category in categories:
              try:
@@ -588,11 +703,11 @@ class Book(models.Model):
                  tag, created = Tag.objects.get_or_create(slug=slughifi(tag_name), category=category)
                  if created:
                      tag.name = tag_name
                  tag, created = Tag.objects.get_or_create(slug=slughifi(tag_name), category=category)
                  if created:
                      tag.name = tag_name
-                    tag.sort_key = tag_sort_key.lower()
+                    tag.sort_key = sortify(tag_sort_key.lower())
                      tag.save()
                      tag.save()
-                book_tags.append(tag)
+                meta_tags.append(tag)
  
  
-        book.tags = set(book_tags + book_shelves)
+        book.tags = set(meta_tags + book_shelves)
  
          book_tag = book.book_tag()
  
  
          book_tag = book.book_tag()
  
@@ -607,53 +722,22 @@ class Book(models.Model):
          # delete old fragments when overwriting
          book.fragments.all().delete()
  
          # delete old fragments when overwriting
          book.fragments.all().delete()
  
-        html_file = NamedTemporaryFile()
-        if html.transform(book.xml_file.path, html_file, parse_dublincore=False):
-            book.html_file.save('%s.html' % book.slug, File(html_file), save=False)
-
-            # get ancestor l-tags for adding to new fragments
-            ancestor_tags = []
-            p = book.parent
-            while p:
-                ancestor_tags.append(p.book_tag())
-                p = p.parent
-
-            # Extract fragments
-            closed_fragments, open_fragments = html.extract_fragments(book.html_file.path)
-            for fragment in closed_fragments.values():
-                try:
-                    theme_names = [s.strip() for s in fragment.themes.split(',')]
-                except AttributeError:
-                    continue
-                themes = []
-                for theme_name in theme_names:
-                    if not theme_name:
-                        continue
-                    tag, created = Tag.objects.get_or_create(slug=slughifi(theme_name), category='theme')
-                    if created:
-                        tag.name = theme_name
-                        tag.sort_key = theme_name.lower()
-                        tag.save()
-                    themes.append(tag)
-                if not themes:
-                    continue
-
-                text = fragment.to_string()
-                short_text = ''
-                if (len(MarkupString(text)) > 240):
-                    short_text = unicode(MarkupString(text)[:160])
-                new_fragment, created = Fragment.objects.get_or_create(anchor=fragment.id, book=book,
-                    defaults={'text': text, 'short_text': short_text})
-
-                new_fragment.save()
-                new_fragment.tags = set(book_tags + themes + [book_tag] + ancestor_tags)
-
+        if book.build_html():
              if not settings.NO_BUILD_TXT and build_txt:
                  book.build_txt()
  
          if not settings.NO_BUILD_EPUB and build_epub:
              book.root_ancestor.build_epub()
  
              if not settings.NO_BUILD_TXT and build_txt:
                  book.build_txt()
  
          if not settings.NO_BUILD_EPUB and build_epub:
              book.root_ancestor.build_epub()
  
+        if not settings.NO_BUILD_PDF and build_pdf:
+            book.root_ancestor.build_pdf()
+
+        if not settings.NO_BUILD_MOBI and build_mobi:
+            book.build_mobi()
+
+        if not settings.NO_SEARCH_INDEX and search_index:
+            book.search_index()
+
          book_descendants = list(book.children.all())
          # add l-tag to descendants and their fragments
          # delete unnecessary EPUB files
          book_descendants = list(book.children.all())
          # add l-tag to descendants and their fragments
          # delete unnecessary EPUB files
@@ -665,57 +749,69 @@ class Book(models.Model):
                  fragment.tags = set(list(fragment.tags) + [book_tag])
              book_descendants += list(child_book.children.all())
  
                  fragment.tags = set(list(fragment.tags) + [book_tag])
              book_descendants += list(child_book.children.all())
  
+        book.save()
+
          # refresh cache
          book.reset_tag_counter()
          book.reset_theme_counter()
  
          # refresh cache
          book.reset_tag_counter()
          book.reset_theme_counter()
  
-        book.save()
          return book
  
          return book
  
-
-    def refresh_tag_counter(self):
-        tags = {}
-        for child in self.children.all().order_by():
-            for tag_pk, value in child.tag_counter.iteritems():
-                tags[tag_pk] = tags.get(tag_pk, 0) + value
-        for tag in self.tags.exclude(category__in=('book', 'theme', 'set')).order_by():
-            tags[tag.pk] = 1
-        self.set__tag_counter_value(tags)
-        self.save(reset_short_html=False)
-        return tags
-
      def reset_tag_counter(self):
      def reset_tag_counter(self):
-        self._tag_counter = None
-        self.save(reset_short_html=False)
+        # Invalidate the cached tag counter of this book and, via the
+        # recursive call at the end, of every ancestor: a parent's counter
+        # is summed from its children's counters, so it is stale as well.
+        # A book without an id has no cache entry, so there is nothing to do.
+        if self.id is None:
+            return
+
+        # Same key format as the one read and written by `tag_counter`.
+        cache_key = "Book.tag_counter/%d" % self.id
+        cache.delete(cache_key)
          if self.parent:
              self.parent.reset_tag_counter()
  
      @property
      def tag_counter(self):
          if self.parent:
              self.parent.reset_tag_counter()
  
      @property
      def tag_counter(self):
-        if self._tag_counter is None:
-            return self.refresh_tag_counter()
-        return dict((int(k), v) for k, v in self.get__tag_counter_value().iteritems())
-
-    def refresh_theme_counter(self):
-        tags = {}
-        for fragment in Fragment.tagged.with_any([self.book_tag()]).order_by():
-            for tag in fragment.tags.filter(category='theme').order_by():
-                tags[tag.pk] = tags.get(tag.pk, 0) + 1
-        self.set__theme_counter_value(tags)
-        self.save(reset_short_html=False)
+        # Returns a dict mapping tag pk -> count for this book and its
+        # descendants. The result is cached under "Book.tag_counter/<id>";
+        # a book without an id is recomputed on every access and never cached.
+        if self.id:
+            cache_key = "Book.tag_counter/%d" % self.id
+            tags = cache.get(cache_key)
+        else:
+            tags = None
+
+        if tags is None:
+            # Cache miss: add up the counters of the direct children (each
+            # child's `tag_counter` in turn covers its own children).
+            tags = {}
+            for child in self.children.all().order_by():
+                for tag_pk, value in child.tag_counter.iteritems():
+                    tags[tag_pk] = tags.get(tag_pk, 0) + value
+            # Tags set directly on this book (other than the 'book', 'theme'
+            # and 'set' categories) are pinned to 1, overriding any sum
+            # collected from the children for the same tag.
+            for tag in self.tags.exclude(category__in=('book', 'theme', 'set')).order_by():
+                tags[tag.pk] = 1
+
+            if self.id:
+                cache.set(cache_key, tags, CACHE_FOREVER)
          return tags
  
      def reset_theme_counter(self):
          return tags
  
      def reset_theme_counter(self):
-        self._theme_counter = None
-        self.save(reset_short_html=False)
+        # Invalidate the cached theme counter of this book, then recurse
+        # into the parent chain so every ancestor's entry is dropped too.
+        # A book without an id has no cache entry, so there is nothing to do.
+        if self.id is None:
+            return
+
+        # Same key format as the one read and written by `theme_counter`.
+        cache_key = "Book.theme_counter/%d" % self.id
+        cache.delete(cache_key)
          if self.parent:
              self.parent.reset_theme_counter()
  
      @property
      def theme_counter(self):
          if self.parent:
              self.parent.reset_theme_counter()
  
      @property
      def theme_counter(self):
-        if self._theme_counter is None:
-            return self.refresh_theme_counter()
-        return dict((int(k), v) for k, v in self.get__theme_counter_value().iteritems())
+        # Returns a dict mapping theme-tag pk -> number of fragments carrying
+        # that theme. The result is cached under "Book.theme_counter/<id>";
+        # a book without an id is recomputed on every access and never cached.
+        if self.id:
+            cache_key = "Book.theme_counter/%d" % self.id
+            tags = cache.get(cache_key)
+        else:
+            tags = None
+
+        if tags is None:
+            # Cache miss: walk every fragment tagged with this book's own
+            # book tag and count its tags of category 'theme'.
+            tags = {}
+            for fragment in Fragment.tagged.with_any([self.book_tag()]).order_by():
+                for tag in fragment.tags.filter(category='theme').order_by():
+                    tags[tag.pk] = tags.get(tag.pk, 0) + 1
+
+            if self.id:
+                cache.set(cache_key, tags, CACHE_FOREVER)
+        return tags
  
      def pretty_title(self, html_links=False):
          book = self
  
      def pretty_title(self, html_links=False):
          book = self
@@ -734,11 +830,46 @@ class Book(models.Model):
  
          return ', '.join(names)
  
  
          return ', '.join(names)
  
+    @classmethod
+    def tagged_top_level(cls, tags):
+        """Return the top-most books that carry every tag in `tags`.
+
+        A matching book is dropped from the result when it is also tagged
+        with the book tag (l-tag) of another matching book, so that only
+        the highest matching book of each tree remains.
+
+        """
+        matching = cls.tagged.with_all(tags)
+
+        # Collect the l-tags belonging to the matching books themselves;
+        # anything tagged with one of them sits below a matching book.
+        own_slugs = [book.book_tag_slug() for book in matching]
+        l_tags = Tag.objects.filter(category='book', slug__in=own_slugs)
+        nested_pks = [book.pk for book in cls.tagged.with_any(l_tags)]
+
+        if not nested_pks:
+            return matching
+        return matching.exclude(pk__in=nested_pks)
+
+
+def _has_factory(ftype):
+    """Build the `has_<ftype>_file` predicate for a file type.
+
+    The returned callable takes a model instance and returns True when the
+    instance's `<ftype>_file` attribute is truthy, False otherwise. It is
+    named `has_<ftype>_file` and carries `short_description` (the upper-cased
+    file type) and `boolean = True` attributes -- presumably for display in
+    the Django admin; confirm against the admin configuration.
+    """
+    def has(self):
+        return bool(getattr(self, "%s_file" % ftype))
+
+    # Derive the label from the argument itself, not from a module-level
+    # loop variable, so the factory gives the right label wherever it is
+    # called from.
+    has.short_description = ftype.upper()
+    has.boolean = True
+    has.__name__ = "has_%s_file" % ftype
+    return has
+
+    
+# Add the file fields: for every entry of Book.file_types, attach a
+# `<type>_file` FileField to Book and a matching `has_<type>_file` method
+# built by _has_factory.
+# NOTE(review): `t` and `field_name` stay bound at module level after the
+# loop; check for other module-level readers before renaming them.
+for t in Book.file_types:
+    field_name = "%s_file" % t
+    models.FileField(_("%s file" % t.upper()),
+            upload_to=book_upload_path(t),
+            blank=True).contribute_to_class(Book, field_name)
+
+    setattr(Book, "has_%s_file" % t, _has_factory(t))
+
  
  class Fragment(models.Model):
      text = models.TextField()
      short_text = models.TextField(editable=False)
  
  class Fragment(models.Model):
      text = models.TextField()
      short_text = models.TextField(editable=False)
-    _short_html = models.TextField(editable=False)
      anchor = models.CharField(max_length=120)
      book = models.ForeignKey(Book, related_name='fragments')
  
      anchor = models.CharField(max_length=120)
      book = models.ForeignKey(Book, related_name='fragments')
  
@@ -754,16 +885,29 @@ class Fragment(models.Model):
      def get_absolute_url(self):
          return '%s#m%s' % (reverse('book_text', kwargs={'slug': self.book.slug}), self.anchor)
  
      def get_absolute_url(self):
          return '%s#m%s' % (reverse('book_text', kwargs={'slug': self.book.slug}), self.anchor)
  
+    def reset_short_html(self):
+        """Drop the cached short HTML of this fragment for every language.
+
+        One cache entry is deleted per entry of settings.LANGUAGES. A
+        fragment without an id has no cache entries, so nothing is done.
+        """
+        if self.id is not None:
+            for code, _label in settings.LANGUAGES:
+                cache.delete("Fragment.short_html/%d/%s" % (self.id, code))
+
      def short_html(self):
      def short_html(self):
-        key = '_short_html_%s' % get_language()
-        short_html = getattr(self, key)
-        if short_html and len(short_html):
+        # Returns the rendered 'catalogue/fragment_short.html' snippet for
+        # this fragment, marked safe. The rendered text is cached per
+        # fragment id and active language; a fragment without an id is
+        # rendered on every call and never cached.
+        if self.id:
+            cache_key = "Fragment.short_html/%d/%s" % (self.id, get_language())
+            short_html = cache.get(cache_key)
+        else:
+            short_html = None
+
+        if short_html is not None:
              return mark_safe(short_html)
          else:
              return mark_safe(short_html)
          else:
-            setattr(self, key, unicode(render_to_string('catalogue/fragment_short.html',
-                {'fragment': self})))
-            self.save()
-            return mark_safe(getattr(self, key))
+            # Cache miss: render the template and store the plain unicode
+            # text; mark_safe is applied on the way out, not before caching.
+            short_html = unicode(render_to_string('catalogue/fragment_short.html',
+                {'fragment': self}))
+            if self.id:
+                cache.set(cache_key, short_html, CACHE_FOREVER)
+            return mark_safe(short_html)
  
  
  class FileRecord(models.Model):
  
  
  class FileRecord(models.Model):
@@ -789,7 +933,8 @@ class FileRecord(models.Model):
  
  def _tags_updated_handler(sender, affected_tags, **kwargs):
      # reset tag global counter
  
  def _tags_updated_handler(sender, affected_tags, **kwargs):
      # reset tag global counter
-    Tag.objects.filter(pk__in=[tag.pk for tag in affected_tags]).update(book_count=None)
+    # we want Tag.changed_at updated for API to know the tag was touched
+    Tag.objects.filter(pk__in=[tag.pk for tag in affected_tags]).update(book_count=None, changed_at=datetime.now())
  
      # if book tags changed, reset book tag counter
      if isinstance(sender, Book) and \
  
      # if book tags changed, reset book tag counter
      if isinstance(sender, Book) and \