nicer fragments: no anchors in text, prettier short_text

[wolnelektury.git] / apps / catalogue / utils.py
diff --git a/apps/catalogue/utils.py b/apps/catalogue/utils.py

index acbd778..185f5fa 100644 (file)
--- a/apps/catalogue/utils.py
+++ b/apps/catalogue/utils.py
@@ -5,6 +5,7 @@
  from __future__ import with_statement
  
  import random
  from __future__ import with_statement
  
  import random
+import re
  import time
  from base64 import urlsafe_b64encode
  
  import time
  from base64 import urlsafe_b64encode
  
@@ -12,6 +13,7 @@ from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpRespons
  from django.core.files.uploadedfile import UploadedFile
  from django.core.files.base import File
  from django.core.files.storage import DefaultStorage
  from django.core.files.uploadedfile import UploadedFile
  from django.core.files.base import File
  from django.core.files.storage import DefaultStorage
+from django.utils.encoding import force_unicode
  from django.utils.hashcompat import sha_constructor
  from django.conf import settings
  from celery.task import task
  from django.utils.hashcompat import sha_constructor
  from django.conf import settings
  from celery.task import task
@@ -19,8 +21,8 @@ from os import mkdir, path, unlink
  from errno import EEXIST, ENOENT
  from fcntl import flock, LOCK_EX
  from zipfile import ZipFile
  from errno import EEXIST, ENOENT
  from fcntl import flock, LOCK_EX
  from zipfile import ZipFile
+from traceback import print_exc
  
  
-from librarian import DocProvider
  from reporting.utils import read_chunks
  from celery.task import task
  import catalogue.models
  from reporting.utils import read_chunks
  from celery.task import task
  import catalogue.models
@@ -51,7 +53,7 @@ class ExistingFile(UploadedFile):
  
      def __init__(self, path, *args, **kwargs):
          self.path = path
  
      def __init__(self, path, *args, **kwargs):
          self.path = path
-        return super(ExistingFile, self).__init__(*args, **kwargs)
+        super(ExistingFile, self).__init__(*args, **kwargs)
  
      def temporary_file_path(self):
          return self.path
  
      def temporary_file_path(self):
          return self.path
@@ -60,20 +62,6 @@ class ExistingFile(UploadedFile):
          pass
  
  
          pass
  
  
-class ORMDocProvider(DocProvider):
-    """Used for getting books' children."""
-
-    def __init__(self, book):
-        self.book = book
-
-    def by_slug_and_lang(self, slug, language):
-        if slug == self.book.slug and language == self.language:
-            return open(self.book.xml_file.path)
-        else:
-            return type(self.book).objects.get(
-                    slug=slug, language=language).xml_file
-
-
  class LockFile(object):
      """
      A file lock monitor class; createas an ${objname}.lock
  class LockFile(object):
      """
      A file lock monitor class; createas an ${objname}.lock
@@ -159,8 +147,116 @@ def async_build_pdf(book_id, customizations, file_name):
      Accepts the same args as Book.build_pdf, but with book id as first parameter
      instead of Book instance
      """
      Accepts the same args as Book.build_pdf, but with book id as first parameter
      instead of Book instance
      """
-    book = catalogue.models.Book.objects.get(id=book_id)
-    print "will gen %s" % DefaultStorage().path(file_name)
-    if not DefaultStorage().exists(file_name):
-        book.build_pdf(customizations=customizations, file_name=file_name)
-    print "done."
+    # Build the PDF only when it is not already present in the default
+    # storage; any failure is reported on stdout and then propagated to the
+    # caller unchanged.
+    try:
+        book = catalogue.models.Book.objects.get(id=book_id)
+        print "will gen %s" % DefaultStorage().path(file_name)
+        if not DefaultStorage().exists(file_name):
+            book.build_pdf(customizations=customizations, file_name=file_name)
+        print "done."
+    except Exception, e:
+        print "Error during pdf creation: %s" % e
+        # print_exc has to be *called*; the bare name is a no-op expression
+        # and never printed the traceback.
+        print_exc()
+        # A bare raise re-raises the active exception with its original
+        # traceback, which "raise e" would discard in Python 2.
+        raise
+
+
+class MultiQuerySet(object):
+    """Presents several sequences as one concatenated, read-only sequence.
+
+    Every positional argument must support len() and slicing (the attribute
+    name suggests Django querysets, but plain lists work as well).  Supports
+    count(), len(), integer indexing (including negative indices) and
+    slicing (including steps); slices are returned as plain lists.
+    """
+
+    def __init__(self, *args, **kwargs):
+        # Keyword arguments are accepted for call compatibility but unused.
+        self.querysets = args
+        self._count = None
+
+    def count(self):
+        """Return the total number of items in all sequences (memoized)."""
+        # Compare with None so that an empty total (0) is cached as well.
+        if self._count is None:
+            self._count = sum(len(qs) for qs in self.querysets)
+        return self._count
+
+    def __len__(self):
+        return self.count()
+
+    def _fetch(self, start, stop):
+        """Return the items at global positions [start, stop) as a list.
+
+        Both bounds must already be normalized to 0 <= bound <= count().
+        """
+        items = []
+        remaining = stop - start
+        if remaining <= 0:
+            return items
+        for qs in self.querysets:
+            size = len(qs)
+            if start >= size:
+                # The requested range begins after this sequence: skip it.
+                start -= size
+                continue
+            # Bound the slice by what is still missing, so skipping earlier
+            # sequences can never make us return too many items.
+            chunk = list(qs[start:start + remaining])
+            items += chunk
+            remaining -= len(chunk)
+            if remaining <= 0:
+                break
+            # Following sequences are read from their beginning.
+            start = 0
+        return items
+
+    def __getitem__(self, item):
+        """Return one item for an integer index, or a list for a slice.
+
+        Raises IndexError for an integer index that is out of range.
+        """
+        total = self.count()
+        try:
+            start, stop, step = item.indices(total)
+        except AttributeError:
+            # Not a slice: treat it as a single integer index.
+            index = item
+            if index < 0:
+                index += total
+            if not 0 <= index < total:
+                raise IndexError('MultiQuerySet index out of range')
+            return self._fetch(index, index + 1)[0]
+        if step == 1:
+            return self._fetch(start, stop)
+        # Stepped slice: fetch the smallest contiguous window covering all
+        # requested positions once, then pick the wanted items from it.
+        wanted = range(start, stop, step)
+        if not wanted:
+            return []
+        low = min(wanted[0], wanted[-1])
+        high = max(wanted[0], wanted[-1]) + 1
+        window = self._fetch(low, high)
+        return [window[i - low] for i in wanted]
+
+
+def truncate_html_words(s, num, end_text='...'):
+    """Cut an HTML string after ``num`` words and close any tags left open.
+
+    Only plain words are counted; text matched as a tag or as an entity is
+    skipped.  The returned text is a prefix of the input, so newlines inside
+    it are kept.  When truncation happens, ``end_text`` (if non-empty) is
+    appended directly after the last kept word, followed by closing tags for
+    the elements still open at that point.
+
+    Returns an empty unicode string when ``num`` is not positive, and the
+    whole (unicode-coerced) input when it has no more than ``num`` words.
+
+    This mirrors django.utils.text.truncate_html_words, except that no space
+    is inserted before ``end_text``.
+    """
+    s = force_unicode(s)
+    length = int(num)
+    if length <= 0:
+        return u''
+    # Elements that never take a closing tag.
+    void_elements = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input')
+    # Matches an entity, a tag, or (captured in group 1) a real word.
+    word_or_markup = re.compile(r'&.*?;|<.*?>|(\w[\w-]*)', re.U)
+    # Splits a tag into: closing slash, element name, self-closing slash.
+    tag_parts = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>')
+    cursor = 0      # where the next search starts
+    cut_at = 0      # end of the last word to keep; 0 until it is found
+    seen = 0        # words counted so far
+    unclosed = []   # open element names, innermost first
+    while seen <= length:
+        match = word_or_markup.search(s, cursor)
+        if match is None:
+            # Reached the end of the string.
+            break
+        cursor = match.end(0)
+        if match.group(1):
+            # A real word, not markup.
+            seen += 1
+            if seen == length:
+                cut_at = cursor
+            continue
+        if cut_at:
+            # Markup past the truncation point does not affect open tags.
+            continue
+        tag = tag_parts.match(match.group(0))
+        if not tag:
+            # Entities and other non-tag matches are ignored.
+            continue
+        is_closing, name, is_self_closing = tag.groups()
+        name = name.lower()  # element names are case-insensitive
+        if is_self_closing or name in void_elements:
+            continue
+        if not is_closing:
+            # Opening tag: remember it in front of the others.
+            unclosed.insert(0, name)
+            continue
+        if name in unclosed:
+            # SGML rule: an end tag also closes every unclosed element that
+            # was opened after its matching start tag.
+            unclosed = unclosed[unclosed.index(name) + 1:]
+    if seen <= length:
+        # Nothing to truncate, so there are no tags to close either.
+        return s
+    out = s[:cut_at]
+    if end_text:
+        out = out + end_text
+    # Close whatever is still open, innermost element first.
+    out = out + ''.join('</%s>' % name for name in unclosed)
+    return out