nicer fragments: no anchors in text, prettier short_text

[wolnelektury.git] / apps / catalogue / utils.py
diff --git a/apps/catalogue/utils.py b/apps/catalogue/utils.py

index 0fdeaf8..185f5fa 100644 (file)
--- a/apps/catalogue/utils.py
+++ b/apps/catalogue/utils.py
@@ -5,6 +5,7 @@
  from __future__ import with_statement
  
  import random
  from __future__ import with_statement
  
  import random
+import re
  import time
  from base64 import urlsafe_b64encode
  
  import time
  from base64 import urlsafe_b64encode
  
@@ -12,6 +13,7 @@ from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpRespons
  from django.core.files.uploadedfile import UploadedFile
  from django.core.files.base import File
  from django.core.files.storage import DefaultStorage
  from django.core.files.uploadedfile import UploadedFile
  from django.core.files.base import File
  from django.core.files.storage import DefaultStorage
+from django.utils.encoding import force_unicode
  from django.utils.hashcompat import sha_constructor
  from django.conf import settings
  from celery.task import task
  from django.utils.hashcompat import sha_constructor
  from django.conf import settings
  from celery.task import task
@@ -51,7 +53,7 @@ class ExistingFile(UploadedFile):
  
      def __init__(self, path, *args, **kwargs):
          self.path = path
  
      def __init__(self, path, *args, **kwargs):
          self.path = path
-        return super(ExistingFile, self).__init__(*args, **kwargs)
+        super(ExistingFile, self).__init__(*args, **kwargs)
  
      def temporary_file_path(self):
          return self.path
  
      def temporary_file_path(self):
          return self.path
@@ -189,3 +191,72 @@ class MultiQuerySet(object):
                      offset = 0
                      stop = total_len - len(items)
                      continue
                      offset = 0
                      stop = total_len - len(items)
                      continue
+
+
+def truncate_html_words(s, num, end_text='...'):
+    """Truncate HTML to at most ``num`` words and close any tags left open.
+
+    Tags, comments, doctypes, processing instructions and character
+    entities are not counted as words.  When the text is cut, ``end_text``
+    is appended directly after the last kept word (no separating space),
+    followed by closing tags for every element that was still open at the
+    cut point.  The kept part is a plain slice of the input, so newlines in
+    the HTML are preserved.
+
+    This is a version of django.utils.text.truncate_html_words with no
+    space before the end_text.
+
+    Arguments:
+        s: the HTML to truncate; it is passed through ``force_unicode``.
+        num: maximum number of words to keep; it is passed through ``int``.
+        end_text: marker appended when truncation happens (default '...');
+            a false value appends nothing.
+
+    Returns the input (as unicode) unchanged when it holds ``num`` words or
+    fewer, an empty string when ``num`` is not positive, and the truncated
+    HTML otherwise.
+    """
+    s = force_unicode(s)
+    length = int(num)
+    if length <= 0:
+        return u''
+    html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input')
+    # Set up regular expressions.
+    # A token is a whole comment (which may itself contain '>'), any other
+    # tag, a well-formed character entity, or -- captured in group 1 -- a
+    # word.  DOTALL lets a tag or comment span several lines.  Entities are
+    # matched strictly so that a bare '&' in running text cannot swallow
+    # the words up to the next ';'.
+    re_words = re.compile(
+        r'<!--.*?-->'
+        r'|<.*?>'
+        r'|&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);'
+        r'|(\w[\w-]*)',
+        re.U | re.S)
+    # Groups: closing slash, tag name, self-closing slash.  Any whitespace
+    # may follow the name, and '<br/>' (no space) counts as self-closing.
+    re_tag = re.compile(r'<(/)?(\S+?)(?:(\s*/)|\s.*?)?>', re.S)
+    # Count non-HTML words and keep note of open tags
+    pos = 0
+    end_text_pos = 0
+    words = 0
+    open_tags = []
+    # Scan one word past the limit: only seeing word number length + 1
+    # proves that the text really has to be truncated.
+    while words <= length:
+        m = re_words.search(s, pos)
+        if not m:
+            # Checked through whole string
+            break
+        pos = m.end(0)
+        if m.group(1):
+            # It's an actual non-HTML word
+            words += 1
+            if words == length:
+                end_text_pos = pos
+            continue
+        token = m.group(0)
+        if end_text_pos or token.startswith((u'<!', u'<?')):
+            # Tags after the truncation point do not matter, and comments,
+            # doctypes and processing instructions never open an element.
+            continue
+        tag = re_tag.match(token)
+        if not tag:
+            # Not a tag at all (e.g. a character entity)
+            continue
+        closing_tag, tagname, self_closing = tag.groups()
+        tagname = tagname.lower()  # Element names are always case-insensitive
+        if self_closing or tagname in html4_singlets:
+            # Void / self-closed elements never need a closing tag
+            continue
+        if closing_tag:
+            # Check for match in open tags list
+            try:
+                i = open_tags.index(tagname)
+            except ValueError:
+                # Stray closing tag without a matching opener: ignore it
+                pass
+            else:
+                # SGML: An end tag closes, back to the matching start tag,
+                # all unclosed intervening start tags with omitted end tags
+                open_tags = open_tags[i + 1:]
+        else:
+            # Add it to the start of the open tags list (innermost first)
+            open_tags.insert(0, tagname)
+    if words <= length:
+        # Don't try to close tags if we don't need to truncate
+        return s
+    out = s[:end_text_pos]
+    if end_text:
+        out += end_text
+    # Close any tags still open, innermost first
+    for tag in open_tags:
+        out += '</%s>' % tag
+    # Return string
+    return out