From: Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
Date: Wed, 25 Jan 2012 12:17:21 +0000 (+0100)
Subject: nicer fragments: no anchors in text, prettier short_text
X-Git-Url: https://git.mdrn.pl/wolnelektury.git/commitdiff_plain/0d6bbe91d75e82866aaa232eee4a036759446a86

nicer fragments: no anchors in text, prettier short_text
---

diff --git a/README.md b/README.md
index 725dc7c2d..fd193b233 100644
--- a/README.md
+++ b/README.md
@@ -116,7 +116,7 @@ django-compress
  - License: [MIT License](http://www.opensource.org/licenses/mit-license.php)
  - Type: library (Django application)
  
- django-chunks
+django-chunks
 -------------
  - Source: [Google Code](http://code.google.com/p/django-chunks/)
  - Authors: Clint Ecker <clintecker@gmail.com>
@@ -131,19 +131,7 @@ django-newtagging
  - Type: library (Django aplication)
  - Notes: Aplication based on  [django-tagging](http://code.google.com/p/django-tagging/), also [MIT](http://www.opensource.org/licenses/mit-license.php) license.
  
-django-piston (0.2.3rc)
-------------------------
- - http://bitbucket.org/jespern/django-piston/wiki/Home
 
-markupstring
-------------
- - Source: [ASPN Cookbook](http://code.activestate.com/recipes/389023/)
- - Authors: Thomas Hinkle
- - License: [MIT License](http://code.activestate.com/help/terms/)
- - Type: library
- - Notes: Patched by Marek StÄpniowski <marek@stepniowski.com> to accept Unicode strings
- 
- 
 Authors
 =======
  
diff --git a/apps/catalogue/models.py b/apps/catalogue/models.py
index a94411191..7f20b4e59 100644
--- a/apps/catalogue/models.py
+++ b/apps/catalogue/models.py
@@ -24,7 +24,7 @@ from django.conf import settings
 from newtagging.models import TagBase, tags_updated
 from newtagging import managers
 from catalogue.fields import JSONField, OverwritingFileField
-from catalogue.utils import create_zip, split_tags
+from catalogue.utils import create_zip, split_tags, truncate_html_words
 from catalogue.tasks import touch_tag, index_book
 from shutil import copy
 from glob import glob
@@ -565,7 +565,6 @@ class Book(models.Model):
 
 
     def build_html(self):
-        from markupstring import MarkupString
         from django.core.files.base import ContentFile
         from slughifi import slughifi
         from librarian import html
@@ -609,10 +608,9 @@ class Book(models.Model):
                     continue
 
                 text = fragment.to_string()
-                short_text = ''
-                markup = MarkupString(text)
-                if (len(markup) > 240):
-                    short_text = unicode(markup[:160])
+                short_text = truncate_html_words(text, 15)
+                if text == short_text:
+                    short_text = ''
                 new_fragment = Fragment.objects.create(anchor=fragment.id, book=self,
                     text=text, short_text=short_text)
 
@@ -1003,6 +1001,10 @@ class Fragment(models.Model):
         for lang, langname in settings.LANGUAGES:
             permanent_cache.delete(cache_key % (self.id, lang))
 
+    def get_short_text(self):
+        """Returns short version of the fragment."""
+        return self.short_text if self.short_text else self.text
+
     def short_html(self):
         if self.id:
             cache_key = "Fragment.short_html/%d/%s" % (self.id, get_language())
diff --git a/apps/catalogue/templatetags/catalogue_tags.py b/apps/catalogue/templatetags/catalogue_tags.py
index c2d7dd6d5..961cc015f 100644
--- a/apps/catalogue/templatetags/catalogue_tags.py
+++ b/apps/catalogue/templatetags/catalogue_tags.py
@@ -363,18 +363,3 @@ def tag_url(category, slug):
     return reverse('catalogue.views.tagged_object_list', args=[
         '/'.join((Tag.categories_dict[category], slug))
     ])
-
-
-@register.filter
-@stringfilter
-def removewholetags(value, tags):
-    """Removes a space separated list of [X]HTML tags from the output.
-
-    FIXME: It makes the assumption the removed tags aren't nested.
-
-    """
-    tags = [re.escape(tag) for tag in tags.split()]
-    tags_re = u'(%s)' % u'|'.join(tags)
-    tag_re = re.compile(ur'<%s[^>]*>.*?</\s*\1\s*>' % tags_re, re.U)
-    value = tag_re.sub(u'', value)
-    return value
diff --git a/apps/catalogue/utils.py b/apps/catalogue/utils.py
index 0fdeaf810..185f5fa34 100644
--- a/apps/catalogue/utils.py
+++ b/apps/catalogue/utils.py
@@ -5,6 +5,7 @@
 from __future__ import with_statement
 
 import random
+import re
 import time
 from base64 import urlsafe_b64encode
 
@@ -12,6 +13,7 @@ from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpRespons
 from django.core.files.uploadedfile import UploadedFile
 from django.core.files.base import File
 from django.core.files.storage import DefaultStorage
+from django.utils.encoding import force_unicode
 from django.utils.hashcompat import sha_constructor
 from django.conf import settings
 from celery.task import task
@@ -51,7 +53,7 @@ class ExistingFile(UploadedFile):
 
     def __init__(self, path, *args, **kwargs):
         self.path = path
-        return super(ExistingFile, self).__init__(*args, **kwargs)
+        super(ExistingFile, self).__init__(*args, **kwargs)
 
     def temporary_file_path(self):
         return self.path
@@ -189,3 +191,72 @@ class MultiQuerySet(object):
                     offset = 0
                     stop = total_len - len(items)
                     continue
+
+
+def truncate_html_words(s, num, end_text='...'):
+    """Truncates HTML to a certain number of words (not counting tags and
+    comments). Closes opened tags if they were correctly closed in the given
+    html. Takes an optional argument of what should be used to notify that the
+    string has been truncated, defaulting to ellipsis (...).
+
+    Newlines in the HTML are preserved.
+
+    This is just a version of django.utils.text.truncate_html_words with no space before the end_text.
+    """
+    s = force_unicode(s)
+    length = int(num)
+    if length <= 0:
+        return u''
+    html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input')
+    # Set up regular expressions
+    re_words = re.compile(r'&.*?;|<.*?>|(\w[\w-]*)', re.U)
+    re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>')
+    # Count non-HTML words and keep note of open tags
+    pos = 0
+    end_text_pos = 0
+    words = 0
+    open_tags = []
+    while words <= length:
+        m = re_words.search(s, pos)
+        if not m:
+            # Checked through whole string
+            break
+        pos = m.end(0)
+        if m.group(1):
+            # It's an actual non-HTML word
+            words += 1
+            if words == length:
+                end_text_pos = pos
+            continue
+        # Check for tag
+        tag = re_tag.match(m.group(0))
+        if not tag or end_text_pos:
+            # Don't worry about non tags or tags after our truncate point
+            continue
+        closing_tag, tagname, self_closing = tag.groups()
+        tagname = tagname.lower()  # Element names are always case-insensitive
+        if self_closing or tagname in html4_singlets:
+            pass
+        elif closing_tag:
+            # Check for match in open tags list
+            try:
+                i = open_tags.index(tagname)
+            except ValueError:
+                pass
+            else:
+                # SGML: An end tag closes, back to the matching start tag, all unclosed intervening start tags with omitted end tags
+                open_tags = open_tags[i+1:]
+        else:
+            # Add it to the start of the open tags list
+            open_tags.insert(0, tagname)
+    if words <= length:
+        # Don't try to close tags if we don't need to truncate
+        return s
+    out = s[:end_text_pos]
+    if end_text:
+        out += end_text
+    # Close any tags still open
+    for tag in open_tags:
+        out += '</%s>' % tag
+    # Return string
+    return out
diff --git a/apps/search/templatetags/search_tags.py b/apps/search/templatetags/search_tags.py
index 32d5b64c5..06631b2dc 100644
--- a/apps/search/templatetags/search_tags.py
+++ b/apps/search/templatetags/search_tags.py
@@ -46,4 +46,5 @@ def book_searched(context, result):
         'book': book,
         'request': context.get('request'),
         'hits': hits,
+        'main_link': book.get_absolute_url(),
     }
diff --git a/lib/librarian b/lib/librarian
index e394602de..05843e29b 160000
--- a/lib/librarian
+++ b/lib/librarian
@@ -1 +1 @@
-Subproject commit e394602de9243608d1e99a3de448a75646f1a77f
+Subproject commit 05843e29b4fffcc676da0e67b7a840a24d7b91d4
diff --git a/lib/markupstring.py b/lib/markupstring.py
deleted file mode 100644
index 0e273f2a1..000000000
--- a/lib/markupstring.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# Code taken from ActiveState Python recipes:
-# http://code.activestate.com/recipes/389023/
-#
-# Changed by Marek Stepniowski <marek@stepniowski.com> to handle unicode characters
-import xml.sax
-
-
-class simpleHandler(xml.sax.ContentHandler):
-    """A simple handler that provides us with indices of marked up content."""
-    def __init__(self):
-        self.elements = [] #this will contain a list of elements and their start/end indices
-        self.open_elements = [] #this holds info on open elements while we wait for their close
-        self.content = ""
-
-    def startElement(self, name, attrs):
-        if name == 'foobar': return # we require an outer wrapper, which we promptly ignore.
-        self.open_elements.append({'name':name,
-                                   'attrs':attrs.copy(),
-                                   'start':len(self.content),
-                                   })
-
-    def endElement(self, name):
-        if name == 'foobar': return # we require an outer wrapper, which we promptly ignore.
-        for i in range(len(self.open_elements)):
-            e = self.open_elements[i]
-            if e['name'] == name:
-                # append a  (start,end), name, attrs
-                self.elements.append(((e['start'], #start position
-                                       len(self.content)), # current (end) position
-                                      e['name'], e['attrs'])
-                                     )
-                del self.open_elements[i]
-                return
-
-    def characters(self, chunk):
-        self.content += chunk
-
-
-class MarkupString(unicode):
-    """A simple class for dealing with marked up strings. When we are sliced, we return
-    valid marked up strings, preserving markup."""
-    def __init__(self, string):
-        unicode.__init__(self)
-        self.handler = simpleHandler()
-        xml.sax.parseString((u"<foobar>%s</foobar>" % string).encode('utf-8'), self.handler)
-        self.raw = self.handler.content
-
-    def __getitem__(self, n):
-        return self.__getslice__(n, n + 1)
-
-    def __getslice__(self, s, e):
-        # only include relevant elements
-        if not e or e > len(self.raw): e = len(self.raw)
-        elements = filter(lambda tp: (tp[0][1] >= s and # end after the start...
-                                      tp[0][0] <= e # and start before the end
-                                      ),
-                          self.handler.elements)
-        ends = {}
-        starts = {}
-        for el in elements:
-            # cycle through elements that effect our slice and keep track of
-            # where their start and end tags should go.
-            pos = el[0]
-            name = el[1]
-            attrs = el[2]
-            # write our start tag <stag att="val"...>
-            stag = "<%s" % name
-            for k, v in attrs.items(): stag += " %s=%s" % (k, xml.sax.saxutils.quoteattr(v))
-            stag += ">"
-            etag = "</%s>" % name # simple end tag
-            spos = pos[0]
-            epos = pos[1]
-            if spos < s: spos = s
-            if epos > e: epos = e
-            if epos != spos: # we don't care about tags that don't markup any text
-                if not starts.has_key(spos): starts[spos] = []
-                starts[spos].append(stag)
-                if not ends.has_key(epos): ends[epos] = []
-                ends[epos].append(etag)
-        outbuf = "" # our actual output string
-        for pos in range(s, e): # we move through positions
-            char = self.raw[pos]
-            if ends.has_key(pos):  # if there are endtags to insert...
-                for et in ends[pos]: outbuf += et
-            if starts.has_key(pos): # if there are start tags to insert
-                mystarts = starts[pos]
-                # reverse these so the order works out,e.g. <i><b><u></u></b></i>
-                mystarts.reverse()
-                for st in mystarts: outbuf += st
-            outbuf += char
-        if ends.has_key(e):
-            for et in ends[e]: outbuf += et
-        return MarkupString(outbuf)
-
-    def __len__(self):
-        return len(self.raw)
-
diff --git a/wolnelektury/templates/catalogue/book_searched.html b/wolnelektury/templates/catalogue/book_searched.html
index 4b92c19d2..722e3233f 100644
--- a/wolnelektury/templates/catalogue/book_searched.html
+++ b/wolnelektury/templates/catalogue/book_searched.html
@@ -16,7 +16,7 @@
     <p>{% trans "In fragment" %}
       {% if hit.themes_hit %}{% trans ", for themes:" %}{% for t in hit.themes_hit %}{{t.name}} {% endfor %}{% endif %}
     </p>
-    <a href="{{hit.fragment.get_absolute_url}}">{{hit.fragment.short_text|safe}}</a>
+    <a href="{{hit.fragment.get_absolute_url}}">{{hit.fragment.get_short_text|safe}}</a>
   </div>
   {% endif %}
   {% endif %}
diff --git a/wolnelektury/templates/catalogue/fragment_promo.html b/wolnelektury/templates/catalogue/fragment_promo.html
index a3b01be16..979fff4f8 100755
--- a/wolnelektury/templates/catalogue/fragment_promo.html
+++ b/wolnelektury/templates/catalogue/fragment_promo.html
@@ -3,7 +3,7 @@
 {% if fragment %}
 <a href="{{ fragment.get_absolute_url }}" class="cite">
     <blockquote class="cite-body">
-        {{ fragment.text|removewholetags:"a"|truncatewords_html:15|safe }}
+        {{ fragment.get_short_text|safe }}
     </blockquote>
     <p class="mono">{{ fragment.book.pretty_title }}</p>
 </a>