nicer fragments: no anchors in text, prettier short_text

author Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>

Wed, 25 Jan 2012 12:17:21 +0000 (13:17 +0100)

committer Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>

Wed, 25 Jan 2012 12:17:21 +0000 (13:17 +0100)
author Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
Wed, 25 Jan 2012 12:17:21 +0000 (13:17 +0100)
committer Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
Wed, 25 Jan 2012 12:17:21 +0000 (13:17 +0100)
diff --git a/README.md b/README.md

index 725dc7c..fd193b2 100644 (file)
--- a/README.md
+++ b/README.md
@@ -116,7 +116,7 @@ django-compress
   - License: [MIT License](http://www.opensource.org/licenses/mit-license.php)
   - Type: library (Django application)
   
- django-chunks
+django-chunks
  -------------
   - Source: [Google Code](http://code.google.com/p/django-chunks/)
   - Authors: Clint Ecker <clintecker@gmail.com>
@@ -131,19 +131,7 @@ django-newtagging
   - Type: library (Django aplication)
   - Notes: Aplication based on  [django-tagging](http://code.google.com/p/django-tagging/), also [MIT](http://www.opensource.org/licenses/mit-license.php) license.
   
-django-piston (0.2.3rc)
-------------------------
- - http://bitbucket.org/jespern/django-piston/wiki/Home
  
-markupstring
-------------
- - Source: [ASPN Cookbook](http://code.activestate.com/recipes/389023/)
- - Authors: Thomas Hinkle
- - License: [MIT License](http://code.activestate.com/help/terms/)
- - Type: library
- - Notes: Patched by Marek Stępniowski <marek@stepniowski.com> to accept Unicode strings
- 
- 
  Authors
  =======
   
diff --git a/apps/catalogue/models.py b/apps/catalogue/models.py

index a944111..7f20b4e 100644 (file)
--- a/apps/catalogue/models.py
+++ b/apps/catalogue/models.py
@@ -24,7 +24,7 @@ from django.conf import settings
  from newtagging.models import TagBase, tags_updated
  from newtagging import managers
  from catalogue.fields import JSONField, OverwritingFileField
-from catalogue.utils import create_zip, split_tags
+from catalogue.utils import create_zip, split_tags, truncate_html_words
  from catalogue.tasks import touch_tag, index_book
  from shutil import copy
  from glob import glob
@@ -565,7 +565,6 @@ class Book(models.Model):
  
  
      def build_html(self):
-        from markupstring import MarkupString
          from django.core.files.base import ContentFile
          from slughifi import slughifi
          from librarian import html
@@ -609,10 +608,9 @@ class Book(models.Model):
                      continue
  
                  text = fragment.to_string()
-                short_text = ''
-                markup = MarkupString(text)
-                if (len(markup) > 240):
-                    short_text = unicode(markup[:160])
+                short_text = truncate_html_words(text, 15)
+                if text == short_text:
+                    short_text = ''
                  new_fragment = Fragment.objects.create(anchor=fragment.id, book=self,
                      text=text, short_text=short_text)
  
@@ -1003,6 +1001,10 @@ class Fragment(models.Model):
          for lang, langname in settings.LANGUAGES:
              permanent_cache.delete(cache_key % (self.id, lang))
  
+    def get_short_text(self):
+        """Return the stored short_text, or the full text when short_text is empty."""
+        return self.short_text or self.text
+
      def short_html(self):
          if self.id:
              cache_key = "Fragment.short_html/%d/%s" % (self.id, get_language())
diff --git a/apps/catalogue/templatetags/catalogue_tags.py b/apps/catalogue/templatetags/catalogue_tags.py

index c2d7dd6..961cc01 100644 (file)
--- a/apps/catalogue/templatetags/catalogue_tags.py
+++ b/apps/catalogue/templatetags/catalogue_tags.py
@@ -363,18 +363,3 @@ def tag_url(category, slug):
      return reverse('catalogue.views.tagged_object_list', args=[
          '/'.join((Tag.categories_dict[category], slug))
      ])
-
-
-@register.filter
-@stringfilter
-def removewholetags(value, tags):
-    """Removes a space separated list of [X]HTML tags from the output.
-
-    FIXME: It makes the assumption the removed tags aren't nested.
-
-    """
-    tags = [re.escape(tag) for tag in tags.split()]
-    tags_re = u'(%s)' % u'|'.join(tags)
-    tag_re = re.compile(ur'<%s[^>]*>.*?</\s*\1\s*>' % tags_re, re.U)
-    value = tag_re.sub(u'', value)
-    return value
diff --git a/apps/catalogue/utils.py b/apps/catalogue/utils.py

index 0fdeaf8..185f5fa 100644 (file)
--- a/apps/catalogue/utils.py
+++ b/apps/catalogue/utils.py
@@ -5,6 +5,7 @@
  from __future__ import with_statement
  
  import random
+import re
  import time
  from base64 import urlsafe_b64encode
  
@@ -12,6 +13,7 @@ from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpRespons
  from django.core.files.uploadedfile import UploadedFile
  from django.core.files.base import File
  from django.core.files.storage import DefaultStorage
+from django.utils.encoding import force_unicode
  from django.utils.hashcompat import sha_constructor
  from django.conf import settings
  from celery.task import task
@@ -51,7 +53,7 @@ class ExistingFile(UploadedFile):
  
      def __init__(self, path, *args, **kwargs):
          self.path = path
-        return super(ExistingFile, self).__init__(*args, **kwargs)
+        super(ExistingFile, self).__init__(*args, **kwargs)
  
      def temporary_file_path(self):
          return self.path
@@ -189,3 +191,72 @@ class MultiQuerySet(object):
                      offset = 0
                      stop = total_len - len(items)
                      continue
+
+
+def truncate_html_words(s, num, end_text='...'):
+    """Truncate HTML to at most `num` words; tags, comments and entities are not counted.
+
+    Elements still open at the cut are closed again in the result. `end_text`
+    (default '...') is appended directly after the last kept word, with no
+    space before it; otherwise this follows django.utils.text.truncate_html_words.
+    Returns u'' when `num` <= 0, and the input (as unicode) unchanged when it
+    holds no more than `num` words. Newlines in the HTML are preserved.
+    """
+    s = force_unicode(s)
+    length = int(num)
+    if length <= 0:
+        return u''
+    html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input')
+    # re_tag group 3 catches the slash of both <x/> and <x />, so the name never swallows it
+    re_words = re.compile(r'&.*?;|<.*?>|(\w[\w-]*)', re.U)
+    re_tag = re.compile(r'<(/)?(\S+?)(?:(\s*/)|\s.*?)?>')
+    # Count non-HTML words and keep note of open tags
+    pos = 0
+    end_text_pos = 0
+    words = 0
+    open_tags = []
+    while words <= length:
+        m = re_words.search(s, pos)
+        if not m:
+            # Checked through whole string
+            break
+        pos = m.end(0)
+        if m.group(1):
+            # It's an actual non-HTML word
+            words += 1
+            if words == length:
+                end_text_pos = pos
+            continue
+        # Check for tag
+        tag = re_tag.match(m.group(0))
+        if not tag or end_text_pos:
+            # Don't worry about non tags or tags after our truncate point
+            continue
+        closing_tag, tagname, self_closing = tag.groups()
+        tagname = tagname.lower()  # Element names are always case-insensitive
+        if self_closing or tagname in html4_singlets or tagname[0] in '!?':
+            pass  # void element, or comment/doctype/processing instruction: nothing to close
+        elif closing_tag:
+            # Check for match in open tags list
+            try:
+                i = open_tags.index(tagname)
+            except ValueError:
+                pass
+            else:
+                # SGML: An end tag closes, back to the matching start tag, all unclosed intervening start tags with omitted end tags
+                open_tags = open_tags[i+1:]
+        else:
+            # Add it to the start of the open tags list
+            open_tags.insert(0, tagname)
+    if words <= length:
+        # Don't try to close tags if we don't need to truncate
+        return s
+    out = s[:end_text_pos]
+    if end_text:
+        out += end_text
+    # Close any tags still open
+    for tag in open_tags:
+        out += '</%s>' % tag
+    # Return string
+    return out
diff --git a/apps/search/templatetags/search_tags.py b/apps/search/templatetags/search_tags.py

index 32d5b64..06631b2 100644 (file)
--- a/apps/search/templatetags/search_tags.py
+++ b/apps/search/templatetags/search_tags.py
@@ -46,4 +46,5 @@ def book_searched(context, result):
          'book': book,
          'request': context.get('request'),
          'hits': hits,
+        'main_link': book.get_absolute_url(),
      }
diff --git a/lib/librarian b/lib/librarian

index e394602..05843e2 160000 (submodule)
--- a/lib/librarian
+++ b/lib/librarian
@@ -1 +1 @@
-Subproject commit e394602de9243608d1e99a3de448a75646f1a77f
+Subproject commit 05843e29b4fffcc676da0e67b7a840a24d7b91d4
diff --git a/lib/markupstring.py b/lib/markupstring.py

deleted file mode 100644 (file)

index 0e273f2..0000000
--- a/lib/markupstring.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# Code taken from ActiveState Python recipes:
-# http://code.activestate.com/recipes/389023/
-#
-# Changed by Marek Stepniowski <marek@stepniowski.com> to handle unicode characters
-import xml.sax
-
-
-class simpleHandler(xml.sax.ContentHandler):
-    """A simple handler that provides us with indices of marked up content."""
-    def __init__(self):
-        self.elements = [] #this will contain a list of elements and their start/end indices
-        self.open_elements = [] #this holds info on open elements while we wait for their close
-        self.content = ""
-
-    def startElement(self, name, attrs):
-        if name == 'foobar': return # we require an outer wrapper, which we promptly ignore.
-        self.open_elements.append({'name':name,
-                                   'attrs':attrs.copy(),
-                                   'start':len(self.content),
-                                   })
-
-    def endElement(self, name):
-        if name == 'foobar': return # we require an outer wrapper, which we promptly ignore.
-        for i in range(len(self.open_elements)):
-            e = self.open_elements[i]
-            if e['name'] == name:
-                # append a  (start,end), name, attrs
-                self.elements.append(((e['start'], #start position
-                                       len(self.content)), # current (end) position
-                                      e['name'], e['attrs'])
-                                     )
-                del self.open_elements[i]
-                return
-
-    def characters(self, chunk):
-        self.content += chunk
-
-
-class MarkupString(unicode):
-    """A simple class for dealing with marked up strings. When we are sliced, we return
-    valid marked up strings, preserving markup."""
-    def __init__(self, string):
-        unicode.__init__(self)
-        self.handler = simpleHandler()
-        xml.sax.parseString((u"<foobar>%s</foobar>" % string).encode('utf-8'), self.handler)
-        self.raw = self.handler.content
-
-    def __getitem__(self, n):
-        return self.__getslice__(n, n + 1)
-
-    def __getslice__(self, s, e):
-        # only include relevant elements
-        if not e or e > len(self.raw): e = len(self.raw)
-        elements = filter(lambda tp: (tp[0][1] >= s and # end after the start...
-                                      tp[0][0] <= e # and start before the end
-                                      ),
-                          self.handler.elements)
-        ends = {}
-        starts = {}
-        for el in elements:
-            # cycle through elements that effect our slice and keep track of
-            # where their start and end tags should go.
-            pos = el[0]
-            name = el[1]
-            attrs = el[2]
-            # write our start tag <stag att="val"...>
-            stag = "<%s" % name
-            for k, v in attrs.items(): stag += " %s=%s" % (k, xml.sax.saxutils.quoteattr(v))
-            stag += ">"
-            etag = "</%s>" % name # simple end tag
-            spos = pos[0]
-            epos = pos[1]
-            if spos < s: spos = s
-            if epos > e: epos = e
-            if epos != spos: # we don't care about tags that don't markup any text
-                if not starts.has_key(spos): starts[spos] = []
-                starts[spos].append(stag)
-                if not ends.has_key(epos): ends[epos] = []
-                ends[epos].append(etag)
-        outbuf = "" # our actual output string
-        for pos in range(s, e): # we move through positions
-            char = self.raw[pos]
-            if ends.has_key(pos):  # if there are endtags to insert...
-                for et in ends[pos]: outbuf += et
-            if starts.has_key(pos): # if there are start tags to insert
-                mystarts = starts[pos]
-                # reverse these so the order works out,e.g. <i><b><u></u></b></i>
-                mystarts.reverse()
-                for st in mystarts: outbuf += st
-            outbuf += char
-        if ends.has_key(e):
-            for et in ends[e]: outbuf += et
-        return MarkupString(outbuf)
-
-    def __len__(self):
-        return len(self.raw)
-
diff --git a/wolnelektury/templates/catalogue/book_searched.html b/wolnelektury/templates/catalogue/book_searched.html

index 4b92c19..722e323 100644 (file)
--- a/wolnelektury/templates/catalogue/book_searched.html
+++ b/wolnelektury/templates/catalogue/book_searched.html
@@ -16,7 +16,7 @@
      <p>{% trans "In fragment" %}
        {% if hit.themes_hit %}{% trans ", for themes:" %}{% for t in hit.themes_hit %}{{t.name}} {% endfor %}{% endif %}
      </p>
-    <a href="{{hit.fragment.get_absolute_url}}">{{hit.fragment.short_text|safe}}</a>
+    <a href="{{hit.fragment.get_absolute_url}}">{{hit.fragment.get_short_text|safe}}</a>
    </div>
    {% endif %}
    {% endif %}
diff --git a/wolnelektury/templates/catalogue/fragment_promo.html b/wolnelektury/templates/catalogue/fragment_promo.html

index a3b01be..979fff4 100755 (executable)
--- a/wolnelektury/templates/catalogue/fragment_promo.html
+++ b/wolnelektury/templates/catalogue/fragment_promo.html
@@ -3,7 +3,7 @@
  {% if fragment %}
  <a href="{{ fragment.get_absolute_url }}" class="cite">
      <blockquote class="cite-body">
-        {{ fragment.text|removewholetags:"a"|truncatewords_html:15|safe }}
+        {{ fragment.get_short_text|safe }}
      </blockquote>
      <p class="mono">{{ fragment.book.pretty_title }}</p>
  </a>
author	Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
	Wed, 25 Jan 2012 12:17:21 +0000 (13:17 +0100)
committer	Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
	Wed, 25 Jan 2012 12:17:21 +0000 (13:17 +0100)
README.md		patch \| blob \| history
apps/catalogue/models.py		patch \| blob \| history
apps/catalogue/templatetags/catalogue_tags.py		patch \| blob \| history
apps/catalogue/utils.py		patch \| blob \| history
apps/search/templatetags/search_tags.py		patch \| blob \| history
lib/librarian		patch \| blob \| history
lib/markupstring.py	[deleted file]	patch \| blob \| history
wolnelektury/templates/catalogue/book_searched.html		patch \| blob \| history
wolnelektury/templates/catalogue/fragment_promo.html		patch \| blob \| history