nicer fragments: no anchors in text, prettier short_text

author Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>

Wed, 25 Jan 2012 12:17:21 +0000 (13:17 +0100)

committer Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>

Wed, 25 Jan 2012 12:17:21 +0000 (13:17 +0100)
author Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
Wed, 25 Jan 2012 12:17:21 +0000 (13:17 +0100)
committer Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
Wed, 25 Jan 2012 12:17:21 +0000 (13:17 +0100)
diff --git a/README.md b/README.md

index 725dc7c..fd193b2 100644 (file)
--- a/README.md
+++ b/README.md
@@ -116,7 +116,7 @@ django-compress
   - License: [MIT License](http://www.opensource.org/licenses/mit-license.php)
   - Type: library (Django application)
   
   - License: [MIT License](http://www.opensource.org/licenses/mit-license.php)
   - Type: library (Django application)
   
- django-chunks
+django-chunks
  -------------
   - Source: [Google Code](http://code.google.com/p/django-chunks/)
   - Authors: Clint Ecker <clintecker@gmail.com>
  -------------
   - Source: [Google Code](http://code.google.com/p/django-chunks/)
   - Authors: Clint Ecker <clintecker@gmail.com>
@@ -131,19 +131,7 @@ django-newtagging
   - Type: library (Django aplication)
   - Notes: Aplication based on  [django-tagging](http://code.google.com/p/django-tagging/), also [MIT](http://www.opensource.org/licenses/mit-license.php) license.
   
   - Type: library (Django aplication)
   - Notes: Aplication based on  [django-tagging](http://code.google.com/p/django-tagging/), also [MIT](http://www.opensource.org/licenses/mit-license.php) license.
   
-django-piston (0.2.3rc)
-------------------------
- - http://bitbucket.org/jespern/django-piston/wiki/Home
  
  
-markupstring
-------------
- - Source: [ASPN Cookbook](http://code.activestate.com/recipes/389023/)
- - Authors: Thomas Hinkle
- - License: [MIT License](http://code.activestate.com/help/terms/)
- - Type: library
- - Notes: Patched by Marek Stępniowski <marek@stepniowski.com> to accept Unicode strings
- 
- 
  Authors
  =======
   
  Authors
  =======
   
diff --git a/apps/catalogue/models.py b/apps/catalogue/models.py

index a944111..7f20b4e 100644 (file)
--- a/apps/catalogue/models.py
+++ b/apps/catalogue/models.py
@@ -24,7 +24,7 @@ from django.conf import settings
  from newtagging.models import TagBase, tags_updated
  from newtagging import managers
  from catalogue.fields import JSONField, OverwritingFileField
  from newtagging.models import TagBase, tags_updated
  from newtagging import managers
  from catalogue.fields import JSONField, OverwritingFileField
-from catalogue.utils import create_zip, split_tags
+from catalogue.utils import create_zip, split_tags, truncate_html_words
  from catalogue.tasks import touch_tag, index_book
  from shutil import copy
  from glob import glob
  from catalogue.tasks import touch_tag, index_book
  from shutil import copy
  from glob import glob
@@ -565,7 +565,6 @@ class Book(models.Model):
  
  
      def build_html(self):
  
  
      def build_html(self):
-        from markupstring import MarkupString
          from django.core.files.base import ContentFile
          from slughifi import slughifi
          from librarian import html
          from django.core.files.base import ContentFile
          from slughifi import slughifi
          from librarian import html
@@ -609,10 +608,9 @@ class Book(models.Model):
                      continue
  
                  text = fragment.to_string()
                      continue
  
                  text = fragment.to_string()
-                short_text = ''
-                markup = MarkupString(text)
-                if (len(markup) > 240):
-                    short_text = unicode(markup[:160])
+                short_text = truncate_html_words(text, 15)
+                if text == short_text:
+                    short_text = ''
                  new_fragment = Fragment.objects.create(anchor=fragment.id, book=self,
                      text=text, short_text=short_text)
  
                  new_fragment = Fragment.objects.create(anchor=fragment.id, book=self,
                      text=text, short_text=short_text)
  
@@ -1003,6 +1001,10 @@ class Fragment(models.Model):
          for lang, langname in settings.LANGUAGES:
              permanent_cache.delete(cache_key % (self.id, lang))
  
          for lang, langname in settings.LANGUAGES:
              permanent_cache.delete(cache_key % (self.id, lang))
  
+    def get_short_text(self):
+        """Returns short version of the fragment."""
+        return self.short_text if self.short_text else self.text
+
      def short_html(self):
          if self.id:
              cache_key = "Fragment.short_html/%d/%s" % (self.id, get_language())
      def short_html(self):
          if self.id:
              cache_key = "Fragment.short_html/%d/%s" % (self.id, get_language())
diff --git a/apps/catalogue/templatetags/catalogue_tags.py b/apps/catalogue/templatetags/catalogue_tags.py

index c2d7dd6..961cc01 100644 (file)
--- a/apps/catalogue/templatetags/catalogue_tags.py
+++ b/apps/catalogue/templatetags/catalogue_tags.py
@@ -363,18 +363,3 @@ def tag_url(category, slug):
      return reverse('catalogue.views.tagged_object_list', args=[
          '/'.join((Tag.categories_dict[category], slug))
      ])
      return reverse('catalogue.views.tagged_object_list', args=[
          '/'.join((Tag.categories_dict[category], slug))
      ])
-
-
-@register.filter
-@stringfilter
-def removewholetags(value, tags):
-    """Removes a space separated list of [X]HTML tags from the output.
-
-    FIXME: It makes the assumption the removed tags aren't nested.
-
-    """
-    tags = [re.escape(tag) for tag in tags.split()]
-    tags_re = u'(%s)' % u'|'.join(tags)
-    tag_re = re.compile(ur'<%s[^>]*>.*?</\s*\1\s*>' % tags_re, re.U)
-    value = tag_re.sub(u'', value)
-    return value
diff --git a/apps/catalogue/utils.py b/apps/catalogue/utils.py

index 0fdeaf8..185f5fa 100644 (file)
--- a/apps/catalogue/utils.py
+++ b/apps/catalogue/utils.py
@@ -5,6 +5,7 @@
  from __future__ import with_statement
  
  import random
  from __future__ import with_statement
  
  import random
+import re
  import time
  from base64 import urlsafe_b64encode
  
  import time
  from base64 import urlsafe_b64encode
  
@@ -12,6 +13,7 @@ from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpRespons
  from django.core.files.uploadedfile import UploadedFile
  from django.core.files.base import File
  from django.core.files.storage import DefaultStorage
  from django.core.files.uploadedfile import UploadedFile
  from django.core.files.base import File
  from django.core.files.storage import DefaultStorage
+from django.utils.encoding import force_unicode
  from django.utils.hashcompat import sha_constructor
  from django.conf import settings
  from celery.task import task
  from django.utils.hashcompat import sha_constructor
  from django.conf import settings
  from celery.task import task
@@ -51,7 +53,7 @@ class ExistingFile(UploadedFile):
  
      def __init__(self, path, *args, **kwargs):
          self.path = path
  
      def __init__(self, path, *args, **kwargs):
          self.path = path
-        return super(ExistingFile, self).__init__(*args, **kwargs)
+        super(ExistingFile, self).__init__(*args, **kwargs)
  
      def temporary_file_path(self):
          return self.path
  
      def temporary_file_path(self):
          return self.path
@@ -189,3 +191,72 @@ class MultiQuerySet(object):
                      offset = 0
                      stop = total_len - len(items)
                      continue
                      offset = 0
                      stop = total_len - len(items)
                      continue
+
+
+def truncate_html_words(s, num, end_text='...'):
+    """Truncates HTML to a certain number of words (not counting tags and
+    comments). Closes opened tags if they were correctly closed in the given
+    html. Takes an optional argument of what should be used to notify that the
+    string has been truncated, defaulting to ellipsis (...).
+
+    Newlines in the HTML are preserved.
+
+    This is just a version of django.utils.text.truncate_html_words with no space before the end_text.
+    """
+    s = force_unicode(s)
+    length = int(num)
+    if length <= 0:
+        return u''
+    html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input')
+    # Set up regular expressions
+    re_words = re.compile(r'&.*?;|<.*?>|(\w[\w-]*)', re.U)
+    re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>')
+    # Count non-HTML words and keep note of open tags
+    pos = 0
+    end_text_pos = 0
+    words = 0
+    open_tags = []
+    while words <= length:
+        m = re_words.search(s, pos)
+        if not m:
+            # Checked through whole string
+            break
+        pos = m.end(0)
+        if m.group(1):
+            # It's an actual non-HTML word
+            words += 1
+            if words == length:
+                end_text_pos = pos
+            continue
+        # Check for tag
+        tag = re_tag.match(m.group(0))
+        if not tag or end_text_pos:
+            # Don't worry about non tags or tags after our truncate point
+            continue
+        closing_tag, tagname, self_closing = tag.groups()
+        tagname = tagname.lower()  # Element names are always case-insensitive
+        if self_closing or tagname in html4_singlets:
+            pass
+        elif closing_tag:
+            # Check for match in open tags list
+            try:
+                i = open_tags.index(tagname)
+            except ValueError:
+                pass
+            else:
+                # SGML: An end tag closes, back to the matching start tag, all unclosed intervening start tags with omitted end tags
+                open_tags = open_tags[i+1:]
+        else:
+            # Add it to the start of the open tags list
+            open_tags.insert(0, tagname)
+    if words <= length:
+        # Don't try to close tags if we don't need to truncate
+        return s
+    out = s[:end_text_pos]
+    if end_text:
+        out += end_text
+    # Close any tags still open
+    for tag in open_tags:
+        out += '</%s>' % tag
+    # Return string
+    return out
diff --git a/apps/search/templatetags/search_tags.py b/apps/search/templatetags/search_tags.py

index 32d5b64..06631b2 100644 (file)
--- a/apps/search/templatetags/search_tags.py
+++ b/apps/search/templatetags/search_tags.py
@@ -46,4 +46,5 @@ def book_searched(context, result):
          'book': book,
          'request': context.get('request'),
          'hits': hits,
          'book': book,
          'request': context.get('request'),
          'hits': hits,
+        'main_link': book.get_absolute_url(),
      }
      }
diff --git a/lib/librarian b/lib/librarian

index e394602..05843e2 160000 (submodule)
--- a/lib/librarian
+++ b/lib/librarian
@@ -1 +1 @@
-Subproject commit e394602de9243608d1e99a3de448a75646f1a77f
+Subproject commit 05843e29b4fffcc676da0e67b7a840a24d7b91d4
diff --git a/lib/markupstring.py b/lib/markupstring.py

deleted file mode 100644 (file)

index 0e273f2..0000000
--- a/lib/markupstring.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# Code taken from ActiveState Python recipes:
-# http://code.activestate.com/recipes/389023/
-#
-# Changed by Marek Stepniowski <marek@stepniowski.com> to handle unicode characters
-import xml.sax
-
-
-class simpleHandler(xml.sax.ContentHandler):
-    """A simple handler that provides us with indices of marked up content."""
-    def __init__(self):
-        self.elements = [] #this will contain a list of elements and their start/end indices
-        self.open_elements = [] #this holds info on open elements while we wait for their close
-        self.content = ""
-
-    def startElement(self, name, attrs):
-        if name == 'foobar': return # we require an outer wrapper, which we promptly ignore.
-        self.open_elements.append({'name':name,
-                                   'attrs':attrs.copy(),
-                                   'start':len(self.content),
-                                   })
-
-    def endElement(self, name):
-        if name == 'foobar': return # we require an outer wrapper, which we promptly ignore.
-        for i in range(len(self.open_elements)):
-            e = self.open_elements[i]
-            if e['name'] == name:
-                # append a  (start,end), name, attrs
-                self.elements.append(((e['start'], #start position
-                                       len(self.content)), # current (end) position
-                                      e['name'], e['attrs'])
-                                     )
-                del self.open_elements[i]
-                return
-
-    def characters(self, chunk):
-        self.content += chunk
-
-
-class MarkupString(unicode):
-    """A simple class for dealing with marked up strings. When we are sliced, we return
-    valid marked up strings, preserving markup."""
-    def __init__(self, string):
-        unicode.__init__(self)
-        self.handler = simpleHandler()
-        xml.sax.parseString((u"<foobar>%s</foobar>" % string).encode('utf-8'), self.handler)
-        self.raw = self.handler.content
-
-    def __getitem__(self, n):
-        return self.__getslice__(n, n + 1)
-
-    def __getslice__(self, s, e):
-        # only include relevant elements
-        if not e or e > len(self.raw): e = len(self.raw)
-        elements = filter(lambda tp: (tp[0][1] >= s and # end after the start...
-                                      tp[0][0] <= e # and start before the end
-                                      ),
-                          self.handler.elements)
-        ends = {}
-        starts = {}
-        for el in elements:
-            # cycle through elements that effect our slice and keep track of
-            # where their start and end tags should go.
-            pos = el[0]
-            name = el[1]
-            attrs = el[2]
-            # write our start tag <stag att="val"...>
-            stag = "<%s" % name
-            for k, v in attrs.items(): stag += " %s=%s" % (k, xml.sax.saxutils.quoteattr(v))
-            stag += ">"
-            etag = "</%s>" % name # simple end tag
-            spos = pos[0]
-            epos = pos[1]
-            if spos < s: spos = s
-            if epos > e: epos = e
-            if epos != spos: # we don't care about tags that don't markup any text
-                if not starts.has_key(spos): starts[spos] = []
-                starts[spos].append(stag)
-                if not ends.has_key(epos): ends[epos] = []
-                ends[epos].append(etag)
-        outbuf = "" # our actual output string
-        for pos in range(s, e): # we move through positions
-            char = self.raw[pos]
-            if ends.has_key(pos):  # if there are endtags to insert...
-                for et in ends[pos]: outbuf += et
-            if starts.has_key(pos): # if there are start tags to insert
-                mystarts = starts[pos]
-                # reverse these so the order works out,e.g. <i><b><u></u></b></i>
-                mystarts.reverse()
-                for st in mystarts: outbuf += st
-            outbuf += char
-        if ends.has_key(e):
-            for et in ends[e]: outbuf += et
-        return MarkupString(outbuf)
-
-    def __len__(self):
-        return len(self.raw)
-
diff --git a/wolnelektury/templates/catalogue/book_searched.html b/wolnelektury/templates/catalogue/book_searched.html

index 4b92c19..722e323 100644 (file)
--- a/wolnelektury/templates/catalogue/book_searched.html
+++ b/wolnelektury/templates/catalogue/book_searched.html
@@ -16,7 +16,7 @@
      <p>{% trans "In fragment" %}
        {% if hit.themes_hit %}{% trans ", for themes:" %}{% for t in hit.themes_hit %}{{t.name}} {% endfor %}{% endif %}
      </p>
      <p>{% trans "In fragment" %}
        {% if hit.themes_hit %}{% trans ", for themes:" %}{% for t in hit.themes_hit %}{{t.name}} {% endfor %}{% endif %}
      </p>
-    <a href="{{hit.fragment.get_absolute_url}}">{{hit.fragment.short_text|safe}}</a>
+    <a href="{{hit.fragment.get_absolute_url}}">{{hit.fragment.get_short_text|safe}}</a>
    </div>
    {% endif %}
    {% endif %}
    </div>
    {% endif %}
    {% endif %}
diff --git a/wolnelektury/templates/catalogue/fragment_promo.html b/wolnelektury/templates/catalogue/fragment_promo.html

index a3b01be..979fff4 100755 (executable)
--- a/wolnelektury/templates/catalogue/fragment_promo.html
+++ b/wolnelektury/templates/catalogue/fragment_promo.html
@@ -3,7 +3,7 @@
  {% if fragment %}
  <a href="{{ fragment.get_absolute_url }}" class="cite">
      <blockquote class="cite-body">
  {% if fragment %}
  <a href="{{ fragment.get_absolute_url }}" class="cite">
      <blockquote class="cite-body">
-        {{ fragment.text|removewholetags:"a"|truncatewords_html:15|safe }}
+        {{ fragment.get_short_text|safe }}
      </blockquote>
      <p class="mono">{{ fragment.book.pretty_title }}</p>
  </a>
      </blockquote>
      <p class="mono">{{ fragment.book.pretty_title }}</p>
  </a>
author	Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
	Wed, 25 Jan 2012 12:17:21 +0000 (13:17 +0100)
committer	Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
	Wed, 25 Jan 2012 12:17:21 +0000 (13:17 +0100)
README.md		patch \| blob \| history
apps/catalogue/models.py		patch \| blob \| history
apps/catalogue/templatetags/catalogue_tags.py		patch \| blob \| history
apps/catalogue/utils.py		patch \| blob \| history
apps/search/templatetags/search_tags.py		patch \| blob \| history
lib/librarian		patch \| blob \| history
lib/markupstring.py	[deleted file]	patch \| blob \| history
wolnelektury/templates/catalogue/book_searched.html		patch \| blob \| history
wolnelektury/templates/catalogue/fragment_promo.html		patch \| blob \| history