From 0d6bbe91d75e82866aaa232eee4a036759446a86 Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Wed, 25 Jan 2012 13:17:21 +0100 Subject: [PATCH] nicer fragments: no anchors in text, prettier short_text --- README.md | 14 +-- apps/catalogue/models.py | 14 +-- apps/catalogue/templatetags/catalogue_tags.py | 15 --- apps/catalogue/utils.py | 73 +++++++++++++- apps/search/templatetags/search_tags.py | 1 + lib/librarian | 2 +- lib/markupstring.py | 97 ------------------- .../templates/catalogue/book_searched.html | 2 +- .../templates/catalogue/fragment_promo.html | 2 +- 9 files changed, 85 insertions(+), 135 deletions(-) delete mode 100644 lib/markupstring.py diff --git a/README.md b/README.md index 725dc7c2d..fd193b233 100644 --- a/README.md +++ b/README.md @@ -116,7 +116,7 @@ django-compress - License: [MIT License](http://www.opensource.org/licenses/mit-license.php) - Type: library (Django application) - django-chunks +django-chunks ------------- - Source: [Google Code](http://code.google.com/p/django-chunks/) - Authors: Clint Ecker @@ -131,19 +131,7 @@ django-newtagging - Type: library (Django aplication) - Notes: Aplication based on [django-tagging](http://code.google.com/p/django-tagging/), also [MIT](http://www.opensource.org/licenses/mit-license.php) license. -django-piston (0.2.3rc) ------------------------- - - http://bitbucket.org/jespern/django-piston/wiki/Home -markupstring ------------- - - Source: [ASPN Cookbook](http://code.activestate.com/recipes/389023/) - - Authors: Thomas Hinkle - - License: [MIT License](http://code.activestate.com/help/terms/) - - Type: library - - Notes: Patched by Marek Stępniowski to accept Unicode strings - - Authors ======= diff --git a/apps/catalogue/models.py b/apps/catalogue/models.py index a94411191..7f20b4e59 100644 --- a/apps/catalogue/models.py +++ b/apps/catalogue/models.py @@ -24,7 +24,7 @@ from django.conf import settings from newtagging.models import TagBase, tags_updated from newtagging import managers from catalogue.fields import JSONField, OverwritingFileField -from catalogue.utils import create_zip, split_tags +from catalogue.utils import create_zip, split_tags, truncate_html_words from catalogue.tasks import touch_tag, index_book from shutil import copy from glob import glob @@ -565,7 +565,6 @@ class Book(models.Model): def build_html(self): - from markupstring import MarkupString from django.core.files.base import ContentFile from slughifi import slughifi from librarian import html @@ -609,10 +608,9 @@ class Book(models.Model): continue text = fragment.to_string() - short_text = '' - markup = MarkupString(text) - if (len(markup) > 240): - short_text = unicode(markup[:160]) + short_text = truncate_html_words(text, 15) + if text == short_text: + short_text = '' new_fragment = Fragment.objects.create(anchor=fragment.id, book=self, text=text, short_text=short_text) @@ -1003,6 +1001,10 @@ class Fragment(models.Model): for lang, langname in settings.LANGUAGES: permanent_cache.delete(cache_key % (self.id, lang)) + def get_short_text(self): + """Returns short version of the fragment.""" + return self.short_text if self.short_text else self.text + def short_html(self): if self.id: cache_key = "Fragment.short_html/%d/%s" % (self.id, get_language()) diff --git a/apps/catalogue/templatetags/catalogue_tags.py b/apps/catalogue/templatetags/catalogue_tags.py index c2d7dd6d5..961cc015f 100644 --- a/apps/catalogue/templatetags/catalogue_tags.py +++ b/apps/catalogue/templatetags/catalogue_tags.py @@ -363,18 +363,3 @@ def tag_url(category, slug): return reverse('catalogue.views.tagged_object_list', args=[ '/'.join((Tag.categories_dict[category], slug)) ]) - - -@register.filter -@stringfilter -def removewholetags(value, tags): - """Removes a space separated list of [X]HTML tags from the output. - - FIXME: It makes the assumption the removed tags aren't nested. - - """ - tags = [re.escape(tag) for tag in tags.split()] - tags_re = u'(%s)' % u'|'.join(tags) - tag_re = re.compile(ur'<%s[^>]*>.*?' % tags_re, re.U) - value = tag_re.sub(u'', value) - return value diff --git a/apps/catalogue/utils.py b/apps/catalogue/utils.py index 0fdeaf810..185f5fa34 100644 --- a/apps/catalogue/utils.py +++ b/apps/catalogue/utils.py @@ -5,6 +5,7 @@ from __future__ import with_statement import random +import re import time from base64 import urlsafe_b64encode @@ -12,6 +13,7 @@ from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpRespons from django.core.files.uploadedfile import UploadedFile from django.core.files.base import File from django.core.files.storage import DefaultStorage +from django.utils.encoding import force_unicode from django.utils.hashcompat import sha_constructor from django.conf import settings from celery.task import task @@ -51,7 +53,7 @@ class ExistingFile(UploadedFile): def __init__(self, path, *args, **kwargs): self.path = path - return super(ExistingFile, self).__init__(*args, **kwargs) + super(ExistingFile, self).__init__(*args, **kwargs) def temporary_file_path(self): return self.path @@ -189,3 +191,72 @@ class MultiQuerySet(object): offset = 0 stop = total_len - len(items) continue + + +def truncate_html_words(s, num, end_text='...'): + """Truncates HTML to a certain number of words (not counting tags and + comments). Closes opened tags if they were correctly closed in the given + html. Takes an optional argument of what should be used to notify that the + string has been truncated, defaulting to ellipsis (...). + + Newlines in the HTML are preserved. + + This is just a version of django.utils.text.truncate_html_words with no space before the end_text. + """ + s = force_unicode(s) + length = int(num) + if length <= 0: + return u'' + html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input') + # Set up regular expressions + re_words = re.compile(r'&.*?;|<.*?>|(\w[\w-]*)', re.U) + re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>') + # Count non-HTML words and keep note of open tags + pos = 0 + end_text_pos = 0 + words = 0 + open_tags = [] + while words <= length: + m = re_words.search(s, pos) + if not m: + # Checked through whole string + break + pos = m.end(0) + if m.group(1): + # It's an actual non-HTML word + words += 1 + if words == length: + end_text_pos = pos + continue + # Check for tag + tag = re_tag.match(m.group(0)) + if not tag or end_text_pos: + # Don't worry about non tags or tags after our truncate point + continue + closing_tag, tagname, self_closing = tag.groups() + tagname = tagname.lower() # Element names are always case-insensitive + if self_closing or tagname in html4_singlets: + pass + elif closing_tag: + # Check for match in open tags list + try: + i = open_tags.index(tagname) + except ValueError: + pass + else: + # SGML: An end tag closes, back to the matching start tag, all unclosed intervening start tags with omitted end tags + open_tags = open_tags[i+1:] + else: + # Add it to the start of the open tags list + open_tags.insert(0, tagname) + if words <= length: + # Don't try to close tags if we don't need to truncate + return s + out = s[:end_text_pos] + if end_text: + out += end_text + # Close any tags still open + for tag in open_tags: + out += '' % tag + # Return string + return out diff --git a/apps/search/templatetags/search_tags.py b/apps/search/templatetags/search_tags.py index 32d5b64c5..06631b2dc 100644 --- a/apps/search/templatetags/search_tags.py +++ b/apps/search/templatetags/search_tags.py @@ -46,4 +46,5 @@ def book_searched(context, result): 'book': book, 'request': context.get('request'), 'hits': hits, + 'main_link': book.get_absolute_url(), } diff --git a/lib/librarian b/lib/librarian index e394602de..05843e29b 160000 --- a/lib/librarian +++ b/lib/librarian @@ -1 +1 @@ -Subproject commit e394602de9243608d1e99a3de448a75646f1a77f +Subproject commit 05843e29b4fffcc676da0e67b7a840a24d7b91d4 diff --git a/lib/markupstring.py b/lib/markupstring.py deleted file mode 100644 index 0e273f2a1..000000000 --- a/lib/markupstring.py +++ /dev/null @@ -1,97 +0,0 @@ -# Code taken from ActiveState Python recipes: -# http://code.activestate.com/recipes/389023/ -# -# Changed by Marek Stepniowski to handle unicode characters -import xml.sax - - -class simpleHandler(xml.sax.ContentHandler): - """A simple handler that provides us with indices of marked up content.""" - def __init__(self): - self.elements = [] #this will contain a list of elements and their start/end indices - self.open_elements = [] #this holds info on open elements while we wait for their close - self.content = "" - - def startElement(self, name, attrs): - if name == 'foobar': return # we require an outer wrapper, which we promptly ignore. - self.open_elements.append({'name':name, - 'attrs':attrs.copy(), - 'start':len(self.content), - }) - - def endElement(self, name): - if name == 'foobar': return # we require an outer wrapper, which we promptly ignore. - for i in range(len(self.open_elements)): - e = self.open_elements[i] - if e['name'] == name: - # append a (start,end), name, attrs - self.elements.append(((e['start'], #start position - len(self.content)), # current (end) position - e['name'], e['attrs']) - ) - del self.open_elements[i] - return - - def characters(self, chunk): - self.content += chunk - - -class MarkupString(unicode): - """A simple class for dealing with marked up strings. When we are sliced, we return - valid marked up strings, preserving markup.""" - def __init__(self, string): - unicode.__init__(self) - self.handler = simpleHandler() - xml.sax.parseString((u"%s" % string).encode('utf-8'), self.handler) - self.raw = self.handler.content - - def __getitem__(self, n): - return self.__getslice__(n, n + 1) - - def __getslice__(self, s, e): - # only include relevant elements - if not e or e > len(self.raw): e = len(self.raw) - elements = filter(lambda tp: (tp[0][1] >= s and # end after the start... - tp[0][0] <= e # and start before the end - ), - self.handler.elements) - ends = {} - starts = {} - for el in elements: - # cycle through elements that effect our slice and keep track of - # where their start and end tags should go. - pos = el[0] - name = el[1] - attrs = el[2] - # write our start tag - stag = "<%s" % name - for k, v in attrs.items(): stag += " %s=%s" % (k, xml.sax.saxutils.quoteattr(v)) - stag += ">" - etag = "" % name # simple end tag - spos = pos[0] - epos = pos[1] - if spos < s: spos = s - if epos > e: epos = e - if epos != spos: # we don't care about tags that don't markup any text - if not starts.has_key(spos): starts[spos] = [] - starts[spos].append(stag) - if not ends.has_key(epos): ends[epos] = [] - ends[epos].append(etag) - outbuf = "" # our actual output string - for pos in range(s, e): # we move through positions - char = self.raw[pos] - if ends.has_key(pos): # if there are endtags to insert... - for et in ends[pos]: outbuf += et - if starts.has_key(pos): # if there are start tags to insert - mystarts = starts[pos] - # reverse these so the order works out,e.g. - mystarts.reverse() - for st in mystarts: outbuf += st - outbuf += char - if ends.has_key(e): - for et in ends[e]: outbuf += et - return MarkupString(outbuf) - - def __len__(self): - return len(self.raw) - diff --git a/wolnelektury/templates/catalogue/book_searched.html b/wolnelektury/templates/catalogue/book_searched.html index 4b92c19d2..722e3233f 100644 --- a/wolnelektury/templates/catalogue/book_searched.html +++ b/wolnelektury/templates/catalogue/book_searched.html @@ -16,7 +16,7 @@

{% trans "In fragment" %} {% if hit.themes_hit %}{% trans ", for themes:" %}{% for t in hit.themes_hit %}{{t.name}} {% endfor %}{% endif %}

- {{hit.fragment.short_text|safe}} + {{hit.fragment.get_short_text|safe}} {% endif %} {% endif %} diff --git a/wolnelektury/templates/catalogue/fragment_promo.html b/wolnelektury/templates/catalogue/fragment_promo.html index a3b01be16..979fff4f8 100755 --- a/wolnelektury/templates/catalogue/fragment_promo.html +++ b/wolnelektury/templates/catalogue/fragment_promo.html @@ -3,7 +3,7 @@ {% if fragment %}
- {{ fragment.text|removewholetags:"a"|truncatewords_html:15|safe }} + {{ fragment.get_short_text|safe }}

{{ fragment.book.pretty_title }}

-- 2.20.1