some chunk management automation
author Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
Mon, 6 Jun 2011 10:25:15 +0000 (12:25 +0200)
committer Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
Mon, 6 Jun 2011 10:25:15 +0000 (12:25 +0200)
apps/dvcs/models.py
apps/wiki/constants.py
apps/wiki/forms.py
apps/wiki/templates/wiki/book_detail.html
apps/wiki/views.py
apps/wiki/xml_tools.py
redakcja/static/css/filelist.css

index 5ce00c0..dddbf3a 100644 (file)
@@ -240,7 +240,7 @@ class Document(models.Model):
         return rev if rev is not None else -1
 
     def at_revision(self, rev):
         return rev if rev is not None else -1
 
     def at_revision(self, rev):
-        if rev:
+        if rev is not None:
             return self.change_set.get(revision=rev)
         else:
             return self.head
             return self.change_set.get(revision=rev)
         else:
             return self.head
index fe0a446..d75d6b4 100644 (file)
@@ -1,5 +1,13 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
-import re
 
 
-RE_TRIM_BEGIN = re.compile("^<!-- TRIM_BEGIN -->$", re.M)
-RE_TRIM_END = re.compile("^<!-- TRIM_END -->$", re.M)
+TRIM_BEGIN = " TRIM_BEGIN "
+TRIM_END = " TRIM_END "
+
+MASTERS = ['powiesc',
+           'opowiadanie',
+           'liryka_l',
+           'liryka_lp',
+           'dramat_wierszowany_l',
+           'dramat_wierszowany_lp',
+           'dramat_wspolczesny',
+           ]
index 7cabe32..fb0f958 100644 (file)
@@ -8,6 +8,7 @@ from wiki.models import Book, Chunk
 from django.utils.translation import ugettext_lazy as _
 
 from dvcs.models import Tag
 from django.utils.translation import ugettext_lazy as _
 
 from dvcs.models import Tag
+from wiki.constants import MASTERS
 
 class DocumentTagForm(forms.Form):
     """
 
 class DocumentTagForm(forms.Form):
     """
@@ -189,3 +190,11 @@ class BookForm(forms.ModelForm):
 
     class Meta:
         model = Book
 
     class Meta:
         model = Book
+
+
+class ChooseMasterForm(forms.Form):
+    """
+        Form used for fixing the chunks in a book.
+    """
+
+    master = forms.ChoiceField(choices=((m, m) for m in MASTERS))
index d72befa..26f5214 100755 (executable)
@@ -8,27 +8,56 @@
 
 <table>
     {% for c in chunks %}
 
 <table>
     {% for c in chunks %}
-        <tr class="chunk-row
-        {% if c.graded.is_wl %}
-            chunk-wl
-            {% if c.graded.bad_master %}
-                chunk-bad-master
-            {% endif %}
-        {% else %}
-            {% if c.graded.is_xml %}
-                chunk-xml
-            {% else %}
-                chunk-plain
-            {% endif %}
-        {% endif %}
-        ">
+        <tr class="chunk-{{ c.grade }}">
         <td><a target="_blank" href="{{ c.chunk.get_absolute_url }}">{{ c.chunk.comment }}</a></td>
         <td><a target="_blank" href="{{ c.chunk.get_absolute_url }}">{{ c.chunk.comment }}</a></td>
-        <td>{% if c.chunk.publishable %}P{% endif %}</td>
+        <td>{% for fix in c.fix %}
+
+                {% ifequal fix "wl" %}<span class="fix"
+                    title="{% trans "add basic document structure" %}"
+                    >&lt;/&gt;</span>{% endifequal %}
+
+                {% ifequal fix "bad-master" %}<span class="fix"
+                    title='{% trans "change master tag to" %} "{{ first_master }}"'
+                    >master</span>{% endifequal %}
+
+                {% ifequal fix "trim-begin" %}<span class="fix"
+                    title="{% trans "add begin trimming tag" %}"
+                    >&#x2701;</span>{% endifequal %}
+
+                {% ifequal fix "trim-end" %}<span class="fix"
+                    title="{% trans "add end trimming tag" %}"
+                    >&#x2703;</span>{% endifequal %}
+
+            {% endfor %}
+
+            {% ifequal c.grade "plain" %}
+                <span class="fix-info">{% trans "unstructured text" %}</span>
+            {% endifequal %}
+
+            {% ifequal c.grade "xml" %}
+                <span class="fix-info">{% trans "unknown XML" %}</span>
+            {% endifequal %}
+
+            {% ifequal c.grade "wl-broken" %}
+                <span class="fix-info">{% trans "broken document" %}</span>
+            {% endifequal %}
+
+        </td>
         <td><a href="{% url wiki_chunk_edit book.slug c.chunk.slug%}">[{% trans "edit" %}]</a></td>
         <td><a href="{% url wiki_chunk_edit book.slug c.chunk.slug%}">[{% trans "edit" %}]</a></td>
-        <td>{% if c.bad_master %}{{ c.bad_master }}{% endif %}</td>
+        <td>{% if c.chunk.publishable %}P{% endif %}</td>
         <td><a href="{% url wiki_chunk_add book.slug c.chunk.slug %}">[+]</a></td>
         </tr>
     {% endfor %}
         <td><a href="{% url wiki_chunk_add book.slug c.chunk.slug %}">[+]</a></td>
         </tr>
     {% endfor %}
+    {% if need_fixing %}
+        <tr><td></td><td>
+            <form method="POST" action="">
+                {% if choose_master %}
+                    {{ form.master }}
+                {% endif %}
+                <button type="submit">{% trans "Apply fixes" %}</button>
+            </form>
+        </td></tr>
+    {% endif %}
 </table>
 
 <p><a href="{% url wiki_book_append book.slug %}">{% trans "Append to other book" %}</a></p>
 </table>
 
 <p><a href="{% url wiki_book_append book.slug %}">{% trans "Append to other book" %}</a></p>
index 146db69..3739c90 100644 (file)
@@ -26,7 +26,7 @@ from django.middleware.gzip import GZipMiddleware
 
 import librarian.html
 import librarian.text
 
 import librarian.html
 import librarian.text
-from wiki.xml_tools import GradedText
+from wiki import xml_tools
 
 #
 # Quick hack around caching problems, TODO: use ETags
 
 #
 # Quick hack around caching problems, TODO: use ETags
@@ -399,31 +399,94 @@ def history(request, slug, chunk=None):
 def book(request, slug):
     book = get_object_or_404(Book, slug=slug)
 
 def book(request, slug):
     book = get_object_or_404(Book, slug=slug)
 
+    # TODO: most of this should go somewhere else
+
     # do we need some automation?
     # do we need some automation?
-    some_wl = False
     first_master = None
     chunks = []
     first_master = None
     chunks = []
+    need_fixing = False
+    choose_master = False
 
 
-    for chunk in book:
-        graded = GradedText(chunk.materialize())
+    length = len(book)
+    for i, chunk in enumerate(book):
         chunk_dict = {
             "chunk": chunk,
         chunk_dict = {
             "chunk": chunk,
-            "graded": graded,
+            "fix": [],
+            "grade": ""
             }
             }
+        graded = xml_tools.GradedText(chunk.materialize())
         if graded.is_wl():
         if graded.is_wl():
-            some_wl = True
             master = graded.master()
             if first_master is None:
                 first_master = master
             elif master != first_master:
             master = graded.master()
             if first_master is None:
                 first_master = master
             elif master != first_master:
-                chunk_dict['bad_master'] = master
+                chunk_dict['fix'].append('bad-master')
+
+            if i > 0 and not graded.has_trim_begin():
+                chunk_dict['fix'].append('trim-begin')
+            if i < length - 1 and not graded.has_trim_end():
+                chunk_dict['fix'].append('trim-end')
+
+            if chunk_dict['fix']:
+                chunk_dict['grade'] = 'wl-fix'
+            else:
+                chunk_dict['grade'] = 'wl'
+
+        elif graded.is_broken_wl():
+            chunk_dict['grade'] = 'wl-broken'
+        elif graded.is_xml():
+            chunk_dict['grade'] = 'xml'
+        else:
+            chunk_dict['grade'] = 'plain'
+            chunk_dict['fix'].append('wl')
+            choose_master = True
+
+        if chunk_dict['fix']:
+            need_fixing = True
         chunks.append(chunk_dict)
 
         chunks.append(chunk_dict)
 
+    if first_master or not need_fixing:
+        choose_master = False
+
+    if request.method == "POST":
+        form = forms.ChooseMasterForm(request.POST)
+        if not choose_master or form.is_valid():
+            if choose_master:
+                first_master = form.cleaned_data['master']
+
+            # do the actual fixing
+            for c in chunks:
+                if not c['fix']:
+                    continue
+
+                text = c['chunk'].materialize()
+                for fix in c['fix']:
+                    if fix == 'bad-master':
+                        text = xml_tools.change_master(text, first_master)
+                    elif fix == 'trim-begin':
+                        text = xml_tools.add_trim_begin(text)
+                    elif fix == 'trim-end':
+                        text = xml_tools.add_trim_end(text)
+                    elif fix == 'wl':
+                        text = xml_tools.basic_structure(text, first_master)
+                author = request.user if request.user.is_authenticated() else None
+                description = "auto-fix: " + ", ".join(c['fix'])
+                c['chunk'].commit(text=text, author=author, 
+                    description=description)
+
+            return http.HttpResponseRedirect(book.get_absolute_url())
+    elif choose_master:
+        form = forms.ChooseMasterForm()
+    else:
+        form = None
+
     return direct_to_template(request, "wiki/book_detail.html", extra_context={
         "book": book,
         "chunks": chunks,
     return direct_to_template(request, "wiki/book_detail.html", extra_context={
         "book": book,
         "chunks": chunks,
-        "some_wl": some_wl,
+        "need_fixing": need_fixing,
+        "choose_master": choose_master,
         "first_master": first_master,
         "first_master": first_master,
+        "form": form,
     })
 
 
     })
 
 
index a4de433..6dc5089 100755 (executable)
+from functools import wraps
 import re
 
 from lxml import etree
 import re
 
 from lxml import etree
+from wiki.constants import TRIM_BEGIN, TRIM_END, MASTERS
+
+RE_TRIM_BEGIN = re.compile("^<!--%s-->$" % TRIM_BEGIN, re.M)
+RE_TRIM_END = re.compile("^<!--%s-->$" % TRIM_END, re.M)
+
+
+class ParseError(BaseException):
+    pass
+
+
+def obj_memoized(f):
+    """
+        A decorator that caches return value of object methods.
+        The cache is kept with the object, in a _obj_memoized property.
+    """
+    @wraps(f)
+    def wrapper(self, *args, **kwargs):
+        if not hasattr(self, '_obj_memoized'):
+            self._obj_memoized = {}
+        key = (f.__name__,) + args + tuple(sorted(kwargs.iteritems()))
+        try:
+            return self._obj_memoized[key]
+        except TypeError:
+            return f(self, *args, **kwargs)
+        except KeyError:
+            self._obj_memoized[key] = f(self, *args, **kwargs)
+            return self._obj_memoized[key]
+    return wrapper
 
 
-from wiki.constants import RE_TRIM_BEGIN, RE_TRIM_END
 
 class GradedText(object):
 
 class GradedText(object):
-    _is_xml = None
     _edoc = None
     _edoc = None
-    _is_wl = None
-    _master = None
 
     ROOT = 'utwor'
 
     ROOT = 'utwor'
-    MASTERS = ['powiesc',
-               'opowiadanie',
-               'liryka_l',
-               'liryka_lp',
-               'dramat_wierszowany_l',
-               'dramat_wierszowany_lp',
-               'dramat_wspolczesny',
-               ]
     RDF = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF'
 
     def __init__(self, text):
         self._text = text
 
     RDF = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF'
 
     def __init__(self, text):
         self._text = text
 
+    @obj_memoized
     def is_xml(self):
     def is_xml(self):
-        if self._is_xml is None:
-            try:
-                self._edoc = etree.fromstring(self._text)
-            except etree.XMLSyntaxError:
-                self._is_xml = False
-            else:
-                self._is_xml = True
-            del self._text
-        return self._is_xml
+        """
+            Determines if it's a well-formed XML.
 
 
+            >>> GradedText("<a/>").is_xml()
+            True
+            >>> GradedText("<a>").is_xml()
+            False
+        """
+        try:
+            self._edoc = etree.fromstring(self._text)
+        except etree.XMLSyntaxError:
+            return False
+        return True
+
+    @obj_memoized
     def is_wl(self):
     def is_wl(self):
-        if self._is_wl is None:
-            if self.is_xml():
-                e = self._edoc
-                self._is_wl = e.tag == self.ROOT and (
-                    len(e) == 1 and e[0].tag in self.MASTERS or
-                    len(e) == 2 and e[0].tag == self.RDF 
-                        and e[1].tag in self.MASTERS)
-                if self._is_wl:
-                    self._master = e[-1].tag
-                del self._edoc
-            else:
-                self._is_wl = False
-        return self._is_wl
+        """
+            Determines if it's an XML with a <utwor> and a master tag.
+
+            >>> GradedText("<utwor><powiesc></powiesc></utwor>").is_wl()
+            True
+            >>> GradedText("<a></a>").is_wl()
+            False
+        """
+        if self.is_xml():
+            e = self._edoc
+            # FIXME: there could be comments
+            ret = e.tag == self.ROOT and (
+                len(e) == 1 and e[0].tag in MASTERS or
+                len(e) == 2 and e[0].tag == self.RDF 
+                    and e[1].tag in MASTERS)
+            if ret:
+                self._master = e[-1].tag
+            del self._edoc
+            return ret
+        else:
+            return False
+
+    @obj_memoized
+    def is_broken_wl(self):
+        """
+            Determines if it at least looks like broken WL file
+            and not just some untagged text.
+
+            >>> GradedText("<utwor><</utwor>").is_broken_wl()
+            True
+            >>> GradedText("some text").is_broken_wl()
+            False
+        """
+        if self.is_wl():
+            return True
+        text = self._text.strip()
+        return text.startswith('<utwor>') and text.endswith('</utwor>')
 
     def master(self):
 
     def master(self):
+        """
+            Gets the master tag.
+
+            >>> GradedText("<utwor><powiesc></powiesc></utwor>").master()
+            'powiesc'
+        """
         assert self.is_wl()
         return self._master
 
         assert self.is_wl()
         return self._master
 
+    @obj_memoized
+    def has_trim_begin(self):
+        return RE_TRIM_BEGIN.search(self._text)
+
+    @obj_memoized
+    def has_trim_end(self):
+        return RE_TRIM_END.search(self._text)
+
 
 def _trim(text, trim_begin=True, trim_end=True):
     """ 
 
 def _trim(text, trim_begin=True, trim_end=True):
     """ 
@@ -87,3 +148,57 @@ def compile_text(parts):
     # only trim beginning if it's not still the first non-empty
     texts.append(_trim(text, trim_begin=trim_begin, trim_end=False))
     return "".join(texts)
     # only trim beginning if it's not still the first non-empty
     texts.append(_trim(text, trim_begin=trim_begin, trim_end=False))
     return "".join(texts)
+
+
+def change_master(text, master):
+    """
+        Changes the master tag in a WL document.
+    """
+    e = etree.fromstring(text)
+    e[-1].tag = master
+    return etree.tostring(e, encoding="utf-8")
+
+
+def basic_structure(text, master):
+    e = etree.fromstring('''<utwor>
+<master>
+<!--%s--><!--%s-->
+</master>
+</utwor>''' % (TRIM_BEGIN, TRIM_END))
+    e[0].tag = master
+    e[0][0].tail = "\n"*3 + text + "\n"*3
+    return etree.tostring(e, encoding="utf-8")
+
+
+def add_trim_begin(text):
+    trim_tag = etree.Comment(TRIM_BEGIN)
+    e = etree.fromstring(text)
+    for master in e[::-1]:
+        if master.tag in MASTERS:
+            break
+    if master.tag not in MASTERS:
+        raise ParseError('No master tag found!')
+
+    master.insert(0, trim_tag)
+    trim_tag.tail = '\n\n\n' + (master.text or '')
+    master.text = '\n'
+    return etree.tostring(e, encoding="utf-8")
+
+
+def add_trim_end(text):
+    trim_tag = etree.Comment(TRIM_END)
+    e = etree.fromstring(text)
+    for master in e[::-1]:
+        if master.tag in MASTERS:
+            break
+    if master.tag not in MASTERS:
+        raise ParseError('No master tag found!')
+
+    master.append(trim_tag)
+    trim_tag.tail = '\n'
+    prev = trim_tag.getprevious()
+    if prev is not None:
+        prev.tail = (prev.tail or '') + '\n\n\n'
+    else:
+        master.text = (master.text or '') + '\n\n\n'
+    return etree.tostring(e, encoding="utf-8")
index c2f59f3..91323c0 100644 (file)
@@ -100,12 +100,19 @@ td {
 }
 
 
 }
 
 
-.chunk-wl {
-    background-color: #afa;
+.fix {
+    border: 1px solid gray;
+    font-size: 0.7em;
+    padding: 3px;
 }
 }
-.chunk-plain {
-    background-color: #aaa;
-}
-.chunk-xml {
-    background-color: #faa;
+
+.fix-info {
+    font-size: 0.7em;
+    font-style: italic;
 }
 }
+
+.chunk-plain a {color: gray;}
+.chunk-xml a {color: gray; font-style: italic;}
+.chunk-wl-broken a {color: red;}
+.chunk-wl a {color: green;}
+.chunk-wl-fix a {color: black;}