command to remove empty tags + cleaning empty tags on save
author Jan Szejko <janek37@gmail.com>
Thu, 21 Dec 2017 17:07:22 +0000 (18:07 +0100)
committer Jan Szejko <janek37@gmail.com>
Thu, 21 Dec 2017 17:07:22 +0000 (18:07 +0100)
apps/catalogue/management/commands/remove_empty_tags.py [new file with mode: 0644]
apps/catalogue/xml_tools.py
apps/wiki/forms.py

diff --git a/apps/catalogue/management/commands/remove_empty_tags.py b/apps/catalogue/management/commands/remove_empty_tags.py
new file mode 100644 (file)
index 0000000..9c2cc0f
--- /dev/null
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of FNP-Redakcja, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
+import sys
+from django.contrib.auth.models import User
+from optparse import make_option
+
+from django.core.management import BaseCommand
+
+from catalogue.models import Book
+from catalogue.xml_tools import remove_empty_elements
+
+# Slugs of books that the command leaves untouched: handle() skips any
+# book whose slug appears in this list.
+EXCLUDED_SLUGS = [
+    'aktualizacja-szablonu-8kwie',
+]
+
+
+class Command(BaseCommand):
+    """Remove empty XML elements from the head revision of every chunk.
+
+    Walks all books except those listed in EXCLUDED_SLUGS and, for each
+    chunk whose head text changes after cleaning, commits the cleaned text
+    on behalf of the user given with --username.
+    """
+    help = 'Removes empty tags from the current text of all book chunks.'
+
+    option_list = BaseCommand.option_list + (
+        # Not wired up: handle() reads neither of the two options below.
+        # make_option('-q', '--quiet', action='store_false', dest='verbose',
+        #     default=True, help='Less output'),
+        # make_option('-d', '--dry-run', action='store_true', dest='dry_run',
+        #     default=False, help="Don't actually touch anything"),
+        make_option(
+            '-u', '--username', dest='username', metavar='USER',
+            help='Assign commits to this user (required, preferably yourself).'),
+    )
+
+    def handle(self, **options):
+        """Clean every chunk and commit the result as the requested user.
+
+        Exits with status 1 when no username is given or when no user with
+        that username exists.
+        """
+        username = options.get('username')
+
+        if not username:
+            print 'Please provide a username.'
+            sys.exit(1)
+
+        try:
+            user = User.objects.get(username=username)
+        except User.DoesNotExist:
+            # Fail the same way as a missing username instead of dumping a
+            # traceback for what is a plain usage error.
+            print 'User %s does not exist.' % username
+            sys.exit(1)
+
+        for book in Book.objects.all():
+            if book.slug in EXCLUDED_SLUGS:
+                continue
+            print 'processing %s' % book.slug
+            for chunk in book.chunk_set.all():
+                old_head = chunk.head
+                src = old_head.materialize()
+                # None means the text could not be parsed or contained
+                # nothing to remove; either way there is nothing to commit.
+                new_xml = remove_empty_elements(src)
+                if new_xml:
+                    new_head = chunk.commit(
+                        new_xml,
+                        author=user,
+                        description=u'automatyczne usunięcie pustych znaczników'
+                    )
+                    print 'committed %s (chunk %s)' % (book.slug, chunk.number)
+                    # Carry the publishable flag over to the new head.
+                    if old_head.publishable:
+                        new_head.set_publishable(True)
index 7be05fd..72e860f 100644 (file)
@@ -2,6 +2,7 @@
 from copy import deepcopy
 import re
 
+from django.utils.encoding import force_str
 from lxml import etree
 from catalogue.constants import TRIM_BEGIN, TRIM_END, MASTERS
 
@@ -199,4 +200,42 @@ def wl2_to_wl1(wl2_xml, slug):
         if not h[0].text or not re.match(r'\d\.\s', h[0].text):
             raise ParseError('Niepoprawny nagłówek (aktywnosc/opis): %s' % repr(h[0].text))
         h[0].text = h[0].text[3:]
-    return etree.tostring(w1t, encoding='utf-8')
\ No newline at end of file
+    return etree.tostring(w1t, encoding='utf-8')
+
+
+# (tag, value of the "class" attribute) pairs that remove_empty_elements()
+# never deletes, even when the element has no text and no children.
+EXCEPTIONS = [
+    ('div', 'img'),
+    ('div', 'video'),
+    ('div', 'table.cell'),
+    ('span', 'link'),
+]
+
+
+def remove_element(element):
+    """Detach `element` from its parent, keeping the text that followed it.
+
+    The element's tail is appended to the tail of the preceding sibling,
+    or to the parent's text when there is no preceding sibling, before the
+    element itself is removed from the parent.
+    """
+    container = element.getparent()
+    trailing = element.tail
+    if trailing:
+        sibling = element.getprevious()
+        if sibling is None:
+            container.text = (container.text or '') + trailing
+        else:
+            sibling.tail = (sibling.tail or '') + trailing
+    container.remove(element)
+
+
+def remove_empty_elements(xml):
+    """Strip elements that have no children and no non-whitespace text.
+
+    Removal is repeated until a pass finds nothing more to delete, so a
+    parent left empty by the removal of its children is deleted as well.
+    Elements whose (tag, class attribute) pair is listed in EXCEPTIONS are
+    always kept. The root element is never removed.
+
+    Returns the cleaned document as a unicode string, or None when the
+    input cannot be parsed or nothing was removed.
+    """
+    source = force_str(xml.replace('&nbsp;', u'\xa0'))
+    try:
+        tree = etree.fromstring(source)
+    except SyntaxError:
+        return None
+
+    def is_removable(node):
+        # Anything with children or real text stays.
+        if len(node) != 0:
+            return False
+        if node.text and node.text.strip():
+            return False
+        return (node.tag, node.attrib.get('class')) not in EXCEPTIONS
+
+    modified = False
+    while True:
+        doomed = [node for node in tree.findall('.//*') if is_removable(node)]
+        if not doomed:
+            break
+        for node in doomed:
+            remove_element(node)
+        modified = True
+    if not modified:
+        return None
+    return etree.tostring(tree, encoding=unicode)
index 772e0be..141f1b6 100644 (file)
@@ -7,6 +7,7 @@ from django import forms
 from django.utils.translation import ugettext_lazy as _
 
 from catalogue.models import Chunk
+from catalogue.xml_tools import remove_empty_elements
 
 
 class DocumentPubmarkForm(forms.Form):
@@ -79,6 +80,10 @@ class DocumentTextSaveForm(forms.Form):
         self.fields['for_cybernauts'].initial = self.chunk.book.for_cybernauts
         self.fields['publishable'].initial = self.chunk.head.publishable
 
+    def clean_text(self):
+        """Return the submitted text with empty XML elements removed.
+
+        remove_empty_elements() returns None when it removed nothing or
+        could not parse the text; in that case the submitted text is passed
+        through unchanged instead of being replaced by None.
+        """
+        text = self.cleaned_data.get('text', '')
+        return remove_empty_elements(text) or text
+
     def save(self):
         if self.user.is_authenticated():
             author = self.user