From d1ef1bf6cf243a1cc9bbb2df1baef252f0628de3 Mon Sep 17 00:00:00 2001
From: Jan Szejko
Date: Thu, 21 Dec 2017 18:07:22 +0100
Subject: [PATCH] command to remove empty tags + cleaning empty tags on save

---
 .../management/commands/remove_empty_tags.py  | 65 +++++++++++++++++++
 apps/catalogue/xml_tools.py                   | 58 ++++++++++++++++-
 apps/wiki/forms.py                            |  8 ++
 3 files changed, 130 insertions(+), 1 deletion(-)
 create mode 100644 apps/catalogue/management/commands/remove_empty_tags.py

diff --git a/apps/catalogue/management/commands/remove_empty_tags.py b/apps/catalogue/management/commands/remove_empty_tags.py
new file mode 100644
index 00000000..9c2cc0f6
--- /dev/null
+++ b/apps/catalogue/management/commands/remove_empty_tags.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of FNP-Redakcja, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
+import sys
+from django.contrib.auth.models import User
+from optparse import make_option
+
+from django.core.management import BaseCommand
+
+from catalogue.models import Book
+from catalogue.xml_tools import remove_empty_elements
+
+# Books whose chunks are skipped by this command.
+EXCLUDED_SLUGS = [
+    'aktualizacja-szablonu-8kwie',
+]
+
+
+class Command(BaseCommand):
+    """Commit a cleaned revision of every chunk that contains empty elements."""
+
+    option_list = BaseCommand.option_list + (
+        # make_option('-q', '--quiet', action='store_false', dest='verbose',
+        #     default=True, help='Less output'),
+        # make_option('-d', '--dry-run', action='store_true', dest='dry_run',
+        #     default=False, help="Don't actually touch anything"),
+        make_option(
+            '-u', '--username', dest='username', metavar='USER',
+            help='Assign commits to this user (required, preferably yourself).'),
+    )
+
+    def handle(self, **options):
+        """Run the clean-up over all books, committing as ``--username``.
+
+        Exits with status 1 when no username is given.
+        """
+        username = options.get('username')
+
+        if username:
+            user = User.objects.get(username=username)
+        else:
+            print 'Please provide a username.'
+            sys.exit(1)
+
+        for book in Book.objects.all():
+            if book.slug in EXCLUDED_SLUGS:
+                continue
+            print 'processing %s' % book.slug
+            for chunk in book.chunk_set.all():
+                old_head = chunk.head
+                src = old_head.materialize()
+                new_xml = remove_empty_elements(src)
+                # None means nothing was removed (or the XML is unparseable).
+                if new_xml:
+                    new_head = chunk.commit(
+                        new_xml,
+                        author=user,
+                        description=u'automatyczne usunięcie pustych znaczników'
+                    )
+                    print 'committed %s (chunk %s)' % (book.slug, chunk.number)
+                    # Carry the publishable flag over to the new revision.
+                    if old_head.publishable:
+                        new_head.set_publishable(True)
diff --git a/apps/catalogue/xml_tools.py b/apps/catalogue/xml_tools.py
index 7be05fd5..72e860fa 100644
--- a/apps/catalogue/xml_tools.py
+++ b/apps/catalogue/xml_tools.py
@@ -2,6 +2,7 @@
 from copy import deepcopy
 import re
 
+from django.utils.encoding import force_str
 from lxml import etree
 from catalogue.constants import TRIM_BEGIN, TRIM_END, MASTERS
 
@@ -199,4 +200,59 @@ def wl2_to_wl1(wl2_xml, slug):
         if not h[0].text or not re.match(r'\d\.\s', h[0].text):
             raise ParseError('Niepoprawny nagłówek (aktywnosc/opis): %s' % repr(h[0].text))
         h[0].text = h[0].text[3:]
-    return etree.tostring(w1t, encoding='utf-8')
\ No newline at end of file
+    return etree.tostring(w1t, encoding='utf-8')
+
+
+# (tag, class) pairs that remove_empty_elements() keeps even when empty.
+EXCEPTIONS = [
+    ('div', 'img'),
+    ('div', 'video'),
+    ('div', 'table.cell'),
+    ('span', 'link'),
+]
+
+
+def remove_element(element):
+    """Detach ``element`` from its parent, preserving its tail text.
+
+    The tail is appended to the previous sibling's tail or, when there is
+    no previous sibling, to the parent's text.
+    """
+    parent = element.getparent()
+    tail = element.tail
+    if tail:
+        prev = element.getprevious()
+        if prev is not None:
+            prev.tail = (prev.tail or '') + tail
+        else:
+            parent.text = (parent.text or '') + tail
+    parent.remove(element)
+
+
+def remove_empty_elements(xml):
+    """Return ``xml`` with empty elements removed, or None.
+
+    An element is empty when it has no children and no non-whitespace text,
+    unless its (tag, class) pair is listed in EXCEPTIONS.  Removal is
+    repeated until a pass removes nothing, so parents emptied by the
+    removal of their children are dropped too.
+
+    Returns a unicode string when anything was removed; None when nothing
+    changed or when ``xml`` could not be parsed.
+    """
+    try:
+        # The XML parser rejects the undefined HTML entity; use the character.
+        tree = etree.fromstring(force_str(xml.replace('&nbsp;', u'\xa0')))
+    except SyntaxError:  # lxml's XMLSyntaxError is a SyntaxError subclass
+        return None
+    changed = False
+    another_loop = True
+    while another_loop:
+        another_loop = False
+        for element in tree.findall('.//*'):
+            if (not element.text or not element.text.strip()) and len(element) == 0:
+                if (element.tag, element.attrib.get('class')) not in EXCEPTIONS:
+                    remove_element(element)
+                    changed = True
+                    another_loop = True
+    return etree.tostring(tree, encoding=unicode) if changed else None
diff --git a/apps/wiki/forms.py b/apps/wiki/forms.py
index 772e0be6..141f1b6b 100644
--- a/apps/wiki/forms.py
+++ b/apps/wiki/forms.py
@@ -7,6 +7,7 @@ from django import forms
 from django.utils.translation import ugettext_lazy as _
 
 from catalogue.models import Chunk
+from catalogue.xml_tools import remove_empty_elements
 
 
 class DocumentPubmarkForm(forms.Form):
@@ -79,6 +80,13 @@ class DocumentTextSaveForm(forms.Form):
         self.fields['for_cybernauts'].initial = self.chunk.book.for_cybernauts
         self.fields['publishable'].initial = self.chunk.head.publishable
 
+    def clean_text(self):
+        """Return the submitted text with empty XML elements removed."""
+        text = self.cleaned_data.get('text', '')
+        # remove_empty_elements() returns None when nothing was removed or
+        # the text cannot be parsed; keep the submitted text in that case.
+        return remove_empty_elements(text) or text
+
     def save(self):
         if self.user.is_authenticated():
             author = self.user
-- 
2.20.1