From 5c0ed2a6cb007fccf2cc4b58199f285707f7e974 Mon Sep 17 00:00:00 2001
From: Radek Czajka
Date: Wed, 25 Mar 2020 10:21:32 +0100
Subject: [PATCH] Simple docx import.

---
 requirements/requirements.txt |   1 +
 src/documents/docx.py         | 101 ++++++++++++++++++++++++++++++++++
 src/documents/forms.py        |  27 ++++++++-
 3 files changed, 127 insertions(+), 2 deletions(-)
 create mode 100644 src/documents/docx.py

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 6553dfe9..9b00c430 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -7,6 +7,7 @@ Pillow
 oauth2
 httplib2 # oauth2 dependency
 python-slugify
+python-docx==0.8.10
 
 librarian==1.8.1
 
diff --git a/src/documents/docx.py b/src/documents/docx.py
new file mode 100644
index 00000000..76811a30
--- /dev/null
+++ b/src/documents/docx.py
@@ -0,0 +1,101 @@
+"""Simple DOCX to WL-XML converter used by the document creation form."""
+import sys
+import docx
+from lxml import etree
+
+
+# When set, body paragraphs with unmapped styles get the style name appended.
+DEBUG = False
+
+DC = "{http://purl.org/dc/elements/1.1/}"
+RDF = "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}"
+
+ABOUT = "http://redakcja.wolnelektury.pl/documents/book/test-icm/"
+WLURI = "http://wolnelektury.pl/katalog/lektura/test-icm/"
+
+
+# Paragraph style name -> Dublin Core element added to the RDF description.
+META_STYLES = {
+    "Author": DC + "creator",
+    "Title": DC + "title",
+    "Publisher": DC + "publisher",
+    "Year": DC + "date",
+    "Editor": DC + "contributor.editor",
+    "Copyright holder": DC + "rights",
+}
+
+
+# Paragraph style name -> element added to the document body.
+P_STYLES = {
+    "Normal": "akap",
+    "Autor": "autor_utworu",
+    "Title": "nazwa_utworu",
+    "Subtitle": "podtytul",
+    "Heading 1": "naglowek_czesc",
+    "Heading 2": "naglowek_rozdzial",
+    "Heading 3": "naglowek_podrozdzial",
+    "Heading 4": "srodtytul",
+    "Heading 5": "srodtytul",
+}
+
+
+def wyroznienie(r):
+    """Tell whether run `r` should be marked up as emphasis.
+
+    Formatting set directly on the run wins; its character style is only
+    consulted when the run sets none of italic, bold and underline.
+    """
+    for source in (r, r.style):
+        flags = (source.font.italic, source.font.bold, source.font.underline)
+        if any(flag is not None for flag in flags):
+            return any(flags)
+    return False
+
+
+def xml_from_docx(f):
+    """Convert a DOCX document to WL-XML.
+
+    `f` is whatever `docx.Document` accepts as the document source.
+
+    Returns an `(xml, metadata)` pair: the serialized tree as a string and
+    a dict holding the text of the last "Title" paragraph under "title".
+    """
+    d = docx.Document(f)
+
+    t = etree.Element("utwor")
+    rdf = etree.SubElement(t, RDF + "RDF")
+    meta = etree.SubElement(rdf, RDF + "Description")
+    meta.attrib[RDF + "about"] = ABOUT
+
+    etree.SubElement(meta, DC + "language").text = "pol"
+    etree.SubElement(meta, DC + "identifier.url").text = WLURI
+
+    m = etree.SubElement(t, "powiesc")
+    md = {}
+
+    for p in d.paragraphs:
+        style = p.style.name
+        if style == 'Title':
+            md['title'] = p.text
+        if style in META_STYLES:
+            etree.SubElement(meta, META_STYLES[style]).text = p.text
+        if style not in P_STYLES:
+            if style in META_STYLES:
+                # Metadata-only paragraph: nothing to add to the body.
+                continue
+            print(style, file=sys.stderr)
+
+        a = etree.SubElement(m, P_STYLES.get(style, "akap"))
+        for r in p.runs:
+            if wyroznienie(r):
+                etree.SubElement(a, "wyroznienie").text = r.text
+            elif len(a):
+                a[-1].tail = (a[-1].tail or '') + r.text
+            else:
+                a.text = (a.text or '') + r.text
+
+        if DEBUG and style not in P_STYLES:
+            # a.text is None unless the paragraph started with a plain run.
+            a.text = (a.text or '') + f" [{style}]"
+
+    return etree.tostring(t, pretty_print=True, encoding='unicode'), md
diff --git a/src/documents/forms.py b/src/documents/forms.py
index f5f2901d..bb064eec 100644
--- a/src/documents/forms.py
+++ b/src/documents/forms.py
@@ -5,9 +5,10 @@ from django.db.models import Count
 from django import forms
 from django.utils.translation import ugettext_lazy as _
 from django.conf import settings
-
+from slugify import slugify
 from .constants import MASTERS
 from .models import Book, Chunk, Image, User
+from .docx import xml_from_docx
 
 class DocumentCreateForm(forms.ModelForm):
     """
@@ -15,6 +16,7 @@ class DocumentCreateForm(forms.ModelForm):
     """
     file = forms.FileField(required=False)
     text = forms.CharField(required=False, widget=forms.Textarea)
+    docx = forms.FileField(required=False)
 
     class Meta:
         model = Book
@@ -23,8 +25,10 @@ class DocumentCreateForm(forms.ModelForm):
     def __init__(self, *args, **kwargs):
         super(DocumentCreateForm, self).__init__(*args, **kwargs)
         self.fields['slug'].widget.attrs={'class': 'autoslug'}
+        self.fields['slug'].required = False
         self.fields['gallery'].widget.attrs={'class': 'autoslug'}
         self.fields['title'].widget.attrs={'class': 'autoslug-source'}
+        self.fields['title'].required = False
 
     def clean(self):
         super(DocumentCreateForm, self).clean()
@@ -36,8 +40,27 @@ class DocumentCreateForm(forms.ModelForm):
             except UnicodeDecodeError:
                 raise forms.ValidationError(_("Text file must be UTF-8 encoded."))
 
+        docx = self.cleaned_data.get('docx')
+        if docx is not None:
+            try:
+                text, meta = xml_from_docx(docx)
+            except Exception as e:
+                raise forms.ValidationError(e)
+            else:
+                self.cleaned_data['text'] = text
+                if not self.cleaned_data.get('title'):
+                    self.cleaned_data['title'] = meta.get('title', '')
+                if not self.cleaned_data.get('slug'):
+                    self.cleaned_data['slug'] = slugify(meta.get('title', ''))
+
+        if not self.cleaned_data.get("title") and "title" not in self._errors:
+            self._errors["title"] = self.error_class([_("Title not set")])
+
+        if not self.cleaned_data.get("slug") and "slug" not in self._errors:
+            self._errors["slug"] = self.error_class([_("Slug not set")])
+
         if not self.cleaned_data["text"]:
-            self._errors["file"] = self.error_class([_("You must either enter text or upload a file")])
+            self._errors["text"] = self.error_class([_("You must either enter text or upload a file")])
 
         return self.cleaned_data
 
-- 
2.20.1