Simple docx import.

author Radek Czajka <rczajka@rczajka.pl>

Wed, 25 Mar 2020 09:21:32 +0000 (10:21 +0100)

committer Radek Czajka <rczajka@rczajka.pl>

Wed, 25 Mar 2020 09:21:32 +0000 (10:21 +0100)
author Radek Czajka <rczajka@rczajka.pl>
Wed, 25 Mar 2020 09:21:32 +0000 (10:21 +0100)
committer Radek Czajka <rczajka@rczajka.pl>
Wed, 25 Mar 2020 09:21:32 +0000 (10:21 +0100)
diff --git a/requirements/requirements.txt b/requirements/requirements.txt

index 6553dfe..9b00c43 100644 (file)
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -7,6 +7,7 @@ Pillow
  oauth2
  httplib2 # oauth2 dependency
  python-slugify
  oauth2
  httplib2 # oauth2 dependency
  python-slugify
+python-docx==0.8.10
  
  librarian==1.8.1
  
  
  librarian==1.8.1
  
diff --git a/src/documents/docx.py b/src/documents/docx.py

new file mode 100644 (file)

index 0000000..76811a3
--- /dev/null
+++ b/src/documents/docx.py
@@ -0,0 +1,86 @@
+import sys
+import docx
+from lxml import etree
+
+
+# When true, xml_from_docx() tags paragraphs whose style has no entry in
+# P_STYLES with the style name in square brackets.
+DEBUG = False
+
+# Namespace prefixes in lxml's "{uri}localname" notation.
+DC = "{http://purl.org/dc/elements/1.1/}"
+RDF = "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}"
+
+# Fixed identifiers written into the metadata of every generated document.
+ABOUT = "http://redakcja.wolnelektury.pl/documents/book/test-icm/"
+WLURI = "http://wolnelektury.pl/katalog/lektura/test-icm/"
+
+
+# Paragraph style name -> qualified Dublin Core tag that receives the
+# paragraph's text.
+META_STYLES = {
+    style_name: DC + dc_field
+    for style_name, dc_field in (
+        ("Author", "creator"),
+        ("Title", "title"),
+        ("Publisher", "publisher"),
+        ("Year", "date"),
+        ("Editor", "contributor.editor"),
+        ("Copyright holder", "rights"),
+    )
+}
+
+
+# Paragraph style name -> tag of the element created for that paragraph in
+# the document body.
+# NOTE(review): "Autor" here vs "Author" in META_STYLES -- confirm that the
+# differing spelling is intentional.
+P_STYLES = dict([
+    ("Normal", "akap"),
+    ("Autor", "autor_utworu"),
+    ("Title", "nazwa_utworu"),
+    ("Subtitle", "podtytul"),
+    ("Heading 1", "naglowek_czesc"),
+    ("Heading 2", "naglowek_rozdzial"),
+    ("Heading 3", "naglowek_podrozdzial"),
+    ("Heading 4", "srodtytul"),
+    ("Heading 5", "srodtytul"),
+])
+
+
+def wyroznienie(r):
+    """Return True when run *r* is italic, bold or underlined.
+
+    Each of the three properties is resolved on its own: a value set
+    directly on the run's font wins, and ``None`` ("not set here") falls
+    back to the font of the run's character style.  Resolving them
+    independently matters because direct formatting of one property
+    (e.g. italic switched off) must not hide another property (e.g. bold)
+    that comes from the style.
+
+    :param r: a run object exposing ``font`` and ``style.font`` with
+        ``italic``, ``bold`` and ``underline`` attributes.
+    :returns: ``True`` if any of the resolved properties is truthy,
+        ``False`` otherwise.
+    """
+    style = r.style
+    for prop in ("italic", "bold", "underline"):
+        value = getattr(r.font, prop)
+        if value is None and style is not None:
+            # Not set on the run itself: take the character style's value.
+            value = getattr(style.font, prop)
+        if value:
+            return True
+    return False
+
+
+def _append_text(element, text):
+    """Append *text* after whatever content *element* already holds.
+
+    lxml keeps text that follows a child element in that child's ``tail``
+    and text that precedes all children in the parent's ``text``; either
+    may be ``None`` when empty.
+    """
+    if len(element):
+        last = element[-1]
+        last.tail = (last.tail or '') + text
+    else:
+        element.text = (element.text or '') + text
+
+
+def xml_from_docx(f):
+    """Build an XML document from the paragraphs of a .docx file.
+
+    The result is rooted at ``<utwor>`` and contains an RDF description
+    (language, identifier and one element per paragraph whose style is
+    listed in META_STYLES) followed by a ``<powiesc>`` body.  Each body
+    paragraph becomes an element named after its style via P_STYLES
+    ("akap" when the style is unknown); emphasised runs are wrapped in
+    ``<wyroznienie>``.  Paragraphs whose style is only a metadata style
+    are left out of the body.
+
+    :param f: passed unchanged to ``docx.Document``.
+    :returns: ``(xml, md)`` -- the serialised document as a ``str`` and a
+        dict that holds ``'title'`` (text of the last paragraph styled
+        "Title") when such a paragraph exists.
+    """
+    d = docx.Document(f)
+
+    t = etree.Element("utwor")
+    rdf = etree.SubElement(t, RDF + "RDF")
+    meta = etree.SubElement(rdf, RDF + "Description")
+    meta.attrib[RDF + "about"] = ABOUT
+    etree.SubElement(meta, DC + "language").text = "pol"
+    etree.SubElement(meta, DC + "identifier.url").text = WLURI
+
+    m = etree.SubElement(t, "powiesc")
+    md = {}
+
+    for p in d.paragraphs:
+        style_name = p.style.name
+        known = style_name in P_STYLES
+        is_meta = style_name in META_STYLES
+
+        if style_name == 'Title':
+            md['title'] = p.text
+        if is_meta:
+            etree.SubElement(meta, META_STYLES[style_name]).text = p.text
+        if not known and not is_meta:
+            # Report styles that have no mapping at all.
+            print(style_name, file=sys.stderr)
+        if is_meta and not known:
+            # Pure metadata paragraph: nothing to add to the body.
+            continue
+
+        a = etree.SubElement(m, P_STYLES.get(style_name, "akap"))
+        for r in p.runs:
+            if wyroznienie(r):
+                etree.SubElement(a, "wyroznienie").text = r.text
+            else:
+                _append_text(a, r.text)
+
+        if DEBUG and not known:
+            # Goes through _append_text so the marker lands at the very end
+            # of the paragraph and works when the paragraph has no leading
+            # plain text (a.text is None in that case).
+            _append_text(a, f" [{style_name}]")
+
+    return etree.tostring(t, pretty_print=True, encoding='unicode'), md
diff --git a/src/documents/forms.py b/src/documents/forms.py

index f5f2901..bb064ee 100644 (file)
--- a/src/documents/forms.py
+++ b/src/documents/forms.py
@@ -5,9 +5,10 @@ from django.db.models import Count
  from django import forms
  from django.utils.translation import ugettext_lazy as _
  from django.conf import settings
  from django import forms
  from django.utils.translation import ugettext_lazy as _
  from django.conf import settings
-
+from slugify import slugify
  from .constants import MASTERS
  from .models import Book, Chunk, Image, User
  from .constants import MASTERS
  from .models import Book, Chunk, Image, User
+from .docx import xml_from_docx
  
  class DocumentCreateForm(forms.ModelForm):
      """
  
  class DocumentCreateForm(forms.ModelForm):
      """
@@ -15,6 +16,7 @@ class DocumentCreateForm(forms.ModelForm):
      """
      file = forms.FileField(required=False)
      text = forms.CharField(required=False, widget=forms.Textarea)
      """
      file = forms.FileField(required=False)
      text = forms.CharField(required=False, widget=forms.Textarea)
+    docx = forms.FileField(required=False)
  
      class Meta:
          model = Book
  
      class Meta:
          model = Book
@@ -23,8 +25,10 @@ class DocumentCreateForm(forms.ModelForm):
      def __init__(self, *args, **kwargs):
          super(DocumentCreateForm, self).__init__(*args, **kwargs)
          self.fields['slug'].widget.attrs={'class': 'autoslug'}
      def __init__(self, *args, **kwargs):
          super(DocumentCreateForm, self).__init__(*args, **kwargs)
          self.fields['slug'].widget.attrs={'class': 'autoslug'}
+        self.fields['slug'].required = False
          self.fields['gallery'].widget.attrs={'class': 'autoslug'}
          self.fields['title'].widget.attrs={'class': 'autoslug-source'}
          self.fields['gallery'].widget.attrs={'class': 'autoslug'}
          self.fields['title'].widget.attrs={'class': 'autoslug-source'}
+        self.fields['title'].required = False
  
      def clean(self):
          super(DocumentCreateForm, self).clean()
  
      def clean(self):
          super(DocumentCreateForm, self).clean()
@@ -36,8 +40,27 @@ class DocumentCreateForm(forms.ModelForm):
              except UnicodeDecodeError:
                  raise forms.ValidationError(_("Text file must be UTF-8 encoded."))
  
              except UnicodeDecodeError:
                  raise forms.ValidationError(_("Text file must be UTF-8 encoded."))
  
+        docx = self.cleaned_data['docx']
+        if docx is not None:
+            try:
+                text, meta = xml_from_docx(docx)
+            except Exception as e:
+                raise forms.ValidationError(e)
+            else:
+                self.cleaned_data['text'] = text
+                if not self.cleaned_data['title']:
+                    self.cleaned_data['title'] = meta.get('title', '')
+                if not self.cleaned_data['slug']:
+                    self.cleaned_data['slug'] = slugify(meta.get('title', ''))
+
+        if not self.cleaned_data["title"]:
+            self._errors["title"] = self.error_class([_("Title not set")])
+
+        if not self.cleaned_data["slug"]:
+            self._errors["slug"] = self.error_class([_("Slug not set")])
+
          if not self.cleaned_data["text"]:
          if not self.cleaned_data["text"]:
-            self._errors["file"] = self.error_class([_("You must either enter text or upload a file")])
+            self._errors["text"] = self.error_class([_("You must either enter text or upload a file")])
  
          return self.cleaned_data
  
  
          return self.cleaned_data
author	Radek Czajka <rczajka@rczajka.pl>
	Wed, 25 Mar 2020 09:21:32 +0000 (10:21 +0100)
committer	Radek Czajka <rczajka@rczajka.pl>
	Wed, 25 Mar 2020 09:21:32 +0000 (10:21 +0100)
requirements/requirements.txt		patch \| blob \| history
src/documents/docx.py	[new file with mode: 0644]	patch \| blob
src/documents/forms.py		patch \| blob \| history