src/documents/docx.py

   1 import sys
   2 import docx
   3 from lxml import etree
   4
   5
   6 DEBUG = False
   7
   8 DC = "{http://purl.org/dc/elements/1.1/}"
   9 RDF = "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}"
  10
  11 ABOUT = "http://redakcja.wolnelektury.pl/documents/book/test-icm/"
  12 WLURI = "http://wolnelektury.pl/katalog/lektura/test-icm/"
  13
  14
  15
  16 META_STYLES = {
  17     "Author": DC + "creator",
  18     "Title": DC + "title",
  19     "Publisher": DC + "publisher",
  20     "Year": DC + "date",
  21     "Editor": DC + "contributor.editor",
  22     "Copyright holder": DC + "rights",
  23 }
  24
  25
  26 P_STYLES = {
  27     "Normal": "akap",
  28     "Autor": "autor_utworu",
  29     "Title": "nazwa_utworu",
  30     "Subtitle": "podtytul",
  31     "Heading 1": "naglowek_czesc",
  32     "Heading 2": "naglowek_rozdzial",
  33     "Heading 3": "naglowek_podrozdzial",
  34     "Heading 4": "srodtytul",
  35     "Heading 5": "srodtytul",
  36
  37 }
  38
  39
  40 def wyroznienie(r):
  41     if r.font.italic is not None or r.font.bold is not None or r.font.underline is not None: return r.font.italic or r.font.bold or r.font.underline
  42     if r.style.font.italic is not None or r.style.font.bold is not None or r.style.font.underline is not None: return r.style.font.italic or r.style.font.bold or r.style.font.underline
  43     return False
  44
  45
  46 def xml_from_docx(f):
  47     d = docx.Document(f)
  48
  49     t = etree.Element("utwor")
  50     rdf = etree.SubElement(t, RDF + "RDF")
  51     meta = etree.SubElement(rdf, RDF + "Description")
  52     meta.attrib[RDF + "about"] = ABOUT
  53
  54     etree.SubElement(meta, DC + "language").text = "pol"
  55     etree.SubElement(meta, DC + "identifier.url").text = WLURI
  56
  57     m = etree.SubElement(t, "powiesc")
  58     md = {}
  59
  60     for p in d.paragraphs:
  61         can_ignore = False
  62         if p.style.name == 'Title':
  63             md['title'] = p.text
  64         if p.style.name in META_STYLES:
  65             item = etree.SubElement(meta, META_STYLES[p.style.name])
  66             item.text = p.text
  67             can_ignore = True
  68         if p.style.name not in P_STYLES and not can_ignore:
  69             print(p.style.name, file=sys.stderr)
  70         if p.style.name in P_STYLES or not can_ignore:
  71             tag = P_STYLES.get(p.style.name, "akap")
  72             a = etree.SubElement(m, tag)
  73
  74             for r in p.runs:
  75                 if wyroznienie(r):
  76                     etree.SubElement(a, "wyroznienie").text = r.text
  77                 else:
  78                     if len(a):
  79                         a[-1].tail = (a[-1].tail or '') + r.text
  80                     else:
  81                         a.text = (a.text or '') + r.text
  82
  83             if DEBUG and p.style.name not in P_STYLES:
  84                 a.text += f" [{p.style.name}]"
  85
  86     return etree.tostring(t, pretty_print=True, encoding='unicode'), md