src/catalogue/api/tojson.py

   1 import json
   2 import re
   3 from sys import argv
   4 from lxml import etree
   5
   6 tags = {
   7     'utwor': ('_pass', False, None, None, None),
   8     '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF': ('_ignore', False, None, None, None),
   9     'abstrakt': ('_ignore', False, None, None, None),
  10     'uwaga': ('_ignore', False, None, None, None),
  11     'extra': ('_ignore', False, None, None, None),
  12     'nota_red': ('_ignore', False, None, None, None),
  13     'numeracja': ('_ignore', False, None, None, None),
  14
  15     'powiesc': ('master', False, None, None, None),
  16     'opowiadanie': ('master', False, None, None, None),
  17     'liryka_lp': ('master', False, None, None, None),
  18     'liryka_l': ('master', False, None, None, None),
  19     'dramat_wspolczesny': ('master', False, None, None, None),
  20     'dramat_wierszowany_lp': ('master', False, None, None, None),
  21     'dramat_wierszowany_l': ('master', False, None, None, None),
  22
  23     'dlugi_cytat': ('blockquote', False, None, None, None),
  24     'poezja_cyt': ('blockquote', False, None, None, None),
  25     'dlugi_cyt': ('blockquote', False, None, None, None),
  26     'ramka': ('blockquote', False, {'class': 'ramka'}, None, None),
  27
  28     'blok': ('div', False, None, None, None),
  29
  30     'strofa': ('div', True, {'class': 'stanza'}, None, None),
  31     'wers': ('div', True, {'class': 'verse'}, None, None),
  32     'wers_wciety': ('div', True, {'class': 'wers_wciety'}, None, None),
  33     'wers_cd': ('div', True, {'class': 'wers_cd'}, None, None),
  34     'wers_akap': ('div', True, {'class': 'wers_akap'}, None, None),
  35     'zastepnik_wersu': ('div', True, {'class': 'zastepnik_wersu'}, None, None),
  36     'wers_do_prawej': ('div', True, {'class': 'wers_do_prawej'}, None, None),
  37     'wers_srodek': ('div', True, {'class': 'wers_srodek'}, None, None),
  38
  39     'autor_utworu': ('div', True, {'class': 'author'}, None, None),
  40     'dzielo_nadrzedne': ('div', True, {'class': 'dzielo_nadrzedne'}, None, None),
  41     'nazwa_utworu': ('div', True, {'class': 'title'}, None, None),
  42     'podtytul': ('div', True, {'class': 'podtytul'}, None, None),
  43
  44     'motto': ('div', False, {'class': 'motto'}, None, None),
  45     'motto_podpis': ('div', True, {'class': 'motto_podpis'}, None, None),
  46     'dedykacja': ('div', True, {'class': 'dedykacja'}, None, None),
  47     'miejsce_czas': ('div', True, {'class': 'miejsce_czas'}, None, None),
  48
  49     'lista_osob': ('div', False, {'class': 'lista_osob'}, None, None),
  50     'naglowek_listy': ('div', True, {'class': 'naglowek_listy'}, None, None),
  51     'lista_osoba': ('div', True, {'class': 'lista_osoba'}, None, None),
  52     'naglowek_osoba': ('div', True, {'class': 'naglowek_osoba'}, None, None),
  53     'osoba': ('em', True, {'class': 'osoba'}, None, None),
  54     'didaskalia': ('div', True, {'class': 'didaskalia'}, None, None),
  55     'kwestia': ('div', False, {'class': 'kwestia'}, None, None),
  56     'didask_tekst': ('em', False, {'class': 'didask_tekst'}, None, None),
  57
  58     'naglowek_czesc': ('h2', True, None, None, None),
  59     'naglowek_akt': ('h2', True, None, None, None),
  60     'naglowek_scena': ('h3', True, None, None, None),
  61     'naglowek_rozdzial': ('h3', True, None, None, None),
  62     'naglowek_podrozdzial': ('h4', True, None, None, None),
  63     'srodtytul': ('h5', True, None, None, None),
  64
  65     'nota': ('div', True, {'class': 'note'}, None, False),
  66
  67     'akap': ('p', True, {'class': 'paragraph'}, None, True),
  68     'akap_dialog': ('p', True, {'class': 'paragraph'}, None, True),
  69     'akap_cd': ('p', True, {'class': 'paragraph'}, None, True),
  70
  71     'sekcja_asterysk': ('p', True, {'class': 'spacer-asterisk'}, None, True),
  72     'sekcja_swiatlo': ('p', True, {'class': 'sekcja_swiatlo'}, None, True),
  73     'separator_linia': ('p', True, {'class': 'separator_linia'}, None, True),
  74
  75     'tytul_dziela': ('em', True, {'class': 'book-title'}, None, False),
  76     'slowo_obce': ('em', True, {'class': 'foreign-word'}, None, False),
  77     'wyroznienie': ('em', True, {'class': 'author-emphasis'}, None, False),
  78     'wieksze_odstepy': ('em', True, {'class': 'wieksze_odstepy'}, None, False),
  79
  80     'ref': ('a', True, {'class': 'reference'}, {'data-uri': 'href'}, False),
  81
  82     'begin': ('_ignore', True, {'class': 'reference'}, {'data-uri': 'href'}, False),
  83     'end': ('_ignore', True, {'class': 'reference'}, {'data-uri': 'href'}, False),
  84     'motyw': ('a', True, {'class': 'theme'}, None, False),
  85
  86     'pa': ('a', True, {'class': 'footnote footnote-pa'}, None, False),
  87     'pe': ('a', True, {'class': 'footnote footnote-pe'}, None, False),
  88     'pr': ('a', True, {'class': 'footnote footnote-pr'}, None, False),
  89     'pt': ('a', True, {'class': 'footnote footnote-pt'}, None, False),
  90     'ptrad': ('a', True, {'class': 'footnote footnote-ptrad'}, None, False),
  91 }
  92
  93
  94 #tree = etree.parse(argv[1])
  95
  96 front1 = set([
  97     'dzielo_nadrzedne',
  98     'nazwa_utworu',
  99     'podtytul',
 100     ])
 101 front2 = set(['autor_utworu'])
 102
 103
 104 def norm(text):
 105     text = text.replace('---', '—').replace('--', '–').replace('...', '…').replace(',,', '„').replace('"', '”')
 106     return text
 107
 108
 109 def toj(elem, S):
 110     if elem.tag is etree.Comment: return []
 111     tag, hastext, attrs, attr_map, num = tags[elem.tag]
 112     contents = []
 113     if tag == '_pass':
 114         output = contents
 115     elif tag == '_ignore':
 116         return []
 117     else:
 118         output = {
 119             'tag': tag,
 120         }
 121         if num:
 122             S['index'] += 1
 123             output['paragraphIndex'] = S['index']
 124             if 'dlugi_cytat' not in S['stack'] and 'poezja_cyt' not in S['stack']:
 125                 S['vindex'] += 1
 126                 output['visibleNumber'] = S['vindex']
 127         if attrs:
 128             output['attr'] = attrs.copy()
 129         if attr_map:
 130             output.setdefault('attr', {})
 131             for k, v in attr_map.items():
 132                 output['attr'][k] = elem.attrib[v]
 133         output['contents'] = contents
 134         output = [output]
 135     if elem.tag == 'strofa':
 136         verses = [etree.Element('wers')]
 137         if elem.text:
 138             vparts = re.split(r'/\s+', elem.text)
 139             for i, v in enumerate(vparts):
 140                 if i:
 141                     verses.append(etree.Element('wers'))
 142                 verses[-1].text = (verses[-1].text or '') + v
 143         for child in elem:
 144             vparts = re.split(r'/\s+', child.tail or '')
 145             child.tail = vparts[0]
 146             verses[-1].append(child)
 147             for v in vparts[1:]:
 148                 verses.append(etree.Element('wers'))
 149                 verses[-1].text = v
 150
 151         if not(len(verses[-1]) or (verses[-1].text or '').strip()):
 152             verses.pop()
 153
 154         elem.clear(keep_tail=True)
 155         for verse in verses:
 156             if len(verse) == 1 and (verse[0].tag.startswith('wers') or verse[0].tag == 'zastepnik_wersu') and not (verse[0].tail or '').strip():
 157                 elem.append(verse[0])
 158             else:
 159                 elem.append(verse)
 160
 161         #if not len(elem):
 162         #    for v in re.split(r'/\s+', elem.text):
 163         #        etree.SubElement(elem, 'wers').text = v
 164         #    elem.text = None
 165
 166     if hastext and elem.text:
 167         contents.append(norm(elem.text))
 168     for c in elem:
 169         S['stack'].append(elem.tag)
 170         contents += toj(c, S)
 171         if hastext and c.tail:
 172             contents.append(norm(c.tail))
 173         S['stack'].pop()
 174
 175     if elem.tag in front1:
 176         S['front1'] += output
 177         return []
 178     if elem.tag in front2:
 179         S['front2'] += output
 180         return []
 181     return output
 182
 183 def conv(tree):
 184     S = {
 185         'index': 0,
 186         'vindex': 0,
 187         'stack': [],
 188         'front1': [],
 189         'front2': [],
 190     }
 191     output = toj(tree.getroot(), S)
 192     if not len(output): return {}
 193     jt = output[0]
 194     jt['front1'] = S['front1']
 195     jt['front2'] = S['front2']
 196     return jt
 197
 198 #print(json.dumps(jt, indent=2, ensure_ascii=False))