src/catalogue/api/tojson.py

   1 from collections import defaultdict
   2 import json
   3 import re
   4 from sys import argv
   5 from lxml import etree
   6
   7 tags = {
   8     'utwor': ('_pass', False, None, None, None),
   9     '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF': ('_ignore', False, None, None, None),
  10     'abstrakt': ('_ignore', False, None, None, None),
  11     'uwaga': ('_ignore', False, None, None, None),
  12     'extra': ('_ignore', False, None, None, None),
  13     'nota_red': ('_ignore', False, None, None, None),
  14     'numeracja': ('_ignore', False, None, None, None),
  15
  16     'powiesc': ('master', False, None, None, None),
  17     'opowiadanie': ('master', False, None, None, None),
  18     'liryka_lp': ('master', False, None, None, None),
  19     'liryka_l': ('master', False, None, None, None),
  20     'dramat_wspolczesny': ('master', False, None, None, None),
  21     'dramat_wierszowany_lp': ('master', False, None, None, None),
  22     'dramat_wierszowany_l': ('master', False, None, None, None),
  23
  24     'dlugi_cytat': ('blockquote', False, None, None, None),
  25     'poezja_cyt': ('blockquote', False, None, None, None),
  26     'dlugi_cyt': ('blockquote', False, None, None, None),
  27     'ramka': ('blockquote', False, {'class': 'ramka'}, None, None),
  28
  29     'blok': ('div', False, None, None, None),
  30
  31     'strofa': ('div', True, {'class': 'stanza'}, None, None),
  32     'wers': ('div', True, {'class': 'verse'}, None, None),
  33     'wers_wciety': ('div', True, {'class': 'wers_wciety'}, None, None),
  34     'wers_cd': ('div', True, {'class': 'wers_cd'}, None, None),
  35     'wers_akap': ('div', True, {'class': 'wers_akap'}, None, None),
  36     'zastepnik_wersu': ('div', True, {'class': 'zastepnik_wersu'}, None, None),
  37     'wers_do_prawej': ('div', True, {'class': 'wers_do_prawej'}, None, None),
  38     'wers_srodek': ('div', True, {'class': 'wers_srodek'}, None, None),
  39
  40     'autor_utworu': ('div', True, {'class': 'author'}, None, None),
  41     'dzielo_nadrzedne': ('div', True, {'class': 'dzielo_nadrzedne'}, None, None),
  42     'nazwa_utworu': ('div', True, {'class': 'title'}, None, None),
  43     'podtytul': ('div', True, {'class': 'podtytul'}, None, None),
  44
  45     'motto': ('div', False, {'class': 'motto'}, None, None),
  46     'motto_podpis': ('div', True, {'class': 'motto_podpis'}, None, None),
  47     'dedykacja': ('div', True, {'class': 'dedykacja'}, None, None),
  48     'miejsce_czas': ('div', True, {'class': 'miejsce_czas'}, None, None),
  49
  50     'lista_osob': ('div', False, {'class': 'lista_osob'}, None, None),
  51     'naglowek_listy': ('div', True, {'class': 'naglowek_listy'}, None, None),
  52     'lista_osoba': ('div', True, {'class': 'lista_osoba'}, None, None),
  53     'naglowek_osoba': ('div', True, {'class': 'naglowek_osoba'}, None, None),
  54     'osoba': ('em', True, {'class': 'osoba'}, None, None),
  55     'didaskalia': ('div', True, {'class': 'didaskalia'}, None, None),
  56     'kwestia': ('div', False, {'class': 'kwestia'}, None, None),
  57     'didask_tekst': ('em', True, {'class': 'didask_tekst'}, None, None),
  58
  59     'naglowek_czesc': ('h2', True, None, None, None),
  60     'naglowek_akt': ('h2', True, None, None, None),
  61     'naglowek_scena': ('h3', True, None, None, None),
  62     'naglowek_rozdzial': ('h3', True, None, None, None),
  63     'naglowek_podrozdzial': ('h4', True, None, None, None),
  64     'srodtytul': ('h5', True, None, None, None),
  65
  66     'nota': ('div', True, {'class': 'note'}, None, False),
  67
  68     'akap': ('p', True, {'class': 'paragraph'}, None, True),
  69     'akap_dialog': ('p', True, {'class': 'paragraph'}, None, True),
  70     'akap_cd': ('p', True, {'class': 'paragraph'}, None, True),
  71
  72     'sekcja_asterysk': ('p', True, {'class': 'spacer-asterisk'}, None, True),
  73     'sekcja_swiatlo': ('p', True, {'class': 'sekcja_swiatlo'}, None, True),
  74     'separator_linia': ('p', True, {'class': 'separator_linia'}, None, True),
  75
  76     'tytul_dziela': ('em', True, {'class': 'book-title'}, None, False),
  77     'slowo_obce': ('em', True, {'class': 'foreign-word'}, None, False),
  78     'wyroznienie': ('em', True, {'class': 'author-emphasis'}, None, False),
  79     'wieksze_odstepy': ('em', True, {'class': 'wieksze_odstepy'}, None, False),
  80
  81     'ref': ('a', True, {'class': 'reference'}, {'data-uri': 'href'}, False),
  82
  83     'begin': ('_ignore', True, {'class': 'reference'}, {'data-uri': 'href'}, False),
  84     'end': ('_ignore', True, {'class': 'reference'}, {'data-uri': 'href'}, False),
  85     'motyw': ('_ignore', True, {'class': 'theme'}, None, False),
  86
  87     'pa': ('a', True, {'class': 'footnote footnote-pa'}, None, False),
  88     'pe': ('a', True, {'class': 'footnote footnote-pe'}, None, False),
  89     'pr': ('a', True, {'class': 'footnote footnote-pr'}, None, False),
  90     'pt': ('a', True, {'class': 'footnote footnote-pt'}, None, False),
  91     'ptrad': ('a', True, {'class': 'footnote footnote-ptrad'}, None, False),
  92
  93     'werset': ('p', True, {'class': 'werset'}, None, True),
  94     'br': ('br', False, None, None, None),
  95     'indeks_dolny': ('em', True, {'class': 'indeks_dolny'}, None, False),
  96     'mat': ('span', True, {'class': 'mat'}, None, False),
  97
  98     'mfenced': ('math_mfenced', True, None, None, False),
  99     'mfrac': ('math_mfrac', True, None, None, False),
 100     'mrow': ('math_mrow', True, None, None, False),
 101     'mi': ('math_mi', True, None, None, False),
 102     'mn': ('math_mn', True, None, None, False),
 103     'mo': ('math_mo', True, None, None, False),
 104     'msup': ('math_msup', True, None, None, False),
 105
 106     'list': ('blockquote', False, {'class': 'list'}, None, None),
 107     'wywiad_pyt': ('blockquote', False, {'class': 'wywiad_pyt'}, None, None),
 108     'wywiad_odp': ('blockquote', False, {'class': 'wywiad_odp'}, None, None),
 109     'rownolegle': ('blockquote', False, {'class': 'rownolegle'}, None, None),
 110     'animacja': ('div', False, {'class': 'animacja'}, None, None),
 111     'data': ('div', True, {'class': 'data'}, None, None),
 112     'podpis': ('div', True, {'class': 'podpis'}, None, None),
 113     'naglowek_listu': ('div', True, {'class': 'naglowek_listu'}, None, None),
 114     'pozdrowienie': ('div', True, {'class': 'pozdrowienie'}, None, None),
 115     'adresat': ('div', True, {'class': 'adresat'}, None, None),
 116     'tytul_oryg': ('div', True, {'class': 'tytul_oryg'}, None, None),
 117     'miejsce_data': ('div', True, {'class': 'miejsce_data'}, None, None),
 118     'audio': ('_ignore', False, None, None, None),
 119     'www': ('a', True, {'class': 'www'}, {'href': '.text'}, False),
 120
 121     'tabela': ('table', False, None, None, None),
 122     'tabelka': ('table', False, None, None, None),
 123     'wiersz': ('tr', False, None, None, None),
 124     'kol': ('td', True, None, None, None),
 125
 126     'ilustr': ('img', False, None, {'src': 'src'}, False),
 127     'tab': ('span', False, {'class': 'tab'}, {'szer': 'szer'}, False),
 128
 129 }
 130
 131 id_prefixes = {
 132     'pa': 'fn',
 133     'pe': 'fn',
 134     'pr': 'fn',
 135     'pt': 'fn',
 136     'ptrad': 'fn',
 137     'wers': 'f',
 138     'wers_wciety': 'f',
 139     'wers_cd': 'f',
 140     'wers_akap': 'f',
 141     'zastepnik_wersu': 'f',
 142     'wers_do_prawej': 'f',
 143     'wers_srodek': 'f',
 144     'akap': 'f',
 145     'akap_cd': 'f',
 146     'akap_dialog': 'f',
 147 }
 148
 149
 150 #tree = etree.parse(argv[1])
 151
 152 front1 = set([
 153     'dzielo_nadrzedne',
 154     'nazwa_utworu',
 155     'podtytul',
 156     ])
 157 front2 = set(['autor_utworu'])
 158
 159
 160 def norm(text):
 161     text = text.replace('---', '—').replace('--', '–').replace('...', '…').replace(',,', '„').replace('"', '”').replace('\n', ' ')
 162     return text
 163
 164
 165 def toj(elem, S):
 166     if elem.tag is etree.Comment: return []
 167     tag, hastext, attrs, attr_map, num = tags[elem.tag]
 168     contents = []
 169     if tag == '_pass':
 170         output = contents
 171     elif tag == '_ignore':
 172         return []
 173     else:
 174         output = {
 175             'tag': tag,
 176         }
 177         if num:
 178             S['index'] += 1
 179             output['paragraphIndex'] = S['index']
 180             if 'dlugi_cytat' not in S['stack'] and 'poezja_cyt' not in S['stack']:
 181                 S['vindex'] += 1
 182                 output['visibleNumber'] = S['vindex']
 183         id_prefix = id_prefixes.get(elem.tag, 'i')
 184         S['id'][id_prefix] += 1
 185         output['id'] = id_prefix + str(S['id'][id_prefix])
 186         if elem.attrib.get('id'):
 187             output['id'] = 'wl-' + elem.attrib.get('id')
 188         if attrs:
 189             output['attr'] = attrs.copy()
 190         if attr_map:
 191             output.setdefault('attr', {})
 192             for k, v in attr_map.items():
 193                 if v == '.text':
 194                     val = elem.text
 195                 else:
 196                     val = elem.attrib[v]
 197                 output['attr'][k] = val
 198         output['contents'] = contents
 199         output = [output]
 200     if elem.tag == 'strofa':
 201         verses = [etree.Element('wers')]
 202         if elem.text:
 203             vparts = re.split(r'/\s+', elem.text)
 204             for i, v in enumerate(vparts):
 205                 if i:
 206                     verses.append(etree.Element('wers'))
 207                 verses[-1].text = (verses[-1].text or '') + v
 208         for child in elem:
 209             vparts = re.split(r'/\s+', child.tail or '')
 210             child.tail = vparts[0]
 211             verses[-1].append(child)
 212             for v in vparts[1:]:
 213                 verses.append(etree.Element('wers'))
 214                 verses[-1].text = v
 215
 216         if not(len(verses[-1]) or (verses[-1].text or '').strip()):
 217             verses.pop()
 218
 219         elem.clear(keep_tail=True)
 220         for verse in verses:
 221             if len(verse) == 1 and (verse[0].tag.startswith('wers') or verse[0].tag == 'zastepnik_wersu') and not (verse[0].tail or '').strip():
 222                 elem.append(verse[0])
 223             else:
 224                 elem.append(verse)
 225
 226         #if not len(elem):
 227         #    for v in re.split(r'/\s+', elem.text):
 228         #        etree.SubElement(elem, 'wers').text = v
 229         #    elem.text = None
 230
 231     if hastext and elem.text:
 232         contents.append(norm(elem.text))
 233     for c in elem:
 234         S['stack'].append(elem.tag)
 235         contents += toj(c, S)
 236         if hastext and c.tail:
 237             contents.append(norm(c.tail))
 238         S['stack'].pop()
 239
 240     if elem.tag in front1:
 241         S['front1'] += output
 242         return []
 243     if elem.tag in front2:
 244         S['front2'] += output
 245         return []
 246     return output
 247
 248 def conv(tree):
 249     S = {
 250         'index': 0,
 251         'vindex': 0,
 252         'id': defaultdict(lambda: 0),
 253         'stack': [],
 254         'front1': [],
 255         'front2': [],
 256     }
 257     output = toj(tree.getroot(), S)
 258     if not len(output): return {}
 259     jt = output[0]
 260     jt['front1'] = S['front1']
 261     jt['front2'] = S['front2']
 262     return jt
 263
 264 #print(json.dumps(jt, indent=2, ensure_ascii=False))