X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/102acc0b6eb715826f26b5082611604cf6ebe240..HEAD:/src/catalogue/api/tojson.py diff --git a/src/catalogue/api/tojson.py b/src/catalogue/api/tojson.py index 633d4f6bb..3ff257a60 100644 --- a/src/catalogue/api/tojson.py +++ b/src/catalogue/api/tojson.py @@ -1,3 +1,4 @@ +from collections import defaultdict import json import re from sys import argv @@ -81,13 +82,36 @@ tags = { 'begin': ('_ignore', True, {'class': 'reference'}, {'data-uri': 'href'}, False), 'end': ('_ignore', True, {'class': 'reference'}, {'data-uri': 'href'}, False), - 'motyw': ('a', True, {'class': 'theme'}, None, False), + 'motyw': ('_ignore', True, {'class': 'theme'}, None, False), 'pa': ('a', True, {'class': 'footnote footnote-pa'}, None, False), 'pe': ('a', True, {'class': 'footnote footnote-pe'}, None, False), 'pr': ('a', True, {'class': 'footnote footnote-pr'}, None, False), 'pt': ('a', True, {'class': 'footnote footnote-pt'}, None, False), 'ptrad': ('a', True, {'class': 'footnote footnote-ptrad'}, None, False), + + 'werset': ('p', True, {'class': 'werset'}, None, True), + 'br': ('br', False, None, None, None), + 'indeks_dolny': ('em', True, {'class': 'indeks_dolny'}, None, False), + 'mat': ('span', True, {'class': 'mat'}, None, False), +} + +id_prefixes = { + 'pa': 'fn', + 'pe': 'fn', + 'pr': 'fn', + 'pt': 'fn', + 'ptrad': 'fn', + 'wers': 'f', + 'wers_wciety': 'f', + 'wers_cd': 'f', + 'wers_akap': 'f', + 'zastepnik_wersu': 'f', + 'wers_do_prawej': 'f', + 'wers_srodek': 'f', + 'akap': 'f', + 'akap_cd': 'f', + 'akap_dialog': 'f', } @@ -102,7 +126,7 @@ front2 = set(['autor_utworu']) def norm(text): - text = text.replace('---', '—').replace('--', '–').replace('...', '…').replace(',,', '„').replace('"', '”') + text = text.replace('---', '—').replace('--', '–').replace('...', '…').replace(',,', '„').replace('"', '”').replace('\n', ' ') return text @@ -124,6 +148,11 @@ def toj(elem, S): if 'dlugi_cytat' not in S['stack'] and 'poezja_cyt' not in S['stack']: S['vindex'] += 1 output['visibleNumber'] = S['vindex'] + id_prefix = id_prefixes.get(elem.tag, 'i') + S['id'][id_prefix] += 1 + output['id'] = id_prefix + str(S['id'][id_prefix]) + if elem.attrib.get('id'): + output['id'] = 'wl-' + elem.attrib.get('id') if attrs: output['attr'] = attrs.copy() if attr_map: @@ -184,6 +213,7 @@ def conv(tree): S = { 'index': 0, 'vindex': 0, + 'id': defaultdict(lambda: 0), 'stack': [], 'front1': [], 'front2': [],