'osoba': ('em', True, {'class': 'osoba'}, None, None),
'didaskalia': ('div', True, {'class': 'didaskalia'}, None, None),
'kwestia': ('div', False, {'class': 'kwestia'}, None, None),
- 'didask_tekst': ('em', False, {'class': 'didask_tekst'}, None, None),
+ 'didask_tekst': ('em', True, {'class': 'didask_tekst'}, None, None),
'naglowek_czesc': ('h2', True, None, None, None),
'naglowek_akt': ('h2', True, None, None, None),
'begin': ('_ignore', True, {'class': 'reference'}, {'data-uri': 'href'}, False),
'end': ('_ignore', True, {'class': 'reference'}, {'data-uri': 'href'}, False),
- 'motyw': ('a', True, {'class': 'theme'}, None, False),
+ 'motyw': ('_ignore', True, {'class': 'theme'}, None, False),
'pa': ('a', True, {'class': 'footnote footnote-pa'}, None, False),
'pe': ('a', True, {'class': 'footnote footnote-pe'}, None, False),
'pr': ('a', True, {'class': 'footnote footnote-pr'}, None, False),
'pt': ('a', True, {'class': 'footnote footnote-pt'}, None, False),
'ptrad': ('a', True, {'class': 'footnote footnote-ptrad'}, None, False),
+
+ 'werset': ('p', True, {'class': 'werset'}, None, True),
+ 'br': ('br', False, None, None, None),
+ 'indeks_dolny': ('em', True, {'class': 'indeks_dolny'}, None, False),
+ 'mat': ('span', True, {'class': 'mat'}, None, False),
+
+ 'mfenced': ('math_mfenced', True, None, None, False),
+ 'mfrac': ('math_mfrac', True, None, None, False),
+ 'mrow': ('math_mrow', True, None, None, False),
+ 'mi': ('math_mi', True, None, None, False),
+ 'mn': ('math_mn', True, None, None, False),
+ 'mo': ('math_mo', True, None, None, False),
+ 'msup': ('math_msup', True, None, None, False),
+
+ 'list': ('blockquote', False, {'class': 'list'}, None, None),
+ 'wywiad_pyt': ('blockquote', False, {'class': 'wywiad_pyt'}, None, None),
+ 'wywiad_odp': ('blockquote', False, {'class': 'wywiad_odp'}, None, None),
+ 'rownolegle': ('blockquote', False, {'class': 'rownolegle'}, None, None),
+ 'animacja': ('div', False, {'class': 'animacja'}, None, None),
+ 'data': ('div', True, {'class': 'data'}, None, None),
+ 'podpis': ('div', True, {'class': 'podpis'}, None, None),
+ 'naglowek_listu': ('div', True, {'class': 'naglowek_listu'}, None, None),
+ 'pozdrowienie': ('div', True, {'class': 'pozdrowienie'}, None, None),
+ 'adresat': ('div', True, {'class': 'adresat'}, None, None),
+ 'tytul_oryg': ('div', True, {'class': 'tytul_oryg'}, None, None),
+ 'miejsce_data': ('div', True, {'class': 'miejsce_data'}, None, None),
+ 'audio': ('_ignore', False, None, None, None),
+ 'www': ('a', True, {'class': 'www'}, {'href': '.text'}, False),
+
+ 'tabela': ('table', False, None, None, None),
+ 'tabelka': ('table', False, None, None, None),
+ 'wiersz': ('tr', False, None, None, None),
+ 'kol': ('td', True, None, None, None),
+
+ 'ilustr': ('img', False, None, {'src': 'src'}, False),
+ 'tab': ('span', False, {'class': 'tab'}, {'szer': 'szer'}, False),
+
}
# Maps an element tag to the prefix of its generated id.  The lookup site
# uses ``id_prefixes.get(elem.tag, 'i')``, so any tag not listed here gets
# the 'i' prefix; the prefix is then joined with a per-prefix counter.
id_prefixes = {
    # Footnote tags.
    'pr': 'fn',
    'pt': 'fn',
    'ptrad': 'fn',
    # Verse-line and paragraph tags (going by their Polish names).
    'wers': 'f',
    'wers_wciety': 'f',
    'wers_cd': 'f',
    'wers_akap': 'f',
    'zastepnik_wersu': 'f',
    'wers_do_prawej': 'f',
    'wers_srodek': 'f',
    'akap': 'f',
    'akap_cd': 'f',
    'akap_dialog': 'f',
}
#tree = etree.parse(argv[1])
def norm(text):
    """Replace ASCII punctuation shorthands with typographic characters.

    The substitutions run in the order listed, so ``---`` is turned into an
    em dash before ``--`` can be turned into an en dash.  Line breaks are
    flattened to single spaces last.

    :param text: the string to normalize.
    :return: the normalized string.
    """
    replacements = (
        ('---', '—'),
        ('--', '–'),
        ('...', '…'),
        (',,', '„'),
        ('"', '”'),
        ('\n', ' '),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text
if 'dlugi_cytat' not in S['stack'] and 'poezja_cyt' not in S['stack']:
S['vindex'] += 1
output['visibleNumber'] = S['vindex']
- id_prefix = id_prefixes.get(tag, 'i')
+ id_prefix = id_prefixes.get(elem.tag, 'i')
S['id'][id_prefix] += 1
output['id'] = id_prefix + str(S['id'][id_prefix])
+ if elem.attrib.get('id'):
+ output['id'] = 'wl-' + elem.attrib.get('id')
if attrs:
output['attr'] = attrs.copy()
if attr_map:
output.setdefault('attr', {})
for k, v in attr_map.items():
- output['attr'][k] = elem.attrib[v]
+ if v == '.text':
+ val = elem.text
+ else:
+ val = elem.attrib[v]
+ output['attr'][k] = val
output['contents'] = contents
output = [output]
if elem.tag == 'strofa':