1 from collections import defaultdict
# Entries of the element-name -> conversion-rule table (presumably the `tags`
# dict; its opening line is not visible in this excerpt).  The converter
# unpacks each value as
#     tag, hastext, attrs, attr_map, num = tags[elem.tag]
#   tag      - output tag name, or a '_'-prefixed marker ('_pass', '_ignore');
#              '_ignore' is tested for explicitly by the converter.
#   hastext  - when true, the element's own text and the tails of its
#              children are passed through norm() and added to the contents.
#   attrs    - dict of fixed attributes copied into output['attr'], or None.
#   attr_map - dict iterated as (k, v) to fill output['attr'][k], or None
#              (presumably v names where the value is read from - confirm).
#   num      - True / False / None; how it is used is not visible in this
#              excerpt (presumably paragraph numbering - TODO confirm).
8 'utwor': ('_pass', False, None, None, None),
9 '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF': ('_ignore', False, None, None, None),
10 'abstrakt': ('_ignore', False, None, None, None),
11 'uwaga': ('_ignore', False, None, None, None),
12 'extra': ('_ignore', False, None, None, None),
13 'nota_red': ('_ignore', False, None, None, None),
14 'numeracja': ('_ignore', False, None, None, None),
# Entries mapped to the 'master' output tag.
16 'powiesc': ('master', False, None, None, None),
17 'opowiadanie': ('master', False, None, None, None),
18 'liryka_lp': ('master', False, None, None, None),
19 'liryka_l': ('master', False, None, None, None),
20 'dramat_wspolczesny': ('master', False, None, None, None),
21 'dramat_wierszowany_lp': ('master', False, None, None, None),
22 'dramat_wierszowany_l': ('master', False, None, None, None),
# Entries mapped to 'blockquote'.
24 'dlugi_cytat': ('blockquote', False, None, None, None),
25 'poezja_cyt': ('blockquote', False, None, None, None),
26 'dlugi_cyt': ('blockquote', False, None, None, None),
27 'ramka': ('blockquote', False, {'class': 'ramka'}, None, None),
29 'blok': ('div', False, None, None, None),
# Stanza ('strofa') and the verse-line variants, all emitted as classed divs.
31 'strofa': ('div', True, {'class': 'stanza'}, None, None),
32 'wers': ('div', True, {'class': 'verse'}, None, None),
33 'wers_wciety': ('div', True, {'class': 'wers_wciety'}, None, None),
34 'wers_cd': ('div', True, {'class': 'wers_cd'}, None, None),
35 'wers_akap': ('div', True, {'class': 'wers_akap'}, None, None),
36 'zastepnik_wersu': ('div', True, {'class': 'zastepnik_wersu'}, None, None),
37 'wers_do_prawej': ('div', True, {'class': 'wers_do_prawej'}, None, None),
38 'wers_srodek': ('div', True, {'class': 'wers_srodek'}, None, None),
40 'autor_utworu': ('div', True, {'class': 'author'}, None, None),
41 'dzielo_nadrzedne': ('div', True, {'class': 'dzielo_nadrzedne'}, None, None),
42 'nazwa_utworu': ('div', True, {'class': 'title'}, None, None),
43 'podtytul': ('div', True, {'class': 'podtytul'}, None, None),
45 'motto': ('div', False, {'class': 'motto'}, None, None),
46 'motto_podpis': ('div', True, {'class': 'motto_podpis'}, None, None),
47 'dedykacja': ('div', True, {'class': 'dedykacja'}, None, None),
48 'miejsce_czas': ('div', True, {'class': 'miejsce_czas'}, None, None),
50 'lista_osob': ('div', False, {'class': 'lista_osob'}, None, None),
51 'naglowek_listy': ('div', True, {'class': 'naglowek_listy'}, None, None),
52 'lista_osoba': ('div', True, {'class': 'lista_osoba'}, None, None),
53 'naglowek_osoba': ('div', True, {'class': 'naglowek_osoba'}, None, None),
54 'osoba': ('em', True, {'class': 'osoba'}, None, None),
55 'didaskalia': ('div', True, {'class': 'didaskalia'}, None, None),
56 'kwestia': ('div', False, {'class': 'kwestia'}, None, None),
57 'didask_tekst': ('em', True, {'class': 'didask_tekst'}, None, None),
# Heading elements mapped to h2-h5.
59 'naglowek_czesc': ('h2', True, None, None, None),
60 'naglowek_akt': ('h2', True, None, None, None),
61 'naglowek_scena': ('h3', True, None, None, None),
62 'naglowek_rozdzial': ('h3', True, None, None, None),
63 'naglowek_podrozdzial': ('h4', True, None, None, None),
64 'srodtytul': ('h5', True, None, None, None),
66 'nota': ('div', True, {'class': 'note'}, None, False),
# Entries mapped to 'p'; these are the rules in this table with num=True
# (together with 'werset' further down).
68 'akap': ('p', True, {'class': 'paragraph'}, None, True),
69 'akap_dialog': ('p', True, {'class': 'paragraph'}, None, True),
70 'akap_cd': ('p', True, {'class': 'paragraph'}, None, True),
72 'sekcja_asterysk': ('p', True, {'class': 'spacer-asterisk'}, None, True),
73 'sekcja_swiatlo': ('p', True, {'class': 'sekcja_swiatlo'}, None, True),
74 'separator_linia': ('p', True, {'class': 'separator_linia'}, None, True),
76 'tytul_dziela': ('em', True, {'class': 'book-title'}, None, False),
77 'slowo_obce': ('em', True, {'class': 'foreign-word'}, None, False),
78 'wyroznienie': ('em', True, {'class': 'author-emphasis'}, None, False),
79 'wieksze_odstepy': ('em', True, {'class': 'wieksze_odstepy'}, None, False),
81 'ref': ('a', True, {'class': 'reference'}, {'data-uri': 'href'}, False),
# The next three carry attrs / attr_map values but their tag is '_ignore'.
83 'begin': ('_ignore', True, {'class': 'reference'}, {'data-uri': 'href'}, False),
84 'end': ('_ignore', True, {'class': 'reference'}, {'data-uri': 'href'}, False),
85 'motyw': ('_ignore', True, {'class': 'theme'}, None, False),
# Footnote elements, emitted as 'a' with a per-kind 'footnote-*' class.
87 'pa': ('a', True, {'class': 'footnote footnote-pa'}, None, False),
88 'pe': ('a', True, {'class': 'footnote footnote-pe'}, None, False),
89 'pr': ('a', True, {'class': 'footnote footnote-pr'}, None, False),
90 'pt': ('a', True, {'class': 'footnote footnote-pt'}, None, False),
91 'ptrad': ('a', True, {'class': 'footnote footnote-ptrad'}, None, False),
93 'werset': ('p', True, {'class': 'werset'}, None, True),
94 'br': ('br', False, None, None, None),
95 'indeks_dolny': ('em', True, {'class': 'indeks_dolny'}, None, False),
96 'mat': ('span', True, {'class': 'mat'}, None, False),
# Element names mapped to 'math_'-prefixed output tags.
98 'mfenced': ('math_mfenced', True, None, None, False),
99 'mfrac': ('math_mfrac', True, None, None, False),
100 'mrow': ('math_mrow', True, None, None, False),
101 'mi': ('math_mi', True, None, None, False),
102 'mn': ('math_mn', True, None, None, False),
103 'mo': ('math_mo', True, None, None, False),
104 'msup': ('math_msup', True, None, None, False),
106 'list': ('blockquote', False, {'class': 'list'}, None, None),
107 'wywiad_pyt': ('blockquote', False, {'class': 'wywiad_pyt'}, None, None),
108 'wywiad_odp': ('blockquote', False, {'class': 'wywiad_odp'}, None, None),
109 'rownolegle': ('blockquote', False, {'class': 'rownolegle'}, None, None),
110 'animacja': ('div', False, {'class': 'animacja'}, None, None),
111 'data': ('div', True, {'class': 'data'}, None, None),
112 'podpis': ('div', True, {'class': 'podpis'}, None, None),
113 'naglowek_listu': ('div', True, {'class': 'naglowek_listu'}, None, None),
114 'pozdrowienie': ('div', True, {'class': 'pozdrowienie'}, None, None),
115 'adresat': ('div', True, {'class': 'adresat'}, None, None),
116 'tytul_oryg': ('div', True, {'class': 'tytul_oryg'}, None, None),
117 'miejsce_data': ('div', True, {'class': 'miejsce_data'}, None, None),
118 'audio': ('_ignore', False, None, None, None),
119 'www': ('a', True, {'class': 'www'}, {'href': '.text'}, False),
# Table elements.
121 'tabela': ('table', False, None, None, None),
122 'tabelka': ('table', False, None, None, None),
123 'wiersz': ('tr', False, None, None, None),
124 'kol': ('td', True, None, None, None),
126 'ilustr': ('img', False, None, {'src': 'src'}, False),
127 'tab': ('span', False, {'class': 'tab'}, {'szer': 'szer'}, False),
# Presumably entries of the id_prefixes table, which the converter consults
# via id_prefixes.get(elem.tag, 'i') to pick the prefix of a generated
# element id.  The opening of this dict is not visible here - confirm
# against the full file.
141 'zastepnik_wersu': 'f',
142 'wers_do_prawej': 'f',
150 #tree = etree.parse(argv[1])
# Element names whose converted output is additionally accumulated into
# S['front2'] by the converter.
157 front2 = set(['autor_utworu'])
# Typographic normalisation of a text run: '---' becomes an em dash, '--' an
# en dash, '...' an ellipsis character, ',,' a low opening quotation mark,
# '"' a right double quotation mark, and every newline a single space.
# '---' is replaced before '--' so the longer marker is not consumed as an
# en dash followed by a stray hyphen.
161 text = text.replace('---', '—').replace('--', '–').replace('...', '…').replace(',,', '„').replace('"', '”').replace('\n', ' ')
# Fragment of the recursive element converter (presumably the body of
# toj(elem, S), which is called on each child below).  Several original
# lines - including the def line, loop headers and some branch bodies - are
# missing from this excerpt, so the notes describe only what is visible.

# XML comment nodes contribute nothing to the output.
166 if elem.tag is etree.Comment: return []
# Fetch the conversion rule for this element name; no fallback for names
# absent from the table is visible here.
167 tag, hastext, attrs, attr_map, num = tags[elem.tag]
# Branch for rules marked '_ignore' (its body is not shown).
171 elif tag == '_ignore':
# Record the running paragraph index kept in the shared state S.
179 output['paragraphIndex'] = S['index']
# True when neither quotation element is among the tags pushed on S['stack'];
# the statement guarded by this test is not visible in this excerpt.
180 if 'dlugi_cytat' not in S['stack'] and 'poezja_cyt' not in S['stack']:
182 output['visibleNumber'] = S['vindex']
# Generated id: the per-tag prefix (default 'i') followed by that prefix's
# incremented counter.
183 id_prefix = id_prefixes.get(elem.tag, 'i')
184 S['id'][id_prefix] += 1
185 output['id'] = id_prefix + str(S['id'][id_prefix])
# A non-empty id attribute on the source element replaces the generated id,
# prefixed with 'wl-'.
186 if elem.attrib.get('id'):
187 output['id'] = 'wl-' + elem.attrib.get('id')
# Copy, because output['attr'] is written to below and attrs comes from the
# shared rule table.
189 output['attr'] = attrs.copy()
# Make sure output['attr'] exists before mapped attributes are stored in it.
191 output.setdefault('attr', {})
# val is computed on lines that are not part of this excerpt.
192 for k, v in attr_map.items():
197 output['attr'][k] = val
198 output['contents'] = contents
# 'strofa' handling: the element's text and its children's tails are split
# on '/' followed by whitespace, and the pieces are regrouped into 'wers'
# elements collected in `verses`.
200 if elem.tag == 'strofa':
# Start with one empty verse to receive the first piece.
201 verses = [etree.Element('wers')]
203 vparts = re.split(r'/\s+', elem.text)
204 for i, v in enumerate(vparts):
# A new verse is opened here (the condition guarding this line is not shown),
# then the piece is appended to the text of the current last verse.
206 verses.append(etree.Element('wers'))
207 verses[-1].text = (verses[-1].text or '') + v
# Per child (loop header not shown): split its tail the same way, keep the
# first piece as the child's tail and move the child into the current verse.
209 vparts = re.split(r'/\s+', child.tail or '')
210 child.tail = vparts[0]
211 verses[-1].append(child)
213 verses.append(etree.Element('wers'))
# True when the last verse has no children and no non-blank text; the action
# taken is not shown (presumably the empty verse is dropped - confirm).
216 if not(len(verses[-1]) or (verses[-1].text or '').strip()):
# Empty the stanza element while keeping its own tail text.
219 elem.clear(keep_tail=True)
# When a built verse holds exactly one child that is itself a verse-like
# element (tag starting with 'wers', or 'zastepnik_wersu') with a blank tail,
# that child is appended to the stanza directly.
# NOTE(review): the visible condition does not look at verse.text, so text
# preceding such a child would not be carried over - confirm this is intended.
221 if len(verse) == 1 and (verse[0].tag.startswith('wers') or verse[0].tag == 'zastepnik_wersu') and not (verse[0].tail or '').strip():
222 elem.append(verse[0])
227 # for v in re.split(r'/\s+', elem.text):
228 # etree.SubElement(elem, 'wers').text = v
# Leading text of the element, normalised, when the rule allows text.
231 if hastext and elem.text:
232 contents.append(norm(elem.text))
# Push this element's tag before converting child c (the loop header and any
# matching pop are not shown), then splice in the child's converted output.
234 S['stack'].append(elem.tag)
235 contents += toj(c, S)
# Text that follows the child inside this element, normalised.
236 if hastext and c.tail:
237 contents.append(norm(c.tail))
# Accumulate output for elements listed in front1 / front2.
# NOTE(review): `+=` extends the target element by element; if output is
# still a dict at this point only its keys would be added - confirm the type
# of output here (lines in between are not visible).
240 if elem.tag in front1:
241 S['front1'] += output
243 if elem.tag in front2:
244 S['front2'] += output
# Fragment of the driver code; the enclosing function and the rest of the
# state dict are not visible in this excerpt.

# Counter table whose missing keys start at 0 (presumably the 'id' entry of
# the state dict S, which the converter increments per id prefix - confirm).
252 'id': defaultdict(lambda: 0),
# Convert the whole document starting from the root element.
257 output = toj(tree.getroot(), S)
# An empty conversion result yields an empty dict.
258 if not len(output): return {}
# Expose the values accumulated under S['front1'] / S['front2'] in jt.
260 jt['front1'] = S['front1']
261 jt['front2'] = S['front2']
264 #print(json.dumps(jt, indent=2, ensure_ascii=False))