use ints for stats

[wolnelektury.git] / src / catalogue / api / tojson.py
diff --git a/src/catalogue/api/tojson.py b/src/catalogue/api/tojson.py

index 1fe055c..b803e73 100644 (file)
--- a/src/catalogue/api/tojson.py
+++ b/src/catalogue/api/tojson.py
@@ -54,7 +54,7 @@ tags = {
      'osoba': ('em', True, {'class': 'osoba'}, None, None),
      'didaskalia': ('div', True, {'class': 'didaskalia'}, None, None),
      'kwestia': ('div', False, {'class': 'kwestia'}, None, None),
-    'didask_tekst': ('em', False, {'class': 'didask_tekst'}, None, None),
+    'didask_tekst': ('em', True, {'class': 'didask_tekst'}, None, None),
      
      'naglowek_czesc': ('h2', True, None, None, None),
      'naglowek_akt': ('h2', True, None, None, None),
@@ -82,13 +82,50 @@ tags = {
  
      'begin': ('_ignore', True, {'class': 'reference'}, {'data-uri': 'href'}, False),
      'end': ('_ignore', True, {'class': 'reference'}, {'data-uri': 'href'}, False),
-    'motyw': ('a', True, {'class': 'theme'}, None, False),
+    'motyw': ('_ignore', True, {'class': 'theme'}, None, False),
  
      'pa': ('a', True, {'class': 'footnote footnote-pa'}, None, False),
      'pe': ('a', True, {'class': 'footnote footnote-pe'}, None, False),
      'pr': ('a', True, {'class': 'footnote footnote-pr'}, None, False),
      'pt': ('a', True, {'class': 'footnote footnote-pt'}, None, False),
      'ptrad': ('a', True, {'class': 'footnote footnote-ptrad'}, None, False),
+
+    'werset': ('p', True, {'class': 'werset'}, None, True),
+    'br': ('br', False, None, None, None),
+    'indeks_dolny': ('em', True, {'class': 'indeks_dolny'}, None, False),
+    'mat': ('span', True, {'class': 'mat'}, None, False),
+
+    'mfenced': ('math_mfenced', True, None, None, False),
+    'mfrac': ('math_mfrac', True, None, None, False),
+    'mrow': ('math_mrow', True, None, None, False),
+    'mi': ('math_mi', True, None, None, False),
+    'mn': ('math_mn', True, None, None, False),
+    'mo': ('math_mo', True, None, None, False),
+    'msup': ('math_msup', True, None, None, False),
+
+    'list': ('blockquote', False, {'class': 'list'}, None, None),
+    'wywiad_pyt': ('blockquote', False, {'class': 'wywiad_pyt'}, None, None),
+    'wywiad_odp': ('blockquote', False, {'class': 'wywiad_odp'}, None, None),
+    'rownolegle': ('blockquote', False, {'class': 'rownolegle'}, None, None),
+    'animacja': ('div', False, {'class': 'animacja'}, None, None),
+    'data': ('div', True, {'class': 'data'}, None, None),
+    'podpis': ('div', True, {'class': 'podpis'}, None, None),
+    'naglowek_listu': ('div', True, {'class': 'naglowek_listu'}, None, None),
+    'pozdrowienie': ('div', True, {'class': 'pozdrowienie'}, None, None),
+    'adresat': ('div', True, {'class': 'adresat'}, None, None),
+    'tytul_oryg': ('div', True, {'class': 'tytul_oryg'}, None, None),
+    'miejsce_data': ('div', True, {'class': 'miejsce_data'}, None, None),
+    'audio': ('_ignore', False, None, None, None),
+    'www': ('a', True, {'class': 'www'}, {'href': '.text'}, False),
+
+    'tabela': ('table', False, None, None, None),
+    'tabelka': ('table', False, None, None, None),
+    'wiersz': ('tr', False, None, None, None),
+    'kol': ('td', True, None, None, None),
+
+    'ilustr': ('img', False, None, {'src': 'src'}, False),
+    'tab': ('span', False, {'class': 'tab'}, {'szer': 'szer'}, False),
+    
  }
  
  id_prefixes = {
@@ -97,7 +134,17 @@ id_prefixes = {
      'pr': 'fn',
      'pt': 'fn',
      'ptrad': 'fn',
-    }
+    'wers': 'f',
+    'wers_wciety': 'f',
+    'wers_cd': 'f',
+    'wers_akap': 'f',
+    'zastepnik_wersu': 'f',
+    'wers_do_prawej': 'f',
+    'wers_srodek': 'f',
+    'akap': 'f',
+    'akap_cd': 'f',
+    'akap_dialog': 'f',
+}
  
  
  #tree = etree.parse(argv[1])
@@ -111,7 +158,7 @@ front2 = set(['autor_utworu'])
  
  
  def norm(text):
-    text = text.replace('---', '—').replace('--', '–').replace('...', '…').replace(',,', '„').replace('"', '”')
+    text = text.replace('---', '—').replace('--', '–').replace('...', '…').replace(',,', '„').replace('"', '”').replace('\n', ' ')
      return text
  
  
@@ -133,15 +180,21 @@ def toj(elem, S):
              if 'dlugi_cytat' not in S['stack'] and 'poezja_cyt' not in S['stack']:
                  S['vindex'] += 1
                  output['visibleNumber'] = S['vindex']
-        id_prefix = id_prefixes.get(tag, 'i')
+        id_prefix = id_prefixes.get(elem.tag, 'i')
          S['id'][id_prefix] += 1
          output['id'] = id_prefix + str(S['id'][id_prefix])
+        if elem.attrib.get('id'):
+            output['id'] = 'wl-' + elem.attrib.get('id')
          if attrs:
              output['attr'] = attrs.copy()
          if attr_map:
              output.setdefault('attr', {})
              for k, v in attr_map.items():
-                output['attr'][k] = elem.attrib[v]
+                if v == '.text':
+                    val = elem.text
+                else:
+                    val = elem.attrib[v]
+                output['attr'][k] = val
          output['contents'] = contents
          output = [output]
      if elem.tag == 'strofa':