From: Radek Czajka
Date: Thu, 27 Sep 2012 15:09:02 +0000 (+0200)
Subject: Many FB2 fixes
X-Git-Tag: 1.7~143
X-Git-Url: https://git.mdrn.pl/librarian.git/commitdiff_plain/3754989331c91f1d78cd5c1904f768a4cf80f07a?ds=sidebyside;hp=-c
Many FB2 fixes
---
3754989331c91f1d78cd5c1904f768a4cf80f07a
diff --git a/librarian/epub.py b/librarian/epub.py
index bbeb3d7..10922d4 100644
--- a/librarian/epub.py
+++ b/librarian/epub.py
@@ -7,6 +7,7 @@ from __future__ import with_statement
import os
import os.path
+import re
import subprocess
from StringIO import StringIO
from copy import deepcopy
@@ -109,31 +110,74 @@ def find_annotations(annotations, source, part_no):
find_annotations(annotations, child, part_no)
+class Stanza(object):
+ """
+ Converts / verse endings into verse elements in a stanza.
+
+ Slashes may only occur directly in the stanza. Any slashes in subelements
+ will be ignored, and the subelements will be put inside verse elements.
+
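+ >>> s = etree.fromstring("<strofa>a/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
+ >>> Stanza(s).versify()
+ >>> print etree.tostring(s)
+ <strofa><wers_normalny>a</wers_normalny><wers_normalny>b<x>x/
+ y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>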
+
+ """
+ def __init__(self, stanza_elem):
+ self.stanza = stanza_elem
+ self.verses = []
+ self.open_verse = None
+
+ def versify(self):
+ self.push_text(self.stanza.text)
+ for elem in self.stanza:
+ self.push_elem(elem)
+ self.push_text(elem.tail)
+ tail = self.stanza.tail
+ self.stanza.clear()
+ self.stanza.tail = tail
+ self.stanza.extend(self.verses)
+
+ def open_normal_verse(self):
+ self.open_verse = self.stanza.makeelement("wers_normalny")
+ self.verses.append(self.open_verse)
+
+ def get_open_verse(self):
+ if self.open_verse is None:
+ self.open_normal_verse()
+ return self.open_verse
+
+ def push_text(self, text):
+ if not text or not text.strip():
+ return
+ for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
+ if i:
+ self.open_normal_verse()
+ verse = self.get_open_verse()
+ if len(verse):
+ verse[-1].tail = (verse[-1].tail or "") + verse_text.strip()
+ else:
+ verse.text = (verse.text or "") + verse_text.strip()
+
+ def push_elem(self, elem):
+ if elem.tag.startswith("wers"):
+ verse = deepcopy(elem)
+ verse.tail = None
+ self.verses.append(verse)
+ self.open_verse = verse
+ else:
+ appended = deepcopy(elem)
+ appended.tail = None
+ self.get_open_verse().append(appended)
+
+
def replace_by_verse(tree):
""" Find stanzas and create new verses in place of a '/' character """
stanzas = tree.findall('.//' + WLNS('strofa'))
- for node in stanzas:
- for child_node in node:
- if child_node.tag in ('slowo_obce', 'wyroznienie'):
- foreign_verses = inner_xml(child_node).split('/\n')
- if len(foreign_verses) > 1:
- new_foreign = ''
- for foreign_verse in foreign_verses:
- if foreign_verse.startswith('', foreign_verse, ''))
- set_inner_xml(child_node, new_foreign)
- verses = inner_xml(node).split('/\n')
- if len(verses) > 1:
- modified_inner_xml = ''
- for verse in verses:
- if verse.startswith('', verse, ''))
- set_inner_xml(node, modified_inner_xml)
+ for stanza in stanzas:
+ Stanza(stanza).versify()
def add_to_manifest(manifest, partno):
diff --git a/librarian/fb2.py b/librarian/fb2.py
index 78707a9..d979566 100644
--- a/librarian/fb2.py
+++ b/librarian/fb2.py
@@ -12,6 +12,28 @@ from .epub import replace_by_verse
functions.reg_substitute_entities()
+functions.reg_person_name()
+
+
+def sectionify(tree):
+ """Finds section headers and adds a tree of _section tags."""
+ sections = ['naglowek_czesc',
+ 'naglowek_akt', 'naglowek_rozdzial', 'naglowek_scena',
+ 'naglowek_podrozdzial']
+ section_level = dict((v,k) for (k,v) in enumerate(sections))
+
+ # We can assume there are just subelements and no text at section level.
+ for level, section_name in reversed(list(enumerate(sections))):
+ for header in tree.findall('//' + section_name):
+ section = header.makeelement("_section")
+ header.addprevious(section)
+ section.append(header)
+ sibling = section.getnext()
+ while (sibling is not None and
+ section_level.get(sibling.tag, 1000) > level):
+ section.append(sibling)
+ sibling = section.getnext()
+
def transform(wldoc, verbose=False,
cover=None, flags=None):
@@ -32,6 +54,7 @@ def transform(wldoc, verbose=False,
style = etree.parse(style_filename)
replace_by_verse(document.edoc)
+ sectionify(document.edoc)
result = document.transform(style)
diff --git a/librarian/fb2/drama.xslt b/librarian/fb2/drama.xslt
new file mode 100755
index 0000000..ab8fb06
--- /dev/null
+++ b/librarian/fb2/drama.xslt
@@ -0,0 +1,42 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/librarian/fb2/fb2.xslt b/librarian/fb2/fb2.xslt
index 2a07e35..950b526 100644
--- a/librarian/fb2/fb2.xslt
+++ b/librarian/fb2/fb2.xslt
@@ -7,6 +7,7 @@
-->
@@ -16,6 +17,7 @@
+
@@ -31,12 +33,13 @@
-
+
+ select="autor_utworu|dzielo_nadrzedne|nazwa_utworu|podtytul"/>
+
@@ -49,24 +52,7 @@
-
-
-
-
+
@@ -79,6 +65,23 @@
+
+
+
+ tłum.
+
+ ,
+
+
+
+
+
+
+
+
+
+
+
diff --git a/librarian/fb2/footnotes.xslt b/librarian/fb2/footnotes.xslt
index 37f467f..09270b9 100644
--- a/librarian/fb2/footnotes.xslt
+++ b/librarian/fb2/footnotes.xslt
@@ -12,21 +12,24 @@
xmlns:l="http://www.w3.org/1999/xlink">
-
+
-
+
fn
-
+
+
+ [przypis autorski]
+
-
-
+
+
note
#fn
diff --git a/librarian/fb2/inline.xslt b/librarian/fb2/inline.xslt
index 221fbfd..03c6b65 100644
--- a/librarian/fb2/inline.xslt
+++ b/librarian/fb2/inline.xslt
@@ -17,12 +17,19 @@
-
+
-
+
+
+ â
+
+ â
+
+
+
diff --git a/librarian/fb2/paragraphs.xslt b/librarian/fb2/paragraphs.xslt
index 01943a3..68c6257 100644
--- a/librarian/fb2/paragraphs.xslt
+++ b/librarian/fb2/paragraphs.xslt
@@ -13,12 +13,34 @@
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ *
+
+
+
+ ââââââââ
+
+
+
+
diff --git a/librarian/fb2/poems.xslt b/librarian/fb2/poems.xslt
index bbc9407..31b05b4 100644
--- a/librarian/fb2/poems.xslt
+++ b/librarian/fb2/poems.xslt
@@ -33,7 +33,7 @@
puts it here -->
-
+
diff --git a/librarian/fb2/sections.xslt b/librarian/fb2/sections.xslt
index b698652..80ffb65 100644
--- a/librarian/fb2/sections.xslt
+++ b/librarian/fb2/sections.xslt
@@ -11,34 +11,37 @@
xmlns="http://www.gribuser.ru/xml/fictionbook/2.0"
xmlns:l="http://www.w3.org/1999/xlink">
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
diff --git a/scripts/book2txt b/scripts/book2txt
index e584579..1b4c0ef 100755
--- a/scripts/book2txt
+++ b/scripts/book2txt
@@ -5,6 +5,7 @@
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from librarian.book2anything import Book2Anything, Option
+from librarian.parser import WLDocument
class Book2Txt(Book2Anything):
@@ -21,6 +22,7 @@ class Book2Txt(Book2Anything):
Option('-w', '--wrap', action='store', type='int', dest='wrapping', default=0,
help='set line wrap column')
]
+ transform = WLDocument.as_text
if __name__ == '__main__':
diff --git a/setup.py b/setup.py
index f88817e..51003ef 100755
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ setup(
maintainer_email='radoslaw.czajka@nowoczesnapolska.org.pl',
url='http://github.com/fnp/librarian',
packages=['librarian'],
- package_data={'librarian': ['xslt/*.xslt', 'epub/*', 'mobi/*', 'pdf/*', 'fonts/*', 'res/*'] +
+ package_data={'librarian': ['xslt/*.xslt', 'epub/*', 'mobi/*', 'pdf/*', 'fb2/*', 'fonts/*', 'res/*'] +
whole_tree(os.path.join(os.path.dirname(__file__), 'librarian'), 'font-optimizer')},
include_package_data=True,
install_requires=['lxml>=2.2'],
@@ -38,6 +38,7 @@ setup(
'scripts/book2epub',
'scripts/book2mobi',
'scripts/book2pdf',
+ 'scripts/book2fb2',
'scripts/book2partner',
'scripts/book2cover',
'scripts/bookfragments',
diff --git a/tests/files/example-wl.xml b/tests/files/example-wl.xml
new file mode 100644
index 0000000..d2fd87b
--- /dev/null
+++ b/tests/files/example-wl.xml
@@ -0,0 +1,170 @@
+
+
+
+
+
+Utworu, Autor
+Tytuł w DC
+Utworu, Tłumacz
+Literacki, Redaktor
+Techniczny, Redaktor
+Fundacja Nowoczesna Polska
+period
+type
+genre
+Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana
+ przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN.
+http://wolnelektury.pl/katalog/lektura/test1
+source
+Domena publiczna
+1500
+xml
+text
+text
+2000
+pol
+
+
+
+
+nota_red
+
+autor_utworu
+dzielo_nadrzedne
+nazwa_utworu
+podtytul
+
+[powyżej:
+nota_red (nie pojawia się w tekście, może być podana osobno),
+autor_utworu, dzielo_nadrzedne, nazwa_utworu, podtytul, tłumacz (z DC)]
+
+[Noty: nota/akap, dedykacja/akap, motto/akap, motto_podpis]
+
+
+nota/akap
+dedykacja/akap
+motto/akap
+motto_podpis
+
+[Początek dramatu: lista_osob, naglowek_listy, lista_osoba, miejsce_czas]
+
+
+ lista_osob/naglowek_listy
+ lista_osob/lista_osoba
+ lista_osob/lista_osoba
+
+miejsce_czas
+
+[naglowek_czesc, naglowek_rozdzial, naglowek_podrozdzial, srodtytul]
+
+naglowek_czesc
+naglowek_rozdzial
+naglowek_podrozdzial
+srodtytul
+
+[akap, akap_cd, akap_dialog, motyw]
+
+akapmotyw
+akap_cd
+akap_dialog
+
+[strofa, wers_akap, wers_wciety,typ=1-6, wers_cd, zastepnik_wersu]
+
+strofa/
+wers_akap/
+wers_wciety@typ=1/
+wers_wciety@typ=2/
+wers_wciety@typ=3
+
+wers_wciety@typ=4/
+wers_wciety@typ=5/
+wers_wciety@typ=6/
+wers_cd/
+. . . . . . . . . . . . . . . .
+
+
+[dlugi_cytat/akap]
+
+Cytowany akapit powinien wyglądać jak cytowany akapit.
+Znaczy, może mieć jakieś dodatkowe wcięcie, jakiś rodzaj wyróżnienia czy coś.
+
+[poezja_cyt/strofa]
+
+To jest poezja/
+cytowana/
+ma być porządnie/
+wyrównana
+
+[naglowek_akt, naglowek_scena]
+
+naglowek_akt
+naglowek_scena
+
+[Kwestia: naglowek_osoba, kwestia, didask_tekst, didaskalia, strofa, akap]
+
+naglowek_osoba
+
+
+didask_tekst
+didaskalia
+Strofa w dramacie/
+jak amen w pacie/
+rzu.
+Powyższy kawałek wiersza jest najzupełniej bez sensu i tak naprawdę wcale nie trzyma rytmu ani rymu. Być może należy skoncentrować się na dramacie prozą, jak ta tutaj niniejsza wypowiedź.
+
+[didaskalia, osoba]
+
+odezwał się autor.
+
+[Wyróżnienia: tytul_dziela, tytul_dziela@typ=1, wyroznienie, slowo_obce]
+
+
+tytul_dziela,
+tytul_dziela@typ=1,
+wyroznienie,
+slowo_obce
+
+
+[Przypisy: pa, pt, pr, pe]
+
+
+pa - - - przypis autorski
+pt - - - przypis tłumacza
+pr - - - przypis redakcyjny
+pe - - - przypis edytorski
+
+
+[Separatory]
+
+[sekcja_swiatlo:]
+
+
+
+[sekcja_asterysk:]
+
+
+
+[separator_linia:]
+
+
+
+
+
+[Komentarze: uwaga, extra]
+uwaga
+extra
+
+[Nieużywane]
+
+wyp_osoba
+wywiad_pyt/akap
+wywiad_odp/akap
+mat
+www
+
+
+