#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-import re
import os
import optparse
-import codecs
-from librarian import dcparser
-HEADER = u"""\
-Kodowanie znaków w dokumencie: UTF-8.
-Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl/). Reprodukcja cyfrowa wykonana przez
-Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. Ten utwór nie jest chroniony prawem autorskim i znajduje
-się w domenie publicznej, co oznacza, że możesz go swobodnie wykorzystywać, publikować i rozpowszechniać.
-Wersja lektury w opracowaniu merytorycznym i krytycznym (przypisy i motywy) dostępna jest na stronie %s.
-def get_header(filename):
- return HEADER % dcparser.parse(filename).url
- (r'<rdf:RDF[^>]*>(.|\n)*?</rdf:RDF>', ''),
- (r'<motyw[^>]*>(.|\n)*?</motyw>', ''),
- ('<(begin|end)\\sid=[\'|"][b|e]\\d+[\'|"]\\s/>', ''),
- (r'<extra>((<!--<(elementy_poczatkowe|tekst_glowny)>-->)|(<!--</(elementy_poczatkowe|tekst_glowny)>-->))</extra>', ''),
- (r'<uwaga>(.|\n)*?</uwaga>', ''),
- (r'<p[a|e|r|t]>(.|\n)*?</p[a|e|r|t]>', ''),
- (r'<[^>]+>', ''),
- (r'/\n', '\n'),
- (r'---', u'—'),
- (r'--', u'-'),
- (r',,', u'„'),
- (r'"', u'”'),
+from librarian import text
if __name__ == '__main__':
print input_filename
output_filename = os.path.splitext(input_filename)[0] + '.txt'
- xml = codecs.open(input_filename, 'r', encoding='utf-8').read()
- for pattern, repl in REGEXES:
- xml, n = re.subn(pattern, repl, xml)
- output = codecs.open(output_filename, 'w', encoding='utf-8')
- output.write(get_header(input_filename))
- output.write(xml)
+ text.transform(input_filename, output_filename)
--- /dev/null
+<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:wl="http://wolnelektury.pl/functions" >
+<xsl:output encoding="utf-8" method="text" />
+<!-- ============================================================================== -->
+<!-- = MASTER TAG = -->
+<!-- = (can contain block tags, paragraph tags, standalone tags and special tags) = -->
+<!-- ============================================================================== -->
+<xsl:template match="powiesc|opowiadanie|liryka_l|liryka_lp|dramat_wierszowany_l|dramat_wierszowany_lp|dramat_wspolczesny">
+<xsl:text>Kodowanie znaków w dokumencie: UTF-8.
+Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl/). Reprodukcja cyfrowa wykonana przez
+Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. Ten utwór nie jest chroniony prawem autorskim i znajduje
+się w domenie publicznej, co oznacza, że możesz go swobodnie wykorzystywać, publikować i rozpowszechniać.
+Wersja lektury w opracowaniu merytorycznym i krytycznym (przypisy i motywy) dostępna jest na stronie %s.
+<xsl:if test="nazwa_utworu"><xsl:apply-templates select="autor_utworu|dzielo_nadrzedne|nazwa_utworu|podtytul" mode="header" /></xsl:if>
+<xsl:apply-templates />
+<!-- ==================================================================================== -->
+<!-- = BLOCK TAGS = -->
+<!-- = (can contain other block tags, paragraph tags, standalone tags and special tags) = -->
+<!-- ==================================================================================== -->
+<xsl:template match="nota">
+<xsl:apply-templates />
+<xsl:template match="lista_osob">
+<xsl:value-of select="naglowek_listy" />
+<xsl:apply-templates select="lista_osoba" />
+<xsl:template match="dedykacja">
+<xsl:apply-templates />
+<xsl:template match="kwestia">
+<xsl:apply-templates select="strofa|akap|didaskalia" />
+<xsl:template match="dlugi_cytat|poezja_cyt">
+<xsl:apply-templates />
+<xsl:template match="motto">
+<xsl:apply-templates mode="inline" /><xsl:text>
+<!-- ========================================== -->
+<!-- = PARAGRAPH TAGS = -->
+<!-- = (can contain inline and special tags) = -->
+<!-- ========================================== -->
+<!-- Title page -->
+<xsl:template match="autor_utworu" mode="header">
+<xsl:apply-templates mode="inline" />
+<xsl:template match="nazwa_utworu" mode="header">
+<xsl:apply-templates mode="inline" />
+<xsl:template match="dzielo_nadrzedne" mode="header">
+<xsl:apply-templates mode="inline" />
+<xsl:template match="podtytul" mode="header">
+<xsl:apply-templates mode="inline" />
+<!-- Section headers (included in index)-->
+<xsl:template match="naglowek_akt|naglowek_czesc|srodtytul">
+<xsl:apply-templates mode="inline" />
+<xsl:template match="naglowek_scena|naglowek_rozdzial">
+<xsl:apply-templates mode="inline" />
+<xsl:template match="naglowek_osoba|naglowek_podrozdzial">
+<xsl:apply-templates mode="inline" />
+<!-- Other paragraph tags -->
+<xsl:template match="miejsce_czas">
+<xsl:apply-templates mode="inline" />
+<xsl:template match="didaskalia">
+/ </xsl:text><xsl:apply-templates mode="inline" /><xsl:text> /</xsl:text>
+<xsl:template match="lista_osoba">
+ * </xsl:text>
+<xsl:apply-templates mode="inline" />
+<xsl:template match="akap|akap_dialog|akap_cd">
+<xsl:apply-templates mode="inline" />
+<!-- Stanza: cuts the content into verses at BR children and renders each verse through the named "verse" template. -->
+<xsl:template match="strofa">
+ <xsl:choose>
+ <xsl:when test="count(br) > 0">
+ <!-- First verse: every sibling node located before the first BR; its type is the nearest preceding wers_wciety / wers_akap / wers_cd element, if any -->
+ <xsl:call-template name="verse">
+ <xsl:with-param name="verse-content" select="br[1]/preceding-sibling::text() | br[1]/preceding-sibling::node()" />
+ <xsl:with-param name="verse-type" select="br[1]/preceding-sibling::*[name() = 'wers_wciety' or name() = 'wers_akap' or name() = 'wers_cd'][1]" />
+ </xsl:call-template>
+ <!-- Remaining verses: one per BR, made of the following siblings that have exactly $lnum+1 BR siblings before them -->
+ <xsl:for-each select="br">
+ <!-- Each BR tag "consumes" text after it -->
+ <xsl:variable name="lnum" select="count(preceding-sibling::br)" />
+ <xsl:call-template name="verse">
+ <xsl:with-param name="verse-content"
+ select="following-sibling::text()[count(preceding-sibling::br) = $lnum+1] | following-sibling::node()[count(preceding-sibling::br) = $lnum+1]" />
+ <xsl:with-param name="verse-type" select="following-sibling::*[count(preceding-sibling::br) = $lnum+1 and (name() = 'wers_wciety' or name() = 'wers_akap' or name() = 'wers_cd')][1]" />
+ </xsl:call-template>
+ </xsl:for-each>
+ </xsl:when>
+ <xsl:otherwise>
+ <!-- No BR children: the whole stanza content is passed as a single verse -->
+ <xsl:call-template name="verse">
+ <xsl:with-param name="verse-content" select="text() | node()" />
+ <xsl:with-param name="verse-type" select="wers_wciety|wers_akap|wers_cd[1]" />
+ </xsl:call-template>
+ </xsl:otherwise>
+ </xsl:choose>
+<!-- Named template "verse": outputs a whitespace prefix selected by the element name of $verse-type, then applies the inline-mode templates to $verse-content. -->
+<!-- Parameters: verse-content = nodes forming the verse; verse-type = optional wers_akap / wers_wciety / wers_cd element (no prefix is emitted when its name matches none of them). -->
+<xsl:template name="verse">
+ <xsl:param name="verse-content" />
+ <xsl:param name="verse-type" />
+ <xsl:choose>
+ <xsl:when test="name($verse-type) = 'wers_akap'">
+ <xsl:text> </xsl:text>
+ </xsl:when>
+ <xsl:when test="name($verse-type) = 'wers_wciety'">
+ <!-- The prefix for an indented verse depends on whether any node of the verse content carries a "typ" attribute -->
+ <xsl:choose>
+ <xsl:when test="$verse-content/@typ">
+ <xsl:text> </xsl:text>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:text> </xsl:text>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:when>
+ <xsl:when test="name($verse-type) = 'wers_cd'">
+ <xsl:text> </xsl:text>
+ </xsl:when>
+ </xsl:choose>
+ <xsl:apply-templates select="$verse-content" mode="inline" />
+<xsl:template match="motto_podpis">
+<xsl:apply-templates mode="inline" />
+<!-- ================================================ -->
+<!-- = INLINE TAGS = -->
+<!-- = (contain other inline tags and special tags) = -->
+<!-- ================================================ -->
+<!-- Annotations -->
+<xsl:template match="pa|pe|pr|pt" mode="inline" />
+<!-- Other inline tags -->
+<xsl:template match="mat" mode="inline"><xsl:apply-templates mode="inline" /></xsl:template>
+<xsl:template match="didask_tekst" mode="inline"><xsl:apply-templates mode="inline" /></xsl:template>
+<xsl:template match="slowo_obce" mode="inline"><xsl:apply-templates mode="inline" /></xsl:template>
+<xsl:template match="tytul_dziela" mode="inline">
+<xsl:if test="@typ = '1'">„</xsl:if><xsl:apply-templates mode="inline" /><xsl:if test="@typ = '1'">”</xsl:if>
+<xsl:template match="wyroznienie" mode="inline">
+<xsl:text>*</xsl:text><xsl:apply-templates mode="inline" /><xsl:text>*</xsl:text>
+<xsl:template match="osoba" mode="inline">
+<xsl:apply-templates mode="inline" />
+<!-- ============================================== -->
+<!-- = STANDALONE TAGS = -->
+<!-- = (cannot contain any other tags) = -->
+<!-- ============================================== -->
+<xsl:template match="sekcja_swiatlo">
+<xsl:template match="sekcja_asterysk">
+<xsl:template match="separator_linia">
+<!-- ================ -->
+<!-- = SPECIAL TAGS = -->
+<!-- ================ -->
+<!-- Themes -->
+<xsl:template match="begin" mode="inline" />
+<xsl:template match="end" mode="inline" />
+<xsl:template match="begin|end" />
+<xsl:template match="motyw" mode="inline" />
+<!-- ================ -->
+<!-- = IGNORED TAGS = -->
+<!-- ================ -->
+<xsl:template match="extra|uwaga" />
+<xsl:template match="extra|uwaga" mode="inline" />
+<!-- ======== -->
+<!-- = TEXT = -->
+<!-- ======== -->
+<xsl:template match="text()" />
+<xsl:template match="text()" mode="inline">
+ <xsl:value-of select="wl:substitute_entities(.)" />
--- /dev/null
+# -*- coding: utf-8 -*-
+import os
+import cStringIO
+import re
+import codecs
+from lxml import etree
+from librarian import dcparser
+ (u'---', u'—'),
+ (u'--', u'–'),
+ (u'...', u'…'),
+ (u',,', u'„'),
+ (u'"', u'”'),
+def substitute_entities(context, text):
+    """XPath extension function replacing all known entities in the passed text.
+
+    When `text` arrives as a list, its items are first concatenated into one
+    string.  Every (entity, replacement) pair from ENTITY_SUBSTITUTIONS is then
+    applied in the order in which the pairs are listed, and the resulting
+    string is returned.  The `context` argument is not used.
+    """
+    converted = ''.join(text) if isinstance(text, list) else text
+    for source, replacement in ENTITY_SUBSTITUTIONS:
+        converted = converted.replace(source, replacement)
+    return converted
+# Register substitute_entities function with lxml under the
+# 'http://wolnelektury.pl/functions' namespace URI, so that XPath expressions
+# can call it as <prefix>:substitute_entities(...) once a prefix is bound to
+# that URI.
+ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
+ns['substitute_entities'] = substitute_entities
+def transform(input_filename, output_filename):
+    """Transforms file input_filename in XML to output_filename in TXT.
+
+    The input is read as UTF-8 and every '/' followed by a whitespace
+    character is rewritten to '<br/>' plus a newline before parsing.  The
+    document is then run through the 'book2txt.xslt' stylesheet located next
+    to this module.  The first '%s' placeholder of the result (presumably the
+    one in the stylesheet header) is replaced with the URL returned by
+    dcparser.parse(input_filename).url, and the text is written to
+    output_filename encoded as UTF-8.
+    """
+    # Parse XSLT
+    style_filename = os.path.join(os.path.dirname(__file__), 'book2txt.xslt')
+    style = etree.parse(style_filename)
+    # Build an in-memory copy of the input with explicit <br/> line breaks.
+    doc_file = cStringIO.StringIO()
+    expr = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
+    f = open(input_filename, 'r')
+    try:
+        for line in f:
+            line = line.decode('utf-8')
+            line = expr.sub(u'<br/>\n', line)
+            doc_file.write(line.encode('utf-8'))
+    finally:
+        # Release the input handle even when decoding fails.
+        f.close()
+    doc_file.seek(0)
+    parser = etree.XMLParser(remove_blank_text=True)
+    doc = etree.parse(doc_file, parser)
+    result = doc.xslt(style)
+    # Format only the URL itself; the book text may legitimately contain '%'
+    # characters, so the whole result must not be used as a format string.
+    url = u'%s' % (dcparser.parse(input_filename).url,)
+    output_text = unicode(result).replace(u'%s', url, 1)
+    output_file = codecs.open(output_filename, 'wb', encoding='utf-8')
+    try:
+        output_file.write(output_text)
+    finally:
+        # Close explicitly so the data is flushed to disk deterministically.
+        output_file.close()