# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
import os
+import re
+import shutil
class ParseError(Exception):
def __str__(self):
pass
class NoDublinCore(ValidationError):
+ """There's no DublinCore section, and it's required."""
+ pass
+
+class NoProvider(Exception):
+ """There's no DocProvider specified, and it's needed."""
pass
class XMLNamespace(object):
WLNS = EmptyNamespace()
class WLURI(object):
    """Represents a WL URI. Extracts slug and language from it.

    Attributes:
        uri: the URI exactly as passed to the constructor.
        slug: the document slug captured from the URI.
        language: the three-letter language segment of the URI, or None
            when the URI has no such segment.
    """

    slug = None
    language = None

    # The dots of the host name are escaped so that only the literal
    # "wolnelektury.pl" is accepted (an unescaped "." matches any character).
    # NOTE(review): the slug group accepts only lowercase letters and
    # hyphens and the pattern is not anchored at the end, so a slug
    # containing any other character is cut at that character -- confirm
    # that real slugs never contain digits.
    _re_wl_uri = re.compile(
        r'http://wolnelektury\.pl/katalog/lektura/'
        r'(?P<slug>[-a-z]+)(/(?P<lang>[a-z]{3})/?)?')

    def __init__(self, uri):
        """Parse `uri` into slug and language.

        Raises:
            ValueError: if `uri` does not match the WL document URI scheme.
        """
        match = self._re_wl_uri.match(uri)
        if match is None:
            # An explicit check instead of `assert`, which is removed when
            # Python runs with -O and would let the failure surface later
            # as an AttributeError on None.
            raise ValueError('Not a valid WL URI: %r' % (uri,))
        self.uri = uri
        self.slug = match.group('slug')
        self.language = match.group('lang')
+
+
class DocProvider(object):
- """ Base class for a repository of XML files.
- Used for generating joined files, like EPUBs
+ """Base class for a repository of XML files.
+
+ Used for generating joined files, like EPUBs.
"""
- def by_slug(self, slug):
- raise NotImplemented
+ def by_slug_and_lang(self, slug, lang=None):
+ """Should return a file-like object with a WL document XML."""
+ raise NotImplementedError
- def __getitem__(self, slug):
- return self.by_slug(slug)
+ def by_slug(self, slug):
+ """Should return a file-like object with a WL document XML."""
+ return self.by_slug_and_lang(slug)
def by_uri(self, uri):
- return self.by_slug(uri.rsplit('/', 1)[1])
+ """Should return a file-like object with a WL document XML."""
+ wluri = WLURI(uri)
+ return self.by_slug_and_lang(wluri.slug, wluri.language)
class DirDocProvider(DocProvider):
""" Serve docs from a directory of files in form <slug>.xml """
- def __init__(self, dir):
- self.dir = dir
+ def __init__(self, dir_):
+ self.dir = dir_
self.files = {}
+ return super(DirDocProvider, self).__init__()
- def by_slug(self, slug):
- return open(os.path.join(self.dir, '%s.xml' % slug))
+ def by_slug_and_lang(self, slug, lang=None):
+ fname = "%s%s.xml" % (slug, ".%s" % lang if lang else "")
+ return open(os.path.join(self.dir, fname))
import lxml.etree as etree
import dcparser
DEFAULT_BOOKINFO = dcparser.BookInfo(
- { RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'}, \
+ { RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'},
{ DCNS('creator'): [u'Some, Author'],
DCNS('title'): [u'Some Title'],
DCNS('subject.period'): [u'Unknown'],
method='xml', encoding=unicode, pretty_print=True)
return u'<utwor>\n' + dcstring + u'\n<plain-text>\n' + ocrtext + \
- u'\n</plain-text>\n</utwor>';
+ u'\n</plain-text>\n</utwor>'
def serialize_raw(element):
b = u'' + (element.text or '')
for child in element.iterchildren():
- e = etree.tostring(child, method='xml', encoding=unicode, pretty_print=True)
+ e = etree.tostring(child, method='xml', encoding=unicode,
+ pretty_print=True)
b += e
return b
def get_resource(path):
    """Return `path` joined onto the directory that contains this module."""
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, path)
+
class OutputFile(object):
    """Represents a file returned by one of the converters.

    The payload is held either in memory (see `from_string`) or as a path
    on disk (see `from_filename`).  Whatever path the object ends up
    pointing at is deleted when the object is finalized, so use `save_as`
    to keep a copy of the result.
    """

    _string = None
    _filename = None

    def __del__(self):
        if self._filename:
            try:
                os.unlink(self._filename)
            except OSError:
                # The file is already gone (or cannot be removed); there is
                # nothing useful to do about it from a finalizer.
                pass

    def __nonzero__(self):
        return self._string is not None or self._filename is not None

    # Python 3 spelling of the truth-value hook; same semantics.
    __bool__ = __nonzero__

    @classmethod
    def from_string(cls, string):
        """Converter returns contents of a file as a string."""

        instance = cls()
        instance._string = string
        return instance

    @classmethod
    def from_filename(cls, filename):
        """Converter returns contents of a file as a named file."""

        instance = cls()
        instance._filename = filename
        return instance

    def get_string(self):
        """Get file's contents as a string.

        Returns None when the object holds neither a string nor a file.
        """

        if self._filename is not None:
            # Binary mode: converter output must be read back byte for byte.
            with open(self._filename, 'rb') as f:
                return f.read()
        return self._string

    def get_file(self):
        """Get file as a file-like object.

        Returns None when the object holds neither a string nor a file.
        """

        if self._string is not None:
            try:
                from StringIO import StringIO
            except ImportError:
                # No StringIO module available: fall back to an in-memory
                # byte buffer.
                from io import BytesIO as StringIO
            return StringIO(self._string)
        if self._filename is not None:
            return open(self._filename, 'rb')
        return None

    def get_filename(self):
        """Get file as a fs path.

        An in-memory payload is written to a temporary file first; that
        file is then owned by this object.  Returns None when the object
        is empty.
        """

        if self._filename is not None:
            return self._filename
        if self._string is None:
            return None

        from tempfile import NamedTemporaryFile
        temp = NamedTemporaryFile(prefix='librarian-', delete=False)
        try:
            temp.write(self._string)
        except Exception:
            # Do not leave a half-written temporary file behind.
            temp.close()
            os.unlink(temp.name)
            raise
        temp.close()
        self._filename = temp.name
        return self._filename

    def save_as(self, path):
        """Save file to a path. Create directories, if necessary.

        Raises:
            ValueError: if the object holds no content to save.
        """

        source = self.get_filename()
        if source is None:
            # Fail before touching the file system.
            raise ValueError('Cannot save an empty OutputFile.')
        dirname = os.path.dirname(os.path.abspath(path))
        if not os.path.isdir(dirname):
            os.makedirs(dirname)
        shutil.copy(source, path)
from datetime import date
import time
-from librarian import ValidationError, NoDublinCore, ParseError, DCNS, RDFNS
+from librarian import (ValidationError, NoDublinCore, ParseError, DCNS, RDFNS,
+ WLURI)
import lxml.etree as etree # ElementTree API using libxml2
from lxml.etree import XMLSyntaxError
@property
def slug(self):
- return self.url.rsplit('/', 1)[1]
+ return WLURI(self.url).slug
@classmethod
def from_string(cls, xml):
from copy import deepcopy
from lxml import etree
import zipfile
-from tempfile import mkdtemp
+from tempfile import mkdtemp, NamedTemporaryFile
from shutil import rmtree
-import sys
-
-from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
-from librarian.dcparser import BookInfo
+from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
from librarian import functions, get_resource
return output_html, toc, chars
-def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
+def transform(wldoc, verbose=False,
style=None, html_toc=False,
sample=None, cover=None, flags=None):
""" produces a EPUB file
- provider: a DocProvider
- slug: slug of file to process, available by provider
- output_file: file-like object or path to output file
- output_dir: path to directory to save output file to; either this or output_file must be present
- make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
sample=n: generate sample e-book (with at least n paragraphs)
cover: a cover.Cover object
flags: less-advertising, without-fonts
"""
- def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
+ def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
""" processes one input file and proceeds to its children """
- replace_characters(input_xml.getroot())
-
- children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
+ replace_characters(wldoc.edoc.getroot())
# every input file will have a TOC entry,
# pointing to starting chunk
- toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), "part%d.html" % chunk_counter)
+ toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
chars = set()
if first:
# write book title page
- html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
+ html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
chars = used_chars(html_tree.getroot())
zip.writestr('OPS/title.html',
etree.tostring(html_tree, method="html", pretty_print=True))
# add a title page TOC entry
toc.add(u"Strona tytułowa", "title.html")
- elif children:
+ elif wldoc.book_info.parts:
# write title page for every parent
if sample is not None and sample <= 0:
chars = set()
html_string = open(get_resource('epub/emptyChunk.html')).read()
else:
- html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
+ html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
chars = used_chars(html_tree.getroot())
html_string = etree.tostring(html_tree, method="html", pretty_print=True)
zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
add_to_spine(spine, chunk_counter)
chunk_counter += 1
- if len(input_xml.getroot()) > 1:
+ if len(wldoc.edoc.getroot()) > 1:
# rdf before style master
- main_text = input_xml.getroot()[1]
+ main_text = wldoc.edoc.getroot()[1]
else:
# rdf in style master
- main_text = input_xml.getroot()[0]
+ main_text = wldoc.edoc.getroot()[0]
if main_text.tag == RDFNS('RDF'):
main_text = None
add_to_spine(spine, chunk_counter)
chunk_counter += 1
- if children:
- for child in children:
- child_xml = etree.parse(provider.by_uri(child))
- child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
- toc.append(child_toc)
- chars = chars.union(chunk_chars)
+ for child in wldoc.parts():
+ child_toc, chunk_counter, chunk_chars, sample = transform_file(
+ child, chunk_counter, first=False, sample=sample)
+ toc.append(child_toc)
+ chars = chars.union(chunk_chars)
return toc, chunk_counter, chars, sample
- # read metadata from the first file
- if file_path:
- if slug:
- raise ValueError('slug or file_path should be specified, not both')
- f = open(file_path, 'r')
- input_xml = etree.parse(f)
- f.close()
- else:
- if not slug:
- raise ValueError('either slug or file_path should be specified')
- input_xml = etree.parse(provider[slug])
+
+ document = deepcopy(wldoc)
+ del wldoc
if flags:
for flag in flags:
- input_xml.getroot().set(flag, 'yes')
-
- metadata = input_xml.find('.//'+RDFNS('Description'))
- if metadata is None:
- raise NoDublinCore('Document has no DublinCore - which is required.')
- book_info = BookInfo.from_element(input_xml)
- metadata = etree.ElementTree(metadata)
-
- # if output to dir, create the file
- if output_dir is not None:
- if make_dir:
- author = unicode(book_info.author)
- output_dir = os.path.join(output_dir, author)
- try:
- os.makedirs(output_dir)
- except OSError:
- pass
- if slug:
- output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
- else:
- output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
+ document.edoc.getroot().set(flag, 'yes')
+
+ opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
+ manifest = opf.find('.//' + OPFNS('manifest'))
+ guide = opf.find('.//' + OPFNS('guide'))
+ spine = opf.find('.//' + OPFNS('spine'))
+ output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
# write static elements
style = get_resource('epub/style.css')
zip.write(style, os.path.join('OPS', 'style.css'))
- opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
- manifest = opf.find('.//' + OPFNS('manifest'))
- guide = opf.find('.//' + OPFNS('guide'))
- spine = opf.find('.//' + OPFNS('spine'))
if cover:
cover_file = StringIO()
- c = cover(book_info.author.readable(), book_info.title)
+ c = cover(document.book_info.author.readable(), document.book_info.title)
c.save(cover_file)
c_name = 'cover.%s' % c.ext()
zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
'<itemref idref="html_toc" />'))
guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
- toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
+ toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
if len(toc.children) < 2:
toc.add(u"Początek utworu", "part1.html")
'<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
spine.append(etree.fromstring(
'<itemref idref="last" />'))
- html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
+ html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
chars.update(used_chars(html_tree.getroot()))
zip.writestr('OPS/last.html', etree.tostring(
html_tree, method="html", pretty_print=True))
os.chdir(cwd)
zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
- contents = []
- title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
+ title = document.book_info.title
attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
for st in attributes:
meta = toc_file.makeelement(NCXNS('meta'))
toc.write_to_xml(nav_map)
zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
zip.close()
+
+ return OutputFile.from_filename(output_file.name)
#
import os
import cStringIO
-import re
import copy
from lxml import etree
-from librarian.parser import WLDocument
-from librarian import XHTMLNS, ParseError
+from librarian import XHTMLNS, ParseError, OutputFile
from librarian import functions
from lxml.etree import XMLSyntaxError, XSLTApplyError
def html_has_content(text):
return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
-def transform(input, output_filename=None, is_file=True, \
- parse_dublincore=True, stylesheet='legacy', options={}, flags=None):
- """Transforms file input_filename in XML to output_filename in XHTML.
+def transform(wldoc, stylesheet='legacy', options=None, flags=None):
+ """Transforms the WL document to XHTML.
If output_filename is None, returns an XML,
otherwise returns True if file has been written,False if it hasn't.
style_filename = get_stylesheet(stylesheet)
style = etree.parse(style_filename)
- if is_file:
- document = WLDocument.from_file(input, True, \
- parse_dublincore=parse_dublincore)
- else:
- document = WLDocument.from_string(input, True, \
- parse_dublincore=parse_dublincore)
+ document = copy.deepcopy(wldoc)
+ del wldoc
+ document.swap_endlines()
if flags:
for flag in flags:
document.clean_ed_note()
+ if not options:
+ options = {}
result = document.transform(style, **options)
del document # no longer needed large object :)
add_anchors(result.getroot())
add_table_of_contents(result.getroot())
- if output_filename is not None:
- result.write(output_filename, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8')
- else:
- return result
- return True
+ return OutputFile.from_string(etree.tostring(result, method='html',
+ xml_declaration=False, pretty_print=True, encoding='utf-8'))
else:
- if output_filename is not None:
- return False
- else:
- return "<empty />"
+ return None
except KeyError:
raise ValueError("'%s' is not a valid stylesheet.")
except (XMLSyntaxError, XSLTApplyError), e:
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
import os
-import os.path
import subprocess
from tempfile import NamedTemporaryFile
-from lxml import etree
+from librarian import OutputFile
from librarian.cover import WLCover
-from librarian import epub, get_resource, NoDublinCore, RDFNS
-from librarian.dcparser import BookInfo
+from librarian import get_resource
-def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
+def transform(wldoc, verbose=False,
sample=None, cover=None, flags=None):
""" produces a MOBI file
- provider: a DocProvider
- slug: slug of file to process, available by provider
- output_file: path to output file
- output_dir: path to directory to save output file to; either this or output_file must be present
- make_dir: writes output to <output_dir>/<author>/<slug>.mobi instead of <output_dir>/<slug>.mobi
+ wldoc: a WLDocument
sample=n: generate sample e-book (with at least n paragraphs)
cover: a cover.Cover object
flags: less-advertising,
"""
- # read metadata from the first file
- if file_path:
- if slug:
- raise ValueError('slug or file_path should be specified, not both')
- f = open(file_path, 'r')
- input_xml = etree.parse(f)
- f.close()
- else:
- if not slug:
- raise ValueError('either slug or file_path should be specified')
- input_xml = etree.parse(provider[slug])
-
- metadata = input_xml.find('.//'+RDFNS('Description'))
- if metadata is None:
- raise NoDublinCore('Document has no DublinCore - which is required.')
- book_info = BookInfo.from_element(input_xml)
-
- # if output to dir, create the file
- if output_dir is not None:
- if make_dir:
- author = unicode(book_info.author)
- output_dir = os.path.join(output_dir, author)
- try:
- os.makedirs(output_dir)
- except OSError:
- pass
- if slug:
- output_file = os.path.join(output_dir, '%s.mobi' % slug)
- else:
- output_file = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.mobi')
+ book_info = wldoc.book_info
# provide a cover by default
if not cover:
c = cover(book_info.author.readable(), book_info.title)
c.save(cover_file)
- epub_file = NamedTemporaryFile(suffix='.epub', delete=False)
if not flags:
flags = []
flags = list(flags) + ['without-fonts']
- epub.transform(provider, file_path=file_path, output_file=epub_file, verbose=verbose,
- sample=sample, html_toc=True, flags=flags, style=get_resource('mobi/style.css'))
+ epub = wldoc.as_epub(verbose=verbose, sample=sample, html_toc=True,
+ flags=flags, style=get_resource('mobi/style.css'))
if verbose:
kwargs = {}
else:
devnull = open("/dev/null", 'w')
kwargs = {"stdout": devnull, "stderr": devnull}
- subprocess.check_call(['ebook-convert', epub_file.name, output_file,
+
+ output_file = NamedTemporaryFile(prefix='librarian', suffix='.mobi', delete=False)
+ output_file.close()
+ subprocess.check_call(['ebook-convert', epub.get_filename(), output_file.name,
'--no-inline-toc', '--cover=%s' % cover_file.name], **kwargs)
- os.unlink(epub_file.name)
os.unlink(cover_file.name)
+ return OutputFile.from_filename(output_file.name)
\ No newline at end of file
import os
from copy import deepcopy
from lxml import etree
-from librarian import epub, pdf, DirDocProvider, ParseError, cover
-from librarian.dcparser import BookInfo
+from librarian import pdf, epub, DirDocProvider, ParseError, cover
+from librarian.parser import WLDocument
class Packager(object):
except:
pass
outfile = os.path.join(output_dir, slug + '.' + cls.ext)
- cls.converter.transform(provider, file_path=main_input, output_file=outfile,
+
+ doc = WLDocument.from_file(main_input, provider=provider)
+ output_file = cls.converter.transform(doc,
cover=cls.cover, flags=cls.flags)
+ doc.save_output_file(output_file, output_path=outfile)
@classmethod
""" truncates text to at most `limit' bytes in utf-8 """
if text is None:
return text
- orig_text = text
if len(text.encode('utf-8')) > limit:
newlimit = limit - 3
while len(text.encode('utf-8')) > newlimit:
outfile_dir = os.path.join(output_dir, slug)
os.makedirs(os.path.join(output_dir, slug))
- info = BookInfo.from_file(main_input)
+ doc = WLDocument.from_file(main_input, provider=provider)
+ info = doc.book_info
product_elem = deepcopy(product)
product_elem[0].text = cls.utf_trunc(slug, 100)
).save(os.path.join(outfile_dir, slug+'.jpg'))
outfile = os.path.join(outfile_dir, '1.epub')
outfile_sample = os.path.join(outfile_dir, '1.sample.epub')
- epub.transform(provider, file_path=main_input, output_file=outfile)
- epub.transform(provider, file_path=main_input, output_file=outfile_sample, sample=25)
+ doc.save_output_file(epub.transform(doc),
+ output_path=outfile)
+ doc.save_output_file(epub.transform(doc, sample=25),
+ output_path=outfile_sample)
except ParseError, e:
print '%(file)s:%(name)s:%(message)s' % {
'file': main_input,
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
-from librarian import ValidationError, NoDublinCore, ParseError
+from librarian import ValidationError, NoDublinCore, ParseError, NoProvider
from librarian import RDFNS
from librarian import dcparser
from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError
+import os
import re
from StringIO import StringIO
class WLDocument(object):
- LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE);
+ LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
+ provider = None
- def __init__(self, edoc, parse_dublincore=True):
+ def __init__(self, edoc, parse_dublincore=True, provider=None):
self.edoc = edoc
+ self.provider = provider
root_elem = edoc.getroot()
return cls.from_file(StringIO(xml), *args, **kwargs)
@classmethod
- def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True):
+ def from_file(cls, xmlfile, parse_dublincore=True, provider=None):
# first, prepare for parsing
if isinstance(xmlfile, basestring):
parser = etree.XMLParser(remove_blank_text=False)
tree = etree.parse(StringIO(data.encode('utf-8')), parser)
- if swap_endlines:
- cls.swap_endlines(tree)
-
- return cls(tree, parse_dublincore=parse_dublincore)
+ return cls(tree, parse_dublincore=parse_dublincore, provider=provider)
except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
raise ParseError(e)
- @classmethod
- def swap_endlines(cls, tree):
+ def swap_endlines(self):
+ """Converts line breaks in stanzas into <br/> tags."""
# only swap inside stanzas
- for elem in tree.iter('strofa'):
+ for elem in self.edoc.iter('strofa'):
for child in list(elem):
if child.tail:
- chunks = cls.LINE_SWAP_EXPR.split(child.tail)
+ chunks = self.LINE_SWAP_EXPR.split(child.tail)
ins_index = elem.index(child) + 1
while len(chunks) > 1:
ins = etree.Element('br')
elem.insert(ins_index, ins)
child.tail = chunks.pop(0)
if elem.text:
- chunks = cls.LINE_SWAP_EXPR.split(elem.text)
+ chunks = self.LINE_SWAP_EXPR.split(elem.text)
while len(chunks) > 1:
ins = etree.Element('br')
ins.tail = chunks.pop()
elem.insert(0, ins)
elem.text = chunks.pop(0)
+ def parts(self):
+ if self.provider is None:
+ raise NoProvider('No document provider supplied.')
+ if self.book_info is None:
+ raise NoDublinCore('No Dublin Core in document.')
+ for part_uri in self.book_info.parts:
+ yield self.from_file(self.provider.by_uri(part_uri),
+ provider=self.provider)
+
def chunk(self, path):
# convert the path to XPath
expr = self.path_to_xpath(path)
node.clear()
node.tag = 'span'
node.tail = tail
+
+ # Converters
+
+ def as_html(self, *args, **kwargs):
+ from librarian import html
+ return html.transform(self, *args, **kwargs)
+
+ def as_text(self, *args, **kwargs):
+ from librarian import text
+ return text.transform(self, *args, **kwargs)
+
+ def as_epub(self, *args, **kwargs):
+ from librarian import epub
+ return epub.transform(self, *args, **kwargs)
+
+ def as_pdf(self, *args, **kwargs):
+ from librarian import pdf
+ return pdf.transform(self, *args, **kwargs)
+
+ def as_mobi(self, *args, **kwargs):
+ from librarian import mobi
+ return mobi.transform(self, *args, **kwargs)
+
+ def save_output_file(self, output_file, output_path=None,
+ output_dir_path=None, make_author_dir=False, ext=None):
+ if output_dir_path:
+ save_path = output_dir_path
+ if make_author_dir:
+ save_path = os.path.join(save_path,
+ unicode(self.book_info.author).encode('utf-8'))
+ save_path = os.path.join(save_path, self.book_info.slug)
+ if ext:
+ save_path += '.%s' % ext
+ else:
+ save_path = output_path
+
+ output_file.save_as(save_path)
import os.path
import shutil
from StringIO import StringIO
-from tempfile import mkdtemp
+from tempfile import mkdtemp, NamedTemporaryFile
import re
from copy import deepcopy
from subprocess import call, PIPE
-import sys
-
from Texml.processor import process
from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError
from librarian.dcparser import Person
from librarian.parser import WLDocument
-from librarian import ParseError, DCNS, get_resource
+from librarian import ParseError, DCNS, get_resource, OutputFile
from librarian import functions
return p == 0
-def transform(provider, slug=None, file_path=None,
- output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None, morefloats=None,
+def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
cover=None, flags=None, customizations=None):
""" produces a PDF file with XeLaTeX
- provider: a DocProvider
- slug: slug of file to process, available by provider
- file_path can be provided instead of a slug
- output_file: file-like object or path to output file
- output_dir: path to directory to save output file to; either this or output_file must be present
- make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
+ wldoc: a WLDocument
verbose: prints all output from LaTeX
save_tex: path to save the intermediary LaTeX file to
morefloats (old/new/none): force specific morefloats
# Parse XSLT
try:
- if file_path:
- if slug:
- raise ValueError('slug or file_path should be specified, not both')
- document = load_including_children(provider, file_path=file_path)
- else:
- if not slug:
- raise ValueError('either slug or file_path should be specified')
- document = load_including_children(provider, slug=slug)
+ document = load_including_children(wldoc)
if cover:
document.edoc.getroot().set('data-cover-width', str(cover.width))
substitute_hyphens(document.edoc)
fix_hanging(document.edoc)
- # find output dir
- if make_dir and output_dir is not None:
- author = unicode(document.book_info.author)
- output_dir = os.path.join(output_dir, author)
-
# wl -> TeXML
style_filename = get_stylesheet("wl2tex")
style = etree.parse(style_filename)
os.chdir(cwd)
- # save the PDF
+ output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
pdf_path = os.path.join(temp, 'doc.pdf')
- if output_dir is not None:
- try:
- os.makedirs(output_dir)
- except OSError:
- pass
- if slug:
- output_path = os.path.join(output_dir, '%s.pdf' % slug)
- else:
- output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
- shutil.move(pdf_path, output_path)
- else:
- if hasattr(output_file, 'write'):
- # file-like object
- with open(pdf_path) as f:
- output_file.write(f.read())
- output_file.close()
- else:
- # path to output file
- shutil.copy(pdf_path, output_file)
+ shutil.move(pdf_path, output_file.name)
shutil.rmtree(temp)
+ return OutputFile.from_filename(output_file.name)
except (XMLSyntaxError, XSLTApplyError), e:
raise ParseError(e)
-def load_including_children(provider, slug=None, uri=None, file_path=None):
- """ makes one big xml file with children inserted at end
- either slug or uri must be provided
+def load_including_children(wldoc=None, provider=None, uri=None):
+ """ Makes one big xml file with children inserted at end.
+
+ Either wldoc or provider and URI must be provided.
"""
- if uri:
+ if uri and provider:
f = provider.by_uri(uri)
- elif slug:
- f = provider[slug]
- elif file_path:
- f = open(file_path, 'r')
+ text = f.read().decode('utf-8')
+ f.close()
+ elif wldoc is not None:
+ text = etree.tostring(wldoc.edoc, encoding=unicode)
+ provider = wldoc.provider
else:
- raise ValueError('Neither slug, URI nor file path provided for a book.')
+ raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
- text = f.read().decode('utf-8')
text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
- document = WLDocument.from_string(text, True,
- parse_dublincore=True)
+ document = WLDocument.from_string(text, parse_dublincore=True)
+ document.swap_endlines()
- f.close()
for child_uri in document.book_info.parts:
- print child_uri
- child = load_including_children(provider, uri=child_uri)
+ child = load_including_children(provider=provider, uri=child_uri)
document.edoc.getroot().append(child.edoc.getroot())
return document
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
-from librarian import dcparser, parser, functions
+import copy
+from librarian import functions, OutputFile
from lxml import etree
import os
%(description)s%(contributors)s
"""
-def transform(input_file, output_file, parse_dublincore=True, flags=None, **options):
+def transform(wldoc, flags=None, **options):
"""
Transforms input_file in XML to output_file in TXT.
possible flags: raw-text,
style_filename = os.path.join(os.path.dirname(__file__), 'xslt/book2txt.xslt')
style = etree.parse(style_filename)
- document = parser.WLDocument.from_file(input_file, True, parse_dublincore=parse_dublincore)
+ document = copy.deepcopy(wldoc)
+ del wldoc
+ document.swap_endlines()
if flags:
for flag in flags:
result = document.transform(style, **options)
if not flags or 'raw-text' not in flags:
- if parse_dublincore:
- parsed_dc = dcparser.BookInfo.from_element(document.edoc)
+ if document.book_info:
+ parsed_dc = document.book_info
description = parsed_dc.description
- url = parsed_dc.url
+ url = document.book_info.url
license_description = parsed_dc.license_description
license = parsed_dc.license
license_description = ""
source = ""
contributors = ""
- output_file.write((TEMPLATE % {
+ return OutputFile.from_string((TEMPLATE % {
'description': description,
'url': url,
'license_description': license_description,
'contributors': contributors,
}).encode('utf-8'))
else:
- output_file.write(unicode(result).encode('utf-8'))
+ return OutputFile.from_string(unicode(result).encode('utf-8'))
import os.path
import optparse
-from librarian import epub, DirDocProvider, ParseError
+from librarian import DirDocProvider, ParseError
+from librarian.parser import WLDocument
if __name__ == '__main__':
for main_input in input_filenames:
if options.verbose:
print main_input
+
path, fname = os.path.realpath(main_input).rsplit('/', 1)
provider = DirDocProvider(path)
-
- output_dir = output_file = None
- if options.output_dir:
- output_dir = options.output_dir
- elif options.output_file:
- output_file = options.output_file
+ if not (options.output_file or options.output_dir):
+ output_file = os.path.splitext(main_input)[0] + '.epub'
else:
- output_dir = path
+ output_file = None
+
+ doc = WLDocument.from_file(main_input, provider=provider)
+ epub = doc.as_epub()
+
+ doc.save_output_file(epub,
+ output_file, options.output_dir, options.make_dir, 'epub')
- epub.transform(provider, file_path=main_input, output_dir=output_dir, output_file=output_file, make_dir=options.make_dir)
except ParseError, e:
print '%(file)s:%(name)s:%(message)s' % {
'file': main_input,
import os
import optparse
-from librarian import html, ParseError
+from librarian import ParseError
+from librarian.parser import WLDocument
if __name__ == '__main__':
output_filename = os.path.splitext(input_filename)[0] + '.html'
try:
- html.transform(input_filename, output_filename, parse_dublincore=options.parse_dublincore, flags=('full-page',))
+ doc = WLDocument.from_file(input_filename,
+ parse_dublincore=options.parse_dublincore)
+ html = doc.as_html(flags=('full-page',))
+ doc.save_output_file(html, output_path=output_filename)
except ParseError, e:
print '%(file)s:%(name)s:%(message)s' % {
'file': input_filename,
import os
import optparse
-from librarian import html, ParseError
+from librarian import ParseError
+from librarian.parser import WLDocument
if __name__ == '__main__':
output_filename = os.path.splitext(input_filename)[0] + '.html'
try:
- html.transform(input_filename, output_filename, parse_dublincore=options.parse_dublincore,\
- stylesheet='partial')
+ doc = WLDocument.from_file(input_filename,
+ parse_dublincore=options.parse_dublincore)
+ html = doc.as_html(flags=('full-page',), stylesheet='partial')
+ doc.save_output_file(html, output_path=output_filename)
except ParseError, e:
print '%(file)s:%(name)s:%(message)s' % {
'file': input_filename,
import os.path
import optparse
-from librarian import mobi, DirDocProvider, ParseError
+from librarian import DirDocProvider, ParseError
+from librarian.parser import WLDocument
if __name__ == '__main__':
# Do some real work
try:
for main_input in input_filenames:
- if options.verbose:
- print main_input
path, fname = os.path.realpath(main_input).rsplit('/', 1)
provider = DirDocProvider(path)
-
- output_dir = output_file = None
- if options.output_dir:
- output_dir = options.output_dir
- elif options.output_file:
- output_file = options.output_file
+ if not (options.output_file or options.output_dir):
+ output_file = os.path.splitext(main_input)[0] + '.mobi'
else:
- output_dir = path
+ output_file = None
+
+ doc = WLDocument.from_file(main_input, provider=provider)
+ mobi = doc.as_mobi()
- mobi.transform(provider, file_path=main_input, output_dir=output_dir, output_file=output_file, make_dir=options.make_dir)
+ doc.save_output_file(mobi,
+ output_file, options.output_dir, options.make_dir, 'mobi')
except ParseError, e:
print '%(file)s:%(name)s:%(message)s' % {
'file': main_input,
#
import os.path
from optparse import OptionParser
-from librarian import pdf, DirDocProvider, ParseError
+
+from librarian import DirDocProvider, ParseError
+from librarian.parser import WLDocument
+
if __name__ == '__main__':
usage = """Usage: %prog [options] SOURCE [SOURCE...]
parser.print_help()
exit(1)
-    try:
-        if options.output_dir and options.output_file:
-            raise ValueError("Either --output-dir or --output file should be specified")
+    # Reject conflicting destinations up front, before any conversion starts.
+    if options.output_dir and options.output_file:
+        raise ValueError("Only one of --output-dir and --output-file may be specified")
+    try:
         for main_input in args:
-            if options.verbose:
-                print main_input
             path, fname = os.path.realpath(main_input).rsplit('/', 1)
             provider = DirDocProvider(path)
-
-            output_file = output_dir = None
-            if options.output_dir:
-                output_dir = options.output_dir
-            elif options.output_file:
-                output_file = options.output_file
+            if not (options.output_file or options.output_dir):
+                # No destination requested: write next to the source file.
+                output_file = os.path.splitext(main_input)[0] + '.pdf'
             else:
-                output_dir = path
+                # Keep an explicit --output-file instead of discarding it.
+                # NOTE(review): presumably this is None when only
+                # --output-dir was given -- confirm the option defaults.
+                output_file = options.output_file
+
+            doc = WLDocument.from_file(main_input, provider=provider)
+            pdf = doc.as_pdf(save_tex=options.save_tex,
+                    morefloats=options.morefloats)
-            pdf.transform(provider,
-                file_path=main_input,
-                output_file=output_file,
-                output_dir=output_dir,
-                verbose=options.verbose,
-                make_dir=options.make_dir,
-                save_tex=options.save_tex,
-                morefloats=options.morefloats
-                )
+            doc.save_output_file(pdf,
+                output_file, options.output_dir, options.make_dir, 'pdf')
except ParseError, e:
print '%(file)s:%(name)s:%(message)s; use -v to see more output' % {
'file': main_input,
import os
import optparse
-from librarian import text
-from librarian import dcparser, ParseError
+from librarian import ParseError
+from librarian.parser import WLDocument
if __name__ == '__main__':
output_filename = os.path.splitext(input_filename)[0] + '.txt'
try:
-            output_file = open(output_filename, 'w')
-            text.transform(open(input_filename), output_file, parse_dublincore=options.parse_dublincore,
-                           wrapping=str(options.wrapping))
+            doc = WLDocument.from_file(input_filename,
+                    parse_dublincore=options.parse_dublincore)
+            # Plain-text output; named accordingly (it is not HTML).
+            text = doc.as_text(wrapping=str(options.wrapping))
+            doc.save_output_file(text, output_path=output_filename)
except ParseError, e:
print '%(file)s:%(name)s:%(message)s' % {
'file': input_filename,
setup(
name='librarian',
- version='1.3',
+ version='1.4',
description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats',
author="Marek Stępniowski",
author_email='marek@stepniowski.com',
+++ /dev/null
-<?xml version='1.0' encoding='utf-8'?>
-<utwor>
- <liryka_lp>
-
-<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/">
-<rdf:Description rdf:about="http://wiki.wolnepodreczniki.pl/Lektury:Asnyk/Między_nami_nic_nie_było">
-<dc:creator xml:lang="pl">Asnyk, Adam</dc:creator>
-<dc:title xml:lang="pl">Między nami nic nie było</dc:title>
-<dc:contributor.editor xml:lang="pl" />
-<dc:contributor.editor xml:lang="pl">Sekuła, Aleksandra</dc:contributor.editor>
-<dc:contributor.technical_editor xml:lang="pl">Sutkowska, Olga</dc:contributor.technical_editor>
-<dc:publisher xml:lang="pl">Fundacja Nowoczesna Polska</dc:publisher>
-<dc:subject.period xml:lang="pl">Pozytywizm</dc:subject.period>
-<dc:subject.type xml:lang="pl">Liryka</dc:subject.type>
-<dc:subject.genre xml:lang="pl">Wiersz</dc:subject.genre>
-<dc:description xml:lang="pl">Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN.</dc:description>
-<dc:identifier.url xml:lang="pl">http://wolnelektury.pl/katalog/lektura/miedzy-nami-nic-nie-bylo</dc:identifier.url>
-<dc:source.URL xml:lang="pl">http://www.polona.pl/Content/5164</dc:source.URL>
-<dc:source xml:lang="pl">(Asnyk, Adam) El...y (1838-1897), Poezye, t. 3, Gebethner i Wolff, wyd. nowe poprzedzone słowem wstępnym St. Krzemińskiego, Warszawa, 1898</dc:source>
-<dc:rights xml:lang="pl">Domena publiczna - Adam Asnyk zm. 1897</dc:rights>
-<dc:date.pd xml:lang="pl">1897</dc:date.pd>
-<dc:format xml:lang="pl">xml</dc:format>
-<dc:type xml:lang="pl">text</dc:type>
-<dc:type xml:lang="en">text</dc:type>
-<dc:date xml:lang="pl">2007-09-06</dc:date>
-<dc:audience xml:lang="pl">L</dc:audience>
-<dc:language xml:lang="pl">pol</dc:language>
-</rdf:Description>
-</rdf:RDF>
-
-
-<autor_utworu>Adam Asnyk</autor_utworu>
-
-<nazwa_utworu><begin id="b1189062500041"/><motyw id="m1189062500041">Miłość platoniczna</motyw>Między nami nic nie było</nazwa_utworu>
-
-
-
-<strofa>Między nami nic nie było!/
-Żadnych zwierzeń, wyznań żadnych!/
-Nic nas z sobą nie łączyło ---/
-Prócz wiosennych marzeń zdradnych;</strofa>
-
-
-
-<strofa><begin id="b1189062528872"/><motyw id="m1189062528872">Natura</motyw>Prócz tych woni, barw i blasków,/
-Unoszących się w przestrzeni;/
-Prócz szumiących śpiewem lasków/
-I tej świeżej łąk zieleni;</strofa>
-
-
-
-<strofa>Prócz tych kaskad i potoków,/
-Zraszających każdy parów,/
-Prócz girlandy tęcz, obłoków,/
-Prócz natury słodkich czarów;</strofa>
-
-
-
-<strofa>Prócz tych wspólnych, jasnych zdrojów,/
-Z których serce zachwyt piło;/
-Prócz pierwiosnków i powojów,---/
-Między nami nic nie było!<end id="e1189062528872"/><end id="e1189062500041"/></strofa>
-
-</liryka_lp>
-</utwor>
--- /dev/null
+<?xml version='1.0' encoding='utf-8'?>
+<utwor>
+
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/">
+<rdf:Description rdf:about="http://redakcja.wolnelektury.pl/documents/book/asnyk-poezye/">
+<dc:creator xml:lang="pl">Asnyk, Adam</dc:creator>
+<dc:title xml:lang="pl">Poezye</dc:title>
+<dc:publisher xml:lang="pl">Fundacja Nowoczesna Polska</dc:publisher>
+<dc:subject.period xml:lang="pl">Pozytywizm</dc:subject.period>
+<dc:subject.type xml:lang="pl">Liryka</dc:subject.type>
+<dc:subject.genre xml:lang="pl">Wiersz</dc:subject.genre>
+<dc:description xml:lang="pl">Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN.</dc:description>
+<dc:identifier.url xml:lang="pl">http://wolnelektury.pl/katalog/lektura/poezye</dc:identifier.url>
+<dc:relation.hasPart xml:lang="pl">http://wolnelektury.pl/katalog/lektura/miedzy-nami-nic-nie-bylo</dc:relation.hasPart>
+<dc:source.URL xml:lang="pl">http://www.polona.pl/Content/5164</dc:source.URL>
+<dc:source xml:lang="pl">(Asnyk, Adam) El...y (1838-1897), Poezye, t. 3, Gebethner i Wolff, wyd. nowe poprzedzone słowem wstępnym St. Krzemińskiego, Warszawa, 1898</dc:source>
+<dc:rights xml:lang="pl">Domena publiczna - Adam Asnyk zm. 1897</dc:rights>
+<dc:date.pd xml:lang="pl">1897</dc:date.pd>
+<dc:format xml:lang="pl">xml</dc:format>
+<dc:type xml:lang="pl">text</dc:type>
+<dc:type xml:lang="en">text</dc:type>
+<dc:date xml:lang="pl">2007-09-06</dc:date>
+<dc:audience xml:lang="pl">L</dc:audience>
+<dc:language xml:lang="pl">pol</dc:language>
+</rdf:Description>
+</rdf:RDF>
+
+
+</utwor>
--- /dev/null
+<?xml version='1.0' encoding='utf-8'?>
+<utwor>
+ <liryka_lp>
+
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/">
+<rdf:Description rdf:about="http://redakcja.wolnelektury.pl/documents/book/miedzy-nami-nic-nie-bylo/">
+<dc:creator xml:lang="pl">Asnyk, Adam</dc:creator>
+<dc:title xml:lang="pl">Między nami nic nie było</dc:title>
+<dc:contributor.editor xml:lang="pl" />
+<dc:contributor.editor xml:lang="pl">Sekuła, Aleksandra</dc:contributor.editor>
+<dc:contributor.technical_editor xml:lang="pl">Sutkowska, Olga</dc:contributor.technical_editor>
+<dc:publisher xml:lang="pl">Fundacja Nowoczesna Polska</dc:publisher>
+<dc:subject.period xml:lang="pl">Pozytywizm</dc:subject.period>
+<dc:subject.type xml:lang="pl">Liryka</dc:subject.type>
+<dc:subject.genre xml:lang="pl">Wiersz</dc:subject.genre>
+<dc:description xml:lang="pl">Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN.</dc:description>
+<dc:identifier.url xml:lang="pl">http://wolnelektury.pl/katalog/lektura/miedzy-nami-nic-nie-bylo</dc:identifier.url>
+<dc:source.URL xml:lang="pl">http://www.polona.pl/Content/5164</dc:source.URL>
+<dc:source xml:lang="pl">(Asnyk, Adam) El...y (1838-1897), Poezye, t. 3, Gebethner i Wolff, wyd. nowe poprzedzone słowem wstępnym St. Krzemińskiego, Warszawa, 1898</dc:source>
+<dc:rights xml:lang="pl">Domena publiczna - Adam Asnyk zm. 1897</dc:rights>
+<dc:date.pd xml:lang="pl">1897</dc:date.pd>
+<dc:format xml:lang="pl">xml</dc:format>
+<dc:type xml:lang="pl">text</dc:type>
+<dc:type xml:lang="en">text</dc:type>
+<dc:date xml:lang="pl">2007-09-06</dc:date>
+<dc:audience xml:lang="pl">L</dc:audience>
+<dc:language xml:lang="pl">pol</dc:language>
+</rdf:Description>
+</rdf:RDF>
+
+
+<autor_utworu>Adam Asnyk</autor_utworu>
+
+<nazwa_utworu><begin id="b1189062500041"/><motyw id="m1189062500041">Miłość platoniczna</motyw>Między nami nic nie było</nazwa_utworu>
+
+
+
+<strofa>Między nami nic nie było!/
+Żadnych zwierzeń, wyznań żadnych!/
+Nic nas z sobą nie łączyło ---/
+Prócz wiosennych marzeń zdradnych;</strofa>
+
+
+
+<strofa><begin id="b1189062528872"/><motyw id="m1189062528872">Natura</motyw>Prócz tych woni, barw i blasków,/
+Unoszących się w przestrzeni;/
+Prócz szumiących śpiewem lasków/
+I tej świeżej łąk zieleni;</strofa>
+
+
+
+<strofa>Prócz tych kaskad i potoków,/
+Zraszających każdy parów,/
+Prócz girlandy tęcz, obłoków,/
+Prócz natury słodkich czarów;</strofa>
+
+
+
+<strofa>Prócz tych wspólnych, jasnych zdrojów,/
+Z których serce zachwyt piło;/
+Prócz pierwiosnków i powojów,---/
+Między nami nic nie było!<end id="e1189062528872"/><end id="e1189062500041"/></strofa>
+
+</liryka_lp>
+</utwor>
--- /dev/null
+# -*- coding: utf-8 -*-
+#
+# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
+from librarian import DirDocProvider
+from librarian.parser import WLDocument
+from nose.tools import *
+from utils import get_fixture
+
+
+def test_transform():
+    """Convert the 'asnyk_zbior.xml' fixture to EPUB.
+
+    A DirDocProvider over the fixture directory is supplied to the document.
+    The result is not inspected; the test passes if no exception is raised.
+    """
+    WLDocument.from_file(
+        get_fixture('text', 'asnyk_zbior.xml'),
+        provider=DirDocProvider(get_fixture('text', ''))
+    ).as_epub(flags=['without_fonts'])
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
-from librarian import html, NoDublinCore
+from librarian import NoDublinCore
+from librarian.parser import WLDocument
from nose.tools import *
-from utils import get_fixture, remove_output_file
+from utils import get_fixture
-def teardown_transform():
- remove_output_file('text', 'asnyk_miedzy_nami.html')
-
-@with_setup(None, teardown_transform)
 def test_transform():
+    """The HTML string for the fixture equals the expected file's content."""
-    output_file_path = get_fixture('text', 'asnyk_miedzy_nami.html')
     expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.html')
-    html.transform(
-        get_fixture('text', 'asnyk_miedzy_nami.xml'),
-        output_file_path,
-    )
+    # The result is compared in memory; no output file is written.
+    html = WLDocument.from_file(
+        get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml')
+    ).as_html().get_string()
-    assert_equal(file(output_file_path).read(), file(expected_output_file_path).read())
+    assert_equal(html, file(expected_output_file_path).read())
-@with_setup(None, teardown_transform)
 @raises(NoDublinCore)
 def test_no_dublincore():
+    """Loading and converting the '_nodc' fixture to HTML raises NoDublinCore."""
-    html.transform(
-        get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'),
-        get_fixture('text', 'asnyk_miedzy_nami.html'),
-    )
+    WLDocument.from_file(
+        get_fixture('text', 'asnyk_miedzy_nami_nodc.xml')
+    ).as_html()
-@with_setup(None, teardown_transform)
 def test_passing_parse_dublincore_to_transform():
-    """Passing parse_dublincore=False to transform omits DublinCore parsing."""
+    """Passing parse_dublincore=False to from_file omits DublinCore parsing."""
-    html.transform(
-        get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'),
-        get_fixture('text', 'asnyk_miedzy_nami.html'),
-        parse_dublincore=False,
-    )
+    WLDocument.from_file(
+        get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'),
+        parse_dublincore=False,
+    ).as_html()
def test_empty():
+    """An empty <utwor /> document yields a falsy result from as_html()."""
-    assert html.transform('<utwor />', is_file=False, parse_dublincore=False).find('empty')
+    assert not WLDocument.from_string(
+        '<utwor />',
+        parse_dublincore=False,
+    ).as_html()
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
-from librarian import text, NoDublinCore
+from librarian import NoDublinCore
+from librarian.parser import WLDocument
from nose.tools import *
-from utils import get_fixture, remove_output_file
+from utils import get_fixture
-def teardown_transform():
- remove_output_file('text', 'asnyk_miedzy_nami.txt')
-
-
-@with_setup(None, teardown_transform)
 def test_transform():
+    """The text string for the fixture equals the expected file's content."""
-    output_file_path = get_fixture('text', 'asnyk_miedzy_nami.txt')
     expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.txt')
-    text.transform(
-        open(get_fixture('text', 'asnyk_miedzy_nami.xml')),
-        open(output_file_path, 'w'),
-    )
+    # The result is compared in memory; no output file is written.
+    text = WLDocument.from_file(
+        get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml')
+    ).as_text().get_string()
-    assert_equal(file(output_file_path).read(), file(expected_output_file_path).read())
+    assert_equal(text, file(expected_output_file_path).read())
-@with_setup(None, teardown_transform)
 @raises(NoDublinCore)
 def test_no_dublincore():
+    """Loading and converting the '_nodc' fixture to text raises NoDublinCore."""
-    text.transform(
-        open(get_fixture('text', 'asnyk_miedzy_nami_nodc.xml')),
-        open(get_fixture('text', 'asnyk_miedzy_nami.txt'), 'w'),
-    )
+    WLDocument.from_file(
+        get_fixture('text', 'asnyk_miedzy_nami_nodc.xml')
+    ).as_text()
-@with_setup(None, teardown_transform)
 def test_passing_parse_dublincore_to_transform():
-    """Passing parse_dublincore=False to transform omits DublinCore parsing."""
-    text.transform(
-        open(get_fixture('text', 'asnyk_miedzy_nami_nodc.xml')),
-        open(get_fixture('text', 'asnyk_miedzy_nami.txt'), 'w'),
-        parse_dublincore=False,
-    )
+    """Passing parse_dublincore=False to from_file omits DublinCore parsing."""
+    WLDocument.from_file(
+        get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'),
+        parse_dublincore=False,
+    ).as_text()
def get_all_fixtures(dir_name, glob_pattern='*'):
"""Returns list of paths for fixtures in directory dir_name matching the glob_pattern."""
return [get_fixture(dir_name, file_name) for file_name in glob.glob(join(get_fixture_dir(dir_name), glob_pattern))]
-
-
-def remove_output_file(dir_name, file_name):
- try:
- os.remove(get_fixture(dir_name, file_name))
- except:
- pass