From: Radek Czajka Date: Mon, 5 Dec 2011 16:06:51 +0000 (+0100) Subject: converters interface changed: WLDocument in, OutputFile out X-Git-Tag: 1.7~188 X-Git-Url: https://git.mdrn.pl/librarian.git/commitdiff_plain/b6ec0976a383cc1823f4a199bc3e6dc40880b049?hp=--cc converters interface changed: WLDocument in, OutputFile out added WLDocument.to_* converter functions --- b6ec0976a383cc1823f4a199bc3e6dc40880b049 diff --git a/librarian/__init__.py b/librarian/__init__.py index 8f5cf1a..fdd6b55 100644 --- a/librarian/__init__.py +++ b/librarian/__init__.py @@ -4,6 +4,8 @@ # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # import os +import re +import shutil class ParseError(Exception): def __str__(self): @@ -18,6 +20,11 @@ class ValidationError(Exception): pass class NoDublinCore(ValidationError): + """There's no DublinCore section, and it's required.""" + pass + +class NoProvider(Exception): + """There's no DocProvider specified, and it's needed.""" pass class XMLNamespace(object): @@ -56,37 +63,61 @@ OPFNS = XMLNamespace("http://www.idpf.org/2007/opf") WLNS = EmptyNamespace() +class WLURI(object): + """Represents a WL URI. Extracts slug and language from it.""" + + slug = None + language = None + + _re_wl_uri = re.compile('http://wolnelektury.pl/katalog/lektura/' + '(?P<slug>[-a-z]+)(/(?P<lang>[a-z]{3})/?)?') + + def __init__(self, uri): + self.uri = uri + match = self._re_wl_uri.match(uri) + assert match + self.slug = match.group('slug') + self.language = match.group('lang') + + class DocProvider(object): - """ Base class for a repository of XML files. - Used for generating joined files, like EPUBs + """Base class for a repository of XML files. + + Used for generating joined files, like EPUBs. 
""" - def by_slug(self, slug): - raise NotImplemented + def by_slug_and_lang(self, slug, lang=None): + """Should return a file-like object with a WL document XML.""" + raise NotImplementedError - def __getitem__(self, slug): - return self.by_slug(slug) + def by_slug(self, slug): + """Should return a file-like object with a WL document XML.""" + return self.by_slug_and_lang(slug) def by_uri(self, uri): - return self.by_slug(uri.rsplit('/', 1)[1]) + """Should return a file-like object with a WL document XML.""" + wluri = WLURI(uri) + return self.by_slug_and_lang(wluri.slug, wluri.language) class DirDocProvider(DocProvider): """ Serve docs from a directory of files in form .xml """ - def __init__(self, dir): - self.dir = dir + def __init__(self, dir_): + self.dir = dir_ self.files = {} + return super(DirDocProvider, self).__init__() - def by_slug(self, slug): - return open(os.path.join(self.dir, '%s.xml' % slug)) + def by_slug_and_lang(self, slug, lang=None): + fname = "%s%s.xml" % (slug, ".%s" % lang if lang else "") + return open(os.path.join(self.dir, fname)) import lxml.etree as etree import dcparser DEFAULT_BOOKINFO = dcparser.BookInfo( - { RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'}, \ + { RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'}, { DCNS('creator'): [u'Some, Author'], DCNS('title'): [u'Some Title'], DCNS('subject.period'): [u'Unknown'], @@ -119,14 +150,15 @@ def wrap_text(ocrtext, creation_date, bookinfo=DEFAULT_BOOKINFO): method='xml', encoding=unicode, pretty_print=True) return u'\n' + dcstring + u'\n\n' + ocrtext + \ - u'\n\n'; + u'\n\n' def serialize_raw(element): b = u'' + (element.text or '') for child in element.iterchildren(): - e = etree.tostring(child, method='xml', encoding=unicode, pretty_print=True) + e = etree.tostring(child, method='xml', encoding=unicode, + pretty_print=True) b += e return b @@ -141,3 +173,73 @@ def serialize_children(element, format='raw'): def get_resource(path): return 
os.path.join(os.path.dirname(__file__), path) + +class OutputFile(object): + """Represents a file returned by one of the converters.""" + + _string = None + _filename = None + + def __del__(self): + if self._filename: + os.unlink(self._filename) + + def __nonzero__(self): + return self._string is not None or self._filename is not None + + @classmethod + def from_string(cls, string): + """Converter returns contents of a file as a string.""" + + instance = cls() + instance._string = string + return instance + + @classmethod + def from_filename(cls, filename): + """Converter returns contents of a file as a named file.""" + + instance = cls() + instance._filename = filename + return instance + + def get_string(self): + """Get file's contents as a string.""" + + if self._filename is not None: + with open(self._filename) as f: + return f.read() + else: + return self._string + + def get_file(self): + """Get file as a file-like object.""" + + if self._string is not None: + from StringIO import StringIO + return StringIO(self._string) + elif self._filename is not None: + return open(self._filename) + + def get_filename(self): + """Get file as a fs path.""" + + if self._filename is not None: + return self._filename + elif self._string is not None: + from tempfile import NamedTemporaryFile + temp = NamedTemporaryFile(prefix='librarian-', delete=False) + temp.write(self._string) + temp.close() + self._filename = temp.name + return self._filename + else: + return None + + def save_as(self, path): + """Save file to a path. 
Create directories, if necessary.""" + + dirname = os.path.dirname(os.path.abspath(path)) + if not os.path.isdir(dirname): + os.makedirs(dirname) + shutil.copy(self.get_filename(), path) diff --git a/librarian/dcparser.py b/librarian/dcparser.py index aa8f50d..5492f7a 100644 --- a/librarian/dcparser.py +++ b/librarian/dcparser.py @@ -7,7 +7,8 @@ from xml.parsers.expat import ExpatError from datetime import date import time -from librarian import ValidationError, NoDublinCore, ParseError, DCNS, RDFNS +from librarian import (ValidationError, NoDublinCore, ParseError, DCNS, RDFNS, + WLURI) import lxml.etree as etree # ElementTree API using libxml2 from lxml.etree import XMLSyntaxError @@ -150,7 +151,7 @@ class BookInfo(object): @property def slug(self): - return self.url.rsplit('/', 1)[1] + return WLURI(self.url).slug @classmethod def from_string(cls, xml): diff --git a/librarian/epub.py b/librarian/epub.py index bb3123d..b063380 100644 --- a/librarian/epub.py +++ b/librarian/epub.py @@ -12,13 +12,10 @@ from StringIO import StringIO from copy import deepcopy from lxml import etree import zipfile -from tempfile import mkdtemp +from tempfile import mkdtemp, NamedTemporaryFile from shutil import rmtree -import sys - -from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore -from librarian.dcparser import BookInfo +from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile from librarian import functions, get_resource @@ -287,47 +284,40 @@ def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_s return output_html, toc, chars -def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False, +def transform(wldoc, verbose=False, style=None, html_toc=False, sample=None, cover=None, flags=None): """ produces a EPUB file - provider: a DocProvider - slug: slug of file to process, available by provider - output_file: file-like object or path to output file - 
output_dir: path to directory to save output file to; either this or output_file must be present - make_dir: writes output to //.epub instead of /.epub sample=n: generate sample e-book (with at least n paragraphs) cover: a cover.Cover object flags: less-advertising, without-fonts """ - def transform_file(input_xml, chunk_counter=1, first=True, sample=None): + def transform_file(wldoc, chunk_counter=1, first=True, sample=None): """ processes one input file and proceeds to its children """ - replace_characters(input_xml.getroot()) - - children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))] + replace_characters(wldoc.edoc.getroot()) # every input file will have a TOC entry, # pointing to starting chunk - toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), "part%d.html" % chunk_counter) + toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter) chars = set() if first: # write book title page - html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl')) + html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl')) chars = used_chars(html_tree.getroot()) zip.writestr('OPS/title.html', etree.tostring(html_tree, method="html", pretty_print=True)) # add a title page TOC entry toc.add(u"Strona tytułowa", "title.html") - elif children: + elif wldoc.book_info.parts: # write title page for every parent if sample is not None and sample <= 0: chars = set() html_string = open(get_resource('epub/emptyChunk.html')).read() else: - html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl')) + html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl')) chars = used_chars(html_tree.getroot()) html_string = etree.tostring(html_tree, method="html", pretty_print=True) zip.writestr('OPS/part%d.html' % chunk_counter, html_string) @@ -335,12 +325,12 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir= add_to_spine(spine, chunk_counter) chunk_counter += 1 - if len(input_xml.getroot()) > 1: + 
if len(wldoc.edoc.getroot()) > 1: # rdf before style master - main_text = input_xml.getroot()[1] + main_text = wldoc.edoc.getroot()[1] else: # rdf in style master - main_text = input_xml.getroot()[0] + main_text = wldoc.edoc.getroot()[0] if main_text.tag == RDFNS('RDF'): main_text = None @@ -361,51 +351,28 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir= add_to_spine(spine, chunk_counter) chunk_counter += 1 - if children: - for child in children: - child_xml = etree.parse(provider.by_uri(child)) - child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample) - toc.append(child_toc) - chars = chars.union(chunk_chars) + for child in wldoc.parts(): + child_toc, chunk_counter, chunk_chars, sample = transform_file( + child, chunk_counter, first=False, sample=sample) + toc.append(child_toc) + chars = chars.union(chunk_chars) return toc, chunk_counter, chars, sample - # read metadata from the first file - if file_path: - if slug: - raise ValueError('slug or file_path should be specified, not both') - f = open(file_path, 'r') - input_xml = etree.parse(f) - f.close() - else: - if not slug: - raise ValueError('either slug or file_path should be specified') - input_xml = etree.parse(provider[slug]) + + document = deepcopy(wldoc) + del wldoc if flags: for flag in flags: - input_xml.getroot().set(flag, 'yes') - - metadata = input_xml.find('.//'+RDFNS('Description')) - if metadata is None: - raise NoDublinCore('Document has no DublinCore - which is required.') - book_info = BookInfo.from_element(input_xml) - metadata = etree.ElementTree(metadata) - - # if output to dir, create the file - if output_dir is not None: - if make_dir: - author = unicode(book_info.author) - output_dir = os.path.join(output_dir, author) - try: - os.makedirs(output_dir) - except OSError: - pass - if slug: - output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w') - else: - output_file = 
open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w') + document.edoc.getroot().set(flag, 'yes') + + opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl')) + manifest = opf.find('.//' + OPFNS('manifest')) + guide = opf.find('.//' + OPFNS('guide')) + spine = opf.find('.//' + OPFNS('spine')) + output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False) zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED) # write static elements @@ -425,14 +392,10 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir= style = get_resource('epub/style.css') zip.write(style, os.path.join('OPS', 'style.css')) - opf = xslt(metadata, get_resource('epub/xsltContent.xsl')) - manifest = opf.find('.//' + OPFNS('manifest')) - guide = opf.find('.//' + OPFNS('guide')) - spine = opf.find('.//' + OPFNS('spine')) if cover: cover_file = StringIO() - c = cover(book_info.author.readable(), book_info.title) + c = cover(document.book_info.author.readable(), document.book_info.title) c.save(cover_file) c_name = 'cover.%s' % c.ext() zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue()) @@ -468,7 +431,7 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir= '')) guide.append(etree.fromstring('')) - toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample) + toc, chunk_counter, chars, sample = transform_file(document, sample=sample) if len(toc.children) < 2: toc.add(u"Początek utworu", "part1.html") @@ -491,7 +454,7 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir= '')) spine.append(etree.fromstring( '')) - html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl')) + html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl')) chars.update(used_chars(html_tree.getroot())) zip.writestr('OPS/last.html', etree.tostring( html_tree, method="html", pretty_print=True)) @@ -517,8 +480,7 @@ 
def transform(provider, slug=None, file_path=None, output_file=None, output_dir= os.chdir(cwd) zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True)) - contents = [] - title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0]) + title = document.book_info.title attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber" for st in attributes: meta = toc_file.makeelement(NCXNS('meta')) @@ -536,3 +498,5 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir= toc.write_to_xml(nav_map) zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True)) zip.close() + + return OutputFile.from_filename(output_file.name) diff --git a/librarian/html.py b/librarian/html.py index 5974d93..997f904 100644 --- a/librarian/html.py +++ b/librarian/html.py @@ -5,12 +5,10 @@ # import os import cStringIO -import re import copy from lxml import etree -from librarian.parser import WLDocument -from librarian import XHTMLNS, ParseError +from librarian import XHTMLNS, ParseError, OutputFile from librarian import functions from lxml.etree import XMLSyntaxError, XSLTApplyError @@ -30,9 +28,8 @@ def get_stylesheet(name): def html_has_content(text): return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text) -def transform(input, output_filename=None, is_file=True, \ - parse_dublincore=True, stylesheet='legacy', options={}, flags=None): - """Transforms file input_filename in XML to output_filename in XHTML. +def transform(wldoc, stylesheet='legacy', options=None, flags=None): + """Transforms the WL document to XHTML. If output_filename is None, returns an XML, otherwise returns True if file has been written,False if it hasn't. 
@@ -43,12 +40,9 @@ def transform(input, output_filename=None, is_file=True, \ style_filename = get_stylesheet(stylesheet) style = etree.parse(style_filename) - if is_file: - document = WLDocument.from_file(input, True, \ - parse_dublincore=parse_dublincore) - else: - document = WLDocument.from_string(input, True, \ - parse_dublincore=parse_dublincore) + document = copy.deepcopy(wldoc) + del wldoc + document.swap_endlines() if flags: for flag in flags: @@ -56,6 +50,8 @@ def transform(input, output_filename=None, is_file=True, \ document.clean_ed_note() + if not options: + options = {} result = document.transform(style, **options) del document # no longer needed large object :) @@ -63,16 +59,10 @@ def transform(input, output_filename=None, is_file=True, \ add_anchors(result.getroot()) add_table_of_contents(result.getroot()) - if output_filename is not None: - result.write(output_filename, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8') - else: - return result - return True + return OutputFile.from_string(etree.tostring(result, method='html', + xml_declaration=False, pretty_print=True, encoding='utf-8')) else: - if output_filename is not None: - return False - else: - return "" + return None except KeyError: raise ValueError("'%s' is not a valid stylesheet.") except (XMLSyntaxError, XSLTApplyError), e: diff --git a/librarian/mobi.py b/librarian/mobi.py index cd894fe..a93315e 100755 --- a/librarian/mobi.py +++ b/librarian/mobi.py @@ -4,60 +4,25 @@ # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. 
# import os -import os.path import subprocess from tempfile import NamedTemporaryFile -from lxml import etree +from librarian import OutputFile from librarian.cover import WLCover -from librarian import epub, get_resource, NoDublinCore, RDFNS -from librarian.dcparser import BookInfo +from librarian import get_resource -def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False, +def transform(wldoc, verbose=False, sample=None, cover=None, flags=None): """ produces a MOBI file - provider: a DocProvider - slug: slug of file to process, available by provider - output_file: path to output file - output_dir: path to directory to save output file to; either this or output_file must be present - make_dir: writes output to //.mobi instead of /.mobi + wldoc: a WLDocument sample=n: generate sample e-book (with at least n paragraphs) cover: a cover.Cover object flags: less-advertising, """ - # read metadata from the first file - if file_path: - if slug: - raise ValueError('slug or file_path should be specified, not both') - f = open(file_path, 'r') - input_xml = etree.parse(f) - f.close() - else: - if not slug: - raise ValueError('either slug or file_path should be specified') - input_xml = etree.parse(provider[slug]) - - metadata = input_xml.find('.//'+RDFNS('Description')) - if metadata is None: - raise NoDublinCore('Document has no DublinCore - which is required.') - book_info = BookInfo.from_element(input_xml) - - # if output to dir, create the file - if output_dir is not None: - if make_dir: - author = unicode(book_info.author) - output_dir = os.path.join(output_dir, author) - try: - os.makedirs(output_dir) - except OSError: - pass - if slug: - output_file = os.path.join(output_dir, '%s.mobi' % slug) - else: - output_file = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.mobi') + book_info = wldoc.book_info # provide a cover by default if not cover: @@ -66,19 +31,21 @@ def 
transform(provider, slug=None, file_path=None, output_file=None, output_dir= c = cover(book_info.author.readable(), book_info.title) c.save(cover_file) - epub_file = NamedTemporaryFile(suffix='.epub', delete=False) if not flags: flags = [] flags = list(flags) + ['without-fonts'] - epub.transform(provider, file_path=file_path, output_file=epub_file, verbose=verbose, - sample=sample, html_toc=True, flags=flags, style=get_resource('mobi/style.css')) + epub = wldoc.as_epub(verbose=verbose, sample=sample, html_toc=True, + flags=flags, style=get_resource('mobi/style.css')) if verbose: kwargs = {} else: devnull = open("/dev/null", 'w') kwargs = {"stdout": devnull, "stderr": devnull} - subprocess.check_call(['ebook-convert', epub_file.name, output_file, + + output_file = NamedTemporaryFile(prefix='librarian', suffix='.mobi', delete=False) + output_file.close() + subprocess.check_call(['ebook-convert', epub.get_filename(), output_file.name, '--no-inline-toc', '--cover=%s' % cover_file.name], **kwargs) - os.unlink(epub_file.name) os.unlink(cover_file.name) + return OutputFile.from_filename(output_file.name) \ No newline at end of file diff --git a/librarian/packagers.py b/librarian/packagers.py index 054f068..ebeb5b3 100644 --- a/librarian/packagers.py +++ b/librarian/packagers.py @@ -6,8 +6,8 @@ import os from copy import deepcopy from lxml import etree -from librarian import epub, pdf, DirDocProvider, ParseError, cover -from librarian.dcparser import BookInfo +from librarian import pdf, epub, DirDocProvider, ParseError, cover +from librarian.parser import WLDocument class Packager(object): @@ -26,8 +26,11 @@ class Packager(object): except: pass outfile = os.path.join(output_dir, slug + '.' 
+ cls.ext) - cls.converter.transform(provider, file_path=main_input, output_file=outfile, + + doc = WLDocument.from_file(main_input, provider=provider) + output_file = cls.converter.transform(doc, cover=cls.cover, flags=cls.flags) + doc.save_output_file(output_file, output_path=outfile) @classmethod @@ -78,7 +81,6 @@ class VirtualoEpubPackager(Packager): """ truncates text to at most `limit' bytes in utf-8 """ if text is None: return text - orig_text = text if len(text.encode('utf-8')) > limit: newlimit = limit - 3 while len(text.encode('utf-8')) > newlimit: @@ -116,7 +118,8 @@ class VirtualoEpubPackager(Packager): outfile_dir = os.path.join(output_dir, slug) os.makedirs(os.path.join(output_dir, slug)) - info = BookInfo.from_file(main_input) + doc = WLDocument.from_file(main_input, provider=provider) + info = doc.book_info product_elem = deepcopy(product) product_elem[0].text = cls.utf_trunc(slug, 100) @@ -133,8 +136,10 @@ class VirtualoEpubPackager(Packager): ).save(os.path.join(outfile_dir, slug+'.jpg')) outfile = os.path.join(outfile_dir, '1.epub') outfile_sample = os.path.join(outfile_dir, '1.sample.epub') - epub.transform(provider, file_path=main_input, output_file=outfile) - epub.transform(provider, file_path=main_input, output_file=outfile_sample, sample=25) + doc.save_output_file(epub.transform(doc), + output_path=outfile) + doc.save_output_file(epub.transform(doc, sample=25), + output_path=outfile_sample) except ParseError, e: print '%(file)s:%(name)s:%(message)s' % { 'file': main_input, diff --git a/librarian/parser.py b/librarian/parser.py index afc4f1a..469b7df 100644 --- a/librarian/parser.py +++ b/librarian/parser.py @@ -3,7 +3,7 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. 
# -from librarian import ValidationError, NoDublinCore, ParseError +from librarian import ValidationError, NoDublinCore, ParseError, NoProvider from librarian import RDFNS from librarian import dcparser @@ -11,14 +11,17 @@ from xml.parsers.expat import ExpatError from lxml import etree from lxml.etree import XMLSyntaxError, XSLTApplyError +import os import re from StringIO import StringIO class WLDocument(object): - LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE); + LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE) + provider = None - def __init__(self, edoc, parse_dublincore=True): + def __init__(self, edoc, parse_dublincore=True, provider=None): self.edoc = edoc + self.provider = provider root_elem = edoc.getroot() @@ -42,7 +45,7 @@ class WLDocument(object): return cls.from_file(StringIO(xml), *args, **kwargs) @classmethod - def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True): + def from_file(cls, xmlfile, parse_dublincore=True, provider=None): # first, prepare for parsing if isinstance(xmlfile, basestring): @@ -63,20 +66,17 @@ class WLDocument(object): parser = etree.XMLParser(remove_blank_text=False) tree = etree.parse(StringIO(data.encode('utf-8')), parser) - if swap_endlines: - cls.swap_endlines(tree) - - return cls(tree, parse_dublincore=parse_dublincore) + return cls(tree, parse_dublincore=parse_dublincore, provider=provider) except (ExpatError, XMLSyntaxError, XSLTApplyError), e: raise ParseError(e) - @classmethod - def swap_endlines(cls, tree): + def swap_endlines(self): + """Converts line breaks in stanzas into
<br/> tags.""" # only swap inside stanzas - for elem in tree.iter('strofa'): + for elem in self.edoc.iter('strofa'): for child in list(elem): if child.tail: - chunks = cls.LINE_SWAP_EXPR.split(child.tail) + chunks = self.LINE_SWAP_EXPR.split(child.tail) ins_index = elem.index(child) + 1 while len(chunks) > 1: ins = etree.Element('br') @@ -84,13 +84,22 @@ class WLDocument(object): elem.insert(ins_index, ins) child.tail = chunks.pop(0) if elem.text: - chunks = cls.LINE_SWAP_EXPR.split(elem.text) + chunks = self.LINE_SWAP_EXPR.split(elem.text) while len(chunks) > 1: ins = etree.Element('br') ins.tail = chunks.pop() elem.insert(0, ins) elem.text = chunks.pop(0) + def parts(self): + if self.provider is None: + raise NoProvider('No document provider supplied.') + if self.book_info is None: + raise NoDublinCore('No Dublin Core in document.') + for part_uri in self.book_info.parts: + yield self.from_file(self.provider.by_uri(part_uri), + provider=self.provider) + def chunk(self, path): # convert the path to XPath expr = self.path_to_xpath(path) @@ -152,3 +161,40 @@ class WLDocument(object): node.clear() node.tag = 'span' node.tail = tail + + # Converters + + def as_html(self, *args, **kwargs): + from librarian import html + return html.transform(self, *args, **kwargs) + + def as_text(self, *args, **kwargs): + from librarian import text + return text.transform(self, *args, **kwargs) + + def as_epub(self, *args, **kwargs): + from librarian import epub + return epub.transform(self, *args, **kwargs) + + def as_pdf(self, *args, **kwargs): + from librarian import pdf + return pdf.transform(self, *args, **kwargs) + + def as_mobi(self, *args, **kwargs): + from librarian import mobi + return mobi.transform(self, *args, **kwargs) + + def save_output_file(self, output_file, output_path=None, + output_dir_path=None, make_author_dir=False, ext=None): + if output_dir_path: + save_path = output_dir_path + if make_author_dir: + save_path = os.path.join(save_path, + 
unicode(self.book_info.author).encode('utf-8')) + save_path = os.path.join(save_path, self.book_info.slug) + if ext: + save_path += '.%s' % ext + else: + save_path = output_path + + output_file.save_as(save_path) diff --git a/librarian/pdf.py b/librarian/pdf.py index 1bfd949..02438a6 100644 --- a/librarian/pdf.py +++ b/librarian/pdf.py @@ -8,20 +8,18 @@ import os import os.path import shutil from StringIO import StringIO -from tempfile import mkdtemp +from tempfile import mkdtemp, NamedTemporaryFile import re from copy import deepcopy from subprocess import call, PIPE -import sys - from Texml.processor import process from lxml import etree from lxml.etree import XMLSyntaxError, XSLTApplyError from librarian.dcparser import Person from librarian.parser import WLDocument -from librarian import ParseError, DCNS, get_resource +from librarian import ParseError, DCNS, get_resource, OutputFile from librarian import functions @@ -173,17 +171,11 @@ def package_available(package, args='', verbose=False): return p == 0 -def transform(provider, slug=None, file_path=None, - output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None, morefloats=None, +def transform(wldoc, verbose=False, save_tex=None, morefloats=None, cover=None, flags=None, customizations=None): """ produces a PDF file with XeLaTeX - provider: a DocProvider - slug: slug of file to process, available by provider - file_path can be provided instead of a slug - output_file: file-like object or path to output file - output_dir: path to directory to save output file to; either this or output_file must be present - make_dir: writes output to //.pdf istead of /.pdf + wldoc: a WLDocument verbose: prints all output from LaTeX save_tex: path to save the intermediary LaTeX file to morefloats (old/new/none): force specific morefloats @@ -194,14 +186,7 @@ def transform(provider, slug=None, file_path=None, # Parse XSLT try: - if file_path: - if slug: - raise ValueError('slug or file_path should be 
specified, not both') - document = load_including_children(provider, file_path=file_path) - else: - if not slug: - raise ValueError('either slug or file_path should be specified') - document = load_including_children(provider, slug=slug) + document = load_including_children(wldoc) if cover: document.edoc.getroot().set('data-cover-width', str(cover.width)) @@ -227,11 +212,6 @@ def transform(provider, slug=None, file_path=None, substitute_hyphens(document.edoc) fix_hanging(document.edoc) - # find output dir - if make_dir and output_dir is not None: - author = unicode(document.book_info.author) - output_dir = os.path.join(output_dir, author) - # wl -> TeXML style_filename = get_stylesheet("wl2tex") style = etree.parse(style_filename) @@ -273,56 +253,38 @@ def transform(provider, slug=None, file_path=None, os.chdir(cwd) - # save the PDF + output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False) pdf_path = os.path.join(temp, 'doc.pdf') - if output_dir is not None: - try: - os.makedirs(output_dir) - except OSError: - pass - if slug: - output_path = os.path.join(output_dir, '%s.pdf' % slug) - else: - output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf') - shutil.move(pdf_path, output_path) - else: - if hasattr(output_file, 'write'): - # file-like object - with open(pdf_path) as f: - output_file.write(f.read()) - output_file.close() - else: - # path to output file - shutil.copy(pdf_path, output_file) + shutil.move(pdf_path, output_file.name) shutil.rmtree(temp) + return OutputFile.from_filename(output_file.name) except (XMLSyntaxError, XSLTApplyError), e: raise ParseError(e) -def load_including_children(provider, slug=None, uri=None, file_path=None): - """ makes one big xml file with children inserted at end - either slug or uri must be provided +def load_including_children(wldoc=None, provider=None, uri=None): + """ Makes one big xml file with children inserted at end. 
+ + Either wldoc or provider and URI must be provided. """ - if uri: + if uri and provider: f = provider.by_uri(uri) - elif slug: - f = provider[slug] - elif file_path: - f = open(file_path, 'r') + text = f.read().decode('utf-8') + f.close() + elif wldoc is not None: + text = etree.tostring(wldoc.edoc, encoding=unicode) + provider = wldoc.provider else: - raise ValueError('Neither slug, URI nor file path provided for a book.') + raise ValueError('Neither a WLDocument, nor provider and URI were provided.') - text = f.read().decode('utf-8') text = re.sub(ur"([\u0400-\u04ff]+)", ur"\1", text) - document = WLDocument.from_string(text, True, - parse_dublincore=True) + document = WLDocument.from_string(text, parse_dublincore=True) + document.swap_endlines() - f.close() for child_uri in document.book_info.parts: - print child_uri - child = load_including_children(provider, uri=child_uri) + child = load_including_children(provider=provider, uri=child_uri) document.edoc.getroot().append(child.edoc.getroot()) return document diff --git a/librarian/text.py b/librarian/text.py index c23bcd6..d99e7cf 100644 --- a/librarian/text.py +++ b/librarian/text.py @@ -3,7 +3,8 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from librarian import dcparser, parser, functions +import copy +from librarian import functions, OutputFile from lxml import etree import os @@ -28,7 +29,7 @@ Utwór opracowany został w ramach projektu Wolne Lektury przez fundację Nowocz %(description)s%(contributors)s """ -def transform(input_file, output_file, parse_dublincore=True, flags=None, **options): +def transform(wldoc, flags=None, **options): """ Transforms input_file in XML to output_file in TXT. 
possible flags: raw-text, @@ -37,7 +38,9 @@ def transform(input_file, output_file, parse_dublincore=True, flags=None, **opti style_filename = os.path.join(os.path.dirname(__file__), 'xslt/book2txt.xslt') style = etree.parse(style_filename) - document = parser.WLDocument.from_file(input_file, True, parse_dublincore=parse_dublincore) + document = copy.deepcopy(wldoc) + del wldoc + document.swap_endlines() if flags: for flag in flags: @@ -46,10 +49,10 @@ def transform(input_file, output_file, parse_dublincore=True, flags=None, **opti result = document.transform(style, **options) if not flags or 'raw-text' not in flags: - if parse_dublincore: - parsed_dc = dcparser.BookInfo.from_element(document.edoc) + if document.book_info: + parsed_dc = document.book_info description = parsed_dc.description - url = parsed_dc.url + url = document.book_info.url license_description = parsed_dc.license_description license = parsed_dc.license @@ -75,7 +78,7 @@ def transform(input_file, output_file, parse_dublincore=True, flags=None, **opti license_description = "" source = "" contributors = "" - output_file.write((TEMPLATE % { + return OutputFile.from_string((TEMPLATE % { 'description': description, 'url': url, 'license_description': license_description, @@ -84,5 +87,5 @@ def transform(input_file, output_file, parse_dublincore=True, flags=None, **opti 'contributors': contributors, }).encode('utf-8')) else: - output_file.write(unicode(result).encode('utf-8')) + return OutputFile.from_string(unicode(result).encode('utf-8')) diff --git a/scripts/book2epub b/scripts/book2epub index 9adf4b4..9af3692 100755 --- a/scripts/book2epub +++ b/scripts/book2epub @@ -7,7 +7,8 @@ import os.path import optparse -from librarian import epub, DirDocProvider, ParseError +from librarian import DirDocProvider, ParseError +from librarian.parser import WLDocument if __name__ == '__main__': @@ -37,18 +38,20 @@ if __name__ == '__main__': for main_input in input_filenames: if options.verbose: print main_input + 
path, fname = os.path.realpath(main_input).rsplit('/', 1) provider = DirDocProvider(path) - - output_dir = output_file = None - if options.output_dir: - output_dir = options.output_dir - elif options.output_file: - output_file = options.output_file + if not (options.output_file or options.output_dir): + output_file = os.path.splitext(main_input)[0] + '.epub' else: - output_dir = path + output_file = None + + doc = WLDocument.from_file(main_input, provider=provider) + epub = doc.as_epub() + + doc.save_output_file(epub, + output_file, options.output_dir, options.make_dir, 'epub') - epub.transform(provider, file_path=main_input, output_dir=output_dir, output_file=output_file, make_dir=options.make_dir) except ParseError, e: print '%(file)s:%(name)s:%(message)s' % { 'file': main_input, diff --git a/scripts/book2html b/scripts/book2html index d61b299..1e88823 100755 --- a/scripts/book2html +++ b/scripts/book2html @@ -7,7 +7,8 @@ import os import optparse -from librarian import html, ParseError +from librarian import ParseError +from librarian.parser import WLDocument if __name__ == '__main__': @@ -35,7 +36,10 @@ if __name__ == '__main__': output_filename = os.path.splitext(input_filename)[0] + '.html' try: - html.transform(input_filename, output_filename, parse_dublincore=options.parse_dublincore, flags=('full-page',)) + doc = WLDocument.from_file(input_filename, + parse_dublincore=options.parse_dublincore) + html = doc.as_html(flags=('full-page',)) + doc.save_output_file(html, output_path=output_filename) except ParseError, e: print '%(file)s:%(name)s:%(message)s' % { 'file': input_filename, diff --git a/scripts/book2ihtml b/scripts/book2ihtml index 97d8ebd..779f245 100755 --- a/scripts/book2ihtml +++ b/scripts/book2ihtml @@ -7,7 +7,8 @@ import os import optparse -from librarian import html, ParseError +from librarian import ParseError +from librarian.parser import WLDocument if __name__ == '__main__': @@ -35,8 +36,10 @@ if __name__ == '__main__': output_filename = 
os.path.splitext(input_filename)[0] + '.html' try: - html.transform(input_filename, output_filename, parse_dublincore=options.parse_dublincore,\ - stylesheet='partial') + doc = WLDocument.from_file(input_filename, + parse_dublincore=options.parse_dublincore) + html = doc.as_html(flags=('full-page',), stylesheet='partial') + doc.save_output_file(html, output_path=output_filename) except ParseError, e: print '%(file)s:%(name)s:%(message)s' % { 'file': input_filename, diff --git a/scripts/book2mobi b/scripts/book2mobi index 1c00b51..665dcfa 100755 --- a/scripts/book2mobi +++ b/scripts/book2mobi @@ -7,7 +7,8 @@ import os.path import optparse -from librarian import mobi, DirDocProvider, ParseError +from librarian import DirDocProvider, ParseError +from librarian.parser import WLDocument if __name__ == '__main__': @@ -35,20 +36,18 @@ if __name__ == '__main__': # Do some real work try: for main_input in input_filenames: - if options.verbose: - print main_input path, fname = os.path.realpath(main_input).rsplit('/', 1) provider = DirDocProvider(path) - - output_dir = output_file = None - if options.output_dir: - output_dir = options.output_dir - elif options.output_file: - output_file = options.output_file + if not (options.output_file or options.output_dir): + output_file = os.path.splitext(main_input)[0] + '.mobi' else: - output_dir = path + output_file = None + + doc = WLDocument.from_file(main_input, provider=provider) + mobi = doc.as_mobi() - mobi.transform(provider, file_path=main_input, output_dir=output_dir, output_file=output_file, make_dir=options.make_dir) + doc.save_output_file(mobi, + output_file, options.output_dir, options.make_dir, 'mobi') except ParseError, e: print '%(file)s:%(name)s:%(message)s' % { 'file': main_input, diff --git a/scripts/book2pdf b/scripts/book2pdf index d10f400..171264b 100755 --- a/scripts/book2pdf +++ b/scripts/book2pdf @@ -6,7 +6,10 @@ # import os.path from optparse import OptionParser -from librarian import pdf, DirDocProvider, 
ParseError + +from librarian import DirDocProvider, ParseError +from librarian.parser import WLDocument + if __name__ == '__main__': usage = """Usage: %prog [options] SOURCE [SOURCE...] @@ -31,33 +34,25 @@ if __name__ == '__main__': parser.print_help() exit(1) - try: - if options.output_dir and options.output_file: - raise ValueError("Either --output-dir or --output file should be specified") + if options.output_dir and options.output_file: + raise ValueError("Either --output-dir or --output file should be specified") + try: for main_input in args: - if options.verbose: - print main_input path, fname = os.path.realpath(main_input).rsplit('/', 1) provider = DirDocProvider(path) - - output_file = output_dir = None - if options.output_dir: - output_dir = options.output_dir - elif options.output_file: - output_file = options.output_file + output_file, output_dir = options.output_file, options.output_dir + if not (options.output_file or options.output_dir): + output_file = os.path.splitext(main_input)[0] + '.pdf' else: - output_dir = path + output_file = None + + doc = WLDocument.from_file(main_input, provider=provider) + pdf = doc.as_pdf(save_tex=options.save_tex, + morefloats=options.morefloats) - pdf.transform(provider, - file_path=main_input, - output_file=output_file, - output_dir=output_dir, - verbose=options.verbose, - make_dir=options.make_dir, - save_tex=options.save_tex, - morefloats=options.morefloats - ) + doc.save_output_file(pdf, + output_file, options.output_dir, options.make_dir, 'pdf') except ParseError, e: print '%(file)s:%(name)s:%(message)s; use -v to see more output' % { 'file': main_input, diff --git a/scripts/book2txt b/scripts/book2txt index d56d6ff..9cfdef2 100755 --- a/scripts/book2txt +++ b/scripts/book2txt @@ -7,8 +7,8 @@ import os import optparse -from librarian import text -from librarian import dcparser, ParseError +from librarian import ParseError +from librarian.parser import WLDocument if __name__ == '__main__': @@ -38,9 +38,10 @@ if 
__name__ == '__main__': output_filename = os.path.splitext(input_filename)[0] + '.txt' try: - output_file = open(output_filename, 'w') - text.transform(open(input_filename), output_file, parse_dublincore=options.parse_dublincore, - wrapping=str(options.wrapping)) + doc = WLDocument.from_file(input_filename, + parse_dublincore=options.parse_dublincore) + html = doc.as_text(wrapping=str(options.wrapping)) + doc.save_output_file(html, output_path=output_filename) except ParseError, e: print '%(file)s:%(name)s:%(message)s' % { 'file': input_filename, diff --git a/setup.py b/setup.py index 1394643..023c943 100755 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ def whole_tree(prefix, path): setup( name='librarian', - version='1.3', + version='1.4', description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats', author="Marek Stępniowski", author_email='marek@stepniowski.com', diff --git a/tests/files/text/asnyk_miedzy_nami.xml b/tests/files/text/asnyk_miedzy_nami.xml deleted file mode 100644 index 36d8df6..0000000 --- a/tests/files/text/asnyk_miedzy_nami.xml +++ /dev/null @@ -1,65 +0,0 @@ - - - - - - -Asnyk, Adam -Między nami nic nie było - -Sekuła, Aleksandra -Sutkowska, Olga -Fundacja Nowoczesna Polska -Pozytywizm -Liryka -Wiersz -Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. -http://wolnelektury.pl/katalog/lektura/miedzy-nami-nic-nie-bylo -http://www.polona.pl/Content/5164 -(Asnyk, Adam) El...y (1838-1897), Poezye, t. 3, Gebethner i Wolff, wyd. nowe poprzedzone słowem wstępnym St. Krzemińskiego, Warszawa, 1898 -Domena publiczna - Adam Asnyk zm. 
1897 -1897 -xml -text -text -2007-09-06 -L -pol - - - - -Adam Asnyk - -Miłość platonicznaMiędzy nami nic nie było - - - -Między nami nic nie było!/ -Żadnych zwierzeń, wyznań żadnych!/ -Nic nas z sobą nie łączyło ---/ -Prócz wiosennych marzeń zdradnych; - - - -NaturaPrócz tych woni, barw i blasków,/ -Unoszących się w przestrzeni;/ -Prócz szumiących śpiewem lasków/ -I tej świeżej łąk zieleni; - - - -Prócz tych kaskad i potoków,/ -Zraszających każdy parów,/ -Prócz girlandy tęcz, obłoków,/ -Prócz natury słodkich czarów; - - - -Prócz tych wspólnych, jasnych zdrojów,/ -Z których serce zachwyt piło;/ -Prócz pierwiosnków i powojów,---/ -Między nami nic nie było! - - - diff --git a/tests/files/text/asnyk_zbior.xml b/tests/files/text/asnyk_zbior.xml new file mode 100755 index 0000000..c585a8b --- /dev/null +++ b/tests/files/text/asnyk_zbior.xml @@ -0,0 +1,29 @@ + + + + + +Asnyk, Adam +Poezye +Fundacja Nowoczesna Polska +Pozytywizm +Liryka +Wiersz +Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. +http://wolnelektury.pl/katalog/lektura/poezye +http://wolnelektury.pl/katalog/lektura/miedzy-nami-nic-nie-bylo +http://www.polona.pl/Content/5164 +(Asnyk, Adam) El...y (1838-1897), Poezye, t. 3, Gebethner i Wolff, wyd. nowe poprzedzone słowem wstępnym St. Krzemińskiego, Warszawa, 1898 +Domena publiczna - Adam Asnyk zm. 1897 +1897 +xml +text +text +2007-09-06 +L +pol + + + + + diff --git a/tests/files/text/miedzy-nami-nic-nie-bylo.xml b/tests/files/text/miedzy-nami-nic-nie-bylo.xml new file mode 100644 index 0000000..124940e --- /dev/null +++ b/tests/files/text/miedzy-nami-nic-nie-bylo.xml @@ -0,0 +1,65 @@ + + + + + + +Asnyk, Adam +Między nami nic nie było + +Sekuła, Aleksandra +Sutkowska, Olga +Fundacja Nowoczesna Polska +Pozytywizm +Liryka +Wiersz +Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). 
Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. +http://wolnelektury.pl/katalog/lektura/miedzy-nami-nic-nie-bylo +http://www.polona.pl/Content/5164 +(Asnyk, Adam) El...y (1838-1897), Poezye, t. 3, Gebethner i Wolff, wyd. nowe poprzedzone słowem wstępnym St. Krzemińskiego, Warszawa, 1898 +Domena publiczna - Adam Asnyk zm. 1897 +1897 +xml +text +text +2007-09-06 +L +pol + + + + +Adam Asnyk + +Miłość platonicznaMiędzy nami nic nie było + + + +Między nami nic nie było!/ +Żadnych zwierzeń, wyznań żadnych!/ +Nic nas z sobą nie łączyło ---/ +Prócz wiosennych marzeń zdradnych; + + + +NaturaPrócz tych woni, barw i blasków,/ +Unoszących się w przestrzeni;/ +Prócz szumiących śpiewem lasków/ +I tej świeżej łąk zieleni; + + + +Prócz tych kaskad i potoków,/ +Zraszających każdy parów,/ +Prócz girlandy tęcz, obłoków,/ +Prócz natury słodkich czarów; + + + +Prócz tych wspólnych, jasnych zdrojów,/ +Z których serce zachwyt piło;/ +Prócz pierwiosnków i powojów,---/ +Między nami nic nie było! + + + diff --git a/tests/test_epub.py b/tests/test_epub.py new file mode 100644 index 0000000..9fc5637 --- /dev/null +++ b/tests/test_epub.py @@ -0,0 +1,16 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# +from librarian import DirDocProvider +from librarian.parser import WLDocument +from nose.tools import * +from utils import get_fixture + + +def test_transform(): + WLDocument.from_file( + get_fixture('text', 'asnyk_zbior.xml'), + provider=DirDocProvider(get_fixture('text', '')) + ).as_epub(flags=['without_fonts']) diff --git a/tests/test_html.py b/tests/test_html.py index 5187e06..51d6acd 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -3,44 +3,38 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. 
# -from librarian import html, NoDublinCore +from librarian import NoDublinCore +from librarian.parser import WLDocument from nose.tools import * -from utils import get_fixture, remove_output_file +from utils import get_fixture -def teardown_transform(): - remove_output_file('text', 'asnyk_miedzy_nami.html') - -@with_setup(None, teardown_transform) def test_transform(): - output_file_path = get_fixture('text', 'asnyk_miedzy_nami.html') expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.html') - html.transform( - get_fixture('text', 'asnyk_miedzy_nami.xml'), - output_file_path, - ) + html = WLDocument.from_file( + get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml') + ).as_html().get_string() - assert_equal(file(output_file_path).read(), file(expected_output_file_path).read()) + assert_equal(html, file(expected_output_file_path).read()) -@with_setup(None, teardown_transform) @raises(NoDublinCore) def test_no_dublincore(): - html.transform( - get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'), - get_fixture('text', 'asnyk_miedzy_nami.html'), - ) + WLDocument.from_file( + get_fixture('text', 'asnyk_miedzy_nami_nodc.xml') + ).as_html() -@with_setup(None, teardown_transform) def test_passing_parse_dublincore_to_transform(): """Passing parse_dublincore=False to transform omits DublinCore parsing.""" - html.transform( - get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'), - get_fixture('text', 'asnyk_miedzy_nami.html'), - parse_dublincore=False, - ) + WLDocument.from_file( + get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'), + parse_dublincore=False, + ).as_html() def test_empty(): - assert html.transform('', is_file=False, parse_dublincore=False).find('empty') + assert not WLDocument.from_string( + '', + parse_dublincore=False, + ).as_html() diff --git a/tests/test_text.py b/tests/test_text.py index 7ff94ca..70dfb60 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -3,42 +3,32 @@ # This file is part of Librarian, licensed under GNU Affero 
GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from librarian import text, NoDublinCore +from librarian import NoDublinCore +from librarian.parser import WLDocument from nose.tools import * -from utils import get_fixture, remove_output_file +from utils import get_fixture -def teardown_transform(): - remove_output_file('text', 'asnyk_miedzy_nami.txt') - - -@with_setup(None, teardown_transform) def test_transform(): - output_file_path = get_fixture('text', 'asnyk_miedzy_nami.txt') expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.txt') - text.transform( - open(get_fixture('text', 'asnyk_miedzy_nami.xml')), - open(output_file_path, 'w'), - ) + text = WLDocument.from_file( + get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml') + ).as_text().get_string() - assert_equal(file(output_file_path).read(), file(expected_output_file_path).read()) + assert_equal(text, file(expected_output_file_path).read()) -@with_setup(None, teardown_transform) @raises(NoDublinCore) def test_no_dublincore(): - text.transform( - open(get_fixture('text', 'asnyk_miedzy_nami_nodc.xml')), - open(get_fixture('text', 'asnyk_miedzy_nami.txt'), 'w'), - ) + WLDocument.from_file( + get_fixture('text', 'asnyk_miedzy_nami_nodc.xml') + ).as_text() -@with_setup(None, teardown_transform) def test_passing_parse_dublincore_to_transform(): - """Passing parse_dublincore=False to transform omits DublinCore parsing.""" - text.transform( - open(get_fixture('text', 'asnyk_miedzy_nami_nodc.xml')), - open(get_fixture('text', 'asnyk_miedzy_nami.txt'), 'w'), - parse_dublincore=False, - ) + """Passing parse_dublincore=False to the constructor omits DublinCore parsing.""" + WLDocument.from_file( + get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'), + parse_dublincore=False, + ).as_text() diff --git a/tests/utils.py b/tests/utils.py index b112066..3b1f4f5 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -21,10 +21,3 @@ def get_fixture(dir_name, 
file_name): def get_all_fixtures(dir_name, glob_pattern='*'): """Returns list of paths for fixtures in directory dir_name matching the glob_pattern.""" return [get_fixture(dir_name, file_name) for file_name in glob.glob(join(get_fixture_dir(dir_name), glob_pattern))] - - -def remove_output_file(dir_name, file_name): - try: - os.remove(get_fixture(dir_name, file_name)) - except: - pass