converters interface changed: WLDocument in, OutputFile out

author Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>

Mon, 5 Dec 2011 16:06:51 +0000 (17:06 +0100)

committer Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>

Mon, 5 Dec 2011 16:11:03 +0000 (17:11 +0100)
author Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
Mon, 5 Dec 2011 16:06:51 +0000 (17:06 +0100)
committer Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
Mon, 5 Dec 2011 16:11:03 +0000 (17:11 +0100)
diff --git a/librarian/__init__.py b/librarian/__init__.py

index 8f5cf1a..fdd6b55 100644 (file)
--- a/librarian/__init__.py
+++ b/librarian/__init__.py
@@ -4,6 +4,8 @@
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
  import os
+import re
+import shutil
  
  class ParseError(Exception):
      def __str__(self):
@@ -18,6 +20,11 @@ class ValidationError(Exception):
      pass
  
  class NoDublinCore(ValidationError):
+    """There's no DublinCore section, and it's required."""
+    pass
+
+class NoProvider(Exception):
+    """There's no DocProvider specified, and it's needed."""
      pass
  
  class XMLNamespace(object):
@@ -56,37 +63,61 @@ OPFNS = XMLNamespace("http://www.idpf.org/2007/opf")
  WLNS = EmptyNamespace()
  
  
+class WLURI(object):
+    """Represents a WL URI. Extracts slug and language from it."""
+
+    slug = None
+    language = None
+
+    _re_wl_uri = re.compile('http://wolnelektury.pl/katalog/lektura/'
+            '(?P<slug>[-a-z]+)(/(?P<lang>[a-z]{3})/?)?')
+
+    def __init__(self, uri):
+        self.uri = uri
+        match = self._re_wl_uri.match(uri)
+        assert match
+        self.slug = match.group('slug')
+        self.language = match.group('lang')
+
+
  class DocProvider(object):
-    """ Base class for a repository of XML files.
-        Used for generating joined files, like EPUBs
+    """Base class for a repository of XML files.
+
+    Used for generating joined files, like EPUBs.
      """
  
-    def by_slug(self, slug):
-        raise NotImplemented
+    def by_slug_and_lang(self, slug, lang=None):
+        """Should return a file-like object with a WL document XML."""
+        raise NotImplementedError
  
-    def __getitem__(self, slug):
-        return self.by_slug(slug)
+    def by_slug(self, slug):
+        """Should return a file-like object with a WL document XML."""
+        return self.by_slug_and_lang(slug)
  
      def by_uri(self, uri):
-        return self.by_slug(uri.rsplit('/', 1)[1])
+        """Should return a file-like object with a WL document XML."""
+        wluri = WLURI(uri)
+        return self.by_slug_and_lang(wluri.slug, wluri.language)
  
  
  class DirDocProvider(DocProvider):
      """ Serve docs from a directory of files in form <slug>.xml """
  
-    def __init__(self, dir):
-        self.dir = dir
+    def __init__(self, dir_):
+        self.dir = dir_
          self.files = {}
+        return super(DirDocProvider, self).__init__()
  
-    def by_slug(self, slug):
-        return open(os.path.join(self.dir, '%s.xml' % slug))
+    def by_slug_and_lang(self, slug, lang=None):
+        fname = "%s%s.xml" % (slug, ".%s" % lang if lang else "")
+        return open(os.path.join(self.dir, fname))
  
  
  import lxml.etree as etree
  import dcparser
  
  DEFAULT_BOOKINFO = dcparser.BookInfo(
-        { RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'}, \
+        { RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'},
          { DCNS('creator'): [u'Some, Author'],
            DCNS('title'): [u'Some Title'],
            DCNS('subject.period'): [u'Unknown'],
@@ -119,14 +150,15 @@ def wrap_text(ocrtext, creation_date, bookinfo=DEFAULT_BOOKINFO):
          method='xml', encoding=unicode, pretty_print=True)
  
      return u'<utwor>\n' + dcstring + u'\n<plain-text>\n' + ocrtext + \
-        u'\n</plain-text>\n</utwor>';
+        u'\n</plain-text>\n</utwor>'
  
  
  def serialize_raw(element):
      b = u'' + (element.text or '')
  
      for child in element.iterchildren():
-        e = etree.tostring(child, method='xml', encoding=unicode, pretty_print=True)
+        e = etree.tostring(child, method='xml', encoding=unicode,
+                pretty_print=True)
          b += e
  
      return b
@@ -141,3 +173,73 @@ def serialize_children(element, format='raw'):
  def get_resource(path):
      return os.path.join(os.path.dirname(__file__), path)
  
+
+class OutputFile(object):
+    """Represents a file returned by one of the converters."""
+
+    _string = None
+    _filename = None
+
+    def __del__(self):
+        if self._filename:
+            os.unlink(self._filename)
+
+    def __nonzero__(self):
+        return self._string is not None or self._filename is not None
+
+    @classmethod
+    def from_string(cls, string):
+        """Converter returns contents of a file as a string."""
+
+        instance = cls()
+        instance._string = string
+        return instance
+
+    @classmethod
+    def from_filename(cls, filename):
+        """Converter returns contents of a file as a named file."""
+
+        instance = cls()
+        instance._filename = filename
+        return instance
+
+    def get_string(self):
+        """Get file's contents as a string."""
+
+        if self._filename is not None:
+            with open(self._filename) as f:
+                return f.read()
+        else:
+            return self._string
+
+    def get_file(self):
+        """Get file as a file-like object."""
+
+        if self._string is not None:
+            from StringIO import StringIO
+            return StringIO(self._string)
+        elif self._filename is not None:
+            return open(self._filename)
+
+    def get_filename(self):
+        """Get file as a fs path."""
+
+        if self._filename is not None:
+            return self._filename
+        elif self._string is not None:
+            from tempfile import NamedTemporaryFile
+            temp = NamedTemporaryFile(prefix='librarian-', delete=False)
+            temp.write(self._string)
+            temp.close()
+            self._filename = temp.name
+            return self._filename
+        else:
+            return None
+
+    def save_as(self, path):
+        """Save file to a path. Create directories, if necessary."""
+
+        dirname = os.path.dirname(os.path.abspath(path))
+        if not os.path.isdir(dirname):
+            os.makedirs(dirname)
+        shutil.copy(self.get_filename(), path)
diff --git a/librarian/dcparser.py b/librarian/dcparser.py

index aa8f50d..5492f7a 100644 (file)
--- a/librarian/dcparser.py
+++ b/librarian/dcparser.py
@@ -7,7 +7,8 @@ from xml.parsers.expat import ExpatError
  from datetime import date
  import time
  
-from librarian import ValidationError, NoDublinCore, ParseError, DCNS, RDFNS
+from librarian import (ValidationError, NoDublinCore, ParseError, DCNS, RDFNS,
+                       WLURI)
  
  import lxml.etree as etree # ElementTree API using libxml2
  from lxml.etree import XMLSyntaxError
@@ -150,7 +151,7 @@ class BookInfo(object):
  
      @property
      def slug(self):
-        return self.url.rsplit('/', 1)[1]
+        return WLURI(self.url).slug
  
      @classmethod
      def from_string(cls, xml):
diff --git a/librarian/epub.py b/librarian/epub.py

index bb3123d..b063380 100644 (file)
--- a/librarian/epub.py
+++ b/librarian/epub.py
@@ -12,13 +12,10 @@ from StringIO import StringIO
  from copy import deepcopy
  from lxml import etree
  import zipfile
-from tempfile import mkdtemp
+from tempfile import mkdtemp, NamedTemporaryFile
  from shutil import rmtree
  
-import sys
-
-from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
-from librarian.dcparser import BookInfo
+from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
  
  from librarian import functions, get_resource
  
@@ -287,47 +284,40 @@ def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_s
      return output_html, toc, chars
  
  
-def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
+def transform(wldoc, verbose=False,
                style=None, html_toc=False,
                sample=None, cover=None, flags=None):
      """ produces a EPUB file
  
-    provider: a DocProvider
-    slug: slug of file to process, available by provider
-    output_file: file-like object or path to output file
-    output_dir: path to directory to save output file to; either this or output_file must be present
-    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
      sample=n: generate sample e-book (with at least n paragraphs)
      cover: a cover.Cover object
      flags: less-advertising, without-fonts
      """
  
-    def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
+    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
          """ processes one input file and proceeds to its children """
  
-        replace_characters(input_xml.getroot())
-
-        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
+        replace_characters(wldoc.edoc.getroot())
  
          # every input file will have a TOC entry,
          # pointing to starting chunk
-        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), "part%d.html" % chunk_counter)
+        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
          chars = set()
          if first:
              # write book title page
-            html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
+            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
              chars = used_chars(html_tree.getroot())
              zip.writestr('OPS/title.html',
                   etree.tostring(html_tree, method="html", pretty_print=True))
              # add a title page TOC entry
              toc.add(u"Strona tytułowa", "title.html")
-        elif children:
+        elif wldoc.book_info.parts:
              # write title page for every parent
              if sample is not None and sample <= 0:
                  chars = set()
                  html_string = open(get_resource('epub/emptyChunk.html')).read()
              else:
-                html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
+                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                  chars = used_chars(html_tree.getroot())
                  html_string = etree.tostring(html_tree, method="html", pretty_print=True)
              zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
@@ -335,12 +325,12 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir=
              add_to_spine(spine, chunk_counter)
              chunk_counter += 1
  
-        if len(input_xml.getroot()) > 1:
+        if len(wldoc.edoc.getroot()) > 1:
              # rdf before style master
-            main_text = input_xml.getroot()[1]
+            main_text = wldoc.edoc.getroot()[1]
          else:
              # rdf in style master
-            main_text = input_xml.getroot()[0]
+            main_text = wldoc.edoc.getroot()[0]
              if main_text.tag == RDFNS('RDF'):
                  main_text = None
  
@@ -361,51 +351,28 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir=
                  add_to_spine(spine, chunk_counter)
                  chunk_counter += 1
  
-        if children:
-            for child in children:
-                child_xml = etree.parse(provider.by_uri(child))
-                child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
-                toc.append(child_toc)
-                chars = chars.union(chunk_chars)
+        for child in wldoc.parts():
+            child_toc, chunk_counter, chunk_chars, sample = transform_file(
+                child, chunk_counter, first=False, sample=sample)
+            toc.append(child_toc)
+            chars = chars.union(chunk_chars)
  
          return toc, chunk_counter, chars, sample
  
-    # read metadata from the first file
-    if file_path:
-        if slug:
-            raise ValueError('slug or file_path should be specified, not both')
-        f = open(file_path, 'r')
-        input_xml = etree.parse(f)
-        f.close()
-    else:
-        if not slug:
-            raise ValueError('either slug or file_path should be specified')
-        input_xml = etree.parse(provider[slug])
+
+    document = deepcopy(wldoc)
+    del wldoc
  
      if flags:
          for flag in flags:
-            input_xml.getroot().set(flag, 'yes')
-
-    metadata = input_xml.find('.//'+RDFNS('Description'))
-    if metadata is None:
-        raise NoDublinCore('Document has no DublinCore - which is required.')
-    book_info = BookInfo.from_element(input_xml)
-    metadata = etree.ElementTree(metadata)
-
-    # if output to dir, create the file
-    if output_dir is not None:
-        if make_dir:
-            author = unicode(book_info.author)
-            output_dir = os.path.join(output_dir, author)
-            try:
-                os.makedirs(output_dir)
-            except OSError:
-                pass
-        if slug:
-            output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
-        else:
-            output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
+            document.edoc.getroot().set(flag, 'yes')
+
+    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
+    manifest = opf.find('.//' + OPFNS('manifest'))
+    guide = opf.find('.//' + OPFNS('guide'))
+    spine = opf.find('.//' + OPFNS('spine'))
  
+    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
      zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
  
      # write static elements
@@ -425,14 +392,10 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir=
          style = get_resource('epub/style.css')
      zip.write(style, os.path.join('OPS', 'style.css'))
  
-    opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
-    manifest = opf.find('.//' + OPFNS('manifest'))
-    guide = opf.find('.//' + OPFNS('guide'))
-    spine = opf.find('.//' + OPFNS('spine'))
  
      if cover:
          cover_file = StringIO()
-        c = cover(book_info.author.readable(), book_info.title)
+        c = cover(document.book_info.author.readable(), document.book_info.title)
          c.save(cover_file)
          c_name = 'cover.%s' % c.ext()
          zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
@@ -468,7 +431,7 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir=
              '<itemref idref="html_toc" />'))
          guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
  
-    toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
+    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
  
      if len(toc.children) < 2:
          toc.add(u"Początek utworu", "part1.html")
@@ -491,7 +454,7 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir=
          '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
      spine.append(etree.fromstring(
          '<itemref idref="last" />'))
-    html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
+    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
      chars.update(used_chars(html_tree.getroot()))
      zip.writestr('OPS/last.html', etree.tostring(
                          html_tree, method="html", pretty_print=True))
@@ -517,8 +480,7 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir=
          os.chdir(cwd)
  
      zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
-    contents = []
-    title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
+    title = document.book_info.title
      attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
      for st in attributes:
          meta = toc_file.makeelement(NCXNS('meta'))
@@ -536,3 +498,5 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir=
      toc.write_to_xml(nav_map)
      zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
      zip.close()
+
+    return OutputFile.from_filename(output_file.name)
diff --git a/librarian/html.py b/librarian/html.py

index 5974d93..997f904 100644 (file)
--- a/librarian/html.py
+++ b/librarian/html.py
@@ -5,12 +5,10 @@
  #
  import os
  import cStringIO
-import re
  import copy
  
  from lxml import etree
-from librarian.parser import WLDocument
-from librarian import XHTMLNS, ParseError
+from librarian import XHTMLNS, ParseError, OutputFile
  from librarian import functions
  
  from lxml.etree import XMLSyntaxError, XSLTApplyError
@@ -30,9 +28,8 @@ def get_stylesheet(name):
  def html_has_content(text):
      return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
  
-def transform(input, output_filename=None, is_file=True, \
-    parse_dublincore=True, stylesheet='legacy', options={}, flags=None):
-    """Transforms file input_filename in XML to output_filename in XHTML.
+def transform(wldoc, stylesheet='legacy', options=None, flags=None):
+    """Transforms the WL document to XHTML.
  
      If output_filename is None, returns an XML,
      otherwise returns True if file has been written,False if it hasn't.
@@ -43,12 +40,9 @@ def transform(input, output_filename=None, is_file=True, \
          style_filename = get_stylesheet(stylesheet)
          style = etree.parse(style_filename)
  
-        if is_file:
-            document = WLDocument.from_file(input, True, \
-                parse_dublincore=parse_dublincore)
-        else:
-            document = WLDocument.from_string(input, True, \
-                parse_dublincore=parse_dublincore)
+        document = copy.deepcopy(wldoc)
+        del wldoc
+        document.swap_endlines()
  
          if flags:
              for flag in flags:
@@ -56,6 +50,8 @@ def transform(input, output_filename=None, is_file=True, \
  
          document.clean_ed_note()
  
+        if not options:
+            options = {}
          result = document.transform(style, **options)
          del document # no longer needed large object :)
  
@@ -63,16 +59,10 @@ def transform(input, output_filename=None, is_file=True, \
              add_anchors(result.getroot())
              add_table_of_contents(result.getroot())
  
-            if output_filename is not None:
-                result.write(output_filename, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8')
-            else:
-                return result
-            return True
+            return OutputFile.from_string(etree.tostring(result, method='html',
+                xml_declaration=False, pretty_print=True, encoding='utf-8'))
          else:
-            if output_filename is not None:
-                return False
-            else:
-                return "<empty />"
+            return None
      except KeyError:
          raise ValueError("'%s' is not a valid stylesheet.")
      except (XMLSyntaxError, XSLTApplyError), e:
diff --git a/librarian/mobi.py b/librarian/mobi.py

index cd894fe..a93315e 100755 (executable)
--- a/librarian/mobi.py
+++ b/librarian/mobi.py
@@ -4,60 +4,25 @@
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
  import os
-import os.path
  import subprocess
  from tempfile import NamedTemporaryFile
-from lxml import etree
  
+from librarian import OutputFile
  from librarian.cover import WLCover
-from librarian import epub, get_resource, NoDublinCore, RDFNS
-from librarian.dcparser import BookInfo
+from librarian import get_resource
  
  
-def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
+def transform(wldoc, verbose=False,
                sample=None, cover=None, flags=None):
      """ produces a MOBI file
  
-    provider: a DocProvider
-    slug: slug of file to process, available by provider
-    output_file: path to output file
-    output_dir: path to directory to save output file to; either this or output_file must be present
-    make_dir: writes output to <output_dir>/<author>/<slug>.mobi instead of <output_dir>/<slug>.mobi
+    wldoc: a WLDocument
      sample=n: generate sample e-book (with at least n paragraphs)
      cover: a cover.Cover object
      flags: less-advertising,
      """
  
-    # read metadata from the first file
-    if file_path:
-        if slug:
-            raise ValueError('slug or file_path should be specified, not both')
-        f = open(file_path, 'r')
-        input_xml = etree.parse(f)
-        f.close()
-    else:
-        if not slug:
-            raise ValueError('either slug or file_path should be specified')
-        input_xml = etree.parse(provider[slug])
-
-    metadata = input_xml.find('.//'+RDFNS('Description'))
-    if metadata is None:
-        raise NoDublinCore('Document has no DublinCore - which is required.')
-    book_info = BookInfo.from_element(input_xml)
-
-    # if output to dir, create the file
-    if output_dir is not None:
-        if make_dir:
-            author = unicode(book_info.author)
-            output_dir = os.path.join(output_dir, author)
-            try:
-                os.makedirs(output_dir)
-            except OSError:
-                pass
-        if slug:
-            output_file = os.path.join(output_dir, '%s.mobi' % slug)
-        else:
-            output_file = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.mobi')
+    book_info = wldoc.book_info
  
      # provide a cover by default
      if not cover:
@@ -66,19 +31,21 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir=
      c = cover(book_info.author.readable(), book_info.title)
      c.save(cover_file)
  
-    epub_file = NamedTemporaryFile(suffix='.epub', delete=False)
      if not flags:
          flags = []
      flags = list(flags) + ['without-fonts']
-    epub.transform(provider, file_path=file_path, output_file=epub_file, verbose=verbose,
-              sample=sample, html_toc=True, flags=flags, style=get_resource('mobi/style.css'))
+    epub = wldoc.as_epub(verbose=verbose, sample=sample, html_toc=True,
+            flags=flags, style=get_resource('mobi/style.css'))
  
      if verbose:
          kwargs = {}
      else:
          devnull = open("/dev/null", 'w')
          kwargs = {"stdout": devnull, "stderr": devnull}
-    subprocess.check_call(['ebook-convert', epub_file.name, output_file,
+
+    output_file = NamedTemporaryFile(prefix='librarian', suffix='.mobi', delete=False)
+    output_file.close()
+    subprocess.check_call(['ebook-convert', epub.get_filename(), output_file.name,
              '--no-inline-toc', '--cover=%s' % cover_file.name], **kwargs)
-    os.unlink(epub_file.name)
      os.unlink(cover_file.name)
+    return OutputFile.from_filename(output_file.name)
\ No newline at end of file
diff --git a/librarian/packagers.py b/librarian/packagers.py

index 054f068..ebeb5b3 100644 (file)
--- a/librarian/packagers.py
+++ b/librarian/packagers.py
@@ -6,8 +6,8 @@
  import os
  from copy import deepcopy
  from lxml import etree
-from librarian import epub, pdf, DirDocProvider, ParseError, cover
-from librarian.dcparser import BookInfo
+from librarian import pdf, epub, DirDocProvider, ParseError, cover
+from librarian.parser import WLDocument
  
  
  class Packager(object):
@@ -26,8 +26,11 @@ class Packager(object):
              except:
                  pass
          outfile = os.path.join(output_dir, slug + '.' + cls.ext)
-        cls.converter.transform(provider, file_path=main_input, output_file=outfile,
+
+        doc = WLDocument.from_file(main_input, provider=provider)
+        output_file = cls.converter.transform(doc,
                  cover=cls.cover, flags=cls.flags)
+        doc.save_output_file(output_file, output_path=outfile)
  
  
      @classmethod
@@ -78,7 +81,6 @@ class VirtualoEpubPackager(Packager):
          """ truncates text to at most `limit' bytes in utf-8 """
          if text is None:
              return text
-        orig_text = text
          if len(text.encode('utf-8')) > limit:
              newlimit = limit - 3
              while len(text.encode('utf-8')) > newlimit:
@@ -116,7 +118,8 @@ class VirtualoEpubPackager(Packager):
                  outfile_dir = os.path.join(output_dir, slug)
                  os.makedirs(os.path.join(output_dir, slug))
  
-                info = BookInfo.from_file(main_input)
+                doc = WLDocument.from_file(main_input, provider=provider)
+                info = doc.book_info
  
                  product_elem = deepcopy(product)
                  product_elem[0].text = cls.utf_trunc(slug, 100)
@@ -133,8 +136,10 @@ class VirtualoEpubPackager(Packager):
                      ).save(os.path.join(outfile_dir, slug+'.jpg'))
                  outfile = os.path.join(outfile_dir, '1.epub')
                  outfile_sample = os.path.join(outfile_dir, '1.sample.epub')
-                epub.transform(provider, file_path=main_input, output_file=outfile)
-                epub.transform(provider, file_path=main_input, output_file=outfile_sample, sample=25)
+                doc.save_output_file(epub.transform(doc),
+                        output_path=outfile)
+                doc.save_output_file(epub.transform(doc, sample=25), 
+                        output_path=outfile_sample)
          except ParseError, e:
              print '%(file)s:%(name)s:%(message)s' % {
                  'file': main_input,
diff --git a/librarian/parser.py b/librarian/parser.py

index afc4f1a..469b7df 100644 (file)
--- a/librarian/parser.py
+++ b/librarian/parser.py
@@ -3,7 +3,7 @@
  # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
-from librarian import ValidationError, NoDublinCore,  ParseError
+from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
  from librarian import RDFNS
  from librarian import dcparser
  
@@ -11,14 +11,17 @@ from xml.parsers.expat import ExpatError
  from lxml import etree
  from lxml.etree import XMLSyntaxError, XSLTApplyError
  
+import os
  import re
  from StringIO import StringIO
  
  class WLDocument(object):
-    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE);
+    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
+    provider = None
  
-    def __init__(self, edoc, parse_dublincore=True):
+    def __init__(self, edoc, parse_dublincore=True, provider=None):
          self.edoc = edoc
+        self.provider = provider
  
          root_elem = edoc.getroot()
  
@@ -42,7 +45,7 @@ class WLDocument(object):
          return cls.from_file(StringIO(xml), *args, **kwargs)
  
      @classmethod
-    def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True):
+    def from_file(cls, xmlfile, parse_dublincore=True, provider=None):
  
          # first, prepare for parsing
          if isinstance(xmlfile, basestring):
@@ -63,20 +66,17 @@ class WLDocument(object):
              parser = etree.XMLParser(remove_blank_text=False)
              tree = etree.parse(StringIO(data.encode('utf-8')), parser)
  
-            if swap_endlines:
-                cls.swap_endlines(tree)
-
-            return cls(tree, parse_dublincore=parse_dublincore)
+            return cls(tree, parse_dublincore=parse_dublincore, provider=provider)
          except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
              raise ParseError(e)
  
-    @classmethod
-    def swap_endlines(cls, tree):
+    def swap_endlines(self):
+        """Converts line breaks in stanzas into <br/> tags."""
          # only swap inside stanzas
-        for elem in tree.iter('strofa'):
+        for elem in self.edoc.iter('strofa'):
              for child in list(elem):
                  if child.tail:
-                    chunks = cls.LINE_SWAP_EXPR.split(child.tail)
+                    chunks = self.LINE_SWAP_EXPR.split(child.tail)
                      ins_index = elem.index(child) + 1
                      while len(chunks) > 1:
                          ins = etree.Element('br')
@@ -84,13 +84,22 @@ class WLDocument(object):
                          elem.insert(ins_index, ins)
                      child.tail = chunks.pop(0)
              if elem.text:
-                chunks = cls.LINE_SWAP_EXPR.split(elem.text)
+                chunks = self.LINE_SWAP_EXPR.split(elem.text)
                  while len(chunks) > 1:
                      ins = etree.Element('br')
                      ins.tail = chunks.pop()
                      elem.insert(0, ins)
                  elem.text = chunks.pop(0)
  
+    def parts(self):
+        if self.provider is None:
+            raise NoProvider('No document provider supplied.')
+        if self.book_info is None:
+            raise NoDublinCore('No Dublin Core in document.')
+        for part_uri in self.book_info.parts:
+            yield self.from_file(self.provider.by_uri(part_uri),
+                    provider=self.provider)
+
      def chunk(self, path):
          # convert the path to XPath
          expr = self.path_to_xpath(path)
@@ -152,3 +161,40 @@ class WLDocument(object):
              node.clear()
              node.tag = 'span'
              node.tail = tail
+
+    # Converters
+
+    def as_html(self, *args, **kwargs):
+        from librarian import html
+        return html.transform(self, *args, **kwargs)
+
+    def as_text(self, *args, **kwargs):
+        from librarian import text
+        return text.transform(self, *args, **kwargs)
+
+    def as_epub(self, *args, **kwargs):
+        from librarian import epub
+        return epub.transform(self, *args, **kwargs)
+
+    def as_pdf(self, *args, **kwargs):
+        from librarian import pdf
+        return pdf.transform(self, *args, **kwargs)
+
+    def as_mobi(self, *args, **kwargs):
+        from librarian import mobi
+        return mobi.transform(self, *args, **kwargs)
+
+    def save_output_file(self, output_file, output_path=None,
+            output_dir_path=None, make_author_dir=False, ext=None):
+        if output_dir_path:
+            save_path = output_dir_path
+            if make_author_dir:
+                save_path = os.path.join(save_path,
+                        unicode(self.book_info.author).encode('utf-8'))
+            save_path = os.path.join(save_path, self.book_info.slug)
+            if ext:
+                save_path += '.%s' % ext
+        else:
+            save_path = output_path
+
+        output_file.save_as(save_path)
diff --git a/librarian/pdf.py b/librarian/pdf.py

index 1bfd949..02438a6 100644 (file)
--- a/librarian/pdf.py
+++ b/librarian/pdf.py
@@ -8,20 +8,18 @@ import os
  import os.path
  import shutil
  from StringIO import StringIO
-from tempfile import mkdtemp
+from tempfile import mkdtemp, NamedTemporaryFile
  import re
  from copy import deepcopy
  from subprocess import call, PIPE
  
-import sys
-
  from Texml.processor import process
  from lxml import etree
  from lxml.etree import XMLSyntaxError, XSLTApplyError
  
  from librarian.dcparser import Person
  from librarian.parser import WLDocument
-from librarian import ParseError, DCNS, get_resource
+from librarian import ParseError, DCNS, get_resource, OutputFile
  from librarian import functions
  
  
@@ -173,17 +171,11 @@ def package_available(package, args='', verbose=False):
      return p == 0
  
  
-def transform(provider, slug=None, file_path=None,
-              output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None, morefloats=None,
+def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
                cover=None, flags=None, customizations=None):
      """ produces a PDF file with XeLaTeX
  
-    provider: a DocProvider
-    slug: slug of file to process, available by provider
-    file_path can be provided instead of a slug
-    output_file: file-like object or path to output file
-    output_dir: path to directory to save output file to; either this or output_file must be present
-    make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
+    wldoc: a WLDocument
      verbose: prints all output from LaTeX
      save_tex: path to save the intermediary LaTeX file to
      morefloats (old/new/none): force specific morefloats
@@ -194,14 +186,7 @@ def transform(provider, slug=None, file_path=None,
  
      # Parse XSLT
      try:
-        if file_path:
-            if slug:
-                raise ValueError('slug or file_path should be specified, not both')
-            document = load_including_children(provider, file_path=file_path)
-        else:
-            if not slug:
-                raise ValueError('either slug or file_path should be specified')
-            document = load_including_children(provider, slug=slug)
+        document = load_including_children(wldoc)
  
          if cover:
              document.edoc.getroot().set('data-cover-width', str(cover.width))
@@ -227,11 +212,6 @@ def transform(provider, slug=None, file_path=None,
          substitute_hyphens(document.edoc)
          fix_hanging(document.edoc)
  
-        # find output dir
-        if make_dir and output_dir is not None:
-            author = unicode(document.book_info.author)
-            output_dir = os.path.join(output_dir, author)
-
          # wl -> TeXML
          style_filename = get_stylesheet("wl2tex")
          style = etree.parse(style_filename)
@@ -273,56 +253,38 @@ def transform(provider, slug=None, file_path=None,
  
          os.chdir(cwd)
  
-        # save the PDF
+        output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
          pdf_path = os.path.join(temp, 'doc.pdf')
-        if output_dir is not None:
-            try:
-                os.makedirs(output_dir)
-            except OSError:
-                pass
-            if slug:
-                output_path = os.path.join(output_dir, '%s.pdf' % slug)
-            else:
-                output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
-            shutil.move(pdf_path, output_path)
-        else:
-            if hasattr(output_file, 'write'):
-                # file-like object
-                with open(pdf_path) as f:
-                    output_file.write(f.read())
-                output_file.close()
-            else:
-                # path to output file
-                shutil.copy(pdf_path, output_file)
+        shutil.move(pdf_path, output_file.name)
          shutil.rmtree(temp)
+        return OutputFile.from_filename(output_file.name)
  
      except (XMLSyntaxError, XSLTApplyError), e:
          raise ParseError(e)
  
  
-def load_including_children(provider, slug=None, uri=None, file_path=None):
-    """ makes one big xml file with children inserted at end
-    either slug or uri must be provided
+def load_including_children(wldoc=None, provider=None, uri=None):
+    """ Makes one big xml file with children inserted at end.
+    
+    Either wldoc or provider and URI must be provided.
      """
  
-    if uri:
+    if uri and provider:
          f = provider.by_uri(uri)
-    elif slug:
-        f = provider[slug]
-    elif file_path:
-        f = open(file_path, 'r')
+        text = f.read().decode('utf-8')
+        f.close()
+    elif wldoc is not None:
+        text = etree.tostring(wldoc.edoc, encoding=unicode)
+        provider = wldoc.provider
      else:
-        raise ValueError('Neither slug, URI nor file path provided for a book.')
+        raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
  
-    text = f.read().decode('utf-8')
      text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  
-    document = WLDocument.from_string(text, True,
-        parse_dublincore=True)
+    document = WLDocument.from_string(text, parse_dublincore=True)
+    document.swap_endlines()
  
-    f.close()
      for child_uri in document.book_info.parts:
-        print child_uri
-        child = load_including_children(provider, uri=child_uri)
+        child = load_including_children(provider=provider, uri=child_uri)
          document.edoc.getroot().append(child.edoc.getroot())
      return document
diff --git a/librarian/text.py b/librarian/text.py

index c23bcd6..d99e7cf 100644 (file)
--- a/librarian/text.py
+++ b/librarian/text.py
@@ -3,7 +3,8 @@
  # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
-from librarian import dcparser, parser, functions
+import copy
+from librarian import functions, OutputFile
  from lxml import etree
  import os
  
@@ -28,7 +29,7 @@ Utwór opracowany został w ramach projektu Wolne Lektury przez fundację Nowocz
  %(description)s%(contributors)s
  """
  
-def transform(input_file, output_file, parse_dublincore=True, flags=None, **options):
+def transform(wldoc, flags=None, **options):
      """
      Transforms input_file in XML to output_file in TXT.
      possible flags: raw-text,
@@ -37,7 +38,9 @@ def transform(input_file, output_file, parse_dublincore=True, flags=None, **opti
      style_filename = os.path.join(os.path.dirname(__file__), 'xslt/book2txt.xslt')
      style = etree.parse(style_filename)
  
-    document = parser.WLDocument.from_file(input_file, True, parse_dublincore=parse_dublincore)
+    document = copy.deepcopy(wldoc)
+    del wldoc
+    document.swap_endlines()
  
      if flags:
          for flag in flags:
@@ -46,10 +49,10 @@ def transform(input_file, output_file, parse_dublincore=True, flags=None, **opti
      result = document.transform(style, **options)
  
      if not flags or 'raw-text' not in flags:
-        if parse_dublincore:
-            parsed_dc = dcparser.BookInfo.from_element(document.edoc)
+        if document.book_info:
+            parsed_dc = document.book_info
              description = parsed_dc.description
-            url = parsed_dc.url
+            url = document.book_info.url
      
              license_description = parsed_dc.license_description
              license = parsed_dc.license
@@ -75,7 +78,7 @@ def transform(input_file, output_file, parse_dublincore=True, flags=None, **opti
              license_description = ""
              source = ""
              contributors = ""
-        output_file.write((TEMPLATE % {
+        return OutputFile.from_string((TEMPLATE % {
              'description': description,
              'url': url,
              'license_description': license_description,
@@ -84,5 +87,5 @@ def transform(input_file, output_file, parse_dublincore=True, flags=None, **opti
              'contributors': contributors,
          }).encode('utf-8'))
      else:
-        output_file.write(unicode(result).encode('utf-8'))
+        return OutputFile.from_string(unicode(result).encode('utf-8'))
  
diff --git a/scripts/book2epub b/scripts/book2epub

index 9adf4b4..9af3692 100755 (executable)
--- a/scripts/book2epub
+++ b/scripts/book2epub
@@ -7,7 +7,8 @@
  import os.path
  import optparse
  
-from librarian import epub, DirDocProvider, ParseError
+from librarian import DirDocProvider, ParseError
+from librarian.parser import WLDocument
  
  
  if __name__ == '__main__':
@@ -37,18 +38,20 @@ if __name__ == '__main__':
          for main_input in input_filenames:
              if options.verbose:
                  print main_input
+
              path, fname = os.path.realpath(main_input).rsplit('/', 1)
              provider = DirDocProvider(path)
-
-            output_dir = output_file = None
-            if options.output_dir:
-                output_dir = options.output_dir
-            elif options.output_file:
-                output_file = options.output_file
+            if not (options.output_file or options.output_dir):
+                output_file = os.path.splitext(main_input)[0] + '.epub'
              else:
-                output_dir = path
+                output_file = options.output_file  # stays None if only output_dir was given
+
+            doc = WLDocument.from_file(main_input, provider=provider)
+            epub = doc.as_epub()
+
+            doc.save_output_file(epub,
+                output_file, options.output_dir, options.make_dir, 'epub')
  
-            epub.transform(provider, file_path=main_input, output_dir=output_dir, output_file=output_file, make_dir=options.make_dir)
      except ParseError, e:
          print '%(file)s:%(name)s:%(message)s' % {
              'file': main_input,
diff --git a/scripts/book2html b/scripts/book2html

index d61b299..1e88823 100755 (executable)
--- a/scripts/book2html
+++ b/scripts/book2html
@@ -7,7 +7,8 @@
  import os
  import optparse
  
-from librarian import html, ParseError
+from librarian import ParseError
+from librarian.parser import WLDocument
  
  
  if __name__ == '__main__':
@@ -35,7 +36,10 @@ if __name__ == '__main__':
  
          output_filename = os.path.splitext(input_filename)[0] + '.html'
          try:
-            html.transform(input_filename, output_filename, parse_dublincore=options.parse_dublincore, flags=('full-page',))
+            doc = WLDocument.from_file(input_filename,
+                parse_dublincore=options.parse_dublincore)
+            html = doc.as_html(flags=('full-page',))
+            doc.save_output_file(html, output_path=output_filename)
          except ParseError, e:
              print '%(file)s:%(name)s:%(message)s' % {
                  'file': input_filename,
diff --git a/scripts/book2ihtml b/scripts/book2ihtml

index 97d8ebd..779f245 100755 (executable)
--- a/scripts/book2ihtml
+++ b/scripts/book2ihtml
@@ -7,7 +7,8 @@
  import os
  import optparse
  
-from librarian import html, ParseError
+from librarian import ParseError
+from librarian.parser import WLDocument
  
  
  if __name__ == '__main__':
@@ -35,8 +36,10 @@ if __name__ == '__main__':
  
          output_filename = os.path.splitext(input_filename)[0] + '.html'
          try:
-            html.transform(input_filename, output_filename, parse_dublincore=options.parse_dublincore,\
-                stylesheet='partial')
+            doc = WLDocument.from_file(input_filename,
+                parse_dublincore=options.parse_dublincore)
+            html = doc.as_html(stylesheet='partial')  # no flags, as before the refactor
+            doc.save_output_file(html, output_path=output_filename)
          except ParseError, e:
              print '%(file)s:%(name)s:%(message)s' % {
                  'file': input_filename,
diff --git a/scripts/book2mobi b/scripts/book2mobi

index 1c00b51..665dcfa 100755 (executable)
--- a/scripts/book2mobi
+++ b/scripts/book2mobi
@@ -7,7 +7,8 @@
  import os.path
  import optparse
  
-from librarian import mobi, DirDocProvider, ParseError
+from librarian import DirDocProvider, ParseError
+from librarian.parser import WLDocument
  
  
  if __name__ == '__main__':
@@ -35,20 +36,18 @@ if __name__ == '__main__':
      # Do some real work
      try:
          for main_input in input_filenames:
-            if options.verbose:
-                print main_input
              path, fname = os.path.realpath(main_input).rsplit('/', 1)
              provider = DirDocProvider(path)
-
-            output_dir = output_file = None
-            if options.output_dir:
-                output_dir = options.output_dir
-            elif options.output_file:
-                output_file = options.output_file
+            if not (options.output_file or options.output_dir):
+                output_file = os.path.splitext(main_input)[0] + '.mobi'
              else:
-                output_dir = path
+                output_file = options.output_file  # stays None if only output_dir was given
+
+            doc = WLDocument.from_file(main_input, provider=provider)
+            mobi = doc.as_mobi()
  
-            mobi.transform(provider, file_path=main_input, output_dir=output_dir, output_file=output_file, make_dir=options.make_dir)
+            doc.save_output_file(mobi,
+                output_file, options.output_dir, options.make_dir, 'mobi')
      except ParseError, e:
          print '%(file)s:%(name)s:%(message)s' % {
              'file': main_input,
diff --git a/scripts/book2pdf b/scripts/book2pdf

index d10f400..171264b 100755 (executable)
--- a/scripts/book2pdf
+++ b/scripts/book2pdf
@@ -6,7 +6,10 @@
  #
  import os.path
  from optparse import OptionParser
-from librarian import pdf, DirDocProvider, ParseError
+
+from librarian import DirDocProvider, ParseError
+from librarian.parser import WLDocument
+
  
  if __name__ == '__main__':
      usage = """Usage: %prog [options] SOURCE [SOURCE...]
@@ -31,33 +34,25 @@ if __name__ == '__main__':
          parser.print_help()
          exit(1)
  
-    try:
-        if options.output_dir and options.output_file:
-            raise ValueError("Either --output-dir or --output file should be specified")
+    if options.output_dir and options.output_file:
+        raise ValueError("Either --output-dir or --output file should be specified")
  
+    try:
          for main_input in args:
-            if options.verbose:
-                print main_input
              path, fname = os.path.realpath(main_input).rsplit('/', 1)
              provider = DirDocProvider(path)
-
-            output_file = output_dir = None
-            if options.output_dir:
-                output_dir = options.output_dir
-            elif options.output_file:
-                output_file = options.output_file
+            output_file, output_dir = options.output_file, options.output_dir
+            if not (output_file or output_dir):
+                output_file = os.path.splitext(main_input)[0] + '.pdf'
              else:
-                output_dir = path
+                output_file = options.output_file  # stays None if only output_dir was given
+
+            doc = WLDocument.from_file(main_input, provider=provider)
+            pdf = doc.as_pdf(verbose=options.verbose,  # the ParseError message tells users to use -v
+                        save_tex=options.save_tex, morefloats=options.morefloats)
  
-            pdf.transform(provider,
-                file_path=main_input,
-                output_file=output_file,
-                output_dir=output_dir,
-                verbose=options.verbose,
-                make_dir=options.make_dir,
-                save_tex=options.save_tex,
-                morefloats=options.morefloats
-                )
+            doc.save_output_file(pdf,
+                output_file, options.output_dir, options.make_dir, 'pdf')
      except ParseError, e:
          print '%(file)s:%(name)s:%(message)s; use -v to see more output' % {
              'file': main_input,
diff --git a/scripts/book2txt b/scripts/book2txt

index d56d6ff..9cfdef2 100755 (executable)
--- a/scripts/book2txt
+++ b/scripts/book2txt
@@ -7,8 +7,8 @@
  import os
  import optparse
  
-from librarian import text
-from librarian import dcparser, ParseError
+from librarian import ParseError
+from librarian.parser import WLDocument
  
  
  if __name__ == '__main__':
@@ -38,9 +38,10 @@ if __name__ == '__main__':
  
          output_filename = os.path.splitext(input_filename)[0] + '.txt'
          try:
-            output_file = open(output_filename, 'w')
-            text.transform(open(input_filename), output_file, parse_dublincore=options.parse_dublincore,
-                wrapping=str(options.wrapping))
+            doc = WLDocument.from_file(input_filename,
+                parse_dublincore=options.parse_dublincore)
+            text = doc.as_text(wrapping=str(options.wrapping))
+            doc.save_output_file(text, output_path=output_filename)
          except ParseError, e:
              print '%(file)s:%(name)s:%(message)s' % {
                  'file': input_filename,
diff --git a/setup.py b/setup.py

index 1394643..023c943 100755 (executable)
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,7 @@ def whole_tree(prefix, path):
  
  setup(
      name='librarian',
-    version='1.3',
+    version='1.4',
      description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats',
      author="Marek Stępniowski",
      author_email='marek@stepniowski.com',
diff --git a/tests/files/text/asnyk_miedzy_nami.xml b/tests/files/text/asnyk_miedzy_nami.xml

deleted file mode 100644 (file)

index 36d8df6..0000000
--- a/tests/files/text/asnyk_miedzy_nami.xml
+++ /dev/null
@@ -1,65 +0,0 @@
-<?xml version='1.0' encoding='utf-8'?>
-<utwor>
-  <liryka_lp>
-
-<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/">
-<rdf:Description rdf:about="http://wiki.wolnepodreczniki.pl/Lektury:Asnyk/Między_nami_nic_nie_było">
-<dc:creator xml:lang="pl">Asnyk, Adam</dc:creator>
-<dc:title xml:lang="pl">Między nami nic nie było</dc:title>
-<dc:contributor.editor xml:lang="pl" />
-<dc:contributor.editor xml:lang="pl">Sekuła, Aleksandra</dc:contributor.editor>
-<dc:contributor.technical_editor xml:lang="pl">Sutkowska, Olga</dc:contributor.technical_editor>
-<dc:publisher xml:lang="pl">Fundacja Nowoczesna Polska</dc:publisher>
-<dc:subject.period xml:lang="pl">Pozytywizm</dc:subject.period>
-<dc:subject.type xml:lang="pl">Liryka</dc:subject.type>
-<dc:subject.genre xml:lang="pl">Wiersz</dc:subject.genre>
-<dc:description xml:lang="pl">Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN.</dc:description>
-<dc:identifier.url xml:lang="pl">http://wolnelektury.pl/katalog/lektura/miedzy-nami-nic-nie-bylo</dc:identifier.url>
-<dc:source.URL xml:lang="pl">http://www.polona.pl/Content/5164</dc:source.URL>
-<dc:source xml:lang="pl">(Asnyk, Adam) El...y (1838-1897), Poezye, t. 3,  Gebethner i Wolff, wyd. nowe poprzedzone słowem wstępnym St. Krzemińskiego, Warszawa, 1898</dc:source>
-<dc:rights xml:lang="pl">Domena publiczna - Adam Asnyk zm. 1897</dc:rights>
-<dc:date.pd xml:lang="pl">1897</dc:date.pd>
-<dc:format xml:lang="pl">xml</dc:format>
-<dc:type xml:lang="pl">text</dc:type>
-<dc:type xml:lang="en">text</dc:type>
-<dc:date xml:lang="pl">2007-09-06</dc:date>
-<dc:audience xml:lang="pl">L</dc:audience>
-<dc:language xml:lang="pl">pol</dc:language>
-</rdf:Description>
-</rdf:RDF>
-
-
-<autor_utworu>Adam Asnyk</autor_utworu>
-
-<nazwa_utworu><begin id="b1189062500041"/><motyw id="m1189062500041">Miłość platoniczna</motyw>Między nami nic nie było</nazwa_utworu>
-
-
-
-<strofa>Między nami nic nie było!/
-Żadnych zwierzeń, wyznań żadnych!/
-Nic nas z sobą nie łączyło ---/
-Prócz wiosennych marzeń zdradnych;</strofa>
-
-
-
-<strofa><begin id="b1189062528872"/><motyw id="m1189062528872">Natura</motyw>Prócz tych woni, barw i blasków,/
-Unoszących się w przestrzeni;/
-Prócz szumiących śpiewem lasków/
-I tej świeżej łąk zieleni;</strofa>
-
-
-
-<strofa>Prócz tych kaskad i potoków,/
-Zraszających każdy parów,/
-Prócz girlandy tęcz, obłoków,/
-Prócz natury słodkich czarów;</strofa>
-
-
-
-<strofa>Prócz tych wspólnych, jasnych zdrojów,/
-Z których serce zachwyt piło;/
-Prócz pierwiosnków i powojów,---/
-Między nami nic nie było!<end id="e1189062528872"/><end id="e1189062500041"/></strofa>
-
-</liryka_lp>
-</utwor>
diff --git a/tests/files/text/asnyk_zbior.xml b/tests/files/text/asnyk_zbior.xml

new file mode 100755 (executable)

index 0000000..c585a8b
--- /dev/null
+++ b/tests/files/text/asnyk_zbior.xml
@@ -0,0 +1,29 @@
+<?xml version='1.0' encoding='utf-8'?>
+<utwor>
+
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/">
+<rdf:Description rdf:about="http://redakcja.wolnelektury.pl/documents/book/asnyk-poezye/">
+<dc:creator xml:lang="pl">Asnyk, Adam</dc:creator>
+<dc:title xml:lang="pl">Poezye</dc:title>
+<dc:publisher xml:lang="pl">Fundacja Nowoczesna Polska</dc:publisher>
+<dc:subject.period xml:lang="pl">Pozytywizm</dc:subject.period>
+<dc:subject.type xml:lang="pl">Liryka</dc:subject.type>
+<dc:subject.genre xml:lang="pl">Wiersz</dc:subject.genre>
+<dc:description xml:lang="pl">Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN.</dc:description>
+<dc:identifier.url xml:lang="pl">http://wolnelektury.pl/katalog/lektura/poezye</dc:identifier.url>
+<dc:relation.hasPart xml:lang="pl">http://wolnelektury.pl/katalog/lektura/miedzy-nami-nic-nie-bylo</dc:relation.hasPart>
+<dc:source.URL xml:lang="pl">http://www.polona.pl/Content/5164</dc:source.URL>
+<dc:source xml:lang="pl">(Asnyk, Adam) El...y (1838-1897), Poezye, t. 3,  Gebethner i Wolff, wyd. nowe poprzedzone słowem wstępnym St. Krzemińskiego, Warszawa, 1898</dc:source>
+<dc:rights xml:lang="pl">Domena publiczna - Adam Asnyk zm. 1897</dc:rights>
+<dc:date.pd xml:lang="pl">1897</dc:date.pd>
+<dc:format xml:lang="pl">xml</dc:format>
+<dc:type xml:lang="pl">text</dc:type>
+<dc:type xml:lang="en">text</dc:type>
+<dc:date xml:lang="pl">2007-09-06</dc:date>
+<dc:audience xml:lang="pl">L</dc:audience>
+<dc:language xml:lang="pl">pol</dc:language>
+</rdf:Description>
+</rdf:RDF>
+
+
+</utwor>
diff --git a/tests/files/text/miedzy-nami-nic-nie-bylo.xml b/tests/files/text/miedzy-nami-nic-nie-bylo.xml

new file mode 100644 (file)

index 0000000..124940e
--- /dev/null
+++ b/tests/files/text/miedzy-nami-nic-nie-bylo.xml
@@ -0,0 +1,65 @@
+<?xml version='1.0' encoding='utf-8'?>
+<utwor>
+  <liryka_lp>
+
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/">
+<rdf:Description rdf:about="http://redakcja.wolnelektury.pl/documents/book/miedzy-nami-nic-nie-bylo/">
+<dc:creator xml:lang="pl">Asnyk, Adam</dc:creator>
+<dc:title xml:lang="pl">Między nami nic nie było</dc:title>
+<dc:contributor.editor xml:lang="pl" />
+<dc:contributor.editor xml:lang="pl">Sekuła, Aleksandra</dc:contributor.editor>
+<dc:contributor.technical_editor xml:lang="pl">Sutkowska, Olga</dc:contributor.technical_editor>
+<dc:publisher xml:lang="pl">Fundacja Nowoczesna Polska</dc:publisher>
+<dc:subject.period xml:lang="pl">Pozytywizm</dc:subject.period>
+<dc:subject.type xml:lang="pl">Liryka</dc:subject.type>
+<dc:subject.genre xml:lang="pl">Wiersz</dc:subject.genre>
+<dc:description xml:lang="pl">Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN.</dc:description>
+<dc:identifier.url xml:lang="pl">http://wolnelektury.pl/katalog/lektura/miedzy-nami-nic-nie-bylo</dc:identifier.url>
+<dc:source.URL xml:lang="pl">http://www.polona.pl/Content/5164</dc:source.URL>
+<dc:source xml:lang="pl">(Asnyk, Adam) El...y (1838-1897), Poezye, t. 3,  Gebethner i Wolff, wyd. nowe poprzedzone słowem wstępnym St. Krzemińskiego, Warszawa, 1898</dc:source>
+<dc:rights xml:lang="pl">Domena publiczna - Adam Asnyk zm. 1897</dc:rights>
+<dc:date.pd xml:lang="pl">1897</dc:date.pd>
+<dc:format xml:lang="pl">xml</dc:format>
+<dc:type xml:lang="pl">text</dc:type>
+<dc:type xml:lang="en">text</dc:type>
+<dc:date xml:lang="pl">2007-09-06</dc:date>
+<dc:audience xml:lang="pl">L</dc:audience>
+<dc:language xml:lang="pl">pol</dc:language>
+</rdf:Description>
+</rdf:RDF>
+
+
+<autor_utworu>Adam Asnyk</autor_utworu>
+
+<nazwa_utworu><begin id="b1189062500041"/><motyw id="m1189062500041">Miłość platoniczna</motyw>Między nami nic nie było</nazwa_utworu>
+
+
+
+<strofa>Między nami nic nie było!/
+Żadnych zwierzeń, wyznań żadnych!/
+Nic nas z sobą nie łączyło ---/
+Prócz wiosennych marzeń zdradnych;</strofa>
+
+
+
+<strofa><begin id="b1189062528872"/><motyw id="m1189062528872">Natura</motyw>Prócz tych woni, barw i blasków,/
+Unoszących się w przestrzeni;/
+Prócz szumiących śpiewem lasków/
+I tej świeżej łąk zieleni;</strofa>
+
+
+
+<strofa>Prócz tych kaskad i potoków,/
+Zraszających każdy parów,/
+Prócz girlandy tęcz, obłoków,/
+Prócz natury słodkich czarów;</strofa>
+
+
+
+<strofa>Prócz tych wspólnych, jasnych zdrojów,/
+Z których serce zachwyt piło;/
+Prócz pierwiosnków i powojów,---/
+Między nami nic nie było!<end id="e1189062528872"/><end id="e1189062500041"/></strofa>
+
+</liryka_lp>
+</utwor>
diff --git a/tests/test_epub.py b/tests/test_epub.py

new file mode 100644 (file)

index 0000000..9fc5637
--- /dev/null
+++ b/tests/test_epub.py
@@ -0,0 +1,16 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
+from librarian import DirDocProvider
+from librarian.parser import WLDocument
+from nose.tools import *
+from utils import get_fixture
+
+
+def test_transform():
+    WLDocument.from_file(
+            get_fixture('text', 'asnyk_zbior.xml'),
+            provider=DirDocProvider(get_fixture('text', ''))
+        ).as_epub(flags=['without_fonts'])
diff --git a/tests/test_html.py b/tests/test_html.py

index 5187e06..51d6acd 100644 (file)
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -3,44 +3,38 @@
  # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
-from librarian import html, NoDublinCore
+from librarian import NoDublinCore
+from librarian.parser import WLDocument
  from nose.tools import *
-from utils import get_fixture, remove_output_file
+from utils import get_fixture
  
-def teardown_transform():
-    remove_output_file('text', 'asnyk_miedzy_nami.html')
  
-
-@with_setup(None, teardown_transform)
  def test_transform():
-    output_file_path = get_fixture('text', 'asnyk_miedzy_nami.html')
      expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.html')
  
-    html.transform(
-        get_fixture('text', 'asnyk_miedzy_nami.xml'),
-        output_file_path,
-    )
+    html = WLDocument.from_file(
+            get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml')
+        ).as_html().get_string()
  
-    assert_equal(file(output_file_path).read(), file(expected_output_file_path).read())
+    assert_equal(html, file(expected_output_file_path).read())
  
  
-@with_setup(None, teardown_transform)
  @raises(NoDublinCore)
  def test_no_dublincore():
-    html.transform(
-        get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'),
-        get_fixture('text', 'asnyk_miedzy_nami.html'),
-    )
+    WLDocument.from_file(
+            get_fixture('text', 'asnyk_miedzy_nami_nodc.xml')
+        ).as_html()
  
  
-@with_setup(None, teardown_transform)
  def test_passing_parse_dublincore_to_transform():
      """Passing parse_dublincore=False to transform omits DublinCore parsing."""
-    html.transform(
-        get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'),
-        get_fixture('text', 'asnyk_miedzy_nami.html'),
-        parse_dublincore=False,
-    )
+    WLDocument.from_file(
+            get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'),
+            parse_dublincore=False,
+        ).as_html()
  
  def test_empty():
-    assert html.transform('<utwor />', is_file=False, parse_dublincore=False).find('empty')
+    assert not WLDocument.from_string(
+            '<utwor />',
+            parse_dublincore=False,
+        ).as_html()
diff --git a/tests/test_text.py b/tests/test_text.py

index 7ff94ca..70dfb60 100644 (file)
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -3,42 +3,32 @@
  # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
-from librarian import text, NoDublinCore
+from librarian import NoDublinCore
+from librarian.parser import WLDocument
  from nose.tools import *
-from utils import get_fixture, remove_output_file
+from utils import get_fixture
  
  
-def teardown_transform():
-    remove_output_file('text', 'asnyk_miedzy_nami.txt')
-
-
-@with_setup(None, teardown_transform)
  def test_transform():
-    output_file_path = get_fixture('text', 'asnyk_miedzy_nami.txt')
      expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.txt')
  
-    text.transform(
-        open(get_fixture('text', 'asnyk_miedzy_nami.xml')),
-        open(output_file_path, 'w'),
-    )
+    text = WLDocument.from_file(
+            get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml')
+        ).as_text().get_string()
  
-    assert_equal(file(output_file_path).read(), file(expected_output_file_path).read())
+    assert_equal(text, file(expected_output_file_path).read())
  
  
-@with_setup(None, teardown_transform)
  @raises(NoDublinCore)
  def test_no_dublincore():
-    text.transform(
-        open(get_fixture('text', 'asnyk_miedzy_nami_nodc.xml')),
-        open(get_fixture('text', 'asnyk_miedzy_nami.txt'), 'w'),
-    )
+    WLDocument.from_file(
+            get_fixture('text', 'asnyk_miedzy_nami_nodc.xml')
+        ).as_text()
  
  
-@with_setup(None, teardown_transform)
  def test_passing_parse_dublincore_to_transform():
-    """Passing parse_dublincore=False to transform omits DublinCore parsing."""
-    text.transform(
-        open(get_fixture('text', 'asnyk_miedzy_nami_nodc.xml')),
-        open(get_fixture('text', 'asnyk_miedzy_nami.txt'), 'w'),
-        parse_dublincore=False,
-    )
+    """Passing parse_dublincore=False to the constructor omits DublinCore parsing."""
+    WLDocument.from_file(
+            get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'),
+            parse_dublincore=False,
+        ).as_text()
diff --git a/tests/utils.py b/tests/utils.py

index b112066..3b1f4f5 100644 (file)
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -21,10 +21,3 @@ def get_fixture(dir_name, file_name):
  def get_all_fixtures(dir_name, glob_pattern='*'):
      """Returns list of paths for fixtures in directory dir_name matching the glob_pattern."""
      return [get_fixture(dir_name, file_name) for file_name in glob.glob(join(get_fixture_dir(dir_name), glob_pattern))]
-
-
-def remove_output_file(dir_name, file_name):
-    try:
-        os.remove(get_fixture(dir_name, file_name))
-    except:
-        pass
author	Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
	Mon, 5 Dec 2011 16:06:51 +0000 (17:06 +0100)
committer	Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
	Mon, 5 Dec 2011 16:11:03 +0000 (17:11 +0100)
librarian/__init__.py		patch \| blob \| history
librarian/dcparser.py		patch \| blob \| history
librarian/epub.py		patch \| blob \| history
librarian/html.py		patch \| blob \| history
librarian/mobi.py		patch \| blob \| history
librarian/packagers.py		patch \| blob \| history
librarian/parser.py		patch \| blob \| history
librarian/pdf.py		patch \| blob \| history
librarian/text.py		patch \| blob \| history
scripts/book2epub		patch \| blob \| history
scripts/book2html		patch \| blob \| history
scripts/book2ihtml		patch \| blob \| history
scripts/book2mobi		patch \| blob \| history
scripts/book2pdf		patch \| blob \| history
scripts/book2txt		patch \| blob \| history
setup.py		patch \| blob \| history
tests/files/text/asnyk_miedzy_nami.xml	[deleted file]	patch \| blob \| history
tests/files/text/asnyk_zbior.xml	[new file with mode: 0755]	patch \| blob
tests/files/text/miedzy-nami-nic-nie-bylo.xml	[new file with mode: 0644]	patch \| blob
tests/test_epub.py	[new file with mode: 0644]	patch \| blob
tests/test_html.py		patch \| blob \| history
tests/test_text.py		patch \| blob \| history
tests/utils.py		patch \| blob \| history