src/librarian/__init__.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import print_function, unicode_literals
   7
   8 import os
   9 import re
  10 import shutil
  11 from tempfile import NamedTemporaryFile
  12 import urllib
  13 from lxml import etree
  14 import six
  15 from six.moves.urllib.request import FancyURLopener
  16 from .util import makedirs
  17
  18
  19 @six.python_2_unicode_compatible
  20 class UnicodeException(Exception):
  21     def __str__(self):
  22         """ Dirty workaround for Python Unicode handling problems. """
  23         args = self.args[0] if len(self.args) == 1 else self.args
  24         try:
  25             message = six.text_type(args)
  26         except UnicodeDecodeError:
  27             message = six.text_type(args, encoding='utf-8', errors='ignore')
  28         return message
  29
  30
  31 class ParseError(UnicodeException):
  32     pass
  33
  34
  35 class ValidationError(UnicodeException):
  36     pass
  37
  38
  39 class NoDublinCore(ValidationError):
  40     """There's no DublinCore section, and it's required."""
  41     pass
  42
  43
  44 class NoProvider(UnicodeException):
  45     """There's no DocProvider specified, and it's needed."""
  46     pass
  47
  48
  49 class XMLNamespace(object):
  50     '''A handy structure to repsent names in an XML namespace.'''
  51
  52     def __init__(self, uri):
  53         self.uri = uri
  54
  55     def __call__(self, tag):
  56         return '{%s}%s' % (self.uri, tag)
  57
  58     def __contains__(self, tag):
  59         return tag.startswith('{' + str(self) + '}')
  60
  61     def __repr__(self):
  62         return 'XMLNamespace(%r)' % self.uri
  63
  64     def __str__(self):
  65         return '%s' % self.uri
  66
  67
  68 class EmptyNamespace(XMLNamespace):
  69     def __init__(self):
  70         super(EmptyNamespace, self).__init__('')
  71
  72     def __call__(self, tag):
  73         return tag
  74
  75
  76 # some common namespaces we use
  77 XMLNS = XMLNamespace('http://www.w3.org/XML/1998/namespace')
  78 RDFNS = XMLNamespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
  79 DCNS = XMLNamespace('http://purl.org/dc/elements/1.1/')
  80 XINS = XMLNamespace("http://www.w3.org/2001/XInclude")
  81 XHTMLNS = XMLNamespace("http://www.w3.org/1999/xhtml")
  82 NCXNS = XMLNamespace("http://www.daisy.org/z3986/2005/ncx/")
  83 OPFNS = XMLNamespace("http://www.idpf.org/2007/opf")
  84 PLMETNS = XMLNamespace("http://dl.psnc.pl/schemas/plmet/")
  85
  86 WLNS = EmptyNamespace()
  87
  88
  89
  90
  91 class DocProvider(object):
  92     """Base class for a repository of XML files.
  93
  94     Used for generating joined files, like EPUBs.
  95     """
  96
  97     def by_slug(self, slug):
  98         """Should return a file-like object with a WL document XML."""
  99         raise NotImplementedError
 100
 101
 102 class DirDocProvider(DocProvider):
 103     """ Serve docs from a directory of files in form <slug>.xml """
 104
 105     def __init__(self, dir_):
 106         self.dir = dir_
 107         self.files = {}
 108
 109     def by_slug(self, slug):
 110         fname = slug + '.xml'
 111         return open(os.path.join(self.dir, fname), 'rb')
 112
 113
 114 from . import dcparser
 115 from .meta.types.wluri import WLURI
 116
 117
 118 DEFAULT_BOOKINFO = dcparser.BookInfo(
 119     {
 120         RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'
 121     },
 122     {
 123         DCNS('creator'): [u'Some, Author'],
 124         DCNS('title'): [u'Some Title'],
 125         DCNS('subject.period'): [u'Unknown'],
 126         DCNS('subject.type'): [u'Unknown'],
 127         DCNS('subject.genre'): [u'Unknown'],
 128         DCNS('date'): ['1970-01-01'],
 129         DCNS('language'): [u'pol'],
 130         # DCNS('date'): [creation_date],
 131         DCNS('publisher'): [u"Fundacja Nowoczesna Polska"],
 132         DCNS('description'):
 133         [u"""Publikacja zrealizowana w ramach projektu
 134         Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa
 135         wykonana przez Bibliotekę Narodową z egzemplarza
 136         pochodzącego ze zbiorów BN."""],
 137         DCNS('identifier.url'): [WLURI.example],
 138         DCNS('rights'):
 139         [u"Domena publiczna - zm. [OPIS STANU PRAWNEGO TEKSTU]"]
 140     }
 141 )
 142
 143
 144 def xinclude_forURI(uri):
 145     e = etree.Element(XINS("include"))
 146     e.set("href", uri)
 147     return etree.tostring(e, encoding='unicode')
 148
 149
 150 def wrap_text(ocrtext, creation_date, bookinfo=DEFAULT_BOOKINFO):
 151     """Wrap the text within the minimal XML structure with a DC template."""
 152     bookinfo.created_at = creation_date
 153
 154     dcstring = etree.tostring(
 155         bookinfo.to_etree(),  method='xml', encoding='unicode',
 156         pretty_print=True
 157     )
 158
 159     return u'<utwor>\n' + dcstring + u'\n<plain-text>\n' + ocrtext + \
 160         u'\n</plain-text>\n</utwor>'
 161
 162
 163 def serialize_raw(element):
 164     b = u'' + (element.text or '')
 165
 166     for child in element.iterchildren():
 167         e = etree.tostring(child, method='xml', encoding='unicode',
 168                            pretty_print=True)
 169         b += e
 170
 171     return b
 172
 173
 174 SERIALIZERS = {
 175     'raw': serialize_raw,
 176 }
 177
 178
 179 def serialize_children(element, format='raw'):
 180     return SERIALIZERS[format](element)
 181
 182
 183 def get_resource(path):
 184     return os.path.join(os.path.dirname(__file__), path)
 185
 186
 187 class OutputFile(object):
 188     """Represents a file returned by one of the converters."""
 189
 190     _bytes = None
 191     _filename = None
 192
 193     def __del__(self):
 194         if self._filename:
 195             os.unlink(self._filename)
 196
 197     def __nonzero__(self):
 198         return self._bytes is not None or self._filename is not None
 199
 200     @classmethod
 201     def from_bytes(cls, bytestring):
 202         """Converter returns contents of a file as a string."""
 203
 204         instance = cls()
 205         instance._bytes = bytestring
 206         return instance
 207
 208     @classmethod
 209     def from_filename(cls, filename):
 210         """Converter returns contents of a file as a named file."""
 211
 212         instance = cls()
 213         instance._filename = filename
 214         return instance
 215
 216     def get_bytes(self):
 217         """Get file's contents as a bytestring."""
 218
 219         if self._filename is not None:
 220             with open(self._filename, 'rb') as f:
 221                 return f.read()
 222         else:
 223             return self._bytes
 224
 225     def get_file(self):
 226         """Get file as a file-like object."""
 227
 228         if self._bytes is not None:
 229             return six.BytesIO(self._bytes)
 230         elif self._filename is not None:
 231             return open(self._filename, 'rb')
 232
 233     def get_filename(self):
 234         """Get file as a fs path."""
 235
 236         if self._filename is not None:
 237             return self._filename
 238         elif self._bytes is not None:
 239             temp = NamedTemporaryFile(prefix='librarian-', delete=False)
 240             temp.write(self._bytes)
 241             temp.close()
 242             self._filename = temp.name
 243             return self._filename
 244         else:
 245             return None
 246
 247     def save_as(self, path):
 248         """Save file to a path. Create directories, if necessary."""
 249
 250         dirname = os.path.dirname(os.path.abspath(path))
 251         makedirs(dirname)
 252         shutil.copy(self.get_filename(), path)
 253
 254
 255 class URLOpener(FancyURLopener):
 256     version = 'FNP Librarian (http://github.com/fnp/librarian)'
 257
 258
# Install a Librarian-identified opener on the ``urllib`` module at import time.
# NOTE(review): ``urllib._urlopener`` is the hook documented for Python 2's
# urllib; confirm this assignment has any effect when running on Python 3.
urllib._urlopener = URLOpener()