librarian/__init__.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import print_function, unicode_literals
   7
   8 import os
   9 import re
  10 import shutil
  11 from tempfile import NamedTemporaryFile
  12 import urllib
  13 from lxml import etree
  14 import six
  15 from six.moves.urllib.request import FancyURLopener
  16 from .util import makedirs
  17
  18
  19 @six.python_2_unicode_compatible
  20 class UnicodeException(Exception):
  21     def __str__(self):
  22         """ Dirty workaround for Python Unicode handling problems. """
  23         args = self.args[0] if len(self.args) == 1 else self.args
  24         try:
  25             message = six.text_type(args)
  26         except UnicodeDecodeError:
  27             message = six.text_type(args, encoding='utf-8', errors='ignore')
  28         return message
  29
  30 class ParseError(UnicodeException):
  31     pass
  32
  33 class ValidationError(UnicodeException):
  34     pass
  35
  36 class NoDublinCore(ValidationError):
  37     """There's no DublinCore section, and it's required."""
  38     pass
  39
  40 class NoProvider(UnicodeException):
  41     """There's no DocProvider specified, and it's needed."""
  42     pass
  43
  44 class XMLNamespace(object):
  45     '''A handy structure to repsent names in an XML namespace.'''
  46
  47     def __init__(self, uri):
  48         self.uri = uri
  49
  50     def __call__(self, tag):
  51         return '{%s}%s' % (self.uri, tag)
  52
  53     def __contains__(self, tag):
  54         return tag.startswith('{' + str(self) + '}')
  55
  56     def __repr__(self):
  57         return 'XMLNamespace(%r)' % self.uri
  58
  59     def __str__(self):
  60         return '%s' % self.uri
  61
  62 class EmptyNamespace(XMLNamespace):
  63     def __init__(self):
  64         super(EmptyNamespace, self).__init__('')
  65
  66     def __call__(self, tag):
  67         return tag
  68
  69 # some common namespaces we use
  70 XMLNS = XMLNamespace('http://www.w3.org/XML/1998/namespace')
  71 RDFNS = XMLNamespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
  72 DCNS = XMLNamespace('http://purl.org/dc/elements/1.1/')
  73 XINS = XMLNamespace("http://www.w3.org/2001/XInclude")
  74 XHTMLNS = XMLNamespace("http://www.w3.org/1999/xhtml")
  75 NCXNS = XMLNamespace("http://www.daisy.org/z3986/2005/ncx/")
  76 OPFNS = XMLNamespace("http://www.idpf.org/2007/opf")
  77 PLMETNS = XMLNamespace("http://dl.psnc.pl/schemas/plmet/")
  78
  79 WLNS = EmptyNamespace()
  80
  81
  82 @six.python_2_unicode_compatible
  83 class WLURI(object):
  84     """Represents a WL URI. Extracts slug from it."""
  85     slug = None
  86
  87     example = 'http://wolnelektury.pl/katalog/lektura/template/'
  88     _re_wl_uri = re.compile(r'http://(www\.)?wolnelektury.pl/katalog/lektur[ay]/'
  89             '(?P<slug>[-a-z0-9]+)/?$')
  90
  91     def __init__(self, uri):
  92         uri = six.text_type(uri)
  93         self.uri = uri
  94         self.slug = uri.rstrip('/').rsplit('/', 1)[-1]
  95
  96     @classmethod
  97     def strict(cls, uri):
  98         match = cls._re_wl_uri.match(uri)
  99         if not match:
 100             raise ValidationError(u'Invalid URI (%s). Should match: %s' % (
 101                         uri, cls._re_wl_uri.pattern))
 102         return cls(uri)
 103
 104     @classmethod
 105     def from_slug(cls, slug):
 106         """Contructs an URI from slug.
 107
 108         >>> print(WLURI.from_slug('a-slug').uri)
 109         http://wolnelektury.pl/katalog/lektura/a-slug/
 110
 111         """
 112         uri = 'http://wolnelektury.pl/katalog/lektura/%s/' % slug
 113         return cls(uri)
 114
 115     def __str__(self):
 116         return self.uri
 117
 118     def __eq__(self, other):
 119         return self.slug == other.slug
 120
 121
 122 class DocProvider(object):
 123     """Base class for a repository of XML files.
 124
 125     Used for generating joined files, like EPUBs.
 126     """
 127
 128     def by_slug(self, slug):
 129         """Should return a file-like object with a WL document XML."""
 130         raise NotImplementedError
 131
 132     def by_uri(self, uri, wluri=WLURI):
 133         """Should return a file-like object with a WL document XML."""
 134         wluri = wluri(uri)
 135         return self.by_slug(wluri.slug)
 136
 137
 138 class DirDocProvider(DocProvider):
 139     """ Serve docs from a directory of files in form <slug>.xml """
 140
 141     def __init__(self, dir_):
 142         self.dir = dir_
 143         self.files = {}
 144
 145     def by_slug(self, slug):
 146         fname = slug + '.xml'
 147         return open(os.path.join(self.dir, fname), 'rb')
 148
 149
# NOTE(review): imported here rather than with the imports at the top of the
# module; presumably to avoid a circular import with dcparser -- confirm
# before moving it.
from . import dcparser

# Placeholder book metadata; used as the default `bookinfo` of wrap_text().
# First argument: RDF attributes; second: Dublin Core fields, each mapped to
# a list of values.
DEFAULT_BOOKINFO = dcparser.BookInfo(
        { RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'},
        { DCNS('creator'): [u'Some, Author'],
          DCNS('title'): [u'Some Title'],
          DCNS('subject.period'): [u'Unknown'],
          DCNS('subject.type'): [u'Unknown'],
          DCNS('subject.genre'): [u'Unknown'],
          DCNS('date'): ['1970-01-01'],
          DCNS('language'): [u'pol'],
          # DCNS('date'): [creation_date],
          DCNS('publisher'): [u"Fundacja Nowoczesna Polska"],
          DCNS('description'):
          [u"""Publikacja zrealizowana w ramach projektu
             Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa
             wykonana przez Bibliotekę Narodową z egzemplarza
             pochodzącego ze zbiorów BN."""],
          DCNS('identifier.url'): [WLURI.example],
          DCNS('rights'):
            [u"Domena publiczna - zm. [OPIS STANU PRAWNEGO TEKSTU]"] })
 171
 172 def xinclude_forURI(uri):
 173     e = etree.Element(XINS("include"))
 174     e.set("href", uri)
 175     return etree.tostring(e, encoding='unicode')
 176
 177 def wrap_text(ocrtext, creation_date, bookinfo=DEFAULT_BOOKINFO):
 178     """Wrap the text within the minimal XML structure with a DC template."""
 179     bookinfo.created_at = creation_date
 180
 181     dcstring = etree.tostring(bookinfo.to_etree(), \
 182         method='xml', encoding='unicode', pretty_print=True)
 183
 184     return u'<utwor>\n' + dcstring + u'\n<plain-text>\n' + ocrtext + \
 185         u'\n</plain-text>\n</utwor>'
 186
 187
 188 def serialize_raw(element):
 189     b = u'' + (element.text or '')
 190
 191     for child in element.iterchildren():
 192         e = etree.tostring(child, method='xml', encoding='unicode',
 193                 pretty_print=True)
 194         b += e
 195
 196     return b
 197
 198 SERIALIZERS = {
 199     'raw': serialize_raw,
 200 }
 201
 202 def serialize_children(element, format='raw'):
 203     return SERIALIZERS[format](element)
 204
 205 def get_resource(path):
 206     return os.path.join(os.path.dirname(__file__), path)
 207
 208
 209 class OutputFile(object):
 210     """Represents a file returned by one of the converters."""
 211
 212     _bytes = None
 213     _filename = None
 214
 215     def __del__(self):
 216         if self._filename:
 217             os.unlink(self._filename)
 218
 219     def __nonzero__(self):
 220         return self._bytes is not None or self._filename is not None
 221
 222     @classmethod
 223     def from_bytes(cls, bytestring):
 224         """Converter returns contents of a file as a string."""
 225
 226         instance = cls()
 227         instance._bytes = bytestring
 228         return instance
 229
 230     @classmethod
 231     def from_filename(cls, filename):
 232         """Converter returns contents of a file as a named file."""
 233
 234         instance = cls()
 235         instance._filename = filename
 236         return instance
 237
 238     def get_bytes(self):
 239         """Get file's contents as a bytestring."""
 240
 241         if self._filename is not None:
 242             with open(self._filename, 'rb') as f:
 243                 return f.read()
 244         else:
 245             return self._bytes
 246
 247     def get_file(self):
 248         """Get file as a file-like object."""
 249
 250         if self._bytes is not None:
 251             return six.BytesIO(self._bytes)
 252         elif self._filename is not None:
 253             return open(self._filename, 'rb')
 254
 255     def get_filename(self):
 256         """Get file as a fs path."""
 257
 258         if self._filename is not None:
 259             return self._filename
 260         elif self._bytes is not None:
 261             temp = NamedTemporaryFile(prefix='librarian-', delete=False)
 262             temp.write(self._bytes)
 263             temp.close()
 264             self._filename = temp.name
 265             return self._filename
 266         else:
 267             return None
 268
 269     def save_as(self, path):
 270         """Save file to a path. Create directories, if necessary."""
 271
 272         dirname = os.path.dirname(os.path.abspath(path))
 273         makedirs(dirname)
 274         shutil.copy(self.get_filename(), path)
 275
 276
class URLOpener(FancyURLopener):
    """FancyURLopener subclass that carries Librarian's own version string."""
    version = 'FNP Librarian (http://github.com/fnp/librarian)'
# Install an instance as urllib's module-level `_urlopener` at import time.
# NOTE(review): presumably so that urllib's legacy helper functions use this
# opener; whether the attribute is consulted on Python 3 should be confirmed.
urllib._urlopener = URLOpener()