src/librarian/__init__.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import print_function, unicode_literals
   7
   8 import os
   9 import re
  10 import shutil
  11 from tempfile import NamedTemporaryFile
  12 import urllib
  13 from lxml import etree
  14 import six
  15 from six.moves.urllib.request import FancyURLopener
  16 from .util import makedirs
  17
  18
  19 @six.python_2_unicode_compatible
  20 class UnicodeException(Exception):
  21     def __str__(self):
  22         """ Dirty workaround for Python Unicode handling problems. """
  23         args = self.args[0] if len(self.args) == 1 else self.args
  24         try:
  25             message = six.text_type(args)
  26         except UnicodeDecodeError:
  27             message = six.text_type(args, encoding='utf-8', errors='ignore')
  28         return message
  29
  30
  31 class ParseError(UnicodeException):
  32     pass
  33
  34
  35 class ValidationError(UnicodeException):
  36     pass
  37
  38
  39 class NoDublinCore(ValidationError):
  40     """There's no DublinCore section, and it's required."""
  41     pass
  42
  43
  44 class NoProvider(UnicodeException):
  45     """There's no DocProvider specified, and it's needed."""
  46     pass
  47
  48
  49 class XMLNamespace(object):
  50     '''A handy structure to repsent names in an XML namespace.'''
  51
  52     def __init__(self, uri):
  53         self.uri = uri
  54
  55     def __call__(self, tag):
  56         return '{%s}%s' % (self.uri, tag)
  57
  58     def __contains__(self, tag):
  59         return tag.startswith('{' + str(self) + '}')
  60
  61     def __repr__(self):
  62         return 'XMLNamespace(%r)' % self.uri
  63
  64     def __str__(self):
  65         return '%s' % self.uri
  66
  67
  68 class EmptyNamespace(XMLNamespace):
  69     def __init__(self):
  70         super(EmptyNamespace, self).__init__('')
  71
  72     def __call__(self, tag):
  73         return tag
  74
  75
  76 # some common namespaces we use
  77 XMLNS = XMLNamespace('http://www.w3.org/XML/1998/namespace')
  78 RDFNS = XMLNamespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
  79 DCNS = XMLNamespace('http://purl.org/dc/elements/1.1/')
  80 XINS = XMLNamespace("http://www.w3.org/2001/XInclude")
  81 XHTMLNS = XMLNamespace("http://www.w3.org/1999/xhtml")
  82 NCXNS = XMLNamespace("http://www.daisy.org/z3986/2005/ncx/")
  83 OPFNS = XMLNamespace("http://www.idpf.org/2007/opf")
  84 PLMETNS = XMLNamespace("http://dl.psnc.pl/schemas/plmet/")
  85
  86 WLNS = EmptyNamespace()
  87
  88
  89 @six.python_2_unicode_compatible
  90 class WLURI(object):
  91     """Represents a WL URI. Extracts slug from it."""
  92     slug = None
  93
  94     example = 'http://wolnelektury.pl/katalog/lektura/template/'
  95     _re_wl_uri = re.compile(
  96         r'http://(www\.)?wolnelektury.pl/katalog/lektur[ay]/'
  97         '(?P<slug>[-a-z0-9]+)/?$'
  98     )
  99
 100     def __init__(self, uri):
 101         uri = six.text_type(uri)
 102         self.uri = uri
 103         self.slug = uri.rstrip('/').rsplit('/', 1)[-1]
 104
 105     @classmethod
 106     def strict(cls, uri):
 107         match = cls._re_wl_uri.match(uri)
 108         if not match:
 109             raise ValidationError(u'Invalid URI (%s). Should match: %s' % (
 110                         uri, cls._re_wl_uri.pattern))
 111         return cls(uri)
 112
 113     @classmethod
 114     def from_slug(cls, slug):
 115         """Contructs an URI from slug.
 116
 117         >>> print(WLURI.from_slug('a-slug').uri)
 118         http://wolnelektury.pl/katalog/lektura/a-slug/
 119
 120         """
 121         uri = 'http://wolnelektury.pl/katalog/lektura/%s/' % slug
 122         return cls(uri)
 123
 124     def __str__(self):
 125         return self.uri
 126
 127     def __eq__(self, other):
 128         return self.slug == other.slug
 129
 130
 131 class DocProvider(object):
 132     """Base class for a repository of XML files.
 133
 134     Used for generating joined files, like EPUBs.
 135     """
 136
 137     def by_slug(self, slug):
 138         """Should return a file-like object with a WL document XML."""
 139         raise NotImplementedError
 140
 141     def by_uri(self, uri, wluri=WLURI):
 142         """Should return a file-like object with a WL document XML."""
 143         wluri = wluri(uri)
 144         return self.by_slug(wluri.slug)
 145
 146
 147 class DirDocProvider(DocProvider):
 148     """ Serve docs from a directory of files in form <slug>.xml """
 149
 150     def __init__(self, dir_):
 151         self.dir = dir_
 152         self.files = {}
 153
 154     def by_slug(self, slug):
 155         fname = slug + '.xml'
 156         return open(os.path.join(self.dir, fname), 'rb')
 157
 158
 159 from . import dcparser
 160
 161
 162 DEFAULT_BOOKINFO = dcparser.BookInfo(
 163     {
 164         RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'
 165     },
 166     {
 167         DCNS('creator'): [u'Some, Author'],
 168         DCNS('title'): [u'Some Title'],
 169         DCNS('subject.period'): [u'Unknown'],
 170         DCNS('subject.type'): [u'Unknown'],
 171         DCNS('subject.genre'): [u'Unknown'],
 172         DCNS('date'): ['1970-01-01'],
 173         DCNS('language'): [u'pol'],
 174         # DCNS('date'): [creation_date],
 175         DCNS('publisher'): [u"Fundacja Nowoczesna Polska"],
 176         DCNS('description'):
 177         [u"""Publikacja zrealizowana w ramach projektu
 178         Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa
 179         wykonana przez Bibliotekę Narodową z egzemplarza
 180         pochodzącego ze zbiorów BN."""],
 181         DCNS('identifier.url'): [WLURI.example],
 182         DCNS('rights'):
 183         [u"Domena publiczna - zm. [OPIS STANU PRAWNEGO TEKSTU]"]
 184     }
 185 )
 186
 187
 188 def xinclude_forURI(uri):
 189     e = etree.Element(XINS("include"))
 190     e.set("href", uri)
 191     return etree.tostring(e, encoding='unicode')
 192
 193
 194 def wrap_text(ocrtext, creation_date, bookinfo=DEFAULT_BOOKINFO):
 195     """Wrap the text within the minimal XML structure with a DC template."""
 196     bookinfo.created_at = creation_date
 197
 198     dcstring = etree.tostring(
 199         bookinfo.to_etree(),  method='xml', encoding='unicode',
 200         pretty_print=True
 201     )
 202
 203     return u'<utwor>\n' + dcstring + u'\n<plain-text>\n' + ocrtext + \
 204         u'\n</plain-text>\n</utwor>'
 205
 206
 207 def serialize_raw(element):
 208     b = u'' + (element.text or '')
 209
 210     for child in element.iterchildren():
 211         e = etree.tostring(child, method='xml', encoding='unicode',
 212                            pretty_print=True)
 213         b += e
 214
 215     return b
 216
 217
 218 SERIALIZERS = {
 219     'raw': serialize_raw,
 220 }
 221
 222
 223 def serialize_children(element, format='raw'):
 224     return SERIALIZERS[format](element)
 225
 226
 227 def get_resource(path):
 228     return os.path.join(os.path.dirname(__file__), path)
 229
 230
 231 class OutputFile(object):
 232     """Represents a file returned by one of the converters."""
 233
 234     _bytes = None
 235     _filename = None
 236
 237     def __del__(self):
 238         if self._filename:
 239             os.unlink(self._filename)
 240
 241     def __nonzero__(self):
 242         return self._bytes is not None or self._filename is not None
 243
 244     @classmethod
 245     def from_bytes(cls, bytestring):
 246         """Converter returns contents of a file as a string."""
 247
 248         instance = cls()
 249         instance._bytes = bytestring
 250         return instance
 251
 252     @classmethod
 253     def from_filename(cls, filename):
 254         """Converter returns contents of a file as a named file."""
 255
 256         instance = cls()
 257         instance._filename = filename
 258         return instance
 259
 260     def get_bytes(self):
 261         """Get file's contents as a bytestring."""
 262
 263         if self._filename is not None:
 264             with open(self._filename, 'rb') as f:
 265                 return f.read()
 266         else:
 267             return self._bytes
 268
 269     def get_file(self):
 270         """Get file as a file-like object."""
 271
 272         if self._bytes is not None:
 273             return six.BytesIO(self._bytes)
 274         elif self._filename is not None:
 275             return open(self._filename, 'rb')
 276
 277     def get_filename(self):
 278         """Get file as a fs path."""
 279
 280         if self._filename is not None:
 281             return self._filename
 282         elif self._bytes is not None:
 283             temp = NamedTemporaryFile(prefix='librarian-', delete=False)
 284             temp.write(self._bytes)
 285             temp.close()
 286             self._filename = temp.name
 287             return self._filename
 288         else:
 289             return None
 290
 291     def save_as(self, path):
 292         """Save file to a path. Create directories, if necessary."""
 293
 294         dirname = os.path.dirname(os.path.abspath(path))
 295         makedirs(dirname)
 296         shutil.copy(self.get_filename(), path)
 297
 298
 299 class URLOpener(FancyURLopener):
 300     version = 'FNP Librarian (http://github.com/fnp/librarian)'
 301
 302
# Install a URLOpener instance as the `_urlopener` attribute of the urllib
# module at import time.
# NOTE(review): presumably meant to make urllib's legacy helpers use this
# opener; on Python 3 those helpers live in urllib.request -- confirm that
# this assignment still has an effect there.
urllib._urlopener = URLOpener()