librarian/__init__.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import re
  10 import shutil
  11 import urllib
  12 import lxml.etree as etree
  13
  14
  15 class UnicodeException(Exception):
  16     def __str__(self):
  17         """ Dirty workaround for Python Unicode handling problems. """
  18         return unicode(self).encode('utf-8')
  19
  20     def __unicode__(self):
  21         """ Dirty workaround for Python Unicode handling problems. """
  22         args = self.args[0] if len(self.args) == 1 else self.args
  23         try:
  24             message = unicode(args)
  25         except UnicodeDecodeError:
  26             message = unicode(args, encoding='utf-8', errors='ignore')
  27         return message
  28
  29
  30 class ParseError(UnicodeException):
  31     pass
  32
  33
  34 class ValidationError(UnicodeException):
  35     pass
  36
  37
  38 class NoDublinCore(ValidationError):
  39     """There's no DublinCore section, and it's required."""
  40     pass
  41
  42
  43 class NoProvider(UnicodeException):
  44     """There's no DocProvider specified, and it's needed."""
  45     pass
  46
  47
  48 class XMLNamespace(object):
  49     """A handy structure to repsent names in an XML namespace."""
  50
  51     def __init__(self, uri):
  52         self.uri = uri
  53
  54     def __call__(self, tag):
  55         return '{%s}%s' % (self.uri, tag)
  56
  57     def __contains__(self, tag):
  58         return tag.startswith('{' + str(self) + '}')
  59
  60     def __repr__(self):
  61         return 'XMLNamespace(%r)' % self.uri
  62
  63     def __str__(self):
  64         return '%s' % self.uri
  65
  66
  67 class EmptyNamespace(XMLNamespace):
  68     def __init__(self):
  69         super(EmptyNamespace, self).__init__('')
  70
  71     def __call__(self, tag):
  72         return tag
  73
  74 # some common namespaces we use
  75 RDFNS = XMLNamespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
  76 DCNS = XMLNamespace('http://purl.org/dc/elements/1.1/')
  77 XINS = XMLNamespace("http://www.w3.org/2001/XInclude")
  78 XHTMLNS = XMLNamespace("http://www.w3.org/1999/xhtml")
  79 NCXNS = XMLNamespace("http://www.daisy.org/z3986/2005/ncx/")
  80 OPFNS = XMLNamespace("http://www.idpf.org/2007/opf")
  81
  82 WLNS = EmptyNamespace()
  83
  84
  85 class WLURI(object):
  86     """Represents a WL URI. Extracts slug from it."""
  87     slug = None
  88
  89     example = 'http://edukacjamedialna.edu.pl/lekcje/template'
  90     _re_wl_uri = re.compile(
  91         r'http://(www\.)?edukacjamedialna.edu.pl/lekcje/'
  92         '(?P<slug>[-a-z0-9]+)/?$')
  93
  94     def __init__(self, uri):
  95         uri = unicode(uri)
  96         self.uri = uri
  97         self.slug = uri.rstrip('/').rsplit('/', 1)[-1]
  98
  99     @classmethod
 100     def strict(cls, uri):
 101         match = cls._re_wl_uri.match(uri)
 102         if not match:
 103             raise ValidationError(u'Invalid URI (%s). Should match: %s' % (
 104                         uri, cls._re_wl_uri.pattern))
 105         return cls(uri)
 106
 107     @classmethod
 108     def from_slug(cls, slug):
 109         """Contructs an URI from slug.
 110
 111         >>> WLURI.from_slug('a-slug').uri
 112         u'http://edukacjamedialna.edu.pl/lekcje/a-slug/'
 113
 114         """
 115         uri = 'http://edukacjamedialna.edu.pl/lekcje/%s/' % slug
 116         return cls(uri)
 117
 118     def __unicode__(self):
 119         return self.uri
 120
 121     def __str__(self):
 122         return self.uri
 123
 124     def canonical(self):
 125         return type(self).from_slug(self.slug)
 126
 127     def __eq__(self, other):
 128         return self.slug == other.slug
 129
 130
 131 class DocProvider(object):
 132     """Base class for a repository of XML files.
 133
 134     Used for generating joined files, like EPUBs.
 135     """
 136
 137     def by_slug(self, slug):
 138         """Should return an IOFile object with a WL document XML."""
 139         raise NotImplementedError
 140
 141     def by_uri(self, uri, wluri=WLURI):
 142         """Should return an IOFile object with a WL document XML."""
 143         wluri = wluri(uri)
 144         return self.by_slug(wluri.slug)
 145
 146
 147 class DirDocProvider(DocProvider):
 148     """ Serve docs from a directory of files in form <slug>.xml """
 149
 150     def __init__(self, dir_):
 151         self.dir = dir_
 152         self.files = {}
 153
 154     def by_slug(self, slug):
 155         fname = slug + '.xml'
 156         return IOFile.from_filename(os.path.join(self.dir, fname))
 157
 158
 159 def get_default_bookinfo():
 160     import dcparser
 161     dcparser.BookInfo(
 162         {RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'},
 163         {
 164             DCNS('creator.expert'): [u'Some, Author'],
 165             DCNS('creator.scenario'): [u'Some, Author'],
 166             DCNS('creator.textbook'): [u'Some, Author'],
 167             DCNS('title'): [u'Some Title'],
 168             DCNS('subject.period'): [u'Unknown'],
 169             DCNS('subject.type'): [u'Unknown'],
 170             DCNS('subject.genre'): [u'Unknown'],
 171             DCNS('date'): ['1970-01-01'],
 172             DCNS('language'): [u'pol'],
 173             # DCNS('date'): [creation_date],
 174             DCNS('publisher'): [u"Fundacja Nowoczesna Polska"],
 175             DCNS('description'):
 176                 [u"""Publikacja zrealizowana w ramach projektu
 177                  Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa
 178                  wykonana przez Bibliotekę Narodową z egzemplarza
 179                  pochodzącego ze zbiorów BN."""],
 180             DCNS('identifier.url'): [WLURI.example],
 181             DCNS('rights'):
 182                 [u"Domena publiczna - zm. [OPIS STANU PRAWNEGO TEKSTU]"],
 183         })
 184
# Evaluated once, at import time; used as the default ``bookinfo`` argument
# of wrap_text().
DEFAULT_BOOKINFO = get_default_bookinfo()
 186
 187
 188 def xinclude_forURI(uri):
 189     e = etree.Element(XINS("include"))
 190     e.set("href", uri)
 191     return etree.tostring(e, encoding=unicode)
 192
 193
 194 def wrap_text(ocrtext, creation_date, bookinfo=DEFAULT_BOOKINFO):
 195     """Wrap the text within the minimal XML structure with a DC template."""
 196     bookinfo.created_at = creation_date
 197
 198     dcstring = etree.tostring(bookinfo.to_etree(), encoding=unicode, pretty_print=True)
 199
 200     return u'<utwor>\n' + dcstring + u'\n<plain-text>\n' + ocrtext + \
 201         u'\n</plain-text>\n</utwor>'
 202
 203
 204 def serialize_raw(element):
 205     b = u'' + (element.text or '')
 206
 207     for child in element.iterchildren():
 208         e = etree.tostring(child, encoding=unicode, pretty_print=True)
 209         b += e
 210
 211     return b
 212
# Registry mapping a format name to the function that serializes an
# element's children in that format; looked up by serialize_children().
SERIALIZERS = {
    'raw': serialize_raw,
}
 216
 217
 218 def serialize_children(element, format='raw'):
 219     return SERIALIZERS[format](element)
 220
 221
 222 def get_resource(path):
 223     return os.path.join(os.path.dirname(__file__), path)
 224
 225
 226 class IOFile(object):
 227     """ Represents a file fed as input or returned as a result. """
 228     _string = None
 229     _filename = None
 230     _filename_tmp = False
 231
 232     def __init__(self, attachments=None):
 233         self.attachments = attachments or {}
 234
 235     def __del__(self):
 236         if self._filename_tmp:
 237             os.unlink(self._filename)
 238
 239     def __nonzero__(self):
 240         return self._string is not None or self._filename is not None
 241
 242     @classmethod
 243     def from_string(cls, string, *args, **kwargs):
 244         """Converter returns contents of a file as a string."""
 245
 246         instance = cls(*args, **kwargs)
 247         instance._string = string
 248         return instance
 249
 250     @classmethod
 251     def from_filename(cls, filename, *args, **kwargs):
 252         """Converter returns contents of a file as a named file."""
 253
 254         instance = cls(*args, **kwargs)
 255         instance._filename = filename
 256         return instance
 257
 258     def get_string(self):
 259         """Get file's contents as a string."""
 260
 261         if self._filename is not None:
 262             with open(self._filename) as f:
 263                 return f.read()
 264         else:
 265             return self._string
 266
 267     def get_file(self):
 268         """Get file as a file-like object."""
 269
 270         if self._string is not None:
 271             from StringIO import StringIO
 272             return StringIO(self._string)
 273         elif self._filename is not None:
 274             return open(self._filename)
 275
 276     def get_filename(self):
 277         """Get file as a fs path."""
 278
 279         if self._filename is not None:
 280             return self._filename
 281         elif self._string is not None:
 282             from tempfile import NamedTemporaryFile
 283             temp = NamedTemporaryFile(prefix='librarian-', delete=False)
 284             temp.write(self._string)
 285             temp.close()
 286             self._filename = temp.name
 287             self._filename_tmp = True
 288             return self._filename
 289         else:
 290             return None
 291
 292     def save_as(self, path):
 293         """Save file to a path. Create directories, if necessary."""
 294
 295         dirname = os.path.dirname(os.path.abspath(path))
 296         if not os.path.isdir(dirname):
 297             os.makedirs(dirname)
 298         shutil.copy(self.get_filename(), path)
 299
 300     def dump_to(self, path, directory=None):
 301         """ Path should be name for main file. """
 302         self.save_as(path)
 303         dirname = os.path.dirname(os.path.abspath(path))
 304         for filename, attachment in self.attachments.items():
 305             attachment.save_as(os.path.join(dirname, filename))
 306
 307
 308 class Format(object):
 309     """ Generic format class. """
 310     def __init__(self, wldoc, **kwargs):
 311         self.wldoc = wldoc
 312         self.customization = kwargs
 313
 314     def build(self):
 315         raise NotImplementedError
 316
 317
class URLOpener(urllib.FancyURLopener):
    """FancyURLopener subclass carrying Librarian's own ``version`` string.

    Per the Python 2 urllib documentation, ``version`` is what the opener
    uses to identify the user agent.
    """
    version = 'FNP Librarian (http://github.com/fnp/librarian)'
# Install an instance as urllib's module-level opener.  This runs at import
# time and affects the whole process, not just this package.
urllib._urlopener = URLOpener()