From: Radek Czajka Date: Thu, 29 Dec 2011 14:02:42 +0000 (+0100) Subject: remove lang from URI and add relation.isVariantOf X-Git-Tag: 1.7~174 X-Git-Url: https://git.mdrn.pl/librarian.git/commitdiff_plain/a34b95aa7ba5fd4838541d1cdcd28358fb808062?ds=inline remove lang from URI and add relation.isVariantOf --- diff --git a/librarian/__init__.py b/librarian/__init__.py index feb9974..dd09ce4 100644 --- a/librarian/__init__.py +++ b/librarian/__init__.py @@ -66,42 +66,35 @@ WLNS = EmptyNamespace() class WLURI(object): - """Represents a WL URI. Extracts slug and language from it.""" - DEFAULT_LANGUAGE = u'pol' - + """Represents a WL URI. Extracts slug from it.""" slug = None - language = None example = 'http://wolnelektury.pl/katalog/lektura/template/' _re_wl_uri = re.compile('http://wolnelektury.pl/katalog/lektura/' - '(?P<slug>[-a-z0-9]+)(/(?P<lang>[a-z]{3}))?/?$') - - def __init__(self, uri=None): - if uri is not None: - uri = unicode(uri) - self.uri = uri - match = self._re_wl_uri.match(uri) - if not match: - raise ValueError('Supplied URI (%s) does not match ' - 'the WL document URI template.' % uri) - self.slug = match.group('slug') - self.language = match.group('lang') or self.DEFAULT_LANGUAGE + '(?P<slug>[-a-z0-9]+)/?$') + + def __init__(self, uri): + uri = unicode(uri) + self.uri = uri + self.slug = uri.rstrip('/').rsplit('/', 1)[-1] + + @classmethod + def strict(cls, uri): + match = cls._re_wl_uri.match(uri) + if not match: + raise ValueError('Supplied URI (%s) does not match ' + 'the template: %s.' % (uri, cls._re_wl_uri)) + return cls(uri) @classmethod - def from_slug_and_lang(cls, slug, lang): - """Contructs an URI from slug and language code. + def from_slug(cls, slug): + """Contructs an URI from slug. 
- >>> WLURI.from_slug_and_lang('a-slug', WLURI.DEFAULT_LANGUAGE).uri + >>> WLURI.from_slug('a-slug').uri u'http://wolnelektury.pl/katalog/lektura/a-slug/' - >>> WLURI.from_slug_and_lang('a-slug', 'deu').uri - u'http://wolnelektury.pl/katalog/lektura/a-slug/deu/' """ - if lang is None: - lang = cls.DEFAULT_LANGUAGE uri = 'http://wolnelektury.pl/katalog/lektura/%s/' % slug - if lang is not None and lang != cls.DEFAULT_LANGUAGE: - uri += lang + '/' return cls(uri) def __unicode__(self): @@ -111,17 +104,7 @@ class WLURI(object): return self.uri def __eq__(self, other): - return self.slug, self.language == other.slug, other.language - - def filename_stem(self): - stem = self.slug - if self.language != self.DEFAULT_LANGUAGE: - stem += '_' + self.language - return stem - - def validate_language(self, language): - if language != self.language: - raise ValidationError("Incorrect language definition in URI") + return self.slug == other.slug class DocProvider(object): @@ -130,18 +113,14 @@ class DocProvider(object): Used for generating joined files, like EPUBs. 
""" - def by_slug_and_lang(self, slug, lang=None): - """Should return a file-like object with a WL document XML.""" - raise NotImplementedError - def by_slug(self, slug): """Should return a file-like object with a WL document XML.""" - return self.by_slug_and_lang(slug) + raise NotImplementedError def by_uri(self, uri, wluri=WLURI): """Should return a file-like object with a WL document XML.""" wluri = wluri(uri) - return self.by_slug_and_lang(wluri.slug, wluri.language) + return self.by_slug(wluri.slug) class DirDocProvider(DocProvider): @@ -151,8 +130,8 @@ class DirDocProvider(DocProvider): self.dir = dir_ self.files = {} - def by_slug_and_lang(self, slug, lang=None): - fname = WLURI.from_slug_and_lang(slug, lang).filename_stem() + '.xml' + def by_slug(self, slug): + fname = slug + '.xml' return open(os.path.join(self.dir, fname)) @@ -167,7 +146,7 @@ DEFAULT_BOOKINFO = dcparser.BookInfo( DCNS('subject.type'): [u'Unknown'], DCNS('subject.genre'): [u'Unknown'], DCNS('date'): ['1970-01-01'], - DCNS('language'): [WLURI.DEFAULT_LANGUAGE], + DCNS('language'): [u'pol'], # DCNS('date'): [creation_date], DCNS('publisher'): [u"Fundacja Nowoczesna Polska"], DCNS('description'): diff --git a/librarian/dcparser.py b/librarian/dcparser.py index d99aaf0..21244ef 100644 --- a/librarian/dcparser.py +++ b/librarian/dcparser.py @@ -70,8 +70,7 @@ def as_date(text): except ValueError, e: raise ValueError("Unrecognized date format. 
Try YYYY-MM-DD or YYYY.") -def as_person(text): - return Person.from_text(text) +as_person = Person.from_text def as_unicode(text): if isinstance(text, unicode): @@ -80,34 +79,39 @@ def as_unicode(text): return text.decode('utf-8') class Field(object): - def __init__(self, uri, attr_name, type=as_unicode, multiple=False, salias=None, **kwargs): + def __init__(self, uri, attr_name, validator=as_unicode, strict=None, multiple=False, salias=None, **kwargs): self.uri = uri self.name = attr_name - self.validator = type + self.validator = lambda x: validator(x) + self.strict = lambda x: strict(x) self.multiple = multiple self.salias = salias self.required = kwargs.get('required', True) and not kwargs.has_key('default') self.default = kwargs.get('default', [] if multiple else [None]) - def validate_value(self, val): + def validate_value(self, val, strict=False): + if strict and self.strict is not None: + validator = self.strict + else: + validator = self.validator try: if self.multiple: - if self.validator is None: + if validator is None: return val - return [ self.validator(v) if v is not None else v for v in val ] + return [ validator(v) if v is not None else v for v in val ] elif len(val) > 1: raise ValidationError("Multiple values not allowed for field '%s'" % self.uri) elif len(val) == 0: raise ValidationError("Field %s has no value to assign. Check your defaults." 
% self.uri) else: - if self.validator is None or val[0] is None: + if validator is None or val[0] is None: return val[0] - return self.validator(val[0]) + return validator(val[0]) except ValueError, e: raise ValidationError("Field '%s' - invald value: %s" % (self.uri, e.message)) - def validate(self, fdict): + def validate(self, fdict, strict=False): if not fdict.has_key(self.uri): if not self.required: f = self.default @@ -116,7 +120,7 @@ class Field(object): else: f = fdict[self.uri] - return self.validate_value(f) + return self.validate_value(f, strict=strict) def __eq__(self, other): if isinstance(other, Field) and other.name == self.name: @@ -162,18 +166,18 @@ class WorkInfo(object): Field( DCNS('source'), 'source_name', required=False), Field( DCNS('source.URL'), 'source_url', required=False), - Field( DCNS('identifier.url'), 'url', WLURI), + Field( DCNS('identifier.url'), 'url', WLURI, strict=WLURI.strict), Field( DCNS('rights.license'), 'license', required=False), Field( DCNS('rights'), 'license_description'), ) @classmethod - def from_string(cls, xml): + def from_string(cls, xml, *args, **kwargs): from StringIO import StringIO - return cls.from_file(StringIO(xml)) + return cls.from_file(StringIO(xml), *args, **kwargs) @classmethod - def from_file(cls, xmlfile): + def from_file(cls, xmlfile, *args, **kwargs): desc_tag = None try: iter = etree.iterparse(xmlfile, ['start', 'end']) @@ -194,14 +198,14 @@ class WorkInfo(object): # if there is no end, Expat should yell at us with an ExpatError # extract data from the element and make the info - return cls.from_element(desc_tag) + return cls.from_element(desc_tag, *args, **kwargs) except XMLSyntaxError, e: raise ParseError(e) except ExpatError, e: raise ParseError(e) @classmethod - def from_element(cls, rdf_tag): + def from_element(cls, rdf_tag, *args, **kwargs): # the tree is already parsed, so we don't need to worry about Expat errors field_dict = {} desc = rdf_tag.find(".//" + RDFNS('Description')) @@ -214,9 
+218,9 @@ class WorkInfo(object): fv.append(e.text) field_dict[e.tag] = fv - return cls(desc.attrib, field_dict) + return cls(desc.attrib, field_dict, *args, **kwargs) - def __init__(self, rdf_attrs, dc_fields): + def __init__(self, rdf_attrs, dc_fields, strict=False): """rdf_attrs should be a dictionary-like object with any attributes of the RDF:Description. dc_fields - dictionary mapping DC fields (with namespace) to list of text values for the given field. """ @@ -225,16 +229,11 @@ class WorkInfo(object): self.fmap = {} for field in self.FIELDS: - value = field.validate( dc_fields ) + value = field.validate(dc_fields, strict=strict) setattr(self, 'prop_' + field.name, value) self.fmap[field.name] = field if field.salias: self.fmap[field.salias] = field - self.validate() - - def validate(self): - self.url.validate_language(self.language) - def __getattribute__(self, name): try: field = object.__getattribute__(self, 'fmap')[name] @@ -350,7 +349,10 @@ class BookInfo(WorkInfo): Field( DCNS('contributor.translator'), 'translators', \ as_person, salias='translator', multiple=True, default=[]), - Field( DCNS('relation.hasPart'), 'parts', WLURI, multiple=True, required=False), + Field( DCNS('relation.hasPart'), 'parts', + WLURI, strict=WLURI.strict, multiple=True, required=False), + Field( DCNS('relation.isVariantOf'), 'variant_of', + WLURI, strict=WLURI.strict, required=False), Field( DCNS('relation.cover_image.url'), 'cover_url', required=False), Field( DCNS('relation.cover_image.attribution'), 'cover_by', required=False), diff --git a/librarian/parser.py b/librarian/parser.py index 5ae06e2..2ece72f 100644 --- a/librarian/parser.py +++ b/librarian/parser.py @@ -19,7 +19,7 @@ class WLDocument(object): LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE) provider = None - def __init__(self, edoc, parse_dublincore=True, provider=None): + def __init__(self, edoc, parse_dublincore=True, provider=None, strict=False): self.edoc = edoc self.provider = provider @@ 
-36,7 +36,8 @@ class WLDocument(object): if self.rdf_elem is None: raise NoDublinCore('Document has no DublinCore - which is required.') - self.book_info = dcparser.BookInfo.from_element(self.rdf_elem) + self.book_info = dcparser.BookInfo.from_element( + self.rdf_elem, strict=strict) else: self.book_info = None @@ -192,7 +193,7 @@ class WLDocument(object): save_path = os.path.join(save_path, unicode(self.book_info.author).encode('utf-8')) save_path = os.path.join(save_path, - self.book_info.uri.filename_stem()) + self.book_info.uri.slug) if ext: save_path += '.%s' % ext else: diff --git a/librarian/picture.py b/librarian/picture.py index 0f5c99a..ce1a551 100644 --- a/librarian/picture.py +++ b/librarian/picture.py @@ -11,19 +11,13 @@ import re class WLPictureURI(WLURI): _re_wl_uri = re.compile('http://wolnelektury.pl/katalog/obraz/' - '(?P<slug>[-a-z0-9]+)(/(?P<lang>[a-z]{3}))?/?$') - - def __init__(self, *args, **kw): - super(WLPictureURI, self).__init__(*args, **kw) + '(?P<slug>[-a-z0-9]+)/?$') @classmethod - def from_slug_and_lang(cls, slug, lang): + def from_slug(cls, slug): uri = 'http://wolnelektury.pl/katalog/obraz/%s/' % slug return cls(uri) - def filename_stem(self): - return self.slug - class PictureInfo(WorkInfo): """ @@ -39,15 +33,9 @@ class PictureInfo(WorkInfo): Field(DCNS('description.medium'), 'medium', required=False), Field(DCNS('description.dimensions'), 'original_dimensions', required=False), Field(DCNS('format'), 'mime_type', required=False), - Field(DCNS('identifier.url'), 'url', WLPictureURI), + Field(DCNS('identifier.url'), 'url', WLPictureURI, strict=WLPictureURI.strict), ) - def validate(self): - """ - WorkInfo has a language validation code only, which we do not need. - """ - pass - class ImageStore(object): EXT = ['gif', 'jpeg', 'png', 'swf', 'psd', 'bmp'