From: Marcin Koziej Date: Wed, 30 Jan 2013 15:28:07 +0000 (+0100) Subject: Merge branch 'universal' into edumed-ofop X-Git-Url: https://git.mdrn.pl/librarian.git/commitdiff_plain/fe8e5b5e224d32baebbdaa2fecf4a847ed4e5354?ds=inline;hp=-c Merge branch 'universal' into edumed-ofop --- fe8e5b5e224d32baebbdaa2fecf4a847ed4e5354 diff --combined librarian/__init__.py index 09bdcd7,3b811d3..bf41c7a --- a/librarian/__init__.py +++ b/librarian/__init__.py @@@ -79,8 -79,8 +79,8 @@@ class WLURI(object) """Represents a WL URI. Extracts slug from it.""" slug = None - example = 'http://wolnelektury.pl/katalog/lektura/template/' - _re_wl_uri = re.compile(r'http://(www\.)?wolnelektury.pl/katalog/lektura/' + example = 'http://edukacjamedialna.edu.pl/' + _re_wl_uri = re.compile(r'http://(www\.)?edukacjamedialna.edu.pl/' '(?P[-a-z0-9]+)/?$') def __init__(self, uri): @@@ -104,7 -104,7 +104,7 @@@ u'http://wolnelektury.pl/katalog/lektura/a-slug/' """ - uri = 'http://wolnelektury.pl/katalog/lektura/%s/' % slug + uri = 'http://prawokultury.pl/publikacje/%s/' % slug return cls(uri) def __unicode__(self): @@@ -150,10 -150,7 +150,10 @@@ import dcparse DEFAULT_BOOKINFO = dcparser.BookInfo( { RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'}, - { DCNS('creator'): [u'Some, Author'], + { + DCNS('creator.expert'): [u'Some, Author'], + DCNS('creator.scenario'): [u'Some, Author'], + DCNS('creator.textbook'): [u'Some, Author'], DCNS('title'): [u'Some Title'], DCNS('subject.period'): [u'Unknown'], DCNS('subject.type'): [u'Unknown'], @@@ -208,32 -205,35 +208,35 @@@ def get_resource(path) return os.path.join(os.path.dirname(__file__), path) - class OutputFile(object): - """Represents a file returned by one of the converters.""" - + class IOFile(object): + """ Represents a file fed as input or returned as a result. """ _string = None _filename = None + _filename_tmp = False + + def __init__(self, attachments=None): + self.attachments = attachments or {} def __del__(self): - if self._filename: + if self._filename_tmp: os.unlink(self._filename) def __nonzero__(self): return self._string is not None or self._filename is not None @classmethod - def from_string(cls, string): + def from_string(cls, string, *args, **kwargs): """Converter returns contents of a file as a string.""" - instance = cls() + instance = cls(*args, **kwargs) instance._string = string return instance @classmethod - def from_filename(cls, filename): + def from_filename(cls, filename, *args, **kwargs): """Converter returns contents of a file as a named file.""" - instance = cls() + instance = cls(*args, **kwargs) instance._filename = filename return instance @@@ -266,6 -266,7 +269,7 @@@ temp.write(self._string) temp.close() self._filename = temp.name + self._filename_tmp = True return self._filename else: return None @@@ -278,6 -279,23 +282,23 @@@ os.makedirs(dirname) shutil.copy(self.get_filename(), path) + def dump_to(self, path, directory=None): + """ Path should be name for main file. """ + self.save_as(path) + dirname = os.path.dirname(os.path.abspath(path)) + for filename, attachment in self.attachments.items(): + attachment.save_as(os.path.join(dirname, filename)) + + + class Format(object): + """ Generic format class. """ + def __init__(self, wldoc, **kwargs): + self.wldoc = wldoc + self.customization = kwargs + + def build(self): + raise NotImplementedError + class URLOpener(urllib.FancyURLopener): version = 'FNP Librarian (http://github.com/fnp/librarian)' diff --combined librarian/functions.py index e91d7e1,9490cbb..40f06cd --- a/librarian/functions.py +++ b/librarian/functions.py @@@ -14,47 -14,42 +14,47 @@@ def _register_function(f) ns[f.__name__] = f -def reg_substitute_entities(): - ENTITY_SUBSTITUTIONS = [ - (u'---', u'—'), - (u'--', u'–'), - (u'...', u'…'), - (u',,', u'„'), - (u'"', u'”'), - ] - - def substitute_entities(context, text): - """XPath extension function converting all entites in passed text.""" - if isinstance(text, list): - text = ''.join(text) - for entity, substitutution in ENTITY_SUBSTITUTIONS: - text = text.replace(entity, substitutution) - return text +ENTITY_SUBSTITUTIONS = [ + (u'---', u'—'), + (u'--', u'–'), + (u'...', u'…'), + (u',,', u'„'), + (u'"', u'”'), +] + +def substitute_entities(context, text): + """XPath extension function converting all entites in passed text.""" + if isinstance(text, list): + text = ''.join(text) + for entity, substitutution in ENTITY_SUBSTITUTIONS: + text = text.replace(entity, substitutution) + return text + +def reg_substitute_entities(): _register_function(substitute_entities) +def strip(context, text): + """Remove unneeded whitespace from beginning and end""" + if isinstance(text, list): + text = ''.join(text) + return re.sub(r'\s+', ' ', text).strip() + + def reg_strip(): - def strip(context, text): - """Remove unneeded whitespace from beginning and end""" - if isinstance(text, list): - text = ''.join(text) - return re.sub(r'\s+', ' ', text).strip() _register_function(strip) +def starts_white(context, text): + if isinstance(text, list): + text = ''.join(text) + if not text: + return False + return text[0].isspace() + + def reg_starts_white(): - def starts_white(context, text): - if isinstance(text, list): - text = ''.join(text) - if not text: - return False - return text[0].isspace() _register_function(starts_white) @@@ -68,50 -63,58 +68,64 @@@ def reg_ends_white() _register_function(ends_white) +def wrap_words(context, text, wrapping): + """XPath extension function automatically wrapping words in passed text""" + if isinstance(text, list): + text = ''.join(text) + if not wrapping: + return text + + words = re.split(r'\s', text) + + line_length = 0 + lines = [[]] + for word in words: + line_length += len(word) + 1 + if line_length > wrapping: + # Max line length was exceeded. We create new line + lines.append([]) + line_length = len(word) + lines[-1].append(word) + return '\n'.join(' '.join(line) for line in lines) + + def reg_wrap_words(): - def wrap_words(context, text, wrapping): - """XPath extension function automatically wrapping words in passed text""" - if isinstance(text, list): - text = ''.join(text) - if not wrapping: - return text - - words = re.split(r'\s', text) - - line_length = 0 - lines = [[]] - for word in words: - line_length += len(word) + 1 - if line_length > wrapping: - # Max line length was exceeded. We create new line - lines.append([]) - line_length = len(word) - lines[-1].append(word) - return '\n'.join(' '.join(line) for line in lines) _register_function(wrap_words) +def person_name(context, text): + """ Converts "Name, Forename" to "Forename Name" """ + if isinstance(text, list): + text = ''.join(text) + return Person.from_text(text).readable() + + def reg_person_name(): - def person_name(context, text): - """ Converts "Name, Forename" to "Forename Name" """ - if isinstance(text, list): - text = ''.join(text) - return Person.from_text(text).readable() _register_function(person_name) +def texcommand(context, text): + """Remove non-letters""" + if isinstance(text, list): + text = ''.join(text) + return re.sub(r'[^a-zA-Z]', '', text).strip() + + def reg_texcommand(): - def texcommand(context, text): - """Remove non-letters""" - if isinstance(text, list): - text = ''.join(text) - return re.sub(r'[^a-zA-Z]', '', text).strip() _register_function(texcommand) + def reg_get(format_): + def get(context, *args): + obj = format_ + for arg in args: + if hasattr(obj, arg): + obj = getattr(obj, arg) + else: + try: + obj = obj[arg] + except (TypeError, KeyError), e: + # Just raise proper AttributeError. + getattr(obj, arg) + return obj + _register_function(get) diff --combined librarian/parser.py index 9068fc0,d330a72..b2ad98a --- a/librarian/parser.py +++ b/librarian/parser.py @@@ -4,8 -4,8 +4,8 @@@ # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # from librarian import ValidationError, NoDublinCore, ParseError, NoProvider - from librarian import RDFNS - from librarian.cover import WLCover + from librarian import RDFNS, IOFile + from librarian.styles.wolnelektury.cover import WLCover from librarian import dcparser from xml.parsers.expat import ExpatError @@@ -20,58 -20,68 +20,68 @@@ class WLDocument(object) LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE) provider = None - def __init__(self, edoc, parse_dublincore=True, provider=None, - strict=False, meta_fallbacks=None): - self.edoc = edoc + _edoc = None + @property + def edoc(self): + if self._edoc is None: + data = self.source.get_string() + if not isinstance(data, unicode): + data = data.decode('utf-8') + data = data.replace(u'\ufeff', '') + try: + parser = etree.XMLParser(remove_blank_text=False) + self._edoc = etree.parse(StringIO(data.encode('utf-8')), parser) + except (ExpatError, XMLSyntaxError, XSLTApplyError), e: + raise ParseError(e) + return self._edoc + + _rdf_elem = None + @property + def rdf_elem(self): + if self._rdf_elem is None: + dc_path = './/' + RDFNS('RDF') + self._rdf_elem = self.edoc.getroot().find(dc_path) + if self._rdf_elem is None: + raise NoDublinCore('Document has no DublinCore - which is required.') + return self._rdf_elem + + _book_info = None + @property + def book_info(self): + if not self.parse_dublincore: + return None + if self._book_info is None: + self._book_info = dcparser.BookInfo.from_element( + self.rdf_elem, fallbacks=self.meta_fallbacks, strict=self.strict) + return self._book_info + + def __init__(self, iofile, provider=None, + parse_dublincore=True, # shouldn't it be in a subclass? + strict=False, # ? + meta_fallbacks=None # ? + ): + self.source = iofile self.provider = provider - - root_elem = edoc.getroot() - - dc_path = './/' + RDFNS('RDF') - - if root_elem.tag != 'utwor': + self.parse_dublincore = parse_dublincore + self.strict = strict + self.meta_fallbacks = meta_fallbacks + if self.edoc.getroot().tag != 'utwor': raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag) - if parse_dublincore: - self.rdf_elem = root_elem.find(dc_path) - - if self.rdf_elem is None: - raise NoDublinCore('Document has no DublinCore - which is required.') - - self.book_info = dcparser.BookInfo.from_element( - self.rdf_elem, fallbacks=meta_fallbacks, strict=strict) - else: - self.book_info = None + self.book_info @classmethod def from_string(cls, xml, *args, **kwargs): - return cls.from_file(StringIO(xml), *args, **kwargs) + return cls(IOFile.from_string(xml), *args, **kwargs) @classmethod def from_file(cls, xmlfile, *args, **kwargs): - - # first, prepare for parsing if isinstance(xmlfile, basestring): - file = open(xmlfile, 'rb') - try: - data = file.read() - finally: - file.close() + iofile = IOFile.from_filename(xmlfile) else: - data = xmlfile.read() - - if not isinstance(data, unicode): - data = data.decode('utf-8') + iofile = IOFile.from_file(xmlfile) + return cls(iofile, *args, **kwargs) - data = data.replace(u'\ufeff', '') - - try: - parser = etree.XMLParser(remove_blank_text=False) - tree = etree.parse(StringIO(data.encode('utf-8')), parser) - - return cls(tree, *args, **kwargs) - except (ExpatError, XMLSyntaxError, XSLTApplyError), e: - raise ParseError(e) def swap_endlines(self): """Converts line breaks in stanzas into
tags.""" @@@ -95,10 -105,10 +105,10 @@@ elem.text = chunks.pop(0) def parts(self): - if self.provider is None: - raise NoProvider('No document provider supplied.') if self.book_info is None: raise NoDublinCore('No Dublin Core in document.') + if self.book_info.parts and self.provider is None: + raise NoProvider('No document provider supplied.') for part_uri in self.book_info.parts: yield self.from_file(self.provider.by_uri(part_uri), provider=self.provider) @@@ -183,7 -193,7 +193,7 @@@ # Converters def as_html(self, *args, **kwargs): - from librarian import html + from librarian import pyhtml as html return html.transform(self, *args, **kwargs) def as_text(self, *args, **kwargs):