From: Marek Stępniowski Date: Sat, 6 Sep 2008 13:45:46 +0000 (+0200) Subject: Made dcparser a submodule of librarian. X-Git-Url: https://git.mdrn.pl/wolnelektury.git/commitdiff_plain/8c0d2b20aee2edeb9bd8639ee7fb1887359e9797 Made dcparser a submodule of librarian. Fixed imports in application code. --- diff --git a/catalogue/models.py b/catalogue/models.py index 0d3df1603..057236dc2 100644 --- a/catalogue/models.py +++ b/catalogue/models.py @@ -10,7 +10,7 @@ from django.utils.safestring import mark_safe from newtagging.models import TagBase from newtagging import managers -from librarian import html +from librarian import html, dcparser TAG_CATEGORIES = ( @@ -128,7 +128,6 @@ class Book(models.Model): def from_xml_file(xml_file): from tempfile import NamedTemporaryFile from slughifi import slughifi - import dcparser from markupstring import MarkupString # Read book metadata diff --git a/lib/dcparser/__init__.py b/lib/dcparser/__init__.py deleted file mode 100644 index b92af61fe..000000000 --- a/lib/dcparser/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# -*- coding: utf-8 -*- - -from dcparser import parse, ParseError, Person - diff --git a/lib/dcparser/dcparser.py b/lib/dcparser/dcparser.py deleted file mode 100644 index 9281325c6..000000000 --- a/lib/dcparser/dcparser.py +++ /dev/null @@ -1,179 +0,0 @@ -# -*- coding: utf-8 -*- -from xml.parsers.expat import ExpatError -from datetime import date -import time - -# Import ElementTree from anywhere -try: - import xml.etree.ElementTree as etree # Python >= 2.5 -except ImportError: - try: - import elementtree.ElementTree as etree # effbot's pure Python module - except ImportError: - import lxml.etree as etree # ElementTree API using libxml2 - - -# ============== -# = Converters = -# ============== -class Person(object): - """Single person with last name and a list of first names.""" - def __init__(self, last_name, *first_names): - self.last_name = last_name - self.first_names = first_names - - - def __eq__(self, right): - return self.last_name == right.last_name and self.first_names == right.first_names - - - def __unicode__(self): - if len(self.first_names) > 0: - return '%s, %s' % (self.last_name, ' '.join(self.first_names)) - else: - return self.last_name - - - def __repr__(self): - return 'Person(last_name=%r, first_names=*%r)' % (self.last_name, self.first_names) - - -def str_to_unicode(value, previous): - return unicode(value) - - -def str_to_unicode_list(value, previous): - if previous is None: - previous = [] - previous.append(str_to_unicode(value)) - return previous - - -def str_to_person(value, previous): - comma_count = value.count(',') - - if comma_count == 0: - last_name, first_names = value, [] - elif comma_count == 1: - last_name, first_names = value.split(',') - first_names = [name for name in first_names.split(' ') if len(name)] - else: - raise ValueError("value contains more than one comma: %r" % value) - - return Person(last_name.strip(), *first_names) - - -def str_to_date(value, previous): - try: - t = time.strptime(value, '%Y-%m-%d') - except ValueError: - t = time.strptime(value, '%Y') - return date(t[0], t[1], t[2]) - - -# ========== -# = Parser = -# ========== -class ParseError(Exception): - def __init__(self, message): - super(self, Exception).__init__(message) - - -class XMLNamespace(object): - '''Represents XML namespace.''' - - def __init__(self, uri): - self.uri = uri - - def __call__(self, tag): - return '{%s}%s' % (self.uri, tag) - - def __contains__(self, tag): - return tag.startswith(str(self)) - - def __repr__(self): - return 'XMLNamespace(%r)' % self.uri - - def __str__(self): - return '%s' % self.uri - - -class BookInfo(object): - RDF = XMLNamespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#') - DC = XMLNamespace('http://purl.org/dc/elements/1.1/') - - mapping = { - DC('creator') : ('author', str_to_person), - DC('title') : ('title', str_to_unicode), - DC('subject.period') : ('epoch', str_to_unicode), - DC('subject.type') : ('kind', str_to_unicode), - DC('subject.genre') : ('genre', str_to_unicode), - DC('date') : ('created_at', str_to_date), - DC('date.pd') : ('released_to_public_domain_at', str_to_date), - DC('contributor.translator') : ('translator', str_to_person), - DC('contributor.technical_editor') : ('technical_editor', str_to_person), - DC('publisher') : ('publisher', str_to_unicode), - DC('source') : ('source_name', str_to_unicode), - DC('source.URL') : ('source_url', str_to_unicode), - DC('identifier.url') : ('url', str_to_unicode), - DC('relation.hasPart') : ('parts', str_to_unicode_list), - } - - @classmethod - def from_string(cls, xml): - from StringIO import StringIO - return cls.from_file(StringIO(xml)) - - @classmethod - def from_file(cls, xml_file): - book_info = cls() - - try: - tree = etree.parse(xml_file) - except ExpatError, e: - raise ParseError(e) - - description = tree.find('//' + book_info.RDF('Description')) - if description is None: - raise ParseError('no Description tag found in document') - - for element in description.findall('*'): - book_info.parse_element(element) - - return book_info - - def parse_element(self, element): - try: - attribute, converter = self.mapping[element.tag] - setattr(self, attribute, converter(element.text, getattr(self, attribute, None))) - except KeyError: - pass - - def to_xml(self): - """XML representation of this object.""" - etree._namespace_map[str(self.RDF)] = 'rdf' - etree._namespace_map[str(self.DC)] = 'dc' - - root = etree.Element(self.RDF('RDF')) - description = etree.SubElement(root, self.RDF('Description')) - - for tag, (attribute, converter) in self.mapping.iteritems(): - if hasattr(self, attribute): - e = etree.Element(tag) - e.text = unicode(getattr(self, attribute)) - description.append(e) - - return unicode(etree.tostring(root, 'utf-8'), 'utf-8') - - -def parse(file_name): - return BookInfo.from_file(file_name) - - -if __name__ == '__main__': - import sys - - info = parse(sys.argv[1]) - for attribute, _ in BookInfo.mapping.values(): - print '%s: %r' % (attribute, getattr(info, attribute, None)) - diff --git a/lib/librarian/dcparser.py b/lib/librarian/dcparser.py new file mode 100644 index 000000000..9281325c6 --- /dev/null +++ b/lib/librarian/dcparser.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- +from xml.parsers.expat import ExpatError +from datetime import date +import time + +# Import ElementTree from anywhere +try: + import xml.etree.ElementTree as etree # Python >= 2.5 +except ImportError: + try: + import elementtree.ElementTree as etree # effbot's pure Python module + except ImportError: + import lxml.etree as etree # ElementTree API using libxml2 + + +# ============== +# = Converters = +# ============== +class Person(object): + """Single person with last name and a list of first names.""" + def __init__(self, last_name, *first_names): + self.last_name = last_name + self.first_names = first_names + + + def __eq__(self, right): + return self.last_name == right.last_name and self.first_names == right.first_names + + + def __unicode__(self): + if len(self.first_names) > 0: + return '%s, %s' % (self.last_name, ' '.join(self.first_names)) + else: + return self.last_name + + + def __repr__(self): + return 'Person(last_name=%r, first_names=*%r)' % (self.last_name, self.first_names) + + +def str_to_unicode(value, previous): + return unicode(value) + + +def str_to_unicode_list(value, previous): + if previous is None: + previous = [] + previous.append(str_to_unicode(value)) + return previous + + +def str_to_person(value, previous): + comma_count = value.count(',') + + if comma_count == 0: + last_name, first_names = value, [] + elif comma_count == 1: + last_name, first_names = value.split(',') + first_names = [name for name in first_names.split(' ') if len(name)] + else: + raise ValueError("value contains more than one comma: %r" % value) + + return Person(last_name.strip(), *first_names) + + +def str_to_date(value, previous): + try: + t = time.strptime(value, '%Y-%m-%d') + except ValueError: + t = time.strptime(value, '%Y') + return date(t[0], t[1], t[2]) + + +# ========== +# = Parser = +# ========== +class ParseError(Exception): + def __init__(self, message): + super(self, Exception).__init__(message) + + +class XMLNamespace(object): + '''Represents XML namespace.''' + + def __init__(self, uri): + self.uri = uri + + def __call__(self, tag): + return '{%s}%s' % (self.uri, tag) + + def __contains__(self, tag): + return tag.startswith(str(self)) + + def __repr__(self): + return 'XMLNamespace(%r)' % self.uri + + def __str__(self): + return '%s' % self.uri + + +class BookInfo(object): + RDF = XMLNamespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#') + DC = XMLNamespace('http://purl.org/dc/elements/1.1/') + + mapping = { + DC('creator') : ('author', str_to_person), + DC('title') : ('title', str_to_unicode), + DC('subject.period') : ('epoch', str_to_unicode), + DC('subject.type') : ('kind', str_to_unicode), + DC('subject.genre') : ('genre', str_to_unicode), + DC('date') : ('created_at', str_to_date), + DC('date.pd') : ('released_to_public_domain_at', str_to_date), + DC('contributor.translator') : ('translator', str_to_person), + DC('contributor.technical_editor') : ('technical_editor', str_to_person), + DC('publisher') : ('publisher', str_to_unicode), + DC('source') : ('source_name', str_to_unicode), + DC('source.URL') : ('source_url', str_to_unicode), + DC('identifier.url') : ('url', str_to_unicode), + DC('relation.hasPart') : ('parts', str_to_unicode_list), + } + + @classmethod + def from_string(cls, xml): + from StringIO import StringIO + return cls.from_file(StringIO(xml)) + + @classmethod + def from_file(cls, xml_file): + book_info = cls() + + try: + tree = etree.parse(xml_file) + except ExpatError, e: + raise ParseError(e) + + description = tree.find('//' + book_info.RDF('Description')) + if description is None: + raise ParseError('no Description tag found in document') + + for element in description.findall('*'): + book_info.parse_element(element) + + return book_info + + def parse_element(self, element): + try: + attribute, converter = self.mapping[element.tag] + setattr(self, attribute, converter(element.text, getattr(self, attribute, None))) + except KeyError: + pass + + def to_xml(self): + """XML representation of this object.""" + etree._namespace_map[str(self.RDF)] = 'rdf' + etree._namespace_map[str(self.DC)] = 'dc' + + root = etree.Element(self.RDF('RDF')) + description = etree.SubElement(root, self.RDF('Description')) + + for tag, (attribute, converter) in self.mapping.iteritems(): + if hasattr(self, attribute): + e = etree.Element(tag) + e.text = unicode(getattr(self, attribute)) + description.append(e) + + return unicode(etree.tostring(root, 'utf-8'), 'utf-8') + + +def parse(file_name): + return BookInfo.from_file(file_name) + + +if __name__ == '__main__': + import sys + + info = parse(sys.argv[1]) + for attribute, _ in BookInfo.mapping.values(): + print '%s: %r' % (attribute, getattr(info, attribute, None)) +