librarian/dcparser.py

   1 # -*- coding: utf-8 -*-
   2 from xml.parsers.expat import ExpatError
   3 from datetime import date
   4 import time
   5
   6 # Import ElementTree from anywhere
   7 try:
   8     import xml.etree.ElementTree as etree # Python >= 2.5
   9 except ImportError:
  10     try:
  11         import elementtree.ElementTree as etree # effbot's pure Python module
  12     except ImportError:
  13         import lxml.etree as etree # ElementTree API using libxml2
  14
  15
  16 # ==============
  17 # = Converters =
  18 # ==============
  19 class Person(object):
  20     """Single person with last name and a list of first names."""
  21     def __init__(self, last_name, *first_names):
  22         self.last_name = last_name
  23         self.first_names = first_names
  24
  25
  26     def __eq__(self, right):
  27         return self.last_name == right.last_name and self.first_names == right.first_names
  28
  29
  30     def __unicode__(self):
  31         if len(self.first_names) > 0:
  32             return '%s, %s' % (self.last_name, ' '.join(self.first_names))
  33         else:
  34             return self.last_name
  35
  36
  37     def __repr__(self):
  38         return 'Person(last_name=%r, first_names=*%r)' % (self.last_name, self.first_names)
  39
  40
  41 def str_to_unicode(value, previous):
  42     return unicode(value)
  43
  44
  45 def str_to_unicode_list(value, previous):
  46     if previous is None:
  47         previous = []
  48     previous.append(str_to_unicode(value, None))
  49     return previous
  50
  51
  52 def str_to_person(value, previous):
  53     comma_count = value.count(',')
  54
  55     if comma_count == 0:
  56         last_name, first_names = value, []
  57     elif comma_count == 1:
  58         last_name, first_names = value.split(',')
  59         first_names = [name for name in first_names.split(' ') if len(name)]
  60     else:
  61         raise ValueError("value contains more than one comma: %r" % value)
  62
  63     return Person(last_name.strip(), *first_names)
  64
  65
  66 def str_to_date(value, previous):
  67     try:
  68         t = time.strptime(value, '%Y-%m-%d')
  69     except ValueError:
  70         t = time.strptime(value, '%Y')
  71     return date(t[0], t[1], t[2])
  72
  73
  74 # ==========
  75 # = Parser =
  76 # ==========
  77 class ParseError(Exception):
  78     def __init__(self, message):
  79         super(ParseError, self).__init__(message)
  80
  81
  82 class XMLNamespace(object):
  83     '''Represents XML namespace.'''
  84
  85     def __init__(self, uri):
  86         self.uri = uri
  87
  88     def __call__(self, tag):
  89         return '{%s}%s' % (self.uri, tag)
  90
  91     def __contains__(self, tag):
  92         return tag.startswith(str(self))
  93
  94     def __repr__(self):
  95         return 'XMLNamespace(%r)' % self.uri
  96
  97     def __str__(self):
  98         return '%s' % self.uri
  99
 100
 101 class BookInfo(object):
 102     RDF = XMLNamespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
 103     DC = XMLNamespace('http://purl.org/dc/elements/1.1/')
 104
 105     mapping = {
 106         DC('creator')        : ('author', str_to_person),
 107         DC('title')          : ('title', str_to_unicode),
 108         DC('subject.period') : ('epoch', str_to_unicode),
 109         DC('subject.type')   : ('kind', str_to_unicode),
 110         DC('subject.genre')  : ('genre', str_to_unicode),
 111         DC('date')           : ('created_at', str_to_date),
 112         DC('date.pd')        : ('released_to_public_domain_at', str_to_date),
 113         DC('contributor.translator') : ('translator', str_to_person),
 114         DC('contributor.technical_editor') : ('technical_editor', str_to_person),
 115         DC('publisher')      : ('publisher', str_to_unicode),
 116         DC('source')         : ('source_name', str_to_unicode),
 117         DC('source.URL')     : ('source_url', str_to_unicode),
 118         DC('identifier.url') : ('url', str_to_unicode),
 119         DC('relation.hasPart') : ('parts', str_to_unicode_list),
 120         DC('rights.license') : ('license', str_to_unicode),
 121         DC('rights')         : ('license_description', str_to_unicode),
 122     }
 123
 124     @classmethod
 125     def from_string(cls, xml):
 126         from StringIO import StringIO
 127         return cls.from_file(StringIO(xml))
 128
 129     @classmethod
 130     def from_file(cls, xml_file):
 131         book_info = cls()
 132
 133         try:
 134             tree = etree.parse(xml_file)
 135         except ExpatError, e:
 136             raise ParseError(e)
 137
 138         description = tree.find('//' + book_info.RDF('Description'))
 139         book_info.wiki_url = description.get(cls.RDF('about'), None)
 140
 141         if description is None:
 142             raise ParseError('no Description tag found in document')
 143
 144         for element in description.findall('*'):
 145             book_info.parse_element(element)
 146
 147         return book_info
 148
 149     def parse_element(self, element):
 150         try:
 151             attribute, converter = self.mapping[element.tag]
 152             setattr(self, attribute, converter(element.text, getattr(self, attribute, None)))
 153         except KeyError:
 154             pass
 155
 156     def to_xml(self):
 157         """XML representation of this object."""
 158         etree._namespace_map[str(self.RDF)] = 'rdf'
 159         etree._namespace_map[str(self.DC)] = 'dc'
 160
 161         root = etree.Element(self.RDF('RDF'))
 162         description = etree.SubElement(root, self.RDF('Description'))
 163
 164         if self.wiki_url:
 165             description.set(self.RDF('about'), self.wiki_url)
 166
 167         for tag, (attribute, converter) in self.mapping.iteritems():
 168             if hasattr(self, attribute):
 169                 e = etree.Element(tag)
 170                 e.text = unicode(getattr(self, attribute))
 171                 description.append(e)
 172
 173         return unicode(etree.tostring(root, 'utf-8'), 'utf-8')
 174
 175     def to_dict(self):
 176         etree._namespace_map[str(self.RDF)] = 'rdf'
 177         etree._namespace_map[str(self.DC)] = 'dc'
 178
 179         result = {'about': self.wiki_url}
 180         for tag, (attribute, converter) in self.mapping.iteritems():
 181             if hasattr(self, attribute):
 182                 result[attribute] = unicode(getattr(self, attribute))
 183
 184         return result
 185
 186
 187 def parse(file_name):
 188     return BookInfo.from_file(file_name)
 189
 190
 191 if __name__ == '__main__':
 192     import sys
 193
 194     info = parse(sys.argv[1])
 195     for attribute, _ in BookInfo.mapping.values():
 196         print '%s: %r' % (attribute, getattr(info, attribute, None))
 197