# -*- coding: utf-8 -*-
import time
from datetime import date
from xml.parsers.expat import ExpatError

# Import ElementTree from anywhere
try:
    import xml.etree.ElementTree as etree # Python >= 2.5
except ImportError:
    try:
        import elementtree.ElementTree as etree # effbot's pure Python module
    except ImportError:
        import lxml.etree as etree # ElementTree API using libxml2
class Person(object):
    """Single person with last name and a list of first names."""

    def __init__(self, last_name, *first_names):
        """Store the last name and any number of first names.

        ``first_names`` is collected from the positional arguments, so it is
        kept as a tuple.
        """
        self.last_name = last_name
        self.first_names = first_names

    def __eq__(self, right):
        """Two persons are equal when both name parts are equal."""
        if not isinstance(right, Person):
            # Let Python try the reflected comparison instead of failing
            # with AttributeError on objects that have no name attributes.
            return NotImplemented
        return (self.last_name == right.last_name
                and self.first_names == right.first_names)

    def __ne__(self, right):
        """Inverse of ``__eq__`` (Python 2 does not derive it automatically)."""
        outcome = self.__eq__(right)
        if outcome is NotImplemented:
            return outcome
        return not outcome

    def __hash__(self):
        """Hash on the same fields ``__eq__`` compares."""
        return hash((self.last_name, tuple(self.first_names)))

    def __unicode__(self):
        """Return ``"Last, First Second"``, or just the last name."""
        if self.first_names:
            return '%s, %s' % (self.last_name, ' '.join(self.first_names))
        return self.last_name

    def __repr__(self):
        return 'Person(last_name=%r, first_names=*%r)' % (self.last_name, self.first_names)
def str_to_unicode(value, previous):
    """Converter: return ``value`` as a unicode string.

    ``previous`` (the attribute's current value) is accepted only so that
    every converter shares the same ``(value, previous)`` signature; it is
    ignored here.
    """
    # NOTE(review): the body of this function was not visible in the reviewed
    # fragment; a plain unicode() conversion is what the name and the call
    # from str_to_unicode_list imply -- confirm against the original.
    return unicode(value)
def str_to_unicode_list(value, previous):
    """Converter: append ``value`` (as unicode) to the list ``previous``.

    ``previous`` is the list built by earlier calls, or ``None`` on the
    first call, in which case a new list is started.  The list is updated in
    place and returned so the caller can store it back on the attribute.
    """
    if previous is None:
        previous = []
    previous.append(str_to_unicode(value, None))
    return previous
def str_to_person(value, previous):
    """Converter: build a Person from ``"Last"`` or ``"Last, First Second"``.

    ``previous`` is ignored; it exists only to match the common converter
    signature.

    Raises ValueError when ``value`` contains more than one comma.
    """
    comma_count = value.count(',')

    if comma_count == 0:
        last_name, first_names = value, []
    elif comma_count == 1:
        last_name, first_names = value.split(',')
        # split() without arguments separates on any whitespace run, so
        # tabs or line breaks inside the XML text do not end up in a name.
        first_names = first_names.split()
    else:
        raise ValueError("value contains more than one comma: %r" % value)

    return Person(last_name.strip(), *first_names)
def str_to_date(value, previous):
    """Converter: parse ``YYYY-MM-DD`` or a bare ``YYYY`` into a date.

    A bare year yields January 1st of that year.  Surrounding whitespace is
    ignored.  ``previous`` is ignored; it exists only to match the common
    converter signature.

    Raises ValueError when ``value`` matches neither format.
    """
    value = value.strip()
    # Try the most specific format first.
    for date_format in ('%Y-%m-%d', '%Y'):
        try:
            t = time.strptime(value, date_format)
        except ValueError:
            continue
        return date(t[0], t[1], t[2])
    raise ValueError("invalid date (expected YYYY-MM-DD or YYYY): %r" % value)
class ParseError(Exception):
    """Raised when a document cannot be turned into book metadata."""

    def __init__(self, message):
        super(ParseError, self).__init__(message)
class XMLNamespace(object):
    '''Represents XML namespace.

    Calling an instance with a local tag name returns the qualified
    ``{uri}tag`` form; ``tag in namespace`` tests whether a qualified tag
    belongs to the namespace.
    '''

    def __init__(self, uri):
        self.uri = uri

    def __call__(self, tag):
        '''Return ``tag`` qualified with this namespace as ``{uri}tag``.'''
        return '{%s}%s' % (self.uri, tag)

    def __contains__(self, tag):
        '''Tell whether the qualified ``tag`` belongs to this namespace.'''
        # Qualified tags start with "{uri}"; the braces must be part of the
        # prefix, otherwise no tag produced by __call__ would ever match and
        # a longer URI sharing this prefix would be accepted by mistake.
        return tag.startswith('{%s}' % self.uri)

    def __repr__(self):
        return 'XMLNamespace(%r)' % self.uri

    def __str__(self):
        return '%s' % self.uri
class BookInfo(object):
    """Book metadata read from, and written back to, an RDF description.

    Every known Dublin Core child of ``rdf:Description`` is stored on the
    instance under the attribute name given in ``mapping``; the
    ``rdf:about`` attribute is stored as ``wiki_url``.
    """
    RDF = XMLNamespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
    DC = XMLNamespace('http://purl.org/dc/elements/1.1/')

    # Default so that to_xml()/to_dict() also work on an instance that was
    # not produced by from_file().
    wiki_url = None

    # Qualified tag -> (attribute name, converter).  A converter is called
    # as converter(element_text, previous_attribute_value) and its result is
    # stored on the attribute.
    mapping = {
        DC('creator'): ('author', str_to_person),
        DC('title'): ('title', str_to_unicode),
        DC('subject.period'): ('epoch', str_to_unicode),
        DC('subject.type'): ('kind', str_to_unicode),
        DC('subject.genre'): ('genre', str_to_unicode),
        DC('date'): ('created_at', str_to_date),
        DC('date.pd'): ('released_to_public_domain_at', str_to_date),
        DC('contributor.translator'): ('translator', str_to_person),
        DC('contributor.technical_editor'): ('technical_editor', str_to_person),
        DC('publisher'): ('publisher', str_to_unicode),
        DC('source'): ('source_name', str_to_unicode),
        DC('source.URL'): ('source_url', str_to_unicode),
        DC('identifier.url'): ('url', str_to_unicode),
        DC('relation.hasPart'): ('parts', str_to_unicode_list),
    }

    @classmethod
    def from_string(cls, xml):
        """Build a BookInfo from XML source held in a string."""
        from StringIO import StringIO
        return cls.from_file(StringIO(xml))

    @classmethod
    def from_file(cls, xml_file):
        """Build a BookInfo from a file name or file object.

        Raises ParseError when the XML is malformed or contains no
        ``rdf:Description`` element.
        """
        book_info = cls()

        # ElementTree >= 1.3 and lxml report malformed input with their own
        # ParseError class; older ElementTree lets ExpatError through.
        parse_errors = (ExpatError, getattr(etree, 'ParseError', ExpatError))
        try:
            tree = etree.parse(xml_file)
        except parse_errors as e:
            raise ParseError(e)

        # './/' is the supported spelling of "anywhere below the root"; a
        # leading '//' is deprecated in ElementTree.
        description = tree.find('.//' + cls.RDF('Description'))
        # Check before touching the element, so a missing Description is
        # reported as ParseError rather than AttributeError on None.
        if description is None:
            raise ParseError('no Description tag found in document')

        book_info.wiki_url = description.get(cls.RDF('about'), None)

        for element in description.findall('*'):
            book_info.parse_element(element)

        return book_info

    def parse_element(self, element):
        """Store one child element on the attribute ``mapping`` assigns it.

        Elements whose tag is not in ``mapping`` are ignored.
        """
        entry = self.mapping.get(element.tag)
        if entry is None:
            return
        attribute, converter = entry
        setattr(self, attribute, converter(element.text, getattr(self, attribute, None)))

    def _register_namespaces(self):
        """Make ElementTree serialize RDF and DC with the rdf:/dc: prefixes."""
        register = getattr(etree, 'register_namespace', None)
        for prefix, namespace in (('rdf', self.RDF), ('dc', self.DC)):
            if register is not None:
                register(prefix, str(namespace))
            else:
                # Implementations without register_namespace(): fall back to
                # the module's private prefix table.
                etree._namespace_map[str(namespace)] = prefix

    # NOTE(review): the def line of this method was not visible in the
    # reviewed fragment; the name to_xml is reconstructed -- confirm against
    # callers.
    def to_xml(self):
        """XML representation of this object."""
        self._register_namespaces()

        root = etree.Element(self.RDF('RDF'))
        description = etree.SubElement(root, self.RDF('Description'))

        if self.wiki_url is not None:
            description.set(self.RDF('about'), self.wiki_url)

        for tag, (attribute, converter) in self.mapping.items():
            if not hasattr(self, attribute):
                continue
            value = getattr(self, attribute)
            # List-valued attributes (built by str_to_unicode_list) become
            # one element per item rather than a single element holding the
            # text form of the whole list.
            items = value if isinstance(value, list) else [value]
            for item in items:
                e = etree.SubElement(description, tag)
                e.text = unicode(item)

        return unicode(etree.tostring(root, 'utf-8'), 'utf-8')

    # NOTE(review): the def line of this method was not visible in the
    # reviewed fragment; the name to_dict is reconstructed -- confirm against
    # callers.
    def to_dict(self):
        """Return the set attributes as unicode strings, plus ``'about'``."""
        # Kept from the original statements of this method, although nothing
        # is serialized to XML here.
        self._register_namespaces()

        result = {'about': self.wiki_url}
        for attribute, _ in self.mapping.values():
            if hasattr(self, attribute):
                result[attribute] = unicode(getattr(self, attribute))

        return result
def parse(file_name):
    """Read book metadata from ``file_name``; shortcut for BookInfo.from_file."""
    return BookInfo.from_file(file_name)
if __name__ == '__main__':
    # Command-line use: print every mapped attribute of the given document.
    import sys

    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE' % sys.argv[0])

    info = parse(sys.argv[1])
    for attribute, _ in BookInfo.mapping.values():
        # A single parenthesised argument prints the same on Python 2 and 3.
        print('%s: %r' % (attribute, getattr(info, attribute, None)))