importbooks / tasks for indexing
[wolnelektury.git] / apps / oai / handlers.py
1 from oaipmh import server, common, metadata, error
2 from catalogue.models import Book, Tag
3 from api.models import Deleted
4 from api.handlers import WL_BASE
5 from librarian.dcparser import BookInfo
6 from django.contrib.contenttypes.models import ContentType
7 from django.contrib.auth.models import User
8 from datetime import datetime
9 from lxml import etree
10 from lxml.etree import ElementTree
11 from django.db.models import Q
12 from django.conf import settings
13 from django.contrib.sites.models import Site
14
15
# Metadata reader for the RDF/Dublin Core description found in a book's XML.
# Each field maps to a (value type, XPath) pair; some fields take the union
# of several qualified DC elements (e.g. the three dc:subject.* variants).
wl_dc_reader = metadata.MetadataReader(
    namespaces={
        'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
        'dc': 'http://purl.org/dc/elements/1.1/',
    },
    fields={
        'title': (
            'textList',
            'rdf:RDF/rdf:Description/dc:title/text()',
        ),
        'creator': (
            'textList',
            'rdf:RDF/rdf:Description/dc:creator/text()',
        ),
        'subject': (
            'textList',
            'rdf:RDF/rdf:Description/dc:subject.period/text()'
            ' | rdf:RDF/rdf:Description/dc:subject.type/text()'
            ' | rdf:RDF/rdf:Description/dc:subject.genre/text()',
        ),
        'description': (
            'textList',
            'rdf:RDF/rdf:Description/dc:description/text()',
        ),
        'publisher': (
            'textList',
            'rdf:RDF/rdf:Description/dc:publisher/text()',
        ),
        'contributor': (
            'textList',
            'rdf:RDF/rdf:Description/dc:contributor.editor/text()'
            ' | rdf:RDF/rdf:Description/dc:contributor.translator/text()'
            ' | rdf:RDF/rdf:Description/dc:contributor.technical_editor/text()',
        ),
        'date': (
            'textList',
            'rdf:RDF/rdf:Description/dc:date/text()',
        ),
        'type': (
            'textList',
            'rdf:RDF/rdf:Description/dc:type/text()',
        ),
        'format': (
            'textList',
            'rdf:RDF/rdf:Description/dc:format/text()',
        ),
        'identifier': (
            'textList',
            'rdf:RDF/rdf:Description/dc:identifier.url/text()',
        ),
        'source': (
            'textList',
            'rdf:RDF/rdf:Description/dc:source/text()',
        ),
        'language': (
            'textList',
            'rdf:RDF/rdf:Description/dc:language/text()',
        ),
        # Not exposed at the moment:
        # 'relation': ('textList', 'rdf:RDF/rdf:Description/dc:relation/text()'),
        # 'coverage': ('textList', 'rdf:RDF/rdf:Description/dc:coverage/text()'),
        'rights': (
            'textList',
            'rdf:RDF/rdf:Description/dc:rights/text()',
        ),
    },
)
38
39
class Catalogue(common.ResumptionOAIPMH):
    """OAI-PMH backend serving books and the records of their deletion.

    ``Book`` rows are exposed as live records and ``Deleted`` rows (with a
    non-empty slug) as deleted records.  Sets and resumption tokens are not
    supported: every request for them is answered with the matching OAI
    error.
    """

    # Tag categories that would form the first part of a "category:slug"
    # setSpec (see ``setspec_to_tag``).
    TAG_CATEGORIES = ['author', 'epoch', 'kind', 'genre']

    def __init__(self, metadata_registry):
        """Remember the metadata registry and compute repository constants.

        Sets ``self.oai_id`` (a ``%s`` template turning a slug into an OAI
        identifier for the current site) and ``self.earliest_datestamp``
        (the oldest ``changed_at`` / ``deleted_at`` found, or 1990-01-01 when
        there are no rows to look at).
        """
        super(Catalogue, self).__init__()
        self.metadata_registry = metadata_registry

        self.oai_id = "oai:" + Site.objects.get_current().domain + ":%s"

        # Fallback datestamp for a repository with no books and no deletions.
        year_zero = datetime(1990, 1, 1, 0, 0, 0)

        candidates = [
            stamp for stamp in (
                self._earliest(Book.objects.all(), 'changed_at'),
                self._earliest(
                    Deleted.objects.exclude(slug__exact=u''), 'deleted_at'),
            )
            if stamp is not None
        ]
        if candidates:
            self.earliest_datestamp = min(candidates)
        else:
            self.earliest_datestamp = year_zero

    @staticmethod
    def _earliest(queryset, field):
        """Return the smallest value of `field` in `queryset`, or None if empty."""
        try:
            return getattr(queryset.order_by(field)[0], field)
        except IndexError:
            # Indexing an empty queryset: there is simply no row yet.
            return None

    def metadata(self, book):
        """Parse the book's XML file and return its metadata as a map.

        The file is closed whether or not parsing succeeds.
        """
        try:
            xml = etree.parse(book.xml_file)
        finally:
            book.xml_file.close()
        return wl_dc_reader(xml).getMap()

    def record_for_book(self, book, headers_only=False):
        """Build the OAI record for a ``Book`` or a ``Deleted`` entry.

        Returns just the header when `headers_only` is true, otherwise a
        ``(header, metadata, about)`` tuple; ``about`` is always None and a
        deleted entry gets empty metadata.

        Raises TypeError when `book` is neither a ``Book`` nor a ``Deleted``.
        """
        if isinstance(book, Book):
            datestamp, is_deleted = book.changed_at, False
        elif isinstance(book, Deleted):
            datestamp, is_deleted = book.deleted_at, True
        else:
            raise TypeError(
                "Expected a Book or Deleted instance, got %r" % (type(book),))

        # The setSpec list stays empty because sets are not supported.
        header = common.Header(
            self.slug_to_identifier(book.slug), datestamp, [], is_deleted)
        if headers_only:
            return header

        if is_deleted:
            meta = common.Metadata({})
        else:
            meta = common.Metadata(self.metadata(book))
        return header, meta, None

    def identify(self, **kw):
        """Return the repository description for the Identify verb."""
        return common.Identify(
            'Wolne Lektury',  # repository name
            '%s/oaipmh' % WL_BASE,  # base URL of the OAI endpoint
            '2.0',  # version
            [m[1] for m in settings.MANAGERS],  # adminEmails
            self.earliest_datestamp,  # earliest datestamp of any change
            'persistent',  # deletedRecord
            'YYYY-MM-DDThh:mm:ssZ',  # granularity
            ['identity'],  # compression
            []  # descriptions
            )

    def books(self, tag, from_, until):
        """Return the books followed by the deletions within a date range.

        `from_` and `until` are optional inclusive bounds applied to
        ``changed_at`` (books) and ``deleted_at`` (deletions).  Each of the
        two groups is ordered by its own datestamp.

        Raises NoSetHierarchyError when `tag` is given: sets are not
        supported, since they are problematic for deleted books.
        """
        if tag:
            raise error.NoSetHierarchyError(
                "Wolne Lektury does not support sets.")

        books = Book.objects.all().order_by('changed_at')
        deleted = Deleted.objects.exclude(slug__exact=u'').order_by('deleted_at')
        if from_:
            books = books.filter(changed_at__gte=from_)
            deleted = deleted.filter(deleted_at__gte=from_)
        if until:
            books = books.filter(changed_at__lte=until)
            deleted = deleted.filter(deleted_at__lte=until)
        return list(books) + list(deleted)

    @staticmethod
    def tag_to_setspec(tag):
        """Format a tag as a "category:slug" setSpec string."""
        return "%s:%s" % (tag.category, tag.slug)

    @staticmethod
    def setspec_to_tag(s):
        """Look up the ``Tag`` named by a "category:slug" setSpec.

        Returns None for an empty setSpec.  Raises NoSetHierarchyError when
        the setSpec does not have exactly two components or its category is
        not one of ``TAG_CATEGORIES``.
        """
        if not s:
            return None
        parts = s.split(':')
        if len(parts) != 2:
            raise error.NoSetHierarchyError(
                "Setspec should have two components: category:slug")
        category, slug = parts
        if category not in Catalogue.TAG_CATEGORIES:
            raise error.NoSetHierarchyError("No category part in set")
        return Tag.objects.get(slug=slug, category=category)

    def getRecord(self, **kw):
        """Return (header, metadata, about) for the record in kw['identifier'].

        A live book wins over a deletion entry with the same slug.  Raises
        IdDoesNotExistError when neither can be found.
        """
        slug = self.identifier_to_slug(kw['identifier'])
        try:
            book = Book.objects.get(slug=slug)
        except Book.DoesNotExist:
            book = None
        if book is not None:
            return self.record_for_book(book)

        book_type = ContentType.objects.get_for_model(Book)
        try:
            deleted_book = Deleted.objects.get(content_type=book_type,
                                               slug=slug)
        except (Deleted.DoesNotExist, Deleted.MultipleObjectsReturned):
            # An ambiguous match is reported the same way as a missing one.
            raise error.IdDoesNotExistError("No item for this identifier")
        return self.record_for_book(deleted_book)

    def validate_kw(self, kw):
        """Reject request arguments this repository cannot honour.

        Raises BadResumptionTokenError for any resumption token and
        CannotDisseminateFormatError for a metadataPrefix the registry has
        no writer for.
        """
        if 'resumptionToken' in kw:
            raise error.BadResumptionTokenError(
                "No resumption token support at this point")
        if ('metadataPrefix' in kw and
                not self.metadata_registry.hasWriter(kw['metadataPrefix'])):
            raise error.CannotDisseminateFormatError(
                "This format is not supported")

    def identifier_to_slug(self, ident):
        """Return the part of an OAI identifier after its last colon."""
        return ident.split(':')[-1]

    def slug_to_identifier(self, slug):
        """Return the OAI identifier of `slug` on the current site."""
        return self.oai_id % slug

    def listIdentifiers(self, **kw):
        """Return (headers, None) for everything in the requested date range.

        The second element is the resumption token, which is never issued.
        """
        self.validate_kw(kw)
        matching = self.books(None, kw.get('from_'), kw.get('until'))
        headers = [self.record_for_book(book, headers_only=True)
                   for book in matching]
        return headers, None

    def listRecords(self, **kw):
        """Return (records, None) for everything in the requested date range.

        Each record is a (header, metadata, about) tuple.  The second
        element is the resumption token, which is never issued.
        """
        self.validate_kw(kw)
        matching = self.books(None, kw.get('from_'), kw.get('until'))
        records = [self.record_for_book(book) for book in matching]
        return records, None

    def listMetadataFormats(self, **kw):
        """Return the metadata formats available, optionally for one record.

        Without an identifier, or for an existing book, the single oai_dc
        format is returned; a deleted book has no formats.  Raises
        IdDoesNotExistError when the identifier matches neither.
        """
        formats = [('oai_dc',
                    'http://www.openarchives.org/OAI/2.0/oai_dc.xsd',
                    server.NS_OAIDC)]
        if 'identifier' not in kw:
            return formats

        slug = self.identifier_to_slug(kw['identifier'])
        if Book.objects.filter(slug=slug).exists():
            return formats
        # exists() also copes with several deletion entries sharing a slug.
        if Deleted.objects.filter(slug=slug).exists():
            return []
        raise error.IdDoesNotExistError("This id does not exist")

    def listSets(self, **kw):
        """Always raise NoSetHierarchyError: sets are not supported."""
        raise error.NoSetHierarchyError("Wolne Lektury does not support sets.")
        # If sets are ever enabled, one per tag in TAG_CATEGORIES:
        # tags = []
        # for category in Catalogue.TAG_CATEGORIES:
        #     for tag in Tag.objects.filter(category=category):
        #         tags.append(("%s:%s" % (tag.category, tag.slug),
        #                      tag.name,
        #                      tag.description))
        # return tags, None
216
217