pipeline bug
[wolnelektury.git] / apps / oai / handlers.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from oaipmh import server, common, metadata, error
6 from catalogue.models import Book, Tag
7 from api.models import Deleted
8 from api.handlers import WL_BASE
9 from librarian import WLURI
10 from django.contrib.contenttypes.models import ContentType
11 from datetime import datetime
12 from lxml import etree
13 from django.conf import settings
14 from django.contrib.sites.models import Site
15 from django.utils import timezone
16
17
def make_time_naive(d):
    """Return aware datetime *d* as a naive datetime in the current time zone.

    The value is first converted with ``timezone.localtime`` and then has
    its ``tzinfo`` removed.

    NOTE(review): Catalogue.identify() advertises a 'Z'-suffixed granularity
    while this yields local wall-clock time - confirm whether UTC was meant.
    """
    localized = timezone.localtime(d)
    return localized.replace(tzinfo=None)
19
# XPath template selecting the text of one Dublin Core element.  The leading
# '(.|*)' allows rdf:RDF to sit directly under the context node or under any
# of its child elements.
WL_DC_READER_XPATH = '(.|*)/rdf:RDF/rdf:Description/%s/text()'

# Reader that maps the RDF/DC header of a book's XML onto OAI field names.
# Each output field is fed by one or more source elements; several elements
# are combined into a single XPath union ("a | b | c").
# 'isPartOf' is not read here (Catalogue.metadata() fills it in from the
# book's parent); 'relation' and 'coverage' are not exposed at all.
wl_dc_reader = metadata.MetadataReader(
    fields=dict(
        (
            field,
            ('textList',
             ' | '.join(WL_DC_READER_XPATH % element for element in elements)),
        )
        for field, elements in (
            ('title', ('dc:title',)),
            ('creator', ('dc:creator',)),
            ('subject', ('dc:subject.period',
                         'dc:subject.type',
                         'dc:subject.genre')),
            ('description', ('dc:description',)),
            ('publisher', ('dc:publisher',)),
            ('contributor', ('dc:contributor.editor',
                             'dc:contributor.translator',
                             'dc:contributor.technical_editor')),
            ('date', ('dc:date',)),
            ('type', ('dc:type',)),
            ('format', ('dc:format',)),
            ('identifier', ('dc:identifier.url',)),
            ('source', ('dc:source',)),
            ('language', ('dc:language',)),
            ('hasPart', ('dc:relation.hasPart',)),
            ('rights', ('dc:rights',)),
        )
    ),
    namespaces={
        'dc': 'http://purl.org/dc/elements/1.1/',
        'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    }
)
47
48
# Namespace URI of the Dublin Core "terms" vocabulary.
NS_DCTERMS = "http://purl.org/dc/terms/"


def nsdcterms(name):
    """Return *name* qualified with the DC terms namespace.

    The result has the '{namespace-uri}local-name' form, e.g.
    '{http://purl.org/dc/terms/}title'.
    """
    namespace_part = '{%s}' % NS_DCTERMS
    # Wrap in a 1-tuple so a tuple-valued *name* is formatted as a whole.
    return namespace_part + '%s' % (name,)
54
55
class Catalogue(common.ResumptionOAIPMH):
    """OAI-PMH backend serving the book catalogue.

    Existing books come from ``Book`` and removed ones from ``Deleted``
    (the latter are reported with the "deleted" header flag set).  Sets and
    resumption tokens are rejected with the matching OAI errors.
    """
    TAG_CATEGORIES = ['author', 'epoch', 'kind', 'genre']

    def __init__(self, metadata_registry):
        """Remember the registry and compute the earliest known datestamp.

        :param metadata_registry: registry queried through ``hasWriter()``
            when validating a requested ``metadataPrefix``.
        """
        super(Catalogue, self).__init__()
        self.metadata_registry = metadata_registry

        # Template for record identifiers; '%s' is replaced by the slug.
        self.oai_id = "oai:" + Site.objects.get_current().domain + ":%s"

        # Fallback datestamp used when a table yields no usable row.
        year_zero = timezone.make_aware(datetime(1990, 1, 1, 0, 0, 0), timezone.utc)

        # Both lookups are best effort: an empty table (IndexError) or any
        # other lookup failure falls back to year_zero instead of breaking
        # construction.  Unlike a bare ``except:``, this lets
        # KeyboardInterrupt/SystemExit propagate.
        try:
            earliest_change = Book.objects.order_by('changed_at')[0].changed_at
        except Exception:
            earliest_change = year_zero

        try:
            # Fixed: this used to call the non-existent ``ordery_by`` and so
            # always fell back to year_zero.
            earliest_delete = Deleted.objects.exclude(
                slug__exact=u'').order_by('deleted_at')[0].deleted_at
        except Exception:
            earliest_delete = year_zero

        self.earliest_datestamp = min(earliest_change, earliest_delete)

    def metadata(self, book):
        """Return the metadata map read from the book's XML file.

        The file is parsed with lxml and always closed afterwards.  When the
        book has a parent, an ``isPartOf`` entry holding the parent's WLURI
        is added to the map.
        """
        try:
            xml = etree.parse(book.xml_file)
        finally:
            book.xml_file.close()
        fields = wl_dc_reader(xml).getMap()
        if book.parent:
            fields['isPartOf'] = [str(WLURI.from_slug(book.parent.slug))]
        return fields

    def record_for_book(self, book, headers_only=False):
        """Build the OAI record for a ``Book`` or ``Deleted`` instance.

        :param book: a ``Book`` (live record) or ``Deleted`` (deleted record).
        :param headers_only: when true, return just the header.
        :returns: ``header`` if *headers_only*, else ``(header, meta, about)``
            where ``about`` is always ``None`` and ``meta`` is empty for
            deleted records.
        :raises TypeError: if *book* is neither a ``Book`` nor a ``Deleted``.
        """
        identifier = self.slug_to_identifier(book.slug)
        if isinstance(book, Book):
            datestamp, is_deleted = book.changed_at, False
        elif isinstance(book, Deleted):
            datestamp, is_deleted = book.deleted_at, True
        else:
            # Previously this fell through to an UnboundLocalError.
            raise TypeError(
                "Expected a Book or Deleted instance, got %r" % (type(book),))

        # Set specs are left empty: sets are not supported.
        header = common.Header(
            identifier, make_time_naive(datestamp), [], is_deleted)
        if headers_only:
            return header

        if is_deleted:
            meta = common.Metadata({})
        else:
            meta = common.Metadata(self.metadata(book))
        return header, meta, None

    def identify(self, **kw):
        """Return the ``common.Identify`` description of this repository."""
        return common.Identify(
            'Wolne Lektury',  # repository name
            '%s/oaipmh' % unicode(WL_BASE),  # URL of this endpoint
            '2.0',  # version
            [m[1] for m in settings.MANAGERS],  # adminEmails
            make_time_naive(self.earliest_datestamp),  # earliest datestamp of any change
            'persistent',  # deletedRecord
            'YYYY-MM-DDThh:mm:ssZ',  # granularity
            ['identity'],  # compression
            []  # descriptions
            )

    def books(self, tag, from_, until):
        """Return live books followed by deleted entries within a date range.

        :param tag: must be falsy; sets are not supported because they are
            problematic for deleted books.
        :param from_: optional inclusive lower bound on the change/delete time.
        :param until: optional inclusive upper bound on the change/delete time.
        :returns: a list of ``Book`` rows ordered by ``changed_at`` followed
            by ``Deleted`` rows (with a non-empty slug) ordered by
            ``deleted_at``.
        :raises error.NoSetHierarchyError: if *tag* is given.
        """
        if tag:
            raise error.NoSetHierarchyError("Wolne Lektury does not support sets.")

        books = Book.objects.all().order_by('changed_at')
        deleted = Deleted.objects.exclude(slug__exact=u'').order_by('deleted_at')
        if from_:
            books = books.filter(changed_at__gte=from_)
            deleted = deleted.filter(deleted_at__gte=from_)
        if until:
            books = books.filter(changed_at__lte=until)
            deleted = deleted.filter(deleted_at__lte=until)
        return list(books) + list(deleted)

    @staticmethod
    def tag_to_setspec(tag):
        """Return the ``category:slug`` set spec string for *tag*."""
        return "%s:%s" % (tag.category, tag.slug)

    @staticmethod
    def setspec_to_tag(s):
        """Return the ``Tag`` named by a ``category:slug`` set spec.

        :returns: ``None`` for an empty spec, otherwise the matching tag.
        :raises error.NoSetHierarchyError: if the spec does not have exactly
            two components or its category is not in ``TAG_CATEGORIES``.
        """
        if not s:
            return None
        cs = s.split(':')
        if len(cs) != 2:
            raise error.NoSetHierarchyError(
                "Setspec should have two components: category:slug")
        category, slug = cs
        if category not in Catalogue.TAG_CATEGORIES:
            raise error.NoSetHierarchyError("No category part in set")
        return Tag.objects.get(slug=slug, category=category)

    def getRecord(self, **kw):
        """Return ``(header, metadata, about)`` for the requested record.

        The slug taken from ``kw['identifier']`` is looked up among live
        books first and then among deleted books.

        :raises error.IdDoesNotExistError: if no single live or deleted book
            matches the identifier.
        """
        slug = self.identifier_to_slug(kw['identifier'])
        try:
            book = Book.objects.get(slug=slug)
        except Book.DoesNotExist:
            book_type = ContentType.objects.get_for_model(Book)
            try:
                book = Deleted.objects.get(content_type=book_type, slug=slug)
            except (Deleted.DoesNotExist, Deleted.MultipleObjectsReturned):
                # Only "no unambiguous match" means an unknown identifier;
                # other failures (e.g. database errors) now propagate.
                raise error.IdDoesNotExistError("No item for this identifier")
        # Built outside the try block so that an error raised while building
        # the record is not mistaken for a missing book.
        return self.record_for_book(book)

    def validate_kw(self, kw):
        """Reject request arguments this repository cannot honour.

        :raises error.BadResumptionTokenError: if a resumption token is given.
        :raises error.CannotDisseminateFormatError: if ``metadataPrefix``
            names a format the registry has no writer for.
        """
        if 'resumptionToken' in kw:
            raise error.BadResumptionTokenError("No resumption token support at this point")
        if 'metadataPrefix' in kw and not self.metadata_registry.hasWriter(kw['metadataPrefix']):
            raise error.CannotDisseminateFormatError("This format is not supported")

    def identifier_to_slug(self, ident):
        """Return the part of *ident* after its last ':'."""
        return ident.split(':')[-1]

    def slug_to_identifier(self, slug):
        """Return the OAI identifier for *slug* (inverse of identifier_to_slug)."""
        return self.oai_id % slug

    def listIdentifiers(self, **kw):
        """Return ``(headers, None)`` for all records in the requested range.

        The second element is the resumption token, which is never issued.
        """
        self.validate_kw(kw)
        selected = self.books(None, kw.get('from_', None), kw.get('until', None))
        headers = [self.record_for_book(book, headers_only=True)
                   for book in selected]
        return headers, None

    def listRecords(self, **kw):
        """Return ``(records, None)`` for all records in the requested range.

        Each record is a ``(header, metadata, about)`` tuple.  The second
        element is the resumption token, which is never issued.
        """
        self.validate_kw(kw)
        selected = self.books(None, kw.get('from_', None), kw.get('until', None))
        records = [self.record_for_book(book) for book in selected]
        return records, None

    def listMetadataFormats(self, **kw):
        """Return the metadata formats available, optionally for one record.

        Without an ``identifier`` argument all formats are returned.  With
        one, a live book yields all formats and a deleted book yields an
        empty list.

        :raises error.IdDoesNotExistError: if the identifier matches neither
            a live nor a deleted book.
        """
        formats = [
            ('oai_dc',
             'http://www.openarchives.org/OAI/2.0/oai_dc.xsd',
             server.NS_OAIDC),
            ('qdc',
             'http://dublincore.org/schemas/xmls/qdc/2006/01/06/dcterms.xsd',
             NS_DCTERMS)]
        if 'identifier' not in kw:
            return formats

        slug = self.identifier_to_slug(kw['identifier'])
        try:
            Book.objects.get(slug=slug)
            return formats
        except (Book.DoesNotExist, Book.MultipleObjectsReturned):
            pass  # not a live book; check the deleted ones below

        try:
            Deleted.objects.get(slug=slug)
        except (Deleted.DoesNotExist, Deleted.MultipleObjectsReturned):
            raise error.IdDoesNotExistError("This id does not exist")
        return []

    def listSets(self, **kw):
        """Always fail: sets are not supported.

        :raises error.NoSetHierarchyError: unconditionally.
        """
        raise error.NoSetHierarchyError("Wolne Lektury does not support sets.")