Cleaning: timezone issues, deprecated urls.py imports, missing notes.
[wolnelektury.git] / apps / oai / handlers.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from oaipmh import server, common, metadata, error
6 from catalogue.models import Book, Tag
7 from api.models import Deleted
8 from api.handlers import WL_BASE
9 from librarian.dcparser import BookInfo
10 from librarian import WLURI
11 from django.contrib.contenttypes.models import ContentType
12 from django.contrib.auth.models import User
13 from datetime import datetime
14 from lxml import etree
15 from lxml.etree import ElementTree
16 from django.db.models import Q
17 from django.conf import settings
18 from django.contrib.sites.models import Site
19 from django.utils import timezone
20
21
def make_time_naive(d):
    """Return *d* as a naive datetime expressed in the current time zone.

    The aware datetime ``d`` is first converted with ``timezone.localtime``
    and then stripped of its ``tzinfo``.

    NOTE(review): the result is local time, while ``Catalogue.identify``
    advertises a granularity ending in ``Z`` -- confirm whether UTC is
    expected by consumers before changing this.
    """
    return timezone.localtime(d).replace(tzinfo=None)
23
# XPath template for a Dublin Core element inside rdf:RDF/rdf:Description.
# The "(.|*)" prefix starts the search at the context node or at any of its
# child elements.
WL_DC_READER_XPATH = '(.|*)/rdf:RDF/rdf:Description/%s/text()'


def _wl_dc_xpath(*dc_fields):
    """Return an XPath union (joined with ' | ') over the given DC elements."""
    return ' | '.join(WL_DC_READER_XPATH % dc_field for dc_field in dc_fields)


# Reader mapping the book's RDF description onto Dublin Core fields.
# 'relation' and 'coverage' are not read; 'isPartOf' is not read from the
# XML either.
wl_dc_reader = metadata.MetadataReader(
    fields={
        'title': ('textList', _wl_dc_xpath('dc:title')),
        'creator': ('textList', _wl_dc_xpath('dc:creator')),
        'subject': ('textList', _wl_dc_xpath(
            'dc:subject.period',
            'dc:subject.type',
            'dc:subject.genre')),
        'description': ('textList', _wl_dc_xpath('dc:description')),
        'publisher': ('textList', _wl_dc_xpath('dc:publisher')),
        'contributor': ('textList', _wl_dc_xpath(
            'dc:contributor.editor',
            'dc:contributor.translator',
            'dc:contributor.technical_editor')),
        'date': ('textList', _wl_dc_xpath('dc:date')),
        'type': ('textList', _wl_dc_xpath('dc:type')),
        'format': ('textList', _wl_dc_xpath('dc:format')),
        'identifier': ('textList', _wl_dc_xpath('dc:identifier.url')),
        'source': ('textList', _wl_dc_xpath('dc:source')),
        'language': ('textList', _wl_dc_xpath('dc:language')),
        'hasPart': ('textList', _wl_dc_xpath('dc:relation.hasPart')),
        'rights': ('textList', _wl_dc_xpath('dc:rights')),
    },
    namespaces={
        'dc': 'http://purl.org/dc/elements/1.1/',
        'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    }
)
51
52
# XML namespace of the Dublin Core terms vocabulary.
NS_DCTERMS = "http://purl.org/dc/terms/"


def nsdcterms(name):
    """Return *name* qualified with the DC terms namespace as ``{namespace}name``."""
    parts = {'ns': NS_DCTERMS, 'name': name}
    return '{%(ns)s}%(name)s' % parts
58
59
class Catalogue(common.ResumptionOAIPMH):
    """OAI-PMH provider serving books and tombstones of deleted books.

    Live records come from ``Book``; deleted records come from ``Deleted``
    rows with a non-empty slug. Sets and resumption tokens are not
    supported.
    """
    TAG_CATEGORIES = ['author', 'epoch', 'kind', 'genre']

    def __init__(self, metadata_registry):
        """Store the metadata registry and compute the earliest datestamp.

        Queries the current ``Site`` and the ``Book`` / ``Deleted`` tables,
        so a working database connection is required.
        """
        super(Catalogue, self).__init__()
        self.metadata_registry = metadata_registry

        # Identifier template: "oai:<site domain>:<slug>".
        self.oai_id = "oai:" + Site.objects.get_current().domain + ":%s"

        # Fallback datestamp used when a table has no rows at all.
        year_zero = timezone.make_aware(datetime(1990, 1, 1, 0, 0, 0), timezone.utc)

        try:
            earliest_change = Book.objects.order_by('changed_at')[0].changed_at
        except IndexError:  # no books yet
            earliest_change = year_zero

        try:
            earliest_delete = Deleted.objects.exclude(
                slug__exact=u'').order_by('deleted_at')[0].deleted_at
        except IndexError:  # no deletions recorded yet
            earliest_delete = year_zero

        self.earliest_datestamp = min(earliest_change, earliest_delete)

    def metadata(self, book):
        """Return the Dublin Core field map read from the book's XML file.

        Adds an ``isPartOf`` entry pointing at the parent when the book has
        one. The XML file is always closed, even if parsing fails.
        """
        try:
            xml = etree.parse(book.xml_file)
        finally:
            book.xml_file.close()
        fields = wl_dc_reader(xml).getMap()
        if book.parent:
            fields['isPartOf'] = [str(WLURI.from_slug(book.parent.slug))]
        return fields

    def record_for_book(self, book, headers_only=False):
        """Build the OAI record for a ``Book`` or ``Deleted`` instance.

        Returns only the header when ``headers_only`` is true, otherwise a
        ``(header, metadata, about)`` tuple where ``about`` is always None.
        Deleted entries get an empty metadata map and a header flagged as
        deleted.

        Raises ``TypeError`` for any other kind of object.
        """
        identifier = self.slug_to_identifier(book.slug)
        if isinstance(book, Book):
            datestamp, is_deleted = book.changed_at, False
        elif isinstance(book, Deleted):
            datestamp, is_deleted = book.deleted_at, True
        else:
            raise TypeError(
                "Expected a Book or Deleted instance, got %r" % (type(book),))

        # The set list is left empty because sets are not supported.
        header = common.Header(
            identifier, make_time_naive(datestamp), [], is_deleted)
        if headers_only:
            return header

        if is_deleted:
            meta = common.Metadata({})
        else:
            meta = common.Metadata(self.metadata(book))
        return header, meta, None

    def identify(self, **kw):
        """Return the repository description for the Identify verb."""
        return common.Identify(
            'Wolne Lektury',  # repository name
            '%s/oaipmh' % WL_BASE,  # base URL
            '2.0',  # version
            [m[1] for m in settings.MANAGERS],  # adminEmails
            make_time_naive(self.earliest_datestamp),  # earliest datestamp of any change
            'persistent',  # deletedRecord
            'YYYY-MM-DDThh:mm:ssZ',  # granularity
            ['identity'],  # compression
            []  # descriptions
        )

    def books(self, tag, from_, until):
        """Return books followed by deleted entries within the date range.

        ``from_`` and ``until`` are inclusive bounds and are skipped when
        falsy. Books are ordered by ``changed_at`` and deleted entries by
        ``deleted_at``; the two lists are concatenated, not merged.

        Raises ``NoSetHierarchyError`` when ``tag`` is given: sets are not
        supported because they are problematic for deleted books.
        """
        if tag:
            raise error.NoSetHierarchyError("Wolne Lektury does not support sets.")

        books = Book.objects.order_by('changed_at')
        deleted = Deleted.objects.exclude(slug__exact=u'').order_by('deleted_at')
        if from_:
            books = books.filter(changed_at__gte=from_)
            deleted = deleted.filter(deleted_at__gte=from_)
        if until:
            books = books.filter(changed_at__lte=until)
            deleted = deleted.filter(deleted_at__lte=until)
        return list(books) + list(deleted)

    @staticmethod
    def tag_to_setspec(tag):
        """Return the ``category:slug`` set spec for a tag."""
        return "%s:%s" % (tag.category, tag.slug)

    @staticmethod
    def setspec_to_tag(s):
        """Return the ``Tag`` named by a ``category:slug`` set spec.

        Returns None for an empty spec. Raises ``NoSetHierarchyError`` when
        the spec does not have exactly two components or the category is
        not one of ``TAG_CATEGORIES``.
        """
        if not s:
            return None
        parts = s.split(':')
        if len(parts) != 2:
            raise error.NoSetHierarchyError("Setspec should have two components: category:slug")
        category, slug = parts
        if category not in Catalogue.TAG_CATEGORIES:
            raise error.NoSetHierarchyError("No category part in set")
        return Tag.objects.get(slug=slug, category=category)

    def getRecord(self, **kw):
        """Return ``(header, metadata, about)`` for the requested identifier.

        Looks for a live book first, then for a deleted book with the same
        slug. Raises ``IdDoesNotExistError`` when neither lookup yields
        exactly one entry.
        """
        slug = self.identifier_to_slug(kw['identifier'])
        try:
            book = Book.objects.get(slug=slug)
        except Book.DoesNotExist:
            pass
        else:
            return self.record_for_book(book)

        book_type = ContentType.objects.get_for_model(Book)
        try:
            deleted_book = Deleted.objects.get(content_type=book_type,
                                               slug=slug)
        except (Deleted.DoesNotExist, Deleted.MultipleObjectsReturned):
            # An ambiguous slug is reported the same way as a missing one.
            raise error.IdDoesNotExistError("No item for this identifier")
        return self.record_for_book(deleted_book)

    def validate_kw(self, kw):
        """Reject resumption tokens and unsupported metadata prefixes."""
        if 'resumptionToken' in kw:
            raise error.BadResumptionTokenError("No resumption token support at this point")
        if 'metadataPrefix' in kw and not self.metadata_registry.hasWriter(kw['metadataPrefix']):
            raise error.CannotDisseminateFormatError("This format is not supported")

    def identifier_to_slug(self, ident):
        """Return the part of the identifier after the last colon."""
        return ident.split(':')[-1]

    def slug_to_identifier(self, slug):
        """Return the OAI identifier for a slug."""
        return self.oai_id % slug

    def listIdentifiers(self, **kw):
        """Return ``(headers, None)`` for all records in the date range.

        The second element is the resumption token, which is always None.
        """
        self.validate_kw(kw)
        # NOTE(review): a 'set' argument in kw is ignored here -- confirm
        # whether it should be rejected instead.
        matching = self.books(None, kw.get('from_'), kw.get('until'))
        headers = [self.record_for_book(book, headers_only=True)
                   for book in matching]
        return headers, None

    def listRecords(self, **kw):
        """Return ``(records, None)`` for all records in the date range.

        Each record is a ``(header, metadata, about)`` tuple. The second
        element is the resumption token, which is always None.
        """
        self.validate_kw(kw)
        # NOTE(review): a 'set' argument in kw is ignored here -- confirm
        # whether it should be rejected instead.
        matching = self.books(None, kw.get('from_'), kw.get('until'))
        records = [self.record_for_book(book) for book in matching]
        return records, None

    def listMetadataFormats(self, **kw):
        """Return the metadata formats, optionally for one identifier.

        Without an identifier, or for an existing book, both formats are
        returned. A deleted entry yields an empty list. An unknown
        identifier raises ``IdDoesNotExistError``.
        """
        formats = [
            ('oai_dc',
             'http://www.openarchives.org/OAI/2.0/oai_dc.xsd',
             server.NS_OAIDC),
            ('qdc',
             'http://dublincore.org/schemas/xmls/qdc/2006/01/06/dcterms.xsd',
             NS_DCTERMS)]
        if 'identifier' not in kw:
            return formats

        slug = self.identifier_to_slug(kw['identifier'])
        if Book.objects.filter(slug=slug).exists():
            return formats
        # NOTE(review): unlike getRecord, this lookup does not filter by
        # content type -- confirm whether that is intended.
        if Deleted.objects.filter(slug=slug).exists():
            return []
        raise error.IdDoesNotExistError("This id does not exist")

    def listSets(self, **kw):
        """Always raise ``NoSetHierarchyError``: sets are not supported."""
        raise error.NoSetHierarchyError("Wolne Lektury does not support sets.")