Fix XML entities left from MathML.
[librarian.git] / scripts / genslugs
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
5 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 #
7 from __future__ import print_function, unicode_literals
8
9 import os
10 import optparse
11
12 from lxml import etree
13 from librarian import html
14 from slughifi import slughifi
15
16
# Base URL to which a generated slug is appended to form a book's identifier URL.
BOOK_URL = 'http://wolnelektury.pl/katalog/lektura/'


if __name__ == '__main__':
    # Script entry point: for every SOURCE file given on the command line,
    # read the Dublin Core title (and the optional parent relation), build a
    # slug from them and store BOOK_URL + slug in the identifier.url element,
    # rewriting the file in place.

    # Namespace prefixes (Clark notation) of the metadata elements used below.
    dc = '{http://purl.org/dc/elements/1.1/}'
    rdf = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'

    # Parse commandline arguments
    usage = """Usage: %prog [options] SOURCE [SOURCE...]
    Generate slugs for SOURCE."""

    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-v', '--verbose', action='store_true', dest='verbose', default=False,
        help='print status messages to stdout')
    parser.add_option(
        '-f', '--force', action='store_true', dest='force', default=False,
        help='overwrite current identifiers')

    options, input_filenames = parser.parse_args()

    if not input_filenames:
        parser.print_help()
        # Raise SystemExit directly: the bare `exit` builtin is only injected
        # by the `site` module and is not guaranteed to be present.
        raise SystemExit(1)

    # Do some real work
    for input_filename in input_filenames:
        if options.verbose:
            print(input_filename)

        doc = etree.parse(input_filename)

        # A missing title element and an empty one are both unusable for
        # building a slug, so they are reported the same way.
        title_element = doc.find('//' + dc + 'title')
        title = title_element.text if title_element is not None else None
        if title is None:
            print('%s:error:Book title not found. Skipping.' % input_filename)
            continue

        # The parent relation is optional; when present, the last path
        # segment of its URL is prepended to the title before slugifying.
        parent = ''
        parent_element = doc.find('//' + dc + 'relation.isPartOf')
        parent_url = parent_element.text if parent_element is not None else None
        if parent_url is not None:
            try:
                parent = parent_url.rsplit('/', 1)[1] + ' '
            except IndexError:
                # No '/' in the value at all, so there is no segment to take.
                print('%s:error:Invalid parent URL "%s". Skipping.' % (input_filename, parent_url))
                # Actually skip the file, as the message promises.
                continue

        book_url = doc.find('//' + dc + 'identifier.url')
        if book_url is None:
            # No identifier element yet: create it under rdf:Description.
            book_description = doc.find('//' + rdf + 'Description')
            if book_description is None:
                print('%s:error:RDF description not found. Skipping.' % input_filename)
                continue
            book_url = etree.SubElement(book_description, dc + 'identifier.url')

        # A freshly created or empty element has text None; treat it as
        # "no identifier" instead of failing on None.startswith.
        current_url = book_url.text or ''
        if not options.force and current_url.startswith('http://'):
            print('%s:Notice:Book already has identifier URL "%s". Skipping.' % (input_filename, current_url))
            continue

        # The slug part of the identifier is capped at 60 characters.
        book_url.text = BOOK_URL + slughifi(parent + title)[:60]

        doc.write(input_filename, xml_declaration=True, pretty_print=True, encoding='utf-8')
70