Wikidata in catalogue.
[redakcja.git] / src / documents / management / __init__.py
1 # This file is part of FNP-Redakcja, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from collections import defaultdict
5 from django.db import transaction
6 from lxml import etree
7
8
class XmlUpdater(object):
    """A base class for massive XML updates.

    In a subclass, override `fix_tree` and/or register per-element fixers
    with the `fixes_elements` decorator.

    Attributes:
    * commit_desc: commits description
    * retain_publishable: set publishable if head is (default: True)
    * only_first_chunk: process only first chunks of books (default: False)
    """
    commit_desc = "auto-update"
    retain_publishable = True
    only_first_chunk = False

    # Fixers registered directly on this class, keyed by XPath expression.
    # Each class gets its own mapping on first registration (see
    # `fixes_elements`), so fixers do not leak between sibling subclasses.
    _element_fixers = defaultdict(list)

    def __init__(self):
        # Named statistics of the run, printed by `print_results`.
        self.counters = defaultdict(int)

    @classmethod
    def fixes_elements(cls, xpath):
        """Decorator, registering a function as a fixer for matching elements.

        The fixer is registered on the class the decorator is called on and
        is used by that class and by its subclasses only.

        Any decorated function will be called like
            f(element, change=..., verbose=...)
        providing changeset as context, and should return True if it
        changed anything.

        :param xpath: element lookup, e.g. ".//{namespace-uri}tag-name"
        :returns: a decorator which registers the function and returns it
            unchanged
        """
        def wrapper(fixer):
            # Check the class's own namespace: plain attribute access would
            # fall back to a parent's mapping and mutate shared state.
            if '_element_fixers' not in vars(cls):
                cls._element_fixers = defaultdict(list)
            cls._element_fixers[xpath].append(fixer)
            return fixer
        return wrapper

    @classmethod
    def _collect_fixers(cls):
        """Returns fixers of this class and its ancestors, keyed by XPath.

        Ancestors' fixers come first, so they run before the more specific
        ones registered on subclasses.
        """
        merged = defaultdict(list)
        for klass in reversed(cls.__mro__):
            own = vars(klass).get('_element_fixers')
            if not own:
                continue
            for xpath, fixers in own.items():
                merged[xpath].extend(fixers)
        return merged

    def fix_tree(self, tree, verbose):
        """Override to provide general tree-fixing mechanism.

        :param tree: the parsed XML tree
        :param verbose: verbosity level
        :returns: True if anything changed
        """
        return False

    def fix_chunk(self, chunk, user, verbose=0, dry_run=False):
        """Runs the update for a single chunk.

        Parses the chunk's head revision, applies `fix_tree` and all the
        registered element fixers, and commits the result if anything
        changed. Updates `self.counters` accordingly.

        :param chunk: the chunk to update
        :param user: author of the commit
        :param verbose: verbosity level
        :param dry_run: if True, nothing is committed
        """
        if verbose >= 2:
            print(chunk.get_absolute_url())
        old_head = chunk.head
        src = old_head.materialize()
        try:
            tree = etree.fromstring(src)
        except (etree.XMLSyntaxError, ValueError):
            # Only parsing problems count as bad XML: malformed markup
            # (XMLSyntaxError) or input lxml refuses to parse (ValueError).
            # Anything else, e.g. KeyboardInterrupt, must propagate.
            if verbose:
                print("%s: invalid XML" % chunk.get_absolute_url())
            self.counters['Bad XML'] += 1
            return

        dirty = False
        # Call the general fixing function.
        if self.fix_tree(tree, verbose=verbose):
            dirty = True
        # Call the registered fixers. Every fixer runs on every matching
        # element, even if the tree is already known to be dirty.
        for xpath, fixers in self._collect_fixers().items():
            for elem in tree.findall(xpath):
                for fixer in fixers:
                    if fixer(elem, change=old_head, verbose=verbose):
                        dirty = True

        if not dirty:
            self.counters['Clean'] += 1
            return

        if not dry_run:
            new_head = chunk.commit(
                etree.tostring(tree, encoding='unicode'),
                author=user,
                description=self.commit_desc
            )
            if self.retain_publishable and old_head.publishable:
                new_head.set_publishable(True)
        if verbose >= 2:
            print("done")
        # Counted in dry runs too: it tells how many chunks would change.
        self.counters['Updated chunks'] += 1

    def run(self, user, verbose=0, dry_run=False, books=None):
        """Runs the actual update.

        :param user: author of the commits
        :param verbose: verbosity level
        :param dry_run: if True, nothing is committed
        :param books: iterable of books to process (default: all books)
        """
        if books is None:
            # Imported here, not at module level, as in the original code.
            from documents.models import Book
            books = Book.objects.all()

        # The whole update runs in a single transaction.
        with transaction.atomic():
            for book in books:
                self.counters['All books'] += 1
                chunks = book.chunk_set.all()
                if self.only_first_chunk:
                    chunks = chunks[:1]
                for chunk in chunks:
                    self.counters['All chunks'] += 1
                    self.fix_chunk(chunk, user, verbose, dry_run)

    def print_results(self):
        """Prints the counters, sorted by name."""
        for item in sorted(self.counters.items()):
            print("%s: %d" % item)
113         for item in sorted(self.counters.items()):
114             print("%s: %d" % item)