# -*- coding: utf-8 -*-
# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from collections import defaultdict
import itertools
from os.path import getmtime

try:
    import cPickle as pickle
except ImportError:  # Python 3 has no cPickle module; use plain pickle.
    import pickle

from django.conf import settings

from .models import Tag, Book


BOOK_CATEGORIES = ('author', 'epoch', 'genre', 'kind')

# In-process cache of the counters file written by update_counters(),
# together with the file's mtime at the moment it was loaded.
_COUNTERS = None
_COUNTER_TIME = None


def _get_counters():
    """
    Returns the counters dict, (re)loading it from
    settings.CATALOGUE_COUNTERS_FILE when the file is newer than the cache.

    Raises OSError if the counters file does not exist.
    """
    global _COUNTERS, _COUNTER_TIME
    # Read the mtime *before* loading: if the file is rewritten while we
    # read it, the next call will see a newer mtime and reload.
    mtime = getmtime(settings.CATALOGUE_COUNTERS_FILE)
    if _COUNTER_TIME is None or mtime > _COUNTER_TIME:
        # NOTE: unpickling is only safe for trusted data. This file is
        # expected to be produced by update_counters() only.
        with open(settings.CATALOGUE_COUNTERS_FILE, 'rb') as f:
            counters = pickle.load(f)
        # Publish the data first, then the timestamp, so that a non-None
        # timestamp always implies that the data is available.
        _COUNTERS = counters
        _COUNTER_TIME = mtime
        return counters
    return _COUNTERS


def get_top_level_related_tags(tags, categories=None):
    """
    Finds tags related to given tags through books, and counts their usage.

    Takes ancestry into account: if a tag is applied to a book, its
    usage on the book's descendants is ignored.

    The answer is read from the precomputed counters file written by
    update_counters(), not computed from the database.

    `tags` is an iterable of objects with a `pk` attribute; None is
    treated as no tags. If `categories` is given, only related tags of
    those categories are returned.

    This is a generator: yields Tag objects with an additional `count`
    attribute. Yields nothing if the combination of `tags` is unknown.
    """
    # Keep one local reference, so that a reload triggered elsewhere
    # cannot mix two versions of the counters within a single call.
    counters = _get_counters()

    if tags is None:
        tags = ()
    # Keys in the counters are sorted tuples of distinct tag ids.
    tagids = tuple(sorted(set(t.pk for t in tags)))
    try:
        related_ids = counters['next'][tagids]
    except KeyError:
        return

    related = Tag.objects.filter(pk__in=related_ids)

    # TODO: do we really need that?
    if categories is not None:
        related = related.filter(category__in=categories)

    counts = counters['count']
    for tag in related:
        tag.count = counts[tuple(sorted(tagids + (tag.pk,)))]
        yield tag


def _tag_combinations(tag_ids):
    """
    Returns a list of all subsets of `tag_ids` (a sorted sequence),
    each as a sorted tuple, including the empty tuple.
    """
    return list(itertools.chain.from_iterable(
        itertools.combinations(tag_ids, size)
        for size in range(len(tag_ids) + 1)))


def _count_for_book(book, count_by_combination, parent_combinations=None):
    """
    Counts tag combinations of `book` and, recursively, of its children.

    A combination is counted for a book only if it has not been seen on
    any of the book's ancestors (`parent_combinations`).
    """
    if not parent_combinations:
        parent_combinations = set()
    tag_ids = sorted(set(
        t.pk for t in book.tags.filter(category__in=BOOK_CATEGORIES)))
    # NOTE: this is the full power set, i.e. 2**len(tag_ids) entries.
    combs = _tag_combinations(tag_ids)
    for comb in combs:
        if comb not in parent_combinations:
            count_by_combination[comb] += 1
    combs_for_child = set(parent_combinations)
    combs_for_child.update(combs)
    for child in book.children.all():
        _count_for_book(child, count_by_combination, combs_for_child)


def update_counters():
    """
    Recomputes usage counters of tag combinations and saves them
    to settings.CATALOGUE_COUNTERS_FILE.

    The pickled dict has two keys:
      "count": maps a sorted tuple of tag ids to the number of books
               counted for this combination (ancestry-aware),
      "next":  maps a sorted tuple of tag ids to a set of tag ids
               which can extend it to another known combination.
    """
    count_by_combination = defaultdict(int)
    for book in Book.objects.filter(parent=None):
        _count_for_book(book, count_by_combination)

    # Now build an index of all combinations.
    next_combinations = defaultdict(set)
    for comb in count_by_combination:
        # For the empty combination this loop simply does nothing.
        for tag_id in comb:
            rest = tuple(x for x in comb if x != tag_id)
            next_combinations[rest].add(tag_id)

    counters = {
        "count": dict(count_by_combination),
        "next": dict(next_combinations),
    }

    # Pickle data is binary: always use a binary-mode file.
    with open(settings.CATALOGUE_COUNTERS_FILE, 'wb') as f:
        pickle.dump(counters, f)