ignore abstrakt tags when indexing book
[wolnelektury.git] / src / catalogue / helpers.py
index 7ca2cbd..4dd7bda 100644 (file)
 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
+from django.conf import settings
 from django.contrib.contenttypes.models import ContentType
 from django.contrib.contenttypes.models import ContentType
-from django.db.models import Count
+from django.core.cache import cache
+
 from .models import Tag, Book
 from .models import Tag, Book
+from os.path import getmtime
+import cPickle
+from collections import defaultdict
 
 
 BOOK_CATEGORIES = ('author', 'epoch', 'genre', 'kind')
 
 
 
 BOOK_CATEGORIES = ('author', 'epoch', 'genre', 'kind')
 
+_COUNTERS = None
+_COUNTER_TIME = None
+
 
 
-def get_top_level_related_tags(tags=None, categories=BOOK_CATEGORIES):
+def get_top_level_related_tags(tags, categories=None):
     """
     Finds tags related to given tags through books, and counts their usage.
 
     Takes ancestry into account: if a tag is applied to a book, its
     usage on the book's descendants is ignored.
     """
     Finds tags related to given tags through books, and counts their usage.
 
     Takes ancestry into account: if a tag is applied to a book, its
     usage on the book's descendants is ignored.
+    """
+    global _COUNTERS, _COUNTER_TIME
+    # First, check that we have a valid and recent version of the counters.
+    if getmtime(settings.CATALOGUE_COUNTERS_FILE) > _COUNTER_TIME:
+        for i in xrange(10):
+            try:
+                with open(settings.CATALOGUE_COUNTERS_FILE) as f:
+                    _COUNTERS = cPickle.load(f)
+            except (EOFError, ValueError):
+                if i < 9:
+                    continue
+                else:
+                    raise
+            else:
+                break
 
 
-    This is tested for PostgreSQL 9.1+, and might not work elsewhere.
-    It particular, it uses raw SQL using WITH clause, which is
-    supported in SQLite from v. 3.8.3, and is missing in MySQL.
-    http://bugs.mysql.com/bug.php?id=16244
+    tagids = tuple(sorted(t.pk for t in tags))
+    try:
+        related_ids = _COUNTERS['next'][tagids]
+    except KeyError:
+        return
 
 
-    """
-    # First, find all tag relations of relevant books.
-    bct = ContentType.objects.get_for_model(Book)
-    relations = Tag.intermediary_table_model.objects.filter(
-        content_type=bct)
-    if tags is not None:
-        tagged_books = Book.tagged.with_all(tags).only('pk')
-        relations = relations.filter(
-            object_id__in=tagged_books).exclude(
-            tag_id__in=[tag.pk for tag in tags])
-
-    rel_sql, rel_params = relations.query.sql_with_params()
-
-    # Exclude those relations between a book and a tag,
-    # for which there is a relation between the book's ancestor
-    # and the tag and 
-
-    return Tag.objects.raw('''
-        WITH AllTagged AS (''' + rel_sql + ''')
-        SELECT catalogue_tag.*, COUNT(catalogue_tag.id) AS count
-        FROM catalogue_tag, AllTagged
-        WHERE catalogue_tag.id=AllTagged.tag_id
-            AND catalogue_tag.category IN %s
-            AND NOT EXISTS (
-                SELECT AncestorTagged.id
-                FROM catalogue_book_ancestor Ancestor,
-                    AllTagged AncestorTagged
-                WHERE Ancestor.from_book_id=AllTagged.object_id
-                    AND AncestorTagged.content_type_id=%s
-                    AND AncestorTagged.object_id=Ancestor.to_book_id
-                    AND AncestorTagged.tag_id=AllTagged.tag_id
-            )
-        GROUP BY catalogue_tag.id
-        ORDER BY sort_key''', rel_params + (categories, bct.pk))
+    related = Tag.objects.filter(pk__in=related_ids)
+
+    if categories is not None:
+        related = related.filter(category__in=categories)
+
+    for tag in related:
+        tag.count = _COUNTERS['count'][tuple(sorted(tagids + (tag.pk,)))]
+        yield tag
+
+
+def update_counters():
+    def combinations(things):
+        if len(things):
+            for c in combinations(things[1:]):
+                yield c
+                yield (things[0],) + c
+        else:
+            yield ()
+
+    def count_for_book(book, count_by_combination=None, parent_combinations=None):
+        if not parent_combinations:
+            parent_combinations = set()
+        tags = sorted(book.tags.filter(category__in=('author', 'genre', 'epoch', 'kind')).values_list('pk', flat=True))
+        combs = list(combinations(tags))
+        for c in combs:
+            if c not in parent_combinations:
+                count_by_combination[c] += 1
+        combs_for_child = set(list(parent_combinations) + combs)
+        for child in book.children.all():
+            count_for_book(child, count_by_combination, combs_for_child)
+
+    count_by_combination = defaultdict(lambda: 0)
+    for b in Book.objects.filter(parent=None):
+        count_for_book(b, count_by_combination)
+
+    next_combinations = defaultdict(set)
+    # Now build an index of all combinations.
+    for c in count_by_combination.keys():
+        if not c:
+            continue
+        for n in c:
+            rest = tuple(x for x in c if x != n)
+            next_combinations[rest].add(n)
+
+    counters = {
+        "count": dict(count_by_combination),
+        "next": dict(next_combinations),
+    }
+
+    with open(settings.CATALOGUE_COUNTERS_FILE, 'w') as f:
+        cPickle.dump(counters, f)
+
+
+def get_audiobook_tags():
+    audiobook_tag_ids = cache.get('audiobook_tags')
+    if audiobook_tag_ids is None:
+        books_with_audiobook = Book.objects.filter(media__type__in=('mp3', 'ogg'))\
+            .distinct().values_list('pk', flat=True)
+        audiobook_tag_ids = Tag.objects.filter(
+            items__content_type=ContentType.objects.get_for_model(Book),
+            items__object_id__in=list(books_with_audiobook)).distinct().values_list('pk', flat=True)
+        audiobook_tag_ids = list(audiobook_tag_ids)
+        cache.set('audiobook_tags', audiobook_tag_ids)
+    return audiobook_tag_ids