bcad60189e91a381c14a01cf4eaa440fee494506
[wolnelektury.git] / src / lesmianator / management / commands / lesmianator.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 import re
5 from pickle import dump
6
7 from django.core.management.base import BaseCommand
8 from django.core.management.color import color_style
9 from django.conf import settings
10
11 from catalogue.models import Book, Tag
12
# Locate the body of a book's TXT export: the text that follows a run of
# three or more newlines and precedes the "-----" separator line.
# Group 1 holds the body; newlines directly before the separator are left out.
re_text = re.compile(
    r'\n{3,}(.*?)\n*-----\n',
    flags=re.DOTALL,  # the body spans many lines, so "." must match newlines
).search
15
16
class Command(BaseCommand):
    """Build the letter-frequency table used by Leśmianator and pickle it.

    For every selected book with a TXT file, the body of the text is
    extracted with ``re_text``, lower-cased, and scanned letter by letter.
    The result is a nested dict mapping a context (up to three preceding
    characters) to ``{next_letter: occurrence_count}``, which is pickled to
    ``settings.LESMIANATOR_PICKLE``.
    """
    help = 'Prepare data for Leśmianator.'

    def add_arguments(self, parser):
        """Register the book-selection options (comma-separated slug lists)."""
        parser.add_argument(
            '-t', '--tags', dest='tags', metavar='SLUG,...',
            help='Use only books tagged with this tags')
        parser.add_argument(
            '-i', '--include', dest='include', metavar='SLUG,...',
            help='Include specific books by slug')
        parser.add_argument(
            '-e', '--exclude', dest='exclude', metavar='SLUG,...',
            help='Exclude specific books by slug')

    @staticmethod
    def _select_books(include, tags, exclude):
        """Return the set of books chosen by the include/tags/exclude options.

        Each argument is a comma-separated string of slugs, or a falsy value
        when the option was not given. With neither ``include`` nor ``tags``
        every book is selected; ``exclude`` is applied last.
        """
        books = []

        if include:
            books += list(
                Book.objects.filter(slug__in=include.split(','))
                .only('slug', 'txt_file'))

        if tags:
            wanted_tags = Tag.objects.filter(slug__in=tags.split(','))
            books += list(
                Book.tagged.with_all(wanted_tags).only('slug', 'txt_file'))
        elif not include:
            books = list(Book.objects.all().only('slug', 'txt_file'))

        if exclude:
            # Split once and use a set so the membership test is O(1) per book.
            excluded = set(exclude.split(','))
            books = [book for book in books if book.slug not in excluded]

        # A book may be matched by both --include and --tags; de-duplicate.
        return set(books)

    @staticmethod
    def _count_letters(lesmianator, text):
        """Add the letter transitions found in ``text`` to ``lesmianator``.

        ``lesmianator`` is updated in place: for each letter, the counter
        stored under the preceding context (at most three characters, empty
        at the start of the text) is incremented.
        """
        last_word = ''
        for letter in text:
            followers = lesmianator.setdefault(last_word, {})
            followers[letter] = followers.get(letter, 0) + 1
            # Keep the last two context characters plus the current letter.
            last_word = last_word[-2:] + letter

    def handle(self, *args, **options):
        """Collect statistics from the selected books and write the pickle.

        Reads the ``verbosity``, ``tags``, ``include`` and ``exclude``
        options. Problems (missing setting, no usable books, unwritable
        output file) are reported on stdout and end the command early.
        """
        self.style = color_style()
        verbose = int(options.get('verbosity'))
        tags = options.get('tags')
        include = options.get('include')
        exclude = options.get('exclude')

        try:
            path = settings.LESMIANATOR_PICKLE
        except AttributeError:
            print(self.style.ERROR('LESMIANATOR_PICKLE not set in the settings.'))
            return

        books = self._select_books(include, tags, exclude)

        lesmianator = {}
        processed = skipped = 0
        for book in books:
            if verbose >= 2:
                print('Parsing', book.slug)
            if not book.txt_file:
                if verbose >= 1:
                    print(self.style.NOTICE('%s has no TXT file' % book.slug))
                skipped += 1
                continue

            # Decode explicitly as UTF-8 instead of relying on the locale's
            # default; the context manager closes the file on every path,
            # including the "unknown format" skip below.
            with open(book.txt_file.path, encoding='utf-8') as f:
                m = re_text(f.read())
            if not m:
                print(self.style.ERROR("Unknown text format: %s" % book.slug))
                skipped += 1
                continue

            processed += 1
            # The file was read in text mode, so the match is already ``str``.
            self._count_letters(lesmianator, m.group(1).lower())

        if not processed:
            if skipped:
                print(self.style.ERROR("No books with TXT files found"))
            else:
                print(self.style.ERROR("No books found"))
            return

        try:
            # pickle emits bytes, so the target must be opened in binary mode.
            with open(path, 'wb') as f:
                dump(lesmianator, f)
        except IOError:
            print(self.style.ERROR("Couldn't write to %s" % path))
            return

        if verbose >= 1:
            print("%d processed, %d skipped" % (processed, skipped))
            print("Results dumped to %s" % path)