36d71445ff7330a2b57a5e18009fd7890ffa0be0
[wolnelektury.git] / apps / lesmianator / management / commands / lesmianator.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 import sys
6 from cPickle import load, dump
7 from optparse import make_option
8
9 from django.core.management.base import BaseCommand
10 from django.core.management.color import color_style
11 from django.conf import settings
12
13 from catalogue.models import Book, Tag
14
15
class Command(BaseCommand):
    """Build the letter-frequency table used by Lesmianator and pickle it.

    The table is a plain dict mapping a context string (the up to three
    characters that precede a letter) to a dict of ``{letter: count}``.
    It is gathered from the TXT files of the selected books and written
    to the path given by ``settings.LESMIANATOR_PICKLE``.
    """

    option_list = BaseCommand.option_list + (
        make_option('-t', '--tags', dest='tags', metavar='SLUG,...',
            help='Use only books tagged with this tags'),
        make_option('-i', '--include', dest='include', metavar='SLUG,...',
            help='Include specific books by slug'),
        make_option('-e', '--exclude', dest='exclude', metavar='SLUG,...',
            help='Exclude specific books by slug')
    )
    help = 'Prepare data for Lesmianator.'

    # Number of leading lines of every TXT file that are left out of the
    # statistics (presumably a fixed-size header -- TODO confirm against
    # whatever generates the TXT files).
    SKIPPED_LINES = 17

    def handle(self, *args, **options):
        """Collect the statistics and dump them to the configured pickle.

        Recognised options: ``verbosity`` plus comma-separated slug lists
        in ``tags``, ``include`` and ``exclude``. Problems are reported by
        printing a styled message and returning, not by raising.
        """
        self.style = color_style()
        verbosity = options.get('verbosity')
        # Fall back to 1 instead of crashing on int(None) when the caller
        # did not supply a verbosity at all.
        verbose = 1 if verbosity is None else int(verbosity)
        tags = options.get('tags')
        include = options.get('include')
        exclude = options.get('exclude')

        try:
            path = settings.LESMIANATOR_PICKLE
        except AttributeError:
            print(self.style.ERROR('LESMIANATOR_PICKLE not set in the settings.'))
            return

        books = self._select_books(tags, include, exclude)

        lesmianator = {}
        processed = skipped = 0
        for book in books:
            if verbose >= 2:
                print('Parsing %s' % book.slug)
            if not book.txt_file:
                if verbose >= 1:
                    print(self.style.NOTICE('%s has no TXT file' % book.slug))
                skipped += 1
                continue
            processed += 1
            self._count_letters(book, lesmianator)

        if not processed:
            if skipped:
                print(self.style.ERROR("No books with TXT files found"))
            else:
                print(self.style.ERROR("No books found"))
            return

        # Write exactly once, in binary mode, and make sure the file gets
        # closed even when pickling fails half-way.
        try:
            with open(path, 'wb') as pickle_file:
                dump(lesmianator, pickle_file)
        except (IOError, OSError):
            print(self.style.ERROR("Couldn't write to %s" % path))
            return

        if verbose >= 1:
            print("%d processed, %d skipped" % (processed, skipped))
            print("Results dumped do %s" % path)

    def _select_books(self, tags, include, exclude):
        """Return the set of books chosen by the slug-list options.

        ``include`` adds books by slug, ``tags`` adds books carrying all of
        the given tags; when neither is given every book is used.
        ``exclude`` then removes books by slug. Each argument is a
        comma-separated string or a false value.
        """
        fields = ('slug', 'txt_file')
        books = []

        if include:
            books += list(
                Book.objects.filter(slug__in=include.split(',')).only(*fields))

        if tags:
            wanted_tags = Tag.objects.filter(slug__in=tags.split(','))
            books += list(Book.tagged.with_all(wanted_tags).only(*fields))
        elif not include:
            books = list(Book.objects.all().only(*fields))

        if exclude:
            # Split once and use a set so the filter is a single cheap pass.
            excluded = set(exclude.split(','))
            books = [book for book in books if book.slug not in excluded]

        # A book may have been picked by both --include and --tags.
        return set(books)

    def _count_letters(self, book, lesmianator):
        """Add the letter statistics of one book's TXT file to ``lesmianator``.

        ``lesmianator`` is updated in place. The context starts empty for
        every book and carries over from one line to the next.
        """
        context = ''
        for number, line in enumerate(book.txt_file):
            if number < self.SKIPPED_LINES:
                continue
            for letter in unicode(line, 'utf-8').lower():
                counts = lesmianator.setdefault(context, {})
                counts[letter] = counts.get(letter, 0) + 1
                # Keep only the last three characters as the next context.
                context = context[-2:] + letter