src/lesmianator/management/commands/lesmianator.py

   1 # -*- coding: utf-8 -*-
   2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   4 #
   5 import re
   6 from cPickle import dump
   7 from optparse import make_option
   8
   9 from django.core.management.base import BaseCommand
  10 from django.core.management.color import color_style
  11 from django.conf import settings
  12
  13 from catalogue.models import Book, Tag
  14
  15 # extract text from text file
  16 re_text = re.compile(r'\n{3,}(.*?)\n*-----\n', re.S).search
  17
  18
  19 class Command(BaseCommand):
  20     option_list = BaseCommand.option_list + (
  21         make_option('-t', '--tags', dest='tags', metavar='SLUG,...',
  22                     help='Use only books tagged with this tags'),
  23         make_option('-i', '--include', dest='include', metavar='SLUG,...',
  24                     help='Include specific books by slug'),
  25         make_option('-e', '--exclude', dest='exclude', metavar='SLUG,...',
  26                     help='Exclude specific books by slug')
  27     )
  28     help = 'Prepare data for Lesmianator.'
  29
  30     def handle(self, *args, **options):
  31         self.style = color_style()
  32         verbose = int(options.get('verbosity'))
  33         tags = options.get('tags')
  34         include = options.get('include')
  35         exclude = options.get('exclude')
  36
  37         try:
  38             path = settings.LESMIANATOR_PICKLE
  39         except AttributeError:
  40             print self.style.ERROR('LESMIANATOR_PICKLE not set in the settings.')
  41             return
  42
  43         books = []
  44
  45         if include:
  46             books += list(Book.objects.filter(slug__in=include.split(',')).only('slug', 'txt_file'))
  47
  48         if tags:
  49             books += list(Book.tagged.with_all(Tag.objects.filter(slug__in=tags.split(','))).only('slug', 'txt_file'))
  50         elif not include:
  51             books = list(Book.objects.all().only('slug', 'txt_file'))
  52
  53         if exclude:
  54             books = [book for book in books if book.slug not in exclude.split(',')]
  55
  56         books = set(books)
  57
  58         lesmianator = {}
  59         processed = skipped = 0
  60         for book in books:
  61             if verbose >= 2:
  62                 print 'Parsing', book.slug
  63             if not book.txt_file:
  64                 if verbose >= 1:
  65                     print self.style.NOTICE('%s has no TXT file' % book.slug)
  66                 skipped += 1
  67                 continue
  68             f = open(book.txt_file.path)
  69             m = re_text(f.read())
  70             if not m:
  71                 print self.style.ERROR("Unknown text format: %s" % book.slug)
  72                 skipped += 1
  73                 continue
  74
  75             processed += 1
  76             last_word = ''
  77             text = unicode(m.group(1), 'utf-8').lower()
  78             for letter in text:
  79                 mydict = lesmianator.setdefault(last_word, {})
  80                 mydict.setdefault(letter, 0)
  81                 mydict[letter] += 1
  82                 last_word = last_word[-2:] + letter
  83             f.close()
  84
  85         if not processed:
  86             if skipped:
  87                 print self.style.ERROR("No books with TXT files found")
  88             else:
  89                 print self.style.ERROR("No books found")
  90             return
  91
  92         try:
  93             dump(lesmianator, open(path, 'w'))
  94         except IOError:
  95             print self.style.ERROR("Couldn't write to $s" % path)
  96             return
  97
  98         dump(lesmianator, open(path, 'w'))
  99         if verbose >= 1:
 100             print "%d processed, %d skipped" % (processed, skipped)
 101             print "Results dumped to %s" % path