X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/b05172d6b6a148ec45867fb3efd5911471279398..d91551345b68c2bc7d96f2098691fab28276d6b8:/apps/lesmianator/management/commands/lesmianator.py diff --git a/apps/lesmianator/management/commands/lesmianator.py b/apps/lesmianator/management/commands/lesmianator.py index 36d71445f..c0219214d 100644 --- a/apps/lesmianator/management/commands/lesmianator.py +++ b/apps/lesmianator/management/commands/lesmianator.py @@ -2,6 +2,7 @@ # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +import re import sys from cPickle import load, dump from optparse import make_option @@ -12,6 +13,9 @@ from django.conf import settings from catalogue.models import Book, Tag +# extract text from text file +re_text = re_text = re.compile(r'\n{3,}(.*?)\n*-----\n', re.S).search + class Command(BaseCommand): option_list = BaseCommand.option_list + ( @@ -62,13 +66,17 @@ class Command(BaseCommand): print self.style.NOTICE('%s has no TXT file' % book.slug) skipped += 1 continue - processed += 1 - last_word = '' - for number, line in enumerate(book.txt_file): - if number < 17: + with open(book.txt_file.path) as f: + m = re_text(f.read()) + if not m: + print self.style.ERROR("Unknown text format: %s" % book.slug) + skipped += 1 continue - line = unicode(line, 'utf-8').lower() - for letter in line: + + processed += 1 + last_word = '' + text = unicode(m.group(1), 'utf-8').lower() + for letter in text: mydict = lesmianator.setdefault(last_word, {}) myval = mydict.setdefault(letter, 0) mydict[letter] += 1 @@ -84,10 +92,10 @@ class Command(BaseCommand): try: dump(lesmianator, open(path, 'w')) except: - print self.style.ERROR("Counldn't write to $s" % path) + print self.style.ERROR("Couldn't write to $s" % path) return dump(lesmianator, open(path, 'w')) if verbose >= 1: print "%d processed, %d skipped" % (processed, skipped) - print "Results dumped do %s" % path + print "Results dumped to %s" % path