X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/d99d71844b7b97800dcab7cbb81c3a4185acbb48..60b06883b6d5a336ef47c01103ec1ce25aafae69:/apps/lesmianator/management/commands/lesmianator.py diff --git a/apps/lesmianator/management/commands/lesmianator.py b/apps/lesmianator/management/commands/lesmianator.py index 36d71445f..5412bee81 100644 --- a/apps/lesmianator/management/commands/lesmianator.py +++ b/apps/lesmianator/management/commands/lesmianator.py @@ -2,6 +2,7 @@ # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +import re import sys from cPickle import load, dump from optparse import make_option @@ -12,6 +13,9 @@ from django.conf import settings from catalogue.models import Book, Tag +# extract text from text file +re_text = re.compile(r'\n{3,}(.*?)\n*-----\n', re.S).search + class Command(BaseCommand): option_list = BaseCommand.option_list + ( @@ -62,17 +66,22 @@ class Command(BaseCommand): print self.style.NOTICE('%s has no TXT file' % book.slug) skipped += 1 continue + f = open(book.txt_file.path) + m = re_text(f.read()) + if not m: + print self.style.ERROR("Unknown text format: %s" % book.slug) + skipped += 1 + continue + processed += 1 last_word = '' - for number, line in enumerate(book.txt_file): - if number < 17: - continue - line = unicode(line, 'utf-8').lower() - for letter in line: - mydict = lesmianator.setdefault(last_word, {}) - myval = mydict.setdefault(letter, 0) - mydict[letter] += 1 - last_word = last_word[-2:] + letter + text = unicode(m.group(1), 'utf-8').lower() + for letter in text: + mydict = lesmianator.setdefault(last_word, {}) + myval = mydict.setdefault(letter, 0) + mydict[letter] += 1 + last_word = last_word[-2:] + letter + f.close() if not processed: if skipped: @@ -84,10 +93,10 @@ class Command(BaseCommand): try: dump(lesmianator, open(path, 'w')) except: - print self.style.ERROR("Counldn't write to $s" % path) + 
print self.style.ERROR("Couldn't write to %s" % path) return dump(lesmianator, open(path, 'w')) if verbose >= 1: print "%d processed, %d skipped" % (processed, skipped) - print "Results dumped do %s" % path + print "Results dumped to %s" % path