# -*- coding: utf-8 -*-
# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
import re
import sys
from cPickle import load, dump
from optparse import make_option

from django.core.management.base import BaseCommand
from django.core.management.color import color_style
import zipfile

from catalogue.models import Book, Tag


class Command(BaseCommand):
    """Pack the files of one type for a selection of books into a ZIP archive.

    Usage: ``pack <ftype> <output_path.zip> [-t SLUGS] [-i SLUGS] [-e SLUGS]``
    where ``ftype`` is one of ``ftypes`` and every SLUGS value is a
    comma-separated list of slugs.
    """
    option_list = BaseCommand.option_list + (
        make_option('-t', '--tags', dest='tags', metavar='SLUG,...',
            help='Use only books tagged with this tags'),
        make_option('-i', '--include', dest='include', metavar='SLUG,...',
            help='Include specific books by slug'),
        make_option('-e', '--exclude', dest='exclude', metavar='SLUG,...',
            help='Exclude specific books by slug')
    )
    help = 'Pack book files of a given type into a ZIP archive.'

    # File types that can be packed; each maps to a "<ftype>_file" field
    # read from the Book instances.
    ftypes = ['xml', 'txt', 'html', 'epub', 'pdf']
    args = '[%s] output_path.zip' % '|'.join(ftypes)

    def handle(self, ftype, path, **options):
        """Write the ``ftype`` file of every selected book into ``path``.

        Arguments:
            ftype: one of ``self.ftypes``; anything else prints an error
                and returns without touching ``path``.
            path: location of the ZIP archive to create (opened in 'w' mode).
            options: parsed command-line options; ``verbosity``, ``tags``,
                ``include`` and ``exclude`` are read.

        Each archive member is named ``<slug>.<ftype>``.  Books whose file
        field is empty are counted as skipped.
        """
        self.style = color_style()
        # Default to the normal verbosity level when the option is missing
        # instead of failing on int(None).
        verbosity = options.get('verbosity')
        verbose = 1 if verbosity is None else int(verbosity)
        tags = options.get('tags')
        include = options.get('include')
        exclude = options.get('exclude')

        if ftype not in self.ftypes:
            print(self.style.ERROR('Unknown file type.'))
            return
        field = "%s_file" % ftype

        books = self._select_books(field, tags, include, exclude)

        processed = skipped = 0
        archive = zipfile.ZipFile(path, 'w')
        # try/finally guarantees the archive is closed even when writing
        # one of the files fails.
        try:
            for book in books:
                if verbose >= 2:
                    print('Parsing %s' % book.slug)
                content = getattr(book, field)
                if not content:
                    if verbose >= 1:
                        print(self.style.NOTICE(
                            '%s has no %s file' % (book.slug, ftype)))
                    skipped += 1
                    continue
                archive.write(content.path,
                              str('%s.%s' % (book.slug, ftype)))
                processed += 1
        finally:
            archive.close()

        if not processed:
            if skipped:
                print(self.style.ERROR(
                    "No books with %s files found" % ftype))
            else:
                print(self.style.ERROR("No books found"))
            return

        if verbose >= 1:
            print("%d processed, %d skipped" % (processed, skipped))
            print("Results written to %s" % path)

    def _select_books(self, field, tags, include, exclude):
        """Return the books to pack, each slug at most once.

        ``include`` adds books by slug; ``tags`` adds the books returned by
        ``Book.tagged.with_all`` for the listed tag slugs (presumably the
        books carrying every listed tag -- confirm against the tagging
        manager).  With neither option, all books are used.  ``exclude``
        removes books by slug.  Only ``slug`` and ``field`` are loaded.
        """
        candidates = []
        if include:
            candidates += list(
                Book.objects.filter(slug__in=include.split(','))
                    .only('slug', field))
        if tags:
            tag_set = Tag.objects.filter(slug__in=tags.split(','))
            candidates += list(
                Book.tagged.with_all(tag_set).only('slug', field))
        elif not include:
            candidates = list(Book.objects.all().only('slug', field))

        # One set serves both purposes: it starts with the excluded slugs
        # and grows with every accepted slug, so a book matched by both
        # --include and --tags is packed only once.
        rejected = set(exclude.split(',')) if exclude else set()
        books = []
        for book in candidates:
            if book.slug in rejected:
                continue
            rejected.add(book.slug)
            books.append(book)
        return books
diff --git a/apps/catalogue/views.py b/apps/catalogue/views.py index fd8a6330f..44a01479d 100644 --- a/apps/catalogue/views.py +++ b/apps/catalogue/views.py @@ -779,29 +779,6 @@ def clock(request): return HttpResponse(datetime.now().strftime('%Y/%m/%d %H:%M:%S')) -@cache.never_cache -def xmls(request): - """" - Create a zip archive with all XML files. - This should be removed when we have real API. - """ - temp = tempfile.TemporaryFile() - archive = zipfile.ZipFile(temp, 'w') - - for book in models.Book.objects.all(): - archive.write(book.xml_file.path, str('%s.xml' % book.slug)) - archive.close() - - response = HttpResponse(content_type='application/zip', mimetype='application/x-zip-compressed') - response['Content-Disposition'] = 'attachment; filename=xmls.zip' - response['Content-Length'] = temp.tell() - - temp.seek(0) - response.write(temp.read()) - return response - - - # info views for API def book_info(request, id, lang='pl'): diff --git a/apps/lesmianator/management/commands/lesmianator.py b/apps/lesmianator/management/commands/lesmianator.py index 36d71445f..c0219214d 100644 --- a/apps/lesmianator/management/commands/lesmianator.py +++ b/apps/lesmianator/management/commands/lesmianator.py @@ -2,6 +2,7 @@ # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. 
# +import re import sys from cPickle import load, dump from optparse import make_option @@ -12,6 +13,9 @@ from django.conf import settings from catalogue.models import Book, Tag +# extract text from text file +re_text = re_text = re.compile(r'\n{3,}(.*?)\n*-----\n', re.S).search + class Command(BaseCommand): option_list = BaseCommand.option_list + ( @@ -62,13 +66,17 @@ class Command(BaseCommand): print self.style.NOTICE('%s has no TXT file' % book.slug) skipped += 1 continue - processed += 1 - last_word = '' - for number, line in enumerate(book.txt_file): - if number < 17: + with open(book.txt_file.path) as f: + m = re_text(f.read()) + if not m: + print self.style.ERROR("Unknown text format: %s" % book.slug) + skipped += 1 continue - line = unicode(line, 'utf-8').lower() - for letter in line: + + processed += 1 + last_word = '' + text = unicode(m.group(1), 'utf-8').lower() + for letter in text: mydict = lesmianator.setdefault(last_word, {}) myval = mydict.setdefault(letter, 0) mydict[letter] += 1 @@ -84,10 +92,10 @@ class Command(BaseCommand): try: dump(lesmianator, open(path, 'w')) except: - print self.style.ERROR("Counldn't write to $s" % path) + print self.style.ERROR("Couldn't write to $s" % path) return dump(lesmianator, open(path, 'w')) if verbose >= 1: print "%d processed, %d skipped" % (processed, skipped) - print "Results dumped do %s" % path + print "Results dumped to %s" % path