--- /dev/null
+# -*- coding: utf-8 -*-
+# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
+import re
+import sys
+from cPickle import load, dump
+from optparse import make_option
+
+from django.core.management.base import BaseCommand
+from django.core.management.color import color_style
+import zipfile
+
+from catalogue.models import Book, Tag
+
+
+class Command(BaseCommand):
+    """Management command packing book files of one type into a ZIP archive.
+
+    Usage: ``[xml|txt|html|epub|pdf] output_path.zip``. By default every book
+    is considered; ``--tags``, ``--include`` and ``--exclude`` narrow or
+    extend the selection (each takes a comma-separated list of slugs).
+    """
+
+    option_list = BaseCommand.option_list + (
+        make_option('-t', '--tags', dest='tags', metavar='SLUG,...',
+            help='Use only books tagged with this tags'),
+        make_option('-i', '--include', dest='include', metavar='SLUG,...',
+            help='Include specific books by slug'),
+        make_option('-e', '--exclude', dest='exclude', metavar='SLUG,...',
+            help='Exclude specific books by slug')
+        )
+    help = 'Prepare ZIP package with files of given type.'
+    ftypes = ['xml', 'txt', 'html', 'epub', 'pdf']
+    args = '[%s] output_path.zip' % '|'.join(ftypes)
+
+    def handle(self, ftype, path, **options):
+        """Write the ``<ftype>_file`` of every selected book to a ZIP at `path`.
+
+        ftype   -- one of `ftypes`; selects the Book field ``<ftype>_file``.
+        path    -- location of the ZIP archive to create (opened in 'w' mode).
+        options -- parsed command-line options: 'verbosity', 'tags',
+                   'include' and 'exclude'.
+
+        Problems (unknown file type, nothing to pack) are reported by
+        printing a message; the method returns None in every case.
+        """
+        self.style = color_style()
+        verbose = int(options.get('verbosity'))
+        tags = options.get('tags')
+        include = options.get('include')
+        exclude = options.get('exclude')
+
+        if ftype not in self.ftypes:
+            print self.style.ERROR('Unknown file type.')
+            return
+        field = "%s_file" % ftype
+
+        books = []
+
+        if include:
+            books += list(Book.objects.filter(slug__in=include.split(',')).only('slug', field))
+
+        if tags:
+            books += list(Book.tagged.with_all(Tag.objects.filter(slug__in=tags.split(','))).only('slug', field))
+        elif not include:
+            # Neither --tags nor --include given: start from all books.
+            books = list(Book.objects.all().only('slug', field))
+
+        # Split the exclusion list once instead of once per book.
+        excluded = set(exclude.split(',')) if exclude else set()
+
+        # Drop excluded books and repeated slugs. A book matched by both
+        # --include and --tags would otherwise be written twice under the
+        # same member name, since that name is derived from the slug.
+        seen = set()
+        selected = []
+        for book in books:
+            if book.slug in excluded or book.slug in seen:
+                continue
+            seen.add(book.slug)
+            selected.append(book)
+        books = selected
+
+        archive = zipfile.ZipFile(path, 'w')
+
+        processed = skipped = 0
+        try:
+            for book in books:
+                if verbose >= 2:
+                    print 'Parsing', book.slug
+                content = getattr(book, field)
+                if not content:
+                    if verbose >= 1:
+                        print self.style.NOTICE('%s has no %s file' % (book.slug, ftype))
+                    skipped += 1
+                    continue
+                archive.write(content.path, str('%s.%s' % (book.slug, ftype)))
+                processed += 1
+        finally:
+            # Close the archive even if reading one of the files fails, so
+            # the file handle is not leaked.
+            archive.close()
+
+        if not processed:
+            if skipped:
+                print self.style.ERROR("No books with %s files found" % ftype)
+            else:
+                print self.style.ERROR("No books found")
+            return
+
+        if verbose >= 1:
+            print "%d processed, %d skipped" % (processed, skipped)
+            print "Results written to %s" % path
return HttpResponse(datetime.now().strftime('%Y/%m/%d %H:%M:%S'))
-@cache.never_cache
-def xmls(request):
- """"
- Create a zip archive with all XML files.
- This should be removed when we have real API.
- """
- temp = tempfile.TemporaryFile()
- archive = zipfile.ZipFile(temp, 'w')
-
- for book in models.Book.objects.all():
- archive.write(book.xml_file.path, str('%s.xml' % book.slug))
- archive.close()
-
- response = HttpResponse(content_type='application/zip', mimetype='application/x-zip-compressed')
- response['Content-Disposition'] = 'attachment; filename=xmls.zip'
- response['Content-Length'] = temp.tell()
-
- temp.seek(0)
- response.write(temp.read())
- return response
-
-
-
# info views for API
def book_info(request, id, lang='pl'):
# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
+import re
import sys
from cPickle import load, dump
from optparse import make_option
from catalogue.models import Book, Tag
+# Extract the text proper from a text file: capture (group 1) whatever
+# follows a run of three or more newlines, up to -- but excluding -- the
+# newlines directly before a line starting with "-----". re.S lets "."
+# span line breaks; re_text is the bound search() of the compiled pattern.
+re_text = re.compile(r'\n{3,}(.*?)\n*-----\n', re.S).search
+
class Command(BaseCommand):
option_list = BaseCommand.option_list + (
print self.style.NOTICE('%s has no TXT file' % book.slug)
skipped += 1
continue
- processed += 1
- last_word = ''
- for number, line in enumerate(book.txt_file):
- if number < 17:
+ with open(book.txt_file.path) as f:
+ m = re_text(f.read())
+ if not m:
+ print self.style.ERROR("Unknown text format: %s" % book.slug)
+ skipped += 1
continue
- line = unicode(line, 'utf-8').lower()
- for letter in line:
+
+ processed += 1
+ last_word = ''
+ text = unicode(m.group(1), 'utf-8').lower()
+ for letter in text:
mydict = lesmianator.setdefault(last_word, {})
myval = mydict.setdefault(letter, 0)
mydict[letter] += 1
+ # Pickle the collected table to `path`; on any failure report it and stop.
try:
dump(lesmianator, open(path, 'w'))
except:
- print self.style.ERROR("Counldn't write to $s" % path)
+ print self.style.ERROR("Couldn't write to %s" % path)
return
dump(lesmianator, open(path, 'w'))
if verbose >= 1:
print "%d processed, %d skipped" % (processed, skipped)
- print "Results dumped do %s" % path
+ print "Results dumped to %s" % path