From: Radek Czajka Date: Fri, 23 Mar 2012 16:27:35 +0000 (+0100) Subject: move a bunch of things to celery X-Git-Url: https://git.mdrn.pl/wolnelektury.git/commitdiff_plain/cc9a57b827d7303c37f3b2d271e7c2f661d30e45 move a bunch of things to celery --- diff --git a/apps/catalogue/models.py b/apps/catalogue/models.py index f3b7b467e..53caa956d 100644 --- a/apps/catalogue/models.py +++ b/apps/catalogue/models.py @@ -5,10 +5,9 @@ from collections import namedtuple from django.db import models -from django.db.models import permalink, Q +from django.db.models import permalink import django.dispatch from django.core.cache import get_cache -from django.core.files.storage import DefaultStorage from django.utils.translation import ugettext_lazy as _ from django.contrib.auth.models import User from django.template.loader import render_to_string @@ -16,7 +15,7 @@ from django.utils.datastructures import SortedDict from django.utils.safestring import mark_safe from django.utils.translation import get_language from django.core.urlresolvers import reverse -from django.db.models.signals import post_save, m2m_changed, pre_delete, post_delete +from django.db.models.signals import post_save, pre_delete, post_delete import jsonfield from django.conf import settings @@ -25,12 +24,8 @@ from newtagging.models import TagBase, tags_updated from newtagging import managers from catalogue.fields import JSONField, OverwritingFileField from catalogue.utils import create_zip, split_tags, truncate_html_words -from catalogue.tasks import touch_tag, index_book -from shutil import copy -from glob import glob +from catalogue import tasks import re -from os import path -from waiter.settings import WAITER_ROOT import search @@ -217,30 +212,6 @@ def book_upload_path(ext=None, maxlen=100): return lambda *args: get_dynamic_path(*args, ext=ext, maxlen=maxlen) -def customizations_hash(customizations): - customizations.sort() - return hash(tuple(customizations)) - - -def get_customized_pdf_path(book, 
customizations): - """ - Returns a MEDIA_ROOT relative path for a customized pdf. The name will contain a hash of customization options. - """ - h = customizations_hash(customizations) - pdf_name = '%s/%s-custom-%s' % (book.slug, book.slug, h) - pdf_file = pdf_name + '.pdf' - - return pdf_file - - -def get_existing_customized_pdf(book): - """ - Returns a list of paths to generated customized pdf of a book - """ - pdf_glob = path.join(WAITER_ROOT, book.slug, '*') - return glob(pdf_glob) - - class BookMedia(models.Model): FileFormat = namedtuple("FileFormat", "name ext") formats = SortedDict([ @@ -503,70 +474,6 @@ class Book(models.Model): cover.save(imgstr, 'png') self.cover.save(None, ContentFile(imgstr.getvalue())) - def build_pdf(self, customizations=None, file_name=None): - """ (Re)builds the pdf file. - customizations - customizations which are passed to LaTeX class file. - file_name - save the pdf file under a different name and DO NOT save it in db. - """ - from os import unlink - from django.core.files import File - from catalogue.utils import remove_zip - - pdf = self.wldocument().as_pdf(customizations=customizations, - morefloats=settings.LIBRARIAN_PDF_MOREFLOATS) - - if file_name is None: - # we'd like to be sure not to overwrite changes happening while - # (timely) pdf generation is taking place (async celery scenario) - current_self = Book.objects.get(id=self.id) - current_self.pdf_file.save('%s.pdf' % self.slug, - File(open(pdf.get_filename()))) - self.pdf_file = current_self.pdf_file - - # remove cached downloadables - remove_zip(settings.ALL_PDF_ZIP) - - for customized_pdf in get_existing_customized_pdf(self): - unlink(customized_pdf) - else: - print "saving %s" % file_name - print "to: %s" % DefaultStorage().path(file_name) - DefaultStorage().save(file_name, File(open(pdf.get_filename()))) - - def build_mobi(self): - """ (Re)builds the MOBI file. 
- - """ - from django.core.files import File - from catalogue.utils import remove_zip - - mobi = self.wldocument().as_mobi() - - self.mobi_file.save('%s.mobi' % self.slug, File(open(mobi.get_filename()))) - - # remove zip with all mobi files - remove_zip(settings.ALL_MOBI_ZIP) - - def build_epub(self): - """(Re)builds the epub file.""" - from django.core.files import File - from catalogue.utils import remove_zip - - epub = self.wldocument().as_epub() - - self.epub_file.save('%s.epub' % self.slug, - File(open(epub.get_filename()))) - - # remove zip package with all epub files - remove_zip(settings.ALL_EPUB_ZIP) - - def build_txt(self): - from django.core.files.base import ContentFile - - text = self.wldocument().as_text() - self.txt_file.save('%s.txt' % self.slug, ContentFile(text.get_string())) - - def build_html(self): from django.core.files.base import ContentFile from slughifi import slughifi @@ -636,15 +543,13 @@ class Book(models.Model): books = Book.objects.filter(parent=None).exclude(**{field_name: ""}) paths = [(pretty_file_name(b), getattr(b, field_name).path) for b in books] - result = create_zip.delay(paths, + return create_zip(paths, getattr(settings, "ALL_%s_ZIP" % format_.upper())) - return result.wait() def zip_audiobooks(self, format_): bm = BookMedia.objects.filter(book=self, type=format_) paths = map(lambda bm: (None, bm.file.path), bm) - result = create_zip.delay(paths, "%s_%s" % (self.slug, format_)) - return result.wait() + return create_zip(paths, "%s_%s" % (self.slug, format_)) def search_index(self, book_info=None, reuse_index=False, index_tags=True): if reuse_index: @@ -680,8 +585,6 @@ class Book(models.Model): def from_text_and_meta(cls, raw_file, book_info, overwrite=False, build_epub=True, build_txt=True, build_pdf=True, build_mobi=True, search_index=True, search_index_tags=True, search_index_reuse=False): - import re - from sortify import sortify # check for parts before we do anything children = [] @@ -737,18 +640,18 @@ class 
Book(models.Model): if book.build_html(): if not settings.NO_BUILD_TXT and build_txt: - book.build_txt() + tasks.build_txt.delay(book.pk) book.build_cover(book_info) if not settings.NO_BUILD_EPUB and build_epub: - book.build_epub() + tasks.build_epub.delay(book.pk) if not settings.NO_BUILD_PDF and build_pdf: - book.build_pdf() + tasks.build_pdf.delay(book.pk) if not settings.NO_BUILD_MOBI and build_mobi: - book.build_mobi() + tasks.build_mobi.delay(book.pk) if not settings.NO_SEARCH_INDEX and search_index: book.search_index(index_tags=search_index_tags, reuse_index=search_index_reuse) @@ -767,7 +670,7 @@ class Book(models.Model): book_descendants += list(child_book.children.all()) for tag in descendants_tags: - touch_tag(tag) + tasks.touch_tag(tag) book.save() @@ -1055,7 +958,7 @@ def _tags_updated_handler(sender, affected_tags, **kwargs): # reset tag global counter # we want Tag.changed_at updated for API to know the tag was touched for tag in affected_tags: - touch_tag(tag) + tasks.touch_tag(tag) # if book tags changed, reset book tag counter if isinstance(sender, Book) and \ diff --git a/apps/catalogue/tasks.py b/apps/catalogue/tasks.py index e1ff9151a..6d19ee18a 100755 --- a/apps/catalogue/tasks.py +++ b/apps/catalogue/tasks.py @@ -3,11 +3,12 @@ # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # from datetime import datetime -from celery.task import task -import catalogue.models from traceback import print_exc +from celery.task import task +from django.conf import settings -@task + +# TODO: move to model? 
def touch_tag(tag): update_dict = { 'book_count': tag.get_count(), @@ -19,9 +20,97 @@ def touch_tag(tag): @task def index_book(book_id, book_info=None): + from catalogue.models import Book try: - return catalogue.models.Book.objects.get(id=book_id).search_index(book_info) + return Book.objects.get(id=book_id).search_index(book_info) except Exception, e: print "Exception during index: %s" % e print_exc() raise e + + +@task(ignore_result=True) +def build_txt(book_id): + """(Re)builds the TXT file for a book.""" + from django.core.files.base import ContentFile + from catalogue.models import Book + + text = Book.objects.get(pk=book_id).wldocument().as_text() + + # Save the file in new instance. Building TXT takes time and we don't want + # to overwrite any interim changes. + book = Book.objects.get(id=book_id) + book.txt_file.save('%s.txt' % book.slug, ContentFile(text.get_string())) + + +@task(ignore_result=True, rate_limit=settings.CATALOGUE_PDF_RATE_LIMIT) +def build_pdf(book_id): + """(Re)builds the pdf file for a book.""" + from django.core.files import File + from catalogue.models import Book + from catalogue.utils import remove_zip + from waiter.utils import clear_cache + + pdf = Book.objects.get(pk=book_id).wldocument().as_pdf( + morefloats=settings.LIBRARIAN_PDF_MOREFLOATS) + + # Save the file in new instance. Building PDF takes time and we don't want + # to overwrite any interim changes. + book = Book.objects.get(id=book_id) + book.pdf_file.save('%s.pdf' % book.slug, + File(open(pdf.get_filename()))) + + # Remove cached downloadables + remove_zip(settings.ALL_PDF_ZIP) + clear_cache(book.slug) + + +@task(ignore_result=True, rate_limit=settings.CATALOGUE_EPUB_RATE_LIMIT) +def build_epub(book_id): + """(Re)builds the EPUB file for a book.""" + from django.core.files import File + from catalogue.models import Book + from catalogue.utils import remove_zip + + epub = Book.objects.get(pk=book_id).wldocument().as_epub() + # Save the file in new instance. 
Building EPUB takes time and we don't want + # to overwrite any interim changes. + book = Book.objects.get(id=book_id) + book.epub_file.save('%s.epub' % book.slug, + File(open(epub.get_filename()))) + + # remove zip with all epub files + remove_zip(settings.ALL_EPUB_ZIP) + + +@task(ignore_result=True, rate_limit=settings.CATALOGUE_MOBI_RATE_LIMIT) +def build_mobi(book_id): + """(Re)builds the MOBI file for a book.""" + from django.core.files import File + from catalogue.models import Book + from catalogue.utils import remove_zip + + mobi = Book.objects.get(pk=book_id).wldocument().as_mobi() + # Save the file in new instance. Building MOBI takes time and we don't want + # to overwrite any interim changes. + book = Book.objects.get(id=book_id) + book.mobi_file.save('%s.mobi' % book.slug, + File(open(mobi.get_filename()))) + + # remove zip with all mobi files + remove_zip(settings.ALL_MOBI_ZIP) + + +@task(rate_limit=settings.CATALOGUE_CUSTOMPDF_RATE_LIMIT) +def build_custom_pdf(book_id, customizations, file_name): + """Builds a custom PDF file.""" + from django.core.files import File + from django.core.files.storage import DefaultStorage + from catalogue.models import Book + + print "will gen %s" % DefaultStorage().path(file_name) + if not DefaultStorage().exists(file_name): + pdf = Book.objects.get(pk=book_id).wldocument().as_pdf( + customizations=customizations, + morefloats=settings.LIBRARIAN_PDF_MOREFLOATS) + DefaultStorage().save(file_name, File(open(pdf.get_filename()))) diff --git a/apps/catalogue/utils.py b/apps/catalogue/utils.py index 949ac96b2..9de4eaa2d 100644 --- a/apps/catalogue/utils.py +++ b/apps/catalogue/utils.py @@ -9,23 +9,18 @@ import re import time from base64 import urlsafe_b64encode -from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpResponsePermanentRedirect +from django.http import HttpResponse from django.core.files.uploadedfile import UploadedFile -from django.core.files.base import File from django.core.files.storage 
import DefaultStorage from django.utils.encoding import force_unicode from django.utils.hashcompat import sha_constructor from django.conf import settings -from celery.task import task from os import mkdir, path, unlink from errno import EEXIST, ENOENT from fcntl import flock, LOCK_EX from zipfile import ZipFile -from traceback import print_exc from reporting.utils import read_chunks -from celery.task import task -import catalogue.models # Use the system (hardware-based) random number generator if it exists. if hasattr(random, 'SystemRandom'): @@ -84,7 +79,7 @@ class LockFile(object): self.lock.close() -@task +#@task def create_zip(paths, zip_slug): """ Creates a zip in MEDIA_ROOT/zip directory containing files from path. @@ -140,25 +135,6 @@ class AttachmentHttpResponse(HttpResponse): for chunk in read_chunks(f): self.write(chunk) -@task(rate_limit=settings.CATALOGUE_CUSTOMPDF_RATE_LIMIT) -def async_build_pdf(book_id, customizations, file_name): - """ - A celery task to generate pdf files. - Accepts the same args as Book.build_pdf, but with book id as first parameter - instead of Book instance - """ - try: - book = catalogue.models.Book.objects.get(id=book_id) - print "will gen %s" % DefaultStorage().path(file_name) - if not DefaultStorage().exists(file_name): - book.build_pdf(customizations=customizations, file_name=file_name) - print "done." - except Exception, e: - print "Error during pdf creation: %s" % e - print_exc - raise e - - class MultiQuerySet(object): def __init__(self, *args, **kwargs): self.querysets = args @@ -260,3 +236,24 @@ def truncate_html_words(s, num, end_text='...'): out += '' % tag # Return string return out + + +def customizations_hash(customizations): + customizations.sort() + return hash(tuple(customizations)) + + +def get_customized_pdf_path(book, customizations): + """ + Returns a MEDIA_ROOT relative path for a customized pdf. The name will contain a hash of customization options. 
+ """ + h = customizations_hash(customizations) + return 'book/%s/%s-custom-%s.pdf' % (book.slug, book.slug, h) + + +def clear_custom_pdf(book): + """ + Removes the cached customized pdf files generated for a book + """ + from waiter.utils import clear_cache + clear_cache('book/%s' % book.slug) diff --git a/apps/catalogue/views.py b/apps/catalogue/views.py index 5dff961d4..eadaeca94 100644 --- a/apps/catalogue/views.py +++ b/apps/catalogue/views.py @@ -22,14 +22,13 @@ from ajaxable.utils import JSONResponse, AjaxableFormView from catalogue import models from catalogue import forms -from catalogue.utils import (split_tags, - async_build_pdf, MultiQuerySet) +from catalogue.utils import split_tags, MultiQuerySet, get_customized_pdf_path +from catalogue.tasks import build_custom_pdf from pdcounter import models as pdcounter_models from pdcounter import views as pdcounter_views from suggest.forms import PublishingSuggestForm from picture.models import Picture -from os import path from waiter.models import WaitedFile staff_required = user_passes_test(lambda user: user.is_staff) @@ -539,10 +538,10 @@ def download_custom_pdf(request, slug, method='GET'): form = forms.CustomPDFForm(method == 'GET' and request.GET or request.POST) if form.is_valid(): cust = form.customizations - pdf_file = models.get_customized_pdf_path(book, cust) + pdf_file = get_customized_pdf_path(book, cust) url = WaitedFile.order(pdf_file, - lambda p: async_build_pdf.delay(book.id, cust, p), + lambda p: build_custom_pdf.delay(book.id, cust, p), book.pretty_title() ) return redirect(url) diff --git a/apps/dictionary/models.py b/apps/dictionary/models.py index 1d2fbba39..6238ccbf2 100644 --- a/apps/dictionary/models.py +++ b/apps/dictionary/models.py @@ -3,7 +3,7 @@ # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. 
# from django.db import models - +from celery.task import task from sortify import sortify from catalogue.models import Book @@ -19,15 +19,17 @@ class Note(models.Model): ordering = ['sort_key'] -def notes_from_book(sender, **kwargs): - from librarian import html - - Note.objects.filter(book=sender).delete() - if sender.html_file: - for anchor, text_str, html_str in html.extract_annotations(sender.html_file.path): - Note.objects.create(book=sender, anchor=anchor, +@task(ignore_result=True) +def build_notes(book_id): + book = Book.objects.get(pk=book_id) + Note.objects.filter(book=book).delete() + if book.html_file: + from librarian import html + for anchor, text_str, html_str in html.extract_annotations(book.html_file.path): + Note.objects.create(book=book, anchor=anchor, html=html_str, sort_key=sortify(text_str).strip()[:128]) - -# always re-extract notes after making a HTML in a Book -Book.html_built.connect(notes_from_book) + +@Book.html_built.connect +def notes_from_book(sender, **kwargs): + build_notes.delay(sender.pk) diff --git a/apps/waiter/migrations/0001_initial.py b/apps/waiter/migrations/0001_initial.py index 062d6f895..1c27085e8 100644 --- a/apps/waiter/migrations/0001_initial.py +++ b/apps/waiter/migrations/0001_initial.py @@ -12,7 +12,7 @@ class Migration(SchemaMigration): db.create_table('waiter_waitedfile', ( ('id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), ('path', self.gf('django.db.models.fields.CharField')(unique=True, max_length=255, db_index=True)), - ('task', self.gf('django.db.models.fields.CharField')(max_length=64, null=True)), + ('task', self.gf('picklefield.fields.PickledObjectField')(null=True)), ('description', self.gf('django.db.models.fields.CharField')(max_length=255, null=True, blank=True)), )) db.send_create_signal('waiter', ['WaitedFile']) @@ -30,7 +30,7 @@ class Migration(SchemaMigration): 'description': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True', 'blank': 'True'}), 'id': 
('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), 'path': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '255', 'db_index': 'True'}), - 'task': ('django.db.models.fields.CharField', [], {'max_length': '64', 'null': 'True'}) + 'task': ('picklefield.fields.PickledObjectField', [], {'null': 'True'}) } } diff --git a/apps/waiter/models.py b/apps/waiter/models.py index 26a9a6d4d..59eeea682 100644 --- a/apps/waiter/models.py +++ b/apps/waiter/models.py @@ -1,22 +1,17 @@ -from os.path import join, abspath, exists +from os.path import join, isfile from django.core.urlresolvers import reverse from django.db import models -from waiter.settings import WAITER_ROOT, WAITER_URL from djcelery.models import TaskMeta +from waiter.settings import WAITER_URL +from waiter.utils import check_abspath +from picklefield import PickledObjectField class WaitedFile(models.Model): path = models.CharField(max_length=255, unique=True, db_index=True) - task = models.CharField(max_length=64, null=True, editable=False) + task = PickledObjectField(null=True, editable=False) description = models.CharField(max_length=255, null=True, blank=True) - @staticmethod - def abspath(path): - abs_path = abspath(join(WAITER_ROOT, path)) - if not abs_path.startswith(WAITER_ROOT): - raise ValueError('Path not inside WAITER_ROOT.') - return abs_path - @classmethod def exists(cls, path): """Returns opened file or None. @@ -24,10 +19,10 @@ class WaitedFile(models.Model): `path` is relative to WAITER_ROOT. Won't open a path leading outside of WAITER_ROOT. """ - abs_path = cls.abspath(path) + abs_path = check_abspath(path) # Pre-fetch objects for deletion to avoid minor race condition relevant = [o.id for o in cls.objects.filter(path=path)] - if exists(abs_path): + if isfile(abs_path): cls.objects.filter(id__in=relevant).delete() return True else: @@ -37,13 +32,7 @@ class WaitedFile(models.Model): if self.task is None: # Race; just let the other task roll. 
return False - try: - meta = TaskMeta.objects.get(task_id=self.task) - assert meta.status in (u'PENDING', u'STARTED', u'SUCCESS', u'RETRY') - except TaskMeta.DoesNotExist: - # Might happen it's not yet there. - pass - except AssertionError: + if self.task.status not in (u'PENDING', u'STARTED', u'SUCCESS', u'RETRY'): return True return False @@ -61,7 +50,7 @@ class WaitedFile(models.Model): if not already: waited, created = cls.objects.get_or_create(path=path) if created or waited.is_stale(): - waited.task = task_creator(cls.abspath(path)) + waited.task = task_creator(check_abspath(path)) waited.description = description waited.save() return reverse("waiter", args=[path]) diff --git a/apps/waiter/templates/waiter/wait.html b/apps/waiter/templates/waiter/wait.html index a9efecddb..e15bd6455 100644 --- a/apps/waiter/templates/waiter/wait.html +++ b/apps/waiter/templates/waiter/wait.html @@ -78,8 +78,8 @@ function wait() { else setTimeout(wait, 10*1000); }, - error: function() { - setTimeout(wait, 10*1000); + error: function(xhr) { + location.reload(); } }); } diff --git a/apps/waiter/utils.py b/apps/waiter/utils.py new file mode 100644 index 000000000..0957e9d80 --- /dev/null +++ b/apps/waiter/utils.py @@ -0,0 +1,17 @@ +from os.path import abspath, join, exists +from shutil import rmtree +from waiter.settings import WAITER_ROOT + + +def check_abspath(path): + abs_path = abspath(join(WAITER_ROOT, path)) + if not abs_path.startswith(WAITER_ROOT): + raise ValueError('Path not inside WAITER_ROOT.') + return abs_path + + +def clear_cache(path): + abs_path = check_abspath(path) + if exists(abs_path): + rmtree(abs_path) + diff --git a/requirements.txt b/requirements.txt index 81451dbb3..3c85e3153 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,7 @@ django-rosetta>=0.5.3 django-maintenancemode>=0.9 django-piston django-jsonfield +django-picklefield django-allauth django-honeypot diff --git a/wolnelektury/settings/custom.py b/wolnelektury/settings/custom.py 
index eed38a2bc..9abb5edd3 100644 --- a/wolnelektury/settings/custom.py +++ b/wolnelektury/settings/custom.py @@ -22,7 +22,10 @@ ALL_MOBI_ZIP = 'wolnelektury_pl_mobi' CATALOGUE_DEFAULT_LANGUAGE = 'pol' PUBLISH_PLAN_FEED = 'http://redakcja.wolnelektury.pl/documents/track/editor-proofreading/?published=false' -# limit rate for custom PDF creation +# limit rate for ebooks creation +CATALOGUE_PDF_RATE_LIMIT = '1/m' +CATALOGUE_EPUB_RATE_LIMIT = '6/m' +CATALOGUE_MOBI_RATE_LIMIT = '5/m' CATALOGUE_CUSTOMPDF_RATE_LIMIT = '1/m' # set to 'new' or 'old' to skip time-consuming test