From 6e88635c94b36eaff84bd4274983af977986b699 Mon Sep 17 00:00:00 2001
From: Radek Czajka
Date: Wed, 21 Sep 2011 16:51:47 +0200
Subject: [PATCH 1/1] first take on The Great Import

---
 .../management/commands/import_wl.py          | 97 +++++++++++++++++++
 apps/catalogue/models.py                      | 80 +++++++++++++--
 apps/catalogue/xml_tools.py                   | 93 ++++++++++++++++--
 3 files changed, 256 insertions(+), 14 deletions(-)
 create mode 100755 apps/catalogue/management/commands/import_wl.py

diff --git a/apps/catalogue/management/commands/import_wl.py b/apps/catalogue/management/commands/import_wl.py
new file mode 100755
index 00000000..6836d366
--- /dev/null
+++ b/apps/catalogue/management/commands/import_wl.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+"""Management command importing book XML files from the WL API."""
+
+import json
+from optparse import make_option
+import urllib2
+
+from django.core.management.base import BaseCommand
+from django.core.management.color import color_style
+from django.db import transaction
+from librarian.dcparser import BookInfo
+from librarian import ParseError, ValidationError
+
+from catalogue.models import Book
+
+
+WL_API = 'http://www.wolnelektury.pl/api/books/'
+
+
+class Command(BaseCommand):
+    """Import the XML of every book listed by WL_API into the catalogue."""
+
+    option_list = BaseCommand.option_list + (
+        make_option('-q', '--quiet', action='store_false', dest='verbose', default=True,
+            help='Less output'),
+    )
+    help = 'Imports XML files from WL.'
+
+    def handle(self, *args, **options):
+        """Import all WL books inside one database transaction.
+
+        Books already in the catalogue are indexed by the slug that
+        BookInfo parses out of their text; a WL book with a matching slug
+        updates that book, any other WL book is imported as a new one.
+        If anything fails, the transaction is rolled back and the error
+        is re-raised.
+        """
+        self.style = color_style()
+
+        verbose = options.get('verbose')
+
+        # Start transaction management.
+        transaction.commit_unless_managed()
+        transaction.enter_transaction_management()
+        transaction.managed(True)
+
+        try:
+            if verbose:
+                print 'Reading currently managed files.'
+            slugs = {}
+            for b in Book.objects.all():
+                if verbose:
+                    print b.slug
+                text = b.materialize().encode('utf-8')
+                try:
+                    info = BookInfo.from_string(text)
+                except (ParseError, ValidationError):
+                    # No usable metadata: this book cannot be matched
+                    # against WL, so it is simply left alone.
+                    pass
+                else:
+                    slugs[info.slug] = b
+
+            book_count = 0
+
+            if verbose:
+                print 'Opening books list'
+            for book in json.load(urllib2.urlopen(WL_API)):
+                book_detail = json.load(urllib2.urlopen(book['href']))
+                xml_text = urllib2.urlopen(book_detail['xml']).read()
+                info = BookInfo.from_string(xml_text)
+                previous_book = slugs.get(info.slug, None)
+                if previous_book:
+                    comm = previous_book.slug
+                else:
+                    comm = '*'
+                print book_count, info.slug, '-->', comm
+                Book.import_xml_text(xml_text, title=info.title,
+                    slug=info.slug, previous_book=previous_book)
+                book_count += 1
+
+            # Print results
+            print
+            print "Results:"
+            print "Imported %d books from WL:" % (
+                book_count, )
+            print
+        except:
+            # Roll back the partial import; leaving transaction management
+            # with uncommitted changes would raise an error of its own and
+            # hide the original one.
+            transaction.rollback()
+            raise
+        else:
+            transaction.commit()
+        finally:
+            transaction.leave_transaction_management()
diff --git a/apps/catalogue/models.py b/apps/catalogue/models.py
index f9686547..ebeb9aef 100644
--- a/apps/catalogue/models.py
+++ b/apps/catalogue/models.py
@@ -7,9 +7,12 @@ from django.contrib.auth.models import User
 from django.core.urlresolvers import reverse
 from django.db import models
 from django.utils.translation import ugettext_lazy as _
+from django.db.utils import IntegrityError
+
+from slughifi import slughifi
 
 from dvcs import models as dvcs_models
-from catalogue.xml_tools import compile_text
+from catalogue.xml_tools import compile_text, split_xml
 
 import logging
 logger = logging.getLogger("fnp.catalogue")
@@ -40,6 +43,52 @@ class Book(models.Model):
     def get_absolute_url(self):
         return reverse("catalogue_book", args=[self.slug])
 
+    @classmethod
+    def import_xml_text(cls, text=u'', creator=None, previous_book=None,
+            *args, **kwargs):
+        """Import a book from XML text, one chunk per chapter.
+
+        The text is cut into parts by split_xml().  The parts overwrite
+        the chunks of `previous_book` if one is given, or fill a book
+        newly created from `args` and `kwargs` otherwise.  Every part is
+        committed to its chunk with `creator` as the author.
+
+        Returns the imported book.
+        """
+        texts = split_xml(text)
+        if previous_book:
+            instance = previous_book
+        else:
+            instance = cls(*args, **kwargs)
+            instance.save()
+
+        # If the book has more chunks than the new text has parts, pad the
+        # parts with empty ones so that the leftover chunks get emptied.
+        book_len = len(instance)
+        for i in range(book_len - len(texts)):
+            # list.append() takes exactly one argument: a (title, text) pair
+            texts.append((u'pusta część %d' % (i + 1), u''))
+
+        for i, (title, part_text) in enumerate(texts):
+            if not title:
+                title = u'część %d' % (i + 1)
+
+            slug = slughifi(title)
+
+            if i < book_len:
+                # reuse the existing chunk at this position
+                chunk = instance[i]
+                chunk.slug = slug
+                chunk.comment = title
+                chunk.save()
+            else:
+                # more parts than chunks: add a new chunk at the end
+                chunk = instance.add(slug, title, creator, adjust_slug=True)
+
+            chunk.commit(part_text, author=creator)
+
+        return instance
+
     @classmethod
     def create(cls, creator=None, text=u'', *args, **kwargs):
         """
@@ -48,7 +97,7 @@ class Book(models.Model):
         """
         instance = cls(*args, **kwargs)
         instance.save()
-        instance[0].commit(author=creator, text=text)
+        instance[0].commit(text, author=creator)
         return instance
 
     def __iter__(self):
@@ -75,7 +124,7 @@ class Book(models.Model):
         if publishable:
             changes = [chunk.publishable() for chunk in self]
         else:
-            changes = [chunk.head for chunk in self]
+            changes = [chunk.head for chunk in self if chunk.head is not None]
         if None in changes:
             raise self.NoTextError('Some chunks have no available text.')
         return changes
@@ -126,6 +175,7 @@ class Book(models.Model):
         return new_slug
 
     def append(self, other):
+        """Add all chunks of another book to self."""
         number = self[len(self) - 1].number + 1
         single = len(other) == 1
         for chunk in other:
@@ -155,6 +205,10 @@ class Book(models.Model):
                 number += 1
         other.delete()
 
+    def add(self, *args, **kwargs):
+        """Add a new chunk at the end."""
+        return self.chunk_set.reverse()[0].split(*args, **kwargs)
+
     @staticmethod
     def listener_create(sender, instance, created, **kwargs):
         if created:
@@ -196,12 +250,26 @@ class Chunk(dvcs_models.Document):
             title += " (%d/%d)" % (self.number, book_length)
         return title
 
-    def split(self, slug, comment='', creator=None):
+    def split(self, slug, comment='', creator=None, adjust_slug=False):
         """ Create an empty chunk after this one """
         self.book.chunk_set.filter(number__gt=self.number).update(
                 number=models.F('number')+1)
-        new_chunk = self.book.chunk_set.create(number=self.number+1,
-                creator=creator, slug=slug, comment=comment)
+        tries = 1
+        new_slug = slug
+        new_chunk = None
+        while not new_chunk:
+            try:
+                new_chunk = self.book.chunk_set.create(number=self.number+1,
+                    creator=creator, slug=new_slug, comment=comment)
+            except IntegrityError:
+                if not adjust_slug:
+                    raise
+                # NOTE(review): assumes the error comes from a slug clash;
+                # inside an open transaction some databases refuse further
+                # queries after an IntegrityError -- confirm a savepoint
+                # is not needed here.
+                new_slug = "%s_%d" % (slug, tries)
+                tries += 1
         return new_chunk
 
     @staticmethod
diff --git a/apps/catalogue/xml_tools.py b/apps/catalogue/xml_tools.py
index 928e57be..522806b6 100755
--- a/apps/catalogue/xml_tools.py
+++ b/apps/catalogue/xml_tools.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+from copy import deepcopy
 from functools import wraps
 import re
 
@@ -139,10 +141,11 @@ def compile_text(parts):
     for next_text in parts:
         if not next_text:
             continue
-        # trim the end, because there's more non-empty text
-        # don't trim beginning, if `text' is the first non-empty part
-        texts.append(_trim(text, trim_begin=trim_begin))
-        trim_begin = True
+        if text:
+            # trim the end, because there's more non-empty text
+            # don't trim beginning, if `text' is the first non-empty part
+            texts.append(_trim(text, trim_begin=trim_begin))
+            trim_begin = True
         text = next_text
     # don't trim the end, because there's no more text coming after `text'
     # only trim beginning if it's not still the first non-empty
@@ -156,7 +159,7 @@ def change_master(text, master):
     """
     e = etree.fromstring(text)
     e[-1].tag = master
-    return etree.tostring(e, encoding="utf-8")
+    return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
 
 
 def basic_structure(text, master):
@@ -167,7 +170,7 @@ def basic_structure(text, master):
 ''' % (TRIM_BEGIN, TRIM_END))
     e[0].tag = master
     e[0][0].tail = "\n"*3 + text + "\n"*3
-    return etree.tostring(e, encoding="utf-8")
+    return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
 
 
 def add_trim_begin(text):
@@ -182,7 +185,7 @@ def add_trim_begin(text):
     master.insert(0, trim_tag)
     trim_tag.tail = '\n\n\n' + (master.text or '')
     master.text = '\n'
-    return etree.tostring(e, encoding="utf-8")
+    return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
 
 
 def add_trim_end(text):
@@ -201,4 +204,78 @@ def add_trim_end(text):
         prev.tail = (prev.tail or '') + '\n\n\n'
     else:
         master.text = (master.text or '') + '\n\n\n'
-    return etree.tostring(e, encoding="utf-8")
+    return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
+
+
+def split_xml(text):
+    """Split the XML text of a book into chapters.
+
+    The document is cut before every `naglowek_rozdzial' element.  Returns
+    a list of [title, xml] pairs in document order, both items being
+    unicode strings.  The first pair, titled u'początek', holds whatever
+    precedes the first chapter header; every other pair is titled with
+    the text of its chapter header, leaving out the content of some of
+    its child elements (see the tag list in the code).  Each part keeps
+    the enclosing elements of the document, has everything outside of
+    the part removed, and is passed through add_trim_begin() and
+    add_trim_end() on the sides where it was cut.
+
+    All this stuff really must go somewhere else.
+
+    """
+    src = etree.fromstring(text)
+    chunks = []
+
+    splitter = u'naglowek_rozdzial'
+    parts = src.findall('.//' + splitter)
+    while parts:
+        # copy the document
+        copied = deepcopy(src)
+
+        element = parts[-1]
+
+        # find the chapter's title
+        name_elem = deepcopy(element)
+        for tag in 'extra', 'motyw', 'pa', 'pe', 'pr', 'pt', 'uwaga':
+            for a in name_elem.findall('.//' + tag):
+                a.text = ''
+                del a[:]
+        # with_tail=False: the text following the header is not part of
+        # the title; decode so that titles are unicode like the XML is
+        name = unicode(etree.tostring(name_elem, method='text',
+            encoding='utf-8', with_tail=False), 'utf-8').strip()
+
+        # in the original, remove everything from the start of the last chapter
+        parent = element.getparent()
+        del parent[parent.index(element):]
+        element, parent = parent, parent.getparent()
+        while parent is not None:
+            del parent[parent.index(element) + 1:]
+            element, parent = parent, parent.getparent()
+
+        # in the copy, remove everything before the last chapter
+        element = copied.findall('.//' + splitter)[-1]
+        parent = element.getparent()
+        while parent is not None:
+            parent.text = None
+            while parent[0] is not element:
+                del parent[0]
+            element, parent = parent, parent.getparent()
+        chunks[:0] = [[name,
+            unicode(etree.tostring(copied, encoding='utf-8'), 'utf-8')
+            ]]
+
+        parts = src.findall('.//' + splitter)
+
+    chunks[:0] = [[u'początek',
+        unicode(etree.tostring(src, encoding='utf-8'), 'utf-8')
+        ]]
+
+    # every part but the first was cut at its beginning, every part but
+    # the last was cut at its end
+    for ch in chunks[1:]:
+        ch[1] = add_trim_begin(ch[1])
+    for ch in chunks[:-1]:
+        ch[1] = add_trim_end(ch[1])
+
+    return chunks
-- 
2.20.1