first take on The Great Import
author Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
Wed, 21 Sep 2011 14:51:47 +0000 (16:51 +0200)
committer Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
Wed, 21 Sep 2011 14:51:47 +0000 (16:51 +0200)
apps/catalogue/management/commands/import_wl.py [new file with mode: 0755]
apps/catalogue/models.py
apps/catalogue/xml_tools.py

diff --git a/apps/catalogue/management/commands/import_wl.py b/apps/catalogue/management/commands/import_wl.py
new file mode 100755 (executable)
index 0000000..6836d36
--- /dev/null
@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+
+import json
+from optparse import make_option
+import urllib2
+
+from django.core.management.base import BaseCommand
+from django.core.management.color import color_style
+from django.db import transaction
+from librarian.dcparser import BookInfo
+from librarian import ParseError, ValidationError
+
+from catalogue.models import Book
+
+
+# Entry point of the WL (wolnelektury.pl) books API.  The import command
+# fetches this URL and parses the response as JSON, iterating over entries
+# that each carry an 'href' pointing at the book's detail document.
+WL_API = 'http://www.wolnelektury.pl/api/books/'
+
+
+class Command(BaseCommand):
+    """Import the XML of every book listed by the WL API.
+
+    Each remote book is matched to a local `Book` by the slug declared in
+    its metadata (as parsed by `BookInfo`).  Matched books are updated,
+    the rest are created.  The whole import runs in one manually managed
+    transaction: it is committed only when every book was imported and
+    rolled back when anything fails.
+    """
+    option_list = BaseCommand.option_list + (
+        make_option('-q', '--quiet', action='store_false', dest='verbose',
+            default=True, help='Less output'),
+    )
+    help = 'Imports XML files from WL.'
+
+    @staticmethod
+    def _fetch(url):
+        """Return the raw body found at `url`, always closing the handle."""
+        response = urllib2.urlopen(url)
+        try:
+            return response.read()
+        finally:
+            response.close()
+
+    def _local_books_by_slug(self, verbose):
+        """Map the metadata slug of each local book to its `Book`."""
+        if verbose:
+            print 'Reading currently managed files.'
+        slugs = {}
+        for b in Book.objects.all():
+            if verbose:
+                print b.slug
+            text = b.materialize().encode('utf-8')
+            try:
+                info = BookInfo.from_string(text)
+            except (ParseError, ValidationError):
+                # Without parsable metadata there is no slug to match on,
+                # so such a book is simply never treated as a predecessor.
+                continue
+            slugs[info.slug] = b
+        return slugs
+
+    def _import_books(self, slugs, verbose):
+        """Import every book from the API; return how many were imported."""
+        if verbose:
+            print 'Opening books list'
+        book_count = 0
+        for book in json.loads(self._fetch(WL_API)):
+            book_detail = json.loads(self._fetch(book['href']))
+            xml_text = self._fetch(book_detail['xml'])
+            info = BookInfo.from_string(xml_text)
+            previous_book = slugs.get(info.slug)
+            # Compare with None explicitly: Book truthiness would go
+            # through its __len__ (the number of chunks).
+            if previous_book is not None:
+                comm = previous_book.slug
+            else:
+                comm = '*'
+            if verbose:
+                print book_count, info.slug, '-->', comm
+            Book.import_xml_text(xml_text, title=info.title,
+                slug=info.slug, previous_book=previous_book)
+            book_count += 1
+        return book_count
+
+    def handle(self, *args, **options):
+        """Run the import inside a single transaction and print a summary.
+
+        Any exception raised while reading local books or importing remote
+        ones is re-raised after the transaction has been rolled back.
+        """
+        self.style = color_style()
+
+        verbose = options.get('verbose')
+
+        # Start transaction management.
+        transaction.commit_unless_managed()
+        transaction.enter_transaction_management()
+        transaction.managed(True)
+        try:
+            slugs = self._local_books_by_slug(verbose)
+            book_count = self._import_books(slugs, verbose)
+            transaction.commit()
+        except BaseException:
+            # Leaving a managed block with pending changes is an error in
+            # itself, so discard the partial import before re-raising.
+            transaction.rollback()
+            raise
+        finally:
+            transaction.leave_transaction_management()
+
+        # Print results
+        print
+        print "Results:"
+        print "Imported %d books from WL:" % (
+                book_count, )
+        print
+
index f968654..ebeb9ae 100644 (file)
@@ -7,9 +7,12 @@ from django.contrib.auth.models import User
 from django.core.urlresolvers import reverse
 from django.db import models
 from django.utils.translation import ugettext_lazy as _
+from django.db.utils import IntegrityError
+
+from slughifi import slughifi
 
 from dvcs import models as dvcs_models
-from catalogue.xml_tools import compile_text
+from catalogue.xml_tools import compile_text, split_xml
 
 import logging
 logger = logging.getLogger("fnp.catalogue")
@@ -40,6 +43,41 @@ class Book(models.Model):
     def get_absolute_url(self):
         return reverse("catalogue_book", args=[self.slug])
 
+    @classmethod
+    def import_xml_text(cls, text=u'', creator=None, previous_book=None,
+                *args, **kwargs):
+        """Create or update a book from one XML document.
+
+        `text` is cut into (title, text) parts by `split_xml`; each part
+        is committed to the chunk at the same position.  Existing chunks
+        get their slug and comment replaced, missing ones are added at the
+        end.  When the book already has more chunks than there are parts,
+        the surplus chunks are committed with empty text.
+
+        `previous_book`, if given, is updated in place and the remaining
+        positional and keyword arguments are ignored; otherwise they are
+        passed to the model constructor to create a new book.  `creator`
+        is used as the author of every commit and of every added chunk.
+
+        Returns the created or updated book.
+        """
+        texts = split_xml(text)
+        # Compare with None explicitly: truthiness of a book would go
+        # through its __len__, i.e. its number of chunks.
+        if previous_book is not None:
+            instance = previous_book
+        else:
+            instance = cls(*args, **kwargs)
+            instance.save()
+
+        # if there are more parts, set the rest to empty strings
+        book_len = len(instance)
+        for i in range(book_len - len(texts)):
+            # list.append takes a single argument, so add the pair as one
+            # item, shaped like the pairs produced by split_xml.
+            texts.append([u'pusta część %d' % (i + 1), u''])
+
+        for i, (title, chunk_text) in enumerate(texts):
+            if not title:
+                title = u'część %d' % (i + 1)
+
+            slug = slughifi(title)
+
+            if i < book_len:
+                chunk = instance[i]
+                chunk.slug = slug
+                chunk.comment = title
+                chunk.save()
+            else:
+                chunk = instance.add(slug, title, creator, adjust_slug=True)
+
+            chunk.commit(chunk_text, author=creator)
+
+        return instance
+
+
     @classmethod
     def create(cls, creator=None, text=u'', *args, **kwargs):
         """
@@ -48,7 +86,7 @@ class Book(models.Model):
         """
         instance = cls(*args, **kwargs)
         instance.save()
-        instance[0].commit(author=creator, text=text)
+        instance[0].commit(text, author=creator)
         return instance
 
     def __iter__(self):
@@ -75,7 +113,7 @@ class Book(models.Model):
         if publishable:
             changes = [chunk.publishable() for chunk in self]
         else:
-            changes = [chunk.head for chunk in self]
+            changes = [chunk.head for chunk in self if chunk.head is not None]
         if None in changes:
             raise self.NoTextError('Some chunks have no available text.')
         return changes
@@ -126,6 +164,7 @@ class Book(models.Model):
         return new_slug
 
     def append(self, other):
+        """Add all chunks of another book to self."""
         number = self[len(self) - 1].number + 1
         single = len(other) == 1
         for chunk in other:
@@ -155,6 +194,10 @@ class Book(models.Model):
             number += 1
         other.delete()
 
+    def add(self, *args, **kwargs):
+        """Add a new chunk at the end.
+
+        Takes the first chunk of the reversed chunk set and forwards all
+        arguments to its `split`, returning whatever `split` returns.
+        """
+        # presumably the highest-numbered chunk; this depends on the
+        # default ordering of the chunk set, which is not visible here
+        last_chunk = self.chunk_set.reverse()[0]
+        return last_chunk.split(*args, **kwargs)
+
     @staticmethod
     def listener_create(sender, instance, created, **kwargs):
         if created:
@@ -196,12 +239,22 @@ class Chunk(dvcs_models.Document):
             title += " (%d/%d)" % (self.number, book_length)
         return title
 
-    def split(self, slug, comment='', creator=None):
+    def split(self, slug, comment='', creator=None, adjust_slug=False):
         """ Create an empty chunk after this one """
         self.book.chunk_set.filter(number__gt=self.number).update(
                 number=models.F('number')+1)
-        new_chunk = self.book.chunk_set.create(number=self.number+1,
-                creator=creator, slug=slug, comment=comment)
+        # With adjust_slug set, an IntegrityError on create is answered by
+        # retrying with "<slug>_1", "<slug>_2", ...; without it the error
+        # propagates to the caller.
+        # NOTE(review): every IntegrityError is treated as a slug clash; one
+        # with another cause would be retried forever -- confirm which
+        # constraints can fire here.
+        # NOTE(review): some databases refuse further queries in a
+        # transaction after an IntegrityError; retrying inside a managed
+        # transaction may need a savepoint -- confirm.
+        tries = 1
+        new_slug = slug
+        new_chunk = None
+        # Test for None rather than truthiness, so that the loop ends as
+        # soon as a chunk was created, however the instance evaluates as
+        # a boolean.
+        while new_chunk is None:
+            try:
+                new_chunk = self.book.chunk_set.create(number=self.number+1,
+                    creator=creator, slug=new_slug, comment=comment)
+            except IntegrityError:
+                if not adjust_slug:
+                    raise
+                new_slug = "%s_%d" % (slug, tries)
+                tries += 1
         return new_chunk
 
     @staticmethod
index 928e57b..522806b 100755 (executable)
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+from copy import deepcopy
 from functools import wraps
 import re
 
@@ -139,10 +141,11 @@ def compile_text(parts):
     for next_text in parts:
         if not next_text:
             continue
-        # trim the end, because there's more non-empty text
-        # don't trim beginning, if `text' is the first non-empty part
-        texts.append(_trim(text, trim_begin=trim_begin))
-        trim_begin = True
+        if text:
+            # trim the end, because there's more non-empty text
+            # don't trim beginning, if `text' is the first non-empty part
+            texts.append(_trim(text, trim_begin=trim_begin))
+            trim_begin = True
         text = next_text
     # don't trim the end, because there's no more text coming after `text'
     # only trim beginning if it's not still the first non-empty
@@ -156,7 +159,7 @@ def change_master(text, master):
     """
     e = etree.fromstring(text)
     e[-1].tag = master
-    return etree.tostring(e, encoding="utf-8")
+    return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
 
 
 def basic_structure(text, master):
@@ -167,7 +170,7 @@ def basic_structure(text, master):
 </utwor>''' % (TRIM_BEGIN, TRIM_END))
     e[0].tag = master
     e[0][0].tail = "\n"*3 + text + "\n"*3
-    return etree.tostring(e, encoding="utf-8")
+    return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
 
 
 def add_trim_begin(text):
@@ -182,7 +185,7 @@ def add_trim_begin(text):
     master.insert(0, trim_tag)
     trim_tag.tail = '\n\n\n' + (master.text or '')
     master.text = '\n'
-    return etree.tostring(e, encoding="utf-8")
+    return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
 
 
 def add_trim_end(text):
@@ -201,4 +204,63 @@ def add_trim_end(text):
         prev.tail = (prev.tail or '') + '\n\n\n'
     else:
         master.text = (master.text or '') + '\n\n\n'
-    return etree.tostring(e, encoding="utf-8")
+    return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
+
+
+def split_xml(text):
+    """Split an XML document into one document per chapter.
+
+    The document is cut at every `naglowek_rozdzial` element.  Returns a
+    list of [title, xml] pairs in document order, both items unicode.  The
+    first pair is titled u'początek' and holds everything before the first
+    chapter heading (the whole document if there is none).  Each following
+    pair holds one chapter, titled with the text of its heading.
+
+    Every part keeps the chain of ancestor elements, so each is a complete
+    document.  All parts but the first go through `add_trim_begin` and
+    all but the last through `add_trim_end`.
+
+    All this stuff really must go somewhere else.
+
+    """
+    src = etree.fromstring(text)
+    chunks = []
+
+    splitter = './/naglowek_rozdzial'
+    parts = src.findall(splitter)
+    while parts:
+        # copy the document
+        copied = deepcopy(src)
+
+        element = parts[-1]
+
+        # find the chapter's title: the heading's text, with the content
+        # of these nested tags blanked so it stays out of the title
+        name_elem = deepcopy(element)
+        for tag in 'extra', 'motyw', 'pa', 'pe', 'pr', 'pt', 'uwaga':
+            for a in name_elem.findall('.//' + tag):
+                a.text = ''
+                del a[:]
+        # with_tail=False: the text following the heading belongs to the
+        # chapter body, not to its title.
+        name = unicode(etree.tostring(name_elem, method='text',
+            encoding='utf-8', with_tail=False), 'utf-8')
+        # Collapse whitespace, so that an empty heading gives an empty
+        # title instead of a whitespace-only one.
+        name = u' '.join(name.split())
+
+        # in the original, remove everything from the start of the last chapter
+        parent = element.getparent()
+        del parent[parent.index(element):]
+        element, parent = parent, parent.getparent()
+        while parent is not None:
+            del parent[parent.index(element) + 1:]
+            element, parent = parent, parent.getparent()
+
+        # in the copy, remove everything before the last chapter
+        element = copied.findall(splitter)[-1]
+        parent = element.getparent()
+        while parent is not None:
+            parent.text = None
+            while parent[0] is not element:
+                del parent[0]
+            element, parent = parent, parent.getparent()
+        chunks.insert(0, [name,
+            unicode(etree.tostring(copied, encoding='utf-8'), 'utf-8')
+            ])
+
+        # the source has just lost its last chapter; look for the next one
+        parts = src.findall(splitter)
+
+    chunks.insert(0, [u'początek',
+        unicode(etree.tostring(src, encoding='utf-8'), 'utf-8')
+        ])
+
+    for ch in chunks[1:]:
+        ch[1] = add_trim_begin(ch[1])
+    for ch in chunks[:-1]:
+        ch[1] = add_trim_end(ch[1])
+
+    return chunks