first take on The Great Import

author Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>

Wed, 21 Sep 2011 14:51:47 +0000 (16:51 +0200)

committer Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>

Wed, 21 Sep 2011 14:51:47 +0000 (16:51 +0200)
author Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
Wed, 21 Sep 2011 14:51:47 +0000 (16:51 +0200)
committer Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
Wed, 21 Sep 2011 14:51:47 +0000 (16:51 +0200)
diff --git a/apps/catalogue/management/commands/import_wl.py b/apps/catalogue/management/commands/import_wl.py

new file mode 100755 (executable)

index 0000000..6836d36
--- /dev/null
+++ b/apps/catalogue/management/commands/import_wl.py
@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+
+import json
+from optparse import make_option
+import urllib2
+
+from django.core.management.base import BaseCommand
+from django.core.management.color import color_style
+from django.db import transaction
+from librarian.dcparser import BookInfo
+from librarian import ParseError, ValidationError
+
+from catalogue.models import Book
+
+
+WL_API = 'http://www.wolnelektury.pl/api/books/'
+
+
+class Command(BaseCommand):
+    """Management command importing the books listed by the WL API."""
+    option_list = BaseCommand.option_list + (
+        make_option('-q', '--quiet', action='store_false', dest='verbose', default=True,
+            help='Less output'),
+    )
+    help = 'Imports XML files from WL.'
+
+    def handle(self, *args, **options):
+        """Import every book listed at WL_API inside a single transaction.
+
+        Books are matched to existing ones by BookInfo slug; unmatched ones
+        are created.  Any error rolls the import back and is re-raised.
+        """
+        self.style = color_style()
+        verbose = options.get('verbose')
+
+        # Start transaction management.
+        transaction.commit_unless_managed()
+        transaction.enter_transaction_management()
+        transaction.managed(True)
+        try:
+            if verbose:
+                print 'Reading currently managed files.'
+            slugs = {}
+            for b in Book.objects.all():
+                if verbose:
+                    print b.slug
+                text = b.materialize().encode('utf-8')
+                try:
+                    info = BookInfo.from_string(text)
+                except (ParseError, ValidationError):
+                    pass  # no usable metadata: this book is never matched
+                else:
+                    slugs[info.slug] = b
+
+            book_count = 0
+            if verbose:
+                print 'Opening books list'
+            for book in json.load(urllib2.urlopen(WL_API)):
+                book_detail = json.load(urllib2.urlopen(book['href']))
+                xml_text = urllib2.urlopen(book_detail['xml']).read()
+                info = BookInfo.from_string(xml_text)
+                previous_book = slugs.get(info.slug, None)
+                if previous_book:
+                    comm = previous_book.slug
+                else:
+                    comm = '*'
+                print book_count, info.slug , '-->', comm
+                Book.import_xml_text(xml_text, title=info.title,
+                    slug=info.slug, previous_book=previous_book)
+                book_count += 1
+        except BaseException:
+            # Undo the partial import and close the transaction, then re-raise.
+            transaction.rollback()
+            transaction.leave_transaction_management()
+            raise
+        # Print results
+        print
+        print "Results:"
+        print "Imported %d books from WL:" % (book_count, )
+        print
+        transaction.commit()
+        transaction.leave_transaction_management()
+
diff --git a/apps/catalogue/models.py b/apps/catalogue/models.py

index f968654..ebeb9ae 100644 (file)
--- a/apps/catalogue/models.py
+++ b/apps/catalogue/models.py
@@ -7,9 +7,12 @@ from django.contrib.auth.models import User
  from django.core.urlresolvers import reverse
  from django.db import models
  from django.utils.translation import ugettext_lazy as _
+from django.db.utils import IntegrityError
+
+from slughifi import slughifi
  
  from dvcs import models as dvcs_models
-from catalogue.xml_tools import compile_text
+from catalogue.xml_tools import compile_text, split_xml
  
  import logging
  logger = logging.getLogger("fnp.catalogue")
@@ -40,6 +43,41 @@ class Book(models.Model):
      def get_absolute_url(self):
          return reverse("catalogue_book", args=[self.slug])
  
+    @classmethod
+    def import_xml_text(cls, text=u'', creator=None, previous_book=None,
+                *args, **kwargs):
+        """Store `text`, split into chapters by split_xml, as a book's chunks.
+
+        When `previous_book` is given its existing chunks are reused in order;
+        otherwise a new book is created from *args/**kwargs.  Returns the book.
+        """
+        texts = split_xml(text)
+        if previous_book:
+            instance = previous_book
+        else:
+            instance = cls(*args, **kwargs)
+            instance.save()
+
+        # if there are more parts, set the rest to empty strings
+        book_len = len(instance)
+        for i in range(book_len - len(texts)):
+            # list.append() takes a single item, so the pair must be one list
+            texts.append([u'pusta część %d' % (i + 1), u''])
+
+        for i, (title, text) in enumerate(texts):
+            if not title:
+                title = u'część %d' % (i + 1)
+            slug = slughifi(title)
+            if i < book_len:
+                chunk = instance[i]
+                chunk.slug = slug
+                chunk.comment = title
+                chunk.save()
+            else:
+                chunk = instance.add(slug, title, creator, adjust_slug=True)
+            chunk.commit(text, author=creator)
+        return instance
+
      @classmethod
      def create(cls, creator=None, text=u'', *args, **kwargs):
          """
@@ -48,7 +86,7 @@ class Book(models.Model):
          """
          instance = cls(*args, **kwargs)
          instance.save()
-        instance[0].commit(author=creator, text=text)
+        instance[0].commit(text, author=creator)
          return instance
  
      def __iter__(self):
@@ -75,7 +113,7 @@ class Book(models.Model):
          if publishable:
              changes = [chunk.publishable() for chunk in self]
          else:
-            changes = [chunk.head for chunk in self]
+            changes = [chunk.head for chunk in self if chunk.head is not None]
          if None in changes:
              raise self.NoTextError('Some chunks have no available text.')
          return changes
@@ -126,6 +164,7 @@ class Book(models.Model):
          return new_slug
  
      def append(self, other):
+        """Add all chunks of another book to self."""
          number = self[len(self) - 1].number + 1
          single = len(other) == 1
          for chunk in other:
@@ -155,6 +194,10 @@ class Book(models.Model):
              number += 1
          other.delete()
  
+    def add(self, *args, **kwargs):
+        """Add a new chunk at the end; arguments are passed on to split()."""
+        return self.chunk_set.reverse()[0].split(*args, **kwargs)
+
      @staticmethod
      def listener_create(sender, instance, created, **kwargs):
          if created:
@@ -196,12 +239,22 @@ class Chunk(dvcs_models.Document):
              title += " (%d/%d)" % (self.number, book_length)
          return title
  
-    def split(self, slug, comment='', creator=None):
+    def split(self, slug, comment='', creator=None, adjust_slug=False):
          """ Create an empty chunk after this one """
          self.book.chunk_set.filter(number__gt=self.number).update(
                  number=models.F('number')+1)
-        new_chunk = self.book.chunk_set.create(number=self.number+1,
-                creator=creator, slug=slug, comment=comment)
+        tries = 1
+        new_slug = slug
+        new_chunk = None
+        while not new_chunk:
+            try:
+                new_chunk = self.book.chunk_set.create(number=self.number+1,
+                    creator=creator, slug=new_slug, comment=comment)
+            except IntegrityError:
+                if not adjust_slug:
+                    raise
+                new_slug = "%s_%d" % (slug, tries)
+                tries += 1
          return new_chunk
  
      @staticmethod
diff --git a/apps/catalogue/xml_tools.py b/apps/catalogue/xml_tools.py

index 928e57b..522806b 100755 (executable)
--- a/apps/catalogue/xml_tools.py
+++ b/apps/catalogue/xml_tools.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+from copy import deepcopy
  from functools import wraps
  import re
  
@@ -139,10 +141,11 @@ def compile_text(parts):
      for next_text in parts:
          if not next_text:
              continue
-        # trim the end, because there's more non-empty text
-        # don't trim beginning, if `text' is the first non-empty part
-        texts.append(_trim(text, trim_begin=trim_begin))
-        trim_begin = True
+        if text:
+            # trim the end, because there's more non-empty text
+            # don't trim beginning, if `text' is the first non-empty part
+            texts.append(_trim(text, trim_begin=trim_begin))
+            trim_begin = True
          text = next_text
      # don't trim the end, because there's no more text coming after `text'
      # only trim beginning if it's not still the first non-empty
@@ -156,7 +159,7 @@ def change_master(text, master):
      """
      e = etree.fromstring(text)
      e[-1].tag = master
-    return etree.tostring(e, encoding="utf-8")
+    return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
  
  
  def basic_structure(text, master):
@@ -167,7 +170,7 @@ def basic_structure(text, master):
  </utwor>''' % (TRIM_BEGIN, TRIM_END))
      e[0].tag = master
      e[0][0].tail = "\n"*3 + text + "\n"*3
-    return etree.tostring(e, encoding="utf-8")
+    return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
  
  
  def add_trim_begin(text):
@@ -182,7 +185,7 @@ def add_trim_begin(text):
      master.insert(0, trim_tag)
      trim_tag.tail = '\n\n\n' + (master.text or '')
      master.text = '\n'
-    return etree.tostring(e, encoding="utf-8")
+    return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
  
  
  def add_trim_end(text):
@@ -201,4 +204,63 @@ def add_trim_end(text):
          prev.tail = (prev.tail or '') + '\n\n\n'
      else:
          master.text = (master.text or '') + '\n\n\n'
-    return etree.tostring(e, encoding="utf-8")
+    return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
+
+
+def split_xml(text):
+    """Split an XML document into [title, xml] chunks at each chapter heading.
+
+    Returns the part before the first `naglowek_rozdzial` (titled u'początek')
+    followed by one chunk per heading, all as unicode; add_trim_begin and
+    add_trim_end are applied at the joins.  TODO: move this somewhere else.
+    """
+    src = etree.fromstring(text)
+    chunks = []
+
+    splitter = u'naglowek_rozdzial'
+    parts = src.findall('.//' + splitter)
+    while parts:
+        # copy the document
+        copied = deepcopy(src)
+        element = parts[-1]
+
+        # find the chapter's title: heading text minus the tags listed below
+        name_elem = deepcopy(element)
+        for tag in 'extra', 'motyw', 'pa', 'pe', 'pr', 'pt', 'uwaga':
+            for a in name_elem.findall('.//' + tag):
+                a.text=''
+                del a[:]
+        # with_tail=False: text following the heading is not part of the title
+        name = unicode(etree.tostring(name_elem, method='text',
+            encoding='utf-8', with_tail=False), 'utf-8').strip()
+
+        # in the original, remove everything from the start of the last chapter
+        parent = element.getparent()
+        del parent[parent.index(element):]
+        element, parent = parent, parent.getparent()
+        while parent is not None:
+            del parent[parent.index(element) + 1:]
+            element, parent = parent, parent.getparent()
+
+        # in the copy, remove everything before the last chapter
+        element = copied.findall('.//' + splitter)[-1]
+        parent = element.getparent()
+        while parent is not None:
+            parent.text = None
+            while parent[0] is not element:
+                del parent[0]
+            element, parent = parent, parent.getparent()
+        chunks[:0] = [[name,
+            unicode(etree.tostring(copied, encoding='utf-8'), 'utf-8')
+            ]]
+        parts = src.findall('.//' + splitter)
+
+    chunks[:0] = [[u'początek',
+        unicode(etree.tostring(src, encoding='utf-8'), 'utf-8')
+        ]]
+
+    for ch in chunks[1:]:
+        ch[1] = add_trim_begin(ch[1])
+    for ch in chunks[:-1]:
+        ch[1] = add_trim_end(ch[1])
+    return chunks
author	Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
	Wed, 21 Sep 2011 14:51:47 +0000 (16:51 +0200)
committer	Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
	Wed, 21 Sep 2011 14:51:47 +0000 (16:51 +0200)
apps/catalogue/management/commands/import_wl.py	[new file with mode: 0755]	patch \| blob
apps/catalogue/models.py		patch \| blob \| history
apps/catalogue/xml_tools.py		patch \| blob \| history