--- /dev/null
+# -*- coding: utf-8 -*-
+import json
+from optparse import make_option
+import urllib2
+from django.core.management.base import BaseCommand
+from django.core.management.color import color_style
+from django.db import transaction
+from librarian.dcparser import BookInfo
+from librarian import ParseError, ValidationError
+from catalogue.models import Book
+WL_API = 'http://www.wolnelektury.pl/api/books/'
+class Command(BaseCommand):
+ option_list = BaseCommand.option_list + (
+ make_option('-q', '--quiet', action='store_false', dest='verbose', default=True,
+ help='Less output'),
+ )
+ help = 'Imports XML files from WL.'
+ def handle(self, *args, **options):
+ self.style = color_style()
+ verbose = options.get('verbose')
+ # Start transaction management.
+ transaction.commit_unless_managed()
+ transaction.enter_transaction_management()
+ transaction.managed(True)
+ if verbose:
+ print 'Reading currently managed files.'
+ slugs = {}
+ for b in Book.objects.all():
+ if verbose:
+ print b.slug
+ text = b.materialize().encode('utf-8')
+ try:
+ info = BookInfo.from_string(text)
+ except (ParseError, ValidationError):
+ pass
+ else:
+ slugs[info.slug] = b
+ book_count = 0
+ commit_args = {
+ "author_name": 'Platforma',
+ "description": 'Import from WL',
+ }
+ if verbose:
+ print 'Opening books list'
+ for book in json.load(urllib2.urlopen(WL_API)):
+ book_detail = json.load(urllib2.urlopen(book['href']))
+ xml_text = urllib2.urlopen(book_detail['xml']).read()
+ info = BookInfo.from_string(xml_text)
+ previous_book = slugs.get(info.slug, None)
+ if previous_book:
+ comm = previous_book.slug
+ else:
+ comm = '*'
+ print book_count, info.slug , '-->', comm
+ Book.import_xml_text(xml_text, title=info.title,
+ slug=info.slug, previous_book=slugs.get(info.slug, None))
+ book_count += 1
+ # Print results
+ print
+ print "Results:"
+ print "Imported %d books from WL:" % (
+ book_count, )
+ print
+ transaction.commit()
+ transaction.leave_transaction_management()
from django.core.urlresolvers import reverse
from django.db import models
from django.utils.translation import ugettext_lazy as _
+from django.db.utils import IntegrityError
+from slughifi import slughifi
from dvcs import models as dvcs_models
-from catalogue.xml_tools import compile_text
+from catalogue.xml_tools import compile_text, split_xml
import logging
logger = logging.getLogger("fnp.catalogue")
def get_absolute_url(self):
+ """Return the reversed URL of the "catalogue_book" view for this object's slug."""
return reverse("catalogue_book", args=[self.slug])
+ @classmethod
+ def import_xml_text(cls, text=u'', creator=None, previous_book=None,
+ *args, **kwargs):
+ texts = split_xml(text)
+ if previous_book:
+ instance = previous_book
+ else:
+ instance = cls(*args, **kwargs)
+ instance.save()
+ # if there are more parts, set the rest to empty strings
+ book_len = len(instance)
+ for i in range(book_len - len(texts)):
+ texts.append(u'pusta część %d' % (i + 1), u'')
+ i = 0
+ for i, (title, text) in enumerate(texts):
+ if not title:
+ title = u'część %d' % (i + 1)
+ slug = slughifi(title)
+ if i < book_len:
+ chunk = instance[i]
+ chunk.slug = slug
+ chunk.comment = title
+ chunk.save()
+ else:
+ chunk = instance.add(slug, title, creator, adjust_slug=True)
+ chunk.commit(text, author=creator)
+ return instance
def create(cls, creator=None, text=u'', *args, **kwargs):
instance = cls(*args, **kwargs)
- instance[0].commit(author=creator, text=text)
+ instance[0].commit(text, author=creator)
return instance
def __iter__(self):
if publishable:
changes = [chunk.publishable() for chunk in self]
- changes = [chunk.head for chunk in self]
+ changes = [chunk.head for chunk in self if chunk.head is not None]
if None in changes:
raise self.NoTextError('Some chunks have no available text.')
return changes
return new_slug
def append(self, other):
+ """Add all chunks of another book to self."""
number = self[len(self) - 1].number + 1
single = len(other) == 1
for chunk in other:
number += 1
+ def add(self, *args, **kwargs):
+ """Add a new chunk at the end."""
+ return self.chunk_set.reverse()[0].split(*args, **kwargs)
def listener_create(sender, instance, created, **kwargs):
if created:
title += " (%d/%d)" % (self.number, book_length)
return title
- def split(self, slug, comment='', creator=None):
+ def split(self, slug, comment='', creator=None, adjust_slug=False):
""" Create an empty chunk after this one """
- new_chunk = self.book.chunk_set.create(number=self.number+1,
- creator=creator, slug=slug, comment=comment)
+ # With adjust_slug=True an IntegrityError on create is retried with the
+ # slug suffixed "_1", "_2", ...; with adjust_slug=False it propagates.
+ tries = 1
+ new_slug = slug
+ new_chunk = None
+ # NOTE(review): every IntegrityError is retried, not only slug collisions --
+ # presumably a duplicate slug is the only expected cause; confirm, since any
+ # other constraint failure would make this loop spin forever.
+ while not new_chunk:
+ try:
+ new_chunk = self.book.chunk_set.create(number=self.number+1,
+ creator=creator, slug=new_slug, comment=comment)
+ except IntegrityError:
+ if not adjust_slug:
+ raise
+ # the suffix is always appended to the original slug, not the last attempt
+ new_slug = "%s_%d" % (slug, tries)
+ tries += 1
return new_chunk
+# -*- coding: utf-8 -*-
+from copy import deepcopy
from functools import wraps
import re
for next_text in parts:
if not next_text:
- # trim the end, because there's more non-empty text
- # don't trim beginning, if `text' is the first non-empty part
- texts.append(_trim(text, trim_begin=trim_begin))
- trim_begin = True
+ if text:
+ # trim the end, because there's more non-empty text
+ # don't trim beginning, if `text' is the first non-empty part
+ texts.append(_trim(text, trim_begin=trim_begin))
+ trim_begin = True
text = next_text
# don't trim the end, because there's no more text coming after `text'
# only trim beginning if it's not still the first non-empty
e = etree.fromstring(text)
e[-1].tag = master
- return etree.tostring(e, encoding="utf-8")
+ return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
def basic_structure(text, master):
</utwor>''' % (TRIM_BEGIN, TRIM_END))
e[0].tag = master
e[0][0].tail = "\n"*3 + text + "\n"*3
- return etree.tostring(e, encoding="utf-8")
+ return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
def add_trim_begin(text):
master.insert(0, trim_tag)
trim_tag.tail = '\n\n\n' + (master.text or '')
master.text = '\n'
- return etree.tostring(e, encoding="utf-8")
+ return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
def add_trim_end(text):
prev.tail = (prev.tail or '') + '\n\n\n'
master.text = (master.text or '') + '\n\n\n'
- return etree.tostring(e, encoding="utf-8")
+ return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
+def split_xml(text):
+ """Splits text into chapters.
+ All this stuff really must go somewhere else.
+ """
+ src = etree.fromstring(text)
+ chunks = []
+ splitter = u'naglowek_rozdzial'
+ parts = src.findall('.//naglowek_rozdzial')
+ while parts:
+ # copy the document
+ copied = deepcopy(src)
+ element = parts[-1]
+ # find the chapter's title
+ name_elem = deepcopy(element)
+ for tag in 'extra', 'motyw', 'pa', 'pe', 'pr', 'pt', 'uwaga':
+ for a in name_elem.findall('.//' + tag):
+ a.text=''
+ del a[:]
+ name = etree.tostring(name_elem, method='text', encoding='utf-8')
+ # in the original, remove everything from the start of the last chapter
+ parent = element.getparent()
+ del parent[parent.index(element):]
+ element, parent = parent, parent.getparent()
+ while parent is not None:
+ del parent[parent.index(element) + 1:]
+ element, parent = parent, parent.getparent()
+ # in the copy, remove everything before the last chapter
+ element = copied.findall('.//naglowek_rozdzial')[-1]
+ parent = element.getparent()
+ while parent is not None:
+ parent.text = None
+ while parent[0] is not element:
+ del parent[0]
+ element, parent = parent, parent.getparent()
+ chunks[:0] = [[name,
+ unicode(etree.tostring(copied, encoding='utf-8'), 'utf-8')
+ ]]
+ parts = src.findall('.//naglowek_rozdzial')
+ chunks[:0] = [[u'początek',
+ unicode(etree.tostring(src, encoding='utf-8'), 'utf-8')
+ ]]
+ for ch in chunks[1:]:
+ ch[1] = add_trim_begin(ch[1])
+ for ch in chunks[:-1]:
+ ch[1] = add_trim_end(ch[1])
+ return chunks