# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
"""XPath extension functions for librarian stylesheets (librarian/functions.py).

The helpers are plain module-level functions so they can be imported and
tested directly.  Each ``reg_*`` function registers one of them with lxml
under the ``http://wolnelektury.pl/functions`` namespace, using the helper's
own name as the XPath function name.
"""
import re


FUNCTIONS_NS = 'http://wolnelektury.pl/functions'

# Order matters: longer sequences must be replaced before their prefixes
# (``---`` before ``--``).
ENTITY_SUBSTITUTIONS = (
    (u'---', u'—'),
    (u'--', u'–'),
    (u'...', u'…'),
    (u',,', u'„'),
    (u'"', u'”'),
)


def _register_function(f):
    """Register ``f`` with lxml as an XPath extension function named ``f.__name__``."""
    # Imported here rather than at module level so that the pure text
    # helpers below remain importable when lxml is not installed.
    from lxml import etree

    ns = etree.FunctionNamespace(FUNCTIONS_NS)
    ns[f.__name__] = f


def _as_text(text):
    """Return ``text`` as one string; a list of strings is concatenated."""
    if isinstance(text, list):
        return ''.join(text)
    return text


def substitute_entities(context, text):
    """XPath extension function converting all entities in the passed text.

    ``context`` is the XPath evaluation context supplied by lxml (unused).
    ``text`` is a string or a list of strings.  Returns the text with every
    sequence listed in ``ENTITY_SUBSTITUTIONS`` replaced.
    """
    text = _as_text(text)
    for entity, substitution in ENTITY_SUBSTITUTIONS:
        text = text.replace(entity, substitution)
    return text


def strip(context, text):
    """XPath extension function normalizing whitespace.

    Collapses every run of whitespace into a single space and removes
    whitespace from the beginning and the end of the text.
    """
    return re.sub(r'\s+', ' ', _as_text(text)).strip()


def wrap_words(context, text, wrapping):
    """XPath extension function wrapping words in the passed text.

    ``wrapping`` is the maximum line length; a false value (``0``) disables
    wrapping and returns the text unchanged.  Words are never broken, so a
    single word longer than ``wrapping`` occupies a line of its own.
    Returns the wrapped lines joined with ``'\\n'``.
    """
    text = _as_text(text)
    if not wrapping:
        return text

    lines = [[]]
    line_length = 0  # length of ' '.join(lines[-1])
    for word in re.split(r'\s', text):
        current = lines[-1]
        if current:
            needed = line_length + 1 + len(word)  # +1 for the joining space
        else:
            needed = len(word)
        # Only break when the line already holds something; otherwise an
        # over-long first word would leave an empty line behind it.
        if current and needed > wrapping:
            lines.append([word])
            line_length = len(word)
        else:
            current.append(word)
            line_length = needed
    return '\n'.join(' '.join(line) for line in lines)


def reg_substitute_entities():
    """Register ``substitute_entities`` with lxml."""
    _register_function(substitute_entities)


def reg_strip():
    """Register ``strip`` with lxml."""
    _register_function(strip)


def reg_wrap_words():
    """Register ``wrap_words`` with lxml."""
    _register_function(wrap_words)
from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True, preserve_lines=True): # first, prepare for parsing if isinstance(xmlfile, basestring): @@ -58,7 +58,10 @@ class WLDocument(object): data = data.decode('utf-8') if swap_endlines: - data = cls.LINE_SWAP_EXPR.sub(u'
\n', data) + sub = u'
# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
"""Initial, incomplete PDF support (librarian/pdf.py).

A document is transformed with the ``wl2tex`` XSLT stylesheet, the result is
passed through Texml to obtain a ``.tex`` file, and ``pdflatex`` is run on it.
"""
import os
import os.path
import re
import shutil
import subprocess
from StringIO import StringIO
from tempfile import mkdtemp

from Texml.processor import process
from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError

from librarian.parser import WLDocument
from librarian import ParseError
from librarian import functions

functions.reg_substitute_entities()

STYLESHEETS = {
    'wl2tex': 'xslt/wl2tex.xslt',
}

# A hyphen with a non-hyphen, non-space character on both sides.
_HYPHEN_RE = re.compile(r"(?<=[^-\s])-(?=[^-\s])")
# Whitespace following a one-character word (whitespace + one word character).
_HANGING_RE = re.compile(r"(?<=\s\w)\s+")


def insert_tags(doc, split_re, tagname):
    """Replace every match of ``split_re`` in text content with an empty element.

    ``doc`` is an lxml tree, ``split_re`` a compiled regular expression and
    ``tagname`` the tag of the elements to insert.  Both ``text`` and ``tail``
    of every node are split on the pattern; the matched separators are
    dropped and a ``<tagname/>`` element is placed between the pieces.
    The tree is modified in place.
    """
    # Snapshot the traversal first: the loop inserts new nodes into the tree.
    for elem in list(doc.iter()):
        if elem.text:
            chunks = split_re.split(elem.text)
            elem.text = chunks.pop(0)
            # Insert last chunk first, always at position 0, so the new
            # elements end up in document order before existing children.
            while chunks:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
        if elem.tail:
            parent = elem.getparent()
            if parent is None:
                # Nothing to insert into; leave a parentless tail untouched.
                continue
            chunks = split_re.split(elem.tail)
            ins_index = parent.index(elem) + 1
            elem.tail = chunks.pop(0)
            # Same back-to-front scheme as above.  Popping from the front
            # while inserting at a fixed index reversed the chunk order.
            while chunks:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)


def substitute_hyphens(doc):
    """Replace intra-word hyphens in ``doc`` with ``<dywiz/>`` elements."""
    insert_tags(doc, _HYPHEN_RE, "dywiz")


def fix_hanging(doc):
    """Replace whitespace after one-character words with ``<nbsp/>`` elements."""
    insert_tags(doc, _HANGING_RE, "nbsp")


def get_stylesheet(name):
    """Return the path of the stylesheet registered under ``name`` in STYLESHEETS."""
    return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])


def _run_pdflatex(output_dir, tex_path):
    """Run pdflatex on ``tex_path``, writing its output files to ``output_dir``.

    Raises ParseError when pdflatex exits with a non-zero status or cannot
    be started at all.
    """
    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed through verbatim.
    command = ['pdflatex', '-output-directory', output_dir, tex_path]
    try:
        status = subprocess.call(command)
    except OSError:
        # pdflatex could not be executed; report it like any other failure.
        status = -1
    if status:
        raise ParseError("Error parsing .tex file")


def transform(provider, slug, output_file=None, output_dir=None):
    """Produce a PDF file for the book ``slug``.

    provider is a DocProvider.
    Either output_file (a file-like object) or output_dir (path to a
    directory) should be specified.  If output_dir is specified, the file is
    written to <output_dir>/<author>/<slug>.pdf; otherwise the PDF bytes are
    written to output_file, which is then closed.

    Returns True on success.  Raises ValueError when no output target is
    given and ParseError when the XML/XSLT processing or pdflatex fails.
    """
    if output_file is None and output_dir is None:
        # Fail before doing the expensive work rather than after pdflatex.
        raise ValueError('Neither output_file nor output_dir provided.')

    try:
        # Parse XSLT
        style_filename = get_stylesheet("wl2tex")
        style = etree.parse(style_filename)

        document = load_including_children(provider, slug)

        substitute_hyphens(document.edoc)
        fix_hanging(document.edoc)

        # if output to dir, the file goes into a per-author subdirectory
        if output_dir is not None:
            author = unicode(document.book_info.author)
            output_dir = os.path.join(output_dir, author)

        texml = document.transform(style)
        del document  # no longer needed large object :)

        # 'wl2pdf-' is meant as a prefix; mkdtemp's first positional
        # argument is the suffix.
        temp = mkdtemp(prefix='wl2pdf-')
        try:
            tex_path = os.path.join(temp, 'doc.tex')
            with open(tex_path, 'w') as fout:
                process(StringIO(texml), fout, 'utf8', 255, 0, 0)
            del texml

            _run_pdflatex(temp, tex_path)

            pdf_path = os.path.join(temp, 'doc.pdf')
            if output_dir is not None:
                try:
                    os.makedirs(output_dir)
                except OSError:
                    # An already existing directory is fine; anything else
                    # (permissions, a file in the way) is a real error.
                    if not os.path.isdir(output_dir):
                        raise
                output_path = os.path.join(output_dir, '%s.pdf' % slug)
                shutil.move(pdf_path, output_path)
            else:
                # PDF is binary data.
                with open(pdf_path, 'rb') as f:
                    output_file.write(f.read())
                output_file.close()
        finally:
            # Remove the .tex/.aux/.log leftovers (and the PDF if not moved).
            shutil.rmtree(temp, ignore_errors=True)

        return True
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)


def load_including_children(provider, slug=None, uri=None):
    """Load a document and append all its child documents to it.

    Either slug or uri must be provided; uri takes precedence when both are
    given.  The document is fetched from ``provider`` (``provider.by_uri(uri)``
    or ``provider[slug]``).  Every part listed in ``book_info.parts`` is
    loaded recursively and its root element is appended to the end of this
    document's root.

    Returns the resulting WLDocument.  Raises ValueError when neither slug
    nor uri is given.
    """
    if uri:
        f = provider.by_uri(uri)
    elif slug:
        f = provider[slug]
    else:
        raise ValueError('Neither slug nor URI provided for a book.')

    document = WLDocument.from_file(f, True,
                                    parse_dublincore=True,
                                    preserve_lines=False)

    for child_uri in document.book_info.parts:
        child = load_including_children(provider, uri=child_uri)
        document.edoc.getroot().append(child.edoc.getroot())

    return document


if __name__ == '__main__':
    import sys
    from librarian import DirDocProvider

    if len(sys.argv) < 2:
        sys.stderr.write('Usage: python pdf.py <input file>\n')
        sys.exit(1)

    # The directory of the input file serves both as the document provider
    # root and as the output directory; the file's base name is the slug.
    main_input = sys.argv[1]
    basepath, ext = os.path.splitext(main_input)
    path, slug = os.path.split(os.path.realpath(basepath))
    provider = DirDocProvider(path)
    transform(provider, slug, output_dir=path)
@@ -35,51 +30,6 @@ Wersja lektury w opracowaniu merytorycznym i krytycznym (przypisy i motywy) dost %(text)s """ - -def strip(context, text): - """Remove unneeded whitespace from beginning and end""" - if isinstance(text, list): - text = ''.join(text) - return re.sub(r'\s+', ' ', text).strip() - - -def substitute_entities(context, text): - """XPath extension function converting all entites in passed text.""" - if isinstance(text, list): - text = ''.join(text) - for entity, substitutution in ENTITY_SUBSTITUTIONS: - text = text.replace(entity, substitutution) - return text - - -def wrap_words(context, text, wrapping): - """XPath extension function automatically wrapping words in passed text""" - if isinstance(text, list): - text = ''.join(text) - if not wrapping: - return text - - words = re.split(r'\s', text) - - line_length = 0 - lines = [[]] - for word in words: - line_length += len(word) + 1 - if line_length > wrapping: - # Max line length was exceeded. We create new line - lines.append([]) - line_length = len(word) - lines[-1].append(word) - return '\n'.join(' '.join(line) for line in lines) - - -# Register substitute_entities function with lxml -ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') -ns['strip'] = strip -ns['substitute_entities'] = substitute_entities -ns['wrap_words'] = wrap_words - - def transform(input_filename, output_filename, is_file=True, parse_dublincore=True, **options): """Transforms file input_filename in XML to output_filename in TXT.""" # Parse XSLT diff --git a/librarian/xslt/wl2tex.xslt b/librarian/xslt/wl2tex.xslt new file mode 100644 index 0000000..74292a7 --- /dev/null +++ b/librarian/xslt/wl2tex.xslt @@ -0,0 +1,407 @@ + + + + + + + + + + + \documentclass[a4paper, oneside, 11pt]{book} + \usepackage[MeX]{polski} + \usepackage[utf8]{inputenc} + \pagestyle{plain} + \usepackage{antpolt} + \usepackage[bottom]{footmisc} + + \usepackage{color} + \definecolor{theme-gray}{gray}{.3} + + + \setlength{\marginparsep}{2em} + 
\setlength{\marginparwidth}{8.5em} + \setlength{\oddsidemargin}{0pt} + \clubpenalty=10000 + \widowpenalty=10000 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1em + + + + + + + + pt + + + + em + + + 1em + + + + + 12em + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + „” + + + + + + + + + + + + + + + + + + + 30pt + + + + + * + + + + + + + + + + + + + + + + + + + + 0pt + + theme-gray + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file