X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/fefdce4e24f9e397df5538fe6e7f54b5ece4d841..28532fa3b437bb36b9d5c582851d3cdcf8d772ab:/src/librarian/pdf.py?ds=inline diff --git a/src/librarian/pdf.py b/src/librarian/pdf.py index e6d897d..b32395f 100644 --- a/src/librarian/pdf.py +++ b/src/librarian/pdf.py @@ -20,6 +20,7 @@ from copy import deepcopy from subprocess import call, PIPE from itertools import chain +from PIL import Image from Texml.processor import process from lxml import etree from lxml.etree import XMLSyntaxError, XSLTApplyError @@ -54,7 +55,9 @@ STYLESHEETS = { def insert_tags(doc, split_re, tagname, exclude=None): - """ inserts for every occurence of `split_re' in text nodes in the `doc' tree + """ + Inserts for every occurence of `split_re' + in text nodes in the `doc' tree. >>> t = etree.fromstring('A-B-CX-Y-Z') >>> insert_tags(t, re.compile('-'), 'd') @@ -84,19 +87,21 @@ def insert_tags(doc, split_re, tagname, exclude=None): def substitute_hyphens(doc): - insert_tags(doc, - re.compile("(?<=[^-\s])-(?=[^-\s])"), - "dywiz", - exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"] - ) + insert_tags( + doc, + re.compile(r"(?<=[^-\s])-(?=[^-\s])"), + "dywiz", + exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"] + ) def fix_hanging(doc): - insert_tags(doc, - re.compile("(?<=\s\w)\s+"), - "nbsp", - exclude=[DCNS("identifier.url"), DCNS("rights.license")] - ) + insert_tags( + doc, + re.compile(r"(?<=\s\w)\s+"), + "nbsp", + exclude=[DCNS("identifier.url"), DCNS("rights.license")] + ) def fix_tables(doc): @@ -112,25 +117,37 @@ def fix_tables(doc): def mark_subauthors(doc): - root_author = ', '.join(elem.text for elem in doc.findall('./' + RDFNS('RDF') + '//' + DCNS('creator_parsed'))) + root_author = ', '.join( + elem.text + for elem in doc.findall( + './' + RDFNS('RDF') + '//' + DCNS('creator_parsed') + ) + ) last_author = None # jeśli autor jest inny niż autor całości i niż poprzedni autor # to wstawiamy jakiś znacznik w rdf? for subutwor in doc.xpath('/utwor/utwor'): - author = ', '.join(elem.text for elem in subutwor.findall('.//' + DCNS('creator_parsed'))) + author = ', '.join( + elem.text + for elem in subutwor.findall('.//' + DCNS('creator_parsed')) + ) if author not in (last_author, root_author): - subutwor.find('.//' + RDFNS('RDF')).append(etree.Element('use_subauthor')) + subutwor.find('.//' + RDFNS('RDF')).append( + etree.Element('use_subauthor') + ) last_author = author def move_motifs_inside(doc): """ moves motifs to be into block elements """ for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|' - '//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'): + '//dramat_wierszowany_l|//dramat_wierszowany_lp|' + '//dramat_wspolczesny'): for motif in master.xpath('motyw'): for sib in motif.itersiblings(): - if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', - 'begin', 'end', 'motyw', 'extra', 'uwaga'): + if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', + 'separator_linia', 'begin', 'end', + 'motyw', 'extra', 'uwaga'): # motif shouldn't have a tail - it would be untagged text motif.tail = None motif.getparent().remove(motif) @@ -139,18 +156,21 @@ def move_motifs_inside(doc): def hack_motifs(doc): - """ dirty hack for the marginpar-creates-orphans LaTeX problem + """ + Dirty hack for the marginpar-creates-orphans LaTeX problem see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304 - moves motifs in stanzas from first verse to second - and from next to last to last, then inserts negative vspace before them + Moves motifs in stanzas from first verse to second and from next + to last to last, then inserts negative vspace before them. """ for motif in doc.findall('//strofa//motyw'): # find relevant verse-level tag verse, stanza = motif, motif.getparent() while stanza is not None and stanza.tag != 'strofa': verse, stanza = stanza, stanza.getparent() - breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True)) + breaks_before = sum( + 1 for i in verse.itersiblings('br', preceding=True) + ) breaks_after = sum(1 for i in verse.itersiblings('br')) if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1: move_by = 1 @@ -176,8 +196,11 @@ def parse_creator(doc): Finds all dc:creator and dc.contributor.translator tags and adds *_parsed versions with forenames first. """ - for person in doc.xpath("|".join('//dc:' + tag for tag in ('creator', 'contributor.translator')), - namespaces={'dc': str(DCNS)})[::-1]: + for person in doc.xpath( + "|".join('//dc:' + tag for tag in ( + 'creator', 'contributor.translator' + )), + namespaces={'dc': str(DCNS)})[::-1]: if not person.text: continue p = Person.from_text(person.text) @@ -193,7 +216,10 @@ def get_stylesheet(name): def package_available(package, args='', verbose=False): - """ check if a verion of a latex package accepting given args is available """ + """ + Check if a verion of a latex package accepting given args + is available. + """ tempdir = mkdtemp('-wl2pdf-test') fpath = os.path.join(tempdir, 'test.tex') f = open(fpath, 'w') @@ -207,13 +233,18 @@ def package_available(package, args='', verbose=False): if verbose: p = call(['xelatex', '-output-directory', tempdir, fpath]) else: - p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE) + p = call( + ['xelatex', '-interaction=batchmode', '-output-directory', + tempdir, fpath], + stdout=PIPE, stderr=PIPE + ) shutil.rmtree(tempdir) return p == 0 def transform(wldoc, verbose=False, save_tex=None, morefloats=None, - cover=None, flags=None, customizations=None, ilustr_path='', latex_dir=False): + cover=None, flags=None, customizations=None, base_url='file://./', + latex_dir=False): """ produces a PDF file with XeLaTeX wldoc: a WLDocument @@ -222,7 +253,8 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None, morefloats (old/new/none): force specific morefloats cover: a cover.Cover factory or True for default flags: less-advertising, - customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class) + customizations: user requested customizations regarding various + formatting parameters (passed to wl LaTeX class) """ # Parse XSLT @@ -252,6 +284,14 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None, elif package_available('morefloats', 'maxfloats=19'): root.set('morefloats', 'new') + if customizations is None: + customizations = [] + else: + customizations = list(customizations) + + if book_info.endnotes: + customizations.append('endnotes') + # add customizations if customizations is not None: root.set('customizations', u','.join(customizations)) @@ -274,6 +314,7 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None, fix_hanging(document.edoc) fix_tables(document.edoc) mark_subauthors(document.edoc) + document.fix_pa_akap() # wl -> TeXML style_filename = get_stylesheet("wl2tex") @@ -283,8 +324,30 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None, # TeXML -> LaTeX temp = mkdtemp('-wl2pdf') - for ilustr in document.edoc.findall("//ilustr"): - shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp) + for i, ilustr in enumerate(document.edoc.findall('//ilustr')): + url = six.moves.urllib.parse.urljoin( + base_url, + ilustr.get('src') + ) + imgfile = six.moves.urllib.request.urlopen(url) + img = Image.open(imgfile) + + th_format, ext, media_type = { + 'GIF': ('GIF', 'gif', 'image/gif'), + 'PNG': ('PNG', 'png', 'image/png'), + }.get(img.format, ('JPEG', 'jpg', 'image/jpeg')) + + width = 2400 + if img.size[0] < width: + th = img + else: + th = img.resize((width, round(width * img.size[1] / img.size[0]))) + + file_name = 'image%d.%s' % (i, ext) + th.save(os.path.join(temp, file_name)) + ilustr.set('src', file_name) + + imgfile.close() for sponsor in book_info.sponsors: ins = etree.Element("data-sponsor", name=sponsor) @@ -294,7 +357,7 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None, shutil.copy(logo, os.path.join(temp, fname)) ins.set('src', fname) root.insert(0, ins) - + if book_info.sponsor_note: root.set("sponsor-note", book_info.sponsor_note) @@ -334,14 +397,18 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None, if verbose: p = call(['xelatex', tex_path]) else: - p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE) + p = call( + ['xelatex', '-interaction=batchmode', tex_path], + stdout=PIPE, stderr=PIPE + ) if p: raise ParseError("Error parsing .tex file") if cwd is not None: os.chdir(cwd) - output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False) + output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', + delete=False) pdf_path = os.path.join(temp, 'doc.pdf') shutil.move(pdf_path, output_file.name) shutil.rmtree(temp) @@ -353,23 +420,29 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None, def load_including_children(wldoc=None, provider=None, uri=None): """ Makes one big xml file with children inserted at end. - + Either wldoc or provider and URI must be provided. """ if uri and provider: - f = provider.by_uri(uri) + f = provider.by_slug(uri.slug) text = f.read().decode('utf-8') f.close() elif wldoc is not None: text = etree.tostring(wldoc.edoc, encoding='unicode') provider = wldoc.provider else: - raise ValueError('Neither a WLDocument, nor provider and URI were provided.') + raise ValueError( + 'Neither a WLDocument, nor provider and URI were provided.' + ) + # Cyrrilic text = re.sub(r"([\u0400-\u04ff]+)", r"\1", text) + # Geometric shapes. + text = re.sub(r"([\u25a0-\u25ff]+)", r"\1", text) - document = WLDocument.from_bytes(text.encode('utf-8'), parse_dublincore=True, provider=provider) + document = WLDocument.from_bytes(text.encode('utf-8'), + parse_dublincore=True, provider=provider) document.swap_endlines() for child_uri in document.book_info.parts: