From 12727dd036c3f54c487aaa70b21bf4988135bb7d Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Thu, 12 May 2022 09:49:49 +0200 Subject: [PATCH] Geometric shapes in PDF. --- src/librarian/pdf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/librarian/pdf.py b/src/librarian/pdf.py index 31dfe1e..7b93997 100644 --- a/src/librarian/pdf.py +++ b/src/librarian/pdf.py @@ -436,7 +436,10 @@ def load_including_children(wldoc=None, provider=None, uri=None): 'Neither a WLDocument, nor provider and URI were provided.' ) + # Cyrillic text = re.sub(r"([\u0400-\u04ff]+)", r"\1", text) + # Geometric shapes. + text = re.sub(r"([\u25a0-\u25ff]+)", r"\1", text) document = WLDocument.from_bytes(text.encode('utf-8'), parse_dublincore=True, provider=provider) -- 2.20.1