X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/175c5cf4f727162fa5bddd2460d37595251bbe8e..28532fa3b437bb36b9d5c582851d3cdcf8d772ab:/src/librarian/pdf.py
diff --git a/src/librarian/pdf.py b/src/librarian/pdf.py
index 31dfe1e..b32395f 100644
--- a/src/librarian/pdf.py
+++ b/src/librarian/pdf.py
@@ -425,7 +425,7 @@ def load_including_children(wldoc=None, provider=None, uri=None):
"""
if uri and provider:
- f = provider.by_uri(uri)
+ f = provider.by_slug(uri.slug)
text = f.read().decode('utf-8')
f.close()
elif wldoc is not None:
@@ -436,7 +436,10 @@ def load_including_children(wldoc=None, provider=None, uri=None):
'Neither a WLDocument, nor provider and URI were provided.'
)
+ # Cyrillic
text = re.sub(r"([\u0400-\u04ff]+)", r"\1", text)
+ # Geometric shapes.
+ text = re.sub(r"([\u25a0-\u25ff]+)", r"\1", text)
document = WLDocument.from_bytes(text.encode('utf-8'),
parse_dublincore=True, provider=provider)