fnp
/
librarian.git
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
| inline |
side by side
(from parent 1:
8495b2c
)
Geometric shapes in PDF.
author
Radek Czajka
<rczajka@rczajka.pl>
Thu, 12 May 2022 07:49:49 +0000
(09:49 +0200)
committer
Radek Czajka
<rczajka@rczajka.pl>
Thu, 12 May 2022 07:49:49 +0000
(09:49 +0200)
src/librarian/pdf.py
patch
|
blob
|
history
diff --git
a/src/librarian/pdf.py
b/src/librarian/pdf.py
index
31dfe1e
..
7b93997
100644
(file)
--- a/
src/librarian/pdf.py
+++ b/
src/librarian/pdf.py
@@
-436,7
+436,10
@@
def load_including_children(wldoc=None, provider=None, uri=None):
'Neither a WLDocument, nor provider and URI were provided.'
)
+ # Cyrillic
text = re.sub(r"([\u0400-\u04ff]+)", r"<alien>\1</alien>", text)
+ # Geometric shapes.
+ text = re.sub(r"([\u25a0-\u25ff]+)", r"<alien>\1</alien>", text)
document = WLDocument.from_bytes(text.encode('utf-8'),
parse_dublincore=True, provider=provider)