4 from django.apps import apps
5 from django.conf import settings
6 from django.db import models
7 from django.urls import reverse
8 from django.utils.timezone import now
9 from django.utils.translation import gettext_lazy as _
10 from . import conversion
11 from . import document
# Django model with a name, private notes, a wikisource reference and two optional timestamps.
15 class Source(models.Model):
# Required text of up to 1024 characters; its verbose name is the translatable string 'name'.
16 name = models.CharField(_('name'), max_length=1024)
# Optional free text; the help text shown for it is the translatable string 'private'.
17 notes = models.TextField(blank=True, help_text=_('private'))
# Optional text of up to 1024 characters.
# NOTE(review): presumably a Wikisource page or identifier — format is not shown here; confirm.
18 wikisource = models.CharField(max_length=1024, blank=True)
# Optional timestamp; elsewhere in this class it is set to now() and saved on its own.
19 modified_at = models.DateTimeField(null=True, blank=True)
# Optional timestamp; elsewhere in this class it is assigned after the view and OCR
# directories have been rebuilt.
20 processed_at = models.DateTimeField(null=True, blank=True)
def get_absolute_url(self):
    """Return the URL obtained by reversing the 'source' route for this object.

    The primary key is passed as the route's only positional argument.
    """
    url = reverse('source', args=[self.pk])
    return url
# NOTE(review): the `def` line that owns these two statements is not visible in this view.
# Set modified_at to the current time, then save with update_fields limited to 'modified_at'.
29 self.modified_at = now()
30 self.save(update_fields=['modified_at'])
def get_upload_directory(self):
    """Return the path fragment 'sources/upload/<pk>/' for this object.

    The fragment has no leading slash; has_upload_files joins it onto
    settings.MEDIA_ROOT to get a filesystem path.
    """
    relative_dir = f'sources/upload/{self.pk}/'
    return relative_dir
def get_view_directory(self):
    """Return the path fragment 'sources/view/<pk>/' for this object.

    The fragment has no leading slash; has_view_files joins it onto
    settings.MEDIA_ROOT to get a filesystem path.
    """
    relative_dir = f'sources/view/{self.pk}/'
    return relative_dir
def get_ocr_directory(self):
    """Return the path fragment 'sources/ocr/<pk>/' for this object.

    The fragment has no leading slash; has_ocr_files joins it onto
    settings.MEDIA_ROOT to get a filesystem path.
    """
    relative_dir = f'sources/ocr/{self.pk}/'
    return relative_dir
def has_upload_files(self):
    """Tell whether this object's upload directory exists and is non-empty.

    Returns:
        bool: True when the directory under settings.MEDIA_ROOT holds at
        least one entry; False when it is empty, missing, or not a
        directory. (Previously the raw directory listing was returned in
        the non-empty case; its truthiness is unchanged.)
    """
    upload_dir = os.path.join(settings.MEDIA_ROOT, self.get_upload_directory())
    try:
        # List directly instead of isdir()-then-listdir(), so a directory
        # that disappears between the two calls cannot raise.
        return bool(os.listdir(upload_dir))
    except (FileNotFoundError, NotADirectoryError):
        return False
# Walks the sorted listing of this object's view directory under settings.MEDIA_ROOT.
# NOTE(review): the embedded numbering jumps from 46 to 49 and stops at 50, so the lines that
# say what is built per file name, and what is returned, are not visible in this view.
45 def get_view_files(self):
# Relative view directory; it is joined onto MEDIA_ROOT for the listing below.
46 d = self.get_view_directory()
49 for name in sorted(os.listdir(
50 os.path.join(settings.MEDIA_ROOT, d)
def has_view_files(self):
    """Tell whether this object's view directory exists and is non-empty.

    Returns:
        bool: True when the directory under settings.MEDIA_ROOT holds at
        least one entry; False when it is empty, missing, or not a
        directory. (Previously the raw directory listing was returned in
        the non-empty case; its truthiness is unchanged.)
    """
    view_dir = os.path.join(settings.MEDIA_ROOT, self.get_view_directory())
    try:
        # List directly instead of isdir()-then-listdir(), so a directory
        # that disappears between the two calls cannot raise.
        return bool(os.listdir(view_dir))
    except (FileNotFoundError, NotADirectoryError):
        return False
# Walks the sorted listing of this object's OCR directory under settings.MEDIA_ROOT.
# NOTE(review): the embedded numbering skips 60-61 and stops at 62, so the lines that say what
# is built per file name, and what is returned, are not visible in this view.
58 def get_ocr_files(self):
# Absolute OCR directory: MEDIA_ROOT joined with the relative OCR path.
59 d = os.path.join(settings.MEDIA_ROOT, self.get_ocr_directory())
62 for name in sorted(os.listdir(d))
def has_ocr_files(self):
    """Tell whether this object's OCR directory exists and is non-empty.

    Returns:
        bool: True when the directory under settings.MEDIA_ROOT holds at
        least one entry; False when it is empty, missing, or not a
        directory. (Previously the raw directory listing was returned in
        the non-empty case; its truthiness is unchanged.)
    """
    ocr_dir = os.path.join(settings.MEDIA_ROOT, self.get_ocr_directory())
    try:
        # List directly instead of isdir()-then-listdir(), so a directory
        # that disappears between the two calls cannot raise.
        return bool(os.listdir(ocr_dir))
    except (FileNotFoundError, NotADirectoryError):
        return False
# NOTE(review): the `def` line of this method and the lines that open the expressions below
# (including where `updir` and `processed_at` are assigned) are not visible in this view.
73 self.get_upload_directory()
# view_dir and ocr_dir are built with os.path.join from the view / OCR directory paths;
# the first argument of each join is on a line missing from this view.
75 view_dir = os.path.join(
77 self.get_view_directory()
79 ocr_dir = os.path.join(
81 self.get_ocr_directory()
# Fill the view directory from the uploads inside the utils.replace_dir context.
# NOTE(review): replace_dir presumably swaps the new directory in place of the old one —
# confirm against utils.
83 with utils.replace_dir(view_dir) as d:
84 self.build_view_directory(updir, d)
# Same procedure for the OCR directory.
85 with utils.replace_dir(ocr_dir) as d:
86 self.build_ocr_directory(updir, d)
# Store the processing timestamp and save with update_fields limited to 'processed_at'.
87 self.processed_at = processed_at
88 self.save(update_fields=['processed_at'])
# For each file in srcpath, calls a conversion helper chosen by the file's extension, passing
# the source file path and targetpath.
# NOTE(review): the embedded numbering skips 97-98, 101, 103 and 105-106, so the conditions
# guarding convert_pdf / convert_djvu and any fallback branch are not visible in this view.
90 def build_view_directory(self, srcpath, targetpath):
91 for source_file_name in os.listdir(srcpath):
# NOTE(review): looks like a leftover debug print — consider logging instead.
92 print(source_file_name)
93 src = os.path.join(srcpath, source_file_name)
# Lower-cased text after the last dot; for a name without a dot this is the whole name.
94 ext = source_file_name.rsplit('.', 1)[-1].lower()
# PNG/JPEG inputs go to resize_image, TIFF inputs to convert_image.
95 if ext in ('png', 'jpg', 'jpeg'):
96 conversion.resize_image(src, targetpath)
99 elif ext in ('tiff', 'tif'):
100 conversion.convert_image(src, targetpath)
102 conversion.convert_pdf(src, targetpath)
104 conversion.convert_djvu(src, targetpath)
# For each file in srcpath, passes the source file path and a same-named path under targetpath
# to a call.
# NOTE(review): the embedded numbering skips 111-112, so the name of the function being called
# is not visible in this view.
108 def build_ocr_directory(self, srcpath, targetpath):
109 for source_file_name in os.listdir(srcpath):
# NOTE(review): looks like a leftover debug print — consider logging instead.
110 print(source_file_name)
113 os.path.join(srcpath, source_file_name),
114 os.path.join(targetpath, source_file_name),
# Django model linking a catalogue Book to a Source, with an ordering value and an optional
# page range.
119 class BookSource(models.Model):
# Both links use models.CASCADE as their on-delete behavior.
120 book = models.ForeignKey('catalogue.Book', models.CASCADE)
121 source = models.ForeignKey(Source, models.CASCADE)
# Integer sort key, 1 by default.
122 ordering = models.IntegerField(default=1)
# Optional page bounds; the slicing in get_view_files / get_ocr_files uses page_start - 1 as
# the start index and page_end as the exclusive end index.
123 page_start = models.IntegerField(null=True, blank=True)
124 page_end = models.IntegerField(null=True, blank=True)
# NOTE(review): presumably belongs to a nested `class Meta:` whose header is not visible in
# this view (default ordering by 'ordering', then 'page_start') — confirm.
127 ordering = ('ordering', 'page_start',)
# NOTE(review): presumably the body of __str__, whose `def` line is not visible in this view.
130 return f'{self.source} -> {self.book}'
def get_absolute_url(self):
    """Return the URL obtained by reversing the 'source_book_prepare' route.

    The primary key is passed as the route's only positional argument.
    """
    url = reverse('source_book_prepare', args=[self.pk])
    return url
# Narrows the source's view files to this object's page range.
# NOTE(review): the embedded numbering skips 138, 140 and 142, so any guards around the two
# slices (page_start / page_end are nullable) and the return statement are not visible here.
135 def get_view_files(self):
136 # TODO: won't work for PDFs.
137 files = self.source.get_view_files()
# Keep at most the first page_end items.
139 files = files[:self.page_end]
# Drop the first page_start - 1 items, i.e. page_start is treated as 1-based.
141 files = files[self.page_start - 1:]
# Narrows the source's OCR files to this object's page range.
# NOTE(review): the embedded numbering skips 147, 149 and 151, so any guards around the two
# slices (page_start / page_end are nullable) and the return statement are not visible here.
144 def get_ocr_files(self):
145 # TODO: won't work for PDFs.
146 files = self.source.get_ocr_files()
# Keep at most the first page_end items.
148 files = files[:self.page_end]
# Drop the first page_start - 1 items, i.e. page_start is treated as 1-based.
150 files = files[self.page_start - 1:]
def get_document(self):
    """Return the result of ``first()`` on this object's book's ``document_books``."""
    document_books = self.book.document_books
    return document_books.first()
# Builds texts for `book` and commits them, with description 'OCR', to a documents Book.
# NOTE(review): takes `cls`, so presumably decorated with @classmethod on a line not visible
# here. The embedded numbering also skips 160, 162, 164-165, 167-168 and 172, so the branch
# conditions and the remaining DBook.create arguments are not visible in this view.
157 def prepare_document(cls, book, user=None):
# Look the model up through the app registry by app label and model name.
158 DBook = apps.get_model('documents', 'Book')
159 texts = document.build_document_texts(book)
# Try the book's first existing document_books entry.
161 dbook = book.document_books.first()
# NOTE(review): presumably runs only when no entry was found — the guard is not visible.
163 dbook = DBook.create(
# A random UUID4 string is used as the slug.
166 slug=str(uuid.uuid4()),
# Each text is committed to dbook[0], with `user` as the author.
169 dbook[0].commit(text=texts[0], description='OCR', author=user)
170 for text in texts[1:]:
171 dbook[0].commit(text=text, description='OCR', author=user)
# Mark the head of dbook[0] as publishable.
173 dbook[0].head.set_publishable(True)