Obey length limits for wikidata import.
[redakcja.git] / src / sources / models.py
import logging
import os
import subprocess
import uuid

from django.apps import apps
from django.conf import settings
from django.db import models
from django.urls import reverse
from django.utils.timezone import now
from django.utils.translation import gettext_lazy as _

from . import conversion
from . import document
from . import utils
13
14
class Source(models.Model):
    """A source whose files are kept in per-source directories under MEDIA_ROOT.

    Three directories are used, all relative to ``settings.MEDIA_ROOT`` and
    keyed by the primary key:

    * ``sources/upload/<pk>/`` -- input files read by :meth:`process`,
    * ``sources/view/<pk>/``   -- output of :meth:`build_view_directory`,
    * ``sources/ocr/<pk>/``    -- output of :meth:`build_ocr_directory`.
    """

    name = models.CharField(_('name'), max_length=1024)
    notes = models.TextField(blank=True, help_text=_('private'))
    wikisource = models.CharField(max_length=1024, blank=True)
    # Set by touch(); nullable, so it may be unset.
    modified_at = models.DateTimeField(null=True, blank=True)
    # Set by process() once both output directories have been rebuilt.
    processed_at = models.DateTimeField(null=True, blank=True)

    def __str__(self):
        return self.name

    def get_absolute_url(self):
        """Return the URL of the ``source`` view for this object."""
        return reverse('source', args=[self.pk])

    def touch(self):
        """Set ``modified_at`` to the current time and save only that field."""
        self.modified_at = now()
        self.save(update_fields=['modified_at'])

    def get_upload_directory(self):
        """Return the upload directory, relative to MEDIA_ROOT, with a trailing slash."""
        return f'sources/upload/{self.pk}/'

    def get_view_directory(self):
        """Return the view directory, relative to MEDIA_ROOT, with a trailing slash."""
        return f'sources/view/{self.pk}/'

    def get_ocr_directory(self):
        """Return the OCR directory, relative to MEDIA_ROOT, with a trailing slash."""
        return f'sources/ocr/{self.pk}/'

    @staticmethod
    def _media_path(relative_dir):
        """Return *relative_dir* joined onto ``settings.MEDIA_ROOT``."""
        return os.path.join(settings.MEDIA_ROOT, relative_dir)

    @staticmethod
    def _is_populated_dir(path):
        """Return a truthy value iff *path* is an existing, non-empty directory.

        The falsy result is ``False`` (not a directory) or ``[]`` (empty
        directory); the truthy result is the directory listing itself.
        """
        return os.path.isdir(path) and os.listdir(path)

    def has_upload_files(self):
        """Return a truthy value iff the upload directory exists and is non-empty."""
        return self._is_populated_dir(
            self._media_path(self.get_upload_directory())
        )

    def get_view_files(self):
        """Return the view files as paths relative to MEDIA_ROOT, sorted by name.

        Raises whatever ``os.listdir`` raises (e.g. ``FileNotFoundError``) when
        the view directory does not exist.
        """
        d = self.get_view_directory()
        # d ends with '/', so plain concatenation yields a valid relative path.
        return [
            d + name
            for name in sorted(os.listdir(self._media_path(d)))
        ]

    def has_view_files(self):
        """Return a truthy value iff the view directory exists and is non-empty."""
        return self._is_populated_dir(
            self._media_path(self.get_view_directory())
        )

    def get_ocr_files(self):
        """Return the OCR files as paths that include MEDIA_ROOT, sorted by name.

        Unlike :meth:`get_view_files`, the returned paths are prefixed with
        ``settings.MEDIA_ROOT``.  Raises whatever ``os.listdir`` raises when
        the OCR directory does not exist.
        """
        d = self._media_path(self.get_ocr_directory())
        return [
            d + name
            for name in sorted(os.listdir(d))
        ]

    def has_ocr_files(self):
        """Return a truthy value iff the OCR directory exists and is non-empty."""
        return self._is_populated_dir(
            self._media_path(self.get_ocr_directory())
        )

    def process(self):
        """Rebuild the view and OCR directories from the upload directory.

        ``processed_at`` is set to the time processing *started*, and is saved
        only after both directories have been built.
        """
        processed_at = now()
        updir = self._media_path(self.get_upload_directory())
        view_dir = self._media_path(self.get_view_directory())
        ocr_dir = self._media_path(self.get_ocr_directory())
        # NOTE(review): utils.replace_dir presumably yields a scratch directory
        # that replaces the target once the block finishes -- confirm in utils.
        with utils.replace_dir(view_dir) as d:
            self.build_view_directory(updir, d)
        with utils.replace_dir(ocr_dir) as d:
            self.build_ocr_directory(updir, d)
        self.processed_at = processed_at
        self.save(update_fields=['processed_at'])

    def build_view_directory(self, srcpath, targetpath):
        """Convert every supported file in *srcpath* into *targetpath*.

        The converter is chosen by the lower-cased file extension; files with
        an unsupported (or no) extension are skipped.  Files are handled in
        name order so the result does not depend on directory listing order.
        """
        logger = logging.getLogger(__name__)
        converters = {
            '.png': conversion.resize_image,
            '.jpg': conversion.resize_image,
            '.jpeg': conversion.resize_image,
            '.tiff': conversion.convert_image,
            '.tif': conversion.convert_image,
            '.pdf': conversion.convert_pdf,
            '.djvu': conversion.convert_djvu,
        }
        for source_file_name in sorted(os.listdir(srcpath)):
            # splitext, unlike rsplit('.'), yields '' for a name without a dot,
            # so an extensionless file called e.g. "pdf" is not mistaken for a PDF.
            ext = os.path.splitext(source_file_name)[1].lower()
            convert = converters.get(ext)
            if convert is None:
                logger.debug(
                    'Skipping unsupported source file: %s', source_file_name
                )
                continue
            logger.info('Converting source file: %s', source_file_name)
            convert(os.path.join(srcpath, source_file_name), targetpath)

    def build_ocr_directory(self, srcpath, targetpath):
        """Run ``tesseract`` on every file in *srcpath*, writing into *targetpath*.

        Each output is named after its input file (the second tesseract
        argument).  A failing tesseract run is logged and the loop continues;
        a missing ``tesseract`` executable still raises from ``subprocess.run``.
        """
        logger = logging.getLogger(__name__)
        for source_file_name in sorted(os.listdir(srcpath)):
            logger.info('Running OCR on source file: %s', source_file_name)
            result = subprocess.run([
                'tesseract',
                os.path.join(srcpath, source_file_name),
                os.path.join(targetpath, source_file_name),
                # tesseract language option: Polish.
                '-l', 'pol',
            ])
            if result.returncode != 0:
                # Best effort: report the failure but keep processing the rest.
                logger.warning(
                    'tesseract exited with status %s for %s',
                    result.returncode, source_file_name,
                )
117
118
class BookSource(models.Model):
    """Associates a ``catalogue.Book`` with a :class:`Source`.

    ``page_start`` / ``page_end`` optionally restrict the association to a
    1-based, inclusive range of the source's files.
    """

    book = models.ForeignKey('catalogue.Book', models.CASCADE)
    source = models.ForeignKey(Source, models.CASCADE)
    ordering = models.IntegerField(default=1)
    page_start = models.IntegerField(null=True, blank=True)
    page_end = models.IntegerField(null=True, blank=True)

    class Meta:
        ordering = ('ordering', 'page_start',)

    def __str__(self):
        return f'{self.source} -> {self.book}'

    def get_absolute_url(self):
        """Return the URL of the ``source_book_prepare`` view for the book."""
        return reverse('source_book_prepare', args=[self.book.pk])

    def _select_pages(self, files):
        """Return the part of *files* selected by ``page_start``/``page_end``.

        An unset (``None``) or zero bound leaves that side of the list
        unrestricted.  The end is cut first, so ``page_start`` keeps indexing
        from the beginning of the original list.
        """
        if self.page_end:
            files = files[:self.page_end]
        if self.page_start:
            # page_start is 1-based; convert to a 0-based slice start.
            files = files[self.page_start - 1:]
        return files

    def get_view_files(self):
        """Return the source's view files limited to this book's page range."""
        # TODO: won't work for PDFs.
        return self._select_pages(self.source.get_view_files())

    def get_ocr_files(self):
        """Return the source's OCR files limited to this book's page range."""
        # TODO: won't work for PDFs.
        return self._select_pages(self.source.get_ocr_files())

    def get_document(self):
        """Return the first object in ``book.document_books``, or ``None``."""
        return self.book.document_books.first()

    @classmethod
    def prepare_document(cls, book, user=None):
        """Create or update the document for *book* from its built texts.

        The first text either creates a new ``documents.Book`` (when *book*
        has none yet) or is committed to the existing one; every further text
        is committed on top.  Finally the head is marked publishable.

        Raises ``IndexError`` if ``document.build_document_texts`` returns an
        empty sequence.

        Returns the ``documents.Book`` instance.
        """
        # Resolved through the app registry rather than a module-level import.
        DBook = apps.get_model('documents', 'Book')
        texts = document.build_document_texts(book)

        dbook = book.document_books.first()
        if dbook is None:
            # Random slug: a fresh UUID4 string.
            dbook = DBook.create(
                user, texts[0],
                catalogue_book=book,
                title=book.title,
                slug=str(uuid.uuid4()),
            )
        else:
            # NOTE(review): dbook[0] relies on documents.Book supporting
            # indexing (presumably yielding its first chunk) -- confirm there.
            dbook[0].commit(text=texts[0], description='OCR', author=user)
        for text in texts[1:]:
            dbook[0].commit(text=text, description='OCR', author=user)

        dbook[0].head.set_publishable(True)
        return dbook
176