plwiki for books
[redakcja.git] / src / sources / models.py
import logging
import os
import subprocess
import uuid

from django.apps import apps
from django.conf import settings
from django.db import models
from django.urls import reverse
from django.utils.timezone import now
from django.utils.translation import gettext_lazy as _

from . import conversion
from . import document
from . import utils
13
14
class Source(models.Model):
    """A source whose files are kept in per-instance directories.

    Three directories under ``settings.MEDIA_ROOT`` are used, each keyed by
    the primary key: ``sources/upload/<pk>/`` (input read by
    :meth:`process`), ``sources/view/<pk>/`` (filled by
    :meth:`build_view_directory`) and ``sources/ocr/<pk>/`` (filled by
    :meth:`build_ocr_directory`).
    """

    name = models.CharField(_('name'), max_length=1024)
    notes = models.TextField(blank=True, help_text=_('private'))
    wikisource = models.CharField(max_length=1024, blank=True)
    # Set by touch().
    modified_at = models.DateTimeField(null=True, blank=True)
    # Set at the end of process().
    processed_at = models.DateTimeField(null=True, blank=True)

    def __str__(self):
        return self.name

    def get_absolute_url(self):
        """Return the URL of the ``source`` route for this instance."""
        return reverse('source', args=[self.pk])

    def touch(self):
        """Stamp ``modified_at`` with the current time and save that field."""
        self.modified_at = now()
        self.save(update_fields=['modified_at'])

    def get_upload_directory(self):
        """Return the upload directory, relative to ``MEDIA_ROOT``."""
        return f'sources/upload/{self.pk}/'

    def get_view_directory(self):
        """Return the view directory, relative to ``MEDIA_ROOT``."""
        return f'sources/view/{self.pk}/'

    def get_ocr_directory(self):
        """Return the OCR directory, relative to ``MEDIA_ROOT``."""
        return f'sources/ocr/{self.pk}/'

    def get_view_files(self):
        """Return the view files as paths relative to ``MEDIA_ROOT``.

        The list is sorted by file name.

        Raises:
            FileNotFoundError: if the view directory does not exist.
        """
        d = self.get_view_directory()
        names = sorted(os.listdir(os.path.join(settings.MEDIA_ROOT, d)))
        # ``d`` already ends with a slash, so plain concatenation yields a
        # forward-slash relative path.
        return [d + name for name in names]

    def get_ocr_files(self):
        """Return the OCR files as paths that include ``MEDIA_ROOT``.

        Unlike :meth:`get_view_files`, the returned paths are joined with
        ``settings.MEDIA_ROOT`` rather than relative to it. The list is
        sorted by file name.

        Raises:
            FileNotFoundError: if the OCR directory does not exist.
        """
        d = os.path.join(settings.MEDIA_ROOT, self.get_ocr_directory())
        # os.path.join does not depend on ``d`` ending with a separator.
        return [os.path.join(d, name) for name in sorted(os.listdir(d))]

    def process(self):
        """Rebuild the view and OCR directories from the upload directory.

        Each target directory is built inside ``utils.replace_dir``
        (presumably it swaps the freshly built directory into place when
        the block exits -- confirm in ``utils``). Afterwards
        ``processed_at`` is set to the current time and saved.
        """
        updir = os.path.join(
            settings.MEDIA_ROOT,
            self.get_upload_directory()
        )
        view_dir = os.path.join(
            settings.MEDIA_ROOT,
            self.get_view_directory()
        )
        ocr_dir = os.path.join(
            settings.MEDIA_ROOT,
            self.get_ocr_directory()
        )
        with utils.replace_dir(view_dir) as d:
            self.build_view_directory(updir, d)
        with utils.replace_dir(ocr_dir) as d:
            self.build_ocr_directory(updir, d)
        self.processed_at = now()
        self.save(update_fields=['processed_at'])

    def build_view_directory(self, srcpath, targetpath):
        """Convert every supported file in ``srcpath`` into ``targetpath``.

        The converter is chosen by the lower-cased file extension:
        PNG/JPEG files go to ``conversion.resize_image``, TIFF files to
        ``conversion.convert_image``, PDF files to
        ``conversion.convert_pdf`` and DjVu files to
        ``conversion.convert_djvu``. Files with any other extension, or
        with none, are skipped and reported in the log.

        Args:
            srcpath: directory containing the input files.
            targetpath: directory handed to the converters as destination.
        """
        log = logging.getLogger(__name__)
        for source_file_name in os.listdir(srcpath):
            log.info('Building view files for %s', source_file_name)
            src = os.path.join(srcpath, source_file_name)
            # splitext returns '' for names without an extension, so a file
            # literally called "pdf" is not mistaken for a PDF document.
            ext = os.path.splitext(source_file_name)[1].lower()
            if ext in ('.png', '.jpg', '.jpeg'):
                conversion.resize_image(src, targetpath)
            elif ext in ('.tiff', '.tif'):
                conversion.convert_image(src, targetpath)
            elif ext == '.pdf':
                conversion.convert_pdf(src, targetpath)
            elif ext == '.djvu':
                conversion.convert_djvu(src, targetpath)
            else:
                log.warning(
                    'Skipping %s: unsupported file type', source_file_name
                )

    def build_ocr_directory(self, srcpath, targetpath):
        """Run ``tesseract`` on every file in ``srcpath``.

        For each file the command is ``tesseract <input> <outputbase> -l
        pol``, where the output base is the input file name placed under
        ``targetpath``. A non-zero exit status is logged as a warning and
        does not stop processing of the remaining files.

        NOTE(review): every directory entry is passed to tesseract
        regardless of its type, unlike :meth:`build_view_directory` --
        confirm that this is intended for PDF/DjVu uploads.

        Args:
            srcpath: directory containing the input files.
            targetpath: directory that receives the OCR output.

        Raises:
            FileNotFoundError: if the ``tesseract`` executable is missing.
        """
        log = logging.getLogger(__name__)
        for source_file_name in os.listdir(srcpath):
            log.info('Running OCR on %s', source_file_name)
            result = subprocess.run(
                [
                    'tesseract',
                    os.path.join(srcpath, source_file_name),
                    os.path.join(targetpath, source_file_name),
                    '-l', 'pol',
                ],
                check=False,
            )
            if result.returncode != 0:
                # Best effort: report the failure but keep going.
                log.warning(
                    'tesseract exited with status %d for %s',
                    result.returncode, source_file_name,
                )
104
105
class BookSource(models.Model):
    """Associates a ``catalogue.Book`` with a page range of a ``Source``."""

    book = models.ForeignKey('catalogue.Book', models.CASCADE)
    source = models.ForeignKey(Source, models.CASCADE)
    # Optional page bounds; used as 1-based, inclusive slice limits below.
    page_start = models.IntegerField(null=True, blank=True)
    page_end = models.IntegerField(null=True, blank=True)

    class Meta:
        ordering = ('page_start',)

    def __str__(self):
        return '{} -> {}'.format(self.source, self.book)

    def get_absolute_url(self):
        """Return the URL of the ``source_book_prepare`` route."""
        return reverse('source_book_prepare', args=[self.pk])

    def _select_pages(self, files):
        """Cut ``files`` down to the ``page_start``..``page_end`` range.

        A bound that is falsy (``None`` or ``0``) is not applied. The upper
        bound is applied first, then the lower one.
        """
        last_page = self.page_end
        first_page = self.page_start
        if last_page:
            files = files[:last_page]
        if first_page:
            files = files[first_page - 1:]
        return files

    def get_view_files(self):
        """Return the source's view files limited to this page range."""
        # TODO: won't work for PDFs.
        return self._select_pages(self.source.get_view_files())

    def get_ocr_files(self):
        """Return the source's OCR files limited to this page range."""
        # TODO: won't work for PDFs.
        return self._select_pages(self.source.get_ocr_files())

    def get_document(self):
        """Return the first of the book's ``document_books``, or ``None``."""
        return self.book.document_books.first()

    def prepare_document(self, user=None):
        """Create or update the book's document from the OCR texts.

        The texts come from ``document.build_document_texts(self)``. If the
        book has no document yet, one is created from the first text with
        the book's title and a random UUID slug; otherwise the first text
        is committed to item ``0`` of the existing document. Every further
        text is committed to item ``0`` as well, each with the description
        ``'OCR'``. Finally the head of item ``0`` is marked publishable.

        NOTE(review): item ``0`` is presumably the document's first chunk
        -- confirm against the ``documents.Book`` model.

        Args:
            user: passed as the author of the created document and commits.

        Returns:
            The created or updated document.
        """
        document_model = apps.get_model('documents', 'Book')
        page_texts = document.build_document_texts(self)

        doc = self.get_document()
        if doc is not None:
            doc[0].commit(text=page_texts[0], description='OCR', author=user)
        else:
            doc = document_model.create(
                user, page_texts[0],
                title=self.book.title,
                slug=str(uuid.uuid4()),
            )

        for later_text in page_texts[1:]:
            doc[0].commit(text=later_text, description='OCR', author=user)

        doc[0].head.set_publishable(True)
        return doc
160