src/catalogue/models/book.py

   1 # This file is part of Wolne Lektury, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
   3 #
   4 from collections import OrderedDict
   5 import json
   6 from datetime import date, timedelta
   7 from random import randint
   8 import os.path
   9 import re
  10 import requests
  11 from slugify import slugify
  12 from sortify import sortify
  13 from urllib.request import urlretrieve
  14 from django.apps import apps
  15 from django.conf import settings
  16 from django.db import connection, models, transaction
  17 import django.dispatch
  18 from django.contrib.contenttypes.fields import GenericRelation
  19 from django.template.loader import render_to_string
  20 from django.urls import reverse
  21 from django.utils.translation import gettext_lazy as _, get_language
  22 from fnpdjango.storage import BofhFileSystemStorage
  23 from lxml import html
  24 from librarian.cover import WLCover
  25 from librarian.html import transform_abstrakt
  26 from librarian.builders import builders
  27 from newtagging import managers
  28 from catalogue import constants
  29 from catalogue import fields
  30 from catalogue.models import Tag, Fragment, BookMedia
  31 from catalogue.utils import create_zip, gallery_url, gallery_path, split_tags, get_random_hash
  32 from catalogue.models.tag import prefetched_relations
  33 from catalogue import app_settings
  34 from wolnelektury.utils import makedirs, cached_render, clear_cached_renders
  35
# Single storage instance shared by the Book file fields below that are declared with
# ``storage=bofh_storage``.
bofh_storage = BofhFileSystemStorage()
  37
  38
  39 class Book(models.Model):
  40     """Represents a book imported from WL-XML."""
    # --- Basic metadata and sorting keys (sort_key / sort_key_author are recomputed in save()) ---
    title = models.CharField('tytuł', max_length=32767)
    sort_key = models.CharField('klucz sortowania', max_length=120, db_index=True, db_collation='C', editable=False)
    sort_key_author = models.CharField(
        'klucz sortowania wg autora', max_length=120, db_index=True, db_collation='C', editable=False, default='')
    slug = models.SlugField('slug', max_length=120, db_index=True, unique=True)
    common_slug = models.SlugField('wspólny slug', max_length=120, db_index=True)
    language = models.CharField('kod języka', max_length=3, db_index=True, default=app_settings.DEFAULT_LANGUAGE)
    description = models.TextField('opis', blank=True)
    license = models.CharField('licencja', max_length=255, blank=True, db_index=True)
    abstract = models.TextField('abstrakt', blank=True)
    toc = models.TextField('spis treści', blank=True)
    created_at = models.DateTimeField('data utworzenia', auto_now_add=True, db_index=True)
    changed_at = models.DateTimeField('data motyfikacji', auto_now=True, db_index=True)
    parent_number = models.IntegerField('numer w ramach rodzica', default=0)
    # JSON-encoded text; read through get_extra_info_json().
    extra_info = models.TextField('dodatkowe informacje', default='{}')
    gazeta_link = models.CharField(blank=True, max_length=240)
    wiki_link = models.CharField(blank=True, max_length=240)
    print_on_demand = models.BooleanField('druk na żądanie', default=False)
    recommended = models.BooleanField('polecane', default=False)
    audio_length = models.CharField('długość audio', blank=True, max_length=8)
    # Preview state: preview_key is generated in save() when preview is set and no key exists yet.
    preview = models.BooleanField('prapremiera', default=False)
    preview_until = models.DateField('prapremiera do', blank=True, null=True)
    preview_key = models.CharField(max_length=32, blank=True, null=True)
    findable = models.BooleanField('wyszukiwalna', default=True, db_index=True)
    can_sell = models.BooleanField('do sprzedaży', default=True)
    can_sell_mp3 = models.BooleanField('do sprzedaży mp3', default=False)
    isbn_mp3 = models.CharField('ISBN audiobooka', max_length=32, blank=True)

    # files generated during publication
    xml_file = fields.XmlField(storage=bofh_storage, with_etag=False)
    html_file = fields.HtmlField(storage=bofh_storage)
    html_nonotes_file = fields.HtmlNonotesField(storage=bofh_storage)
    fb2_file = fields.Fb2Field(storage=bofh_storage)
    txt_file = fields.TxtField(storage=bofh_storage)
    epub_file = fields.EpubField(storage=bofh_storage)
    mobi_file = fields.MobiField(storage=bofh_storage)
    pdf_file = fields.PdfField(storage=bofh_storage)

    cover = fields.CoverField('okładka', storage=bofh_storage)
    # Cleaner version of cover for thumbs
    cover_clean = fields.CoverCleanField('czysta okładka')
    cover_thumb = fields.CoverThumbField('miniatura okładki')
    cover_api_thumb = fields.CoverApiThumbField(
        'mniaturka okładki dla aplikacji')
    simple_cover = fields.SimpleCoverField('okładka dla aplikacji')
    cover_ebookpoint = fields.CoverEbookpointField(
        'okładka dla Ebookpoint')

    # Format names whose file lives in a ``<format>_file`` field on this model
    # (see has_media() / get_media()); other media types are looked up via ``self.media``.
    ebook_formats = constants.EBOOK_FORMATS
    formats = ebook_formats + ['html', 'xml', 'html_nonotes']

    # Book hierarchy: direct parent plus a many-to-many of all ancestors.
    parent = models.ForeignKey('self', models.CASCADE, blank=True, null=True, related_name='children')
    ancestor = models.ManyToManyField('self', blank=True, editable=False, related_name='descendant', symmetrical=False)

    # Denormalized values refreshed in save().
    cached_author = models.CharField(blank=True, max_length=240, db_index=True)
    has_audience = models.BooleanField(default=False)

    objects = models.Manager()
    tagged = managers.ModelTaggedItemManager(Tag)
    tags = managers.TagDescriptor(Tag)
    tag_relations = GenericRelation(Tag.intermediary_table_model, related_query_name='tagged_book')
    translators = models.ManyToManyField(Tag, blank=True)
    narrators = models.ManyToManyField(Tag, blank=True, related_name='narrated')
    has_audio = models.BooleanField(default=False)
    read_time = models.IntegerField(blank=True, null=True)
    pages = models.IntegerField(blank=True, null=True)

    # Signals; ``published`` is sent at the end of from_text_and_meta().
    html_built = django.dispatch.Signal()
    published = django.dispatch.Signal()

    SORT_KEY_SEP = '$'

    is_book = True
 114
 115     class AlreadyExists(Exception):
 116         pass
 117
 118     class Meta:
 119         ordering = ('sort_key_author', 'sort_key')
 120         verbose_name = 'książka'
 121         verbose_name_plural = 'książki'
 122         app_label = 'catalogue'
 123
 124     def __str__(self):
 125         return self.title
 126
 127     def get_extra_info_json(self):
 128         return json.loads(self.extra_info or '{}')
 129
 130     def get_initial(self):
 131         try:
 132             return re.search(r'\w', self.title, re.U).group(0)
 133         except AttributeError:
 134             return ''
 135
 136     def authors(self):
 137         return self.tags.filter(category='author')
 138
 139     def epochs(self):
 140         return self.tags.filter(category='epoch')
 141
 142     def genres(self):
 143         return self.tags.filter(category='genre')
 144
 145     def kinds(self):
 146         return self.tags.filter(category='kind')
 147
 148     def tag_unicode(self, category):
 149         relations = prefetched_relations(self, category)
 150         if relations:
 151             return ', '.join(rel.tag.name for rel in relations)
 152         else:
 153             return ', '.join(self.tags.filter(category=category).values_list('name', flat=True))
 154
 155     def tags_by_category(self):
 156         return split_tags(self.tags.exclude(category__in=('set', 'theme')))
 157
 158     def author_unicode(self):
 159         return self.cached_author
 160
 161     def kind_unicode(self):
 162         return self.tag_unicode('kind')
 163
 164     def epoch_unicode(self):
 165         return self.tag_unicode('epoch')
 166
 167     def genre_unicode(self):
 168         return self.tag_unicode('genre')
 169
 170     def translator(self):
 171         translators = self.get_extra_info_json().get('translators')
 172         if not translators:
 173             return None
 174         if len(translators) > 3:
 175             translators = translators[:2]
 176             others = ' i inni'
 177         else:
 178             others = ''
 179         return ', '.join('\xa0'.join(reversed(translator.split(', ', 1))) for translator in translators) + others
 180
 181     def cover_source(self):
 182         return self.get_extra_info_json().get('cover_source', self.parent.cover_source() if self.parent else '')
 183
 184     @property
 185     def isbn_pdf(self):
 186         return self.get_extra_info_json().get('isbn_pdf')
 187
 188     @property
 189     def isbn_epub(self):
 190         return self.get_extra_info_json().get('isbn_epub')
 191
 192     @property
 193     def isbn_mobi(self):
 194         return self.get_extra_info_json().get('isbn_mobi')
 195
 196     @property
 197     def redakcja(self):
 198         return self.get_extra_info_json().get('about')
 199
 200     def is_accessible_to(self, user):
 201         if not self.preview:
 202             return True
 203         if not user.is_authenticated:
 204             return False
 205         Membership = apps.get_model('club', 'Membership')
 206         if Membership.is_active_for(user):
 207             return True
 208         Funding = apps.get_model('funding', 'Funding')
 209         if Funding.objects.filter(user=user, offer__book=self):
 210             return True
 211         return False
 212
 213     def save(self, force_insert=False, force_update=False, **kwargs):
 214         from sortify import sortify
 215
 216         self.sort_key = sortify(self.title)[:120]
 217         self.title = str(self.title)  # ???
 218
 219         try:
 220             author = self.authors().first().sort_key
 221         except AttributeError:
 222             author = ''
 223         self.sort_key_author = author
 224
 225         self.cached_author = self.tag_unicode('author')
 226         self.has_audience = 'audience' in self.get_extra_info_json()
 227
 228         if self.preview and not self.preview_key:
 229             self.preview_key = get_random_hash(self.slug)[:32]
 230
 231         ret = super(Book, self).save(force_insert, force_update, **kwargs)
 232
 233         return ret
 234
 235     def get_absolute_url(self):
 236         return reverse('book_detail', args=[self.slug])
 237
 238     def gallery_path(self):
 239         return gallery_path(self.slug)
 240
 241     def gallery_url(self):
 242         return gallery_url(self.slug)
 243
 244     def get_first_text(self):
 245         if self.html_file:
 246             return self
 247         child = self.children.all().order_by('parent_number').first()
 248         if child is not None:
 249             return child.get_first_text()
 250
 251     def get_last_text(self):
 252         if self.html_file:
 253             return self
 254         child = self.children.all().order_by('parent_number').last()
 255         if child is not None:
 256             return child.get_last_text()
 257
 258     def get_prev_text(self):
 259         if not self.parent:
 260             return None
 261         sibling = self.parent.children.filter(parent_number__lt=self.parent_number).order_by('-parent_number').first()
 262         if sibling is not None:
 263             return sibling.get_last_text()
 264
 265         if self.parent.html_file:
 266             return self.parent
 267
 268         return self.parent.get_prev_text()
 269
 270     def get_next_text(self, inside=True):
 271         if inside:
 272             child = self.children.order_by('parent_number').first()
 273             if child is not None:
 274                 return child.get_first_text()
 275
 276         if not self.parent:
 277             return None
 278         sibling = self.parent.children.filter(parent_number__gt=self.parent_number).order_by('parent_number').first()
 279         if sibling is not None:
 280             return sibling.get_first_text()
 281         return self.parent.get_next_text(inside=False)
 282
 283     def get_siblings(self):
 284         if not self.parent:
 285             return []
 286         return self.parent.children.all().order_by('parent_number')
 287
 288     def get_children(self):
 289         return self.children.all().order_by('parent_number')
 290
 291     @property
 292     def name(self):
 293         return self.title
 294
 295     def language_code(self):
 296         return constants.LANGUAGES_3TO2.get(self.language, self.language)
 297
 298     def language_name(self):
 299         return dict(settings.LANGUAGES).get(self.language_code(), "")
 300
 301     def is_foreign(self):
 302         return self.language_code() != settings.LANGUAGE_CODE
 303
 304     def set_audio_length(self):
 305         length = self.get_audio_length()
 306         if length > 0:
 307             self.audio_length = self.format_audio_length(length)
 308             self.save()
 309
 310     @staticmethod
 311     def format_audio_length(seconds):
 312         """
 313         >>> Book.format_audio_length(1)
 314         '0:01'
 315         >>> Book.format_audio_length(3661)
 316         '1:01:01'
 317         """
 318         if seconds < 60*60:
 319             minutes = seconds // 60
 320             seconds = seconds % 60
 321             return '%d:%02d' % (minutes, seconds)
 322         else:
 323             hours = seconds // 3600
 324             minutes = seconds % 3600 // 60
 325             seconds = seconds % 60
 326             return '%d:%02d:%02d' % (hours, minutes, seconds)
 327
 328     def get_audio_length(self):
 329         total = 0
 330         for media in self.get_mp3() or ():
 331             total += app_settings.GET_MP3_LENGTH(media.file.path)
 332         return int(total)
 333
 334     def get_time(self):
 335         return round(self.xml_file.size / 1000 * 40)
 336
 337     def has_media(self, type_):
 338         if type_ in Book.formats:
 339             return bool(getattr(self, "%s_file" % type_))
 340         else:
 341             return self.media.filter(type=type_).exists()
 342
 343     def get_media(self, type_):
 344         if self.has_media(type_):
 345             if type_ in Book.formats:
 346                 return getattr(self, "%s_file" % type_)
 347             else:
 348                 return self.media.filter(type=type_)
 349         else:
 350             return None
 351
 352     def get_mp3(self):
 353         return self.get_media("mp3")
 354
 355     def get_odt(self):
 356         return self.get_media("odt")
 357
 358     def get_ogg(self):
 359         return self.get_media("ogg")
 360
 361     def get_daisy(self):
 362         return self.get_media("daisy")
 363
 364     def get_audio_epub(self):
 365         return self.get_media("audio.epub")
 366
 367     def media_url(self, format_):
 368         media = self.get_media(format_)
 369         if media:
 370             if self.preview:
 371                 return reverse('embargo_link', kwargs={'key': self.preview_key, 'slug': self.slug, 'format_': format_})
 372             else:
 373                 return media.url
 374         else:
 375             return None
 376
 377     def html_url(self):
 378         return self.media_url('html')
 379
 380     def html_nonotes_url(self):
 381         return self.media_url('html_nonotes')
 382
 383     def pdf_url(self):
 384         return self.media_url('pdf')
 385
 386     def epub_url(self):
 387         return self.media_url('epub')
 388
 389     def mobi_url(self):
 390         return self.media_url('mobi')
 391
 392     def txt_url(self):
 393         return self.media_url('txt')
 394
 395     def fb2_url(self):
 396         return self.media_url('fb2')
 397
 398     def xml_url(self):
 399         return self.media_url('xml')
 400
 401     def has_description(self):
 402         return len(self.description) > 0
 403     has_description.short_description = 'opis'
 404     has_description.boolean = True
 405
 406     def has_mp3_file(self):
 407         return self.has_media("mp3")
 408     has_mp3_file.short_description = 'MP3'
 409     has_mp3_file.boolean = True
 410
 411     def has_ogg_file(self):
 412         return self.has_media("ogg")
 413     has_ogg_file.short_description = 'OGG'
 414     has_ogg_file.boolean = True
 415
 416     def has_daisy_file(self):
 417         return self.has_media("daisy")
 418     has_daisy_file.short_description = 'DAISY'
 419     has_daisy_file.boolean = True
 420
 421     def has_sync_file(self):
 422         return settings.FEATURE_SYNCHRO and self.has_media("sync")
 423
 424     def build_sync_file(self):
 425         from lxml import html
 426         from django.core.files.base import ContentFile
 427         with self.html_file.open('rb') as f:
 428             h = html.fragment_fromstring(f.read().decode('utf-8'))
 429
 430         durations = [
 431             m['mp3'].duration
 432             for m in self.get_audiobooks()[0]
 433         ]
 434         if settings.MOCK_DURATIONS:
 435             durations = settings.MOCK_DURATIONS
 436
 437         sync = []
 438         ts = None
 439         sid = 1
 440         dirty = False
 441         for elem in h.iter():
 442             if elem.get('data-audio-ts'):
 443                 part, ts = int(elem.get('data-audio-part')), float(elem.get('data-audio-ts'))
 444                 ts = str(round(sum(durations[:part - 1]) + ts, 3))
 445                 # check if inside verse
 446                 p = elem.getparent()
 447                 while p is not None:
 448                     # Workaround for missing ids.
 449                     if 'verse' in p.get('class', ''):
 450                         if not p.get('id'):
 451                             p.set('id', f'syn{sid}')
 452                             dirty = True
 453                             sid += 1
 454                         sync.append((ts, p.get('id')))
 455                         ts = None
 456                         break
 457                     p = p.getparent()
 458             elif ts:
 459                 cls = elem.get('class', '')
 460                 # Workaround for missing ids.
 461                 if 'paragraph' in cls or 'verse' in cls or elem.tag in ('h1', 'h2', 'h3', 'h4'):
 462                     if not elem.get('id'):
 463                         elem.set('id', f'syn{sid}')
 464                         dirty = True
 465                         sid += 1
 466                     sync.append((ts, elem.get('id')))
 467                     ts = None
 468         if dirty:
 469             htext = html.tostring(h, encoding='utf-8')
 470             with open(self.html_file.path, 'wb') as f:
 471                 f.write(htext)
 472         try:
 473             bm = self.media.get(type='sync')
 474         except:
 475             bm = BookMedia(book=self, type='sync')
 476         sync = (
 477             '27\n' + '\n'.join(
 478                 f'{s[0]}\t{sync[i+1][0]}\t{s[1]}' for i, s in enumerate(sync[:-1])
 479             )).encode('latin1')
 480         bm.file.save(
 481             None, ContentFile(sync)
 482             )
 483
 484     def get_sync(self):
 485         if not self.has_sync_file():
 486             return []
 487         with self.get_media('sync').first().file.open('r') as f:
 488             sync = f.read().split('\n')
 489         offset = float(sync[0])
 490         items = []
 491         for line in sync[1:]:
 492             if not line:
 493                 continue
 494             start, end, elid = line.split()
 495             items.append([elid, float(start) + offset])
 496         return items
 497
 498     def sync_ts(self, ts):
 499         elid = None
 500         for cur_id, t in self.get_sync():
 501             if ts >= t:
 502                 elid = cur_id
 503             else:
 504                 break
 505         return elid
 506
 507     def sync_elid(self, elid):
 508         for cur_id, t in self.get_sync():
 509             if cur_id == elid:
 510                 return t
 511
 512     def has_audio_epub_file(self):
 513         return self.has_media("audio.epub")
 514
 515     @property
 516     def media_daisy(self):
 517         return self.get_media('daisy')
 518
 519     @property
 520     def media_audio_epub(self):
 521         return self.get_media('audio.epub')
 522
 523     def get_audiobooks(self, with_children=False, processing=False):
 524         ogg_files = {}
 525         for m in self.media.filter(type='ogg').order_by().iterator():
 526             ogg_files[m.name] = m
 527
 528         audiobooks = []
 529         projects = set()
 530         total_duration = 0
 531         for mp3 in self.media.filter(type='mp3').iterator():
 532             # ogg files are always from the same project
 533             meta = mp3.get_extra_info_json()
 534             project = meta.get('project')
 535             if not project:
 536                 # temporary fallback
 537                 project = 'CzytamySłuchając'
 538
 539             projects.add((project, meta.get('funded_by', '')))
 540             total_duration += mp3.duration or 0
 541
 542             media = {'mp3': mp3}
 543
 544             ogg = ogg_files.get(mp3.name)
 545             if ogg:
 546                 media['ogg'] = ogg
 547             audiobooks.append(media)
 548
 549         if with_children:
 550             for child in self.get_children():
 551                 ch_audiobooks, ch_projects, ch_duration = child.get_audiobooks(
 552                     with_children=True, processing=True)
 553                 audiobooks.append({'part': child})
 554                 audiobooks += ch_audiobooks
 555                 projects.update(ch_projects)
 556                 total_duration += ch_duration
 557
 558         if not processing:
 559             projects = sorted(projects)
 560             total_duration = '%d:%02d' % (
 561                 total_duration // 60,
 562                 total_duration % 60
 563             )
 564
 565         return audiobooks, projects, total_duration
 566
 567     def get_audiobooks_with_children(self):
 568         return self.get_audiobooks(with_children=True)
 569
 570     def wldocument(self, parse_dublincore=True, inherit=True):
 571         from catalogue.import_utils import ORMDocProvider
 572         from librarian.parser import WLDocument
 573
 574         if inherit and self.parent:
 575             meta_fallbacks = self.parent.cover_info()
 576         else:
 577             meta_fallbacks = None
 578
 579         return WLDocument.from_file(
 580             self.xml_file.path,
 581             provider=ORMDocProvider(self),
 582             parse_dublincore=parse_dublincore,
 583             meta_fallbacks=meta_fallbacks)
 584
 585     def wldocument2(self):
 586         from catalogue.import_utils import ORMDocProvider
 587         from librarian.document import WLDocument
 588         doc = WLDocument(
 589             self.xml_file.path,
 590             provider=ORMDocProvider(self)
 591         )
 592         doc.meta.update(self.cover_info())
 593         return doc
 594
 595
 596     @staticmethod
 597     def zip_format(format_):
 598         def pretty_file_name(book):
 599             return "%s/%s.%s" % (
 600                 book.get_extra_info_json()['author'],
 601                 book.slug,
 602                 format_)
 603
 604         field_name = "%s_file" % format_
 605         field = getattr(Book, field_name)
 606         books = Book.objects.filter(parent=None).exclude(**{field_name: ""}).exclude(preview=True).exclude(findable=False)
 607         paths = [(pretty_file_name(b), getattr(b, field_name).path) for b in books.iterator()]
 608         return create_zip(paths, field.ZIP)
 609
 610     def zip_audiobooks(self, format_):
 611         bm = BookMedia.objects.filter(book=self, type=format_)
 612         paths = map(lambda bm: (bm.get_nice_filename(), bm.file.path), bm)
 613         licenses = set()
 614         for m in bm:
 615             license = constants.LICENSES.get(
 616                 m.get_extra_info_json().get('license'), {}
 617             ).get('locative')
 618             if license:
 619                 licenses.add(license)
 620         readme = render_to_string('catalogue/audiobook_zip_readme.txt', {
 621             'licenses': licenses,
 622             'meta': self.wldocument2().meta,
 623         })
 624         return create_zip(paths, "%s_%s" % (self.slug, format_), {'informacje.txt': readme})
 625
 626     def search_index(self, index=None):
 627         if not self.findable:
 628             return
 629         from search.index import Index
 630         Index.index_book(self)
 631
 632     # will make problems in conjunction with paid previews
 633     def download_pictures(self, remote_gallery_url):
 634         # This is only needed for legacy relative image paths.
 635         gallery_path = self.gallery_path()
 636         # delete previous files, so we don't include old files in ebooks
 637         if os.path.isdir(gallery_path):
 638             for filename in os.listdir(gallery_path):
 639                 file_path = os.path.join(gallery_path, filename)
 640                 os.unlink(file_path)
 641         ilustr_elements = list(self.wldocument().edoc.findall('//ilustr'))
 642         if ilustr_elements:
 643             makedirs(gallery_path)
 644             for ilustr in ilustr_elements:
 645                 ilustr_src = ilustr.get('src')
 646                 if '/' in ilustr_src:
 647                     continue
 648                 ilustr_path = os.path.join(gallery_path, ilustr_src)
 649                 urlretrieve('%s/%s' % (remote_gallery_url, ilustr_src), ilustr_path)
 650
 651     def load_abstract(self):
 652         abstract = self.wldocument(parse_dublincore=False).edoc.getroot().find('.//abstrakt')
 653         if abstract is not None:
 654             self.abstract = transform_abstrakt(abstract)
 655         else:
 656             self.abstract = ''
 657
 658     def load_toc(self):
 659         self.toc = ''
 660         if self.html_file:
 661             parser = html.HTMLParser(encoding='utf-8')
 662             tree = html.parse(self.html_file.path, parser=parser)
 663             toc = tree.find('//div[@id="toc"]/ol')
 664             if toc is None or not len(toc):
 665                 return
 666             html_link = reverse('book_text', args=[self.slug])
 667             for a in toc.findall('.//a'):
 668                 a.attrib['href'] = html_link + a.attrib['href']
 669             self.toc = html.tostring(toc, encoding='unicode')
 670             # div#toc
 671
 672     @classmethod
 673     def from_xml_file(cls, xml_file, **kwargs):
 674         from django.core.files import File
 675         from librarian import dcparser
 676
 677         # use librarian to parse meta-data
 678         book_info = dcparser.parse(xml_file)
 679
 680         if not isinstance(xml_file, File):
 681             xml_file = File(open(xml_file))
 682
 683         try:
 684             return cls.from_text_and_meta(xml_file, book_info, **kwargs)
 685         finally:
 686             xml_file.close()
 687
    @classmethod
    def from_text_and_meta(cls, raw_file, book_info, overwrite=False, dont_build=None, search_index=True,
                           remote_gallery_url=None, days=0, findable=True, logo=None, logo_mono=None, logo_alt=None, can_sell=None, isbn_mp3=None):
        """Create or update a Book from its XML file and parsed metadata, then schedule builds.

        Parameters:
            raw_file: file object saved as the book's ``xml_file``.
            book_info: parsed metadata; this method reads its ``url.slug``,
                ``language``, ``title``, ``license``, ``variant_of``,
                ``to_dict()`` and, if present, ``parts``.
            overwrite: allow updating a book that already exists.
            dont_build: names of builds to skip ('cover' and ebook formats);
                merged with ``app_settings.DONT_BUILD``.
            search_index: schedule search indexing (also requires ``findable``
                and ``settings.NO_SEARCH_INDEX`` being off).
            remote_gallery_url: if given, pictures are downloaded from it.
            days: for a newly created book, a non-zero value marks it as a
                preview until today plus that many days.
            findable: stored on the book.
            logo, logo_mono, logo_alt: stored in ``extra_info`` when truthy.
            can_sell, isbn_mp3: stored on the book when not None.

        Returns the saved Book.

        Raises:
            Book.DoesNotExist: a part listed in ``book_info.parts`` is missing.
            ValueError: the slug contains characters outside ``[a-z0-9-]``.
            Book.AlreadyExists: the book exists and ``overwrite`` is false.
        """
        from catalogue import tasks

        if dont_build is None:
            dont_build = set()
        dont_build = set.union(set(dont_build), set(app_settings.DONT_BUILD))

        # check for parts before we do anything
        children = []
        if hasattr(book_info, 'parts'):
            for part_url in book_info.parts:
                try:
                    children.append(Book.objects.get(slug=part_url.slug))
                except Book.DoesNotExist:
                    raise Book.DoesNotExist('Książka "%s" nie istnieje.' % part_url.slug)

        # Read book metadata
        book_slug = book_info.url.slug
        if re.search(r'[^a-z0-9-]', book_slug):
            raise ValueError('Invalid characters in slug')
        book, created = Book.objects.get_or_create(slug=book_slug)

        if created:
            book_shelves = []
            old_cover = None
            # Preview status is only decided when the book is first created.
            book.preview = bool(days)
            if book.preview:
                book.preview_until = date.today() + timedelta(days)
        else:
            if not overwrite:
                raise Book.AlreadyExists('Książka %s już istnieje' % book_slug)
            # Save shelves for this book
            book_shelves = list(book.tags.filter(category='set'))
            old_cover = book.cover_info()

        # Save XML file
        book.xml_file.save('%s.xml' % book.slug, raw_file, save=False)
        if book.preview:
            book.xml_file.set_readable(False)

        book.findable = findable
        book.language = book_info.language
        book.title = book_info.title
        book.license = book_info.license or ''
        if book_info.variant_of:
            book.common_slug = book_info.variant_of.slug
        else:
            book.common_slug = book.slug
        extra = book_info.to_dict()
        if logo:
            extra['logo'] = logo
        if logo_mono:
            extra['logo_mono'] = logo_mono
        if logo_alt:
            extra['logo_alt'] = logo_alt
        if can_sell is not None:
            book.can_sell = can_sell
        if isbn_mp3 is not None:
            book.isbn_mp3 = isbn_mp3
        book.extra_info = json.dumps(extra)
        book.load_abstract()
        book.load_toc()
        book.save()

        book.update_stats()

        meta_tags = Tag.tags_from_info(book_info)

        # Tags without a relation name become the book's tags; previously saved shelves are kept.
        just_tags = [t for (t, rel) in meta_tags if not rel]
        book.tags = set(just_tags + book_shelves)
        book.save()  # update sort_key_author

        book.translators.set([t for (t, rel) in meta_tags if rel == 'translator'])

        cover_changed = old_cover != book.cover_info()
        obsolete_children = set(b for b in book.children.all()
                                if b not in children)
        notify_cover_changed = []
        # Attach the listed parts in order; parent_number is the position in the list.
        for n, child_book in enumerate(children):
            new_child = child_book.parent != book
            child_book.parent = book
            child_book.parent_number = n
            child_book.save()
            if new_child or cover_changed:
                notify_cover_changed.append(child_book)
        # Disown unfaithful children and let them cope on their own.
        for child in obsolete_children:
            child.parent = None
            child.parent_number = 0
            child.save()
            if old_cover:
                notify_cover_changed.append(child)

        cls.repopulate_ancestors()
        tasks.update_counters.delay()

        if remote_gallery_url:
            book.download_pictures(remote_gallery_url)

        # No saves beyond this point.

        # Build cover.
        if 'cover' not in dont_build:
            book.cover.build_delay()
            book.cover_clean.build_delay()
            book.cover_thumb.build_delay()
            book.cover_api_thumb.build_delay()
            book.simple_cover.build_delay()
            book.cover_ebookpoint.build_delay()

        # Build HTML and ebooks.
        book.html_file.build_delay()
        # Formats in EBOOK_FORMATS_WITHOUT_CHILDREN are only built for books that have no parts.
        if not children:
            for format_ in constants.EBOOK_FORMATS_WITHOUT_CHILDREN:
                if format_ not in dont_build:
                    getattr(book, '%s_file' % format_).build_delay()
        for format_ in constants.EBOOK_FORMATS_WITH_CHILDREN:
            if format_ not in dont_build:
                getattr(book, '%s_file' % format_).build_delay()
        book.html_nonotes_file.build_delay()

        if not settings.NO_SEARCH_INDEX and search_index and findable:
            tasks.index_book.delay(book.id)

        for child in notify_cover_changed:
            child.parent_cover_changed()

        book.update_popularity()
        tasks.update_references.delay(book.id)

        cls.published.send(sender=cls, instance=book)
        return book
 822
 823     def update_stats(self):
 824         stats = self.wldocument2().get_statistics()['total']
 825         self.pages = round(
 826             stats.get('verses_with_fn', 0) / 30 +
 827             stats.get('chars_out_verse_with_fn', 0) / 1800)
 828         self.read_time = round(self.get_time())
 829         self.save(update_fields=['pages', 'read_time'])
 830         if self.parent is not None:
 831             self.parent.update_stats()
 832
 833     def update_references(self):
 834         Entity = apps.get_model('references', 'Entity')
 835         doc = self.wldocument2()
 836         doc._compat_assign_section_ids()
 837         doc._compat_assign_ordered_ids()
 838         refs = {}
 839         for ref_elem in doc.references():
 840             uri = ref_elem.attrib.get('href', '')
 841             if not uri:
 842                 continue
 843             if uri in refs:
 844                 ref = refs[uri]
 845             else:
 846                 entity, entity_created = Entity.objects.get_or_create(uri=uri)
 847                 if entity_created:
 848                     try:
 849                         entity.populate()
 850                     except:
 851                         pass
 852                     else:
 853                         entity.save()
 854                 ref, ref_created = entity.reference_set.get_or_create(book=self)
 855                 refs[uri] = ref
 856                 if not ref_created:
 857                     ref.occurence_set.all().delete()
 858             sec = ref_elem.get_link()
 859             m = re.match(r'sec(\d+)', sec)
 860             assert m is not None
 861             sec = int(m.group(1))
 862             snippet = ref_elem.get_snippet()
 863             b = builders['html-snippet']()
 864             for s in snippet:
 865                 s.html_build(b)
 866             html = b.output().get_bytes().decode('utf-8')
 867
 868             ref.occurence_set.create(
 869                 section=sec,
 870                 html=html
 871             )
 872         self.reference_set.exclude(entity__uri__in=refs).delete()
 873
 874     @property
 875     def references(self):
 876         return self.reference_set.all().select_related('entity')
 877
 878     def update_has_audio(self):
 879         self.has_audio = False
 880         if self.media.filter(type='mp3').exists():
 881             self.has_audio = True
 882         if self.descendant.filter(has_audio=True).exists():
 883             self.has_audio = True
 884         self.save(update_fields=['has_audio'])
 885         if self.parent is not None:
 886             self.parent.update_has_audio()
 887
 888     def update_narrators(self):
 889         narrator_names = set()
 890         for bm in self.media.filter(type='mp3'):
 891             narrator_names.update(set(
 892                 a.strip() for a in re.split(r',|\si\s', bm.artist)
 893             ))
 894         narrators = []
 895
 896         for name in narrator_names:
 897             if not name: continue
 898             slug = slugify(name)
 899             try:
 900                 t = Tag.objects.get(category='author', slug=slug)
 901             except Tag.DoesNotExist:
 902                 sort_key = sortify(
 903                     ' '.join(name.rsplit(' ', 1)[::-1]).lower()
 904                 )
 905                 t = Tag.objects.create(
 906                     category='author',
 907                     name_pl=name,
 908                     slug=slug,
 909                     sort_key=sort_key,
 910                 )
 911             narrators.append(t)
 912         self.narrators.set(narrators)
 913
 914     def update_can_sell_mp3(self):
 915         ret = True
 916         for child in self.get_children():
 917             child.update_can_sell_mp3()
 918             if not child.can_sell_mp3:
 919                 ret = False
 920         if self.has_mp3_file():
 921             audio_items = requests.get(f'https://audio.wolnelektury.pl/archive/book/{self.slug}.json').json()['items']
 922             if not all(x['project']['can_sell'] for x in audio_items):
 923                 ret = False
 924         self.can_sell_mp3 = ret
 925         self.save(update_fields=['can_sell_mp3'])
 926
 927     @classmethod
 928     @transaction.atomic
 929     def repopulate_ancestors(cls):
 930         """Fixes the ancestry cache."""
 931         # TODO: table names
 932         cursor = connection.cursor()
 933         if connection.vendor == 'postgres':
 934             cursor.execute("TRUNCATE catalogue_book_ancestor")
 935             cursor.execute("""
 936                 WITH RECURSIVE ancestry AS (
 937                     SELECT book.id, book.parent_id
 938                     FROM catalogue_book AS book
 939                     WHERE book.parent_id IS NOT NULL
 940                     UNION
 941                     SELECT ancestor.id, book.parent_id
 942                     FROM ancestry AS ancestor, catalogue_book AS book
 943                     WHERE ancestor.parent_id = book.id
 944                         AND book.parent_id IS NOT NULL
 945                     )
 946                 INSERT INTO catalogue_book_ancestor
 947                     (from_book_id, to_book_id)
 948                     SELECT id, parent_id
 949                     FROM ancestry
 950                     ORDER BY id;
 951                 """)
 952         else:
 953             cursor.execute("DELETE FROM catalogue_book_ancestor")
 954             for b in cls.objects.exclude(parent=None):
 955                 parent = b.parent
 956                 while parent is not None:
 957                     b.ancestor.add(parent)
 958                     parent = parent.parent
 959
 960     @property
 961     def ancestors(self):
 962         if self.parent:
 963             for anc in self.parent.ancestors:
 964                 yield anc
 965             yield self.parent
 966         else:
 967             return []
 968
 969     def clear_cache(self):
 970         clear_cached_renders(self.mini_box)
 971         clear_cached_renders(self.mini_box_nolink)
 972
 973     def cover_info(self, inherit=True):
 974         """Returns a dictionary to serve as fallback for BookInfo.
 975
 976         For now, the only thing inherited is the cover image.
 977         """
 978         need = False
 979         info = {}
 980         for field in ('cover_url', 'cover_by', 'cover_source'):
 981             val = self.get_extra_info_json().get(field)
 982             if val:
 983                 info[field] = val
 984             else:
 985                 need = True
 986         if inherit and need and self.parent is not None:
 987             parent_info = self.parent.cover_info()
 988             parent_info.update(info)
 989             info = parent_info
 990         return info
 991
 992     def related_themes(self):
 993         return Tag.objects.usage_for_queryset(
 994             Fragment.objects.filter(models.Q(book=self) | models.Q(book__ancestor=self)),
 995             counts=True).filter(category='theme').order_by('-count')
 996
 997     def parent_cover_changed(self):
 998         """Called when parent book's cover image is changed."""
 999         if not self.cover_info(inherit=False):
1000             if 'cover' not in app_settings.DONT_BUILD:
1001                 self.cover.build_delay()
1002                 self.cover_clean.build_delay()
1003                 self.cover_thumb.build_delay()
1004                 self.cover_api_thumb.build_delay()
1005                 self.simple_cover.build_delay()
1006                 self.cover_ebookpoint.build_delay()
1007             for format_ in constants.EBOOK_FORMATS_WITH_COVERS:
1008                 if format_ not in app_settings.DONT_BUILD:
1009                     getattr(self, '%s_file' % format_).build_delay()
1010             for child in self.children.all():
1011                 child.parent_cover_changed()
1012
1013     def other_versions(self):
1014         """Find other versions (i.e. in other languages) of the book."""
1015         return type(self).objects.filter(common_slug=self.common_slug, findable=True).exclude(pk=self.pk)
1016
1017     def parents(self):
1018         books = []
1019         parent = self.parent
1020         while parent is not None:
1021             books.insert(0, parent)
1022             parent = parent.parent
1023         return books
1024
1025     def pretty_title(self, html_links=False):
1026         names = [(tag.name, tag.get_absolute_url()) for tag in self.authors().only('name', 'category', 'slug')]
1027         books = self.parents() + [self]
1028         names.extend([(b.title, b.get_absolute_url()) for b in books])
1029
1030         if html_links:
1031             names = ['<a href="%s">%s</a>' % (tag[1], tag[0]) for tag in names]
1032         else:
1033             names = [tag[0] for tag in names]
1034         return ', '.join(names)
1035
1036     def publisher(self):
1037         publisher = self.get_extra_info_json()['publisher']
1038         if isinstance(publisher, str):
1039             return publisher
1040         elif isinstance(publisher, list):
1041             return ', '.join(publisher)
1042
1043     def get_recommended(self, limit=4):
1044         books_qs = type(self).objects.filter(findable=True)
1045         books_qs = books_qs.exclude(common_slug=self.common_slug).exclude(ancestor=self)
1046         books = type(self).tagged.related_to(self, books_qs)[:limit]
1047         return books
1048
1049     @classmethod
1050     def tagged_top_level(cls, tags):
1051         """ Returns top-level books tagged with `tags`.
1052
1053         It only returns those books which don't have ancestors which are
1054         also tagged with those tags.
1055
1056         """
1057         objects = cls.tagged.with_all(tags)
1058         return objects.filter(findable=True).exclude(ancestor__in=objects)
1059
1060     @classmethod
1061     def book_list(cls, book_filter=None):
1062         """Generates a hierarchical listing of all books.
1063
1064         Books are optionally filtered with a test function.
1065
1066         """
1067
1068         books_by_parent = {}
1069         books = cls.objects.filter(findable=True).order_by('parent_number', 'sort_key').only('title', 'parent', 'slug', 'extra_info')
1070         if book_filter:
1071             books = books.filter(book_filter).distinct()
1072
1073             book_ids = set(b['pk'] for b in books.values("pk").iterator())
1074             for book in books.iterator():
1075                 parent = book.parent_id
1076                 if parent not in book_ids:
1077                     parent = None
1078                 books_by_parent.setdefault(parent, []).append(book)
1079         else:
1080             for book in books.iterator():
1081                 books_by_parent.setdefault(book.parent_id, []).append(book)
1082
1083         orphans = []
1084         books_by_author = OrderedDict()
1085         for tag in Tag.objects.filter(category='author').iterator():
1086             books_by_author[tag] = []
1087
1088         for book in books_by_parent.get(None, ()):
1089             authors = list(book.authors().only('pk'))
1090             if authors:
1091                 for author in authors:
1092                     books_by_author[author].append(book)
1093             else:
1094                 orphans.append(book)
1095
1096         return books_by_author, orphans, books_by_parent
1097
    # Maps audience codes from the book's extra info to (sort rank, Polish label)
    # pairs.  audiences_pl() sorts by these pairs and returns only the labels, so
    # several codes sharing one pair collapse into a single entry.
    _audiences_pl = {
        "SP": (1, "szkoła podstawowa"),
        "SP1": (1, "szkoła podstawowa"),
        "SP2": (1, "szkoła podstawowa"),
        "SP3": (1, "szkoła podstawowa"),
        "P": (1, "szkoła podstawowa"),
        "G": (2, "gimnazjum"),
        "L": (3, "liceum"),
        "LP": (3, "liceum"),
    }
1108
1109     def audiences_pl(self):
1110         audiences = self.get_extra_info_json().get('audiences', [])
1111         audiences = sorted(set([self._audiences_pl.get(a, (99, a)) for a in audiences]))
1112         return [a[1] for a in audiences]
1113
1114     def stage_note(self):
1115         stage = self.get_extra_info_json().get('stage')
1116         if stage and stage < '0.4':
1117             return (_('Ten utwór wymaga uwspółcześnienia'),
1118                     reverse('infopage', args=['wymagajace-uwspolczesnienia']))
1119         else:
1120             return None, None
1121
1122     def choose_fragments(self, number):
1123         fragments = self.fragments.order_by()
1124         fragments_count = fragments.count()
1125         if not fragments_count and self.children.exists():
1126             fragments = Fragment.objects.filter(book__ancestor=self).order_by()
1127             fragments_count = fragments.count()
1128         if fragments_count:
1129             if fragments_count > number:
1130                 offset = randint(0, fragments_count - number)
1131             else:
1132                 offset = 0
1133             return fragments[offset : offset + number]
1134         elif self.parent:
1135             return self.parent.choose_fragments(number)
1136         else:
1137             return []
1138
1139     def choose_fragment(self):
1140         fragments = self.choose_fragments(1)
1141         if fragments:
1142             return fragments[0]
1143         else:
1144             return None
1145
1146     def fragment_data(self):
1147         fragment = self.choose_fragment()
1148         if fragment:
1149             return {
1150                 'title': fragment.book.pretty_title(),
1151                 'html': re.sub('</?blockquote[^>]*>', '', fragment.get_short_text()),
1152             }
1153         else:
1154             return None
1155
1156     def update_popularity(self):
1157         count = self.userlistitem_set.values('list__user').order_by('list__user').distinct().count()
1158         try:
1159             pop = self.popularity
1160             pop.count = count
1161             pop.save()
1162         except BookPopularity.DoesNotExist:
1163             BookPopularity.objects.create(book=self, count=count)
1164
1165     def ridero_link(self):
1166         return 'https://ridero.eu/%s/books/wl_%s/' % (get_language(), self.slug.replace('-', '_'))
1167
1168     def elevenreader_link(self):
1169         first_text = self.get_first_text()
1170         if first_text is None:
1171             return None
1172         return 'https://elevenreader.io/audiobooks/wolnelektury:' + first_text.slug
1173
1174     def content_warnings(self):
1175         warnings_def = {
1176             'wulgaryzmy': _('wulgaryzmy'),
1177         }
1178         warnings = self.get_extra_info_json().get('content_warnings', [])
1179         warnings = [
1180             warnings_def.get(w, w)
1181             for w in warnings
1182         ]
1183         warnings.sort()
1184         return warnings
1185
1186     def full_sort_key(self):
1187         return self.SORT_KEY_SEP.join((self.sort_key_author, self.sort_key, str(self.id)))
1188
1189     def cover_color(self):
1190         return WLCover.epoch_colors.get(self.get_extra_info_json().get('epoch'), '#000000')
1191
1192     @cached_render('catalogue/book_mini_box.html')
1193     def mini_box(self):
1194         return {
1195             'book': self
1196         }
1197
1198     @cached_render('catalogue/book_mini_box.html')
1199     def mini_box_nolink(self):
1200         return {
1201             'book': self,
1202             'no_link': True,
1203         }
1204
1205
class BookPopularity(models.Model):
    """Popularity counter of a single book, reachable as ``book.popularity``."""
    # One record per book; removed together with the book (CASCADE).
    book = models.OneToOneField(Book, models.CASCADE, related_name='popularity')
    # Indexed counter; Book.update_popularity() stores here the number of
    # distinct users who have the book on one of their lists.
    count = models.IntegerField(default=0, db_index=True)