Enhancements for wikidata imports.
[redakcja.git] / src / wlxml / models.py
1 from io import BytesIO
2 from django.apps import apps
3 from django.core.files import File
4 from django.db import models
5 from django.urls import reverse
6 from django.utils.translation import gettext_lazy as _
7 from librarian import DocProvider
8 from librarian.parser import WLDocument as LegacyWLDocument
9 from librarian.builders import StandaloneHtmlBuilder, TxtBuilder
10 from librarian.document import WLDocument
11
12
class Tag(models.Model):
    """A WLXML tag together with its description and rendered examples.

    Every call to ``save()`` wraps the ``example`` snippet in a minimal
    document and stores its HTML, plain-text and PDF renderings in the
    ``example_html``, ``example_txt`` and ``example_pdf`` file fields.
    """

    name = models.CharField(max_length=255, unique=True, db_index=True)
    type = models.CharField(
        max_length=255,
        choices=[
            ('section', _('Section, contains blocks')),
            ('div', _('Block element, like a paragraph')),
            ('span', _('Inline element, like an emphasis')),
            ('sep', _('Separator, has no content')),
            ('aside', _('Aside content, like a footnote')),
            ('verse', _('Verse element')),
        ],
        blank=True,
    )
    similar_to = models.ForeignKey(
        'self', on_delete=models.PROTECT, null=True, blank=True)
    description = models.TextField(blank=True)
    example = models.TextField(blank=True)

    # Renderings of ``example``; regenerated by save().
    example_html = models.FileField(
        upload_to='wlxml/tag/example/html/', blank=True)
    example_pdf = models.FileField(
        upload_to='wlxml/tag/example/pdf/', blank=True)
    example_txt = models.FileField(
        upload_to='wlxml/tag/example/txt/', blank=True)

    # Open question: border_radius?
    editor_css = models.TextField(blank=True)
    editor_css_after = models.TextField(blank=True)

    class Meta:
        verbose_name = _('tag')
        verbose_name_plural = _('tags')

    def __str__(self):
        return self.name

    def get_absolute_url(self):
        """Return the URL of the 'wlxml_tag' view for this tag's name."""
        url_args = [self.name]
        return reverse('wlxml_tag', args=url_args)

    # Open question: allowed tags?

    def save(self, **kwargs):
        """Re-render the example files, then save the model.

        The file fields are written with ``save=False`` so that the only
        model save is the final ``super().save()`` call.
        """
        # Minimal document header; the example is placed inside <opowiadanie>.
        header = b'''<utwor>

<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<rdf:Description rdf:about="http://redakcja.wolnelektury.pl/documents/book/brudnopis/">

<dc:language xml:lang="pl" xmlns:dc="http://purl.org/dc/elements/1.1/">pol</dc:language>
<dc:creator xml:lang="pl" xmlns:dc="http://purl.org/dc/elements/1.1/">test</dc:creator>
<dc:title xml:lang="pl" xmlns:dc="http://purl.org/dc/elements/1.1/">test</dc:title>
<dc:date xml:lang="pl" xmlns:dc="http://purl.org/dc/elements/1.1/">test</dc:date>
<dc:publisher xml:lang="pl" xmlns:dc="http://purl.org/dc/elements/1.1/">test</dc:publisher>
<dc:identifier.url xml:lang="pl" xmlns:dc="http://purl.org/dc/elements/1.1/">test</dc:identifier.url>
<dc:rights xml:lang="pl" xmlns:dc="http://purl.org/dc/elements/1.1/">test</dc:rights>

</rdf:Description>
</rdf:RDF>

<opowiadanie>'''
        footer = b'</opowiadanie></utwor>'
        docbytes = b''.join([header, self.example.encode('utf-8'), footer])

        # HTML and plain text come from the builder-based pipeline.
        doc = WLDocument(filename=BytesIO(docbytes))
        builder_outputs = (
            ('example_html', '.html', StandaloneHtmlBuilder),
            ('example_txt', '.txt', TxtBuilder),
        )
        for field_name, suffix, builder_class in builder_outputs:
            target = getattr(self, field_name)
            rendered = builder_class().build(doc).get_file()
            target.save(self.name + suffix, File(rendered), save=False)

        # The PDF is produced through the legacy document class.
        legacy_doc = LegacyWLDocument.from_bytes(
            docbytes, provider=DocProvider())
        pdf_file = legacy_doc.as_pdf().get_file()
        self.example_pdf.save(self.name + '.pdf', File(pdf_file), save=False)

        super().save(**kwargs)
89
90     
91
class Attribute(models.Model):
    """A named attribute belonging to a tag."""

    tag = models.ForeignKey(Tag, models.CASCADE)
    name = models.CharField(max_length=255)

    class Meta:
        verbose_name = _('attribute')
        verbose_name_plural = _('attributes')
        # An attribute name may appear at most once per tag.
        unique_together = [
            ('tag', 'name'),
        ]

    def __str__(self):
        return self.name
106
107     
class TagUsage(models.Model):
    """Records that a tag occurs in a particular document chunk."""

    tag = models.ForeignKey(Tag, models.CASCADE)
    chunk = models.ForeignKey('documents.Chunk', models.CASCADE)

    class Meta:
        verbose_name = _('tag usage')
        verbose_name_plural = _('tags usage')

    def __str__(self):
        return f'{self.tag.name} @ {self.chunk.slug}'

    @classmethod
    def update_chunk(cls, chunk):
        """Synchronise tag and attribute usage records with a chunk's content.

        Parses the materialized chunk, removes usages of tags that no longer
        occur in it, creates usages (and ``Tag`` / ``Attribute`` rows) for
        tags and attributes that do, and brings the stored
        ``(attribute name, value)`` pairs of every tag usage in line with the
        pairs found in the document.

        Args:
            chunk: the chunk to index; must provide ``materialize()``
                returning the chunk text as ``str``.
        """
        tag_names = set()
        attribute_items = {}
        # NOTE(review): elsewhere in this module ``from_bytes`` is called on
        # the legacy parser class; confirm that this WLDocument class also
        # provides ``from_bytes`` and ``edoc``.
        doc = WLDocument.from_bytes(chunk.materialize().encode('utf-8'))
        for element in doc.edoc.iter():
            # Presumably an lxml tree: comments and processing instructions
            # are then reported with a non-string tag, which cannot be
            # stored as a tag name.
            if not isinstance(element.tag, str):
                continue
            tag_names.add(element.tag)
            for k, v in element.attrib.items():
                attribute_items.setdefault(element.tag, set()).add((k, v))

        cls.objects.filter(chunk=chunk).exclude(tag__name__in=tag_names).delete()
        for tag_name in tag_names:
            tag = Tag.objects.get_or_create(name=tag_name)[0]
            tu = cls.objects.get_or_create(tag=tag, chunk=chunk)[0]

            # (name, value) pairs that still need a usage row once the
            # existing rows have been accounted for.
            new_attributes = attribute_items.get(tag_name, set())

            for attr in tu.attributeusage_set.all():
                key = (attr.attribute.name, attr.value)
                if key not in new_attributes:
                    # Stored pair no longer occurs in the document.
                    attr.delete()
                else:
                    # Already stored; nothing to create for this pair.
                    new_attributes.discard(key)

            for k, v in new_attributes:
                attribute = tag.attribute_set.get_or_create(name=k)[0]
                tu.attributeusage_set.create(attribute=attribute, value=v)

    @classmethod
    def update_all_chunks(cls):
        """Run ``update_chunk`` for every ``documents.Chunk`` in the database."""
        # Looked up through the app registry rather than imported directly.
        Chunk = apps.get_model('documents', 'Chunk')
        for chunk in Chunk.objects.all():
            cls.update_chunk(chunk)
154
155
class AttributeUsage(models.Model):
    """A value recorded for an attribute within a single tag usage."""

    tag_usage = models.ForeignKey(TagUsage, on_delete=models.CASCADE)
    attribute = models.ForeignKey(Attribute, on_delete=models.CASCADE)
    value = models.CharField(blank=True, max_length=2048)

    class Meta:
        verbose_name_plural = _('attributes usage')
        verbose_name = _('attribute usage')
164