From: Radek Czajka Date: Fri, 17 Apr 2020 16:53:07 +0000 (+0200) Subject: Importing catalogue from WL dump. X-Git-Url: https://git.mdrn.pl/redakcja.git/commitdiff_plain/73da85af22d6ef3decdb1ffa5af819a9fbb19e32?ds=inline Importing catalogue from WL dump. --- diff --git a/src/catalogue/admin.py b/src/catalogue/admin.py index 95da4994..91656f13 100644 --- a/src/catalogue/admin.py +++ b/src/catalogue/admin.py @@ -4,60 +4,95 @@ from .wikidata import WikidataAdminMixin class AuthorAdmin(WikidataAdminMixin, admin.ModelAdmin): - list_display = ["first_name", "last_name", 'status', "year_of_death", "priority", "wikidata_link"] - list_filter = ['year_of_death', 'priority', 'collections', 'status'] + list_display = [ + "first_name", + "last_name", + "status", + "year_of_death", + "priority", + "wikidata_link", + "slug", + ] + list_filter = ["year_of_death", "priority", "collections", "status"] search_fields = ["first_name", "last_name", "wikidata"] prepopulated_fields = {"slug": ("first_name", "last_name")} - autocomplete_fields = ['collections'] + autocomplete_fields = ["collections"] admin.site.register(models.Author, AuthorAdmin) class BookAdmin(WikidataAdminMixin, admin.ModelAdmin): - list_display = ["title", 'authors_str', 'translators_str', 'language', 'pd_year', 'priority', 'wikidata_link'] - search_fields = ["title", 'wikidata'] - autocomplete_fields = ["authors", "translators", "based_on", 'collections'] + list_display = [ + "title", + "authors_str", + "translators_str", + "language", + "pd_year", + "priority", + "wikidata_link", + ] + search_fields = ["title", "wikidata"] + autocomplete_fields = ["authors", "translators", "based_on", "collections"] prepopulated_fields = {"slug": ("title",)} - list_filter = ['language', 'pd_year', 'collections'] - readonly_fields = ['wikidata_link'] + list_filter = ["language", "pd_year", "collections"] + readonly_fields = ["wikidata_link"] fieldsets = [ - (None, {'fields': [ - ('wikidata', 'wikidata_link'), - ]}), - 
('Identification', {'fields': [ - 'title', 'slug', 'authors', 'translators', 'language', - 'based_on', - 'pd_year', - ]}), - ('Plan', {'fields': [ - 'scans_source', - 'text_source', - 'priority', - 'collections', - 'notes', - ]}), + (None, {"fields": [("wikidata", "wikidata_link")]}), + ( + "Identification", + { + "fields": [ + "title", + "slug", + "authors", + "translators", + "language", + "based_on", + "pd_year", + ] + }, + ), + ( + "Plan", + { + "fields": [ + "scans_source", + "text_source", + "priority", + "collections", + "notes", + ] + }, + ), ] + def get_queryset(self, request): + qs = super().get_queryset(request) + if request.resolver_match.view_name.endswith("changelist"): + qs = qs.prefetch_related("authors", "translators") + return qs + admin.site.register(models.Book, BookAdmin) class AuthorInline(admin.TabularInline): model = models.Author.collections.through - autocomplete_fields = ['author'] + autocomplete_fields = ["author"] class BookInline(admin.TabularInline): model = models.Book.collections.through - autocomplete_fields = ['book'] + autocomplete_fields = ["book"] class CollectionAdmin(admin.ModelAdmin): - list_display = ['name'] + list_display = ["name"] autocomplete_fields = [] - prepopulated_fields = {'slug': ('name',)} - search_fields = ['name'] + prepopulated_fields = {"slug": ("name",)} + search_fields = ["name"] inlines = [AuthorInline, BookInline] + admin.site.register(models.Collection, CollectionAdmin) diff --git a/src/catalogue/management/commands/import_catalogue_from_wl_dump.py b/src/catalogue/management/commands/import_catalogue_from_wl_dump.py index ea3f8054..2ce39f06 100644 --- a/src/catalogue/management/commands/import_catalogue_from_wl_dump.py +++ b/src/catalogue/management/commands/import_catalogue_from_wl_dump.py @@ -1,4 +1,5 @@ import json +from urllib.request import urlopen import sys from django.core.management import BaseCommand from slugify import slugify @@ -7,54 +8,177 @@ from catalogue.models import Book, Author def 
parse_name(name): - name_pieces = name.rsplit(' ', 1) + name_pieces = name.rsplit(" ", 1) if len(name_pieces) == 1: - return name_pieces[0], '' + return name_pieces[0], "" else: return name_pieces +def find_wikidata(link, lang): + link = link.rstrip() + title = link.rsplit("/", 1)[-1] + title = title.split("#", 1)[0]  # drop the #fragment from the page title, not from the full URL + title = title.replace(" ", "_") + data = json.load( + urlopen( + f"https://www.wikidata.org/w/api.php?action=wbgetentities&sites={lang}wiki&titles={title}&format=json" + ) + ) + wikidata_id = list(data["entities"].keys())[0] + if not wikidata_id.startswith("Q"): + return None + return wikidata_id + class Command(BaseCommand): def add_arguments(self, parser): - parser.add_argument('path') + parser.add_argument("path") def handle(self, path, **kwargs): with open(path) as f: data = json.load(f) - for item in data: - if item['model'] == 'pdcounter.bookstub': - notes = [] - slug = item['fields']['slug'] - book, created = Book.objects.get_or_create(slug=slug) - if item['fields']['translator'] and not book.translators.exists(): - notes.append('tłum.: ' + item['fields']['translator']) - book.title = book.title or item['fields']['title'] - book.pd_year = book.pd_year or item['fields']['pd'] - notes = '\n'.join(notes) - if notes and notes not in book.notes: - book.notes = '\n'.join([notes, book.notes]) - book.save() - - if not book.authors.exists(): - first_name, last_name = parse_name(item['fields']['author']) - author, created = Author.objects.get_or_create(first_name=first_name, last_name=last_name) - if not author.slug: - author.slug = slugify(author_name) + + for pass_n in (1, 2): + for item in data: + if item["model"] == "pdcounter.bookstub": + if pass_n != 2: + continue + notes = [] + print(item["fields"]["author"], item["fields"]["title"]) + slug = item["fields"]["slug"] + book, created = Book.objects.get_or_create(slug=slug) + if item["fields"]["translator"] and not book.translators.exists(): + notes.append("tłum.: " + item["fields"]["translator"]) + 
book.title = book.title or item["fields"]["title"] + book.pd_year = book.pd_year or item["fields"]["pd"] + notes = "\n".join(notes) + if notes and notes not in book.notes: + book.notes = "\n".join([notes, book.notes]) + book.save() + + if not book.authors.exists(): + first_name, last_name = parse_name(item["fields"]["author"]) + author_slug = slugify(item["fields"]["author"]) + author = ( + Author.objects.filter(slug=author_slug).first() + or Author.objects.filter( + first_name=first_name, last_name=last_name + ).first() + or Author() + ) + author.slug = author.slug or author_slug + author.first_name = author.first_name or first_name + author.last_name = author.last_name or last_name + author.save() + book.authors.set([author]) + elif item["model"] == "pdcounter.author": + if pass_n != 1: + continue + slug = item["fields"]["slug"] + author, created = Author.objects.get_or_create(slug=slug) + if not author.first_name and not author.last_name: + author.first_name, author.last_name = parse_name( + item["fields"]["name"] + ) + author.year_of_death = ( + author.year_of_death or item["fields"]["death"] + ) + author.notes = author.notes or item["fields"]["description"] + author.gazeta_link = ( + author.gazeta_link or item["fields"]["gazeta_link"] + ) author.save() - book.authors.set([author]) - elif item['model'] == 'pdcounter.author': - slug = item['fields']['slug'] - author, created = Author.objects.get_or_create(slug=slug) - if not author.first_name and not author.last_name: - author.first_name, author.last_name = parse_name(item['fields']['name']) - author.year_of_death = author.year_of_death or item['fields']['death'] - author.notes = author.notes or item['fields']['description'] - author.gazeta_link = author.gazeta_link or item['fields']['gazeta_link'] + wiki_link = item["fields"]["wiki_link"] + assert not wiki_link # Welp + elif item["model"] == "catalogue.book": + if pass_n != 2: + continue + if item["fields"]["parent"]: + continue + print(item["fields"]["slug"]) 
+ slug = item["fields"]["slug"] + book, created = Book.objects.get_or_create(slug=slug) + book.title = book.title or item["fields"]["title"] + book.language = book.language or item["fields"]["language"] + book.gazeta_link = book.gazeta_link or item["fields"]["gazeta_link"] + if item["fields"]["wiki_link"]: + book.wikidata = ( + book.wikidata + or find_wikidata(item["fields"]["wiki_link"], "pl") + or "" + ) + + extra_info = json.loads(item["fields"]["extra_info"]) + if book.pd_year is None and extra_info.get( + "released_to_public_domain_at" + ): + book.pd_year = int( + extra_info["released_to_public_domain_at"].split("-", 1)[0] + ) + + book.save() + + if not book.authors.exists(): + authors = [] + for astr in extra_info.get("authors", []): + parts = astr.split(", ", 1)  # maxsplit: a second comma must not break the two-name unpacking below + if len(parts) == 1: + first_name = parts[0] + last_name = "" + else: + last_name, first_name = parts + aslug = slugify(f"{first_name} {last_name}".strip()) + author = ( + Author.objects.filter(slug=aslug).first() + or Author.objects.filter( + first_name=first_name, last_name=last_name + ).first() + or Author.objects.filter(name_de=astr).first() + or Author.objects.filter(name_lt=astr).first() + ) + # Not trying to create the author or set properties, because here we don't know the dc:creator@xml:lang property. 
+ if author is not None: + authors.append(author) + book.authors.set(authors) + elif item["model"] == "catalogue.tag": + if pass_n != 1: + continue + if item["fields"]["category"] != "author": + continue + slug = item["fields"]["slug"] + author, created = Author.objects.get_or_create(slug=slug) + author.name_de = author.name_de or item["fields"]["name_de"] or "" + author.name_lt = author.name_lt or item["fields"]["name_lt"] or "" + if not author.first_name and not author.last_name: + author.first_name, author.last_name = parse_name( + item["fields"]["name_pl"] + ) + author.culturepl_link = ( + author.culturepl_link or item["fields"]["culturepl_link"] or "" + ) + author.gazeta_link = ( + author.gazeta_link or item["fields"]["gazeta_link"] or "" + ) + author.description = ( + author.description or item["fields"]["description_pl"] or "" + ) + author.description_de = ( + author.description_de or item["fields"]["description_de"] or "" + ) + author.description_lt = ( + author.description_lt or item["fields"]["description_lt"] or "" + ) + + if not author.wikidata: + for field, value in item["fields"].items(): + if field.startswith("wiki_link_") and value: + wd = find_wikidata(value, field.rsplit("_", 1)[-1]) + if wd: + author.wikidata = wd + break author.save() - wiki_link = item['fields']['wiki_link'] - assert not wiki_link # Welp - else: - print(item) - break + else: + print(item) + break diff --git a/src/catalogue/migrations/0014_book_gazeta_link.py b/src/catalogue/migrations/0014_book_gazeta_link.py new file mode 100644 index 00000000..4b0a496f --- /dev/null +++ b/src/catalogue/migrations/0014_book_gazeta_link.py @@ -0,0 +1,18 @@ +# Generated by Django 3.0.4 on 2020-04-15 22:12 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('catalogue', '0013_auto_20200415_1755'), + ] + + operations = [ + migrations.AddField( + model_name='book', + name='gazeta_link', + field=models.CharField(blank=True, max_length=255), 
+ ), + ] diff --git a/src/catalogue/migrations/0015_auto_20200416_1120.py b/src/catalogue/migrations/0015_auto_20200416_1120.py new file mode 100644 index 00000000..996e5a6c --- /dev/null +++ b/src/catalogue/migrations/0015_auto_20200416_1120.py @@ -0,0 +1,23 @@ +# Generated by Django 3.0.4 on 2020-04-16 11:20 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('catalogue', '0014_book_gazeta_link'), + ] + + operations = [ + migrations.AddField( + model_name='author', + name='name_de', + field=models.CharField(blank=True, max_length=255), + ), + migrations.AddField( + model_name='author', + name='name_lt', + field=models.CharField(blank=True, max_length=255), + ), + ] diff --git a/src/catalogue/migrations/0016_auto_20200417_1421.py b/src/catalogue/migrations/0016_auto_20200417_1421.py new file mode 100644 index 00000000..ca4b7145 --- /dev/null +++ b/src/catalogue/migrations/0016_auto_20200417_1421.py @@ -0,0 +1,23 @@ +# Generated by Django 3.0.4 on 2020-04-17 14:21 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('catalogue', '0015_auto_20200416_1120'), + ] + + operations = [ + migrations.AddField( + model_name='author', + name='description_de', + field=models.TextField(blank=True), + ), + migrations.AddField( + model_name='author', + name='description_lt', + field=models.TextField(blank=True), + ), + ] diff --git a/src/catalogue/migrations/0017_auto_20200417_1638.py b/src/catalogue/migrations/0017_auto_20200417_1638.py new file mode 100644 index 00000000..c4a983a4 --- /dev/null +++ b/src/catalogue/migrations/0017_auto_20200417_1638.py @@ -0,0 +1,25 @@ +# Generated by Django 3.0.4 on 2020-04-17 16:38 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('catalogue', '0016_auto_20200417_1421'), + ] + + operations = [ + migrations.AlterField( + model_name='author', + name='wikidata', + 
field=models.CharField(blank=True, default='', help_text='If you have a Wikidata ID, like "Q1337", enter it and save.', max_length=255), + preserve_default=False, + ), + migrations.AlterField( + model_name='book', + name='wikidata', + field=models.CharField(blank=True, default='', help_text='If you have a Wikidata ID, like "Q1337", enter it and save.', max_length=255), + preserve_default=False, + ), + ] diff --git a/src/catalogue/models.py b/src/catalogue/models.py index e41ba35e..b45cefed 100644 --- a/src/catalogue/models.py +++ b/src/catalogue/models.py @@ -8,6 +8,10 @@ class Author(WikidataMixin, models.Model): slug = models.SlugField(null=True, blank=True, unique=True) first_name = models.CharField(max_length=255, blank=True) last_name = models.CharField(max_length=255, blank=True) + + name_de = models.CharField(max_length=255, blank=True) + name_lt = models.CharField(max_length=255, blank=True) + year_of_death = models.SmallIntegerField(null=True, blank=True) status = models.PositiveSmallIntegerField( null=True, @@ -22,14 +26,18 @@ class Author(WikidataMixin, models.Model): notes = models.TextField(blank=True) gazeta_link = models.CharField(max_length=255, blank=True) culturepl_link = models.CharField(max_length=255, blank=True) + description = models.TextField(blank=True) + description_de = models.TextField(blank=True) + description_lt = models.TextField(blank=True) + priority = models.PositiveSmallIntegerField( default=0, choices=[(0, _("Low")), (1, _("Medium")), (2, _("High"))] ) - collections = models.ManyToManyField('Collection', blank=True) + collections = models.ManyToManyField("Collection", blank=True) class Meta: - ordering = ('last_name', 'first_name', 'year_of_death') + ordering = ("last_name", "first_name", "year_of_death") class Wikidata: first_name = WIKIDATA.GIVEN_NAME @@ -62,10 +70,11 @@ class Book(WikidataMixin, models.Model): default=0, choices=[(0, _("Low")), (1, _("Medium")), (2, _("High"))] ) pd_year = models.IntegerField(null=True, 
blank=True) - collections = models.ManyToManyField('Collection', blank=True) + gazeta_link = models.CharField(max_length=255, blank=True) + collections = models.ManyToManyField("Collection", blank=True) class Meta: - ordering = ('title',) + ordering = ("title",) class Wikidata: authors = WIKIDATA.AUTHOR @@ -79,17 +88,17 @@ class Book(WikidataMixin, models.Model): txt = self.title astr = self.authors_str() if astr: - txt = f'{astr} – {txt}' + txt = f"{astr} – {txt}" tstr = self.translators_str() if tstr: - txt = f'{txt} (tłum. {tstr})' + txt = f"{txt} (tłum. {tstr})" return txt def authors_str(self): - return ', '.join(str(author) for author in self.authors.all()) + return ", ".join(str(author) for author in self.authors.all()) def translators_str(self): - return ', '.join(str(author) for author in self.translators.all()) + return ", ".join(str(author) for author in self.translators.all()) class Collection(models.Model): diff --git a/src/catalogue/wikidata.py b/src/catalogue/wikidata.py index c6885705..88686b6e 100644 --- a/src/catalogue/wikidata.py +++ b/src/catalogue/wikidata.py @@ -11,9 +11,7 @@ from wikidata.datavalue import DatavalueError class WikidataMixin(models.Model): wikidata = models.CharField( max_length=255, - null=True, blank=True, - unique=True, help_text=_('If you have a Wikidata ID, like "Q1337", enter it and save.'), ) @@ -89,7 +87,11 @@ class WikidataAdminMixin: def wikidata_link(self, obj): if obj.wikidata: - return format_html('{wd}', wd=obj.wikidata) + return format_html( + '{wd}', + wd=obj.wikidata, + ) else: - return '' - wikidata_link.admin_order_field = 'wikidata' + return "" + + wikidata_link.admin_order_field = "wikidata"