Importing catalogue from WL dump.
[redakcja.git] / src / catalogue / management / commands / import_catalogue_from_wl_dump.py
1 import json
2 from urllib.request import urlopen
3 import sys
4 from django.core.management import BaseCommand
5 from slugify import slugify
6 import wikidata
7 from catalogue.models import Book, Author
8
9
def parse_name(name):
    """Split a full name into ``(first_name, last_name)`` at the last space.

    Everything before the last space is treated as the first name(s) and the
    remainder as the last name. A name without any space is returned whole as
    the first name, with an empty last name.

    Always returns a 2-tuple of strings, so the result has the same type
    whether or not the name contained a space.
    """
    first_name, separator, last_name = name.rpartition(" ")
    if not separator:
        # No space at all: rpartition() put the whole name in the last slot.
        return name, ""
    return first_name, last_name
16
17
def find_wikidata(link, lang):
    """Look up the Wikidata entity ID for a Wikipedia article link.

    The page title is taken from the last path segment of ``link`` (with any
    ``#fragment`` removed and spaces turned into underscores) and resolved
    through the Wikidata ``wbgetentities`` API against the ``{lang}wiki``
    site.

    Args:
        link: URL of the Wikipedia article.
        lang: Language code of the Wikipedia edition, e.g. ``"pl"``.

    Returns:
        The entity ID (a string starting with ``"Q"``), or ``None`` when the
        first key of the returned ``entities`` map is not such an ID.

    Raises:
        KeyError: If the API response has no ``"entities"`` member.
        urllib.error.URLError: If the HTTP request fails.
    """
    # Local import: only this function needs these helpers.
    from urllib.parse import quote, unquote

    title = link.rstrip().rsplit("/", 1)[-1]
    # Strip the fragment from the *title*; splitting `link` here again would
    # throw away the path stripping above and query with the whole URL.
    title = title.split("#", 1)[0]
    title = title.replace(" ", "_")
    # Normalise to exactly one level of percent-encoding so that titles given
    # either raw (non-ASCII, "&", "|") or already encoded form a valid query.
    title = quote(unquote(title), safe="")
    url = (
        "https://www.wikidata.org/w/api.php"
        f"?action=wbgetentities&sites={lang}wiki&titles={title}&format=json"
    )
    with urlopen(url) as response:
        data = json.load(response)
    # Only the first key is inspected; an empty map counts as "not found".
    wikidata_id = next(iter(data["entities"]), "")
    if not wikidata_id.startswith("Q"):
        return None
    return wikidata_id
32
33
class Command(BaseCommand):
    """Import authors and books from a JSON dump into the catalogue models.

    The dump is a JSON list of items, each with a ``"model"`` name and a
    ``"fields"`` mapping. Existing ``Author``/``Book`` rows are matched by
    slug, and fields that already hold a value are not overwritten.
    """

    def add_arguments(self, parser):
        """Register the positional ``path`` argument: the dump file to read."""
        parser.add_argument("path")

    def handle(self, path, **kwargs):
        """Load the dump at ``path`` and import it in two passes.

        Pass 1 imports authors (``pdcounter.author`` and ``catalogue.tag``),
        pass 2 imports books (``pdcounter.bookstub`` and ``catalogue.book``),
        so authors are already present when books are linked to them.

        An item with any other model is printed and ends the current pass.
        """
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        # model name -> (pass in which it is imported, importer)
        handlers = {
            "pdcounter.bookstub": (2, self._import_book_stub),
            "pdcounter.author": (1, self._import_pd_author),
            "catalogue.book": (2, self._import_book),
            "catalogue.tag": (1, self._import_author_tag),
        }

        for pass_n in (1, 2):
            for item in data:
                handler = handlers.get(item["model"])
                if handler is None:
                    # Unknown model: show it and stop processing this pass.
                    print(item)
                    break
                handler_pass, import_item = handler
                if handler_pass == pass_n:
                    import_item(item["fields"])

    def _import_book_stub(self, fields):
        """Create or complete a Book from a ``pdcounter.bookstub`` item."""
        print(fields["author"], fields["title"])
        book, _ = Book.objects.get_or_create(slug=fields["slug"])

        # The translator is only recorded as a free-text note, and only when
        # the book has no translator relations yet.
        note = ""
        if fields["translator"] and not book.translators.exists():
            note = "tłum.: " + fields["translator"]
        book.title = book.title or fields["title"]
        book.pd_year = book.pd_year or fields["pd"]
        if note and note not in book.notes:
            book.notes = "\n".join([note, book.notes])
        book.save()

        if book.authors.exists():
            return
        first_name, last_name = parse_name(fields["author"])
        author_slug = slugify(fields["author"])
        # Reuse an author matching by slug, then by name; otherwise create one.
        author = (
            Author.objects.filter(slug=author_slug).first()
            or Author.objects.filter(
                first_name=first_name, last_name=last_name
            ).first()
            or Author()
        )
        author.slug = author.slug or author_slug
        author.first_name = author.first_name or first_name
        author.last_name = author.last_name or last_name
        author.save()
        book.authors.set([author])

    def _import_pd_author(self, fields):
        """Create or complete an Author from a ``pdcounter.author`` item.

        Nothing besides the get-or-create happens when the author already has
        a first or last name.
        """
        author, _ = Author.objects.get_or_create(slug=fields["slug"])
        if author.first_name or author.last_name:
            return
        author.first_name, author.last_name = parse_name(fields["name"])
        author.year_of_death = author.year_of_death or fields["death"]
        author.notes = author.notes or fields["description"]
        author.gazeta_link = author.gazeta_link or fields["gazeta_link"]
        author.save()
        # wiki_link is not imported for these items; the import stops with an
        # AssertionError if one turns up.
        wiki_link = fields["wiki_link"]
        assert not wiki_link  # Welp

    def _import_book(self, fields):
        """Create or complete a Book from a ``catalogue.book`` item.

        Items with a parent set are skipped.
        """
        if fields["parent"]:
            return
        print(fields["slug"])
        book, _ = Book.objects.get_or_create(slug=fields["slug"])
        book.title = book.title or fields["title"]
        book.language = book.language or fields["language"]
        book.gazeta_link = book.gazeta_link or fields["gazeta_link"]
        if fields["wiki_link"]:
            book.wikidata = (
                book.wikidata or find_wikidata(fields["wiki_link"], "pl") or ""
            )

        extra_info = json.loads(fields["extra_info"])
        released = extra_info.get("released_to_public_domain_at")
        if book.pd_year is None and released:
            # Keep only the part before the first "-" as the year.
            book.pd_year = int(released.split("-", 1)[0])

        book.save()

        if book.authors.exists():
            return
        authors = []
        for astr in extra_info.get("authors", []):
            # "Last, First" form. Split on the first ", " only, so a value
            # with further commas cannot break the two-way unpacking.
            parts = astr.split(", ", 1)
            if len(parts) == 1:
                first_name = parts[0]
                last_name = ""
            else:
                last_name, first_name = parts
            aslug = slugify(f"{first_name} {last_name}".strip())
            author = (
                Author.objects.filter(slug=aslug).first()
                or Author.objects.filter(
                    first_name=first_name, last_name=last_name
                ).first()
                or Author.objects.filter(name_de=astr).first()
                or Author.objects.filter(name_lt=astr).first()
            )
            # Not trying to create the author or set properties, because here
            # we don't know the dc:creator@xml:lang property.
            if author is not None:
                authors.append(author)
        book.authors.set(authors)

    def _import_author_tag(self, fields):
        """Create or complete an Author from a ``catalogue.tag`` item.

        Tags whose category is not ``"author"`` are ignored.
        """
        if fields["category"] != "author":
            return
        author, _ = Author.objects.get_or_create(slug=fields["slug"])
        author.name_de = author.name_de or fields["name_de"] or ""
        author.name_lt = author.name_lt or fields["name_lt"] or ""
        if not author.first_name and not author.last_name:
            author.first_name, author.last_name = parse_name(fields["name_pl"])
        author.culturepl_link = (
            author.culturepl_link or fields["culturepl_link"] or ""
        )
        author.gazeta_link = author.gazeta_link or fields["gazeta_link"] or ""
        author.description = author.description or fields["description_pl"] or ""
        author.description_de = (
            author.description_de or fields["description_de"] or ""
        )
        author.description_lt = (
            author.description_lt or fields["description_lt"] or ""
        )

        if not author.wikidata:
            # Try each non-empty wiki_link_<lang> field until one resolves.
            for field, value in fields.items():
                if field.startswith("wiki_link_") and value:
                    wd = find_wikidata(value, field.rsplit("_", 1)[-1])
                    if wd:
                        author.wikidata = wd
                        break
        author.save()