Link catalogue to documents.
[redakcja.git] / src / catalogue / management / commands / import_catalogue_from_wl_dump.py
1 # This file is part of FNP-Redakcja, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 import json
5 from urllib.request import urlopen
6 import sys
7 from django.core.management import BaseCommand
8 from slugify import slugify
9 import wikidata
10 from catalogue.models import Book, Author
11
12
def parse_name(name):
    """Split a full name into ``(first_name, last_name)``.

    The last whitespace-separated word becomes the last name and everything
    before it the first name(s). A single-word name is returned as the first
    name with an empty last name; an empty or blank string gives ``("", "")``.

    Always returns a 2-tuple of strings, so callers can rely on one return
    type regardless of how many words the name has.
    """
    # Splitting on whitespace runs (sep=None) after trimming keeps stray or
    # doubled spaces out of the resulting name parts.
    pieces = name.strip().rsplit(None, 1)
    if len(pieces) == 2:
        return pieces[0], pieces[1]
    return (pieces[0] if pieces else ""), ""
19
20
def find_wikidata(link, lang):
    """Look up the Wikidata entity id for a Wikipedia article link.

    The article title is taken from the last path segment of ``link`` (any
    ``#fragment`` is ignored) and resolved through the Wikidata
    ``wbgetentities`` API against the ``{lang}wiki`` site.

    Args:
        link: URL of the article; trailing whitespace is ignored.
        lang: language code that is prefixed to ``wiki`` to form the site
            name, e.g. ``"pl"``.

    Returns:
        The entity id (a string starting with ``"Q"``), or ``None`` when the
        first key of the returned ``entities`` map is not such an id.

    Raises:
        KeyError: if the response carries no ``entities`` member.
        Whatever ``urlopen`` / ``json.load`` raise on network or parse errors.
    """
    # Imported locally so the function does not depend on extra module-level
    # imports being present.
    from urllib.parse import unquote, urlencode

    # Cut the fragment off first: a "/" inside the fragment must not be
    # mistaken for the path separator in front of the title.
    page = link.rstrip().split("#", 1)[0]
    # Decode any percent-escapes so the title is escaped exactly once below.
    title = unquote(page.rsplit("/", 1)[-1]).replace(" ", "_")
    query = urlencode(
        {
            "action": "wbgetentities",
            "sites": f"{lang}wiki",
            "titles": title,
            "format": "json",
        }
    )
    # The context manager closes the HTTP response even if parsing fails.
    with urlopen(f"https://www.wikidata.org/w/api.php?{query}") as response:
        data = json.load(response)
    wikidata_id = next(iter(data["entities"]), "")
    if not wikidata_id.startswith("Q"):
        return None
    return wikidata_id
35
36
class Command(BaseCommand):
    """Import authors and books from a JSON dump into the catalogue.

    The dump is a JSON list of records, each with a ``"model"`` label and a
    ``"fields"`` mapping. It is walked twice: author records
    (``pdcounter.author``, ``catalogue.tag``) are handled in pass 1 and book
    records (``pdcounter.bookstub``, ``catalogue.book``) in pass 2, so books
    can be linked to authors that already exist. Values already present on a
    catalogue object are never overwritten; only empty fields are filled in.

    NOTE(review): judging by the file name the dump presumably comes from
    Wolne Lektury -- confirm.
    """

    def add_arguments(self, parser):
        """Register the positional ``path`` argument (the JSON dump file)."""
        parser.add_argument("path")

    def handle(self, path, **kwargs):
        """Load the dump at ``path`` and import it in two passes."""
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        # model label -> (pass in which it is processed, handler)
        handlers = {
            "pdcounter.bookstub": (2, self._import_book_stub),
            "pdcounter.author": (1, self._import_pd_author),
            "catalogue.book": (2, self._import_book),
            "catalogue.tag": (1, self._import_author_tag),
        }

        for pass_n in (1, 2):
            for item in data:
                entry = handlers.get(item["model"])
                if entry is None:
                    # Unknown model: show the record and give up on the
                    # rest of this pass.
                    print(item)
                    break
                handler_pass, handler = entry
                if handler_pass == pass_n:
                    handler(item["fields"])

    def _import_book_stub(self, fields):
        """Create or complete a Book, and its author, from a ``pdcounter.bookstub``."""
        print(fields["author"], fields["title"])
        book, _ = Book.objects.get_or_create(slug=fields["slug"])

        book.title = book.title or fields["title"]
        book.pd_year = book.pd_year or fields["pd"]
        # The translator is only recorded as a free-text note, and only when
        # the book has no translators assigned.
        if fields["translator"] and not book.translators.exists():
            note = "tłum.: " + fields["translator"]
            if note not in book.notes:
                # Skip empty parts so no stray newline is appended when the
                # book had no notes yet.
                book.notes = "\n".join(filter(None, [note, book.notes]))
        book.save()

        if book.authors.exists():
            return
        first_name, last_name = parse_name(fields["author"])
        author_slug = slugify(fields["author"])
        # Reuse an author matching by slug or by name before creating one.
        author = (
            Author.objects.filter(slug=author_slug).first()
            or Author.objects.filter(
                first_name=first_name, last_name=last_name
            ).first()
            or Author()
        )
        author.slug = author.slug or author_slug
        author.first_name = author.first_name or first_name
        author.last_name = author.last_name or last_name
        author.save()
        book.authors.set([author])

    def _import_pd_author(self, fields):
        """Create or complete an Author from a ``pdcounter.author`` record.

        Raises:
            ValueError: if the record has a ``wiki_link``; resolving it is
                not implemented for this model.
        """
        author, _ = Author.objects.get_or_create(slug=fields["slug"])
        # NOTE(review): an author that already has a name is left completely
        # untouched, including its other empty fields -- confirm this is
        # intended.
        if author.first_name or author.last_name:
            return
        author.first_name, author.last_name = parse_name(fields["name"])
        author.year_of_death = author.year_of_death or fields["death"]
        author.notes = author.notes or fields["description"]
        author.gazeta_link = author.gazeta_link or fields["gazeta_link"]
        author.save()

        wiki_link = fields["wiki_link"]
        if wiki_link:
            # An explicit raise rather than ``assert`` so the check also
            # runs under ``python -O``.
            raise ValueError(
                f"pdcounter.author {fields['slug']!r} has a wiki_link, "
                f"which this importer does not handle: {wiki_link!r}"
            )

    def _import_book(self, fields):
        """Create or complete a top-level Book from a ``catalogue.book`` record."""
        # Records that have a parent are not imported.
        if fields["parent"]:
            return
        print(fields["slug"])
        book, _ = Book.objects.get_or_create(slug=fields["slug"])
        book.title = book.title or fields["title"]
        book.language = book.language or fields["language"]
        book.gazeta_link = book.gazeta_link or fields["gazeta_link"]
        if fields["wiki_link"]:
            book.wikidata = (
                book.wikidata or find_wikidata(fields["wiki_link"], "pl") or ""
            )

        extra_info = json.loads(fields["extra_info"])
        released = extra_info.get("released_to_public_domain_at")
        if book.pd_year is None and released:
            # Only the part before the first "-" (the year) is kept.
            book.pd_year = int(released.split("-", 1)[0])

        book.save()

        if book.authors.exists():
            return
        authors = []
        for astr in extra_info.get("authors", []):
            if ", " in astr:
                # "Last, First": split once only, so a further ", " stays in
                # the first-name part instead of breaking the unpacking.
                last_name, first_name = astr.split(", ", 1)
            else:
                first_name, last_name = astr, ""
            aslug = slugify(f"{first_name} {last_name}".strip())
            author = (
                Author.objects.filter(slug=aslug).first()
                or Author.objects.filter(
                    first_name=first_name, last_name=last_name
                ).first()
                or Author.objects.filter(name_de=astr).first()
                or Author.objects.filter(name_lt=astr).first()
            )
            # Not trying to create the author or set properties, because
            # here we don't know the dc:creator@xml:lang property.
            if author is not None:
                authors.append(author)
        book.authors.set(authors)

    def _import_author_tag(self, fields):
        """Create or complete an Author from a ``catalogue.tag`` of category ``author``."""
        if fields["category"] != "author":
            return
        author, _ = Author.objects.get_or_create(slug=fields["slug"])
        author.name_de = author.name_de or fields["name_de"] or ""
        author.name_lt = author.name_lt or fields["name_lt"] or ""
        if not author.first_name and not author.last_name:
            author.first_name, author.last_name = parse_name(fields["name_pl"])
        author.culturepl_link = (
            author.culturepl_link or fields["culturepl_link"] or ""
        )
        author.gazeta_link = author.gazeta_link or fields["gazeta_link"] or ""
        author.description = author.description or fields["description_pl"] or ""
        author.description_de = (
            author.description_de or fields["description_de"] or ""
        )
        author.description_lt = (
            author.description_lt or fields["description_lt"] or ""
        )

        if not author.wikidata:
            # Try each non-empty "wiki_link_<lang>" field in turn; the first
            # one that resolves to a Wikidata id wins.
            for field, value in fields.items():
                if field.startswith("wiki_link_") and value:
                    wd = find_wikidata(value, field.rsplit("_", 1)[-1])
                    if wd:
                        author.wikidata = wd
                        break
        author.save()