X-Git-Url: https://git.mdrn.pl/redakcja.git/blobdiff_plain/52d79280796679d28fb526063c77c43851ae52c4..6d1bf593b94be78e15a7657a9adbd727bd59f69c:/src/cover/utils.py diff --git a/src/cover/utils.py b/src/cover/utils.py index 3bf3e9f9..6815d29d 100644 --- a/src/cover/utils.py +++ b/src/cover/utils.py @@ -6,6 +6,11 @@ from io import StringIO import json import re from urllib.request import FancyURLopener +from django.conf import settings +import requests +from wikidata.client import Client +from catalogue.constants import WIKIDATA + class URLOpener(FancyURLopener): @property @@ -71,13 +76,9 @@ def get_flickr_data(url): def get_wikimedia_data(url): - """ - >>> get_wikimedia_data('https://commons.wikimedia.org/wiki/File:Valdai_IverskyMon_asv2018_img47.jpg') - {'title': 'Valdai IverskyMon asv2018 img47', 'author': 'A.Savin', 'source_url': 'https://commons.wikimedia.org/wiki/File:Valdai_IverskyMon_asv2018_img47.jpg', 'download_url': 'https://upload.wikimedia.org/wikipedia/commons/4/43/Valdai_IverskyMon_asv2018_img47.jpg', 'license_url': 'http://artlibre.org/licence/lal/en', 'license_name': 'FAL'} - - """ file_name = url.rsplit('/', 1)[-1].rsplit(':', 1)[-1] d = json.loads(URLOpener().open('https://commons.wikimedia.org/w/api.php?action=query&titles=File:{}&prop=imageinfo&iiprop=url|user|extmetadata&iimetadataversion=latest&format=json'.format(file_name)).read().decode('utf-8')) + d = list(d['query']['pages'].values())[0]['imageinfo'][0] ext = d['extmetadata'] @@ -86,19 +87,32 @@ def get_wikimedia_data(url): 'author': d['user'], 'source_url': d['descriptionurl'], 'download_url': d['url'], - 'license_url': ext['LicenseUrl']['value'], + 'license_url': ext.get('LicenseUrl', {}).get('value', ''), 'license_name': ext['LicenseShortName']['value'], } - + + # There are Wikidata links in ObjectName sametimes. Let's use it. + wikidata_match = re.search(r'wikidata\.org/wiki/(Q\d+)', meta['title']) + if wikidata_match is not None: + qitem = wikidata_match.group(1) + client = Client() + entity = client.get(qitem) + meta['title'] = entity.label.get('pl', str(entity.label)) + author = entity.get(client.get(WIKIDATA.CREATOR)) + if author is not None: + meta['author'] = author.label.get('pl', str(author.label)) + else: + meta['author'] = '' + + if meta['license_name'] == 'Public domain': + meta['license_name'] = 'domena publiczna' + meta['license_url'] = 'https://pl.wikipedia.org/wiki/Domena_publiczna' + + return meta def get_mnw_data(url): - """ - >>> get_mnw_data('https://cyfrowe.mnw.art.pl/pl/katalog/794032') - {'title': 'Pejzaż z podwójnym świerkiem', 'author': 'nieznany, Altdorfer, Albrecht (ca 1480-1538)', 'source_url': 'https://cyfrowe.mnw.art.pl/pl/katalog/794032', 'download_url': 'https://cyfrowe-cdn.mnw.art.pl/upload/multimedia/49/58/49583b3e9b23e2d25f372fe6021ae220.jpg', 'license_url': 'https://pl.wikipedia.org/wiki/Domena_publiczna', 'license_name': 'DOMENA PUBLICZNA'} - - """ nr = url.rsplit('/', 1)[-1] d = list( csv.DictReader( @@ -111,29 +125,61 @@ def get_mnw_data(url): )[0] authors = [] - i = 0 - while f'authors.{i}.name' in d: - authors.append(d[f'authors.{i}.name']) + i = 1 + while f'twórca/wytwórnia {i}' in d: + authors.append(d[f'twórca/wytwórnia {i}']) i += 1 - license_url = d['copyrights.0.link'] - license_name = d['copyrights.0.name'] + license_url = '' + license_name = d['klasyfikacja praw autorskich 1'] if license_name == 'DOMENA PUBLICZNA': license_name = 'domena publiczna' - license_url = '' + license_url = 'https://pl.wikipedia.org/wiki/Domena_publiczna' return { - 'title': d['title'], + 'title': d['nazwa/tytuł'], 'author': ', '.join(authors), 'source_url': url, 'download_url': 'https://cyfrowe-cdn.mnw.art.pl/upload/multimedia/{}.{}'.format( - d['image.filePath'], - d['image.extension'], + d['ścieżka wizerunku'], + d['rozszerzenie pliku wizerunku'], ), 'license_url': license_url, 'license_name': license_name, } +def get_rawpixel_data(url): + photo_id = re.search(r'/(\d+)/', url).group(1) + + s = requests.Session() + cookies = settings.RAWPIXEL_SESSION + + token = s.post( + 'https://www.rawpixel.com/api/v1/user/session', + cookies=cookies + ).json()['token'] + + h = {'X-CSRF-Token': token, 'Accept': 'application/json'} + + data = s.get( + f'https://www.rawpixel.com/api/v1/image/data/{photo_id}', + headers=h, cookies=cookies).json() + download_url = s.post( + f'https://www.rawpixel.com/api/v1/image/download/{photo_id}/original', + headers=h, cookies=cookies + ).json()['downloadUrl'] + + title = data['metadata']['title'].rsplit('|', 1)[0].strip() + + return { + 'title': title, + 'author': ', '.join(data['metadata']['artist_names']), + 'source_url': data['url'], + 'download_url': download_url, + 'license_url': data['metadata']['licenseUrl'], + 'license_name': data['metadata']['license'], + } + def get_import_data(url): if re.match(r'(https?://)?(www\.|secure\.)?flickr\.com/', url): @@ -142,3 +188,5 @@ def get_import_data(url): return get_wikimedia_data(url) if re.match(r'(https?://)?cyfrowe\.mnw\.art\.pl/', url): return get_mnw_data(url) + if re.match(r'(https?://)?www\.rawpixel\.com/', url): + return get_rawpixel_data(url)