improve search for hints by title
[wolnelektury.git] / src / catalogue / management / commands / eisbn_csv.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 import csv
6 import sys
7 from django.core.management.base import BaseCommand
8 from django.utils.timezone import localtime
9
10 from catalogue.models import Book
11 from librarian import RDFNS, DCNS
12
13
# File formats for which rows are produced, in the order they are emitted.
FORMATS = ('PDF', 'HTML', 'TXT', 'EPUB', 'MOBI')

# Formats emitted for every selected book; formats not listed here are
# emitted only for books that have no children.
FORMATS_WITH_CHILDREN = ('PDF', 'EPUB', 'MOBI')


# Value of the first "product form" column for each file format.
# NOTE(review): these look like ONIX product form codes -- confirm.
PRODUCT_FORMS_1 = dict(
    HTML='EC',
    PDF='EB',
    TXT='EB',
    EPUB='ED',
    MOBI='ED',
)

# Value of the second "product form" column for each file format.
# NOTE(review): these look like ONIX product form detail codes -- confirm.
PRODUCT_FORMS_2 = dict(
    HTML='E105',
    PDF='E107',
    TXT='E112',
    EPUB='E101',
    MOBI='E127',
)
34
35
def is_institution(name):
    """Tell whether an author name denotes an institution.

    The only institution recognised is one whose name begins with the
    fixed prefix below; the comparison is case-sensitive.
    """
    institution_prefix = u'Zgromadzenie Ogólne'
    return name.startswith(institution_prefix)
38
39
# Lower-case markers that introduce a volume/part designation in a title.
# Every marker begins with one punctuation character followed by a space;
# get_volume() relies on that two-character prefix.
VOLUME_SEPARATORS = (u'. część ', u', część ', u', tom ', u'. der tragödie ')


def get_volume(title):
    """Split a title into its main part and its volume designation.

    The separators in VOLUME_SEPARATORS are tried in order, and each one is
    matched case-insensitively at its first occurrence in ``title``.

    Returns a ``(stripped_title, volume_name)`` tuple. ``stripped_title`` is
    everything before the separator; ``volume_name`` is everything after the
    separator's leading punctuation and space, in its original casing.
    When no separator occurs, ``(title, '')`` is returned.
    """
    for volume_separator in VOLUME_SEPARATORS:
        width = len(volume_separator)
        # Compare lower-cased windows of the original title rather than
        # searching title.lower(): lower-casing can change the length of a
        # string, so an offset found there may not be valid for `title`.
        for vol_idx in range(len(title) - width + 1):
            if title[vol_idx:vol_idx + width].lower() == volume_separator:
                # Skip only the punctuation mark and the space, so the
                # volume word itself stays part of the volume name.
                return title[:vol_idx], title[vol_idx + 2:]
    return title, ''
51
52
class Command(BaseCommand):
    """Write CSV rows to stdout for the books whose slugs are read from stdin.

    One row is written per (file format, book) pair.
    """

    @staticmethod
    def dc_values(desc, tag):
        """Return the text of every ``tag`` element in the DC namespace under ``desc``."""
        path = './/' + DCNS(tag)
        return [element.text for element in desc.findall(path)]

    def _csv_row(self, book, file_format):
        """Build the list of column values for one book in one file format."""
        desc = book.wldocument().edoc.find('.//' + RDFNS('Description'))
        imprint = '; '.join(self.dc_values(desc, 'publisher'))
        title, volume = get_volume(book.title)
        published_on = localtime(book.created_at).date().isoformat()
        creators = self.dc_values(desc, 'creator')
        author = '; '.join(creator.strip() for creator in creators)
        # The joined author string goes into exactly one of the two author
        # columns; the other one stays empty.
        if is_institution(author):
            author_person, author_institution = '', author
        else:
            author_person, author_institution = author, ''
        product_form1 = PRODUCT_FORMS_1[file_format]
        product_form2 = PRODUCT_FORMS_2[file_format]
        # Only the first language entry is used.
        language = self.dc_values(desc, 'language')[0]
        return [
            imprint,
            title,
            '',  # subtitle: always empty
            '',  # year: always empty
            volume,
            published_on,  # publication date
            published_on,  # info date: same value as the publication date
            author_person,
            author_institution,
            'DGO',  # publication type
            '1',  # edition
            product_form1,
            product_form2,
            language,
            book.slug,
            file_format,
        ]

    def handle(self, *args, **options):
        """Read slugs from stdin (one per line) and write the CSV to stdout.

        Rows are grouped by file format, in FORMATS order. Formats listed in
        FORMATS_WITH_CHILDREN are written for every matching book; the other
        formats are written only for books that have no children.
        """
        wanted_slugs = [raw_line.strip() for raw_line in sys.stdin]
        csv_out = csv.writer(sys.stdout)
        every_book = Book.objects.filter(slug__in=wanted_slugs)
        childless_books = every_book.filter(children=None)
        for file_format in FORMATS:
            selected = every_book if file_format in FORMATS_WITH_CHILDREN else childless_books
            for book in selected:
                cells = self._csv_row(book, file_format)
                csv_out.writerow([s.encode('utf-8') for s in cells])