Edumed import from pad + auto tagging
[redakcja.git] / apps / catalogue / management / commands / import_pad.py
1 # -*- coding: utf-8 -*-
2 from slughifi import slughifi
3 from collections import defaultdict
4 import json
5 from optparse import make_option
6 import urllib2
7
8 from py_etherpad import EtherpadLiteClient
9 from django.core.management.base import BaseCommand
10 from django.core.management.color import color_style
11 from django.db import transaction
12 from librarian.dcparser import BookInfo
13 from librarian import ParseError, ValidationError
14 from django.conf import settings
15 from catalogue.models import Book
16 from catalogue.management import auto_taggers
17
18
class Command(BaseCommand):
    """Management command that imports EtherPad Lite pads as books.

    For every requested pad id the pad text is fetched, a ``Book`` whose
    slug matches the slugified pad id is reused (or a new one is created),
    and the text is committed to the book's first chunk.  Optionally the
    text is passed through one of the registered auto-taggers first.
    """
    option_list = BaseCommand.option_list + (
        make_option('-q', '--quiet', action='store_false', dest='verbose', default=True,
            help='Less output'),
        make_option('-p', '--pad', dest='pad_id', help='Pad Id (or many id\'s, comma separated)'),
        make_option('-P', '--pad-ids', dest='pad_ids_file', help='Read Pad id\'s from file'),
        make_option('-E', '--edumed', dest="tag_edumed", default=False,
                    action='store_true', help="Perform EduMed pre-tagging"),
        make_option('-a', '--autotagger', dest="auto_tagger", default=None, help="Use auto-tagger (one of: %s)" % ', '.join(auto_taggers.keys())),
    )
    help = 'Imports Text files from EtherPad Lite.'

    # NOTE: every print below is written as a single parenthesised argument,
    # which produces the same output whether ``print`` is the Python 2
    # statement (as in this file) or the print function.

    def handle(self, *args, **options):
        """Import the requested pads inside a single managed transaction.

        Options read: ``verbose``, ``pad_id``, ``pad_ids_file``,
        ``tag_edumed`` and ``auto_tagger``.

        Raises:
            CommandError: when no pad id was supplied or the requested
                auto-tagger is not registered.  Both checks run before
                any database work starts.
        """
        self.style = color_style()
        verbose = options.get('verbose')

        # Validate the input up front, so a bad invocation fails before
        # anything is written to the database.
        pad_ids = self._collect_pad_ids(options)
        tagger = self._resolve_tagger(options)

        commit_args = {
            "author_name": 'Platforma',
            "description": 'Automatycznie zaimportowane z EtherPad',
            "publishable": False,
        }

        # Start transaction management.
        transaction.commit_unless_managed()
        transaction.enter_transaction_management()
        transaction.managed(True)
        try:
            slugs = self._books_by_slug(verbose)

            if verbose:
                print('Opening Pad')
            pad = EtherpadLiteClient(settings.ETHERPAD_APIKEY, settings.ETHERPAD_URL)

            book_count = 0
            for pid in pad_ids:
                try:
                    text = pad.getText(pid)['text']
                except ValueError:
                    print("pad '%s' does not exist" % pid)
                    continue
                self._store_pad(pid, text, slugs, tagger, commit_args, book_count)
                book_count += 1

            # Print results
            print("")
            print("Results:")
            print("Imported %d books from Pad" % book_count)

            transaction.commit()
        except BaseException:
            # Undo partial work; the transaction must not be left dirty
            # when transaction management is closed below.
            transaction.rollback()
            raise
        finally:
            transaction.leave_transaction_management()

    def _collect_pad_ids(self, options):
        """Return the list of non-empty, stripped pad ids to import.

        Ids come from the ``pad_ids_file`` option (one id per line) when it
        is set, otherwise from the comma-separated ``pad_id`` option.
        Raises ``CommandError`` when neither option provides a value.
        """
        # Imported locally so the helper does not rely on the module's
        # top-level import list.
        from django.core.management.base import CommandError

        pad_ids_file = options.get('pad_ids_file')
        if pad_ids_file:
            # ``with`` guarantees the file is closed after reading.
            with open(pad_ids_file) as ids_file:
                raw_ids = ids_file.readlines()
        else:
            pad_option = options.get('pad_id')
            if not pad_option:
                raise CommandError(
                    "No pad given: use --pad or --pad-ids.")
            raw_ids = pad_option.split(',')

        stripped = [raw.strip() for raw in raw_ids]
        # Blank lines / empty list items are not pad ids; skip them.
        return [pid for pid in stripped if pid]

    def _resolve_tagger(self, options):
        """Return the auto-tagger callable selected by the options, or None.

        ``tag_edumed`` takes precedence and selects the ``'edumed'`` tagger;
        otherwise the ``auto_tagger`` option names the tagger.  Raises
        ``CommandError`` when the name is not a key of ``auto_taggers``.
        """
        from django.core.management.base import CommandError

        if options.get('tag_edumed'):
            name = 'edumed'
        else:
            name = options.get('auto_tagger')
        if not name:
            return None
        try:
            return auto_taggers[name]
        except KeyError:
            raise CommandError("Unknown auto-tagger '%s' (one of: %s)" % (
                name, ', '.join(auto_taggers.keys())))

    def _books_by_slug(self, verbose):
        """Map slugs to lists of existing, non-hidden books.

        The key is the slug parsed from the book's materialized text by
        ``BookInfo``; when parsing or validation fails, the book's own
        ``slug`` field is used instead.
        """
        if verbose:
            print('Reading currently managed files (skipping hidden ones).')
        slugs = defaultdict(list)
        for book in Book.objects.exclude(slug__startswith='.').all():
            if verbose:
                print(book.slug)
            text = book.materialize().encode('utf-8')
            try:
                key = BookInfo.from_string(text).slug
            except (ParseError, ValidationError):
                key = book.slug
            slugs[key].append(book)
        return slugs

    def _store_pad(self, pid, text, slugs, tagger, commit_args, book_count):
        """Save one pad's text into a matching or newly created book.

        ``pid`` doubles as the book title and, slugified, as its slug.
        ``slugs`` is the mapping built by ``_books_by_slug``; ``tagger`` is
        an optional callable applied to ``text`` before it is committed;
        ``book_count`` is only used in the progress line.
        """
        slug = slughifi(pid)
        title = pid

        # ``.get`` avoids creating an empty entry in the defaultdict.
        previous_books = slugs.get(slug)
        notice = ''
        if previous_books:
            if len(previous_books) > 1:
                # Emitted on the same line as the progress message below,
                # separated by a single space.
                notice = self.style.ERROR("There is more than one book "
                    "with slug %s:" % slug) + ' '
            book = previous_books[0]
            comm = book.slug
        else:
            book = Book()
            book.slug = slug
            comm = '*'
        print("%s%d %s --> %s" % (notice, book_count, slug, comm))

        book.title = title
        book.save()

        if len(book) > 0:
            # Reuse the first chunk of an existing book.
            chunk = book[0]
            chunk.slug = slug[:50]
            chunk.title = title[:255]
            chunk.save()
        else:
            chunk = book.add(slug, title)

        if tagger is not None:
            text = tagger(text)
        chunk.commit(text, **commit_args)