Fix dictionary auto-tagging
[redakcja.git] / apps / catalogue / management / commands / import_pad.py
1 # -*- coding: utf-8 -*-
2 from slughifi import slughifi
3 from collections import defaultdict
4 import json
5 from optparse import make_option
6 import urllib2
7
8 from py_etherpad import EtherpadLiteClient
9 from django.core.management.base import BaseCommand
10 from django.core.management.color import color_style
11 from django.db import transaction
12 from librarian.dcparser import BookInfo
13 from librarian import ParseError, ValidationError, WLURI
14 from django.conf import settings
15 from catalogue.models import Book
16 from catalogue.management import auto_taggers
17
18
19 class Command(BaseCommand):
20     option_list = BaseCommand.option_list + (
21         make_option('-q', '--quiet', action='store_false', dest='verbose', default=True,
22             help='Less output'),
23         make_option('-p', '--pad', dest='pad_id', help='Pad Id (or many id\'s, comma separated)'),
24         make_option('-P', '--pad-ids', dest='pad_ids_file', help='Read Pad id\'s from file'),
25         make_option('-E', '--edumed', dest="tag_edumed", default=False,
26                     action='store_true', help="Perform EduMed pre-tagging"),
27         make_option('-a', '--autotagger', dest="auto_tagger", default=None, help="Use auto-tagger (one of: %s)" % ', '.join(auto_taggers.keys())),
28     )
29     help = 'Imports Text files from EtherPad Lite.'
30
31     def handle(self, *args, **options):
32
33         self.style = color_style()
34
35         verbose = options.get('verbose')
36         pad_ids_file = options.get('pad_ids_file')
37         if pad_ids_file:
38             pad_id = open(pad_ids_file).readlines()
39         else:
40             pad_id = options.get("pad_id").split(',')
41         pad_id = map(str.strip, pad_id)
42
43         # Start transaction management.
44         transaction.commit_unless_managed()
45         transaction.enter_transaction_management()
46         transaction.managed(True)
47
48         if verbose:
49             print 'Reading currently managed files (skipping hidden ones).'
50         slugs = defaultdict(list)
51         for b in Book.objects.exclude(slug__startswith='.').all():
52             if verbose:
53                 print b.slug
54             text = b.materialize().encode('utf-8')
55             try:
56                 info = BookInfo.from_string(text)
57                 slugs[info.url.slug].append(b)
58             except (ParseError, ValidationError):
59                 slugs[b.slug].append(b)
60
61         book_count = 0
62         commit_args = {
63             "author_name": 'Platforma',
64             "description": 'Automatycznie zaimportowane z EtherPad',
65             "publishable": False,
66         }
67
68         if verbose:
69             print 'Opening Pad'
70         pad = EtherpadLiteClient(settings.ETHERPAD_APIKEY, settings.ETHERPAD_URL)
71
72         for pid in pad_id:
73             try:
74                 text = pad.getText(pid)['text']
75             except ValueError:
76                 print "pad '%s' does not exist" % pid
77                 continue
78
79             open("/tmp/pad_%s.txt" % pid, 'w').write(text.encode('utf-8'))
80             
81             if options.get('tag_edumed'):
82                 auto_tagger = 'edumed'
83             else:
84                 auto_tagger = options.get('auto_tagger')
85             if auto_tagger:
86                 text = auto_taggers[auto_tagger](text)
87             try:
88                 info = BookInfo.from_string(text.encode('utf-8'))
89                 slug = info.url.slug
90             except (ParseError, ValidationError):
91                 slug = slughifi(pid)
92
93             print "Importing %s (slug %s)..." % (pid, slug)
94             title = pid
95
96             #            print slugs, slug
97             previous_books = slugs.get(slug)
98             if previous_books:
99                 if len(previous_books) > 1:
100                     print self.style.ERROR("There is more than one book "
101                         "with slug %s:" % slug),
102                 previous_book = previous_books[0]
103                 comm = previous_book.slug
104             else:
105                 previous_book = None
106                 comm = '*'
107             print book_count, slug, '-->', comm
108
109             if previous_book:
110                 book = previous_book
111                 book.slug = slug
112             else:
113                 book = Book()
114                 book.slug = slug
115             book.title = title
116             book.save()
117
118             if len(book) > 0:
119                 chunk = book[0]
120                 chunk.slug = slug[:50]
121                 chunk.title = title[:255]
122                 chunk.save()
123             else:
124                 chunk = book.add(slug, title)
125
126             chunk.commit(text, **commit_args)
127
128             book_count += 1
129
130         # Print results
131         print
132         print "Results:"
133         print "Imported %d books from Pad" % book_count
134
135         transaction.commit()
136         transaction.leave_transaction_management()