librarian
[wolnelektury.git] / src / search / management / commands / snippets.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.core.management.base import BaseCommand
6
7 from glob import glob
8 from optparse import make_option
9 from os import path
10 from sys import stdout
11 from django.conf import settings
12
13
class Command(BaseCommand):
    """Management command that checks stored search snippets for UTF-8 errors.

    Two independent checks are available and may be combined:

    * ``-C`` / ``--check-just-read``: read every file under the
      ``snippets`` directory of ``settings.SEARCH_INDEX`` and try to decode
      it as UTF-8.
    * ``-c`` / ``--check``: walk the documents of the search index and, for
      each one that carries a snippet position and length, fetch that
      snippet through ``Snippets``.
    """
    help = 'Check that stored search snippets are valid UTF-8.'
    args = ''

    option_list = BaseCommand.option_list + (
        make_option('-C', '--check-just-read', action='store_true', dest='check', default=False,
                    help='Check snippets utf-8'),
        make_option('-c', '--check', action='store_true', dest='check2', default=False,
                    help='Check snippets utf-8 by walking through index'),
        )

    def handle(self, *args, **opts):
        """Run the checks selected on the command line.

        Reads ``opts['check']`` (whole-file check) and ``opts['check2']``
        (index walk). The whole-file check reports every undecodable file
        and keeps going; the index walk reports the first undecodable
        snippet and re-raises the ``UnicodeDecodeError``.
        """
        # Imported inside the method, as before, so that importing this
        # module does not itself import search.index.
        from search.index import Search, Snippets

        if opts['check']:
            # path.join yields the same pattern as plain concatenation when
            # SEARCH_INDEX ends with a separator, and a correct one when it
            # does not.
            snippet_files = glob(path.join(settings.SEARCH_INDEX, 'snippets', '*'))
            stdout.write("%s\n" % (snippet_files,))
            for fn in snippet_files:
                stdout.write("%s\n" % fn)
                # The file name is used as the numeric book id.
                bkid = int(path.basename(fn))
                # Binary mode: we validate the raw bytes ourselves.
                with open(fn, 'rb') as f:
                    cont = f.read()
                try:
                    cont.decode('utf-8')
                except UnicodeDecodeError:
                    stdout.write("error in snippets %d\n" % bkid)

        if opts['check2']:
            reader = Search().searcher.getIndexReader()
            numdocs = reader.numDocs()
            # NOTE(review): this assumes document ids run 0..numDocs()-1;
            # confirm that holds when the index contains deleted documents.
            for did in range(numdocs):
                doc = reader.document(did)
                if not doc or not doc.get('book_id'):
                    continue
                bkid = int(doc.get('book_id'))
                # "\r" rewrites the same terminal line as a progress counter.
                stdout.write("\r%d / %d" % (did, numdocs))
                stdout.flush()
                ss = doc.get('snippet_position')
                sl = doc.get('snippet_length')
                if not (ss and sl):
                    continue
                # NOTE: Snippets used to be missing from the import above.
                snips = Snippets(bkid)
                try:
                    txt = snips.get((ss, sl))
                    # NOTE(review): ss/sl are passed exactly as the index
                    # returns them; if they are strings rather than ints this
                    # comparison can never hold - confirm their type.
                    assert len(txt) == sl
                except UnicodeDecodeError:
                    stdout.write("\nerror in snippets %d\n" % bkid)
                    # Bare raise keeps the original traceback.
                    raise

            stdout.write("\ndone.\n")