40310eda10611ecf275b6ce3f14582b953f6cbda
[wolnelektury.git] / apps / search / management / commands / snippets.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.core.management.base import BaseCommand
6
7 from glob import glob
8 from optparse import make_option
9 from os import path
10 from sys import stdout
11 from django.conf import settings
12
class Command(BaseCommand):
    """Management command checking that stored search snippets are valid UTF-8.

    Two independent checks can be selected (and combined) on the command line:

    * ``-C`` / ``--check-just-read`` reads every file found in the
      ``snippets`` directory under ``settings.SEARCH_INDEX`` and tries to
      decode its whole content as UTF-8.
    * ``-c`` / ``--check`` walks the documents of the search index and, for
      every document carrying snippet coordinates, fetches that snippet.
    """
    help = 'Check that search snippets are valid UTF-8.'
    args = ''

    option_list = BaseCommand.option_list + (
        make_option('-C', '--check-just-read', action='store_true', dest='check', default=False,
            help='Check snippets utf-8'),
        make_option('-c', '--check', action='store_true', dest='check2', default=False,
            help='Check snippets utf-8 by walking through index'),
        )

    def handle(self, *args, **opts):
        """Run whichever checks were requested through ``opts``."""
        if opts['check']:
            self._check_snippet_files()
        if opts['check2']:
            self._check_index_snippets()

    def _check_snippet_files(self):
        """Decode every snippet file as UTF-8 and print the ones that fail."""
        # path.join works whether or not SEARCH_INDEX ends with a separator.
        snippet_files = glob(path.join(settings.SEARCH_INDEX, 'snippets', '*'))
        print(snippet_files)
        for fn in snippet_files:
            print(fn)
            # The file name is parsed as a number and used to identify the
            # offending file in the error message.
            book_id = int(path.basename(fn))
            # Binary mode: it is the raw bytes that are being validated.
            with open(fn, 'rb') as f:
                content = f.read()
            try:
                content.decode('utf-8')
            except UnicodeDecodeError:
                print("error in snippets %d" % book_id)

    def _check_index_snippets(self):
        """Fetch the snippet referenced by each index document.

        Progress is written to stdout. The first snippet that fails to decode
        is reported and the ``UnicodeDecodeError`` is re-raised, aborting the
        walk.
        """
        # Imported here so that the file-only check does not need the index.
        # NOTE(review): Snippets is assumed to live in search.index next to
        # Search -- it was referenced below without any import; confirm.
        from search.index import Search, Snippets

        reader = Search().searcher.getIndexReader()
        numdocs = reader.numDocs()
        # NOTE(review): this treats 0..numDocs()-1 as the document numbers;
        # verify that this still covers every document when the index
        # contains deletions.
        for did in range(numdocs):
            doc = reader.document(did)
            if not doc or not doc.get('book_id'):
                continue
            book_id = int(doc.get('book_id'))
            stdout.write("\r%d / %d" % (did, numdocs))
            stdout.flush()
            position = doc.get('snippet_position')
            length = doc.get('snippet_length')
            if not (position and length):
                continue
            # Stored field values need converting just like book_id above;
            # comparing len() against the raw value could never succeed.
            position, length = int(position), int(length)
            # NOTE(review): confirm whether Snippets must be opened before
            # get() can be called.
            snippets = Snippets(book_id)
            try:
                txt = snippets.get((position, length))
                # NOTE(review): presumes snippet_length is expressed in the
                # same unit as len() of the returned text -- confirm.
                assert len(txt) == length
            except UnicodeDecodeError:
                stdout.write("\nerror in snippets %d\n" % book_id)
                # Bare raise keeps the original traceback.
                raise

        stdout.write("\ndone.\n")
64