1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 from django.conf import settings
9 from librarian import dcparser
10 from librarian.parser import WLDocument
11 from lxml import etree
12 import catalogue.models
13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
14 from itertools import chain
19 from wolnelektury.utils import makedirs
# Module-level logger for the search app.
log = logging.getLogger('search')

# Build the stopword collection from the configured file, one word per line,
# skipping '#' comment lines.
# NOTE(review): the opening of the enclosing expression (presumably
# ``stopwords = set(...)`` or similar) is not visible in this excerpt --
# the two lines below are the body of that generator expression; confirm
# against the full source.
if os.path.isfile(settings.SOLR_STOPWORDS):
        line.decode('utf-8').strip()
        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
class SolrIndex(object):
    """Base class owning a connection to the Solr backend.

    Subclasses pick the access mode: ``Index`` opens read-write,
    ``Search`` opens read-only.
    """

    def __init__(self, mode=None):
        """Open a Solr connection; ``mode`` is forwarded to the client."""
        solr_endpoint = settings.SOLR
        self.index = custom.CustomSolrInterface(solr_endpoint, mode=mode)
class Snippets(object):
    """
    This class manages snippet files for indexed object (book)
    the snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    # Subdirectory of settings.SEARCH_INDEX holding one snippet file per book.
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        # Make sure the snippet directory exists before any file is opened.
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision

    # NOTE(review): the lines below belong to a ``path`` property whose
    # ``def``/decorator and branch keywords are elided in this excerpt.
    # File name is "<book_id>.<revision>" when a revision is set,
    # plain "<book_id>" otherwise -- confirm against the full source.
        fn = "%d.%d" % (self.book_id, self.revision)
        fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        # NOTE(review): the bodies of both existence checks are elided here;
        # presumably they adjust the mode / revision before opening.
        if os.path.exists(self.path):
        if not os.path.exists(self.path):
        self.file = open(self.path, mode)

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple
        """
        # Snippets are stored UTF-8 encoded; positions/lengths are in bytes.
        txt = snippet.encode('utf-8')
        # NOTE(review): the byte-length ``l`` and the write/position update
        # happen on lines elided from this excerpt.
        pos = (self.position, l)

    # NOTE(review): ``def get(self, pos)`` header elided in this excerpt.
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.
        """
        # Seek to the stored byte offset, read exactly ``length`` bytes,
        # and decode back to unicode.
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')

    # NOTE(review): ``def close(self)`` header elided in this excerpt.
        """Close snippet file"""
class Index(SolrIndex):
    """
    Class indexing books.
    """
    # NOTE(review): this excerpt is elided -- many structural lines
    # (``def`` headers, loop/branch openers, ``try`` frames, dict openers)
    # are missing. Comments below mark where visible code clearly belongs
    # to such elided constructs; confirm against the full source.

        # From the (elided) __init__: open the index read-write.
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of list of uids.
        """
            # Inside an (elided) loop over ``queries``: wrap raw Lucene
            # queries into sunburnt query objects and restrict the fetched
            # fields to 'uid' only.
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
                # Paginated fetch of matching uids (the pagination loop and
                # uid accumulation are elided here).
                ids = q.paginate(start=st, rows=rows).execute()
        # Delete all collected uids in a single call.
        self.index.delete(uids)

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from index, then index them again.
        Indexed fields include: id, name (with and without polish stems), category
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from index.
                # (elided ``if tags:`` + loop) build one delete query per tag,
                # matching both id and category -- pdcounter objects share the
                # id space with catalogue tags, so id alone is ambiguous.
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                    # (elided ``else:``) ordinary catalogue tag.
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
            # (elided ``else:`` branch) no tags passed -- remove all of them.
            q = self.index.Q(tag_id__any=True)

            # then add them [all or just one passed]
                # (elided ``if not tags:``) default to every catalogue tag
                # (except private 'set' tags) plus all pdcounter entries.
                catalogue.models.Tag.objects.exclude(category='set'),
                PDCounterAuthor.objects.all(),
                PDCounterBook.objects.all())

            # (elided ``for tag in tags:``) build one Solr doc per tag; the
            # uid suffixes (_pd_a/_pd_b) keep pdcounter entries distinct.
            if isinstance(tag, PDCounterAuthor):
                    "tag_id": int(tag.id),
                    "tag_name": tag.name,
                    "tag_name_pl": tag.name,
                    "tag_category": 'pd_author',
                    "is_pdcounter": True,
                    "uid": "tag%d_pd_a" % tag.id
            elif isinstance(tag, PDCounterBook):
                    "tag_id": int(tag.id),
                    "tag_name": tag.title,
                    "tag_name_pl": tag.title,
                    "tag_category": 'pd_book',
                    "is_pdcounter": True,
                    "uid": "tag%d_pd_b" % tag.id
                # (elided ``else:``) ordinary catalogue tag document.
                    "tag_id": int(tag.id),
                    "tag_name": tag.name,
                    "tag_name_pl": tag.name,
                    "tag_category": tag.category,
                    "is_pdcounter": False,
                    "uid": "tag%d" % tag.id

    def create_book_doc(self, book):
        """
        Create a lucene document referring book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from search index.
        book - Book instance."""
        # Accept either a Book instance or a bare id (else-branch elided).
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        self.delete_query(self.index.Q(book_id=book_id))
        # Optionally remove the snippet file too (guard on
        # ``remove_snippets`` elided in this excerpt).
        snippets = Snippets(book_id)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        # we don't remove snippets, since they might be still needed by
        # threads using not reopened index
        self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # let's not index it - it's only used for extracting publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        # Copy metadata into the book document (loop body elided).
        for n, f in meta_fields.items():
        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        # (elided ``book_fields = {`` opener) fields repeated on every
        # content part so part hits can be displayed without a second query.
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]
        self.index_content(book, book_fields=book_fields)

    # Master (top-level) WL-XML tags -- list opener and first items elided.
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',

    # Tags whose content is never indexed (editorial notes, separators...).
    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',

    # Footnote tags: przypis autorski / tłumacza / redakcyjny / edytorski.
    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    # Header-level tags skipped while walking the master element.
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # Trailing year in ``source_name``, e.g. "... 1884]." -> "1884".
    # NOTE(review): non-raw string with ``\]`` escape -- works, but should
    # be a raw string (r"...") to silence invalid-escape warnings.
    published_date_re = re.compile("([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from book and returns a map of fields keyed by fieldname
        """
        # Parse Dublin Core from the book's XML unless already provided.
        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['is_book'] = True

        # No type info in DC fields, so dispatch on the validator callable.
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        # (elided ``else:``) join multiple persons.
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # Derive the publish date from the trailing year in source_name.
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        fields["published_date"] = pd

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #         yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:

    def index_content(self, book, book_fields):
        """
        Walks the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)

        # Generator yielding (start_element, text, end_element) triples in
        # document order; exactly one slot of each triple is non-None.
        # (``def walker(node):`` header elided in this excerpt.)
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                yield None, None, node

            # Tail text belongs to the parent, so yield it unconditionally.
            if node.tail is not None:
                yield None, node.tail, None

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                # NOTE(review): joins the enclosing ``content`` variable,
                # not the ``text`` argument -- verify this is intentional.
                text = filter(lambda s: s is not None, content)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator\
                #         and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            # Strip verse-end '/' markers at line ends.
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            # Build one Solr document for a part (header or fragment),
            # carrying the shared book_fields plus position info.
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():

            doc['header_index'] = fields["header_index"]
            # NOTE(review): and/or idiom -- yields 1 if header_span is 0.
            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # Store the raw text in the snippet file; the index only keeps
            # its (position, length) so snippets can be re-read at query time.
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))

        snippets = Snippets(book.id).open('w')

            # Walk top-level headers of the master element, tracking open
            # fragments across headers (``try`` frame elided in excerpt).
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                if header.tag is etree.Comment:

                # Default text sink: append to the current section's content
                # and to every currently-open fragment.
                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # Footnote start: divert text into a footnote buffer.
                    if start is not None and start.tag in self.footnote_tags:

                        def collect_footnote(t):
                        handle_text.append(collect_footnote)
                    # NOTE(review): ``footnote is not []`` is ALWAYS True
                    # (identity vs equality) -- likely meant ``footnote``.
                    elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                        # Index the completed footnote as its own part.
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        # Suppress the theme names themselves from the text.
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.

                        # Index the closed fragment spanning from its start
                        # section up to the current one.
                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])

                    # NOTE(review): ``handle_text is not []`` is ALWAYS True
                    # (identity vs equality) -- likely meant ``handle_text``.
                    if text is not None and handle_text is not []:
                        hdl = handle_text[-1]

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
class SearchResult(object):
    # NOTE(review): excerpt is elided -- the POSITION/FRAGMENT/SCORE/OTHER
    # index constants and several ``def``/decorator lines are missing.

    def __init__(self, doc, how_found=None, query_terms=None):
        self._processed_hits = None  # processed hits
        self.query_terms = query_terms
        # Score comes from Solr when present (guard elided in excerpt).
        self._score = doc['score']
        self.book_id = int(doc["book_id"])
        # published_date may be missing/invalid -- fall back to 0
        # (the try/except frame is elided in this excerpt).
        self.published_date = int(doc.get("published_date"))
        self.published_date = 0

        header_type = doc.get("header_type", None)
        # we have a content hit in some header of fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            # NOTE(review): and/or idiom -- a span of 0 would become 1.
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            # Hit tuple: (position triple, fragment anchor, score, extras).
            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])

            self._hits.append(hit)

    # Alternate constructor from a Book row (``@classmethod`` elided).
    def from_book(cls, book, how_found=None, query_terms=None):
            # Popularity count stands in for a relevance score.
            'score': book.popularity.count,
        result = cls(doc, how_found=how_found, query_terms=query_terms)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    # (``def __str__`` elided) Python 2 str repr delegates to __unicode__.
        return unicode(self).encode('utf-8')

    # (``score`` property def elided.)
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)

    # (``def get_book(self)`` elided) lazily fetch and cache the Book row.
        if self._book is not None:
        self._book = catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    # (``def process_hits(self)`` elided) -- merges raw hits into
    # displayable section/fragment entries, cached in _processed_hits.
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
            f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            # Keep, per key, the element winning the ``compare`` ordering
            # (accumulator dict and loop frame elided in this excerpt).
                    if compare(els[eif], e) >= 1:

        # remove fragments with duplicated fid's and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))

        # remove duplicate sections
            # (loop over ``sect`` elided) keep best-scoring hit per section.
            si = s[self.POSITION][self.POSITION_INDEX]
                if sections[si]['score'] >= s[self.SCORE]:

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
            m.update(s[self.OTHER])

        hits = sections.values()

            # (loop over ``frags`` + try frame elided.)
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    # NOTE(review): ``split(r' +')`` splits on the LITERAL
                    # two-char string " +", not a regex -- str.split takes a
                    # plain separator; probably ``re.split`` was intended.
                    tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                            themes_hit.add(f[self.OTHER]['themes'][i])

            # Resolve hit theme names back to Tag objects.
            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes_hit': themes_hit
            m.update(f[self.OTHER])

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

    # (``@staticmethod`` elided) merge per-book results across result lists.
    def aggregate(*result_lists):
        for rl in result_lists:
                if r.book_id in books:
                    books[r.book_id].merge(r)
        return books.values()

    def __cmp__(self, other):
        # Order primarily by score; ties broken by publish date below.
        c = cmp(self.score, other.score)
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)

    # (``def __len__(self)`` elided.)
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        # Missing hit or missing key both mean "no revision"
        # (the try frame and fallback return are elided in this excerpt).
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
class Search(SolrIndex):
    # NOTE(review): excerpt is elided -- several structural lines are
    # missing from the methods below; comments mark the gaps.

    def __init__(self, default_field="text"):
        # Read-only connection for querying.
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by boolean query.
        modal - applies to boolean query
        fuzzy - should the query by fuzzy.
        """
        # NOTE(review): ``query.split(r" ")`` -- the r-prefix is pointless
        # (str.split takes a literal separator, and r" " == " ").
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)

    def search_by_author(self, words):
        from catalogue.models import Book
        books = Book.objects.filter(parent=None).order_by('-popularity__count')
            # (loop over ``words`` elided) \m...\M are PostgreSQL word
            # boundary markers for the iregex lookup.
            books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    def search_words(self, words, fields, book=True):
        # Author-only searches go through the dedicated popularity-ordered
        # lookup instead of Solr.
        if book and fields == ['authors']:
            return self.search_by_author(words)
            # (loop over words/fields elided) build one OR-filter per word,
            # skipping stopwords for content searches.
            if book or (word not in stopwords):
                    q = self.index.Q(**{field: word})
                    if word_filter is None:
                filters.append(word_filter)
        # Restrict to book documents unless searching content parts
        # (the if/else frame is elided in this excerpt).
            query = self.index.query(is_book=True)
            query = self.index.query()
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        return [SearchResult(found, how_found='search_words', query_terms=words) for found in query.execute()]

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns a snippet for found scoreDoc.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum

            # (try frame + idx init elided) read stored snippet text and
            # let Solr highlight it against the query.
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                text = snippets.get((int(position),
                snip = self.index.highlight(text=text, field=field, q=query)
                # Deduplicate identical snippets.
                if snip not in snips:

            # (``except IOError`` handler elided) diagnose whether the book
            # is missing entirely or just its snippet file.
            book = catalogue.models.Book.objects.filter(id=book_id)
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))

        # remove verse end markers..
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips

    # (``@staticmethod`` decorator elided.)
    def apply_filters(query, filters):
        """
        Apply filters to a query
        """
        # Drop empty filters, then AND the rest onto the query
        # (the loop frame is elided in this excerpt).
        filters = filter(lambda x: x is not None, filters)
            query = query.query(f)
# When SEARCH_MOCK is enabled in settings, shadow the real Search class
# with a mock implementation (no Solr backend required, e.g. for tests).
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search