# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
from functools import reduce, total_ordering
from itertools import chain
import logging
import operator
import os
import re

from django.conf import settings
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import scorched

import catalogue.models
import picture.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from wolnelektury.utils import makedirs
from . import custom

log = logging.getLogger('search')


if os.path.isfile(settings.SOLR_STOPWORDS):
    with open(settings.SOLR_STOPWORDS) as f:
        stopwords = set(
            line.strip()
            for line in f if not line.startswith('#'))
else:
    stopwords = set()


class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)


class Snippets(object):
    """
    This class manages snippet files for indexed objects (books).
    The snippets are concatenated together; their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        # Snippets are stored as bytes; always work in binary mode.
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            # Don't overwrite an existing file: bump the revision instead.
            if os.path.exists(self.path):
                self.revision = 1
                while True:
                    if not os.path.exists(self.path):
                        break
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode snippet
        stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt
105 """Close snippet file"""


class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so reimplement it
        by deleting a list of uids instead.
        """
        uids = set()
        for q in queries:
            if isinstance(q, scorched.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            # FIXME: With the Solr API change, this doesn't work.
            # self.index.delete(uids)
            return True
        else:
            return False
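
    # Example usage (a sketch): drop every indexed document for one book,
    # the same way remove_book() below does:
    #
    #     idx = Index()
    #     idx.delete_query(idx.index.Q(book_id=123))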

    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # First, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all tags
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # Then add them back (all tags, or just the ones passed).
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                    }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                    }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                    }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """
        Remove a book from the search index.
        book_or_id - Book instance or book id.
        """
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Index the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # We don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index.
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # Don't index source_name - it's only used for extracting the publish date.
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
        }

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]

        self.index_content(book, book_fields=book_fields)
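
    # Example usage (a sketch; the commit call is an assumption based on the
    # scorched interface):
    #
    #     idx = Index()
    #     idx.index_book(book)
    #     idx.index.commit()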

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
    ]

    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
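    # The regex above captures a trailing year from source_name, e.g. for
    # "Czytelnik, Warszawa 1990." it captures "1990".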

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by
        field name.
        """
        fields = {}

        if book_info is None:
            with open(book.xml_file.path) as xml_file:
                book_info = dcparser.parse(xml_file)

        fields['slug'] = book.slug
        fields['is_book'] = True

        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # Since no type information is available, use the validator.
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = str(p)
                    else:
                        persons = ', '.join(map(str, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # Get the published date from source_name.
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces
    #     and returns it. This allows phrase queries which do not overlap the
    #     gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Return the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields):
        """
        Walk the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
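
        # walker() yields (start, text, end) events depth-first. For a node
        # like <strofa>Ala <slowo>ma</slowo> kota</strofa> it yields roughly:
        # (strofa, None, None), (None, 'Ala ', None), (slowo, None, None),
        # (None, 'ma', None), (None, None, slowo), (None, ' kota', None),
        # (None, None, strofa).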

        def fix_format(text):
            # separator = [" ", "\t", ".", ";", ","]
            if isinstance(text, list):
                # Need to join it first.
                text = filter(lambda s: s is not None, text)
                text = ' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator \
                #                 and text[i - 1][-1] not in separator:
                #             text.insert(i, " ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # Snippets.
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']

            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            return doc
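
        # For example, the whole-section document for header 5 of book 123
        # gets uid "part123-5-1-", while a fragment anchored at "f42" that
        # spans headers 5-7 gets uid "part123-5-3-f42".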

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # Section content.
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # Handle footnotes.
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # Handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # Themes for this fragment.
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(str, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # Collect content.

                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        hdl(text)

                # In the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
                self.index.add(doc)
        finally:
            snippets.close()

    def remove_picture(self, picture_or_id):
        """Remove a picture from the search index."""
        if isinstance(picture_or_id, picture.models.Picture):
            picture_id = picture_or_id.id
        else:
            picture_id = picture_or_id
        self.delete_query(self.index.Q(picture_id=picture_id))

    def index_picture(self, picture, picture_info=None, overwrite=True):
        """
        Index the picture.
        Creates a Lucene document for the extracted metadata and calls
        self.index_area() to index the picture's areas.
        """
        if overwrite:
            # We don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index.
            self.remove_picture(picture)

        picture_doc = {'picture_id': int(picture.id)}
        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
            'authors', 'title', 'epochs', 'kinds', 'genres'])

        picture_doc.update(meta_fields)

        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
        self.index.add(picture_doc)
        del picture_doc['is_book']
        for area in picture.areas.all():
            self.index_area(area, picture_fields=picture_doc)

    def index_area(self, area, picture_fields):
        """
        Index themes and objects on the area.
        """
        doc = dict(picture_fields)
        doc['area_id'] = area.id
        doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
        doc['uid'] = 'area%s' % area.id
        self.index.add(doc)
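
    # Example usage (a sketch): index a picture together with all its areas:
    #
    #     Index().index_picture(some_picture)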


@total_ordering
class SearchResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # Content hits.
        header_type = doc.get("header_type", None)
        # We have a content hit in some header or fragment.
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = int(header_span) if header_span is not None else 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
            })

            self._hits.append(hit)

    @classmethod
    def from_book(cls, book, how_found=None, query_terms=None):
        doc = {
            'score': book.popularity.count,
            'book_id': book.id,
            'published_date': 0,
        }
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        result._book = book
        return result

    def __str__(self):
        return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __bytes__(self):
        return str(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError(
                "this search result is for book %d; tried to merge with %d"
                % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    def get_book(self):
        if self._book is not None:
            return self._book
        try:
            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
        except catalogue.models.Book.DoesNotExist:
            self._book = None
        return self._book

    book = property(get_book)

    # Indices into the hit tuple: (position, fragment, score, other),
    # where position is (header_type, header_index, header_span).
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # Split hits into fragments and sections. Use lists, not lazy
        # filter objects, since frags is iterated more than once below.
        frags = [hit for hit in self._hits if hit[self.FRAGMENT] is not None]
        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        # Keep only sections not covered by any fragment.
        sect = [
            s for s in sect
            if not any(
                f[self.POSITION][self.POSITION_INDEX]
                <= s[self.POSITION][self.POSITION_INDEX]
                < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN]
                for f in frags)]

        def remove_duplicates(lst, keyfn, larger):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if larger(els[eif], e):
                        continue
                els[eif] = e
            return els.values()

        # Remove fragments with duplicated fids, keeping the higher score.
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])

        # Remove duplicate sections.
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # Skip an existing section unless this one scores higher.
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = list(sections.values())

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # The index may contain stale fragment info.
                continue

            # Figure out if we were searching for a token matching some word in a theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = list(map(str.lower, tms))
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = list(filter(lambda t: t.name == n, themes))
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit,
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()
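
    # Example usage (a sketch): merge result lists from several queries so
    # that each book appears once, with scores summed by merge():
    #
    #     merged = SearchResult.aggregate(title_hits, text_hits)
    #     for result in sorted(merged):
    #         print(result.book_id, result.score)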

    def get_sort_key(self):
        return (-self.score,
                self.published_date,
                self.book.sort_key_author if self.book else '',
                self.book.sort_key if self.book else '')

    def __lt__(self, other):
        # Note: deliberately reversed relative to get_sort_key().
        return self.get_sort_key() > other.get_sort_key()

    def __eq__(self, other):
        return self.get_sort_key() == other.get_sort_key()

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None


@total_ordering
class PictureResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self.query_terms = query_terms
        self._picture = None
        self._hits = []
        self._processed_hits = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.picture_id = int(doc["picture_id"])

        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            })

            self._hits.append(hit)

    def __str__(self):
        return "<PR id=%d score=%f >" % (self.picture_id, self._score)

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.picture_id != other.picture_id:
            raise ValueError(
                "this search result is for picture %d; tried to merge with %d"
                % (self.picture_id, other.picture_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    # Indices into the hit tuple: (score, other).
    SCORE = 0
    OTHER = 1

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        hits = []
        for hit in self._hits:
            try:
                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # The index may contain stale area info.
                continue

            # Figure out if we were searching for a token matching some word in a theme name.
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(hit[self.OTHER]['themes'])):
                    tms = re.split(r' +', hit[self.OTHER]['themes'][i]) + hit[self.OTHER]['themes_pl'][i].split(' ')
                    tms = list(map(str.lower, tms))
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(hit[self.OTHER]['themes'][i])
                            break

            m = {
                'score': hit[self.SCORE],
                'area': area,
                'themes_hit': themes_hit,
            }
            m.update(hit[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)
        hits = hits[:1]
        self._processed_hits = hits
        return hits

    def get_picture(self):
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        return self._picture

    picture = property(get_picture)

    @staticmethod
    def aggregate(*result_lists):
        pictures = {}
        for rl in result_lists:
            for r in rl:
                if r.picture_id in pictures:
                    pictures[r.picture_id].merge(r)
                else:
                    pictures[r.picture_id] = r
        return pictures.values()

    def __lt__(self, other):
        return self.score < other.score

    def __eq__(self, other):
        return self.score == other.score


class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Return a query built from single-term queries, joined by the given
        boolean operator.
        modal - the operator (operator.or_ / operator.and_) used to join
        the term queries.
        """
        if query is None:
            query = ''
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(" ")), q)

        return q
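
    # For example (a sketch): make_term_query('dwa slowa') builds roughly
    # Q(text='dwa') | Q(text='slowa'); with modal=operator.and_, all terms
    # must match.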

    def search_by_author(self, words):
        from catalogue.models import Book
        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
        for word in words:
            # \m and \M match word boundaries in PostgreSQL regular expressions.
            books = books.filter(cached_author__iregex=r'\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    def search_words(self, words, fields, required=None, book=True, picture=False):
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)

        filters = []
        for word in words:
            if book or picture or (word not in stopwords):
                word_filter = None
                for field in fields:
                    q = self.index.Q(**{field: word})
                    if word_filter is None:
                        word_filter = q
                    else:
                        word_filter |= q
                filters.append(word_filter)

        if required:
            required_filter = None
            for field in required:
                for word in words:
                    if book or picture or (word not in stopwords):
                        q = self.index.Q(**{field: word})
                        if required_filter is None:
                            required_filter = q
                        else:
                            required_filter |= q
            filters.append(required_filter)

        if not filters:
            return []

        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0

        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
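
    # Example usage (a sketch): a full-text book search; each word becomes a
    # filter ORed over the given fields, and all filters must match:
    #
    #     results = Search().search_words(['pan', 'tadeusz'], fields=['text'])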

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Return highlighted snippets for a search result.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1
                idx += 1
        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
            if not book:
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # Remove verse-end markers.
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips

        return snips
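
    # Example usage (a sketch): fetch one highlighted snippet for the top hit:
    #
    #     search = Search()
    #     results = search.search_words(['kot'], fields=['text'])
    #     if results:
    #         snips = search.get_snippets(results[0], 'kot', num=1)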

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query


if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search  # noqa