# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
from functools import reduce, total_ordering
from itertools import chain
import logging
import operator
import os
import re

from django.conf import settings
from librarian import dcparser
import librarian.meta.types.date
import librarian.meta.types.person
import librarian.meta.types.text
from librarian.parser import WLDocument
from lxml import etree
import scorched

import catalogue.models
import picture.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from wolnelektury.utils import makedirs
from . import custom

log = logging.getLogger('search')

if os.path.isfile(settings.SOLR_STOPWORDS):
    stopwords = set(
        line.strip()
        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
else:
    stopwords = set()


class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)


class Snippets(object):
    """
    This class manages snippet files for indexed objects (books):
    the snippets are concatenated together, and their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            if os.path.exists(self.path):
                self.revision = 1
                while True:
                    if not os.path.exists(self.path):
                        break
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length), return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        try:
            txt = self.file.read(pos[1]).decode('utf-8')
        except UnicodeDecodeError:
            return ''
        return txt

    def close(self):
        """Close snippet file."""
        if self.file:
            self.file.close()

    def remove(self):
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass

class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def remove_snippets(self, book):
        book.snippet_set.all().delete()

    def add_snippet(self, book, doc):
        assert book.id == doc.pop('book_id')
        # Fragments already exist and can be indexed where they live.
        if 'fragment_anchor' in doc:
            return

        text = doc.pop('text')
        header_index = doc.pop('header_index')
        book.snippet_set.create(
            sec=header_index,
            text=text,
        )

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, scorched.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            # FIXME: With the Solr API change, this doesn't work.
            # self.index.delete(uids)
            return True
        else:
            return False

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # First, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all tags
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # Then add them (all, or just the ones passed).
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                    }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                    }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                    }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc

    def remove_book(self, book, remove_snippets=True, legacy=True):
        """Removes a book from the search index.
        book - Book instance."""
        if legacy:
            self.delete_query(self.index.Q(book_id=book.id))

            if remove_snippets:
                snippets = Snippets(book.id)
                snippets.remove()
        self.remove_snippets(book)

    def index_book(self, book, book_info=None, overwrite=True, legacy=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if not book.xml_file:
            return

        if overwrite:
            # We don't remove snippets here, since they might still be needed
            # by threads using a not-yet-reopened index.
            self.remove_book(book, remove_snippets=False, legacy=legacy)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # Let's not index source_name itself; it's only used for extracting
        # the publish date.
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        if legacy:
            self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
        }

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]

        self.index_content(book, book_fields=book_fields, legacy=legacy)

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
    ]

    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r'([0-9]+)[\]. ]*$')

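    # The publication year is scraped from the tail of the Dublin Core
    # `source_name` field, e.g.:
    #
    #     published_date_re.search('Czytelnik, Warszawa 1990').group(1)  # '1990'
    #     published_date_re.search('PIW, Warszawa 1978.').group(1)       # '1978'
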
    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by
        field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path, 'rb'))

        fields['slug'] = book.slug
        fields['is_book'] = True

        # Dublin Core fields.
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                type_indicator = field.value_type
                if issubclass(type_indicator, librarian.meta.types.text.TextValue):
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif issubclass(type_indicator, librarian.meta.types.person.Person):
                    p = getattr(book_info, field.name)
                    if isinstance(p, librarian.meta.types.person.Person):
                        persons = str(p)
                    else:
                        persons = ', '.join(map(str, p))
                    fields[field.name] = persons
                elif issubclass(type_indicator, librarian.meta.types.date.DateValue):
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # Get the published date.
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed
    #     spaces, and returns it. This allows for doing phrase queries which
    #     do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields, legacy=True):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            # separator = [" ", "\t", ".", ";", ","]
            if isinstance(text, list):
                # Need to join it first; filter the function's own argument,
                # not the enclosing `content` list.
                text = filter(lambda s: s is not None, text)
                text = ' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator\
                #                 and text[i - 1][-1] not in separator:
                #             text.insert(i, " ")

            return re.sub("(?m)/$", "", text)

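        # For example, fix_format("Litwo! Ojczyzno moja!/\nTy jesteś jak zdrowie./")
        # strips the verse-end slashes, yielding
        # "Litwo! Ojczyzno moja!\nTy jesteś jak zdrowie."
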
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # Snippet coordinates.
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            return doc

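        # A part document produced by add_part() looks roughly like this
        # (field values hypothetical):
        #
        #     {'book_id': 123, 'title': ..., 'authors': ...,
        #      'header_index': 4, 'header_span': 1, 'header_type': 'strofa',
        #      'text': '...', 'snippets_position': 0, 'snippets_length': 57,
        #      'uid': 'part123-4-1-'}
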
        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # Section content.
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # Handle footnotes.
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=''.join(footnote))
                        self.add_snippet(book, doc)
                        if legacy:
                            self.index.add(doc)
                        footnote = []

                    # Handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # Themes for this fragment.
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        # Add searchable fragment.
                        self.add_snippet(book, doc)
                        if legacy:
                            self.index.add(doc)

                    # Collect content.
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        hdl(text)

                # In the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
                self.add_snippet(book, doc)
                if legacy:
                    self.index.add(doc)

        finally:
            snippets.close()

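    # In short: every direct child of the master tag becomes one section
    # document, and every fragment delimited by <begin>/<end> nodes becomes
    # an additional document whose header_span covers all the sections it
    # crosses, with its <motyw> themes attached.
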
    def remove_picture(self, picture_or_id):
        """Removes a picture from the search index."""
        if isinstance(picture_or_id, picture.models.Picture):
            picture_id = picture_or_id.id
        else:
            picture_id = picture_or_id
        self.delete_query(self.index.Q(picture_id=picture_id))

    def index_picture(self, picture, picture_info=None, overwrite=True):
        """
        Indexes the picture.
        Creates a Lucene document for the extracted metadata
        and calls self.index_area() to index the contents of the picture.
        """
        if overwrite:
            # We don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index.
            self.remove_picture(picture)

        picture_doc = {'picture_id': int(picture.id)}
        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
            'authors', 'title', 'epochs', 'kinds', 'genres'])

        picture_doc.update(meta_fields)

        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
        self.index.add(picture_doc)
        del picture_doc['is_book']
        for area in picture.areas.all():
            self.index_area(area, picture_fields=picture_doc)

    def index_area(self, area, picture_fields):
        """
        Indexes themes and objects on the area.
        """
        doc = dict(picture_fields)
        doc['area_id'] = area.id
        doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
        doc['uid'] = 'area%s' % area.id
        self.index.add(doc)


@total_ordering
class SearchResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # Content hits.
        header_type = doc.get("header_type", None)
        # We have a content hit in some header or fragment.
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = int(header_span or 1)
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
            })

            self._hits.append(hit)

    @classmethod
    def from_book(cls, book, how_found=None, query_terms=None):
        doc = {
            'score': book.popularity.count,
            'book_id': book.id,
            'published_date': 0,
        }
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        result._book = book
        return result

    def __str__(self):
        return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __bytes__(self):
        return str(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError(
                "this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    def get_book(self):
        if self._book is not None:
            return self._book
        try:
            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
        except catalogue.models.Book.DoesNotExist:
            self._book = None
        return self._book

    book = property(get_book)

    # Tuple indices into raw hits.
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # Split into sections and fragments. Lists, not generators: `frags`
        # is iterated once per section below and reused afterwards.
        frags = [hit for hit in self._hits if hit[self.FRAGMENT] is not None]

        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        # Keep only sections not covered by fragments.
        sect = [s for s in sect if 0 == len([
            f for f in frags
            if f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
            f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN]])]
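        # E.g. a fragment hit starting at section 2 with span 2 covers
        # sections 2-3, so a plain section hit at index 3 is dropped here;
        # the fragment already represents that part of the text.
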
        def remove_duplicates(lst, keyfn, larger):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if larger(els[eif], e):
                        continue
                els[eif] = e
            return els.values()

        # Remove fragments with duplicated fids and duplicated snippets.
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])

        # Remove duplicate sections.
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # Skip an existing section if it is ranked higher.
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = list(sections.values())

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # The index may have more entries than the database.
                continue

            # Figure out if we were searching for a token matching some word in a theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = list(map(str.lower, tms))
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = list(filter(lambda t: t.name == n, themes))
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

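    # Hedged sketch: collapse per-query hit lists into one result per book
    # (merge() sums the scores), then order with the sort key defined below:
    #
    #     results = SearchResult.aggregate(title_hits, content_hits)
    #     for r in sorted(results):
    #         ...
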
    def get_sort_key(self):
        return (-self.score,
                self.published_date,
                self.book.sort_key_author if self.book else '',
                self.book.sort_key if self.book else '')

    def __lt__(self, other):
        return self.get_sort_key() > other.get_sort_key()

    def __eq__(self, other):
        return self.get_sort_key() == other.get_sort_key()

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None


class PictureResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self.query_terms = query_terms
        self._picture = None
        self._hits = []
        self._processed_hits = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.picture_id = int(doc["picture_id"])

        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            })

            self._hits.append(hit)

830 return "<PR id=%d score=%f >" % (self.picture_id, self._score)
837 return self._score * self.boost
839 def merge(self, other):
840 if self.picture_id != other.picture_id:
842 "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
843 self._hits += other._hits
844 self._score += max(other._score, 0)
    SCORE = 0
    OTHER = 1

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        hits = []
        for hit in self._hits:
            try:
                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # The index may have more entries than the database.
                continue

            # Figure out if we were searching for a token matching some word in a theme name.
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(hit[self.OTHER]['themes'])):
                    tms = re.split(r' +', hit[self.OTHER]['themes'][i]) + hit[self.OTHER]['themes_pl'][i].split(' ')
                    tms = list(map(str.lower, tms))
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(hit[self.OTHER]['themes'][i])
                            break

            m = {
                'score': hit[self.SCORE],
                'area': area,
                'themes_hit': themes_hit,
            }
            m.update(hit[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)
        self._processed_hits = hits
        return hits

    def get_picture(self):
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        return self._picture

    picture = property(get_picture)

    @staticmethod
    def aggregate(*result_lists):
        pictures = {}
        for rl in result_lists:
            for r in rl:
                if r.picture_id in pictures:
                    pictures[r.picture_id].merge(r)
                else:
                    pictures[r.picture_id] = r
        return pictures.values()

    def __lt__(self, other):
        return self.score < other.score

    def __eq__(self, other):
        return self.score == other.score


class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by the boolean query.
        modal - applies to the boolean query.
        """
        if query is None:
            query = ''
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(" ")), q)

        return q

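    # E.g. make_term_query("dwa slowa") ORs together Q(text='dwa') and
    # Q(text='slowa'); pass modal=operator.and_ to require every term instead.
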
    def search_by_author(self, words):
        from catalogue.models import Book
        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
        for word in words:
            books = books.filter(cached_author__iregex=r'\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    def search_words(self, words, fields, required=None, book=True, picture=False):
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)

        filters = []
        for word in words:
            if book or picture or (word not in stopwords):
                word_filter = None
                for field in fields:
                    q = self.index.Q(**{field: word})
                    if word_filter is None:
                        word_filter = q
                    else:
                        word_filter |= q
                filters.append(word_filter)
        if required:
            required_filter = None
            for field in required:
                for word in words:
                    if book or picture or (word not in stopwords):
                        q = self.index.Q(**{field: word})
                        if required_filter is None:
                            required_filter = q
                        else:
                            required_filter |= q
            filters.append(required_filter)
        if not filters:
            return []

        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words)
                for found in query.execute()]

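    # Hedged usage sketch (field names as indexed above):
    #
    #     search = Search()
    #     results = search.search_words(['pan', 'tadeusz'], ['title', 'text'],
    #                                   required=['title'])
    #     merged = SearchResult.aggregate(results)
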
    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns a snippet for a found scoreDoc.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    break  # no stored snippet coordinates for this hit
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1
                idx += 1

        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
            if not book:
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # Remove verse end markers.
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips

        return snips

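    # Typical flow (sketch): after a search, attach highlighted snippets to
    # the top results before rendering:
    #
    #     for result in merged:
    #         search.get_snippets(result, query, num=3)
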
    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query


if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search