# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from functools import reduce, total_ordering
from itertools import chain
import logging
import operator
import os
import re

from django.conf import settings
from librarian import dcparser
import librarian.meta.types.date
import librarian.meta.types.person
import librarian.meta.types.text
from librarian.parser import WLDocument
from lxml import etree
import scorched
import catalogue.models
import picture.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from wolnelektury.utils import makedirs
from . import custom

log = logging.getLogger('search')

if os.path.isfile(settings.SOLR_STOPWORDS):
    with open(settings.SOLR_STOPWORDS) as f:
        stopwords = set(
            line.strip()
            for line in f if not line.startswith('#'))
else:
    stopwords = set()


class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
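

# Subclasses pick the access mode: Index (below) opens the Solr interface
# in 'rw' mode for writing, Search opens it read-only. A minimal,
# illustrative sketch of direct read access (assuming settings.SOLR is
# configured and the core is populated):
#
#     si = SolrIndex(mode='r').index
#     docs = si.query(is_book=True).execute()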


class Snippets(object):
    """
    Manages snippet files for an indexed object (book).

    Snippets are concatenated into a single file per book; their positions
    and lengths are stored in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """Open the snippet file. Call .close() afterwards."""
        if 'b' not in mode:
            mode += 'b'
        if 'w' in mode:
            # Never overwrite an existing file: bump the revision until we
            # find an unused path.
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1
        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the snippet stored there,
        decoded to str.
        """
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')
108 """Close snippet file"""


class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so reimplement it
        by collecting the matching uids and deleting those.
        """
        uids = set()
        for q in queries:
            if isinstance(q, scorched.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            # FIXME: With the Solr API change, this doesn't work.
            # self.index.delete(uids)
            return True
        return False

    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # First, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all tags
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # Then add them (all, or just the ones passed).
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                    }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                    }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                    }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Remove a book from the search index.
        book_or_id - Book instance or book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()
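
    # Usage sketch (assuming a book with id=1 is indexed): the default
    # removes both the index documents and the snippet file, while
    # remove_snippets=False keeps snippets for readers of the old index:
    #
    #     Index().remove_book(1)
    #     Index().remove_book(1, remove_snippets=False)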

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # We don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index.
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # Don't index source_name; it's only used for extracting the publish date.
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
        }

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]

        self.index_content(book, book_fields=book_fields)
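
    # Typical (re)indexing call, sketched; assumes `book` is a
    # catalogue.models.Book with an attached XML file:
    #
    #     Index().index_book(book)
    #
    # With overwrite=True (the default) existing documents for the book are
    # deleted first, but the snippet file is kept for readers of the old
    # index generation.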

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
    ]

    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by
        field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                type_indicator = field.value_type
                if issubclass(type_indicator, librarian.meta.types.text.TextValue):
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif issubclass(type_indicator, librarian.meta.types.person.Person):
                    p = getattr(book_info, field.name)
                    if isinstance(p, librarian.meta.types.person.Person):
                        persons = str(p)
                    else:
                        persons = ', '.join(map(str, p))
                    fields[field.name] = persons
                elif issubclass(type_indicator, librarian.meta.types.date.DateValue):
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # Get the published date from the rightmost number in source_name.
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces
    #     and returns it. This allows for doing phrase queries which do not
    #     overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Return the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            # separator = [" ", "\t", ".", ";", ","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = ' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator \
                #                 and text[i - 1][-1] not in separator:
                #             text.insert(i, " ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            return doc

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(str, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # collect the text
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
                self.index.add(doc)
        finally:
            snippets.close()
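
    # Each part document gets a uid of the form
    # "part<book_id>-<header_index>-<header_span>-<fragment_anchor>",
    # e.g. "part123-4-1-" (illustrative ids), so re-indexing a book
    # overwrites its old part documents instead of duplicating them.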

    def remove_picture(self, picture_or_id):
        """Remove a picture from the search index."""
        if isinstance(picture_or_id, picture.models.Picture):
            picture_id = picture_or_id.id
        else:
            picture_id = picture_or_id
        self.delete_query(self.index.Q(picture_id=picture_id))

    def index_picture(self, picture, picture_info=None, overwrite=True):
        """
        Indexes the picture.
        Creates a Lucene document for extracted metadata
        and calls self.index_area() to index the areas of the picture.
        """
        if overwrite:
            # We don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index.
            self.remove_picture(picture)

        picture_doc = {'picture_id': int(picture.id)}
        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
            'authors', 'title', 'epochs', 'kinds', 'genres'])

        picture_doc.update(meta_fields)

        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
        self.index.add(picture_doc)
        del picture_doc['is_book']
        for area in picture.areas.all():
            self.index_area(area, picture_fields=picture_doc)

    def index_area(self, area, picture_fields):
        """
        Indexes themes and objects on the area.
        """
        doc = dict(picture_fields)
        doc['area_id'] = area.id
        doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
        doc['uid'] = 'area%s' % area.id
        self.index.add(doc)
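
    # A resulting area document is flat; an illustrative (made-up) example:
    #
    #     {'picture_id': 7, 'title': 'Autoportret', 'area_id': 42,
    #      'themes': ['Dom', 'Drzewo'], 'uid': 'area42'}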


@total_ordering
class SearchResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (TypeError, ValueError):
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = int(header_span) if header_span is not None else 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
            })

            self._hits.append(hit)

    @classmethod
    def from_book(cls, book, how_found=None, query_terms=None):
        doc = {
            'score': book.popularity.count,
            'book_id': book.id,
            'published_date': 0,
        }
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        result._book = book
        return result
621 return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
622 (self.book_id, len(self._hits),
623 len(self._processed_hits) if self._processed_hits else -1,
624 self._score, len(self.snippets))
627 return str(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError(
                "this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    def get_book(self):
        if self._book is not None:
            return self._book
        try:
            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
        except catalogue.models.Book.DoesNotExist:
            self._book = None
        return self._book

    book = property(get_book)

    # Tuple indices into self._hits entries:
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # split into fragments and sections
        frags = [hit for hit in self._hits if hit[self.FRAGMENT] is not None]
        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        def remove_duplicates(lst, keyfn, larger):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els and larger(els[eif], e):
                    continue
                els[eif] = e
            return list(els.values())

        # remove fragments with duplicated fids and duplicated snippets,
        # keeping the higher score
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])

        # keep only sections not covered by any fragment
        sect = [s for s in sect if not any(
            f[self.POSITION][self.POSITION_INDEX]
            <= s[self.POSITION][self.POSITION_INDEX]
            < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN]
            for f in frags)]

        # remove duplicate sections, keeping the higher score
        sections = {}
        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            if si in sections and sections[si]['score'] >= s[self.SCORE]:
                continue
            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = list(sections.values())

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word
            # in a theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = [t.lower() for t in tms]
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = list(filter(lambda t: t.name == n, themes))
                return th[0] if th else None
            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()
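
    # Sketch of combining result lists from several queries (assuming
    # `search = Search()`); merge() sums scores and concatenates hits of
    # results that point at the same book:
    #
    #     results = SearchResult.aggregate(
    #         search.search_words(['pan'], ['title']),
    #         search.search_words(['pan'], ['text']))
    #     for r in results:
    #         print(r.book_id, r.score)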

    def get_sort_key(self):
        return (-self.score,
                self.published_date,
                self.book.sort_key_author if self.book else '',
                self.book.sort_key if self.book else '')

    def __lt__(self, other):
        return self.get_sort_key() > other.get_sort_key()

    def __eq__(self, other):
        return self.get_sort_key() == other.get_sort_key()

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None


@total_ordering
class PictureResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self.query_terms = query_terms
        self._picture = None
        self._hits = []
        self._processed_hits = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.picture_id = int(doc["picture_id"])

        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            })

            self._hits.append(hit)
808 return "<PR id=%d score=%f >" % (self.picture_id, self._score)
815 return self._score * self.boost

    def merge(self, other):
        if self.picture_id != other.picture_id:
            raise ValueError(
                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    # Tuple indices into self._hits entries:
    SCORE = 0
    OTHER = 1

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        hits = []
        for hit in self._hits:
            try:
                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word
            # in a theme name.
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(hit[self.OTHER]['themes'])):
                    tms = re.split(r' +', hit[self.OTHER]['themes'][i]) + hit[self.OTHER]['themes_pl'][i].split(' ')
                    tms = [t.lower() for t in tms]
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(hit[self.OTHER]['themes'][i])
                            break

            m = {
                'score': hit[self.SCORE],
                'area': area,
                'themes_hit': themes_hit,
            }
            m.update(hit[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)
        self._processed_hits = hits
        return hits

    def get_picture(self):
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        return self._picture

    picture = property(get_picture)

    @staticmethod
    def aggregate(*result_lists):
        pictures = {}
        for rl in result_lists:
            for r in rl:
                if r.picture_id in pictures:
                    pictures[r.picture_id].merge(r)
                else:
                    pictures[r.picture_id] = r
        return pictures.values()

    def __lt__(self, other):
        return self.score < other.score

    def __eq__(self, other):
        return self.score == other.score


class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Return term queries joined into a boolean query.
        modal - operator used to join the term queries.
        """
        if query is None:
            query = ''
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(' ')), q)
        return q

    def search_by_author(self, words):
        from catalogue.models import Book
        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
        for word in words:
            # \m and \M match word boundaries in PostgreSQL regexes.
            books = books.filter(cached_author__iregex=r'\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words)
                for book in books[:30]]

    def search_words(self, words, fields, required=None, book=True, picture=False):
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)

        filters = []
        for word in words:
            if book or picture or (word not in stopwords):
                word_filter = None
                for field in fields:
                    q = self.index.Q(**{field: word})
                    if word_filter is None:
                        word_filter = q
                    else:
                        word_filter |= q
                filters.append(word_filter)

        if required:
            required_filter = None
            for field in required:
                for word in words:
                    if book or picture or (word not in stopwords):
                        q = self.index.Q(**{field: word})
                        if required_filter is None:
                            required_filter = q
                        else:
                            required_filter |= q
            filters.append(required_filter)

        if not filters:
            return []

        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
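
    # Example query flow, sketched (field names as used above):
    #
    #     search = Search()
    #     book_hits = search.search_words(['lalka'], ['title', 'text'])
    #     picture_hits = search.search_words(['dom'], ['themes'], book=False, picture=True)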

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Return a snippet for a found scoreDoc.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1
                idx += 1
        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
            if not book:
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips

        return snips
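
    # Sketch: fetch one highlighted snippet for each book result (assumes
    # `search` and `book_hits` from the search_words example above, with
    # 'lalka' as the query text):
    #
    #     for r in book_hits:
    #         snippet = search.get_snippets(r, 'lalka', num=1)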

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query
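
    # Each filter is ANDed into the query (chained .query() calls in
    # scorched are conjunctive), while each word filter built in
    # search_words() is an OR across fields; the net semantics is that
    # every word must appear in at least one of the requested fields.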


if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search