1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 from functools import reduce, total_ordering
5 from itertools import chain
10 from django.conf import settings
11 from librarian import dcparser
12 import librarian.meta.types.person
13 import librarian.meta.types.text
14 from librarian.parser import WLDocument
15 from lxml import etree
17 import catalogue.models
19 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
20 from wolnelektury.utils import makedirs
# Module-level logger for the search subsystem.
23 log = logging.getLogger('search')
# Build the stopword set from the Solr stopwords file, one word per line,
# skipping '#' comment lines — only when the file actually exists.
# NOTE(review): sampled excerpt — the head of this assignment (the
# `stopwords = set(...)` opening the generator expression) is not visible here.
26 if os.path.isfile(settings.SOLR_STOPWORDS):
29 for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
# Thin base class owning a Solr connection; subclasses choose read ('r')
# or read-write ('rw') mode via `mode`.
34 class SolrIndex(object):
35 def __init__(self, mode=None):
# `custom.CustomSolrInterface` is a project wrapper around the Solr client,
# configured from settings.SOLR.
36 self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
# On-disk snippet storage for one indexed book: snippet texts are appended
# to a single file and addressed by (position, length) pairs which are kept
# in the search index.  NOTE(review): sampled excerpt — several lines of
# this class are not visible here.
39 class Snippets(object):
41 This class manages snippet files for indexed object (book)
42 the snippets are concatenated together, and their positions and
43 lengths are kept in lucene index fields.
# Subdirectory of settings.SEARCH_INDEX holding the snippet files.
45 SNIPPET_DIR = "snippets"
# book_id: id of the indexed book; revision: optional index revision that
# becomes part of the file name, allowing several revisions to coexist.
47 def __init__(self, book_id, revision=None):
48 makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
49 self.book_id = book_id
50 self.revision = revision
# File name is "<book_id>.<revision>" when a revision is set, plain
# "<book_id>" otherwise.
57 fn = "%d.%d" % (self.book_id, self.revision)
59 fn = "%d" % self.book_id
61 return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
# Open the snippet file; the caller is responsible for calling .close().
63 def open(self, mode='r'):
65 Open the snippet file. Call .close() afterwards.
71 if os.path.exists(self.path):
74 if not os.path.exists(self.path):
78 self.file = open(self.path, mode)
# Append a snippet, UTF-8 encoded, and return its (position, length)
# address.  Positions and lengths are measured in BYTES, not characters.
82 def add(self, snippet):
84 Append a snippet (unicode) to the snippet file.
85 Return a (position, length) tuple
87 txt = snippet.encode('utf-8')
90 pos = (self.position, l)
# Read back and decode the snippet stored at pos=(byte position, byte length).
96 Given a tuple of (position, length) return an unicode
97 of the snippet stored there.
99 self.file.seek(pos[0], 0)
101 txt = self.file.read(pos[1]).decode('utf-8')
107 """Close snippet file"""
# Book indexer: writes book metadata documents, per-section/per-fragment
# content documents, and their snippet files.
123 class Index(SolrIndex):
125 Class indexing books.
# Open the Solr interface in read-write mode.
128 super(Index, self).__init__(mode='rw')
def remove_snippets(self, book):
    """Delete every snippet record stored for the given book."""
    stored = book.snippet_set.all()
    stored.delete()
# Create a snippet row for the part document *doc*, consuming (popping) the
# fields it stores.  Fragment parts are skipped — their text already lives
# in the Fragment model.  NOTE(review): sampled excerpt — the early-return
# for the fragment case and the create() keyword arguments are not visible.
133 def add_snippet(self, book, doc):
# Sanity check: the doc must belong to this book; also removes book_id
# from the doc so it is not duplicated in the snippet row.
134 assert book.id == doc.pop('book_id')
135 # Fragments already exist and can be indexed where they live.
136 if 'fragment_anchor' in doc:
139 text = doc.pop('text')
140 header_index = doc.pop('header_index')
141 book.snippet_set.create(
# Delete all documents matching *queries*.  scorched's delete-by-query is
# broken, so this collects matching uids page by page and deletes by uid.
# NOTE(review): sampled excerpt — the pagination loop head and the actual
# uid collection are not visible here.
146 def delete_query(self, *queries):
148 index.delete(queries=...) doesn't work, so let's reimplement it
149 using deletion of list of uids.
# Accept both raw Lucene queries and prepared query objects.
153 if isinstance(q, scorched.search.LuceneQuery):
154 q = self.index.query(q)
# Fetch only the 'uid' field — ids are all that deletion needs.
155 q.field_limiter.update(['uid'])
159 ids = q.paginate(start=st, rows=rows).execute()
166 # FIXME: With Solr API change, this doesn't work.
167 #self.index.delete(uids)
# Rebuild the global tag index: remove matching tag documents, then re-add
# them (unless remove_only=True).  Handles catalogue Tags plus pdcounter
# authors and book stubs.  NOTE(review): sampled excerpt — loop heads and
# the self.index.add(...) calls are not visible here.
172 def index_tags(self, *tags, **kw):
174 Re-index global tag list.
175 Removes all tags from index, then index them again.
176 Indexed fields include: id, name (with and without polish stems), category
178 log.debug("Indexing tags")
179 remove_only = kw.get('remove_only', False)
180 # first, remove tags from index.
# When explicit tags were passed: build one (id AND category) deletion
# query per tag.
184 q_id = self.index.Q(tag_id=tag.id)
186 if isinstance(tag, PDCounterAuthor):
187 q_cat = self.index.Q(tag_category='pd_author')
188 elif isinstance(tag, PDCounterBook):
189 q_cat = self.index.Q(tag_category='pd_book')
191 q_cat = self.index.Q(tag_category=tag.category)
193 q_id_cat = self.index.Q(q_id & q_cat)
194 tag_qs.append(q_id_cat)
195 self.delete_query(*tag_qs)
# When no tags were passed: wipe every document carrying a tag_id.
197 q = self.index.Q(tag_id__any=True)
201 # then add them [all or just one passed]
# 'set' tags are users' private shelves and are never indexed.
204 catalogue.models.Tag.objects.exclude(category='set'),
205 PDCounterAuthor.objects.all(),
206 PDCounterBook.objects.all())
# pdcounter entries are flagged is_pdcounter and get distinct uid suffixes
# ("tagN_pd_a" / "tagN_pd_b") so they cannot collide with catalogue tags.
209 if isinstance(tag, PDCounterAuthor):
211 "tag_id": int(tag.id),
212 "tag_name": tag.name,
213 "tag_name_pl": tag.name,
214 "tag_category": 'pd_author',
215 "is_pdcounter": True,
216 "uid": "tag%d_pd_a" % tag.id
# BookStubs have a title, not a name.
218 elif isinstance(tag, PDCounterBook):
220 "tag_id": int(tag.id),
221 "tag_name": tag.title,
222 "tag_name_pl": tag.title,
223 "tag_category": 'pd_book',
224 "is_pdcounter": True,
225 "uid": "tag%d_pd_b" % tag.id
# Plain catalogue tag.
229 "tag_id": int(tag.id),
230 "tag_name": tag.name,
231 "tag_name_pl": tag.name,
232 "tag_category": tag.category,
233 "is_pdcounter": False,
234 "uid": "tag%d" % tag.id
# Build the base search document for a book: its id, plus parent_id when
# the book is a child (e.g. a volume of a larger work).
# NOTE(review): sampled excerpt — the `return doc` line is not visible here.
238 def create_book_doc(self, book):
240 Create a lucene document referring book id.
242 doc = {'book_id': int(book.id)}
243 if book.parent is not None:
244 doc['parent_id'] = int(book.parent.id)
# Remove a book's documents from the index and optionally its snippet rows.
# remove_snippets=False is used during reindexing, when old snippets may
# still be referenced by readers of the not-yet-reopened index.
# NOTE(review): sampled excerpt — the guards around the snippet removal and
# the use of `legacy` are not visible here.
247 def remove_book(self, book, remove_snippets=True, legacy=True):
248 """Removes a book from search index.
249 book - Book instance."""
251 self.delete_query(self.index.Q(book_id=book.id))
254 snippets = Snippets(book.id)
256 self.remove_snippets(book)
# Index one book: remove its old documents, add the metadata document, then
# delegate to index_content() for per-section/per-fragment documents.
# book_info may be passed to avoid re-parsing Dublin Core metadata.
258 def index_book(self, book, book_info=None, overwrite=True, legacy=True):
261 Creates a lucene document for extracted metadata
262 and calls self.index_content() to index the contents of the book.
# Nothing to index without the source XML.
264 if not book.xml_file: return
267 # we don't remove snippets, since they might be still needed by
268 # threads using not reopened index
269 self.remove_book(book, remove_snippets=False, legacy=legacy)
271 book_doc = self.create_book_doc(book)
272 meta_fields = self.extract_metadata(book, book_info, dc_only=[
273 'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
274 # let's not index it - it's only used for extracting publish date
275 if 'source_name' in meta_fields:
276 del meta_fields['source_name']
278 for n, f in meta_fields.items():
281 book_doc['uid'] = "book%s" % book_doc['book_id']
283 self.index.add(book_doc)
# Fields copied onto every content document of this book.
286 'title': meta_fields['title'],
287 'authors': meta_fields['authors'],
288 'published_date': meta_fields['published_date']
289 
291 for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
292 if tag_name in meta_fields:
293 book_fields[tag_name] = meta_fields[tag_name]
295 self.index_content(book, book_fields=book_fields, legacy=legacy)
# WL-document structural tag names (Polish markup vocabulary).
# NOTE(review): sampled excerpt — the `master_tags = [` opening and the end
# of these lists are not visible here.
300 'dramat_wierszowany_l',
301 'dramat_wierszowany_lp',
302 'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
# Tags whose subtree content is excluded from full-text indexing
# (editorial notes, separators, placeholders...).
306 ignore_content_tags = [
307 'uwaga', 'extra', 'nota_red', 'abstrakt',
308 'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
310 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
# Footnote tags: pa/pt/pr/pe (different footnote origins in WL markup).
313 footnote_tags = ['pa', 'pt', 'pr', 'pe']
# Header tags skipped entirely when walking sections (title metadata and
# the embedded RDF block).
315 skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
316 '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
# Extracts a trailing year from `source_name`, e.g. "… Warszawa 1984]." -> "1984".
# Raw string fixes the invalid "\]" escape sequence (DeprecationWarning,
# SyntaxWarning on Python 3.12+); the compiled pattern is byte-identical.
published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
# Extract Dublin Core metadata into a flat {field_name: value} dict.
# dc_only restricts which DC fields are taken; book_info may be passed to
# skip re-parsing the XML.  NOTE(review): sampled excerpt — several lines
# (continue statements, date handling, return) are not visible here.
320 def extract_metadata(self, book, book_info=None, dc_only=None):
322 Extract metadata from book and returns a map of fields keyed by fieldname
326 if book_info is None:
327 book_info = dcparser.parse(open(book.xml_file.path, 'rb'))
329 fields['slug'] = book.slug
330 fields['is_book'] = True
# Walk declared DC fields, converting values by their declared type.
333 for field in dcparser.BookInfo.FIELDS:
334 if dc_only and field.name not in dc_only:
336 if hasattr(book_info, field.name):
337 if not getattr(book_info, field.name):
339 type_indicator = field.value_type
# Text values are stored as-is.
340 if issubclass(type_indicator, librarian.meta.types.text.TextValue):
341 s = getattr(book_info, field.name)
344 fields[field.name] = s
# Person values: a single Person or a list of them, joined with ", ".
345 elif issubclass(type_indicator, librarian.meta.types.person.Person):
346 p = getattr(book_info, field.name)
347 if isinstance(p, librarian.meta.types.person.Person):
350 persons = ', '.join(map(str, p))
351 fields[field.name] = persons
# Publication year is scraped from the tail of source_name via
# published_date_re (see class attribute above).
355 if hasattr(book_info, 'source_name') and book_info.source_name:
356 match = self.published_date_re.search(book_info.source_name)
357 if match is not None:
358 pd = str(match.groups()[0])
361 fields["published_date"] = pd
365 # def add_gaps(self, fields, fieldname):
367 # Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
368 # This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
372 # yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
373 # return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
# Find the first element in the tree whose tag is one of master_tags —
# the root content element of a WL document.
# NOTE(review): sampled excerpt — the `return master` line is not visible.
375 def get_master(self, root):
377 Returns the first master tag from an etree.
379 for master in root.iter():
380 if master.tag in self.master_tags:
# Walk the book XML and emit one Solr document per section header and one
# per thematic fragment, writing snippet texts alongside.
# NOTE(review): sampled excerpt — many lines of this long method are not
# visible; comments below only describe what the visible code shows.
383 def index_content(self, book, book_fields, legacy=True):
385 Walks the book XML and extract content from it.
386 Adds parts for each header tag and for each fragment.
388 wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
389 root = wld.edoc.getroot()
391 master = self.get_master(root)
# walker(): generator yielding (start_node, text, end_node) triples —
# exactly one of the three is non-None per yield — skipping subtrees whose
# tag is in ignore_content_tags.
396 if node.tag not in self.ignore_content_tags:
397 yield node, None, None
398 if node.text is not None:
399 yield None, node.text, None
400 for child in list(node):
401 for b, t, e in walker(child):
403 yield None, None, node
405 if node.tail is not None:
406 yield None, node.tail, None
# fix_format(): join list input and strip trailing verse '/' markers.
409 def fix_format(text):
410 # separator = [" ", "\t", ".", ";", ","]
411 if isinstance(text, list):
412 # need to join it first
# NOTE(review): BUG — this filters the closure variable `content`, not the
# `text` parameter; when called as fix_format(frag['text']) it silently
# joins the wrong list.  Likely should read `text` here — confirm upstream.
413 text = filter(lambda s: s is not None, content)
414 text = ' '.join(text)
415 # for i in range(len(text)):
417 # if text[i][0] not in separator\
418 # and text[i - 1][-1] not in separator:
419 # text.insert(i, " ")
# Drop WL verse-end '/' markers at end of lines.
421 return re.sub("(?m)/$", "", text)
# add_part(): build a part document from book fields plus the given
# header/fragment fields, store its text as a snippet, and give it a
# unique uid of the form "part<book>-<index>-<span>-<anchor>".
423 def add_part(snippets, **fields):
424 doc = self.create_book_doc(book)
425 for n, v in book_fields.items():
428 doc['header_index'] = fields["header_index"]
429 doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
430 doc['header_type'] = fields['header_type']
432 doc['text'] = fields['text']
435 snip_pos = snippets.add(fields["text"])
437 doc['snippets_position'] = snip_pos[0]
438 doc['snippets_length'] = snip_pos[1]
439 if snippets.revision:
440 doc["snippets_revision"] = snippets.revision
442 if 'fragment_anchor' in fields:
443 doc["fragment_anchor"] = fields['fragment_anchor']
445 if 'themes' in fields:
446 doc['themes'] = fields['themes']
447 doc['uid'] = "part%s-%s-%s-%s" % (
448 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
# Open the book's snippet file for writing; one per indexing run.
452 snippets = Snippets(book.id).open('w')
# Iterate master's children with their positional index.
454 for header, position in zip(list(master), range(len(master))):
456 if header.tag in self.skip_header_tags:
458 if header.tag is etree.Comment:
# Default text handler: text flows into every currently-open fragment.
465 def all_content(text):
466 for frag in fragments.values():
467 frag['text'].append(text)
469 handle_text = [all_content]
471 for start, text, end in walker(header):
# Footnotes are collected separately and indexed as their own parts.
473 if start is not None and start.tag in self.footnote_tags:
476 def collect_footnote(t):
479 handle_text.append(collect_footnote)
# NOTE(review): BUG — `footnote is not []` compares identity against a
# fresh list literal and is therefore ALWAYS True; the intended check was
# presumably truthiness (`footnote`) or `footnote != []`.
480 elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
482 doc = add_part(snippets, header_index=position, header_type=header.tag,
483 text=''.join(footnote))
484 self.add_snippet(book, doc)
489 # handle fragments and themes.
# <begin id="#fid"> opens a fragment; the leading '#' is stripped.
490 if start is not None and start.tag == 'begin':
491 fid = start.attrib['id'][1:]
493 'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
495 # themes for this fragment
# <motyw> carries a comma-separated theme list; its own text is muted.
496 elif start is not None and start.tag == 'motyw':
497 fid = start.attrib['id'][1:]
498 handle_text.append(lambda text: None)
499 if start.text is not None:
500 fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
501 elif end is not None and end.tag == 'motyw':
# <end id="#fid"> closes a fragment and emits its part document.
504 elif start is not None and start.tag == 'end':
505 fid = start.attrib['id'][1:]
506 if fid not in fragments:
507 continue # a broken <end> node, skip it
508 frag = fragments[fid]
509 if not frag['themes']:
510 continue # empty themes list.
# header_span covers all sections the fragment stretched over.
513 doc = add_part(snippets,
514 header_type=frag['start_header'],
515 header_index=frag['start_section'],
516 header_span=position - frag['start_section'] + 1,
518 text=fix_format(frag['text']),
519 themes=frag['themes'])
520 # Add searchable fragment
521 self.add_snippet(book, doc)
# NOTE(review): same always-True `is not []` identity bug as above.
527 if text is not None and handle_text is not []:
528 hdl = handle_text[-1]
531 # in the end, add a section text.
532 doc = add_part(snippets, header_index=position,
533 header_type=header.tag, text=fix_format(content))
535 self.add_snippet(book, doc)
# Remove a picture's documents from the index; accepts either a Picture
# instance or a raw id.  NOTE(review): sampled excerpt — the `else:` line
# between the two assignments is not visible here.
542 def remove_picture(self, picture_or_id):
543 """Removes a picture from search index."""
544 if isinstance(picture_or_id, picture.models.Picture):
545 picture_id = picture_or_id.id
547 picture_id = picture_or_id
548 self.delete_query(self.index.Q(picture_id=picture_id))
# Index one picture: remove old documents, add its metadata document, then
# index each annotated area via index_area().
550 def index_picture(self, picture, picture_info=None, overwrite=True):
553 Creates a lucene document for extracted metadata
554 and calls self.index_area() to index the contents of the picture.
557 # we don't remove snippets, since they might be still needed by
558 # threads using not reopened index
559 self.remove_picture(picture)
561 picture_doc = {'picture_id': int(picture.id)}
562 meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
563 'authors', 'title', 'epochs', 'kinds', 'genres'])
565 picture_doc.update(meta_fields)
567 picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
568 self.index.add(picture_doc)
# 'is_book' came from extract_metadata; drop it before reusing these
# fields for the area documents, which are not books.
569 del picture_doc['is_book']
570 for area in picture.areas.all():
571 self.index_area(area, picture_fields=picture_doc)
# Index one picture area: copy the picture-level fields and attach the
# area's theme/thing tag names.  NOTE(review): sampled excerpt — the final
# self.index.add(doc) call is not visible here.
573 def index_area(self, area, picture_fields):
575 Indexes themes and objects on the area.
577 doc = dict(picture_fields)
578 doc['area_id'] = area.id
579 doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
580 doc['uid'] = 'area%s' % area.id
# One search result = one book, aggregating all matching "hits" (sections
# and fragments) found for it.  NOTE(review): sampled excerpt — many lines
# are missing (e.g. a probable @total_ordering decorator, given the import
# at the top of the file — confirm upstream).
585 class SearchResult(object):
# Build a result from a raw Solr document; content hits (documents with a
# header_type) are stored as tuples in self._hits.
586 def __init__(self, doc, how_found=None, query_terms=None):
589 self._processed_hits = None # processed hits
591 self.query_terms = query_terms
595 self._score = doc['score']
599 self.book_id = int(doc["book_id"])
602 self.published_date = int(doc.get("published_date"))
# Missing/invalid published_date falls back to 0.
604 self.published_date = 0
607 header_type = doc.get("header_type", None)
608 # we have a content hit in some header of fragment
609 if header_type is not None:
610 sec = (header_type, int(doc["header_index"]))
611 header_span = doc['header_span']
612 header_span = header_span is not None and int(header_span) or 1
613 fragment = doc.get("fragment_anchor", None)
614 snippets_pos = (doc['snippets_position'], doc['snippets_length'])
615 snippets_rev = doc.get('snippets_revision', None)
# Hit tuple layout: (position-triple, fragment anchor, score, extras-dict)
# — indexed elsewhere via the POSITION/FRAGMENT/SCORE/OTHER constants.
617 hit = (sec + (header_span,), fragment, self._score, {
618 'how_found': how_found,
619 'snippets_pos': snippets_pos,
620 'snippets_revision': snippets_rev,
621 'themes': doc.get('themes', []),
622 'themes_pl': doc.get('themes_pl', [])
625 self._hits.append(hit)
# Alternate constructor: synthesize a result for a whole book, scored by
# its popularity count.
628 def from_book(cls, book, how_found=None, query_terms=None):
630 'score': book.popularity.count,
634 result = cls(doc, how_found=how_found, query_terms=query_terms)
639 return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
640 (self.book_id, len(self._hits),
641 len(self._processed_hits) if self._processed_hits else -1,
642 self._score, len(self.snippets))
645 return str(self).encode('utf-8')
# Effective score = raw Solr score times the boost factor.
649 return self._score * self.boost
# Fold another result for the SAME book into this one.
651 def merge(self, other):
652 if self.book_id != other.book_id:
653 raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
654 self._hits += other._hits
# Negative scores are ignored when accumulating.
655 self._score += max(other._score, 0)
# Lazily fetch and cache the Book; returns None for missing/unfindable
# books (excerpt: the caching return lines are not visible).
659 if self._book is not None:
662 self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
663 except catalogue.models.Book.DoesNotExist:
667 book = property(get_book)
# Turn raw hit tuples into display-ready dicts (cached).
678 if self._processed_hits is not None:
679 return self._processed_hits
681 # to sections and fragments
682 frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
684 sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]
686 # sections not covered by fragments
# Keep only section hits whose index falls outside every fragment's
# [start, start+span) range.
687 sect = filter(lambda s: 0 == len(list(filter(
688 lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
689 f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)
# Deduplicate by keyfn, keeping the element `larger` prefers.
691 def remove_duplicates(lst, keyfn, larger):
696 if larger(els[eif], e):
701 # remove fragments with duplicated fid's and duplicated snippets
702 frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])
704 # remove duplicate sections
708 si = s[self.POSITION][self.POSITION_INDEX]
# Keep the higher-scoring hit per section index.
711 if sections[si]['score'] >= s[self.SCORE]:
714 m = {'score': s[self.SCORE],
715 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
717 m.update(s[self.OTHER])
720 hits = list(sections.values())
# Resolve each fragment hit to its Fragment model object.
724 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
725 except catalogue.models.Fragment.DoesNotExist:
728 # Figure out if we were searching for a token matching some word in theme name.
729 themes = frag.tags.filter(category='theme')
731 if self.query_terms is not None:
732 for i in range(0, len(f[self.OTHER]['themes'])):
# NOTE(review): str.split(r' +') splits on the LITERAL two-character
# string " +" (the r-prefix is inert for plain strings) — this was very
# likely meant to be re.split(r' +', ...); confirm upstream.
733 tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
734 tms = map(str.lower, tms)
735 for qt in self.query_terms:
737 themes_hit.add(f[self.OTHER]['themes'][i])
# Map matched theme names back to Tag objects.
740 def theme_by_name(n):
741 th = list(filter(lambda t: t.name == n, themes))
746 themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))
748 m = {'score': f[self.SCORE],
750 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
752 'themes_hit': themes_hit
754 m.update(f[self.OTHER])
# Best hits first.
757 hits.sort(key=lambda h: h['score'], reverse=True)
759 self._processed_hits = hits
# Merge per-book results across several result lists.
764 def aggregate(*result_lists):
766 for rl in result_lists:
768 if r.book_id in books:
769 books[r.book_id].merge(r)
772 return books.values()
# Sort key; excerpt: the leading tuple element(s) are not visible here.
774 def get_sort_key(self):
777 self.book.sort_key_author if self.book else '',
778 self.book.sort_key if self.book else '')
# NOTE(review): deliberately inverted comparison (`>`), which flips the
# natural sort order — presumably so an ascending sort ranks best first;
# confirm against get_sort_key's hidden first component.
780 def __lt__(self, other):
781 return self.get_sort_key() > other.get_sort_key()
# NOTE(review): defining __eq__ without __hash__ makes instances
# unhashable in Python 3 — fine unless results are put in sets/dict keys.
783 def __eq__(self, other):
784 return self.get_sort_key() == other.get_sort_key()
787 return len(self.hits)
789 def snippet_pos(self, idx=0):
790 return self.hits[idx]['snippets_pos']
# Returns the snippet revision of a hit, tolerating missing data.
792 def snippet_revision(self, idx=0):
794 return self.hits[idx]['snippets_revision']
795 except (IndexError, KeyError):
# One search result = one picture, aggregating matching area hits.
# NOTE(review): sampled excerpt — several lines of this class are missing.
800 class PictureResult(object):
801 def __init__(self, doc, how_found=None, query_terms=None):
803 self.query_terms = query_terms
806 self._processed_hits = None
809 self._score = doc['score']
813 self.picture_id = int(doc["picture_id"])
# Area documents become hits: (score, extras-dict) tuples.
815 if doc.get('area_id'):
816 hit = (self._score, {
817 'how_found': how_found,
818 'area_id': doc['area_id'],
819 'themes': doc.get('themes', []),
820 'themes_pl': doc.get('themes_pl', []),
823 self._hits.append(hit)
826 return "<PR id=%d score=%f >" % (self.picture_id, self._score)
# Effective score = raw Solr score times the boost factor.
833 return self._score * self.boost
# Fold another result for the SAME picture into this one.
835 def merge(self, other):
836 if self.picture_id != other.picture_id:
838 "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
839 self._hits += other._hits
840 self._score += max(other._score, 0)
# Turn raw hits into display-ready dicts (cached).
848 if self._processed_hits is not None:
849 return self._processed_hits
852 for hit in self._hits:
854 area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
855 except picture.models.PictureArea.DoesNotExist:
858 # Figure out if we were searching for a token matching some word in theme name.
860 if self.query_terms is not None:
861 for i in range(0, len(hit[self.OTHER]['themes'])):
# NOTE(review): same issue as in SearchResult — str.split(r' +') splits on
# the literal " +" string, not a regex; likely meant re.split.
862 tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
863 tms = map(str.lower, tms)
864 for qt in self.query_terms:
866 themes_hit.add(hit[self.OTHER]['themes'][i])
870 'score': hit[self.SCORE],
872 'themes_hit': themes_hit,
874 m.update(hit[self.OTHER])
877 hits.sort(key=lambda h: h['score'], reverse=True)
879 self._processed_hits = hits
# Lazily fetch and cache the Picture model object.  The class attribute
# `picture` below does not shadow the module inside method bodies, where
# `picture.models...` still resolves to the imported module.
882 def get_picture(self):
883 if self._picture is None:
884 self._picture = picture.models.Picture.objects.get(id=self.picture_id)
887 picture = property(get_picture)
# Merge per-picture results across several result lists.
890 def aggregate(*result_lists):
892 for rl in result_lists:
894 if r.picture_id in books:
895 books[r.picture_id].merge(r)
897 books[r.picture_id] = r
898 return books.values()
900 def __lt__(self, other):
901 return self.score < other.score
903 def __eq__(self, other):
904 return self.score == other.score
# Read-only search front-end over the Solr index.
# NOTE(review): sampled excerpt — several lines of this class are missing.
907 class Search(SolrIndex):
911 def __init__(self, default_field="text"):
912 super(Search, self).__init__(mode='r')
# Join per-word term queries with the boolean operator `modal`.
914 def make_term_query(self, query, field='text', modal=operator.or_):
916 Returns term queries joined by boolean query.
917 modal - applies to boolean query
918 fuzzy - should the query by fuzzy.
# NOTE(review): split(r" ") splits on a single literal space — the r-prefix
# is meaningless here; consecutive spaces yield empty terms.
923 q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
# Author search bypasses Solr: filter top-level findable books by author
# name in the database, most popular first, capped at 30.
927 def search_by_author(self, words):
928 from catalogue.models import Book
929 books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
# NOTE(review): '\m%s\M' uses PostgreSQL word-boundary escapes but is not a
# raw string, so Python sees invalid escapes (\m, \M) — works today with a
# DeprecationWarning; should be r'\m%s\M'.  Also `select_related` with a
# '__count' lookup looks suspect — confirm it does what is intended.
931 books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
932 return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
# General word search over the given fields; stopwords are skipped except
# in the restricted book/picture modes shown below.
934 def search_words(self, words, fields, required=None, book=True, picture=False):
935 if book and not picture and fields == ['authors']:
936 return self.search_by_author(words)
939 if book or picture or (word not in stopwords):
942 q = self.index.Q(**{field: word})
943 if word_filter is None:
947 filters.append(word_filter)
# Required fields build a second, mandatory filter.
949 required_filter = None
950 for field in required:
952 if book or picture or (word not in stopwords):
953 q = self.index.Q(**{field: word})
954 if required_filter is None:
958 filters.append(required_filter)
# Restrict result type: books, pictures, or any content document.
963 params['is_book'] = True
965 params['picture_id__gt'] = 0
967 params['book_id__gt'] = 0
968 query = self.index.query(**params)
969 query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
970 result_class = PictureResult if picture else SearchResult
971 return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
# Fetch up to `num` highlighted snippets for a search result, reading the
# stored snippet bytes back from the Snippets file.
973 def get_snippets(self, searchresult, query, field='text', num=1):
975 Returns a snippet for found scoreDoc.
977 maxnum = len(searchresult)
978 if num is None or num < 0 or num > maxnum:
980 book_id = searchresult.book_id
981 revision = searchresult.snippet_revision()
982 snippets = Snippets(book_id, revision=revision)
983 snips = [None] * maxnum
987 while idx < maxnum and num > 0:
988 position, length = searchresult.snippet_pos(idx)
989 if position is None or length is None:
991 text = snippets.get((int(position),
# Try highlighting on the stemmed field first, then fall back to the
# non-stemmed variant.
993 snip = self.index.highlight(text=text, field=field, q=query)
994 if not snip and field == 'text':
995 snip = self.index.highlight(text=text, field='text_nonstem', q=query)
996 if snip not in snips:
# Missing snippet files are logged, not raised: the book may be gone or a
# parent-only book may legitimately have no snippets.
1002 except IOError as e:
1003 book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
1005 log.error("Book does not exist for book id = %d" % book_id)
1006 elif not book.get().children.exists():
1007 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
1012 # remove verse end markers..
1013 snips = [s.replace("/\n", "\n") if s else s for s in snips]
1015 searchresult.snippets = snips
# Chain non-None filters onto the query (presumably a @staticmethod — the
# decorator line is not visible in this excerpt).
1020 def apply_filters(query, filters):
1022 Apply filters to a query
1026 filters = filter(lambda x: x is not None, filters)
1028 query = query.query(f)
# Test hook: when settings.SEARCH_MOCK is truthy, replace the Search class
# defined above with the mock implementation.
1032 if getattr(settings, 'SEARCH_MOCK', False):
1033 from .mock_search import Search