1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 from functools import reduce, total_ordering
5 from itertools import chain
10 from django.conf import settings
11 from librarian import dcparser
12 from librarian.parser import WLDocument
13 from lxml import etree
15 import catalogue.models
17 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
18 from wolnelektury.utils import makedirs
# Module-level logger for the search application.
log = logging.getLogger('search')

# Build the stopword set from the configured Solr stopwords file (one word
# per line, '#'-prefixed lines are comments).
# NOTE(review): the assignment head (presumably `stopwords = set(...)`) and
# the else branch are elided in this listing; only the generator tail shows.
if os.path.isfile(settings.SOLR_STOPWORDS):
        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
class SolrIndex(object):
    """Base class for components that talk to the configured Solr instance."""

    def __init__(self, mode=None):
        # mode is forwarded verbatim to the Solr interface; in this file
        # Index uses 'rw' and Search uses 'r'.
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
class Snippets(object):
    """
    Manages the snippet file for an indexed object (book).
    Snippets are concatenated into one file; their positions and
    lengths are kept in index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        # Ensure the snippet directory exists before any file access.
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision

    # NOTE(review): the `path` property header and its revision branch are
    # elided in this listing: with a revision the file name is
    # "<book_id>.<revision>", otherwise "<book_id>".
        fn = "%d.%d" % (self.book_id, self.revision)
        fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        # NOTE(review): both branch bodies are elided in this listing;
        # presumably they adjust mode/revision before opening — confirm.
        if os.path.exists(self.path):
        if not os.path.exists(self.path):
        self.file = open(self.path, mode)

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        # NOTE(review): the write and position bookkeeping are elided;
        # `l` is presumably len(txt) — confirm against the full source.
        pos = (self.position, l)

    # NOTE(review): the `get(pos)` method header is elided below.
        """
        Given a tuple of (position, length) return a unicode string
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')

        """Close snippet file"""
class Index(SolrIndex):
    """
    Class indexing books.
    """
    # NOTE(review): the __init__ header is elided in this listing; the body
    # opens the Solr interface in read-write mode for indexing.
        super(Index, self).__init__(mode='rw')
    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        # NOTE(review): the uid-accumulator and per-query loop headers are
        # elided in this listing; each query is narrowed to the 'uid' field
        # and paginated through.
            if isinstance(q, scorched.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            # st/rows pagination variables are defined in elided lines.
            ids = q.paginate(start=st, rows=rows).execute()
        # FIXME: With Solr API change, this doesn't work.
        #self.index.delete(uids)
    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from index, then indexes them again.
        Indexed fields include: id, name (with and without polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from index.
        # NOTE(review): the branch/loop headers building `tag_qs` from the
        # explicitly passed tags are elided in this listing.
            q_id = self.index.Q(tag_id=tag.id)
            # PDCounter pseudo-tags get dedicated categories in the index so
            # they can be distinguished from catalogue tags with the same id.
            if isinstance(tag, PDCounterAuthor):
                q_cat = self.index.Q(tag_category='pd_author')
            elif isinstance(tag, PDCounterBook):
                q_cat = self.index.Q(tag_category='pd_book')
            # NOTE(review): else-branch header elided here.
                q_cat = self.index.Q(tag_category=tag.category)
            q_id_cat = self.index.Q(q_id & q_cat)
            tag_qs.append(q_id_cat)
        self.delete_query(*tag_qs)
        # NOTE(review): the no-tags branch (remove every tag document) is
        # partly elided; this query matches any document with a tag_id.
            q = self.index.Q(tag_id__any=True)
        # then add them [all or just one passed]
        # NOTE(review): the `if not remove_only:` guard and the
        # `tags = chain(...)` call head are elided; these three querysets
        # are chained as the default tag source.
                catalogue.models.Tag.objects.exclude(category='set'),
                PDCounterAuthor.objects.all(),
                PDCounterBook.objects.all())
        # NOTE(review): per-tag loop header and `doc = {` heads are elided.
            if isinstance(tag, PDCounterAuthor):
                    "tag_id": int(tag.id),
                    "tag_name": tag.name,
                    "tag_name_pl": tag.name,
                    "tag_category": 'pd_author',
                    "is_pdcounter": True,
                    "uid": "tag%d_pd_a" % tag.id
            elif isinstance(tag, PDCounterBook):
                    "tag_id": int(tag.id),
                    # PDCounter books have a title, not a name.
                    "tag_name": tag.title,
                    "tag_name_pl": tag.title,
                    "tag_category": 'pd_book',
                    "is_pdcounter": True,
                    "uid": "tag%d_pd_b" % tag.id
            # NOTE(review): else branch (regular catalogue tag) header elided.
                    "tag_id": int(tag.id),
                    "tag_name": tag.name,
                    "tag_name_pl": tag.name,
                    "tag_category": tag.category,
                    "is_pdcounter": False,
                    "uid": "tag%d" % tag.id
def create_book_doc(self, book):
    """
    Create a base search document for a book: its id and, if it has
    a parent, the parent's id.

    Returns the dict; callers (index_book, add_part) extend it further.
    """
    doc = {'book_id': int(book.id)}
    if book.parent is not None:
        doc['parent_id'] = int(book.parent.id)
    # The `return` was lost in this listing; callers assign the result
    # (`book_doc = self.create_book_doc(book)`), so it must be returned.
    return doc
    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from search index.
        book - Book instance."""
        # Accept either a Book instance or a raw id.
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        # NOTE(review): the else branch (book_id = book_or_id) is elided
        # in this listing.
        self.delete_query(self.index.Q(book_id=book_id))
        # NOTE(review): the `if remove_snippets:` guard and the call that
        # removes the snippet file are elided here.
            snippets = Snippets(book_id)
    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes a book.
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        # NOTE(review): the `if overwrite:` guard around the removal is
        # elided in this listing.
        # we don't remove snippets, since they might be still needed by
        # threads using not reopened index
        self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # let's not index it - it's only used for extracting publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        # Copy the extracted metadata onto the book document.
        # NOTE(review): the loop body (book_doc[n] = f) is elided here.
        for n, f in meta_fields.items():

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        # Fields propagated to every content (per-section/fragment) document.
        # NOTE(review): the `book_fields = {` head is elided.
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]

        self.index_content(book, book_fields=book_fields)
    # Master tags mark the root element of the book's actual text.
    # NOTE(review): the `master_tags = [` head and its first entries are
    # elided in this listing.
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',

    # Tags whose text content should not be indexed (editorial notes,
    # placeholders, separators, section headers handled elsewhere).
    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
# Footnote elements: their text is indexed separately, not as body text.
footnote_tags = ['pa', 'pt', 'pr', 'pe']

# Header-level elements carrying metadata, not searchable content.
skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                    '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

# Extract a trailing year from source_name, e.g. "… Warszawa 1884]." -> "1884".
# Raw string: the previous non-raw "[\]. ]" relied on the invalid escape
# "\]" being passed through (a DeprecationWarning in modern Python).
published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from book and return a map of fields keyed by
        fieldname. If dc_only is given, only those Dublin Core fields are
        extracted.
        """
        # NOTE(review): the `fields = {}` initialisation is elided here.
        if book_info is None:
            # NOTE(review): file handle is never closed — consider `with`.
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                # NOTE(review): `continue` elided in this listing.
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    # NOTE(review): `continue` elided in this listing.
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    # NOTE(review): multi-value join elided here.
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        # NOTE(review): single-person branch body elided.
                    # List of persons: join their string forms.
                        persons = ', '.join(map(str, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get published date
        # NOTE(review): the default `pd` value is elided here.
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        fields["published_date"] = pd

        # NOTE(review): `return fields` is elided in this listing.
348 # def add_gaps(self, fields, fieldname):
350 # Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
351 # This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
355 # yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
356 # return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
def get_master(self, root):
    """
    Return the first master tag (the element holding the book's actual
    text) found in the etree, or None when the document has none.
    """
    for master in root.iter():
        if master.tag in self.master_tags:
            # The `return` was lost in this listing; the caller
            # (`master = self.get_master(root)`) consumes the value.
            return master
    def index_content(self, book, book_fields):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        # NOTE(review): the master-is-None early return and the `walker`
        # generator header are elided in this listing.

            # Depth-first walk yielding (start_element, text, end_element)
            # triples; exactly one slot of each triple is non-None.
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        # NOTE(review): `yield b, t, e` elided here.
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None

        def fix_format(text):
            # separator = [" ", "\t", ".", ";", ","]
            if isinstance(text, list):
                # need to join it first
                # NOTE(review): filters the closed-over `content` list, not
                # the `text` parameter — looks like a latent bug carried from
                # the enclosing scope; confirm before changing.
                text = filter(lambda s: s is not None, content)
                text = ' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator\
                #         and text[i - 1][-1] not in separator:
                #         text.insert(i, " ")

            # Strip verse-end '/' markers at line ends.
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            # Build and submit one content document (a header or fragment).
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                # NOTE(review): `doc[n] = v` elided in this listing.

            doc['header_index'] = fields["header_index"]
            # and/or idiom: falls back to 1 when header_span is absent.
            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets: store the raw text and remember where it lives.
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            # NOTE(review): `return doc` elided in this listing.

        # NOTE(review): fragments/footnote initialisation elided here.
        snippets = Snippets(book.id).open('w')
        # NOTE(review): a try/finally around the loop is likely elided.
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    # NOTE(review): `continue` elided.
                if header.tag is etree.Comment:
                    # NOTE(review): `continue` elided.

                # section content accumulator; content/footnote lists elided.

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    # NOTE(review): appending to `content` elided.
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        # NOTE(review): footnote reset elided.
                        def collect_footnote(t):
                            # NOTE(review): body elided (appends to footnote).
                        handle_text.append(collect_footnote)
                    # NOTE(review): `footnote is not []` is always True —
                    # identity compare with a fresh list; likely meant
                    # `footnote != []`; confirm before changing.
                    elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                        # NOTE(review): handler pop elided.
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=''.join(footnote),
                        # NOTE(review): is_footnote flag and index.add elided.

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        # NOTE(review): `fragments[fid] = {` head elided.
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        # Swallow the theme-name text so it is not indexed
                        # as body content.
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        # NOTE(review): handler pop elided.

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        # NOTE(review): `del fragments[fid]` elided.

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       # fragment_anchor kwarg elided here.
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        # NOTE(review): index.add(doc) elided.

                    # NOTE(review): `handle_text is not []` is always True —
                    # same identity-compare smell as above.
                    if text is not None and handle_text is not []:
                        hdl = handle_text[-1]
                        # NOTE(review): hdl(text) call elided.

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
                # NOTE(review): index.add(doc) and snippets cleanup elided.
def remove_picture(self, picture_or_id):
    """Removes a picture from search index.

    Accepts either a Picture instance or a raw picture id.
    """
    if isinstance(picture_or_id, picture.models.Picture):
        picture_id = picture_or_id.id
    else:
        # The `else:` line was lost in this listing; without it the raw-id
        # fallback would clobber the id taken from the instance.
        picture_id = picture_or_id
    self.delete_query(self.index.Q(picture_id=picture_id))
    def index_picture(self, picture, picture_info=None, overwrite=True):
        """
        Indexes a picture.
        Creates a lucene document for extracted metadata
        and calls self.index_area() to index the contents of the picture.
        """
        # NOTE(review): the `if overwrite:` guard is elided in this listing.
        # we don't remove snippets, since they might be still needed by
        # threads using not reopened index
        self.remove_picture(picture)

        picture_doc = {'picture_id': int(picture.id)}
        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
            'authors', 'title', 'epochs', 'kinds', 'genres'])

        picture_doc.update(meta_fields)

        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
        self.index.add(picture_doc)
        # extract_metadata sets is_book=True; drop it before reusing these
        # fields for the per-area documents.
        del picture_doc['is_book']
        for area in picture.areas.all():
            self.index_area(area, picture_fields=picture_doc)
def index_area(self, area, picture_fields):
    """
    Index themes and objects tagged on a picture area.

    picture_fields: metadata dict shared by all areas of the picture.
    """
    doc = dict(picture_fields)
    doc['area_id'] = area.id
    doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
    doc['uid'] = 'area%s' % area.id
    # The submit call was missing from this listing; without it the built
    # document is silently dropped (cf. index_picture/index_book).
    self.index.add(doc)
class SearchResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        # NOTE(review): initialisation of _hits/_book/boost is elided in
        # this listing.
        self._processed_hits = None  # processed hits
        self.query_terms = query_terms

        # NOTE(review): a try/except around score extraction is elided.
            self._score = doc['score']

        self.book_id = int(doc["book_id"])

        # NOTE(review): try/except around the int() conversion is elided;
        # missing/invalid published_date falls back to 0.
            self.published_date = int(doc.get("published_date"))
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header of fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            # and/or idiom: default span of 1 when None.
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            # Hit tuple: (position-triple, fragment anchor, score, extras).
            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)
    # NOTE(review): the @classmethod decorator line is elided in this listing.
    def from_book(cls, book, how_found=None, query_terms=None):
        """Build a SearchResult directly from a Book (author-search path)."""
        # NOTE(review): the `doc = {` head with book_id/published_date is
        # elided; popularity count stands in for a relevance score.
            'score': book.popularity.count,
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        # NOTE(review): `result._book = book; return result` elided.
    # NOTE(review): the `def __str__(self):` header is elided in this listing.
        return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    # NOTE(review): presumably inside `def __bytes__(self):` — header elided.
        return str(self).encode('utf-8')

    # NOTE(review): presumably the `score` property — decorator and header
    # elided. Combines raw Solr score with a boost factor.
        return self._score * self.boost
    def merge(self, other):
        """Fold another result for the same book into this one."""
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        # Negative scores are ignored so a bad partial hit cannot lower
        # the combined score.
        self._score += max(other._score, 0)
        # NOTE(review): `return self` appears to be elided in this listing.
    # NOTE(review): the `def get_book(self):` header is elided in this listing.
        # Lazily fetch and cache the Book object for this result.
        if self._book is not None:
            # NOTE(review): `return self._book` and the `try:` are elided.
            self._book = catalogue.models.Book.objects.get(id=self.book_id)
        except catalogue.models.Book.DoesNotExist:
            # NOTE(review): fallback (self._book = None) elided.

    book = property(get_book)
    # NOTE(review): the method header (hit post-processing, presumably the
    # `hits` property) and the POSITION/FRAGMENT/SCORE/OTHER tuple-index
    # constants are elided in this listing.
        # Cached after the first computation.
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(list(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)

        def remove_duplicates(lst, keyfn, larger):
            # Keep, per key, the element preferred by `larger`.
            # NOTE(review): accumulator setup and loop are elided here.
                if larger(els[eif], e):
                    # NOTE(review): replacement logic elided.

        # remove fragments with duplicated fid's and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])

        # remove duplicate sections
        # NOTE(review): `sections = {}` and the per-section loop are elided.
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing if this one has a lower score
            if sections[si]['score'] >= s[self.SCORE]:
                # NOTE(review): `continue` elided.
            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            # NOTE(review): sections[si] = m elided.

        hits = list(sections.values())

        # NOTE(review): the fragment loop header and try are elided.
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # NOTE(review): `continue` elided.

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            # NOTE(review): `themes_hit = set()` elided.
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    # NOTE(review): str.split(r' +') splits on the literal
                    # two-character string " +", not a regex — probable bug,
                    # re.split was likely intended; confirm before changing.
                    tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(str.lower, tms)
                    for qt in self.query_terms:
                        # NOTE(review): `if qt in tms:` elided.
                        themes_hit.add(f[self.OTHER]['themes'][i])

            def theme_by_name(n):
                # Map a theme name back to its Tag object (or None).
                th = list(filter(lambda t: t.name == n, themes))
                # NOTE(review): return-th[0]-or-None logic elided.
            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))

            m = {'score': f[self.SCORE],
                 # fragment/themes entries elided in this listing.
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            # NOTE(review): hits.append(m) elided.

        # Best hits first.
        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        # NOTE(review): `return hits` elided.
    # NOTE(review): the @staticmethod decorator line is elided in this listing.
    def aggregate(*result_lists):
        """Merge result lists so each book appears once, hits combined."""
        # NOTE(review): `books = {}` and the inner loop header are elided.
        for rl in result_lists:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                # NOTE(review): else branch (books[r.book_id] = r) elided.
        return books.values()
    def get_sort_key(self):
        # Sort key: (presumably negated score first — head elided), then
        # author and title keys for stable tie-breaking.
        # NOTE(review): the `return (` head is elided in this listing.
            self.book.sort_key_author if self.book else '',
            self.book.sort_key if self.book else '')
    def __lt__(self, other):
        # Deliberately inverted (>) so that "better" results (the sort key
        # presumably starts with a negated score) order first when sorted
        # ascending. NOTE(review): confirm against the elided get_sort_key
        # head; total_ordering (imported above) likely fills the rest.
        return self.get_sort_key() > other.get_sort_key()

    def __eq__(self, other):
        # Equality mirrors the ordering key, as total_ordering requires.
        return self.get_sort_key() == other.get_sort_key()
    # NOTE(review): the `def __len__(self):` header is elided in this listing.
        return len(self.hits)

    def snippet_pos(self, idx=0):
        """Return the (position, length) pair of the idx-th hit's snippet."""
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        """Return the snippet-file revision for the idx-th hit, or a fallback."""
        # NOTE(review): the `try:` line and the fallback return are elided.
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
class PictureResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        # NOTE(review): initialisation of _picture/_hits/boost is elided.
        self.query_terms = query_terms
        self._processed_hits = None

        # NOTE(review): a try/except around score extraction is elided.
            self._score = doc['score']

        self.picture_id = int(doc["picture_id"])

        # Only area documents (not the picture document itself) carry hits.
        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
                })

            self._hits.append(hit)
    # NOTE(review): the `def __str__(self):` header is elided in this listing.
        return "<PR id=%d score=%f >" % (self.picture_id, self._score)

    # NOTE(review): presumably the `score` property — decorator and header
    # elided; mirrors SearchResult.score.
        return self._score * self.boost
    def merge(self, other):
        """Fold another result for the same picture into this one."""
        if self.picture_id != other.picture_id:
            raise ValueError(
                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
        self._hits += other._hits
        # Negative partial scores are clamped to 0 before accumulating.
        self._score += max(other._score, 0)
        # NOTE(review): `return self` may be elided in this listing.
    # NOTE(review): the method header (presumably the `hits` property) and
    # the SCORE/OTHER tuple-index constants are elided in this listing.
        # Cached after the first computation.
        if self._processed_hits is not None:
            return self._processed_hits

        # NOTE(review): `hits = []` is elided here.
        for hit in self._hits:
            # NOTE(review): the `try:` line is elided.
                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # NOTE(review): `continue` elided.

            # Figure out if we were searching for a token matching some word in theme name.
            # NOTE(review): `themes_hit = set()` elided.
            if self.query_terms is not None:
                for i in range(0, len(hit[self.OTHER]['themes'])):
                    # NOTE(review): str.split(r' +') splits on the literal
                    # string " +", not a regex — same probable bug as in
                    # SearchResult; confirm before changing.
                    tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(str.lower, tms)
                    for qt in self.query_terms:
                        # NOTE(review): `if qt in tms:` elided.
                        themes_hit.add(hit[self.OTHER]['themes'][i])

            # NOTE(review): the `m = {` head (area entry) is elided.
                'score': hit[self.SCORE],
                'themes_hit': themes_hit,
            m.update(hit[self.OTHER])
            # NOTE(review): hits.append(m) elided.

        # Best hits first.
        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        # NOTE(review): `return hits` elided.
    def get_picture(self):
        # Lazily fetch and cache the Picture object for this result.
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        # NOTE(review): `return self._picture` is elided in this listing.

    picture = property(get_picture)
    # NOTE(review): the @staticmethod decorator line is elided in this listing.
    def aggregate(*result_lists):
        """Merge result lists so each picture appears once, hits combined."""
        # NOTE(review): `books = {}` and the inner loop header are elided.
        for rl in result_lists:
                if r.picture_id in books:
                    books[r.picture_id].merge(r)
                # NOTE(review): the `else:` line is elided here.
                    books[r.picture_id] = r
        return books.values()
877 def __lt__(self, other):
878 return self.score < other.score
880 def __eq__(self, other):
881 return self.score == other.score
class Search(SolrIndex):
    # NOTE(review): the class docstring lines are elided in this listing.

    def __init__(self, default_field="text"):
        # NOTE(review): default_field is not used in the visible body —
        # it is presumably stored or used by elided code; confirm.
        super(Search, self).__init__(mode='r')
    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Build a boolean query from the words of `query`.
        Returns term queries joined by boolean query.
        modal - applies to boolean query
        fuzzy - should the query be fuzzy.
        """
        # NOTE(review): query normalisation and the initial `q` are elided.
        # NOTE(review): split(r" ") — the raw-string prefix is pointless
        # here; str.split takes a literal separator, this just splits on
        # a single space.
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
        # NOTE(review): `return q` is elided in this listing.
    def search_by_author(self, words):
        """Match top-level books whose cached author matches every word."""
        from catalogue.models import Book
        books = Book.objects.filter(parent=None).order_by('-popularity__count')
        # NOTE(review): the `for word in words:` header is elided here.
            # \m and \M are PostgreSQL regex word-boundary markers (iregex).
            # NOTE(review): non-raw string relies on '\m'/'\M' not being
            # Python escapes; a raw string would be cleaner.
            books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
    def search_words(self, words, fields, required=None, book=True, picture=False):
        """Search the given words in the given fields; returns result objects."""
        # Author-only book search takes a dedicated, popularity-ordered path.
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)
        # NOTE(review): `filters = []` and the word loop header are elided.
            # Stopwords are only dropped for the pure content search.
            if book or picture or (word not in stopwords):
                # NOTE(review): word_filter init and field loop are elided.
                    q = self.index.Q(**{field: word})
                    if word_filter is None:
                        # NOTE(review): or-accumulation branches elided.
                filters.append(word_filter)
        # NOTE(review): the `if required:` guard and word loop are elided.
            required_filter = None
                    for field in required:
                        if book or picture or (word not in stopwords):
                            q = self.index.Q(**{field: word})
                            if required_filter is None:
                                # NOTE(review): accumulation elided.
            filters.append(required_filter)
        # NOTE(review): empty-filters early return and `params = {}` elided.
        # Restrict to the requested document class.
            params['is_book'] = True
            params['picture_id__gt'] = 0
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns highlighted snippets for a search result.
        num - maximum number of snippets to return (None/out-of-range
        means all hits).
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            # NOTE(review): `num = maxnum` is elided in this listing.
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        # NOTE(review): index initialisation, `try:` and snippets.open()
        # are elided here.
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    # NOTE(review): `continue`/`break` elided.
                text = snippets.get((int(position),
                                     # length argument line elided.
                snip = self.index.highlight(text=text, field=field, q=query)
                # Fall back to the unstemmed field when stemming kills the match.
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                if snip not in snips:
                    # NOTE(review): slot assignment and counters elided.
        # NOTE(review): the except handler head is elided below; a missing
        # snippet file is only an error for leaf books.
            book = catalogue.models.Book.objects.filter(id=book_id)
            # NOTE(review): `if not book:` head elided.
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
        # NOTE(review): `finally: snippets.close()` presumably elided.

        # remove verse end markers..
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips
        # NOTE(review): `return snips` elided in this listing.
    # NOTE(review): the @staticmethod decorator and `def apply_filters(query,
    # filters):` header context are partly elided in this listing.
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        Filters are (presumably) Solr queries anded together.
        """
        # Drop placeholder None entries before applying.
        filters = filter(lambda x: x is not None, filters)
        # NOTE(review): the loop header over filters is elided here.
            query = query.query(f)
        # NOTE(review): `return query` elided in this listing.
# For tests/development: when SEARCH_MOCK is enabled in settings, shadow the
# real Search class defined above with the mock implementation.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search