1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 from functools import reduce, total_ordering
5 from itertools import chain
10 from django.conf import settings
11 from librarian import dcparser
12 import librarian.meta.types.date
13 import librarian.meta.types.person
14 import librarian.meta.types.text
15 from librarian.parser import WLDocument
16 from lxml import etree
18 import catalogue.models
20 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
21 from wolnelektury.utils import makedirs
# Module-level logger for the search subsystem.
log = logging.getLogger('search')

# Build the stopword set from the Solr stopwords file (if present),
# skipping comment lines.  NOTE(review): the opening of this statement
# (presumably `stopwords = set(line.strip()` ...) and the no-file fallback
# are not visible in this excerpt — confirm against the full file.
if os.path.isfile(settings.SOLR_STOPWORDS):
    # ... [intervening lines elided from this excerpt] ...
        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
class SolrIndex(object):
    """Base class holding a configured Solr interface for subclasses."""

    def __init__(self, mode=None):
        # mode is forwarded to the Solr interface; subclasses pass
        # 'rw' for indexing (Index) and 'r' for searching (Search).
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
class Snippets(object):
    """
    Manage the snippet file for an indexed object (book).

    Snippets are concatenated together in one file per book; their
    positions and lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        # Ensure the snippet directory exists before any file access.
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        # ... [remaining initialisation elided from this excerpt] ...

    # NOTE(review): the header of the path accessor (presumably a
    # @property def path(self) with an `if self.revision:` branch) is not
    # visible in this excerpt; the lines below belong to it.  Revisioned
    # snippet files are named "<book_id>.<revision>", plain ones "<book_id>".
        fn = "%d.%d" % (self.book_id, self.revision)
        # ... [else branch header elided] ...
        fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        # ... [mode normalisation elided from this excerpt] ...
        if os.path.exists(self.path):
        # ... [branch body elided] ...
        if not os.path.exists(self.path):
        # ... [branch body elided] ...
        self.file = open(self.path, mode)
        # ... [position bookkeeping / return elided] ...

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        # ... [length computation and file write elided; `l` presumably
        # holds len(txt)] ...
        pos = (self.position, l)
        # ... [position advance / return pos elided] ...

    # NOTE(review): `def get(self, pos)` header not visible in excerpt.
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        # ... [return txt elided] ...

    # NOTE(review): `def close(self)` header not visible in excerpt.
        """Close snippet file"""
class Index(SolrIndex):
    """
    Class indexing books.
    """
    # NOTE(review): the __init__ header is not visible in this excerpt;
    # it opens the Solr interface in read-write mode for indexing.
        super(Index, self).__init__(mode='rw')

    def remove_snippets(self, book):
        # Drop all DB-stored snippets for the book.
        book.snippet_set.all().delete()

    def add_snippet(self, book, doc):
        # Persist the text of an indexed part as a DB snippet row.
        assert book.id == doc.pop('book_id')
        # Fragments already exist and can be indexed where they live.
        if 'fragment_anchor' in doc:
            # ... [early return elided from this excerpt] ...
        text = doc.pop('text')
        header_index = doc.pop('header_index')
        book.snippet_set.create(
            # ... [create() keyword arguments elided from this excerpt] ...

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of list of uids.
        """
        # ... [uid accumulation setup and loop over queries elided] ...
            if isinstance(q, scorched.search.LuceneQuery):
                q = self.index.query(q)
            # Fetch only the 'uid' field; results are paged through below.
            q.field_limiter.update(['uid'])
            # ... [pagination loop header elided] ...
                ids = q.paginate(start=st, rows=rows).execute()
            # ... [uid collection / paging advance elided] ...
        # FIXME: With Solr API change, this doesn't work.
        #self.index.delete(uids)

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from index, then index them again.
        Indexed fields include: id, name (with and without polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from index.
        # ... [branch on specific tags vs. all, and per-tag loop, elided] ...
            q_id = self.index.Q(tag_id=tag.id)

            # PDCounter rows get their own pseudo-categories in the index.
            if isinstance(tag, PDCounterAuthor):
                q_cat = self.index.Q(tag_category='pd_author')
            elif isinstance(tag, PDCounterBook):
                q_cat = self.index.Q(tag_category='pd_book')
            # [else: regular catalogue tag]
                q_cat = self.index.Q(tag_category=tag.category)

            q_id_cat = self.index.Q(q_id & q_cat)
            tag_qs.append(q_id_cat)
        self.delete_query(*tag_qs)
        # [no specific tags passed: remove every tag document]
            q = self.index.Q(tag_id__any=True)
        # ... [delete-all call elided] ...

        # then add them [all or just one passed]
        # ... [chain(...) over these three querysets, elided] ...
                catalogue.models.Tag.objects.exclude(category='set'),
                PDCounterAuthor.objects.all(),
                PDCounterBook.objects.all())
        # ... [per-tag loop header elided] ...
            if isinstance(tag, PDCounterAuthor):
                # ... [doc dict literal opening elided] ...
                    "tag_id": int(tag.id),
                    "tag_name": tag.name,
                    "tag_name_pl": tag.name,
                    "tag_category": 'pd_author',
                    "is_pdcounter": True,
                    "uid": "tag%d_pd_a" % tag.id
            elif isinstance(tag, PDCounterBook):
                # ... [doc dict literal opening elided; note: uses .title] ...
                    "tag_id": int(tag.id),
                    "tag_name": tag.title,
                    "tag_name_pl": tag.title,
                    "tag_category": 'pd_book',
                    "is_pdcounter": True,
                    "uid": "tag%d_pd_b" % tag.id
            # [else: regular catalogue tag]
                    "tag_id": int(tag.id),
                    "tag_name": tag.name,
                    "tag_name_pl": tag.name,
                    "tag_category": tag.category,
                    "is_pdcounter": False,
                    "uid": "tag%d" % tag.id
        # ... [self.index.add(doc) elided] ...

    def create_book_doc(self, book):
        """
        Create a lucene document referring book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        # ... [return doc elided] ...

    def remove_book(self, book, remove_snippets=True):
        """Removes a book from search index.
        book - Book instance."""
        self.delete_query(self.index.Q(book_id=book.id))
        # ... [remove_snippets guard elided] ...
            snippets = Snippets(book.id)
            # ... [snippet file removal elided] ...
            self.remove_snippets(book)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Index a single book.
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if not book.xml_file: return

        # ... [overwrite guard elided] ...
            # we don't remove snippets, since they might be still needed by
            # threads using not reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # let's not index it - it's only used for extracting publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            # ... [copy each field into book_doc, elided] ...

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        # ... [book_fields dict opening elided] ...
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]
        self.index_content(book, book_fields=book_fields)

    # Master (top-level text container) tags; list opening elided in
    # this excerpt.
            'dramat_wierszowany_l',
            'dramat_wierszowany_lp',
            'dramat_wspolczesny', 'liryka_l', 'liryka_lp',

    # Tags whose textual content is not indexed (editorial notes,
    # separators, headers re-indexed elsewhere).  Closing bracket elided.
    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',

    # Footnote container tags.
    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    # Header-level tags that are never indexed as sections.
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # Trailing year-like number in source_name, e.g. "... 1984]." -> "1984".
    published_date_re = re.compile("([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from book and returns a map of fields keyed by fieldname.
        """
        # ... [fields dict initialisation elided] ...
        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path, 'rb'))

        fields['slug'] = book.slug
        fields['is_book'] = True

        # Walk all Dublin Core fields, converting each by its value type.
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                # ... [continue elided] ...
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    # ... [continue elided] ...
                type_indicator = field.value_type
                if issubclass(type_indicator, librarian.meta.types.text.TextValue):
                    s = getattr(book_info, field.name)
                    # ... [multi-value join elided] ...
                    fields[field.name] = s
                elif issubclass(type_indicator, librarian.meta.types.person.Person):
                    p = getattr(book_info, field.name)
                    if isinstance(p, librarian.meta.types.person.Person):
                        # ... [single-person case elided] ...
                    # [else: list of persons]
                        persons = ', '.join(map(str, p))
                    fields[field.name] = persons
                elif issubclass(type_indicator, librarian.meta.types.date.DateValue):
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # Published date: pulled out of the trailing year in source_name.
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        # ... [default for pd elided] ...
        fields["published_date"] = pd
        # ... [return fields elided] ...

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #         yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                # ... [return master elided; presumably returns None when
                # no master tag is found — confirm] ...

    def index_content(self, book, book_fields):
        """
        Walks the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        # ... [no-master early return and walker() generator header elided] ...
            # walker emits (start_node, text, end_node) event triples for
            # the subtree, skipping ignored content tags entirely.
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        # ... [re-yield of (b, t, e) elided] ...
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            # ... [return elided] ...

        def fix_format(text):
            # separator = [" ", "\t", ".", ";", ","]
            if isinstance(text, list):
                # need to join it first
                # NOTE(review): this reads the enclosing scope's `content`
                # variable, not the `text` parameter — it only works
                # because fix_format is called after `content` is built;
                # looks like a latent bug worth confirming/fixing.
                text = filter(lambda s: s is not None, content)
                text = ' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator\
                #         and text[i - 1][-1] not in separator:
                #         text.insert(i, " ")

            # Strip verse-end markers ("/") at end of lines.
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            # Build and add one "part" document (a section or fragment).
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                # ... [copy book-level field into doc, elided] ...

            doc['header_index'] = fields["header_index"]
            # header_span defaults to 1 when absent/falsy.
            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # Persist the raw text in the snippet file and record where.
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            # ... [index add / return doc elided] ...

        # ... [fragments dict setup and try: elided] ...
        snippets = Snippets(book.id).open('w')
        # ... [loop setup elided] ...
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    # ... [continue elided] ...
                if header.tag is etree.Comment:
                    # ... [continue elided] ...

                # Default text handler: feed text to all open fragments
                # (and, presumably, to the section's `content` list).
                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    # ... [append to section content, elided] ...
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        # ... [footnote reset elided] ...
                        def collect_footnote(t):
                            # ... [body elided: collects footnote text] ...
                        handle_text.append(collect_footnote)
                    # NOTE(review): `footnote is not []` is an identity
                    # comparison against a fresh list literal, so it is
                    # always True; presumably `footnote != []` was meant.
                    elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                        # ... [pop handler elided] ...
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=''.join(footnote))
                        self.add_snippet(book, doc)
                        # ... [footnote reset elided] ...

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        # ... [fragments[fid] = { ... opening elided] ...
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        # Theme names are not book text — swallow them.
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        # ... [pop handler elided] ...

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue # empty themes list.
                        # ... [del fragments[fid] elided] ...

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       # ... [fragment_anchor kwarg elided] ...
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        # Add searchable fragment
                        self.add_snippet(book, doc)

                    # NOTE(review): as above, `handle_text is not []` is
                    # always True — presumably `!= []` was intended.
                    if text is not None and handle_text is not []:
                        hdl = handle_text[-1]
                        # ... [hdl(text) call elided] ...

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.add_snippet(book, doc)
        # ... [finally: snippets.close() elided] ...

    def remove_picture(self, picture_or_id):
        """Removes a picture from search index."""
        if isinstance(picture_or_id, picture.models.Picture):
            picture_id = picture_or_id.id
        # [else: caller passed a raw id]
            picture_id = picture_or_id
        self.delete_query(self.index.Q(picture_id=picture_id))

    def index_picture(self, picture, picture_info=None, overwrite=True):
        """
        Index a single picture.
        Creates a lucene document for extracted metadata
        and calls self.index_area() to index the contents of the picture.
        """
        # ... [overwrite guard elided] ...
            # we don't remove snippets, since they might be still needed by
            # threads using not reopened index
            self.remove_picture(picture)

        picture_doc = {'picture_id': int(picture.id)}
        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
            'authors', 'title', 'epochs', 'kinds', 'genres'])
        picture_doc.update(meta_fields)

        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
        self.index.add(picture_doc)
        # Area docs inherit the picture's fields, but not the is_book flag.
        del picture_doc['is_book']
        for area in picture.areas.all():
            self.index_area(area, picture_fields=picture_doc)

    def index_area(self, area, picture_fields):
        """
        Indexes themes and objects on the area.
        """
        doc = dict(picture_fields)
        doc['area_id'] = area.id
        doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
        doc['uid'] = 'area%s' % area.id
        # ... [self.index.add(doc) elided] ...
class SearchResult(object):
    """A single book's search hits (sections and fragments), built from
    Solr result documents and mergeable across queries."""

    def __init__(self, doc, how_found=None, query_terms=None):
        # ... [boost / hit-list initialisation elided from this excerpt] ...
        self._processed_hits = None  # processed hits

        self.query_terms = query_terms

        # ... [try: elided] ...
            self._score = doc['score']
        # ... [except / fallback elided] ...

        self.book_id = int(doc["book_id"])

        # ... [try: elided] ...
            self.published_date = int(doc.get("published_date"))
        # ... [except (ValueError/TypeError presumably): elided] ...
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header of fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            # Default span to 1 when the field is missing/None.
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            # Hit layout: (position-tuple, fragment-anchor, score, extras).
            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
            # ... [closing of the hit tuple elided] ...

            self._hits.append(hit)

    # NOTE(review): presumably decorated @classmethod in the full file.
    def from_book(cls, book, how_found=None, query_terms=None):
        # Build a result directly from a Book, scored by popularity.
        # ... [doc dict opening elided] ...
            'score': book.popularity.count,
        # ... [remaining doc fields elided] ...
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        # ... [result tweaks / return elided] ...

    # __str__/__repr__ (header elided from this excerpt):
        return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    # __bytes__ (header elided):
        return str(self).encode('utf-8')

    # score accessor (header elided):
        return self._score * self.boost

    def merge(self, other):
        # Fold another result for the same book into this one.
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        # Negative scores never lower the aggregate.
        self._score += max(other._score, 0)
        # ... [return self elided] ...

    # get_book — lazily fetch and cache the Book model (header elided):
        if self._book is not None:
            # ... [return cached book elided] ...
        # ... [try: elided] ...
            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
        except catalogue.models.Book.DoesNotExist:
            # ... [cache-miss handling elided] ...

    book = property(get_book)

    # hits processing (property header and the POSITION / FRAGMENT /
    # SCORE / OTHER index constants are elided from this excerpt):
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(list(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)

        def remove_duplicates(lst, keyfn, larger):
            # Keep, per key, the element preferred by `larger`.
            # ... [accumulator dict and loop elided] ...
                if larger(els[eif], e):
                    # ... [skip / replace logic elided] ...

        # remove fragments with duplicated fid's and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])

        # remove duplicate sections
        # ... [sections dict init and loop header elided] ...
            si = s[self.POSITION][self.POSITION_INDEX]
            # Keep only the highest-scoring hit per section index.
            if sections[si]['score'] >= s[self.SCORE]:
                # ... [continue elided] ...

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
            # ... [closing of dict elided] ...
            m.update(s[self.OTHER])
            # ... [sections[si] = m elided] ...

        hits = list(sections.values())

        # ... [per-fragment loop and try: elided] ...
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # ... [skip missing fragment elided] ...

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            # ... [themes_hit set init elided] ...
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    # NOTE(review): str.split takes a literal separator,
                    # not a regex — r' +' splits on the two characters
                    # " +". Probably a latent bug; confirm intent.
                    tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(str.lower, tms)
                    for qt in self.query_terms:
                        # ... [membership test elided] ...
                            themes_hit.add(f[self.OTHER]['themes'][i])
                        # ... [break elided] ...

            def theme_by_name(n):
                # Resolve a theme name back to its Tag object (or None).
                th = list(filter(lambda t: t.name == n, themes))
                # ... [return first-or-None elided] ...
            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))

            m = {'score': f[self.SCORE],
            # ... [fragment key elided] ...
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
            # ... [themes key elided] ...
                 'themes_hit': themes_hit
            # ... [closing of dict elided] ...
            m.update(f[self.OTHER])
            # ... [hits.append(m) elided] ...

        # Best hits first.
        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        # ... [return hits elided] ...

    # NOTE(review): presumably decorated @staticmethod in the full file.
    def aggregate(*result_lists):
        # Merge results across lists so each book appears exactly once.
        # ... [books dict init elided] ...
        for rl in result_lists:
            # ... [inner loop over results elided] ...
                if r.book_id in books:
                    books[r.book_id].merge(r)
                # ... [else: store r elided] ...
        return books.values()

    def get_sort_key(self):
        # ... [tuple opening elided — presumably includes the score] ...
                self.book.sort_key_author if self.book else '',
                self.book.sort_key if self.book else '')

    def __lt__(self, other):
        # Inverted on purpose: "less than" means a better sort key, so an
        # ascending sort yields best-first ordering.
        return self.get_sort_key() > other.get_sort_key()

    def __eq__(self, other):
        return self.get_sort_key() == other.get_sort_key()

    # __len__ (header elided):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        # ... [try: elided] ...
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            # ... [fallback return elided] ...
class PictureResult(object):
    """A single picture's search hits (areas), built from Solr docs;
    the picture-side analogue of SearchResult."""

    def __init__(self, doc, how_found=None, query_terms=None):
        # ... [boost / hit-list initialisation elided from this excerpt] ...
        self.query_terms = query_terms
        # ... [picture cache init elided] ...
        self._processed_hits = None

        # ... [try: elided] ...
            self._score = doc['score']
        # ... [except / fallback elided] ...

        self.picture_id = int(doc["picture_id"])

        # Only area documents carry hit details.
        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            # ... [closing of the hit tuple elided] ...

            self._hits.append(hit)

    # __str__/__repr__ (header elided):
        return "<PR id=%d score=%f >" % (self.picture_id, self._score)

    # score accessor (header elided):
        return self._score * self.boost

    def merge(self, other):
        # Fold another result for the same picture into this one.
        if self.picture_id != other.picture_id:
            # ... [raise ValueError( opening elided] ...
                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
        self._hits += other._hits
        # Negative scores never lower the aggregate.
        self._score += max(other._score, 0)
        # ... [return self elided] ...

    # hits processing (property header and SCORE / OTHER constants
    # elided from this excerpt):
        if self._processed_hits is not None:
            return self._processed_hits

        # ... [hits list init elided] ...
        for hit in self._hits:
            # ... [try: elided] ...
                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # ... [skip missing area elided] ...

            # Figure out if we were searching for a token matching some word in theme name.
            # ... [themes_hit set init elided] ...
            if self.query_terms is not None:
                for i in range(0, len(hit[self.OTHER]['themes'])):
                    # NOTE(review): str.split takes a literal separator —
                    # r' +' splits on the two characters " +", not a regex.
                    tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(str.lower, tms)
                    for qt in self.query_terms:
                        # ... [membership test elided] ...
                            themes_hit.add(hit[self.OTHER]['themes'][i])
                        # ... [break elided] ...

            # ... [m dict opening elided] ...
                'score': hit[self.SCORE],
            # ... [area key elided] ...
                'themes_hit': themes_hit,
            # ... [closing of dict elided] ...
            m.update(hit[self.OTHER])
            # ... [hits.append(m) elided] ...

        # Best hits first.
        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        # ... [return hits elided] ...

    def get_picture(self):
        # Lazily fetch and cache the Picture model instance.
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        # ... [return self._picture elided] ...

    picture = property(get_picture)

    # NOTE(review): presumably decorated @staticmethod in the full file.
    def aggregate(*result_lists):
        # Merge results across lists so each picture appears exactly once.
        # ... [books dict init and inner loop elided] ...
        for rl in result_lists:
            # ... [loop over results elided] ...
                if r.picture_id in books:
                    books[r.picture_id].merge(r)
                # [else: first occurrence of this picture]
                    books[r.picture_id] = r
        return books.values()

    def __lt__(self, other):
        return self.score < other.score

    def __eq__(self, other):
        return self.score == other.score
class Search(SolrIndex):
    """Read-only search facade over the Solr index."""

    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by boolean query.
        modal - applies to boolean query
        fuzzy - should the query by fuzzy.
        """
        # ... [initial q elided] ...
        # NOTE(review): query.split(r" ") splits on a literal single
        # space — the raw-string prefix is redundant here.
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
        # ... [return q elided] ...

    def search_by_author(self, words):
        # Top-level findable books whose cached author matches each word,
        # ordered by popularity; capped at 30 results.
        from catalogue.models import Book
        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
        # ... [loop over words elided] ...
            # \m / \M are PostgreSQL regex word-boundary markers.
            books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    def search_words(self, words, fields, required=None, book=True, picture=False):
        # Author-only book searches are delegated to the DB-backed path.
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)
        # ... [filters list init and word/field loop headers elided;
        # stopwords are only skipped for plain word searches] ...
                if book or picture or (word not in stopwords):
                    # ... [inner field loop elided] ...
                        q = self.index.Q(**{field: word})
                        if word_filter is None:
                            # ... [assign / OR-combine elided] ...
            filters.append(word_filter)
        # [if required: same accumulation, AND-ed into the final query]
            required_filter = None
            for field in required:
                # ... [word loop elided] ...
                    if book or picture or (word not in stopwords):
                        q = self.index.Q(**{field: word})
                        if required_filter is None:
                            # ... [assign / OR-combine elided] ...
            filters.append(required_filter)

        # ... [empty-filter guard and params init elided] ...
        # [if book:]
            params['is_book'] = True
        # [if picture:]
            params['picture_id__gt'] = 0
        # [else:]
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns a snippet for found scoreDoc.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            # ... [clamp num elided] ...
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        # ... [try: snippets.open(); idx init elided] ...
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    # ... [skip this hit elided] ...
                text = snippets.get((int(position),
                # ... [int(length)) closing elided] ...
                snip = self.index.highlight(text=text, field=field, q=query)
                # Fall back to the non-stemmed field when nothing matched.
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                if snip not in snips:
                    # ... [store snip / decrement num elided] ...
                # ... [idx advance elided] ...

        except IOError as e:
            # Distinguish a missing book from a genuinely broken file;
            # parent books legitimately have no snippet file.
            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
            # ... [if not book: elided] ...
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
        # ... [finally: snippets.close() elided] ...

        # remove verse end markers..
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips
        # ... [return snips elided] ...

    # NOTE(review): presumably decorated @staticmethod in the full file.
    def apply_filters(query, filters):
        """
        Apply filters to a query
        """
        # Drop empty filters, then AND each remaining one into the query.
        filters = filter(lambda x: x is not None, filters)
        # ... [loop header elided] ...
            query = query.query(f)
        # ... [return query elided] ...
# Optionally replace Search with a mock implementation (e.g. for tests
# or dev setups without a running Solr), controlled by settings.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search