# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
from functools import reduce, total_ordering
from itertools import chain
import logging
import operator
import os
import re

from django.conf import settings
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import scorched
import catalogue.models
import picture.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from wolnelektury.utils import makedirs
from . import custom

log = logging.getLogger('search')

if os.path.isfile(settings.SOLR_STOPWORDS):
    stopwords = set(
        line.strip()
        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
else:
    stopwords = set()

class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)

class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'  # snippets are stored as UTF-8 bytes
        if 'w' in mode and os.path.exists(self.path):
            # pick a fresh revision so the current file stays readable
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self
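
    # Each snippet is appended to one flat file per book; the index itself
    # only stores (position, length) pairs pointing into that file.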
    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        self.file.write(txt)
        pos = (self.position, len(txt))
        self.position += len(txt)
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the snippet
        stored there as a unicode string.
        """
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')

    def close(self):
        """Close the snippet file."""
        if self.file:
            self.file.close()

    def remove(self):
        # remove the snippet file for this book, if present
        if os.path.exists(self.path):
            os.unlink(self.path)

class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so reimplement it
        by paging through the matching uids and deleting those.
        """
        uids = set()
        for q in queries:
            if isinstance(q, scorched.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st, rows = 0, 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                uids.update(res['uid'] for res in ids)
                st += rows
        if uids:
            self.index.delete(uids)

    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from the index
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:
            # remove all tags
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them back [all, or just the ones passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                    }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                    }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                    }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Remove a book from the search index.
        book_or_id - Book instance or book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # let's not index it - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f
        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)

        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
        }

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]

        self.index_content(book, book_fields=book_fields)

    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
    ]

    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
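
    # source_name conventionally ends with the publication year, e.g.
    # "Czytelnik, Warszawa 1990[.]"; capture the trailing run of digits.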
    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['is_book'] = True

        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = str(p)
                    else:
                        persons = ', '.join(map(str, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get the published date from the end of source_name
        pd = ''
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
    #     This allows for phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Return the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []
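
        # The walker below yields (start_node, text, end_node) triples: an
        # element produces a "start" event before its text and children, and
        # an "end" event afterwards, so handlers can track open constructs.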
        def walker(node):
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
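
        # fix_format joins collected text chunks and strips the trailing "/"
        # verse-end markers used in the WL text layout.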
        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)
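
        # add_part builds one index document per section or fragment; it
        # stores the snippet's (position, length) so the raw text can be
        # re-read from the snippet file when highlighting results.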
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            snip_pos = snippets.add(fields["text"])
            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']

            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            return doc

        fragments = {}
        snippets = Snippets(book.id).open('w')

        for position, header in enumerate(master):
            if header.tag in self.skip_header_tags:
                continue
            if header.tag is etree.Comment:
                continue

            # collect the text of this section and of any open footnote
            content = []
            footnote = []

            def all_content(text):
                for frag in fragments.values():
                    frag['text'].append(text)
                content.append(text)
            handle_text = [all_content]
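            # handle_text is a stack of text handlers: each open construct
            # (footnote, theme label) pushes its own handler and pops it when
            # the construct ends, so text always lands in the right bucket.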

            for start, text, end in walker(header):
                # handle footnotes
                if start is not None and start.tag in self.footnote_tags:
                    footnote = []

                    def collect_footnote(t):
                        footnote.append(t)

                    handle_text.append(collect_footnote)
                elif end is not None and footnote and end.tag in self.footnote_tags:
                    handle_text.pop()
                    doc = add_part(snippets, header_index=position, header_type=header.tag,
                                   text=u''.join(footnote),
                                   is_footnote=True)
                    self.index.add(doc)
                    footnote = []

                # handle fragments and themes
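                # <begin>/<end> nodes carry ids like "b123"/"e123", and the
                # corresponding <motyw> (theme) node "m123"; [1:] strips the
                # letter prefix so all three share one fragment id.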
                if start is not None and start.tag == 'begin':
                    fid = start.attrib['id'][1:]
                    fragments[fid] = {
                        'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                # themes for this fragment
                elif start is not None and start.tag == 'motyw':
                    fid = start.attrib['id'][1:]
                    handle_text.append(lambda text: None)
                    if start.text is not None:
                        fragments[fid]['themes'] += [s.strip() for s in start.text.split(',')]
                elif end is not None and end.tag == 'motyw':
                    handle_text.pop()

                elif start is not None and start.tag == 'end':
                    fid = start.attrib['id'][1:]
                    if fid not in fragments:
                        continue  # a broken <end> node, skip it
                    frag = fragments[fid]
                    if not frag['themes']:
                        continue  # empty themes list
                    del fragments[fid]

                    doc = add_part(snippets,
                                   header_type=frag['start_header'],
                                   header_index=frag['start_section'],
                                   header_span=position - frag['start_section'] + 1,
                                   fragment_anchor=fid,
                                   text=fix_format(frag['text']),
                                   themes=frag['themes'])
                    self.index.add(doc)

                # pass the text to the innermost active handler
                if text is not None and handle_text:
                    hdl = handle_text[-1]
                    hdl(text)

            # in the end, add a section text
            doc = add_part(snippets, header_index=position,
                           header_type=header.tag, text=fix_format(content))
            self.index.add(doc)

    def remove_picture(self, picture_or_id):
        """Removes a picture from the search index."""
        if isinstance(picture_or_id, picture.models.Picture):
            picture_id = picture_or_id.id
        else:
            picture_id = picture_or_id
        self.delete_query(self.index.Q(picture_id=picture_id))

    def index_picture(self, picture, picture_info=None, overwrite=True):
        """
        Creates a Lucene document for the extracted metadata
        and calls self.index_area() to index the contents of the picture.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index
            self.remove_picture(picture)

        picture_doc = {'picture_id': int(picture.id)}
        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
            'authors', 'title', 'epochs', 'kinds', 'genres'])

        picture_doc.update(meta_fields)

        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
        self.index.add(picture_doc)
        # the same doc, minus the book flag, serves as the base for area docs
        del picture_doc['is_book']
        for area in picture.areas.all():
            self.index_area(area, picture_fields=picture_doc)

    def index_area(self, area, picture_fields):
        """
        Indexes themes and objects on the area.
        """
        doc = dict(picture_fields)
        doc['area_id'] = area.id
        doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
        doc['uid'] = 'area%s' % area.id
        self.index.add(doc)

@total_ordering
class SearchResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        self._score = doc['score']

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = int(header_span) if header_span is not None else 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)
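
            # Each hit is a tuple: ((header_type, header_index, header_span),
            # fragment anchor, score, extras dict) - matching the POSITION,
            # FRAGMENT, SCORE and OTHER offsets used by the hits property.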
            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
            })

            self._hits.append(hit)

    @classmethod
    def from_book(cls, book, how_found=None, query_terms=None):
        doc = {'score': book.popularity.count, 'book_id': book.id, 'published_date': 0}
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        result._book = book
        return result

    def __str__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __bytes__(self):
        return str(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError(
                "this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    def get_book(self):
        if self._book is not None:
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # offsets into the hit tuple and its position sub-tuple
    POSITION = 0
    FRAGMENT = 1
    SCORE = 2
    OTHER = 3
    POSITION_INDEX = 1
    POSITION_SPAN = 2

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
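        # A fragment hit covers header indices [index, index + span); any
        # section hit whose index falls inside some fragment's range is
        # dropped below as redundant.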
        frags = [r for r in self._hits if r[self.FRAGMENT] is not None]

        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        # sections not covered by fragments
        def covered(s, f):
            return (f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX]
                    < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN])
        sect = [s for s in sect if not any(covered(s, f) for f in frags)]
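
        # Dedupe helper: keeps one element per key, preferring whichever the
        # "better" predicate favours (here, the higher score).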
        def remove_duplicates(lst, keyfn, better):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els and not better(e, els[eif]):
                    continue
                els[eif] = e
            return list(els.values())

        # remove fragments with duplicated fids; a "better" predicate stands
        # in for the Python 2 cmp-style comparator used previously
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT],
                                  lambda a, b: a[self.SCORE] > b[self.SCORE])
        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
        #                           lambda a, b: a[SCORE] > b[SCORE])

        # remove duplicate sections, keeping the highest-ranked hit for each
        sections = {}
        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = list(sections.values())

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index entry
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = [t.lower() for t in tms]
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = list(filter(lambda t: t.name == n, themes))
                if th:
                    return th[0]
                return None
            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits
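
    # aggregate() merges results coming from several queries into a single
    # result per book, summing scores and concatenating hits.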
    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __lt__(self, other):
        return (-self.score, self.published_date, self.book.sort_key_author, self.book.sort_key) > \
            (-other.score, other.published_date, other.book.sort_key_author, other.book.sort_key)

    def __eq__(self, other):
        return (self.score, self.published_date, self.book.sort_key_author, self.book.sort_key) == \
            (other.score, other.published_date, other.book.sort_key_author, other.book.sort_key)

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None

@total_ordering
class PictureResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self.query_terms = query_terms
        self._picture = None
        self._hits = []
        self._processed_hits = None

        self._score = doc['score']

        self.picture_id = int(doc["picture_id"])

        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            })

            self._hits.append(hit)

    def __str__(self):
        return u"<PR id=%d score=%f >" % (self.picture_id, self._score)

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.picture_id != other.picture_id:
            raise ValueError(
                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    SCORE = 0
    OTHER = 1

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        hits = []
        for hit in self._hits:
            try:
                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # stale index entry
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(hit[self.OTHER]['themes'])):
                    tms = re.split(r' +', hit[self.OTHER]['themes'][i]) + hit[self.OTHER]['themes_pl'][i].split(' ')
                    tms = [t.lower() for t in tms]
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(hit[self.OTHER]['themes'][i])
                            break

            m = {
                'score': hit[self.SCORE],
                'area': area,
                'themes_hit': themes_hit,
            }
            m.update(hit[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)
        self._processed_hits = hits
        return hits

    def get_picture(self):
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        return self._picture

    picture = property(get_picture)

    @staticmethod
    def aggregate(*result_lists):
        results = {}
        for rl in result_lists:
            for r in rl:
                if r.picture_id in results:
                    results[r.picture_id].merge(r)
                else:
                    results[r.picture_id] = r
        return results.values()

    def __lt__(self, other):
        return self.score < other.score

    def __eq__(self, other):
        return self.score == other.score

class Search(SolrIndex):
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Return per-word term queries joined into one boolean query.
        modal - the operator used to join the term queries.
        """
        if query is None:
            query = ''
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(" ")), q)
        return q

    def search_by_author(self, words):
        from catalogue.models import Book
        books = Book.objects.filter(parent=None).order_by('-popularity__count')
        for word in words:
            books = books.filter(cached_author__iregex=r'\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
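
    # search_words() builds one OR-filter per word across the given fields,
    # plus a single filter for the required fields, then runs one Solr query
    # with is_book/picture_id constraints as appropriate.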
    def search_words(self, words, fields, required=None, book=True, picture=False):
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)

        filters = []
        for word in words:
            if book or picture or (word not in stopwords):
                word_filter = None
                for field in fields:
                    q = self.index.Q(**{field: word})
                    if word_filter is None:
                        word_filter = q
                    else:
                        word_filter |= q
                filters.append(word_filter)
        if required:
            required_filter = None
            for field in required:
                for word in words:
                    if book or picture or (word not in stopwords):
                        q = self.index.Q(**{field: word})
                        if required_filter is None:
                            required_filter = q
                        else:
                            required_filter |= q
            filters.append(required_filter)
        if not filters:
            return []

        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns a snippet for a found scoreDoc.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
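
        # Re-read each hit's stored (position, length) byte range from the
        # snippet file and ask Solr to highlight it; fall back to the
        # unstemmed text field when the stemmed one yields no highlight.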
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1  # avoid looping forever on a missing position
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1
                idx += 1
        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id)
            if not book:
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips
        return snips

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query

if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search