# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
import logging
import operator
import os
import re

from functools import reduce, total_ordering
from itertools import chain

from django.conf import settings
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import scorched

import catalogue.models
import picture.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from wolnelektury.utils import makedirs
from . import custom

log = logging.getLogger('search')

if os.path.isfile(settings.SOLR_STOPWORDS):
    stopwords = set(
        line.strip()
        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
else:
    stopwords = set()
32 class SolrIndex(object):
33 def __init__(self, mode=None):
34 self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    Snippets are concatenated together; their positions and lengths
    are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
    def open(self, mode='r'):
        """
        Open the snippet file (always in binary mode). Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'
        if 'w' in mode and os.path.exists(self.path):
            # Find an unused revision for a fresh snippet file.
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1
        self.file = open(self.path, mode)
        self.position = 0
        return self
    def add(self, snippet):
        """
        Append a snippet (unicode string) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        self.file.write(txt)
        pos = (self.position, len(txt))
        self.position += len(txt)
        return pos
    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode snippet
        stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        if self.file:
            self.file.close()

    def remove(self):
        """Remove the snippet file and forget the revision."""
        self.file = None
        self.revision = None
        try:
            os.unlink(self.path)
        except OSError:
            pass
class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')
    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, scorched.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st, rows = 0, 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            # FIXME: With Solr API change, this doesn't work.
            # self.index.delete(uids)
            return True
        return False
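    # For example, removing all index documents for one book (sketch):
    #
    #   idx = Index()
    #   idx.delete_query(idx.index.Q(book_id=1234))
    #
    # Each query is paginated through Solr 100 uids at a time; the actual
    # delete call is currently disabled (see the FIXME above).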
    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them back [all, or just the ones passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    self.index.add({
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                    })
                elif isinstance(tag, PDCounterBook):
                    self.index.add({
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                    })
                else:
                    self.index.add({
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                    })
    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc
    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from the search index.
        book_or_id - Book instance or book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id
        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()
    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # We don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index.
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # let's not index it - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
        }

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]

        self.index_content(book, book_fields=book_fields)
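    # The indexed book document is a flat dict; roughly (values illustrative):
    #
    #   {'book_id': 520, 'parent_id': 489, 'title': 'Pan Tadeusz',
    #    'authors': 'Adam Mickiewicz', 'published_date': '1834', 'uid': 'book520'}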
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
    ]

    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
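    # Matches a trailing year (with optional "]", "." or spaces after it) in
    # a source description, e.g. "Czytelnik, Warszawa 1967" -> "1967",
    # "Kraków 1898]." -> "1898".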
    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by
        field name.
        """
        fields = {}

        if book_info is None:
            with open(book.xml_file.path) as f:
                book_info = dcparser.parse(f)

        fields['slug'] = book.slug
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = str(p)
                    else:
                        persons = ', '.join(map(str, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get the published date
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields
348 # def add_gaps(self, fields, fieldname):
350 # Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
351 # This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
355 # yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
356 # return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master
    def index_content(self, book, book_fields):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
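        # walker() flattens a subtree into (start_node, text, end_node)
        # events; e.g. for <strofa>Ala <slowo>ma</slowo> kota</strofa> it
        # yields, in order: (strofa, None, None), (None, "Ala ", None),
        # (slowo, None, None), (None, "ma", None), (None, None, slowo),
        # (None, " kota", None), (None, None, strofa).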
        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub(r"(?m)/$", "", text)
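        # fix_format() joins list content and strips trailing "/" verse
        # markers at line ends, e.g. fix_format(u"wersu/\ndalej") ->
        # u"wersu\ndalej", and fix_format([None, u"a", u"b"]) -> u"a b".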
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            return doc
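        # Each part document gets a uid like "part<book>-<index>-<span>-<anchor>",
        # e.g. "part520-12-1-" for a section or "part520-12-3-f7" for a
        # fragment (values illustrative).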
        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(str, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        hdl(text)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
                self.index.add(doc)
        finally:
            snippets.close()
    def remove_picture(self, picture_or_id):
        """Removes a picture from the search index."""
        if isinstance(picture_or_id, picture.models.Picture):
            picture_id = picture_or_id.id
        else:
            picture_id = picture_or_id
        self.delete_query(self.index.Q(picture_id=picture_id))
    def index_picture(self, picture, picture_info=None, overwrite=True):
        """
        Indexes the picture.
        Creates a Lucene document for the extracted metadata
        and calls self.index_area() to index the contents of the picture.
        """
        if overwrite:
            # We don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index.
            self.remove_picture(picture)

        picture_doc = {'picture_id': int(picture.id)}
        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
            'authors', 'title', 'epochs', 'kinds', 'genres'])

        picture_doc.update(meta_fields)

        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
        self.index.add(picture_doc)
        del picture_doc['is_book']
        for area in picture.areas.all():
            self.index_area(area, picture_fields=picture_doc)
    def index_area(self, area, picture_fields):
        """
        Indexes themes and objects on the area.
        """
        doc = dict(picture_fields)
        doc['area_id'] = area.id
        doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
        doc['uid'] = 'area%s' % area.id
        self.index.add(doc)
@total_ordering
class SearchResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = int(header_span) if header_span is not None else 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
            })

            self._hits.append(hit)
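        # A hit is a tuple:
        #   ((header_type, header_index, header_span), fragment_anchor, score, other_dict)
        # -- this layout is what the POSITION/FRAGMENT/SCORE/OTHER indices
        # defined below refer to.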
    @classmethod
    def from_book(cls, book, how_found=None, query_terms=None):
        doc = {
            'score': book.popularity.count,
            'book_id': book.id,
            'published_date': 0,
        }
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        result._book = book
        return result
616 return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
617 (self.book_id, len(self._hits),
618 len(self._processed_hits) if self._processed_hits else -1,
619 self._score, len(self.snippets))
622 return str(self).encode('utf-8')
626 return self._score * self.boost
    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError(
                "this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
    def get_book(self):
        if self._book is not None:
            return self._book
        try:
            self._book = catalogue.models.Book.objects.get(id=self.book_id)
        except catalogue.models.Book.DoesNotExist:
            self._book = None
        return self._book

    book = property(get_book)
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # split the hits into sections and fragments
        # (materialize frags: a bare filter() iterator would be exhausted
        # after its first use below)
        frags = list(filter(lambda r: r[self.FRAGMENT] is not None, self._hits))

        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        # keep only sections not covered by any fragment
        sect = filter(lambda s: 0 == len(list(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
            f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)
        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fids and duplicated snippets
        def cmp_score(a, b):
            # Python 3 has no cmp(); emulate it for score comparison.
            return (a[self.SCORE] > b[self.SCORE]) - (a[self.SCORE] < b[self.SCORE])
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], cmp_score)
        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip an existing section unless this hit scores higher
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = list(sections.values())
        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in a theme's name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = list(map(str.lower, tms))
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = list(filter(lambda t: t.name == n, themes))
                if th:
                    return th[0]
                return None
            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits
    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()
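    # Usage sketch (illustrative): collapse several per-query hit lists into
    # a single SearchResult per book:
    #
    #   results = SearchResult.aggregate(author_hits, content_hits)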
    def get_sort_key(self):
        return (-self.score,
                self.published_date,
                self.book.sort_key_author if self.book else '',
                self.book.sort_key if self.book else '')

    def __lt__(self, other):
        return self.get_sort_key() > other.get_sort_key()

    def __eq__(self, other):
        return self.get_sort_key() == other.get_sort_key()
    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
@total_ordering
class PictureResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self.query_terms = query_terms
        self._picture = None
        self._hits = []
        self._processed_hits = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.picture_id = int(doc["picture_id"])

        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            })

            self._hits.append(hit)
805 return u"<PR id=%d score=%f >" % (self.picture_id, self._score)
812 return self._score * self.boost
814 def merge(self, other):
815 if self.picture_id != other.picture_id:
817 "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
818 self._hits += other._hits
819 self._score += max(other._score, 0)
    SCORE = 0
    OTHER = 1

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        hits = []
        for hit in self._hits:
            try:
                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in a theme's name.
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(hit[self.OTHER]['themes'])):
                    tms = re.split(r' +', hit[self.OTHER]['themes'][i]) + hit[self.OTHER]['themes_pl'][i].split(' ')
                    tms = list(map(str.lower, tms))
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(hit[self.OTHER]['themes'][i])
                            break

            m = {
                'score': hit[self.SCORE],
                'area': area,
                'themes_hit': themes_hit,
            }
            m.update(hit[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits
    def get_picture(self):
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        return self._picture

    picture = property(get_picture)
    @staticmethod
    def aggregate(*result_lists):
        results = {}
        for rl in result_lists:
            for r in rl:
                if r.picture_id in results:
                    results[r.picture_id].merge(r)
                else:
                    results[r.picture_id] = r
        return results.values()
879 def __lt__(self, other):
880 return self.score < other.score
882 def __eq__(self, other):
883 return self.score == other.score
class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')
    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns one term query per word in `query`, joined with the boolean
        operator given as `modal`.
        """
        if query is None:
            query = ''
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split()), q)

        return q
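    # E.g. make_term_query(u"ala ma kota") builds roughly
    # (text:ala OR text:ma OR text:kota); pass modal=operator.and_ to
    # require all terms.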
    def search_by_author(self, words):
        from catalogue.models import Book
        books = Book.objects.filter(parent=None).order_by('-popularity__count')
        for word in words:
            # \m and \M are PostgreSQL regex word-boundary markers.
            books = books.filter(cached_author__iregex=r'\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
    def search_words(self, words, fields, required=None, book=True, picture=False):
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)

        filters = []
        for word in words:
            if book or picture or (word not in stopwords):
                word_filter = None
                for field in fields:
                    q = self.index.Q(**{field: word})
                    if word_filter is None:
                        word_filter = q
                    else:
                        word_filter |= q
                filters.append(word_filter)
        if required:
            required_filter = None
            for field in required:
                for word in words:
                    if book or picture or (word not in stopwords):
                        q = self.index.Q(**{field: word})
                        if required_filter is None:
                            required_filter = q
                        else:
                            required_filter |= q
            filters.append(required_filter)
        if not filters:
            return []

        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
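    # Usage sketch (illustrative):
    #
    #   s = Search()
    #   hits = s.search_words(['pan', 'tadeusz'], ['title', 'text'])
    #   hits[0].book  # -> catalogue.models.Book instance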
    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns text snippets for a search result.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1
                idx += 1
        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id)
            if not book:
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips

        return snips
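    # Usage sketch (illustrative; `result` comes from search_words() and
    # `q` is the scorched query object used for that search):
    #
    #   snips = Search().get_snippets(result, q, num=3)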
    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)

        return query
1011 if getattr(settings, 'SEARCH_MOCK', False):
1012 from .mock_search import Search