# -*- coding: utf-8 -*-
# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
from django.conf import settings

import logging
import operator
import os
import re

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from itertools import chain
import sunburnt
import custom  # local module providing CustomSolrInterface (import path assumed)
from wolnelektury.utils import makedirs

log = logging.getLogger('search')
class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
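
    # mode is handed to CustomSolrInterface: the Index subclass below opens
    # the connection as 'rw', while Search opens it read-only as 'r'.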
class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    Snippets are concatenated together; their positions and
    lengths are kept in Lucene index fields.
    """
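
    # A minimal usage sketch (the book id 1 is just an example):
    #
    #   snippets = Snippets(1).open('w')
    #   try:
    #       position, length = snippets.add(u"some snippet text")
    #   finally:
    #       snippets.close()
    #
    # The (position, length) pair is what gets stored in the index and
    # later handed back to get().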
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        # When opening for writing, bump the revision until an unused
        # snippet file name is found, so readers of the old file are
        # not disturbed.
        if 'w' in mode:
            if os.path.exists(self.path):
                self.revision = 1
                while True:
                    if not os.path.exists(self.path):
                        break
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self
    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos
    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt
94 """Close snippet file"""
class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')
    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st, rows = 0, 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                uids.update(x['uid'] for x in ids)
                st += rows
        if uids:
            self.index.delete(uids)
    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all tags
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)
        # then add them [all or just one passed]
        if not remove_only:
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                    }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                    }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                    }
                self.index.add(doc)
    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc
    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from the search index.
        book_or_id - Book instance or book database id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()
    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document from extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(
            book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
        # let's not index source_name - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f
        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
        }

        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)
    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
    ]

    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
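
    # published_date_re picks the trailing digits out of source_name, e.g.
    # u"Lektura, Warszawa 1884." and u"Lektura [1884]" both yield "1884".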
    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt
        # extract the publish date from source_name
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        fields["published_date"] = pd

        return fields
    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
    #     This allows phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master
    def index_content(self, book, book_fields):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []
        def walker(node, ignore_tags=()):
            if node.tag not in ignore_tags:
                yield node, None, None

            if node.text is not None:
                yield None, node.text, None

            for child in list(node):
                for b, t, e in walker(child):
                    yield b, t, e

            yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
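
        # walker() flattens a subtree into (start, text, end) triples in
        # document order; exactly one element of each triple is not None,
        # so a single loop below can react to starts, text runs and ends.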
        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator \
                #             and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)
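
        # The "(?m)/$" substitution strips trailing "/" verse markers at
        # every line end, matching the "/\n" cleanup applied to snippets
        # in Search.get_snippets() below.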
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc
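
        # The part uid is built from header index, span and fragment anchor,
        # in line with the "book<id>" and "tag<id>" uid schemes used above.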
        def to_utf8(s):  # helper name assumed
            if isinstance(s, unicode):
                return s.encode('utf-8')
            return s

        fragments = {}
        snippets = Snippets(book.id).open('w')

        try:
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []
                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote))
                        self.index.add(doc)
                        footnote = []
                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # collect content
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
                self.index.add(doc)
        finally:
            snippets.close()
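
# A minimal indexing sketch (assuming the underlying sunburnt interface
# exposes commit(), as sunburnt's SolrInterface does):
#
#   index = Index()
#   index.index_tags()
#   for book in catalogue.models.Book.objects.all():
#       index.index_book(book)
#   index.index.commit()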
class SearchResult(object):
    def __init__(self, doc, how_found=None, query=None, query_terms=None):
        # self.search = search
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
            })

            self._hits.append(hit)
    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')
    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self
    def get_book(self):
        if self._book is not None:
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)
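
    # A hit is a tuple: ((header_type, header_index, header_span),
    # fragment_anchor, score, other_data); the constants below are
    # indices into that structure.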
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # split into sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fids and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip a section if a higher-scoring hit for it is already there
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()
        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # the index may contain more entries than the database
                continue

            # Figure out if we were searching for a token matching some word in a theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)
        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits
        return hits
    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()
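
    # e.g. SearchResult.aggregate(phrase_results, everywhere_results)
    # folds hits for the same book into a single SearchResult via merge().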
    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)
    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
class Search(SolrIndex):
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')
    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by a boolean query.
        modal - operator used to join the term queries.
        """
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(" ")), q)

        return q
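
    # For example, make_term_query(u"ala ma kota") ORs together
    # Q(text=u'ala') | Q(text=u'ma') | Q(text=u'kota'); passing
    # modal=operator.and_ requires all of the terms instead.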
    def search_phrase(self, searched, field='text', book=False,
                      filters=None):
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        q = self.index.query(**{field: searched})
        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
        res = q.execute()
        return [SearchResult(found, how_found=u'search_phrase') for found in res]
    def search_some(self, searched, fields, book=True,
                    filters=None, snippets=True, query_terms=None):
        assert isinstance(fields, list)
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        query = self.index.Q()
        for fld in fields:
            query = self.index.Q(query | self.make_term_query(searched, fld))

        query = self.index.query(query)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        res = query.execute()
        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
    def search_everywhere(self, searched, query_terms=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and the rest
        some words from the third chapter.
        """
        books = []
        # content-only query: themes x content
        q = self.make_term_query(searched, 'text')
        q_themes = self.make_term_query(searched, 'themes_pl')

        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
        res = query.execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))

        # query themes/content x author/title/tags
        in_content = self.index.Q()
        in_meta = self.index.Q()

        for fld in ['themes_pl', 'text']:
            in_content |= self.make_term_query(searched, field=fld)

        for fld in ['tags', 'authors', 'title']:
            in_meta |= self.make_term_query(searched, field=fld)

        q = in_content & in_meta
        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))

        return books
    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns a snippet for a found scoreDoc.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    break
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1
                idx += 1
        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id)
            if not book:
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips
        return snips
    def hint_tags(self, query, pdcounter=True, prefix=True):
        """
        Return auto-complete hints for tags.
        """
        q = self.index.Q()
        query = query.strip()
        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q |= self.index.Q(**{field: query + "*"})
            else:
                q |= self.make_term_query(query, field=field)
        qu = self.index.query(q)

        return self.search_tags(qu, pdcounter=pdcounter)
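
    # e.g. hint_tags(u"mick") prefix-matches tag names against u"mick*"
    # in both tag_name and tag_name_pl; with prefix=False it falls back
    # to term queries built by make_term_query().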
    def search_tags(self, query, filters=None, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if filters is None:
            filters = []
        if not pdcounter:
            filters.append(~self.index.Q(is_pdcounter=True))
        res = self.apply_filters(query, filters).execute()

        tags = []
        pd_tags = []

        for doc in res:
            is_pdcounter = doc.get('is_pdcounter', False)
            category = doc.get('tag_category')
            try:
                if is_pdcounter:
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag.
                    else:
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s" % (
                            int(doc.get('tag_id')), category))
                        continue
                    pd_tags.append(tag)
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                    tags.append(tag)
            except catalogue.models.Tag.DoesNotExist:
                pass
            except PDCounterAuthor.DoesNotExist:
                pass
            except PDCounterBook.DoesNotExist:
                pass

        tags_slugs = set(map(lambda t: t.slug, tags))
        tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)

        log.debug('search_tags: %s' % tags)

        return tags
    def hint_books(self, query, prefix=True):
        """
        Returns auto-complete hints for book titles.
        (We do not index 'pseudo' title-tags.)
        """
        q = self.index.Q()
        query = query.strip()
        if prefix:
            q |= self.index.Q(title=query + "*")
        else:
            q |= self.make_term_query(query, field='title')
        qu = self.index.query(q)
        only_books = self.index.Q(is_book=True)
        return self.search_books(qu, [only_books])
    def search_books(self, query, filters=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        bks_found = set()
        query = query.query(is_book=True)
        res = self.apply_filters(query, filters).field_limit(['book_id'])
        for found in res:
            bid = found['book_id']
            try:
                if bid not in bks_found:
                    bks.append(catalogue.models.Book.objects.get(id=bid))
                    bks_found.add(bid)
            except catalogue.models.Book.DoesNotExist:
                pass
        return bks
    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search