# -*- coding: utf-8 -*-
# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
from django.conf import settings

import os
import re
import operator
import logging

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from itertools import chain
import sunburnt
import custom
from wolnelektury.utils import makedirs
log = logging.getLogger('search')

class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)

class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together; their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'w' in mode:
            if os.path.exists(self.path):
                # don't overwrite snippets which may still be read by threads
                # using a not-yet-reopened index; bump the revision until we
                # find a free filename instead
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length), return a unicode string
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close snippet file"""
        if self.file:
            self.file.close()

    def remove(self):
        """Remove the snippet file(s) for this book, including revisions."""
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass


class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False
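
    # A sketch of intended use (assumes an 'rw' index):
    #     idx = Index()
    #     idx.delete_query(idx.index.Q(book_id=1), idx.index.Q(tag_id=2))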

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # no tags given: remove all of them
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)
        if not remove_only:
            # then add them [all or just the ones passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                    }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                    }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                    }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from the search index.
        book - Book instance."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(
            book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
        # let's not index source_name - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
        }

        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)
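
    # Typical reindexing loop (a sketch; commit/optimize details depend on the
    # Solr client configuration):
    #     index = Index()
    #     for book in catalogue.models.Book.objects.all():
    #         index.index_book(book)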

    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
    ]

    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
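    # For example (hypothetical source_name), the trailing year is captured:
    #     published_date_re.search(u"Czytelnik, Warszawa 1990.").groups()[0] == '1990'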

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # Dublin Core fields
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # extract the publish date from source_name
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=()):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    # propagate ignore_tags into the subtree
                    for b, t, e in walker(child, ignore_tags=ignore_tags):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator\
                #         and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub(r"(?m)/$", "", text)
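
        # For instance (hypothetical verse content, '/' marking a verse end):
        #     fix_format([u"Ala ma kota/", None]) == u"Ala ma kota"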

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        def give_me_utf8(s):  # helper name assumed; encodes unicode for the index
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    # note: truthiness check, not the always-true "is not []"
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote))
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
                self.index.add(doc)
        finally:
            snippets.close()


class SearchResult(object):
    def __init__(self, doc, how_found=None, query=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except ValueError:
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
            })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        if self._book is not None:
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # indices into a hit tuple: (position, fragment, score, other),
    # where position = (header_type, header_index, header_span)
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # split into sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fid's and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # keep the existing section if it scores higher
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # the fragment may be missing from the database; skip it
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    # split on runs of spaces (str.split(r' +') would look for a literal ' +')
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()
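
    # Sketch: merge results of several queries into one list with a single
    # SearchResult per book, best first:
    #     results = SearchResult.aggregate(phrase_hits, everywhere_hits)
    #     results.sort(reverse=True)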

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None


class Search(SolrIndex):
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by a boolean query.
        modal - operator used to join the term queries.
        """
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(" ")), q)

        return q

    def search_phrase(self, searched, field='text', book=False,
                      filters=None, snippets=False):
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        q = self.index.query(**{field: searched})
        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
        res = q.execute()
        return [SearchResult(found, how_found=u'search_phrase') for found in res]
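
    # e.g. (a sketch):
    #     results = Search().search_phrase(u"ala ma kota", book=True)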

    def search_some(self, searched, fields, book=True,
                    filters=None, snippets=True, query_terms=None):
        assert isinstance(fields, list)
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        query = self.index.Q()

        for fld in fields:
            query = self.index.Q(query | self.make_term_query(searched, fld))

        query = self.index.query(query)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        res = query.execute()
        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]

    def search_everywhere(self, searched, query_terms=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author surname, another a part of the title, and the
        rest words from the third chapter.
        """
        books = []
        # content-only query: themes x content
        q = self.make_term_query(searched, 'text')
        q_themes = self.make_term_query(searched, 'themes_pl')

        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
        res = query.execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))

        # query themes/content x author/title/tags
        in_content = self.index.Q()
        in_meta = self.index.Q()

        for fld in ['themes_pl', 'text']:
            in_content |= self.make_term_query(searched, field=fld)

        for fld in ['tags', 'authors', 'title']:
            in_meta |= self.make_term_query(searched, field=fld)

        q = in_content & in_meta
        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))

        return books
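
    # e.g. (a sketch; terms may hit themes, text or metadata):
    #     hits = Search().search_everywhere(u"Mickiewicz wolność",
    #                                       query_terms=set([u'mickiewicz', u'wolność']))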

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns a snippet for a found scoreDoc.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1  # advance, or this would loop forever
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if snip and snip not in snips:
                    snips[idx] = snip
                    num -= 1
                idx += 1
        except IOError, e:
            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips

        return snips

    def hint_tags(self, query, pdcounter=True, prefix=True):
        """
        Return auto-complete hints for tags, either by prefix
        or by matching terms.
        """
        q = self.index.Q()
        query = query.strip()
        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q |= self.index.Q(**{field: query + "*"})
            else:
                q |= self.make_term_query(query, field=field)
        qu = self.index.query(q)

        return self.search_tags(qu, pdcounter=pdcounter)
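
    # e.g. (a sketch): Search().hint_tags(u"mick") matches tags whose name starts
    # with "mick" (including PDCounter authors/books when pdcounter=True).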

    def search_tags(self, query, filters=None, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not filters:
            filters = []
        if not pdcounter:
            filters.append(~self.index.Q(is_pdcounter=True))
        res = self.apply_filters(query, filters).execute()

        tags = []
        pd_tags = []

        for doc in res:
            is_pdcounter = doc.get('is_pdcounter', False)
            category = doc.get('tag_category')
            try:
                if is_pdcounter:
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag.
                    else:
                        # should not happen; skip the stray document
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s" % (
                            int(doc.get('tag_id')), category))
                        continue
                    pd_tags.append(tag)
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                    tags.append(tag)

            except catalogue.models.Tag.DoesNotExist:
                pass
            except PDCounterAuthor.DoesNotExist:
                pass
            except PDCounterBook.DoesNotExist:
                pass

        # add PDCounter tags, skipping slugs already covered by regular tags
        tags_slugs = set(map(lambda t: t.slug, tags))
        tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)

        log.debug('search_tags: %s' % tags)

        return tags

    def hint_books(self, query, prefix=True):
        """
        Returns auto-complete hints for book titles.
        We do not index 'pseudo' title-tags, so the title field is queried directly.
        """
        q = self.index.Q()
        query = query.strip()
        if prefix:
            q |= self.index.Q(title=query + "*")
        else:
            q |= self.make_term_query(query, field='title')
        qu = self.index.query(q)
        only_books = self.index.Q(is_book=True)
        return self.search_books(qu, [only_books])

    def search_books(self, query, filters=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        bks_found = set()
        query = query.query(is_book=True)
        res = self.apply_filters(query, filters).field_limit(['book_id'])
        for doc in res.execute():
            try:
                bid = doc['book_id']
                if bid not in bks_found:
                    bks.append(catalogue.models.Book.objects.get(id=bid))
                    bks_found.add(bid)
            except catalogue.models.Book.DoesNotExist:
                pass
        # cap the result list (applying max_results here is an assumption)
        return bks[:max_results]

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query
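
    # Composing filters (a sketch; None entries are skipped):
    #     s = Search()
    #     q = s.index.query(title=u"Pan Tadeusz")
    #     q = Search.apply_filters(q, [s.index.Q(is_book=True), None])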


if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search