# -*- coding: utf-8 -*-
# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
import logging
import operator
import os
import re

from django.conf import settings
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from itertools import chain
import sunburnt
import custom
from wolnelektury.utils import makedirs

log = logging.getLogger('search')

class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
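
# A hedged note: CustomSolrInterface (in the local `custom` module, not shown
# here) is a thin sunburnt-style wrapper around the Solr core configured in
# settings.SOLR. Index below opens it in 'rw' mode for writing; Search opens
# it read-only with mode='r'.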

class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    Snippets are concatenated together; their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'w' in mode and os.path.exists(self.path):
            # don't overwrite a file that may still be in use by readers;
            # bump the revision until we find an unused path instead
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt
94 """Close snippet file"""

class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        return False
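
    # A hedged usage sketch: removing every indexed part of one book.
    # Matching uids are paged out of Solr 100 rows at a time, then deleted
    # in a single request:
    #
    #     idx = Index()
    #     idx.delete_query(idx.index.Q(book_id=1))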

    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all tags
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them (all, or just the ones passed)
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                        }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from the search index.
        book - Book instance."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed
            # by threads using an index that hasn't been reopened yet
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
        # let's not index source_name - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
        }

        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)
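
    # A hedged usage sketch; `book` stands for any catalogue.models.Book
    # with an attached xml_file:
    #
    #     idx = Index()
    #     idx.index_book(book)
    #     idx.index.commit()  # assumption: the interface needs an explicit commit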

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
    ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
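
    # published_date_re picks the trailing year out of a source_name entry,
    # e.g. u"Czytelnik, Warszawa 1968" -> "1968"; the "[\]. ]*$" tail also
    # accepts closing brackets, dots and spaces, so u"... [1884]." -> "1884".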

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get the published date
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
    #     This allows for phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=()):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    # pass ignore_tags down, so ignored subtrees are skipped too
                    for b, t, e in walker(child, ignore_tags):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
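
        # walker() linearizes a subtree into (start, text, end) events; e.g.
        # <strofa>Ala <slowo_obce>ma</slowo_obce> kota</strofa> yields:
        #   (strofa, None, None), (None, u"Ala ", None),
        #   (slowo_obce, None, None), (None, u"ma", None),
        #   (None, None, slowo_obce), (None, u" kota", None),
        #   (None, None, strofa)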

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator \
                #                 and text[i - 1][-1] not in separator:
                #             text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            return s
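
        # Each part document gets a composite uid: e.g. a hit in the third
        # section of book 123, spanning one header with no fragment anchor,
        # becomes uid "part123-2-1-".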

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):
                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        hdl(text)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
                self.index.add(doc)
        finally:
            snippets.close()

class SearchResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        if self._book is not None:
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # indices into the hit tuples built in __init__:
    # hit = ((header_type, header_index, header_span), fragment, score, other)
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # split the hits into sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # keep only sections not covered by any fragment
        sect = filter(lambda s: 0 == len(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
            f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
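
        # e.g. a section hit at header_index 5 is dropped when some fragment
        # hit starts at 4 with span 3, since 4 <= 5 < 4 + 3.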

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fids and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))

        # remove duplicate sections, keeping the best-scored one
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in a theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None

class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        """
        if query is None:
            query = ''
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split()), q)

        return q
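
    # e.g. make_term_query(u"ala ma kota") builds roughly
    # Q(text=u'ala') | Q(text=u'ma') | Q(text=u'kota');
    # passing modal=operator.and_ would require all terms instead.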

    def search_phrase(self, searched, field='text', book=False,
                      filters=None, snippets=False):
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        q = self.index.query(**{field: searched})
        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
        res = q.paginate(rows=100).execute()
        return [SearchResult(found, how_found=u'search_phrase') for found in res]

    def search_some(self, searched, fields, book=True,
                    filters=None, snippets=True, query_terms=None):
        assert isinstance(fields, list)
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        query = self.index.Q()

        for fld in fields:
            query = self.index.Q(query | self.make_term_query(searched, fld))

        query = self.index.query(query)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        res = query.execute()
        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]

    def search_everywhere(self, searched, query_terms=None):
        """
        Tries to use search terms to match different fields of a book (or its parts).
        E.g. one word can be an author's surname, another a part of the title,
        and the rest words from the third chapter.
        """
        books = []
        # content-only query: themes x content
        q = self.make_term_query(searched, 'text')
        q_themes = self.make_term_query(searched, 'themes_pl')

        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
        res = query.execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))

        # query themes/content x author/title/tags
        in_content = self.index.Q()
        in_meta = self.index.Q()

        for fld in ['themes_pl', 'text']:
            in_content |= self.make_term_query(searched, field=fld)

        for fld in ['tags', 'authors', 'title']:
            in_meta |= self.make_term_query(searched, field=fld)

        q = in_content & in_meta
        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))

        return books
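
    # A hedged end-to-end sketch of the search flow:
    #
    #     search = Search()
    #     results = SearchResult.aggregate(search.search_everywhere(u"lokomotywa"))
    #     results.sort(reverse=True)
    #     for r in results:
    #         search.get_snippets(r, u"lokomotywa")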

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns a snippet for a found scoreDoc.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    break
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1
                idx += 1
        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id)
            if not book:
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips

        return snips

    def hint_tags(self, query, pdcounter=True, prefix=True):
        """
        Returns auto-complete hints for tags.
        """
        q = self.index.Q()
        query = query.strip()
        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q |= self.index.Q(**{field: query + "*"})
            else:
                q |= self.make_term_query(query, field=field)
        qu = self.index.query(q)

        return self.search_tags(qu, pdcounter=pdcounter)

    def search_tags(self, query, filters=None, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not filters:
            filters = []
        if not pdcounter:
            filters.append(~self.index.Q(is_pdcounter=True))
        res = self.apply_filters(query, filters).execute()

        tags = []
        pd_tags = []

        for doc in res:
            is_pdcounter = doc.get('is_pdcounter', False)
            category = doc.get('tag_category')
            try:
                if is_pdcounter:
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag.
                    else:
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s",
                                    int(doc.get('tag_id')), category)
                        continue
                    pd_tags.append(tag)
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                    tags.append(tag)
            except catalogue.models.Tag.DoesNotExist:
                pass
            except PDCounterAuthor.DoesNotExist:
                pass
            except PDCounterBook.DoesNotExist:
                pass

        tags_slugs = set(map(lambda t: t.slug, tags))
        tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)

        log.debug('search_tags: %s' % tags)

        return tags

    def hint_books(self, query, prefix=True):
        """
        Returns auto-complete hints for book titles.
        Searches title fields directly, because we do not index
        'pseudo' title-tags.
        """
        q = self.index.Q()
        query = query.strip()
        if prefix:
            q |= self.index.Q(title=query + "*")
            q |= self.index.Q(title_orig=query + "*")
        else:
            q |= self.make_term_query(query, field='title')
            q |= self.make_term_query(query, field='title_orig')
        qu = self.index.query(q)
        only_books = self.index.Q(is_book=True)
        return self.search_books(qu, [only_books])

    def search_books(self, query, filters=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        bks_found = set()
        query = query.query(is_book=True)
        res = self.apply_filters(query, filters).field_limit(['book_id'])
        for r in res:
            try:
                bid = r['book_id']
                if bid not in bks_found:
                    bks.append(catalogue.models.Book.objects.get(id=bid))
                    bks_found.add(bid)
            except catalogue.models.Book.DoesNotExist:
                pass
        return bks[:max_results]

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query
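
    # e.g. apply_filters(q, [self.index.Q(is_book=True), None]) silently
    # drops the None and chains the remaining filter onto q as an extra query.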

if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search