# -*- coding: utf-8 -*-
# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
from django.conf import settings

import logging
import operator
import os
import re

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from itertools import chain
import sunburnt.search
from wolnelektury.utils import makedirs

from . import custom  # local module providing CustomSolrInterface

log = logging.getLogger('search')

class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)

class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    Snippets are concatenated together; their positions and lengths
    are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
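    # For example (hypothetical ids): Snippets(123).path ends in
    # "snippets/123", while Snippets(123, revision=2).path ends in "snippets/123.2".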

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            if os.path.exists(self.path):
                # don't overwrite existing snippet data: bump the revision
                # until we find an unused file name
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode snippet
        stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt
94 """Close snippet file"""

class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from the index
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all tags
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them back [all, or just the ones passed]
            tags = tags or chain(
                catalogue.models.Tag.objects.exclude(category='set'),
                PDCounterAuthor.objects.all(),
                PDCounterBook.objects.all())
            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    tag_doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                    }
                elif isinstance(tag, PDCounterBook):
                    tag_doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                    }
                else:
                    tag_doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                    }
                self.index.add(tag_doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from the search index.
        book - Book instance."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(
            book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
        # let's not index source_name - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
        }

        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)
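
    # Hedged usage sketch (assumes a running Solr instance configured via
    # settings.SOLR):
    #
    #     idx = Index()
    #     idx.index_book(book)   # metadata doc + per-section content parts
    #     idx.index.commit()     # commit via the sunburnt interface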

    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
    ]

    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
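    # E.g. for source_name u"Czytelnik, Warszawa 1990." the regex above
    # captures "1990": the final run of digits, ignoring trailing brackets,
    # dots and spaces.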

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get published date
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator \
                #                 and text[i - 1][-1] not in separator:
                #             text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)
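
        # E.g. fix_format([u"Litwo! Ojczyzno moja!/", None]) drops the None,
        # joins the rest and strips the trailing "/" verse marker, giving
        # u"Litwo! Ojczyzno moja!".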

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            return doc
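
        # (Hypothetical values: a plain section part gets a uid like
        # "part123-4-1-", a fragment part one like "part123-4-2-f7".)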

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(
                                unicode.strip, map(unicode, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)
                    else:
                        if text is not None and handle_text:
                            hdl = handle_text[-1]
                            hdl(text)

                # in the end, add a section text
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
                self.index.add(doc)
        finally:
            snippets.close()

class SearchResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = int(header_span) if header_span is not None else 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
            })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError(
                "this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        if self._book is not None:
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # Layout of a single hit tuple: (position, fragment, score, other),
    # where position is (header_type, header_index, header_span).
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # split hits into sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # keep only sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
            f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
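
        # The filter above keeps only standalone sections: e.g. a fragment
        # spanning sections 4-6 (header_index 4, header_span 3) swallows a
        # bare section hit with header_index 5.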

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fids and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing, lower-scored entries
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        # sort by score, best first
        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None

class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')
        self.default_field = default_field

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by a boolean query.
        modal - operator used to join the term queries.
        """
        if query is None:
            query = ''
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(" ")), q)

        return q
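
    # E.g. make_term_query(u"pan tadeusz") builds
    # Q(text=u'pan') | Q(text=u'tadeusz'); passing modal=operator.and_
    # would require both terms instead.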

    def search_phrase(self, searched, field='text', book=False,
                      filters=None, snippets=False):
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        q = self.index.query(**{field: searched})
        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
        res = q.paginate(rows=100).execute()
        return [SearchResult(found, how_found=u'search_phrase') for found in res]

    def search_some(self, searched, fields, book=True,
                    filters=None, snippets=True, query_terms=None):
        assert isinstance(fields, list)
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        query = self.index.Q()

        for fld in fields:
            query = self.index.Q(query | self.make_term_query(searched, fld))

        query = self.index.query(query)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        res = query.execute()
        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]

    def search_everywhere(self, searched, query_terms=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's surname, another a part of the title,
        and the rest some words from the third chapter.
        """
        books = []
        # content-only query: themes x content
        q = self.make_term_query(searched, 'text')
        q_themes = self.make_term_query(searched, 'themes_pl')

        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
        res = query.execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))

        # query themes/content x author/title/tags
        in_content = self.index.Q()
        in_meta = self.index.Q()

        for fld in ['themes_pl', 'text']:
            in_content |= self.make_term_query(searched, field=fld)

        for fld in ['tags', 'authors', 'title']:
            in_meta |= self.make_term_query(searched, field=fld)

        q = in_content & in_meta
        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))

        return books

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns a snippet for a found scoreDoc.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    break
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1
                idx += 1
        except IOError, e:
            book = catalogue.models.Book.objects.filter(id=book_id)
            if not book:
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips
        return snips

    def hint_tags(self, query, pdcounter=True, prefix=True):
        """
        Return auto-complete hints for tags,
        using prefix search by default.
        """
        q = self.index.Q()
        query = query.strip()
        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q |= self.index.Q(**{field: query + "*"})
            else:
                q |= self.make_term_query(query, field=field)
        qu = self.index.query(q)

        return self.search_tags(qu, pdcounter=pdcounter)

    def search_tags(self, query, filters=None, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not filters:
            filters = []
        if not pdcounter:
            filters.append(~self.index.Q(is_pdcounter=True))
        res = self.apply_filters(query, filters).execute()

        tags = []
        pd_tags = []

        for doc in res:
            is_pdcounter = doc.get('is_pdcounter', False)
            category = doc.get('tag_category')
            try:
                if is_pdcounter:
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag.
                    else:
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s" % (
                            int(doc.get('tag_id')), category))
                        continue
                    pd_tags.append(tag)
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                    tags.append(tag)
            except catalogue.models.Tag.DoesNotExist:
                pass
            except PDCounterAuthor.DoesNotExist:
                pass
            except PDCounterBook.DoesNotExist:
                pass

        tags_slugs = set(map(lambda t: t.slug, tags))
        tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)

        log.debug('search_tags: %s' % tags)

        return tags

    def hint_books(self, query, prefix=True):
        """
        Returns auto-complete hints for book titles,
        since we do not index 'pseudo' title-tags.
        Prefix search by default.
        """
        q = self.index.Q()
        query = query.strip()
        if prefix:
            q |= self.index.Q(title=query + "*")
            q |= self.index.Q(title_orig=query + "*")
        else:
            q |= self.make_term_query(query, field='title')
            q |= self.make_term_query(query, field='title_orig')
        qu = self.index.query(q)
        only_books = self.index.Q(is_book=True)
        return self.search_books(qu, [only_books])

    def search_books(self, query, filters=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        bks_found = set()
        query = query.query(is_book=True)
        res = self.apply_filters(query, filters).field_limit(['book_id'])
        for r in res:
            try:
                bid = r['book_id']
                if bid not in bks_found:
                    bks.append(catalogue.models.Book.objects.get(id=bid))
                    bks_found.add(bid)
            except catalogue.models.Book.DoesNotExist:
                pass
        return bks[:max_results]

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query
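
    # E.g. apply_filters(q, [self.index.Q(is_book=True), None]) drops the
    # None entry and ANDs the is_book filter into q.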

if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search