# -*- coding: utf-8 -*-
# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
import os
import re
import errno
import logging
import operator
from itertools import chain

from django.conf import settings
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import sunburnt

import catalogue.models
import custom
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook

log = logging.getLogger('search')


class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)


class Snippets(object):
    """
    This class manages snippet files for indexed objects (books).
    Snippets are concatenated together in a single flat file; their
    positions and lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
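
    # A snippet is addressed by a (position, length) pair pointing into this
    # flat, append-only file; the pair is stored alongside each indexed part
    # in the snippets_position / snippets_length fields.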

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'w' in mode:
            if os.path.exists(self.path):
                # pick the first unused revision instead of overwriting
                self.revision = 1
                while True:
                    if not os.path.exists(self.path):
                        break
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode string
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close snippet file"""
        self.file.close()


class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them [all or just the ones passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                        }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {
            'book_id': int(book.id),
            }
        if book.parent is not None:
            doc["parent_id"] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from search index.
        book - Book instance."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(
            book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
        # let's not index it - it's only used for extracting publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
            }

        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)

    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        ]

    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
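    # e.g. published_date_re.search(u"Czytelnik, Warszawa 1990").group(1) == "1990"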

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # extract the publish date from source_name
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #         yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields={}):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
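
        # walker emits SAX-like events: (node, None, None) when a tag opens,
        # (None, text, None) for text and tails, and (None, None, node) when
        # the tag closes; e.g. <a>x<b/>y</a> yields
        # (a,-,-), (-,'x',-), (b,-,-), (-,-,b), (-,'y',-), (-,-,a).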

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)
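
        # fix_format strips WL verse-end markers, e.g.
        # fix_format(u"Litwo!/\nOjczyzno moja!") == u"Litwo!\nOjczyzno moja!"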

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [],
                            'start_section': position,
                            'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(
                                unicode.strip, map(unicode, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
                self.index.add(doc)
        finally:
            snippets.close()
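
        # Each header thus yields up to three kinds of parts: footnote parts,
        # fragment parts (carrying themes and a fragment_anchor) and one
        # plain section part per header.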


class SearchResult(object):
    def __init__(self, doc, how_found=None, query=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = int(header_span) if header_span is not None else 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % (
            self.book_id, len(self._hits),
            self._processed_hits and len(self._processed_hits) or -1,
            self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        if hasattr(self, '_book'):
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # layout of a hit tuple: hit[POSITION] is (header_type, index, span),
    # hit[FRAGMENT] is the anchor, hit[SCORE] the score, hit[OTHER] a dict.
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
            frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fid's and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT],
                                  lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip an existing, higher-scored hit for this section
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(
                    anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in a theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = f[self.OTHER]['themes'][i].split() + f[self.OTHER]['themes_pl'][i].split()
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()
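
    # e.g. SearchResult.aggregate(phrase_hits, everywhere_hits) merges
    # per-book results coming from several queries into a single list.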

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None


class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        """
        if query is None:
            query = ''
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
                              query.split(" ")), q)

        return q
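
    # e.g. make_term_query(u"ala ma kota") builds roughly
    # Q(text='ala') | Q(text='ma') | Q(text='kota')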

    def search_phrase(self, searched, field='text', book=False,
                      filters=None):
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        q = self.index.query(**{field: searched})
        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
        res = q.execute()
        return [SearchResult(found, how_found=u'search_phrase') for found in res]

    def search_some(self, searched, fields, book=True,
                    filters=None, snippets=True, query_terms=None):
        assert isinstance(fields, list)
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        query = self.index.Q()

        for fld in fields:
            query = self.index.Q(query | self.make_term_query(searched, fld))

        query = self.index.query(query)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        res = query.execute()
        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]

    def search_everywhere(self, searched, query_terms=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author surname, another a part of the title, and the rest
        some words from the third chapter.
        """
        books = []
        # content-only query: themes x content
        q = self.make_term_query(searched, 'text')
        q_themes = self.make_term_query(searched, 'themes_pl')

        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
        res = query.execute()
        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))

        # query themes/content x author/title/tags
        in_content = self.index.Q()
        in_meta = self.index.Q()

        for fld in ['themes_pl', 'text']:
            in_content |= self.make_term_query(searched, field=fld)

        for fld in ['tags', 'authors', 'title']:
            in_meta |= self.make_term_query(searched, field=fld)

        q = in_content & in_meta
        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))

        return books

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns a snippet for a found scoreDoc.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if snip:
                    snips[idx] = snip
                    num -= 1
                idx += 1
        except IOError as e:
            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips

        return snips

    def hint_tags(self, query, pdcounter=True, prefix=True):
        """
        Return auto-complete hints for tags
        using a prefix search.
        """
        q = self.index.Q()
        query = query.strip()
        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q |= self.index.Q(**{field: query + "*"})
            else:
                q |= self.make_term_query(query, field=field)
        qu = self.index.query(q).exclude(tag_category="book")

        return self.search_tags(qu, pdcounter=pdcounter)
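
    # e.g. hint_tags(u"rom") matches tag names starting with "rom" (via the
    # "rom*" wildcard) in both tag_name and tag_name_pl.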

    def search_tags(self, query, filters=None, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not filters:
            filters = []
        if not pdcounter:
            filters.append(~self.index.Q(is_pdcounter=True))
        res = self.apply_filters(query, filters).execute()

        tags = []
        pd_tags = []

        for doc in res:
            is_pdcounter = doc.get('is_pdcounter', False)
            category = doc.get('tag_category')
            try:
                if is_pdcounter:
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag.
                    else:
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category))
                        continue
                    pd_tags.append(tag)
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                    tags.append(tag)

            except catalogue.models.Tag.DoesNotExist:
                pass
            except PDCounterAuthor.DoesNotExist:
                pass
            except PDCounterBook.DoesNotExist:
                pass

        tags_slugs = set(map(lambda t: t.slug, tags))
        tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)

        log.debug('search_tags: %s' % tags)

        return tags

    def hint_books(self, query, prefix=True):
        """
        Returns auto-complete hints for book titles.
        (We do not index 'pseudo' title-tags.)
        """
        q = self.index.Q()
        query = query.strip()
        if prefix:
            q |= self.index.Q(title=query + "*")
        else:
            q |= self.make_term_query(query, field='title')
        qu = self.index.query(q)
        only_books = self.index.Q(is_book=True)
        return self.search_books(qu, [only_books])

    def search_books(self, query, filters=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        bks_found = set()
        query = query.query(is_book=True)
        res = self.apply_filters(query, filters).field_limit(['book_id'])
        for r in res:
            try:
                bid = r['book_id']
                if bid not in bks_found:
                    bks.append(catalogue.models.Book.objects.get(id=bid))
                    bks_found.add(bid)
            except catalogue.models.Book.DoesNotExist:
                pass
        return bks[:max_results]

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query
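

# Typical use (illustrative):
#     search = Search()
#     results = SearchResult.aggregate(
#         search.search_phrase(u"lokomotywa"),
#         search.search_everywhere(u"lokomotywa"))
#     results.sort(reverse=True)  # best score first, ties broken by date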