1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 from django.conf import settings
9 from librarian import dcparser
10 from librarian.parser import WLDocument
11 from lxml import etree
12 import catalogue.models
13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
14 from itertools import chain
19 from wolnelektury.utils import makedirs
21 log = logging.getLogger('search')
class SolrIndex(object):
    """Common base for the index writer and the searcher: owns the Solr interface."""

    def __init__(self, mode=None):
        """
        Build the Solr interface from ``settings.SOLR``.

        mode - handed unchanged to ``custom.CustomSolrInterface``.
        """
        solr_location = settings.SOLR
        self.index = custom.CustomSolrInterface(solr_location, mode=mode)
29 class Snippets(object):
31 This class manages snippet files for indexed object (book)
32 the snippets are concatenated together, and their positions and
33 lengths are kept in lucene index fields.
35 SNIPPET_DIR = "snippets"
37 def __init__(self, book_id, revision=None):
38 makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
39 self.book_id = book_id
40 self.revision = revision
47 fn = "%d.%d" % (self.book_id, self.revision)
49 fn = "%d" % self.book_id
51 return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
53 def open(self, mode='r'):
55 Open the snippet file. Call .close() afterwards.
61 if os.path.exists(self.path):
64 if not os.path.exists(self.path):
68 self.file = open(self.path, mode)
72 def add(self, snippet):
74 Append a snippet (unicode) to the snippet file.
75 Return a (position, length) tuple
77 txt = snippet.encode('utf-8')
80 pos = (self.position, l)
86 Given a tuple of (position, length) return an unicode
87 of the snippet stored there.
89 self.file.seek(pos[0], 0)
90 txt = self.file.read(pos[1]).decode('utf-8')
94 """Close snippet file"""
110 class Index(SolrIndex):
112 Class indexing books.
115 super(Index, self).__init__(mode='rw')
117 def delete_query(self, *queries):
119 index.delete(queries=...) doesn't work, so let's reimplement it
120 using deletion of list of uids.
124 if isinstance(q, sunburnt.search.LuceneQuery):
125 q = self.index.query(q)
126 q.field_limiter.update(['uid'])
130 ids = q.paginate(start=st, rows=rows).execute()
137 self.index.delete(uids)
142 def index_tags(self, *tags, **kw):
144 Re-index global tag list.
145 Removes all tags from index, then index them again.
146 Indexed fields include: id, name (with and without polish stems), category
148 log.debug("Indexing tags")
149 remove_only = kw.get('remove_only', False)
150 # first, remove tags from index.
154 q_id = self.index.Q(tag_id=tag.id)
156 if isinstance(tag, PDCounterAuthor):
157 q_cat = self.index.Q(tag_category='pd_author')
158 elif isinstance(tag, PDCounterBook):
159 q_cat = self.index.Q(tag_category='pd_book')
161 q_cat = self.index.Q(tag_category=tag.category)
163 q_id_cat = self.index.Q(q_id & q_cat)
164 tag_qs.append(q_id_cat)
165 self.delete_query(*tag_qs)
167 q = self.index.Q(tag_id__any=True)
171 # then add them [all or just one passed]
174 catalogue.models.Tag.objects.exclude(category='set'),
175 PDCounterAuthor.objects.all(),
176 PDCounterBook.objects.all())
179 if isinstance(tag, PDCounterAuthor):
181 "tag_id": int(tag.id),
182 "tag_name": tag.name,
183 "tag_name_pl": tag.name,
184 "tag_category": 'pd_author',
185 "is_pdcounter": True,
186 "uid": "tag%d_pd_a" % tag.id
188 elif isinstance(tag, PDCounterBook):
190 "tag_id": int(tag.id),
191 "tag_name": tag.title,
192 "tag_name_pl": tag.title,
193 "tag_category": 'pd_book',
194 "is_pdcounter": True,
195 "uid": "tag%d_pd_b" % tag.id
199 "tag_id": int(tag.id),
200 "tag_name": tag.name,
201 "tag_name_pl": tag.name,
202 "tag_category": tag.category,
203 "is_pdcounter": False,
204 "uid": "tag%d" % tag.id
def create_book_doc(self, book):
    """
    Build the base index document for a book.

    The returned dict always carries ``book_id``; ``parent_id`` is added
    only when the book has a parent.
    """
    book_id = int(book.id)
    parent = book.parent
    if parent is None:
        return {'book_id': book_id}
    return {'book_id': book_id, 'parent_id': int(parent.id)}
217 def remove_book(self, book_or_id, remove_snippets=True):
218 """Removes a book from search index.
219 book - Book instance."""
220 if isinstance(book_or_id, catalogue.models.Book):
221 book_id = book_or_id.id
225 self.delete_query(self.index.Q(book_id=book_id))
228 snippets = Snippets(book_id)
231 def index_book(self, book, book_info=None, overwrite=True):
234 Creates a lucene document for extracted metadata
235 and calls self.index_content() to index the contents of the book.
238 # we don't remove snippets, since they might be still needed by
239 # threads using not reopened index
240 self.remove_book(book, remove_snippets=False)
242 book_doc = self.create_book_doc(book)
243 meta_fields = self.extract_metadata(book, book_info, dc_only=[
244 'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
245 # let's not index it - it's only used for extracting publish date
246 if 'source_name' in meta_fields:
247 del meta_fields['source_name']
249 for n, f in meta_fields.items():
252 book_doc['uid'] = "book%s" % book_doc['book_id']
253 self.index.add(book_doc)
256 'title': meta_fields['title'],
257 'authors': meta_fields['authors'],
258 'published_date': meta_fields['published_date']
261 for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
262 if tag_name in meta_fields:
263 book_fields[tag_name] = meta_fields[tag_name]
265 self.index_content(book, book_fields=book_fields)
270 'dramat_wierszowany_l',
271 'dramat_wierszowany_lp',
272 'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
276 ignore_content_tags = [
277 'uwaga', 'extra', 'nota_red',
278 'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
280 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
283 footnote_tags = ['pa', 'pt', 'pr', 'pe']
285 skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
286 '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
# Captures the last run of digits in a string when it is followed only by
# "]", "." or spaces up to the end.  Written as a raw string so that "\]"
# is not treated as an (invalid) string escape sequence.
published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
290 def extract_metadata(self, book, book_info=None, dc_only=None):
292 Extract metadata from book and returns a map of fields keyed by fieldname
296 if book_info is None:
297 book_info = dcparser.parse(open(book.xml_file.path))
299 fields['slug'] = book.slug
300 fields['tags'] = [t.name for t in book.tags]
301 fields['is_book'] = True
304 for field in dcparser.BookInfo.FIELDS:
305 if dc_only and field.name not in dc_only:
307 if hasattr(book_info, field.name):
308 if not getattr(book_info, field.name):
310 # since no type information is available, we use validator
311 type_indicator = field.validator
312 if type_indicator == dcparser.as_unicode:
313 s = getattr(book_info, field.name)
316 fields[field.name] = s
317 elif type_indicator == dcparser.as_person:
318 p = getattr(book_info, field.name)
319 if isinstance(p, dcparser.Person):
322 persons = ', '.join(map(unicode, p))
323 fields[field.name] = persons
324 elif type_indicator == dcparser.as_date:
325 dt = getattr(book_info, field.name)
326 fields[field.name] = dt
330 if hasattr(book_info, 'source_name') and book_info.source_name:
331 match = self.published_date_re.search(book_info.source_name)
332 if match is not None:
333 pd = str(match.groups()[0])
336 fields["published_date"] = pd
340 # def add_gaps(self, fields, fieldname):
342 # Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
343 # This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
347 # yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
348 # return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
def get_master(self, root):
    """
    Return the first element yielded by ``root.iter()`` whose tag is listed
    in ``self.master_tags``; None when no element matches.
    """
    matching = (element for element in root.iter() if element.tag in self.master_tags)
    return next(matching, None)
358 def index_content(self, book, book_fields):
360 Walks the book XML and extract content from it.
361 Adds parts for each header tag and for each fragment.
363 wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
364 root = wld.edoc.getroot()
366 master = self.get_master(root)
371 if node.tag not in self.ignore_content_tags:
372 yield node, None, None
373 if node.text is not None:
374 yield None, node.text, None
375 for child in list(node):
376 for b, t, e in walker(child):
378 yield None, None, node
380 if node.tail is not None:
381 yield None, node.tail, None
def fix_format(text):
    """
    Normalise collected text before it is indexed.

    text - either a single string or a list of text chunks; a list is
           joined with single spaces and its None entries are dropped.

    Returns the text with a "/" removed from the end of every line.
    """
    if isinstance(text, list):
        # Join the chunks that were actually passed in.  Filtering the
        # enclosing scope's ``content`` here would make every list argument
        # (e.g. a fragment's own text) come out as the section content.
        text = u' '.join([chunk for chunk in text if chunk is not None])

    return re.sub(r"(?m)/$", "", text)
398 def add_part(snippets, **fields):
399 doc = self.create_book_doc(book)
400 for n, v in book_fields.items():
403 doc['header_index'] = fields["header_index"]
404 doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
405 doc['header_type'] = fields['header_type']
407 doc['text'] = fields['text']
410 snip_pos = snippets.add(fields["text"])
412 doc['snippets_position'] = snip_pos[0]
413 doc['snippets_length'] = snip_pos[1]
414 if snippets.revision:
415 doc["snippets_revision"] = snippets.revision
417 if 'fragment_anchor' in fields:
418 doc["fragment_anchor"] = fields['fragment_anchor']
420 if 'themes' in fields:
421 doc['themes'] = fields['themes']
422 doc['uid'] = "part%s-%s-%s-%s" % (
423 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
427 snippets = Snippets(book.id).open('w')
429 for header, position in zip(list(master), range(len(master))):
431 if header.tag in self.skip_header_tags:
433 if header.tag is etree.Comment:
440 def all_content(text):
441 for frag in fragments.values():
442 frag['text'].append(text)
444 handle_text = [all_content]
446 for start, text, end in walker(header):
448 if start is not None and start.tag in self.footnote_tags:
451 def collect_footnote(t):
454 handle_text.append(collect_footnote)
455 elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
457 doc = add_part(snippets, header_index=position, header_type=header.tag,
458 text=u''.join(footnote),
463 # handle fragments and themes.
464 if start is not None and start.tag == 'begin':
465 fid = start.attrib['id'][1:]
467 'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
469 # themes for this fragment
470 elif start is not None and start.tag == 'motyw':
471 fid = start.attrib['id'][1:]
472 handle_text.append(lambda text: None)
473 if start.text is not None:
474 fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
475 elif end is not None and end.tag == 'motyw':
478 elif start is not None and start.tag == 'end':
479 fid = start.attrib['id'][1:]
480 if fid not in fragments:
481 continue # a broken <end> node, skip it
482 frag = fragments[fid]
483 if not frag['themes']:
484 continue # empty themes list.
487 doc = add_part(snippets,
488 header_type=frag['start_header'],
489 header_index=frag['start_section'],
490 header_span=position - frag['start_section'] + 1,
492 text=fix_format(frag['text']),
493 themes=frag['themes'])
498 if text is not None and handle_text is not []:
499 hdl = handle_text[-1]
502 # in the end, add a section text.
503 doc = add_part(snippets, header_index=position,
504 header_type=header.tag, text=fix_format(content))
512 class SearchResult(object):
513 def __init__(self, doc, how_found=None, query_terms=None):
516 self._processed_hits = None # processed hits
518 self.query_terms = query_terms
522 self._score = doc['score']
526 self.book_id = int(doc["book_id"])
529 self.published_date = int(doc.get("published_date"))
531 self.published_date = 0
534 header_type = doc.get("header_type", None)
535 # we have a content hit in some header of fragment
536 if header_type is not None:
537 sec = (header_type, int(doc["header_index"]))
538 header_span = doc['header_span']
539 header_span = header_span is not None and int(header_span) or 1
540 fragment = doc.get("fragment_anchor", None)
541 snippets_pos = (doc['snippets_position'], doc['snippets_length'])
542 snippets_rev = doc.get('snippets_revision', None)
544 hit = (sec + (header_span,), fragment, self._score, {
545 'how_found': how_found,
546 'snippets_pos': snippets_pos,
547 'snippets_revision': snippets_rev,
548 'themes': doc.get('themes', []),
549 'themes_pl': doc.get('themes_pl', [])
552 self._hits.append(hit)
554 def __unicode__(self):
555 return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
556 (self.book_id, len(self._hits),
557 len(self._processed_hits) if self._processed_hits else -1,
558 self._score, len(self.snippets))
561 return unicode(self).encode('utf-8')
565 return self._score * self.boost
def merge(self, other):
    """
    Fold another result for the same book into this one.

    The hits of ``other`` are appended to this result's hits; when
    ``other.score`` is greater than ``self.score`` its ``_score`` replaces
    this result's ``_score``.

    Returns self.
    Raises ValueError when ``other`` refers to a different book.
    """
    if self.book_id != other.book_id:
        raise ValueError(
            "this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
    self._hits += other._hits
    if other.score > self.score:
        self._score = other._score
    return self
576 if self._book is not None:
578 self._book = catalogue.models.Book.objects.get(id=self.book_id)
581 book = property(get_book)
592 if self._processed_hits is not None:
593 return self._processed_hits
595 # to sections and fragments
596 frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
598 sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
600 # sections not covered by fragments
601 sect = filter(lambda s: 0 == len(filter(
602 lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
603 f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
605 def remove_duplicates(lst, keyfn, compare):
610 if compare(els[eif], e) >= 1:
615 # remove fragments with duplicated fid's and duplicated snippets
616 frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
617 # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
618 # lambda a, b: cmp(a[SCORE], b[SCORE]))
620 # remove duplicate sections
624 si = s[self.POSITION][self.POSITION_INDEX]
627 if sections[si]['score'] >= s[self.SCORE]:
630 m = {'score': s[self.SCORE],
631 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
633 m.update(s[self.OTHER])
636 hits = sections.values()
640 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
641 except catalogue.models.Fragment.DoesNotExist:
644 # Figure out if we were searching for a token matching some word in theme name.
645 themes = frag.tags.filter(category='theme')
647 if self.query_terms is not None:
648 for i in range(0, len(f[self.OTHER]['themes'])):
649 tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
650 tms = map(unicode.lower, tms)
651 for qt in self.query_terms:
653 themes_hit.add(f[self.OTHER]['themes'][i])
656 def theme_by_name(n):
657 th = filter(lambda t: t.name == n, themes)
662 themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
664 m = {'score': f[self.SCORE],
666 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
668 'themes_hit': themes_hit
670 m.update(f[self.OTHER])
673 hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
675 self._processed_hits = hits
def aggregate(*result_lists):
    """
    Combine several lists of results into one result per book.

    The first result seen for a book id is kept; every later result with
    the same book id is merged into it with ``merge()``.

    Returns the values of the per-book mapping.
    """
    by_book = {}
    for results in result_lists:
        for result in results:
            key = result.book_id
            if key not in by_book:
                by_book[key] = result
                continue
            by_book[key].merge(result)
    return by_book.values()
690 def __cmp__(self, other):
691 c = cmp(self.score, other.score)
693 # this is inverted, because earlier date is better
694 return cmp(other.published_date, self.published_date)
699 return len(self.hits)
def snippet_pos(self, idx=0):
    """
    Return the ``'snippets_pos'`` entry of the hit at index ``idx``.
    """
    hit = self.hits[idx]
    return hit['snippets_pos']
def snippet_revision(self, idx=0):
    """
    Return the ``'snippets_revision'`` entry of the hit at index ``idx``,
    or None when there is no such hit or the hit has no such entry.
    """
    try:
        hit = self.hits[idx]
        revision = hit['snippets_revision']
    except (IndexError, KeyError):
        return None
    return revision
711 class Search(SolrIndex):
def __init__(self, default_field="text"):
    """
    Set up the underlying Solr interface in 'r' mode.

    default_field - accepted by the constructor; not referenced in its body.
    """
    read_mode = 'r'
    super(Search, self).__init__(mode=read_mode)
718 def make_term_query(self, query, field='text', modal=operator.or_):
720 Returns term queries joined by boolean query.
721 modal - applies to boolean query
722 fuzzy - should the query by fuzzy.
727 q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
731 def search_words(self, words, fields, book=True):
736 q = self.index.Q(**{field: word})
737 if word_filter is None:
741 filters.append(word_filter)
743 query = self.index.query(is_book=True)
745 query = self.index.query()
746 query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
747 return [SearchResult(found, how_found='search_words') for found in query.execute()]
749 def get_snippets(self, searchresult, query, field='text', num=1):
751 Returns a snippet for found scoreDoc.
753 maxnum = len(searchresult)
754 if num is None or num < 0 or num > maxnum:
756 book_id = searchresult.book_id
757 revision = searchresult.snippet_revision()
758 snippets = Snippets(book_id, revision=revision)
759 snips = [None] * maxnum
763 while idx < maxnum and num > 0:
764 position, length = searchresult.snippet_pos(idx)
765 if position is None or length is None:
767 text = snippets.get((int(position),
769 snip = self.index.highlight(text=text, field=field, q=query)
770 if snip not in snips:
777 book = catalogue.models.Book.objects.filter(id=book_id)
779 log.error("Book does not exist for book id = %d" % book_id)
780 elif not book.get().children.exists():
781 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
786 # remove verse end markers..
787 snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
789 searchresult.snippets = snips
793 def hint_tags(self, query, pdcounter=True, prefix=True):
795 Return auto-complete hints for tags
799 query = query.strip()
800 for field in ['tag_name', 'tag_name_pl']:
802 q |= self.index.Q(**{field: query + "*"})
804 q |= self.make_term_query(query, field=field)
805 qu = self.index.query(q)
807 return self.search_tags(qu, pdcounter=pdcounter)
809 def search_tags(self, query, filters=None, pdcounter=False):
811 Search for Tag objects using query.
816 filters.append(~self.index.Q(is_pdcounter=True))
817 res = self.apply_filters(query, filters).execute()
823 is_pdcounter = doc.get('is_pdcounter', False)
824 category = doc.get('tag_category')
827 if category == 'pd_author':
828 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
829 elif category == 'pd_book':
830 tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
831 tag.category = 'pd_book' # make it look more lik a tag.
834 print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (
835 int(doc.get('tag_id')), category)).encode('utf-8')
838 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
841 except catalogue.models.Tag.DoesNotExist:
843 except PDCounterAuthor.DoesNotExist:
845 except PDCounterBook.DoesNotExist:
848 tags_slugs = set(map(lambda t: t.slug, tags))
849 tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)
851 log.debug('search_tags: %s' % tags)
855 def hint_books(self, query, prefix=True):
857 Returns auto-complete hints for book titles
858 Because we do not index 'pseudo' title-tags.
862 query = query.strip()
864 q |= self.index.Q(title=query + "*")
865 q |= self.index.Q(title_orig=query + "*")
867 q |= self.make_term_query(query, field='title')
868 q |= self.make_term_query(query, field='title_orig')
869 qu = self.index.query(q)
870 only_books = self.index.Q(is_book=True)
871 return self.search_books(qu, [only_books])
873 def search_books(self, query, filters=None, max_results=10):
875 Searches for Book objects using query
879 query = query.query(is_book=True)
880 res = self.apply_filters(query, filters).field_limit(['book_id'])
884 if bid not in bks_found:
885 bks.append(catalogue.models.Book.objects.get(id=bid))
887 except catalogue.models.Book.DoesNotExist:
892 def apply_filters(query, filters):
894 Apply filters to a query
898 filters = filter(lambda x: x is not None, filters)
900 query = query.query(f)
904 if getattr(settings, 'SEARCH_MOCK', False):
905 from .mock_search import Search