# -*- coding: utf-8 -*-
# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
import logging
import operator
import os
import re

from django.conf import settings

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import sunburnt
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from itertools import chain
from wolnelektury.utils import makedirs
from . import custom

log = logging.getLogger('search')

if os.path.isfile(settings.SOLR_STOPWORDS):
    stopwords = set(
        line.decode('utf-8').strip()
        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
else:
    stopwords = set()


class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)


class Snippets(object):
    """
    This class manages snippet files for indexed objects (books):
    the snippets are concatenated together, and their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None
54 fn = "%d.%d" % (self.book_id, self.revision)
56 fn = "%d" % self.book_id
58 return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            # when writing, pick a fresh revision, so that readers still
            # using the previous snippet file are not disturbed
            if os.path.exists(self.path):
                self.revision = 1
                while True:
                    if not os.path.exists(self.path):
                        break
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length), return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt
101 """Close snippet file"""


class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False
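
    # A sketch of how delete_query is meant to be called (hypothetical ids;
    # it accepts both raw LuceneQuery objects and prepared query objects):
    #
    #     index = Index()
    #     index.delete_query(index.index.Q(book_id=1),
    #                        index.index.Q(tag_id=2) & index.index.Q(tag_category='theme'))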

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them (all of them, or just the ones passed in)
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                    }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                    }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                    }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to a book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from the search index.
        book - Book instance."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # let's not index it - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
        }

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]

        self.index_content(book, book_fields=book_fields)
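
    # Indexing a single book (sketch; `book` is a catalogue.models.Book,
    # and commit() is the sunburnt interface's commit):
    #
    #     index = Index()
    #     index.index_book(book)
    #     index.index.commit()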

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
    ]

    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from a book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get the published date
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            return doc

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # collect content for the current section
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()
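
# The walker() generator above yields (start_node, text, end_node) triples,
# with exactly one slot non-None per event. A rough picture for
# <a>x<b>y</b>z</a> (sketch):
#
#     (a, None, None), (None, 'x', None), (b, None, None), (None, 'y', None),
#     (None, None, b), (None, 'z', None), (None, None, a)
#
# 'z' is the tail of <b>, so it is reported after <b> closes; tags listed in
# ignore_content_tags are skipped along with their text (but not their tails).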


class SearchResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    @classmethod
    def from_book(cls, book, how_found=None, query_terms=None):
        doc = {
            'score': book.popularity.count,
            'book_id': book.id,
            'published_date': 0,
        }
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        result._book = book
        return result

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    def get_book(self):
        if self._book is not None:
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # layout of the hit tuples kept in self._hits:
    # (position, fragment_anchor, score, other_fields)
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fids and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = f[self.OTHER]['themes'][i].split() + f[self.OTHER]['themes_pl'][i].split()
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()
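
    # aggregate() folds per-book results coming from several queries into one
    # list (sketch; r1, r2 are lists of SearchResult):
    #
    #     results = SearchResult.aggregate(r1, r2)
    #     # hits for the same book_id are merged into a single SearchResult,
    #     # with scores accumulated by merge() above.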

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None


class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        if query is None:
            query = ''
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(' ')), q)

        return q

    def search_by_author(self, words):
        from catalogue.models import Book
        books = Book.objects.filter(parent=None).order_by('-popularity__count')
        for word in words:
            # \m and \M are PostgreSQL regex markers for word start/end
            books = books.filter(cached_author__iregex=r'\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    def search_words(self, words, fields, book=True):
        if book and fields == ['authors']:
            return self.search_by_author(words)
        filters = []
        for word in words:
            if book or (word not in stopwords):
                word_filter = None
                for field in fields:
                    q = self.index.Q(**{field: word})
                    if word_filter is None:
                        word_filter = q
                    else:
                        word_filter |= q
                filters.append(word_filter)
        if not filters:
            return []
        if book:
            query = self.index.query(is_book=True)
        else:
            query = self.index.query()
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        return [SearchResult(found, how_found='search_words', query_terms=words) for found in query.execute()]
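
    # Example composition (sketch; assumes a live Solr backend):
    #
    #     search = Search()
    #     results = search.search_words([u'pan', u'tadeusz'], ['title', 'authors'])
    #     # each word must match at least one of the given fields;
    #     # fragment-level (non-book) hits would use book=False.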

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns a snippet for a found scoreDoc.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1
                idx += 1
        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id)
            if not book:
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips

        return snips

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query
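
    # apply_filters drops None entries and ANDs the rest onto the query
    # (sketch; hypothetical field value):
    #
    #     s = Search()
    #     q = s.index.query(is_book=True)
    #     q = Search.apply_filters(q, [s.index.Q(authors=u'Mickiewicz'), None])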


# Optionally replace Search with a mock implementation (used in tests).
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search